From 436df89dc73d364a88101ec7ea15b8c46bd1e389 Mon Sep 17 00:00:00 2001 From: Sean Parkinson Date: Fri, 20 Sep 2024 11:21:56 +1000 Subject: [PATCH] Kyber Aarch64: assembly implementations of functions Aarch64 assembly implementation of Kyber functions. SHA-3 assembly implementations when not hardware crypto. --- configure.ac | 2 +- src/include.am | 7 + wolfcrypt/src/port/arm/armv8-curve25519.S | 84 +- wolfcrypt/src/port/arm/armv8-kyber-asm.S | 14035 +++++++++++++++++++ wolfcrypt/src/port/arm/armv8-kyber-asm_c.c | 13430 ++++++++++++++++++ wolfcrypt/src/port/arm/armv8-sha3-asm.S | 245 + wolfcrypt/src/port/arm/armv8-sha3-asm_c.c | 216 + wolfcrypt/src/port/arm/armv8-sha512-asm.S | 6 +- wolfcrypt/src/sha3.c | 3 +- wolfcrypt/src/wc_kyber.c | 7 +- wolfcrypt/src/wc_kyber_poly.c | 742 +- wolfssl/wolfcrypt/sha3.h | 3 +- wolfssl/wolfcrypt/wc_kyber.h | 24 +- 13 files changed, 28716 insertions(+), 88 deletions(-) create mode 100644 wolfcrypt/src/port/arm/armv8-kyber-asm.S create mode 100644 wolfcrypt/src/port/arm/armv8-kyber-asm_c.c diff --git a/configure.ac b/configure.ac index 0841cc5342..3b0a087a24 100644 --- a/configure.ac +++ b/configure.ac @@ -2977,7 +2977,7 @@ then AM_CPPFLAGS="$AM_CPPFLAGS+sm4" fi else - AM_CPPFLAGS="$AM_CPPFLAGS -mcpu=generic+crypto" + AM_CPPFLAGS="$AM_CPPFLAGS -march=armv8.1-a+crypto" fi ;; esac diff --git a/src/include.am b/src/include.am index c3d8376a1d..881a6fe85f 100644 --- a/src/include.am +++ b/src/include.am @@ -1057,6 +1057,13 @@ if BUILD_INTELASM src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/wc_kyber_asm.S endif endif +if BUILD_ARMASM_NEON +if BUILD_ARMASM_INLINE +src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/armv8-kyber-asm_c.c +else +src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/armv8-kyber-asm.S +endif !BUILD_ARMASM_INLINE +endif BUILD_ARMASM_NEON endif if BUILD_DILITHIUM diff --git a/wolfcrypt/src/port/arm/armv8-curve25519.S b/wolfcrypt/src/port/arm/armv8-curve25519.S index cf20f60809..228fcf0068 100644 --- a/wolfcrypt/src/port/arm/armv8-curve25519.S +++ b/wolfcrypt/src/port/arm/armv8-curve25519.S @@ -337,8 +337,7 @@ _fe_cmov_table: #endif /* __APPLE__ */ stp x29, x30, [sp, #-128]! add x29, sp, #0 - str x17, [x29, #40] - str x19, [x29, #48] + stp x17, x19, [x29, #40] stp x20, x21, [x29, #56] stp x22, x23, [x29, #72] stp x24, x25, [x29, #88] @@ -546,8 +545,7 @@ _fe_cmov_table: stp x10, x11, [x0, #48] stp x12, x13, [x0, #64] stp x14, x15, [x0, #80] - ldr x17, [x29, #40] - ldr x19, [x29, #48] + ldp x17, x19, [x29, #40] ldp x20, x21, [x29, #56] ldp x22, x23, [x29, #72] ldp x24, x25, [x29, #88] @@ -573,8 +571,7 @@ _fe_mul: #endif /* __APPLE__ */ stp x29, x30, [sp, #-64]! add x29, sp, #0 - str x17, [x29, #24] - str x19, [x29, #32] + stp x17, x19, [x29, #24] stp x20, x21, [x29, #40] str x22, [x29, #56] # Multiply @@ -703,8 +700,7 @@ _fe_mul: # Store stp x6, x7, [x0] stp x8, x9, [x0, #16] - ldr x17, [x29, #24] - ldr x19, [x29, #32] + ldp x17, x19, [x29, #24] ldp x20, x21, [x29, #40] ldr x22, [x29, #56] ldp x29, x30, [sp], #0x40 @@ -835,8 +831,7 @@ _fe_invert: #endif /* __APPLE__ */ stp x29, x30, [sp, #-176]! add x29, sp, #0 - str x17, [x29, #160] - str x20, [x29, #168] + stp x17, x20, [x29, #160] # Invert str x0, [x29, #144] str x1, [x29, #152] @@ -1694,8 +1689,7 @@ L_fe_invert8: #else bl _fe_mul #endif /* __APPLE__ */ - ldr x17, [x29, #160] - ldr x20, [x29, #168] + ldp x17, x20, [x29, #160] ldp x29, x30, [sp], #0xb0 ret #ifndef __APPLE__ @@ -1715,8 +1709,7 @@ _curve25519: #endif /* __APPLE__ */ stp x29, x30, [sp, #-288]! 
add x29, sp, #0 - str x17, [x29, #200] - str x19, [x29, #208] + stp x17, x19, [x29, #200] stp x20, x21, [x29, #216] stp x22, x23, [x29, #232] stp x24, x25, [x29, #248] @@ -3801,8 +3794,7 @@ L_curve25519_inv_8: stp x14, x15, [x0] stp x16, x17, [x0, #16] mov x0, xzr - ldr x17, [x29, #200] - ldr x19, [x29, #208] + ldp x17, x19, [x29, #200] ldp x20, x21, [x29, #216] ldp x22, x23, [x29, #232] ldp x24, x25, [x29, #248] @@ -3828,8 +3820,7 @@ _fe_pow22523: #endif /* __APPLE__ */ stp x29, x30, [sp, #-144]! add x29, sp, #0 - str x17, [x29, #128] - str x23, [x29, #136] + stp x17, x23, [x29, #128] # pow22523 str x0, [x29, #112] str x1, [x29, #120] @@ -4619,8 +4610,7 @@ L_fe_pow22523_7: #else bl _fe_mul #endif /* __APPLE__ */ - ldr x17, [x29, #128] - ldr x23, [x29, #136] + ldp x17, x23, [x29, #128] ldp x29, x30, [sp], #0x90 ret #ifndef __APPLE__ @@ -4640,8 +4630,7 @@ _ge_p1p1_to_p2: #endif /* __APPLE__ */ stp x29, x30, [sp, #-80]! add x29, sp, #0 - str x17, [x29, #40] - str x19, [x29, #48] + stp x17, x19, [x29, #40] stp x20, x21, [x29, #56] str x22, [x29, #72] str x0, [x29, #16] @@ -5002,8 +4991,7 @@ _ge_p1p1_to_p2: # Store stp x14, x15, [x0] stp x16, x17, [x0, #16] - ldr x17, [x29, #40] - ldr x19, [x29, #48] + ldp x17, x19, [x29, #40] ldp x20, x21, [x29, #56] ldr x22, [x29, #72] ldp x29, x30, [sp], #0x50 @@ -5025,8 +5013,7 @@ _ge_p1p1_to_p3: #endif /* __APPLE__ */ stp x29, x30, [sp, #-112]! add x29, sp, #0 - str x17, [x29, #40] - str x19, [x29, #48] + stp x17, x19, [x29, #40] stp x20, x21, [x29, #56] stp x22, x23, [x29, #72] stp x24, x25, [x29, #88] @@ -5505,8 +5492,7 @@ _ge_p1p1_to_p3: # Store stp x14, x15, [x0] stp x16, x17, [x0, #16] - ldr x17, [x29, #40] - ldr x19, [x29, #48] + ldp x17, x19, [x29, #40] ldp x20, x21, [x29, #56] ldp x22, x23, [x29, #72] ldp x24, x25, [x29, #88] @@ -5530,8 +5516,7 @@ _ge_p2_dbl: #endif /* __APPLE__ */ stp x29, x30, [sp, #-128]! add x29, sp, #0 - str x17, [x29, #40] - str x19, [x29, #48] + stp x17, x19, [x29, #40] stp x20, x21, [x29, #56] stp x22, x23, [x29, #72] stp x24, x25, [x29, #88] @@ -5986,8 +5971,7 @@ _ge_p2_dbl: sbc x7, x7, xzr stp x4, x5, [x0] stp x6, x7, [x0, #16] - ldr x17, [x29, #40] - ldr x19, [x29, #48] + ldp x17, x19, [x29, #40] ldp x20, x21, [x29, #56] ldp x22, x23, [x29, #72] ldp x24, x25, [x29, #88] @@ -6012,8 +5996,7 @@ _ge_madd: #endif /* __APPLE__ */ stp x29, x30, [sp, #-144]! add x29, sp, #0 - str x17, [x29, #56] - str x19, [x29, #64] + stp x17, x19, [x29, #56] stp x20, x21, [x29, #72] stp x22, x23, [x29, #88] stp x24, x25, [x29, #104] @@ -6503,8 +6486,7 @@ _ge_madd: stp x10, x11, [x0, #16] stp x4, x5, [x1] stp x6, x7, [x1, #16] - ldr x17, [x29, #56] - ldr x19, [x29, #64] + ldp x17, x19, [x29, #56] ldp x20, x21, [x29, #72] ldp x22, x23, [x29, #88] ldp x24, x25, [x29, #104] @@ -6529,8 +6511,7 @@ _ge_msub: #endif /* __APPLE__ */ stp x29, x30, [sp, #-144]! add x29, sp, #0 - str x17, [x29, #56] - str x19, [x29, #64] + stp x17, x19, [x29, #56] stp x20, x21, [x29, #72] stp x22, x23, [x29, #88] stp x24, x25, [x29, #104] @@ -7020,8 +7001,7 @@ _ge_msub: stp x10, x11, [x0, #16] stp x4, x5, [x1] stp x6, x7, [x1, #16] - ldr x17, [x29, #56] - ldr x19, [x29, #64] + ldp x17, x19, [x29, #56] ldp x20, x21, [x29, #72] ldp x22, x23, [x29, #88] ldp x24, x25, [x29, #104] @@ -7046,8 +7026,7 @@ _ge_add: #endif /* __APPLE__ */ stp x29, x30, [sp, #-144]! 
add x29, sp, #0 - str x17, [x29, #56] - str x19, [x29, #64] + stp x17, x19, [x29, #56] stp x20, x21, [x29, #72] stp x22, x23, [x29, #88] stp x24, x25, [x29, #104] @@ -7663,8 +7642,7 @@ _ge_add: stp x23, x24, [x0, #16] stp x12, x13, [x1] stp x14, x15, [x1, #16] - ldr x17, [x29, #56] - ldr x19, [x29, #64] + ldp x17, x19, [x29, #56] ldp x20, x21, [x29, #72] ldp x22, x23, [x29, #88] ldp x24, x25, [x29, #104] @@ -7689,8 +7667,7 @@ _ge_sub: #endif /* __APPLE__ */ stp x29, x30, [sp, #-144]! add x29, sp, #0 - str x17, [x29, #56] - str x19, [x29, #64] + stp x17, x19, [x29, #56] stp x20, x21, [x29, #72] stp x22, x23, [x29, #88] stp x24, x25, [x29, #104] @@ -8321,8 +8298,7 @@ _ge_sub: stp x14, x15, [x0, #16] stp x21, x22, [x1] stp x23, x24, [x1, #16] - ldr x17, [x29, #56] - ldr x19, [x29, #64] + ldp x17, x19, [x29, #56] ldp x20, x21, [x29, #72] ldp x22, x23, [x29, #88] ldp x24, x25, [x29, #104] @@ -8347,8 +8323,7 @@ _sc_reduce: #endif /* __APPLE__ */ stp x29, x30, [sp, #-64]! add x29, sp, #0 - str x17, [x29, #16] - str x19, [x29, #24] + stp x17, x19, [x29, #16] stp x20, x21, [x29, #32] stp x22, x23, [x29, #48] ldp x2, x3, [x0] @@ -8525,8 +8500,7 @@ _sc_reduce: # Store result stp x2, x3, [x0] stp x4, x5, [x0, #16] - ldr x17, [x29, #16] - ldr x19, [x29, #24] + ldp x17, x19, [x29, #16] ldp x20, x21, [x29, #32] ldp x22, x23, [x29, #48] ldp x29, x30, [sp], #0x40 @@ -8548,8 +8522,7 @@ _sc_muladd: #endif /* __APPLE__ */ stp x29, x30, [sp, #-96]! add x29, sp, #0 - str x17, [x29, #24] - str x19, [x29, #32] + stp x17, x19, [x29, #24] stp x20, x21, [x29, #40] stp x22, x23, [x29, #56] stp x24, x25, [x29, #72] @@ -8824,8 +8797,7 @@ _sc_muladd: # Store result stp x4, x5, [x0] stp x6, x7, [x0, #16] - ldr x17, [x29, #24] - ldr x19, [x29, #32] + ldp x17, x19, [x29, #24] ldp x20, x21, [x29, #40] ldp x22, x23, [x29, #56] ldp x24, x25, [x29, #72] diff --git a/wolfcrypt/src/port/arm/armv8-kyber-asm.S b/wolfcrypt/src/port/arm/armv8-kyber-asm.S new file mode 100644 index 0000000000..cb360a7eff --- /dev/null +++ b/wolfcrypt/src/port/arm/armv8-kyber-asm.S @@ -0,0 +1,14035 @@ +/* armv8-kyber-asm + * + * Copyright (C) 2006-2024 wolfSSL Inc. + * + * This file is part of wolfSSL. + * + * wolfSSL is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * wolfSSL is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA + */ + +#ifdef HAVE_CONFIG_H + #include +#endif /* HAVE_CONFIG_H */ +#include + +/* Generated using (from wolfssl): + * cd ../scripts + * ruby ./kyber/kyber.rb arm64 ../wolfssl/wolfcrypt/src/port/arm/armv8-kyber-asm.S + */ +#ifdef WOLFSSL_ARMASM +#ifdef __aarch64__ +#ifndef WOLFSSL_ARMASM_INLINE +#ifndef __APPLE__ + .text + .type L_kyber_aarch64_q, %object + .section .rodata + .size L_kyber_aarch64_q, 16 +#else + .section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ + .align 2 +#else + .p2align 2 +#endif /* __APPLE__ */ +L_kyber_aarch64_q: + .short 0xd01 + .short 0xd01 + .short 0xd01 + .short 0xd01 + .short 0xd01 + .short 0xd01 + .short 0xd01 + .short 0xd01 +#ifndef __APPLE__ + .text + .type L_kyber_aarch64_consts, %object + .section .rodata + .size L_kyber_aarch64_consts, 16 +#else + .section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ + .align 2 +#else + .p2align 2 +#endif /* __APPLE__ */ +L_kyber_aarch64_consts: + .short 0xd01 + .short 0xf301 + .short 0x4ebf + .short 0x549 + .short 0x5049 + .short 0x0 + .short 0x0 + .short 0x0 +#ifdef WOLFSSL_WC_KYBER +#ifndef __APPLE__ + .text + .type L_kyber_aarch64_zetas, %object + .section .rodata + .size L_kyber_aarch64_zetas, 576 +#else + .section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ + .align 2 +#else + .p2align 2 +#endif /* __APPLE__ */ +L_kyber_aarch64_zetas: + .short 0x8ed + .short 0xa0b + .short 0xb9a + .short 0x714 + .short 0x5d5 + .short 0x58e + .short 0x11f + .short 0xca + .short 0xc56 + .short 0x26e + .short 0x629 + .short 0xb6 + .short 0x3c2 + .short 0x84f + .short 0x73f + .short 0x5bc + .short 0x23d + .short 0x7d4 + .short 0x108 + .short 0x17f + .short 0x9c4 + .short 0x5b2 + .short 0x6bf + .short 0xc7f + .short 0xa58 + .short 0x3f9 + .short 0x2dc + .short 0x260 + .short 0x6fb + .short 0x19b + .short 0xc34 + .short 0x6de + .short 0x4c7 + .short 0x4c7 + .short 0x4c7 + .short 0x4c7 + .short 0x28c + .short 0x28c + .short 0x28c + .short 0x28c + .short 0xad9 + .short 0xad9 + .short 0xad9 + .short 0xad9 + .short 0x3f7 + .short 0x3f7 + .short 0x3f7 + .short 0x3f7 + .short 0x7f4 + .short 0x7f4 + .short 0x7f4 + .short 0x7f4 + .short 0x5d3 + .short 0x5d3 + .short 0x5d3 + .short 0x5d3 + .short 0xbe7 + .short 0xbe7 + .short 0xbe7 + .short 0xbe7 + .short 0x6f9 + .short 0x6f9 + .short 0x6f9 + .short 0x6f9 + .short 0x204 + .short 0x204 + .short 0x204 + .short 0x204 + .short 0xcf9 + .short 0xcf9 + .short 0xcf9 + .short 0xcf9 + .short 0xbc1 + .short 0xbc1 + .short 0xbc1 + .short 0xbc1 + .short 0xa67 + .short 0xa67 + .short 0xa67 + .short 0xa67 + .short 0x6af + .short 0x6af + .short 0x6af + .short 0x6af + .short 0x877 + .short 0x877 + .short 0x877 + .short 0x877 + .short 0x7e + .short 0x7e + .short 0x7e + .short 0x7e + .short 0x5bd + .short 0x5bd + .short 0x5bd + .short 0x5bd + .short 0x9ac + .short 0x9ac + .short 0x9ac + .short 0x9ac + .short 0xca7 + .short 0xca7 + .short 0xca7 + .short 0xca7 + .short 0xbf2 + .short 0xbf2 + .short 0xbf2 + .short 0xbf2 + .short 0x33e + .short 0x33e + .short 0x33e + .short 0x33e + .short 0x6b + .short 0x6b + .short 0x6b + .short 0x6b + .short 0x774 + .short 0x774 + .short 0x774 + .short 0x774 + .short 0xc0a + .short 0xc0a + .short 0xc0a + .short 0xc0a + .short 0x94a + .short 0x94a + .short 0x94a + .short 0x94a + .short 0xb73 + .short 0xb73 + .short 0xb73 + .short 
0xb73 + .short 0x3c1 + .short 0x3c1 + .short 0x3c1 + .short 0x3c1 + .short 0x71d + .short 0x71d + .short 0x71d + .short 0x71d + .short 0xa2c + .short 0xa2c + .short 0xa2c + .short 0xa2c + .short 0x1c0 + .short 0x1c0 + .short 0x1c0 + .short 0x1c0 + .short 0x8d8 + .short 0x8d8 + .short 0x8d8 + .short 0x8d8 + .short 0x2a5 + .short 0x2a5 + .short 0x2a5 + .short 0x2a5 + .short 0x806 + .short 0x806 + .short 0x806 + .short 0x806 + .short 0x8b2 + .short 0x8b2 + .short 0x1ae + .short 0x1ae + .short 0x22b + .short 0x22b + .short 0x34b + .short 0x34b + .short 0x81e + .short 0x81e + .short 0x367 + .short 0x367 + .short 0x60e + .short 0x60e + .short 0x69 + .short 0x69 + .short 0x1a6 + .short 0x1a6 + .short 0x24b + .short 0x24b + .short 0xb1 + .short 0xb1 + .short 0xc16 + .short 0xc16 + .short 0xbde + .short 0xbde + .short 0xb35 + .short 0xb35 + .short 0x626 + .short 0x626 + .short 0x675 + .short 0x675 + .short 0xc0b + .short 0xc0b + .short 0x30a + .short 0x30a + .short 0x487 + .short 0x487 + .short 0xc6e + .short 0xc6e + .short 0x9f8 + .short 0x9f8 + .short 0x5cb + .short 0x5cb + .short 0xaa7 + .short 0xaa7 + .short 0x45f + .short 0x45f + .short 0x6cb + .short 0x6cb + .short 0x284 + .short 0x284 + .short 0x999 + .short 0x999 + .short 0x15d + .short 0x15d + .short 0x1a2 + .short 0x1a2 + .short 0x149 + .short 0x149 + .short 0xc65 + .short 0xc65 + .short 0xcb6 + .short 0xcb6 + .short 0x331 + .short 0x331 + .short 0x449 + .short 0x449 + .short 0x25b + .short 0x25b + .short 0x262 + .short 0x262 + .short 0x52a + .short 0x52a + .short 0x7fc + .short 0x7fc + .short 0x748 + .short 0x748 + .short 0x180 + .short 0x180 + .short 0x842 + .short 0x842 + .short 0xc79 + .short 0xc79 + .short 0x4c2 + .short 0x4c2 + .short 0x7ca + .short 0x7ca + .short 0x997 + .short 0x997 + .short 0xdc + .short 0xdc + .short 0x85e + .short 0x85e + .short 0x686 + .short 0x686 + .short 0x860 + .short 0x860 + .short 0x707 + .short 0x707 + .short 0x803 + .short 0x803 + .short 0x31a + .short 0x31a + .short 0x71b + .short 0x71b + .short 0x9ab + .short 0x9ab + .short 0x99b + .short 0x99b + .short 0x1de + .short 0x1de + .short 0xc95 + .short 0xc95 + .short 0xbcd + .short 0xbcd + .short 0x3e4 + .short 0x3e4 + .short 0x3df + .short 0x3df + .short 0x3be + .short 0x3be + .short 0x74d + .short 0x74d + .short 0x5f2 + .short 0x5f2 + .short 0x65c + .short 0x65c +#ifndef __APPLE__ + .text + .type L_kyber_aarch64_zetas_qinv, %object + .section .rodata + .size L_kyber_aarch64_zetas_qinv, 576 +#else + .section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ + .align 2 +#else + .p2align 2 +#endif /* __APPLE__ */ +L_kyber_aarch64_zetas_qinv: + .short 0xffed + .short 0x7b0b + .short 0x399a + .short 0x314 + .short 0x34d5 + .short 0xcf8e + .short 0x6e1f + .short 0xbeca + .short 0xae56 + .short 0x6c6e + .short 0xf129 + .short 0xc2b6 + .short 0x29c2 + .short 0x54f + .short 0xd43f + .short 0x79bc + .short 0xe93d + .short 0x43d4 + .short 0x9908 + .short 0x8e7f + .short 0x15c4 + .short 0xfbb2 + .short 0x53bf + .short 0x997f + .short 0x9258 + .short 0x5ef9 + .short 0xd6dc + .short 0x2260 + .short 0x47fb + .short 0x229b + .short 0x6834 + .short 0xc0de + .short 0xe9c7 + .short 0xe9c7 + .short 0xe9c7 + .short 0xe9c7 + .short 0xe68c + .short 0xe68c + .short 0xe68c + .short 0xe68c + .short 0x5d9 + .short 0x5d9 + .short 0x5d9 + .short 0x5d9 + .short 0x78f7 + .short 0x78f7 + .short 0x78f7 + .short 0x78f7 + .short 0xa3f4 + .short 0xa3f4 + .short 0xa3f4 + .short 0xa3f4 + .short 0x4ed3 + .short 0x4ed3 + .short 0x4ed3 + .short 0x4ed3 + .short 0x50e7 + .short 0x50e7 + 
.short 0x50e7 + .short 0x50e7 + .short 0x61f9 + .short 0x61f9 + .short 0x61f9 + .short 0x61f9 + .short 0xce04 + .short 0xce04 + .short 0xce04 + .short 0xce04 + .short 0x67f9 + .short 0x67f9 + .short 0x67f9 + .short 0x67f9 + .short 0x3ec1 + .short 0x3ec1 + .short 0x3ec1 + .short 0x3ec1 + .short 0xcf67 + .short 0xcf67 + .short 0xcf67 + .short 0xcf67 + .short 0x23af + .short 0x23af + .short 0x23af + .short 0x23af + .short 0xfd77 + .short 0xfd77 + .short 0xfd77 + .short 0xfd77 + .short 0x9a7e + .short 0x9a7e + .short 0x9a7e + .short 0x9a7e + .short 0x6cbd + .short 0x6cbd + .short 0x6cbd + .short 0x6cbd + .short 0x4dac + .short 0x4dac + .short 0x4dac + .short 0x4dac + .short 0x91a7 + .short 0x91a7 + .short 0x91a7 + .short 0x91a7 + .short 0xc1f2 + .short 0xc1f2 + .short 0xc1f2 + .short 0xc1f2 + .short 0xdd3e + .short 0xdd3e + .short 0xdd3e + .short 0xdd3e + .short 0x916b + .short 0x916b + .short 0x916b + .short 0x916b + .short 0x2374 + .short 0x2374 + .short 0x2374 + .short 0x2374 + .short 0x8a0a + .short 0x8a0a + .short 0x8a0a + .short 0x8a0a + .short 0x474a + .short 0x474a + .short 0x474a + .short 0x474a + .short 0x3473 + .short 0x3473 + .short 0x3473 + .short 0x3473 + .short 0x36c1 + .short 0x36c1 + .short 0x36c1 + .short 0x36c1 + .short 0x8e1d + .short 0x8e1d + .short 0x8e1d + .short 0x8e1d + .short 0xce2c + .short 0xce2c + .short 0xce2c + .short 0xce2c + .short 0x41c0 + .short 0x41c0 + .short 0x41c0 + .short 0x41c0 + .short 0x10d8 + .short 0x10d8 + .short 0x10d8 + .short 0x10d8 + .short 0xa1a5 + .short 0xa1a5 + .short 0xa1a5 + .short 0xa1a5 + .short 0xba06 + .short 0xba06 + .short 0xba06 + .short 0xba06 + .short 0xfeb2 + .short 0xfeb2 + .short 0x2bae + .short 0x2bae + .short 0xd32b + .short 0xd32b + .short 0x344b + .short 0x344b + .short 0x821e + .short 0x821e + .short 0xc867 + .short 0xc867 + .short 0x500e + .short 0x500e + .short 0xab69 + .short 0xab69 + .short 0x93a6 + .short 0x93a6 + .short 0x334b + .short 0x334b + .short 0x3b1 + .short 0x3b1 + .short 0xee16 + .short 0xee16 + .short 0xc5de + .short 0xc5de + .short 0x5a35 + .short 0x5a35 + .short 0x1826 + .short 0x1826 + .short 0x1575 + .short 0x1575 + .short 0x7d0b + .short 0x7d0b + .short 0x810a + .short 0x810a + .short 0x2987 + .short 0x2987 + .short 0x766e + .short 0x766e + .short 0x71f8 + .short 0x71f8 + .short 0xb6cb + .short 0xb6cb + .short 0x8fa7 + .short 0x8fa7 + .short 0x315f + .short 0x315f + .short 0xb7cb + .short 0xb7cb + .short 0x4e84 + .short 0x4e84 + .short 0x4499 + .short 0x4499 + .short 0x485d + .short 0x485d + .short 0xc7a2 + .short 0xc7a2 + .short 0x4c49 + .short 0x4c49 + .short 0xeb65 + .short 0xeb65 + .short 0xceb6 + .short 0xceb6 + .short 0x8631 + .short 0x8631 + .short 0x4f49 + .short 0x4f49 + .short 0x635b + .short 0x635b + .short 0x862 + .short 0x862 + .short 0xe32a + .short 0xe32a + .short 0x3bfc + .short 0x3bfc + .short 0x5f48 + .short 0x5f48 + .short 0x8180 + .short 0x8180 + .short 0xae42 + .short 0xae42 + .short 0xe779 + .short 0xe779 + .short 0x2ac2 + .short 0x2ac2 + .short 0xc5ca + .short 0xc5ca + .short 0x5e97 + .short 0x5e97 + .short 0xd4dc + .short 0xd4dc + .short 0x425e + .short 0x425e + .short 0x3886 + .short 0x3886 + .short 0x2860 + .short 0x2860 + .short 0xac07 + .short 0xac07 + .short 0xe103 + .short 0xe103 + .short 0xb11a + .short 0xb11a + .short 0xa81b + .short 0xa81b + .short 0x5aab + .short 0x5aab + .short 0x2a9b + .short 0x2a9b + .short 0xbbde + .short 0xbbde + .short 0x7b95 + .short 0x7b95 + .short 0xa2cd + .short 0xa2cd + .short 0x6fe4 + .short 0x6fe4 + .short 0xb0df + .short 0xb0df + .short 
0x5dbe + .short 0x5dbe + .short 0x1e4d + .short 0x1e4d + .short 0xbbf2 + .short 0xbbf2 + .short 0x5a5c + .short 0x5a5c +#ifndef __APPLE__ +.text +.globl kyber_ntt +.type kyber_ntt,@function +.align 2 +kyber_ntt: +#else +.section __TEXT,__text +.globl _kyber_ntt +.p2align 2 +_kyber_ntt: +#endif /* __APPLE__ */ + stp x29, x30, [sp, #-80]! + add x29, sp, #0 + stp d8, d9, [x29, #16] + stp d10, d11, [x29, #32] + stp d12, d13, [x29, #48] + stp d14, d15, [x29, #64] +#ifndef __APPLE__ + adrp x2, L_kyber_aarch64_zetas + add x2, x2, :lo12:L_kyber_aarch64_zetas +#else + adrp x2, L_kyber_aarch64_zetas@PAGE + add x2, x2, :lo12:L_kyber_aarch64_zetas@PAGEOFF +#endif /* __APPLE__ */ +#ifndef __APPLE__ + adrp x3, L_kyber_aarch64_zetas_qinv + add x3, x3, :lo12:L_kyber_aarch64_zetas_qinv +#else + adrp x3, L_kyber_aarch64_zetas_qinv@PAGE + add x3, x3, :lo12:L_kyber_aarch64_zetas_qinv@PAGEOFF +#endif /* __APPLE__ */ +#ifndef __APPLE__ + adrp x4, L_kyber_aarch64_consts + add x4, x4, :lo12:L_kyber_aarch64_consts +#else + adrp x4, L_kyber_aarch64_consts@PAGE + add x4, x4, :lo12:L_kyber_aarch64_consts@PAGEOFF +#endif /* __APPLE__ */ + add x1, x0, #0x100 + ldr q4, [x4] + ldr q5, [x0] + ldr q6, [x0, #32] + ldr q7, [x0, #64] + ldr q8, [x0, #96] + ldr q9, [x0, #128] + ldr q10, [x0, #160] + ldr q11, [x0, #192] + ldr q12, [x0, #224] + ldr q13, [x1] + ldr q14, [x1, #32] + ldr q15, [x1, #64] + ldr q16, [x1, #96] + ldr q17, [x1, #128] + ldr q18, [x1, #160] + ldr q19, [x1, #192] + ldr q20, [x1, #224] + ldr q0, [x2] + ldr q1, [x3] + mul v29.8h, v13.8h, v1.h[1] + mul v30.8h, v14.8h, v1.h[1] + sqrdmulh v21.8h, v13.8h, v0.h[1] + sqrdmulh v22.8h, v14.8h, v0.h[1] + sqrdmlsh v21.8h, v29.8h, v4.h[0] + sqrdmlsh v22.8h, v30.8h, v4.h[0] + sshr v21.8h, v21.8h, #1 + sshr v22.8h, v22.8h, #1 + mul v29.8h, v15.8h, v1.h[1] + mul v30.8h, v16.8h, v1.h[1] + sqrdmulh v23.8h, v15.8h, v0.h[1] + sqrdmulh v24.8h, v16.8h, v0.h[1] + sqrdmlsh v23.8h, v29.8h, v4.h[0] + sqrdmlsh v24.8h, v30.8h, v4.h[0] + sshr v23.8h, v23.8h, #1 + sshr v24.8h, v24.8h, #1 + mul v29.8h, v17.8h, v1.h[1] + mul v30.8h, v18.8h, v1.h[1] + sqrdmulh v25.8h, v17.8h, v0.h[1] + sqrdmulh v26.8h, v18.8h, v0.h[1] + sqrdmlsh v25.8h, v29.8h, v4.h[0] + sqrdmlsh v26.8h, v30.8h, v4.h[0] + sshr v25.8h, v25.8h, #1 + sshr v26.8h, v26.8h, #1 + mul v29.8h, v19.8h, v1.h[1] + mul v30.8h, v20.8h, v1.h[1] + sqrdmulh v27.8h, v19.8h, v0.h[1] + sqrdmulh v28.8h, v20.8h, v0.h[1] + sqrdmlsh v27.8h, v29.8h, v4.h[0] + sqrdmlsh v28.8h, v30.8h, v4.h[0] + sshr v27.8h, v27.8h, #1 + sshr v28.8h, v28.8h, #1 + sub v13.8h, v5.8h, v21.8h + add v5.8h, v5.8h, v21.8h + sub v14.8h, v6.8h, v22.8h + add v6.8h, v6.8h, v22.8h + sub v15.8h, v7.8h, v23.8h + add v7.8h, v7.8h, v23.8h + sub v16.8h, v8.8h, v24.8h + add v8.8h, v8.8h, v24.8h + sub v17.8h, v9.8h, v25.8h + add v9.8h, v9.8h, v25.8h + sub v18.8h, v10.8h, v26.8h + add v10.8h, v10.8h, v26.8h + sub v19.8h, v11.8h, v27.8h + add v11.8h, v11.8h, v27.8h + sub v20.8h, v12.8h, v28.8h + add v12.8h, v12.8h, v28.8h + mul v29.8h, v9.8h, v1.h[2] + mul v30.8h, v10.8h, v1.h[2] + sqrdmulh v21.8h, v9.8h, v0.h[2] + sqrdmulh v22.8h, v10.8h, v0.h[2] + sqrdmlsh v21.8h, v29.8h, v4.h[0] + sqrdmlsh v22.8h, v30.8h, v4.h[0] + sshr v21.8h, v21.8h, #1 + sshr v22.8h, v22.8h, #1 + mul v29.8h, v11.8h, v1.h[2] + mul v30.8h, v12.8h, v1.h[2] + sqrdmulh v23.8h, v11.8h, v0.h[2] + sqrdmulh v24.8h, v12.8h, v0.h[2] + sqrdmlsh v23.8h, v29.8h, v4.h[0] + sqrdmlsh v24.8h, v30.8h, v4.h[0] + sshr v23.8h, v23.8h, #1 + sshr v24.8h, v24.8h, #1 + mul v29.8h, v17.8h, v1.h[3] + mul v30.8h, v18.8h, v1.h[3] + sqrdmulh 
v25.8h, v17.8h, v0.h[3] + sqrdmulh v26.8h, v18.8h, v0.h[3] + sqrdmlsh v25.8h, v29.8h, v4.h[0] + sqrdmlsh v26.8h, v30.8h, v4.h[0] + sshr v25.8h, v25.8h, #1 + sshr v26.8h, v26.8h, #1 + mul v29.8h, v19.8h, v1.h[3] + mul v30.8h, v20.8h, v1.h[3] + sqrdmulh v27.8h, v19.8h, v0.h[3] + sqrdmulh v28.8h, v20.8h, v0.h[3] + sqrdmlsh v27.8h, v29.8h, v4.h[0] + sqrdmlsh v28.8h, v30.8h, v4.h[0] + sshr v27.8h, v27.8h, #1 + sshr v28.8h, v28.8h, #1 + sub v9.8h, v5.8h, v21.8h + add v5.8h, v5.8h, v21.8h + sub v10.8h, v6.8h, v22.8h + add v6.8h, v6.8h, v22.8h + sub v11.8h, v7.8h, v23.8h + add v7.8h, v7.8h, v23.8h + sub v12.8h, v8.8h, v24.8h + add v8.8h, v8.8h, v24.8h + sub v17.8h, v13.8h, v25.8h + add v13.8h, v13.8h, v25.8h + sub v18.8h, v14.8h, v26.8h + add v14.8h, v14.8h, v26.8h + sub v19.8h, v15.8h, v27.8h + add v15.8h, v15.8h, v27.8h + sub v20.8h, v16.8h, v28.8h + add v16.8h, v16.8h, v28.8h + mul v29.8h, v7.8h, v1.h[4] + mul v30.8h, v8.8h, v1.h[4] + sqrdmulh v21.8h, v7.8h, v0.h[4] + sqrdmulh v22.8h, v8.8h, v0.h[4] + sqrdmlsh v21.8h, v29.8h, v4.h[0] + sqrdmlsh v22.8h, v30.8h, v4.h[0] + sshr v21.8h, v21.8h, #1 + sshr v22.8h, v22.8h, #1 + mul v29.8h, v11.8h, v1.h[5] + mul v30.8h, v12.8h, v1.h[5] + sqrdmulh v23.8h, v11.8h, v0.h[5] + sqrdmulh v24.8h, v12.8h, v0.h[5] + sqrdmlsh v23.8h, v29.8h, v4.h[0] + sqrdmlsh v24.8h, v30.8h, v4.h[0] + sshr v23.8h, v23.8h, #1 + sshr v24.8h, v24.8h, #1 + mul v29.8h, v15.8h, v1.h[6] + mul v30.8h, v16.8h, v1.h[6] + sqrdmulh v25.8h, v15.8h, v0.h[6] + sqrdmulh v26.8h, v16.8h, v0.h[6] + sqrdmlsh v25.8h, v29.8h, v4.h[0] + sqrdmlsh v26.8h, v30.8h, v4.h[0] + sshr v25.8h, v25.8h, #1 + sshr v26.8h, v26.8h, #1 + mul v29.8h, v19.8h, v1.h[7] + mul v30.8h, v20.8h, v1.h[7] + sqrdmulh v27.8h, v19.8h, v0.h[7] + sqrdmulh v28.8h, v20.8h, v0.h[7] + sqrdmlsh v27.8h, v29.8h, v4.h[0] + sqrdmlsh v28.8h, v30.8h, v4.h[0] + sshr v27.8h, v27.8h, #1 + sshr v28.8h, v28.8h, #1 + sub v7.8h, v5.8h, v21.8h + add v5.8h, v5.8h, v21.8h + sub v8.8h, v6.8h, v22.8h + add v6.8h, v6.8h, v22.8h + sub v11.8h, v9.8h, v23.8h + add v9.8h, v9.8h, v23.8h + sub v12.8h, v10.8h, v24.8h + add v10.8h, v10.8h, v24.8h + sub v15.8h, v13.8h, v25.8h + add v13.8h, v13.8h, v25.8h + sub v16.8h, v14.8h, v26.8h + add v14.8h, v14.8h, v26.8h + sub v19.8h, v17.8h, v27.8h + add v17.8h, v17.8h, v27.8h + sub v20.8h, v18.8h, v28.8h + add v18.8h, v18.8h, v28.8h + ldr q0, [x2, #16] + ldr q1, [x3, #16] + mul v29.8h, v6.8h, v1.h[0] + mul v30.8h, v8.8h, v1.h[1] + sqrdmulh v21.8h, v6.8h, v0.h[0] + sqrdmulh v22.8h, v8.8h, v0.h[1] + sqrdmlsh v21.8h, v29.8h, v4.h[0] + sqrdmlsh v22.8h, v30.8h, v4.h[0] + sshr v21.8h, v21.8h, #1 + sshr v22.8h, v22.8h, #1 + mul v29.8h, v10.8h, v1.h[2] + mul v30.8h, v12.8h, v1.h[3] + sqrdmulh v23.8h, v10.8h, v0.h[2] + sqrdmulh v24.8h, v12.8h, v0.h[3] + sqrdmlsh v23.8h, v29.8h, v4.h[0] + sqrdmlsh v24.8h, v30.8h, v4.h[0] + sshr v23.8h, v23.8h, #1 + sshr v24.8h, v24.8h, #1 + mul v29.8h, v14.8h, v1.h[4] + mul v30.8h, v16.8h, v1.h[5] + sqrdmulh v25.8h, v14.8h, v0.h[4] + sqrdmulh v26.8h, v16.8h, v0.h[5] + sqrdmlsh v25.8h, v29.8h, v4.h[0] + sqrdmlsh v26.8h, v30.8h, v4.h[0] + sshr v25.8h, v25.8h, #1 + sshr v26.8h, v26.8h, #1 + mul v29.8h, v18.8h, v1.h[6] + mul v30.8h, v20.8h, v1.h[7] + sqrdmulh v27.8h, v18.8h, v0.h[6] + sqrdmulh v28.8h, v20.8h, v0.h[7] + sqrdmlsh v27.8h, v29.8h, v4.h[0] + sqrdmlsh v28.8h, v30.8h, v4.h[0] + sshr v27.8h, v27.8h, #1 + sshr v28.8h, v28.8h, #1 + sub v6.8h, v5.8h, v21.8h + add v5.8h, v5.8h, v21.8h + sub v8.8h, v7.8h, v22.8h + add v7.8h, v7.8h, v22.8h + sub v10.8h, v9.8h, v23.8h + add v9.8h, v9.8h, v23.8h + sub 
v12.8h, v11.8h, v24.8h + add v11.8h, v11.8h, v24.8h + sub v14.8h, v13.8h, v25.8h + add v13.8h, v13.8h, v25.8h + sub v16.8h, v15.8h, v26.8h + add v15.8h, v15.8h, v26.8h + sub v18.8h, v17.8h, v27.8h + add v17.8h, v17.8h, v27.8h + sub v20.8h, v19.8h, v28.8h + add v19.8h, v19.8h, v28.8h + str q5, [x0] + str q6, [x0, #32] + str q7, [x0, #64] + str q8, [x0, #96] + str q9, [x0, #128] + str q10, [x0, #160] + str q11, [x0, #192] + str q12, [x0, #224] + str q13, [x1] + str q14, [x1, #32] + str q15, [x1, #64] + str q16, [x1, #96] + str q17, [x1, #128] + str q18, [x1, #160] + str q19, [x1, #192] + str q20, [x1, #224] + ldr q5, [x0, #16] + ldr q6, [x0, #48] + ldr q7, [x0, #80] + ldr q8, [x0, #112] + ldr q9, [x0, #144] + ldr q10, [x0, #176] + ldr q11, [x0, #208] + ldr q12, [x0, #240] + ldr q13, [x1, #16] + ldr q14, [x1, #48] + ldr q15, [x1, #80] + ldr q16, [x1, #112] + ldr q17, [x1, #144] + ldr q18, [x1, #176] + ldr q19, [x1, #208] + ldr q20, [x1, #240] + ldr q0, [x2] + ldr q1, [x3] + mul v29.8h, v13.8h, v1.h[1] + mul v30.8h, v14.8h, v1.h[1] + sqrdmulh v21.8h, v13.8h, v0.h[1] + sqrdmulh v22.8h, v14.8h, v0.h[1] + sqrdmlsh v21.8h, v29.8h, v4.h[0] + sqrdmlsh v22.8h, v30.8h, v4.h[0] + sshr v21.8h, v21.8h, #1 + sshr v22.8h, v22.8h, #1 + mul v29.8h, v15.8h, v1.h[1] + mul v30.8h, v16.8h, v1.h[1] + sqrdmulh v23.8h, v15.8h, v0.h[1] + sqrdmulh v24.8h, v16.8h, v0.h[1] + sqrdmlsh v23.8h, v29.8h, v4.h[0] + sqrdmlsh v24.8h, v30.8h, v4.h[0] + sshr v23.8h, v23.8h, #1 + sshr v24.8h, v24.8h, #1 + mul v29.8h, v17.8h, v1.h[1] + mul v30.8h, v18.8h, v1.h[1] + sqrdmulh v25.8h, v17.8h, v0.h[1] + sqrdmulh v26.8h, v18.8h, v0.h[1] + sqrdmlsh v25.8h, v29.8h, v4.h[0] + sqrdmlsh v26.8h, v30.8h, v4.h[0] + sshr v25.8h, v25.8h, #1 + sshr v26.8h, v26.8h, #1 + mul v29.8h, v19.8h, v1.h[1] + mul v30.8h, v20.8h, v1.h[1] + sqrdmulh v27.8h, v19.8h, v0.h[1] + sqrdmulh v28.8h, v20.8h, v0.h[1] + sqrdmlsh v27.8h, v29.8h, v4.h[0] + sqrdmlsh v28.8h, v30.8h, v4.h[0] + sshr v27.8h, v27.8h, #1 + sshr v28.8h, v28.8h, #1 + sub v13.8h, v5.8h, v21.8h + add v5.8h, v5.8h, v21.8h + sub v14.8h, v6.8h, v22.8h + add v6.8h, v6.8h, v22.8h + sub v15.8h, v7.8h, v23.8h + add v7.8h, v7.8h, v23.8h + sub v16.8h, v8.8h, v24.8h + add v8.8h, v8.8h, v24.8h + sub v17.8h, v9.8h, v25.8h + add v9.8h, v9.8h, v25.8h + sub v18.8h, v10.8h, v26.8h + add v10.8h, v10.8h, v26.8h + sub v19.8h, v11.8h, v27.8h + add v11.8h, v11.8h, v27.8h + sub v20.8h, v12.8h, v28.8h + add v12.8h, v12.8h, v28.8h + mul v29.8h, v9.8h, v1.h[2] + mul v30.8h, v10.8h, v1.h[2] + sqrdmulh v21.8h, v9.8h, v0.h[2] + sqrdmulh v22.8h, v10.8h, v0.h[2] + sqrdmlsh v21.8h, v29.8h, v4.h[0] + sqrdmlsh v22.8h, v30.8h, v4.h[0] + sshr v21.8h, v21.8h, #1 + sshr v22.8h, v22.8h, #1 + mul v29.8h, v11.8h, v1.h[2] + mul v30.8h, v12.8h, v1.h[2] + sqrdmulh v23.8h, v11.8h, v0.h[2] + sqrdmulh v24.8h, v12.8h, v0.h[2] + sqrdmlsh v23.8h, v29.8h, v4.h[0] + sqrdmlsh v24.8h, v30.8h, v4.h[0] + sshr v23.8h, v23.8h, #1 + sshr v24.8h, v24.8h, #1 + mul v29.8h, v17.8h, v1.h[3] + mul v30.8h, v18.8h, v1.h[3] + sqrdmulh v25.8h, v17.8h, v0.h[3] + sqrdmulh v26.8h, v18.8h, v0.h[3] + sqrdmlsh v25.8h, v29.8h, v4.h[0] + sqrdmlsh v26.8h, v30.8h, v4.h[0] + sshr v25.8h, v25.8h, #1 + sshr v26.8h, v26.8h, #1 + mul v29.8h, v19.8h, v1.h[3] + mul v30.8h, v20.8h, v1.h[3] + sqrdmulh v27.8h, v19.8h, v0.h[3] + sqrdmulh v28.8h, v20.8h, v0.h[3] + sqrdmlsh v27.8h, v29.8h, v4.h[0] + sqrdmlsh v28.8h, v30.8h, v4.h[0] + sshr v27.8h, v27.8h, #1 + sshr v28.8h, v28.8h, #1 + sub v9.8h, v5.8h, v21.8h + add v5.8h, v5.8h, v21.8h + sub v10.8h, v6.8h, v22.8h + add v6.8h, v6.8h, 
v22.8h + sub v11.8h, v7.8h, v23.8h + add v7.8h, v7.8h, v23.8h + sub v12.8h, v8.8h, v24.8h + add v8.8h, v8.8h, v24.8h + sub v17.8h, v13.8h, v25.8h + add v13.8h, v13.8h, v25.8h + sub v18.8h, v14.8h, v26.8h + add v14.8h, v14.8h, v26.8h + sub v19.8h, v15.8h, v27.8h + add v15.8h, v15.8h, v27.8h + sub v20.8h, v16.8h, v28.8h + add v16.8h, v16.8h, v28.8h + mul v29.8h, v7.8h, v1.h[4] + mul v30.8h, v8.8h, v1.h[4] + sqrdmulh v21.8h, v7.8h, v0.h[4] + sqrdmulh v22.8h, v8.8h, v0.h[4] + sqrdmlsh v21.8h, v29.8h, v4.h[0] + sqrdmlsh v22.8h, v30.8h, v4.h[0] + sshr v21.8h, v21.8h, #1 + sshr v22.8h, v22.8h, #1 + mul v29.8h, v11.8h, v1.h[5] + mul v30.8h, v12.8h, v1.h[5] + sqrdmulh v23.8h, v11.8h, v0.h[5] + sqrdmulh v24.8h, v12.8h, v0.h[5] + sqrdmlsh v23.8h, v29.8h, v4.h[0] + sqrdmlsh v24.8h, v30.8h, v4.h[0] + sshr v23.8h, v23.8h, #1 + sshr v24.8h, v24.8h, #1 + mul v29.8h, v15.8h, v1.h[6] + mul v30.8h, v16.8h, v1.h[6] + sqrdmulh v25.8h, v15.8h, v0.h[6] + sqrdmulh v26.8h, v16.8h, v0.h[6] + sqrdmlsh v25.8h, v29.8h, v4.h[0] + sqrdmlsh v26.8h, v30.8h, v4.h[0] + sshr v25.8h, v25.8h, #1 + sshr v26.8h, v26.8h, #1 + mul v29.8h, v19.8h, v1.h[7] + mul v30.8h, v20.8h, v1.h[7] + sqrdmulh v27.8h, v19.8h, v0.h[7] + sqrdmulh v28.8h, v20.8h, v0.h[7] + sqrdmlsh v27.8h, v29.8h, v4.h[0] + sqrdmlsh v28.8h, v30.8h, v4.h[0] + sshr v27.8h, v27.8h, #1 + sshr v28.8h, v28.8h, #1 + sub v7.8h, v5.8h, v21.8h + add v5.8h, v5.8h, v21.8h + sub v8.8h, v6.8h, v22.8h + add v6.8h, v6.8h, v22.8h + sub v11.8h, v9.8h, v23.8h + add v9.8h, v9.8h, v23.8h + sub v12.8h, v10.8h, v24.8h + add v10.8h, v10.8h, v24.8h + sub v15.8h, v13.8h, v25.8h + add v13.8h, v13.8h, v25.8h + sub v16.8h, v14.8h, v26.8h + add v14.8h, v14.8h, v26.8h + sub v19.8h, v17.8h, v27.8h + add v17.8h, v17.8h, v27.8h + sub v20.8h, v18.8h, v28.8h + add v18.8h, v18.8h, v28.8h + ldr q0, [x2, #16] + ldr q1, [x3, #16] + mul v29.8h, v6.8h, v1.h[0] + mul v30.8h, v8.8h, v1.h[1] + sqrdmulh v21.8h, v6.8h, v0.h[0] + sqrdmulh v22.8h, v8.8h, v0.h[1] + sqrdmlsh v21.8h, v29.8h, v4.h[0] + sqrdmlsh v22.8h, v30.8h, v4.h[0] + sshr v21.8h, v21.8h, #1 + sshr v22.8h, v22.8h, #1 + mul v29.8h, v10.8h, v1.h[2] + mul v30.8h, v12.8h, v1.h[3] + sqrdmulh v23.8h, v10.8h, v0.h[2] + sqrdmulh v24.8h, v12.8h, v0.h[3] + sqrdmlsh v23.8h, v29.8h, v4.h[0] + sqrdmlsh v24.8h, v30.8h, v4.h[0] + sshr v23.8h, v23.8h, #1 + sshr v24.8h, v24.8h, #1 + mul v29.8h, v14.8h, v1.h[4] + mul v30.8h, v16.8h, v1.h[5] + sqrdmulh v25.8h, v14.8h, v0.h[4] + sqrdmulh v26.8h, v16.8h, v0.h[5] + sqrdmlsh v25.8h, v29.8h, v4.h[0] + sqrdmlsh v26.8h, v30.8h, v4.h[0] + sshr v25.8h, v25.8h, #1 + sshr v26.8h, v26.8h, #1 + mul v29.8h, v18.8h, v1.h[6] + mul v30.8h, v20.8h, v1.h[7] + sqrdmulh v27.8h, v18.8h, v0.h[6] + sqrdmulh v28.8h, v20.8h, v0.h[7] + sqrdmlsh v27.8h, v29.8h, v4.h[0] + sqrdmlsh v28.8h, v30.8h, v4.h[0] + sshr v27.8h, v27.8h, #1 + sshr v28.8h, v28.8h, #1 + sub v6.8h, v5.8h, v21.8h + add v5.8h, v5.8h, v21.8h + sub v8.8h, v7.8h, v22.8h + add v7.8h, v7.8h, v22.8h + sub v10.8h, v9.8h, v23.8h + add v9.8h, v9.8h, v23.8h + sub v12.8h, v11.8h, v24.8h + add v11.8h, v11.8h, v24.8h + sub v14.8h, v13.8h, v25.8h + add v13.8h, v13.8h, v25.8h + sub v16.8h, v15.8h, v26.8h + add v15.8h, v15.8h, v26.8h + sub v18.8h, v17.8h, v27.8h + add v17.8h, v17.8h, v27.8h + sub v20.8h, v19.8h, v28.8h + add v19.8h, v19.8h, v28.8h + str q5, [x0, #16] + str q6, [x0, #48] + str q7, [x0, #80] + str q8, [x0, #112] + str q9, [x0, #144] + str q10, [x0, #176] + str q11, [x0, #208] + str q12, [x0, #240] + str q13, [x1, #16] + str q14, [x1, #48] + str q15, [x1, #80] + str q16, [x1, 
#112] + str q17, [x1, #144] + str q18, [x1, #176] + str q19, [x1, #208] + str q20, [x1, #240] + ldp q5, q6, [x0] + ldp q7, q8, [x0, #32] + ldp q9, q10, [x0, #64] + ldp q11, q12, [x0, #96] + ldp q13, q14, [x0, #128] + ldp q15, q16, [x0, #160] + ldp q17, q18, [x0, #192] + ldp q19, q20, [x0, #224] + ldr q0, [x2, #32] + ldr q1, [x3, #32] + mul v29.8h, v6.8h, v1.h[0] + mul v30.8h, v8.8h, v1.h[1] + sqrdmulh v21.8h, v6.8h, v0.h[0] + sqrdmulh v22.8h, v8.8h, v0.h[1] + sqrdmlsh v21.8h, v29.8h, v4.h[0] + sqrdmlsh v22.8h, v30.8h, v4.h[0] + sshr v21.8h, v21.8h, #1 + sshr v22.8h, v22.8h, #1 + mul v29.8h, v10.8h, v1.h[2] + mul v30.8h, v12.8h, v1.h[3] + sqrdmulh v23.8h, v10.8h, v0.h[2] + sqrdmulh v24.8h, v12.8h, v0.h[3] + sqrdmlsh v23.8h, v29.8h, v4.h[0] + sqrdmlsh v24.8h, v30.8h, v4.h[0] + sshr v23.8h, v23.8h, #1 + sshr v24.8h, v24.8h, #1 + mul v29.8h, v14.8h, v1.h[4] + mul v30.8h, v16.8h, v1.h[5] + sqrdmulh v25.8h, v14.8h, v0.h[4] + sqrdmulh v26.8h, v16.8h, v0.h[5] + sqrdmlsh v25.8h, v29.8h, v4.h[0] + sqrdmlsh v26.8h, v30.8h, v4.h[0] + sshr v25.8h, v25.8h, #1 + sshr v26.8h, v26.8h, #1 + mul v29.8h, v18.8h, v1.h[6] + mul v30.8h, v20.8h, v1.h[7] + sqrdmulh v27.8h, v18.8h, v0.h[6] + sqrdmulh v28.8h, v20.8h, v0.h[7] + sqrdmlsh v27.8h, v29.8h, v4.h[0] + sqrdmlsh v28.8h, v30.8h, v4.h[0] + sshr v27.8h, v27.8h, #1 + sshr v28.8h, v28.8h, #1 + sub v6.8h, v5.8h, v21.8h + add v5.8h, v5.8h, v21.8h + sub v8.8h, v7.8h, v22.8h + add v7.8h, v7.8h, v22.8h + sub v10.8h, v9.8h, v23.8h + add v9.8h, v9.8h, v23.8h + sub v12.8h, v11.8h, v24.8h + add v11.8h, v11.8h, v24.8h + sub v14.8h, v13.8h, v25.8h + add v13.8h, v13.8h, v25.8h + sub v16.8h, v15.8h, v26.8h + add v15.8h, v15.8h, v26.8h + sub v18.8h, v17.8h, v27.8h + add v17.8h, v17.8h, v27.8h + sub v20.8h, v19.8h, v28.8h + add v19.8h, v19.8h, v28.8h + ldr q0, [x2, #64] + ldr q2, [x2, #80] + ldr q1, [x3, #64] + ldr q3, [x3, #80] + mov v29.16b, v5.16b + mov v30.16b, v7.16b + trn1 v5.2d, v5.2d, v6.2d + trn1 v7.2d, v7.2d, v8.2d + trn2 v6.2d, v29.2d, v6.2d + trn2 v8.2d, v30.2d, v8.2d + mul v29.8h, v6.8h, v1.8h + mul v30.8h, v8.8h, v3.8h + sqrdmulh v21.8h, v6.8h, v0.8h + sqrdmulh v22.8h, v8.8h, v2.8h + sqrdmlsh v21.8h, v29.8h, v4.h[0] + sqrdmlsh v22.8h, v30.8h, v4.h[0] + sshr v21.8h, v21.8h, #1 + sshr v22.8h, v22.8h, #1 + ldr q0, [x2, #96] + ldr q2, [x2, #112] + ldr q1, [x3, #96] + ldr q3, [x3, #112] + mov v29.16b, v9.16b + mov v30.16b, v11.16b + trn1 v9.2d, v9.2d, v10.2d + trn1 v11.2d, v11.2d, v12.2d + trn2 v10.2d, v29.2d, v10.2d + trn2 v12.2d, v30.2d, v12.2d + mul v29.8h, v10.8h, v1.8h + mul v30.8h, v12.8h, v3.8h + sqrdmulh v23.8h, v10.8h, v0.8h + sqrdmulh v24.8h, v12.8h, v2.8h + sqrdmlsh v23.8h, v29.8h, v4.h[0] + sqrdmlsh v24.8h, v30.8h, v4.h[0] + sshr v23.8h, v23.8h, #1 + sshr v24.8h, v24.8h, #1 + ldr q0, [x2, #128] + ldr q2, [x2, #144] + ldr q1, [x3, #128] + ldr q3, [x3, #144] + mov v29.16b, v13.16b + mov v30.16b, v15.16b + trn1 v13.2d, v13.2d, v14.2d + trn1 v15.2d, v15.2d, v16.2d + trn2 v14.2d, v29.2d, v14.2d + trn2 v16.2d, v30.2d, v16.2d + mul v29.8h, v14.8h, v1.8h + mul v30.8h, v16.8h, v3.8h + sqrdmulh v25.8h, v14.8h, v0.8h + sqrdmulh v26.8h, v16.8h, v2.8h + sqrdmlsh v25.8h, v29.8h, v4.h[0] + sqrdmlsh v26.8h, v30.8h, v4.h[0] + sshr v25.8h, v25.8h, #1 + sshr v26.8h, v26.8h, #1 + ldr q0, [x2, #160] + ldr q2, [x2, #176] + ldr q1, [x3, #160] + ldr q3, [x3, #176] + mov v29.16b, v17.16b + mov v30.16b, v19.16b + trn1 v17.2d, v17.2d, v18.2d + trn1 v19.2d, v19.2d, v20.2d + trn2 v18.2d, v29.2d, v18.2d + trn2 v20.2d, v30.2d, v20.2d + mul v29.8h, v18.8h, v1.8h + mul v30.8h, v20.8h, 
v3.8h + sqrdmulh v27.8h, v18.8h, v0.8h + sqrdmulh v28.8h, v20.8h, v2.8h + sqrdmlsh v27.8h, v29.8h, v4.h[0] + sqrdmlsh v28.8h, v30.8h, v4.h[0] + sshr v27.8h, v27.8h, #1 + sshr v28.8h, v28.8h, #1 + sub v6.8h, v5.8h, v21.8h + add v5.8h, v5.8h, v21.8h + sub v8.8h, v7.8h, v22.8h + add v7.8h, v7.8h, v22.8h + sub v10.8h, v9.8h, v23.8h + add v9.8h, v9.8h, v23.8h + sub v12.8h, v11.8h, v24.8h + add v11.8h, v11.8h, v24.8h + sub v14.8h, v13.8h, v25.8h + add v13.8h, v13.8h, v25.8h + sub v16.8h, v15.8h, v26.8h + add v15.8h, v15.8h, v26.8h + sub v18.8h, v17.8h, v27.8h + add v17.8h, v17.8h, v27.8h + sub v20.8h, v19.8h, v28.8h + add v19.8h, v19.8h, v28.8h + ldr q0, [x2, #320] + ldr q2, [x2, #336] + ldr q1, [x3, #320] + ldr q3, [x3, #336] + mov v29.16b, v5.16b + mov v30.16b, v7.16b + trn1 v5.4s, v5.4s, v6.4s + trn1 v7.4s, v7.4s, v8.4s + trn2 v6.4s, v29.4s, v6.4s + trn2 v8.4s, v30.4s, v8.4s + mul v29.8h, v6.8h, v1.8h + mul v30.8h, v8.8h, v3.8h + sqrdmulh v21.8h, v6.8h, v0.8h + sqrdmulh v22.8h, v8.8h, v2.8h + sqrdmlsh v21.8h, v29.8h, v4.h[0] + sqrdmlsh v22.8h, v30.8h, v4.h[0] + sshr v21.8h, v21.8h, #1 + sshr v22.8h, v22.8h, #1 + ldr q0, [x2, #352] + ldr q2, [x2, #368] + ldr q1, [x3, #352] + ldr q3, [x3, #368] + mov v29.16b, v9.16b + mov v30.16b, v11.16b + trn1 v9.4s, v9.4s, v10.4s + trn1 v11.4s, v11.4s, v12.4s + trn2 v10.4s, v29.4s, v10.4s + trn2 v12.4s, v30.4s, v12.4s + mul v29.8h, v10.8h, v1.8h + mul v30.8h, v12.8h, v3.8h + sqrdmulh v23.8h, v10.8h, v0.8h + sqrdmulh v24.8h, v12.8h, v2.8h + sqrdmlsh v23.8h, v29.8h, v4.h[0] + sqrdmlsh v24.8h, v30.8h, v4.h[0] + sshr v23.8h, v23.8h, #1 + sshr v24.8h, v24.8h, #1 + ldr q0, [x2, #384] + ldr q2, [x2, #400] + ldr q1, [x3, #384] + ldr q3, [x3, #400] + mov v29.16b, v13.16b + mov v30.16b, v15.16b + trn1 v13.4s, v13.4s, v14.4s + trn1 v15.4s, v15.4s, v16.4s + trn2 v14.4s, v29.4s, v14.4s + trn2 v16.4s, v30.4s, v16.4s + mul v29.8h, v14.8h, v1.8h + mul v30.8h, v16.8h, v3.8h + sqrdmulh v25.8h, v14.8h, v0.8h + sqrdmulh v26.8h, v16.8h, v2.8h + sqrdmlsh v25.8h, v29.8h, v4.h[0] + sqrdmlsh v26.8h, v30.8h, v4.h[0] + sshr v25.8h, v25.8h, #1 + sshr v26.8h, v26.8h, #1 + ldr q0, [x2, #416] + ldr q2, [x2, #432] + ldr q1, [x3, #416] + ldr q3, [x3, #432] + mov v29.16b, v17.16b + mov v30.16b, v19.16b + trn1 v17.4s, v17.4s, v18.4s + trn1 v19.4s, v19.4s, v20.4s + trn2 v18.4s, v29.4s, v18.4s + trn2 v20.4s, v30.4s, v20.4s + mul v29.8h, v18.8h, v1.8h + mul v30.8h, v20.8h, v3.8h + sqrdmulh v27.8h, v18.8h, v0.8h + sqrdmulh v28.8h, v20.8h, v2.8h + sqrdmlsh v27.8h, v29.8h, v4.h[0] + sqrdmlsh v28.8h, v30.8h, v4.h[0] + sshr v27.8h, v27.8h, #1 + sshr v28.8h, v28.8h, #1 + sub v6.8h, v5.8h, v21.8h + add v5.8h, v5.8h, v21.8h + sub v8.8h, v7.8h, v22.8h + add v7.8h, v7.8h, v22.8h + sub v10.8h, v9.8h, v23.8h + add v9.8h, v9.8h, v23.8h + sub v12.8h, v11.8h, v24.8h + add v11.8h, v11.8h, v24.8h + sub v14.8h, v13.8h, v25.8h + add v13.8h, v13.8h, v25.8h + sub v16.8h, v15.8h, v26.8h + add v15.8h, v15.8h, v26.8h + sub v18.8h, v17.8h, v27.8h + add v17.8h, v17.8h, v27.8h + sub v20.8h, v19.8h, v28.8h + add v19.8h, v19.8h, v28.8h + sqdmulh v21.8h, v5.8h, v4.h[2] + sqdmulh v22.8h, v6.8h, v4.h[2] + sshr v21.8h, v21.8h, #11 + sshr v22.8h, v22.8h, #11 + mls v5.8h, v21.8h, v4.h[0] + mls v6.8h, v22.8h, v4.h[0] + sqdmulh v21.8h, v7.8h, v4.h[2] + sqdmulh v22.8h, v8.8h, v4.h[2] + sshr v21.8h, v21.8h, #11 + sshr v22.8h, v22.8h, #11 + mls v7.8h, v21.8h, v4.h[0] + mls v8.8h, v22.8h, v4.h[0] + sqdmulh v21.8h, v9.8h, v4.h[2] + sqdmulh v22.8h, v10.8h, v4.h[2] + sshr v21.8h, v21.8h, #11 + sshr v22.8h, v22.8h, #11 + mls v9.8h, v21.8h, 
v4.h[0] + mls v10.8h, v22.8h, v4.h[0] + sqdmulh v21.8h, v11.8h, v4.h[2] + sqdmulh v22.8h, v12.8h, v4.h[2] + sshr v21.8h, v21.8h, #11 + sshr v22.8h, v22.8h, #11 + mls v11.8h, v21.8h, v4.h[0] + mls v12.8h, v22.8h, v4.h[0] + sqdmulh v21.8h, v13.8h, v4.h[2] + sqdmulh v22.8h, v14.8h, v4.h[2] + sshr v21.8h, v21.8h, #11 + sshr v22.8h, v22.8h, #11 + mls v13.8h, v21.8h, v4.h[0] + mls v14.8h, v22.8h, v4.h[0] + sqdmulh v21.8h, v15.8h, v4.h[2] + sqdmulh v22.8h, v16.8h, v4.h[2] + sshr v21.8h, v21.8h, #11 + sshr v22.8h, v22.8h, #11 + mls v15.8h, v21.8h, v4.h[0] + mls v16.8h, v22.8h, v4.h[0] + sqdmulh v21.8h, v17.8h, v4.h[2] + sqdmulh v22.8h, v18.8h, v4.h[2] + sshr v21.8h, v21.8h, #11 + sshr v22.8h, v22.8h, #11 + mls v17.8h, v21.8h, v4.h[0] + mls v18.8h, v22.8h, v4.h[0] + sqdmulh v21.8h, v19.8h, v4.h[2] + sqdmulh v22.8h, v20.8h, v4.h[2] + sshr v21.8h, v21.8h, #11 + sshr v22.8h, v22.8h, #11 + mls v19.8h, v21.8h, v4.h[0] + mls v20.8h, v22.8h, v4.h[0] + mov v29.16b, v5.16b + trn1 v5.4s, v5.4s, v6.4s + trn2 v6.4s, v29.4s, v6.4s + mov v29.16b, v5.16b + trn1 v5.2d, v5.2d, v6.2d + trn2 v6.2d, v29.2d, v6.2d + mov v29.16b, v7.16b + trn1 v7.4s, v7.4s, v8.4s + trn2 v8.4s, v29.4s, v8.4s + mov v29.16b, v7.16b + trn1 v7.2d, v7.2d, v8.2d + trn2 v8.2d, v29.2d, v8.2d + mov v29.16b, v9.16b + trn1 v9.4s, v9.4s, v10.4s + trn2 v10.4s, v29.4s, v10.4s + mov v29.16b, v9.16b + trn1 v9.2d, v9.2d, v10.2d + trn2 v10.2d, v29.2d, v10.2d + mov v29.16b, v11.16b + trn1 v11.4s, v11.4s, v12.4s + trn2 v12.4s, v29.4s, v12.4s + mov v29.16b, v11.16b + trn1 v11.2d, v11.2d, v12.2d + trn2 v12.2d, v29.2d, v12.2d + mov v29.16b, v13.16b + trn1 v13.4s, v13.4s, v14.4s + trn2 v14.4s, v29.4s, v14.4s + mov v29.16b, v13.16b + trn1 v13.2d, v13.2d, v14.2d + trn2 v14.2d, v29.2d, v14.2d + mov v29.16b, v15.16b + trn1 v15.4s, v15.4s, v16.4s + trn2 v16.4s, v29.4s, v16.4s + mov v29.16b, v15.16b + trn1 v15.2d, v15.2d, v16.2d + trn2 v16.2d, v29.2d, v16.2d + mov v29.16b, v17.16b + trn1 v17.4s, v17.4s, v18.4s + trn2 v18.4s, v29.4s, v18.4s + mov v29.16b, v17.16b + trn1 v17.2d, v17.2d, v18.2d + trn2 v18.2d, v29.2d, v18.2d + mov v29.16b, v19.16b + trn1 v19.4s, v19.4s, v20.4s + trn2 v20.4s, v29.4s, v20.4s + mov v29.16b, v19.16b + trn1 v19.2d, v19.2d, v20.2d + trn2 v20.2d, v29.2d, v20.2d + stp q5, q6, [x0] + stp q7, q8, [x0, #32] + stp q9, q10, [x0, #64] + stp q11, q12, [x0, #96] + stp q13, q14, [x0, #128] + stp q15, q16, [x0, #160] + stp q17, q18, [x0, #192] + stp q19, q20, [x0, #224] + ldp q5, q6, [x1] + ldp q7, q8, [x1, #32] + ldp q9, q10, [x1, #64] + ldp q11, q12, [x1, #96] + ldp q13, q14, [x1, #128] + ldp q15, q16, [x1, #160] + ldp q17, q18, [x1, #192] + ldp q19, q20, [x1, #224] + ldr q0, [x2, #48] + ldr q1, [x3, #48] + mul v29.8h, v6.8h, v1.h[0] + mul v30.8h, v8.8h, v1.h[1] + sqrdmulh v21.8h, v6.8h, v0.h[0] + sqrdmulh v22.8h, v8.8h, v0.h[1] + sqrdmlsh v21.8h, v29.8h, v4.h[0] + sqrdmlsh v22.8h, v30.8h, v4.h[0] + sshr v21.8h, v21.8h, #1 + sshr v22.8h, v22.8h, #1 + mul v29.8h, v10.8h, v1.h[2] + mul v30.8h, v12.8h, v1.h[3] + sqrdmulh v23.8h, v10.8h, v0.h[2] + sqrdmulh v24.8h, v12.8h, v0.h[3] + sqrdmlsh v23.8h, v29.8h, v4.h[0] + sqrdmlsh v24.8h, v30.8h, v4.h[0] + sshr v23.8h, v23.8h, #1 + sshr v24.8h, v24.8h, #1 + mul v29.8h, v14.8h, v1.h[4] + mul v30.8h, v16.8h, v1.h[5] + sqrdmulh v25.8h, v14.8h, v0.h[4] + sqrdmulh v26.8h, v16.8h, v0.h[5] + sqrdmlsh v25.8h, v29.8h, v4.h[0] + sqrdmlsh v26.8h, v30.8h, v4.h[0] + sshr v25.8h, v25.8h, #1 + sshr v26.8h, v26.8h, #1 + mul v29.8h, v18.8h, v1.h[6] + mul v30.8h, v20.8h, v1.h[7] + sqrdmulh v27.8h, v18.8h, v0.h[6] + sqrdmulh 
v28.8h, v20.8h, v0.h[7] + sqrdmlsh v27.8h, v29.8h, v4.h[0] + sqrdmlsh v28.8h, v30.8h, v4.h[0] + sshr v27.8h, v27.8h, #1 + sshr v28.8h, v28.8h, #1 + sub v6.8h, v5.8h, v21.8h + add v5.8h, v5.8h, v21.8h + sub v8.8h, v7.8h, v22.8h + add v7.8h, v7.8h, v22.8h + sub v10.8h, v9.8h, v23.8h + add v9.8h, v9.8h, v23.8h + sub v12.8h, v11.8h, v24.8h + add v11.8h, v11.8h, v24.8h + sub v14.8h, v13.8h, v25.8h + add v13.8h, v13.8h, v25.8h + sub v16.8h, v15.8h, v26.8h + add v15.8h, v15.8h, v26.8h + sub v18.8h, v17.8h, v27.8h + add v17.8h, v17.8h, v27.8h + sub v20.8h, v19.8h, v28.8h + add v19.8h, v19.8h, v28.8h + ldr q0, [x2, #192] + ldr q2, [x2, #208] + ldr q1, [x3, #192] + ldr q3, [x3, #208] + mov v29.16b, v5.16b + mov v30.16b, v7.16b + trn1 v5.2d, v5.2d, v6.2d + trn1 v7.2d, v7.2d, v8.2d + trn2 v6.2d, v29.2d, v6.2d + trn2 v8.2d, v30.2d, v8.2d + mul v29.8h, v6.8h, v1.8h + mul v30.8h, v8.8h, v3.8h + sqrdmulh v21.8h, v6.8h, v0.8h + sqrdmulh v22.8h, v8.8h, v2.8h + sqrdmlsh v21.8h, v29.8h, v4.h[0] + sqrdmlsh v22.8h, v30.8h, v4.h[0] + sshr v21.8h, v21.8h, #1 + sshr v22.8h, v22.8h, #1 + ldr q0, [x2, #224] + ldr q2, [x2, #240] + ldr q1, [x3, #224] + ldr q3, [x3, #240] + mov v29.16b, v9.16b + mov v30.16b, v11.16b + trn1 v9.2d, v9.2d, v10.2d + trn1 v11.2d, v11.2d, v12.2d + trn2 v10.2d, v29.2d, v10.2d + trn2 v12.2d, v30.2d, v12.2d + mul v29.8h, v10.8h, v1.8h + mul v30.8h, v12.8h, v3.8h + sqrdmulh v23.8h, v10.8h, v0.8h + sqrdmulh v24.8h, v12.8h, v2.8h + sqrdmlsh v23.8h, v29.8h, v4.h[0] + sqrdmlsh v24.8h, v30.8h, v4.h[0] + sshr v23.8h, v23.8h, #1 + sshr v24.8h, v24.8h, #1 + ldr q0, [x2, #256] + ldr q2, [x2, #272] + ldr q1, [x3, #256] + ldr q3, [x3, #272] + mov v29.16b, v13.16b + mov v30.16b, v15.16b + trn1 v13.2d, v13.2d, v14.2d + trn1 v15.2d, v15.2d, v16.2d + trn2 v14.2d, v29.2d, v14.2d + trn2 v16.2d, v30.2d, v16.2d + mul v29.8h, v14.8h, v1.8h + mul v30.8h, v16.8h, v3.8h + sqrdmulh v25.8h, v14.8h, v0.8h + sqrdmulh v26.8h, v16.8h, v2.8h + sqrdmlsh v25.8h, v29.8h, v4.h[0] + sqrdmlsh v26.8h, v30.8h, v4.h[0] + sshr v25.8h, v25.8h, #1 + sshr v26.8h, v26.8h, #1 + ldr q0, [x2, #288] + ldr q2, [x2, #304] + ldr q1, [x3, #288] + ldr q3, [x3, #304] + mov v29.16b, v17.16b + mov v30.16b, v19.16b + trn1 v17.2d, v17.2d, v18.2d + trn1 v19.2d, v19.2d, v20.2d + trn2 v18.2d, v29.2d, v18.2d + trn2 v20.2d, v30.2d, v20.2d + mul v29.8h, v18.8h, v1.8h + mul v30.8h, v20.8h, v3.8h + sqrdmulh v27.8h, v18.8h, v0.8h + sqrdmulh v28.8h, v20.8h, v2.8h + sqrdmlsh v27.8h, v29.8h, v4.h[0] + sqrdmlsh v28.8h, v30.8h, v4.h[0] + sshr v27.8h, v27.8h, #1 + sshr v28.8h, v28.8h, #1 + sub v6.8h, v5.8h, v21.8h + add v5.8h, v5.8h, v21.8h + sub v8.8h, v7.8h, v22.8h + add v7.8h, v7.8h, v22.8h + sub v10.8h, v9.8h, v23.8h + add v9.8h, v9.8h, v23.8h + sub v12.8h, v11.8h, v24.8h + add v11.8h, v11.8h, v24.8h + sub v14.8h, v13.8h, v25.8h + add v13.8h, v13.8h, v25.8h + sub v16.8h, v15.8h, v26.8h + add v15.8h, v15.8h, v26.8h + sub v18.8h, v17.8h, v27.8h + add v17.8h, v17.8h, v27.8h + sub v20.8h, v19.8h, v28.8h + add v19.8h, v19.8h, v28.8h + ldr q0, [x2, #448] + ldr q2, [x2, #464] + ldr q1, [x3, #448] + ldr q3, [x3, #464] + mov v29.16b, v5.16b + mov v30.16b, v7.16b + trn1 v5.4s, v5.4s, v6.4s + trn1 v7.4s, v7.4s, v8.4s + trn2 v6.4s, v29.4s, v6.4s + trn2 v8.4s, v30.4s, v8.4s + mul v29.8h, v6.8h, v1.8h + mul v30.8h, v8.8h, v3.8h + sqrdmulh v21.8h, v6.8h, v0.8h + sqrdmulh v22.8h, v8.8h, v2.8h + sqrdmlsh v21.8h, v29.8h, v4.h[0] + sqrdmlsh v22.8h, v30.8h, v4.h[0] + sshr v21.8h, v21.8h, #1 + sshr v22.8h, v22.8h, #1 + ldr q0, [x2, #480] + ldr q2, [x2, #496] + ldr q1, [x3, #480] + 
ldr q3, [x3, #496] + mov v29.16b, v9.16b + mov v30.16b, v11.16b + trn1 v9.4s, v9.4s, v10.4s + trn1 v11.4s, v11.4s, v12.4s + trn2 v10.4s, v29.4s, v10.4s + trn2 v12.4s, v30.4s, v12.4s + mul v29.8h, v10.8h, v1.8h + mul v30.8h, v12.8h, v3.8h + sqrdmulh v23.8h, v10.8h, v0.8h + sqrdmulh v24.8h, v12.8h, v2.8h + sqrdmlsh v23.8h, v29.8h, v4.h[0] + sqrdmlsh v24.8h, v30.8h, v4.h[0] + sshr v23.8h, v23.8h, #1 + sshr v24.8h, v24.8h, #1 + ldr q0, [x2, #512] + ldr q2, [x2, #528] + ldr q1, [x3, #512] + ldr q3, [x3, #528] + mov v29.16b, v13.16b + mov v30.16b, v15.16b + trn1 v13.4s, v13.4s, v14.4s + trn1 v15.4s, v15.4s, v16.4s + trn2 v14.4s, v29.4s, v14.4s + trn2 v16.4s, v30.4s, v16.4s + mul v29.8h, v14.8h, v1.8h + mul v30.8h, v16.8h, v3.8h + sqrdmulh v25.8h, v14.8h, v0.8h + sqrdmulh v26.8h, v16.8h, v2.8h + sqrdmlsh v25.8h, v29.8h, v4.h[0] + sqrdmlsh v26.8h, v30.8h, v4.h[0] + sshr v25.8h, v25.8h, #1 + sshr v26.8h, v26.8h, #1 + ldr q0, [x2, #544] + ldr q2, [x2, #560] + ldr q1, [x3, #544] + ldr q3, [x3, #560] + mov v29.16b, v17.16b + mov v30.16b, v19.16b + trn1 v17.4s, v17.4s, v18.4s + trn1 v19.4s, v19.4s, v20.4s + trn2 v18.4s, v29.4s, v18.4s + trn2 v20.4s, v30.4s, v20.4s + mul v29.8h, v18.8h, v1.8h + mul v30.8h, v20.8h, v3.8h + sqrdmulh v27.8h, v18.8h, v0.8h + sqrdmulh v28.8h, v20.8h, v2.8h + sqrdmlsh v27.8h, v29.8h, v4.h[0] + sqrdmlsh v28.8h, v30.8h, v4.h[0] + sshr v27.8h, v27.8h, #1 + sshr v28.8h, v28.8h, #1 + sub v6.8h, v5.8h, v21.8h + add v5.8h, v5.8h, v21.8h + sub v8.8h, v7.8h, v22.8h + add v7.8h, v7.8h, v22.8h + sub v10.8h, v9.8h, v23.8h + add v9.8h, v9.8h, v23.8h + sub v12.8h, v11.8h, v24.8h + add v11.8h, v11.8h, v24.8h + sub v14.8h, v13.8h, v25.8h + add v13.8h, v13.8h, v25.8h + sub v16.8h, v15.8h, v26.8h + add v15.8h, v15.8h, v26.8h + sub v18.8h, v17.8h, v27.8h + add v17.8h, v17.8h, v27.8h + sub v20.8h, v19.8h, v28.8h + add v19.8h, v19.8h, v28.8h + sqdmulh v21.8h, v5.8h, v4.h[2] + sqdmulh v22.8h, v6.8h, v4.h[2] + sshr v21.8h, v21.8h, #11 + sshr v22.8h, v22.8h, #11 + mls v5.8h, v21.8h, v4.h[0] + mls v6.8h, v22.8h, v4.h[0] + sqdmulh v21.8h, v7.8h, v4.h[2] + sqdmulh v22.8h, v8.8h, v4.h[2] + sshr v21.8h, v21.8h, #11 + sshr v22.8h, v22.8h, #11 + mls v7.8h, v21.8h, v4.h[0] + mls v8.8h, v22.8h, v4.h[0] + sqdmulh v21.8h, v9.8h, v4.h[2] + sqdmulh v22.8h, v10.8h, v4.h[2] + sshr v21.8h, v21.8h, #11 + sshr v22.8h, v22.8h, #11 + mls v9.8h, v21.8h, v4.h[0] + mls v10.8h, v22.8h, v4.h[0] + sqdmulh v21.8h, v11.8h, v4.h[2] + sqdmulh v22.8h, v12.8h, v4.h[2] + sshr v21.8h, v21.8h, #11 + sshr v22.8h, v22.8h, #11 + mls v11.8h, v21.8h, v4.h[0] + mls v12.8h, v22.8h, v4.h[0] + sqdmulh v21.8h, v13.8h, v4.h[2] + sqdmulh v22.8h, v14.8h, v4.h[2] + sshr v21.8h, v21.8h, #11 + sshr v22.8h, v22.8h, #11 + mls v13.8h, v21.8h, v4.h[0] + mls v14.8h, v22.8h, v4.h[0] + sqdmulh v21.8h, v15.8h, v4.h[2] + sqdmulh v22.8h, v16.8h, v4.h[2] + sshr v21.8h, v21.8h, #11 + sshr v22.8h, v22.8h, #11 + mls v15.8h, v21.8h, v4.h[0] + mls v16.8h, v22.8h, v4.h[0] + sqdmulh v21.8h, v17.8h, v4.h[2] + sqdmulh v22.8h, v18.8h, v4.h[2] + sshr v21.8h, v21.8h, #11 + sshr v22.8h, v22.8h, #11 + mls v17.8h, v21.8h, v4.h[0] + mls v18.8h, v22.8h, v4.h[0] + sqdmulh v21.8h, v19.8h, v4.h[2] + sqdmulh v22.8h, v20.8h, v4.h[2] + sshr v21.8h, v21.8h, #11 + sshr v22.8h, v22.8h, #11 + mls v19.8h, v21.8h, v4.h[0] + mls v20.8h, v22.8h, v4.h[0] + mov v29.16b, v5.16b + trn1 v5.4s, v5.4s, v6.4s + trn2 v6.4s, v29.4s, v6.4s + mov v29.16b, v5.16b + trn1 v5.2d, v5.2d, v6.2d + trn2 v6.2d, v29.2d, v6.2d + mov v29.16b, v7.16b + trn1 v7.4s, v7.4s, v8.4s + trn2 v8.4s, v29.4s, v8.4s + mov 
v29.16b, v7.16b + trn1 v7.2d, v7.2d, v8.2d + trn2 v8.2d, v29.2d, v8.2d + mov v29.16b, v9.16b + trn1 v9.4s, v9.4s, v10.4s + trn2 v10.4s, v29.4s, v10.4s + mov v29.16b, v9.16b + trn1 v9.2d, v9.2d, v10.2d + trn2 v10.2d, v29.2d, v10.2d + mov v29.16b, v11.16b + trn1 v11.4s, v11.4s, v12.4s + trn2 v12.4s, v29.4s, v12.4s + mov v29.16b, v11.16b + trn1 v11.2d, v11.2d, v12.2d + trn2 v12.2d, v29.2d, v12.2d + mov v29.16b, v13.16b + trn1 v13.4s, v13.4s, v14.4s + trn2 v14.4s, v29.4s, v14.4s + mov v29.16b, v13.16b + trn1 v13.2d, v13.2d, v14.2d + trn2 v14.2d, v29.2d, v14.2d + mov v29.16b, v15.16b + trn1 v15.4s, v15.4s, v16.4s + trn2 v16.4s, v29.4s, v16.4s + mov v29.16b, v15.16b + trn1 v15.2d, v15.2d, v16.2d + trn2 v16.2d, v29.2d, v16.2d + mov v29.16b, v17.16b + trn1 v17.4s, v17.4s, v18.4s + trn2 v18.4s, v29.4s, v18.4s + mov v29.16b, v17.16b + trn1 v17.2d, v17.2d, v18.2d + trn2 v18.2d, v29.2d, v18.2d + mov v29.16b, v19.16b + trn1 v19.4s, v19.4s, v20.4s + trn2 v20.4s, v29.4s, v20.4s + mov v29.16b, v19.16b + trn1 v19.2d, v19.2d, v20.2d + trn2 v20.2d, v29.2d, v20.2d + stp q5, q6, [x1] + stp q7, q8, [x1, #32] + stp q9, q10, [x1, #64] + stp q11, q12, [x1, #96] + stp q13, q14, [x1, #128] + stp q15, q16, [x1, #160] + stp q17, q18, [x1, #192] + stp q19, q20, [x1, #224] + ldp d8, d9, [x29, #16] + ldp d10, d11, [x29, #32] + ldp d12, d13, [x29, #48] + ldp d14, d15, [x29, #64] + ldp x29, x30, [sp], #0x50 + ret +#ifndef __APPLE__ + .size kyber_ntt,.-kyber_ntt +#endif /* __APPLE__ */ +#ifndef __APPLE__ + .text + .type L_kyber_aarch64_zetas_inv, %object + .section .rodata + .size L_kyber_aarch64_zetas_inv, 576 +#else + .section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ + .align 2 +#else + .p2align 2 +#endif /* __APPLE__ */ +L_kyber_aarch64_zetas_inv: + .short 0x6a5 + .short 0x6a5 + .short 0x70f + .short 0x70f + .short 0x5b4 + .short 0x5b4 + .short 0x943 + .short 0x943 + .short 0x922 + .short 0x922 + .short 0x91d + .short 0x91d + .short 0x134 + .short 0x134 + .short 0x6c + .short 0x6c + .short 0xb23 + .short 0xb23 + .short 0x366 + .short 0x366 + .short 0x356 + .short 0x356 + .short 0x5e6 + .short 0x5e6 + .short 0x9e7 + .short 0x9e7 + .short 0x4fe + .short 0x4fe + .short 0x5fa + .short 0x5fa + .short 0x4a1 + .short 0x4a1 + .short 0x67b + .short 0x67b + .short 0x4a3 + .short 0x4a3 + .short 0xc25 + .short 0xc25 + .short 0x36a + .short 0x36a + .short 0x537 + .short 0x537 + .short 0x83f + .short 0x83f + .short 0x88 + .short 0x88 + .short 0x4bf + .short 0x4bf + .short 0xb81 + .short 0xb81 + .short 0x5b9 + .short 0x5b9 + .short 0x505 + .short 0x505 + .short 0x7d7 + .short 0x7d7 + .short 0xa9f + .short 0xa9f + .short 0xaa6 + .short 0xaa6 + .short 0x8b8 + .short 0x8b8 + .short 0x9d0 + .short 0x9d0 + .short 0x4b + .short 0x4b + .short 0x9c + .short 0x9c + .short 0xbb8 + .short 0xbb8 + .short 0xb5f + .short 0xb5f + .short 0xba4 + .short 0xba4 + .short 0x368 + .short 0x368 + .short 0xa7d + .short 0xa7d + .short 0x636 + .short 0x636 + .short 0x8a2 + .short 0x8a2 + .short 0x25a + .short 0x25a + .short 0x736 + .short 0x736 + .short 0x309 + .short 0x309 + .short 0x93 + .short 0x93 + .short 0x87a + .short 0x87a + .short 0x9f7 + .short 0x9f7 + .short 0xf6 + .short 0xf6 + .short 0x68c + .short 0x68c + .short 0x6db + .short 0x6db + .short 0x1cc + .short 0x1cc + .short 0x123 + .short 0x123 + .short 0xeb + .short 0xeb + .short 0xc50 + .short 0xc50 + .short 0xab6 + .short 0xab6 + .short 0xb5b + .short 0xb5b + .short 0xc98 + .short 0xc98 + .short 0x6f3 + .short 0x6f3 + .short 0x99a + .short 0x99a + .short 0x4e3 + .short 0x4e3 + 
.short 0x9b6 + .short 0x9b6 + .short 0xad6 + .short 0xad6 + .short 0xb53 + .short 0xb53 + .short 0x44f + .short 0x44f + .short 0x4fb + .short 0x4fb + .short 0x4fb + .short 0x4fb + .short 0xa5c + .short 0xa5c + .short 0xa5c + .short 0xa5c + .short 0x429 + .short 0x429 + .short 0x429 + .short 0x429 + .short 0xb41 + .short 0xb41 + .short 0xb41 + .short 0xb41 + .short 0x2d5 + .short 0x2d5 + .short 0x2d5 + .short 0x2d5 + .short 0x5e4 + .short 0x5e4 + .short 0x5e4 + .short 0x5e4 + .short 0x940 + .short 0x940 + .short 0x940 + .short 0x940 + .short 0x18e + .short 0x18e + .short 0x18e + .short 0x18e + .short 0x3b7 + .short 0x3b7 + .short 0x3b7 + .short 0x3b7 + .short 0xf7 + .short 0xf7 + .short 0xf7 + .short 0xf7 + .short 0x58d + .short 0x58d + .short 0x58d + .short 0x58d + .short 0xc96 + .short 0xc96 + .short 0xc96 + .short 0xc96 + .short 0x9c3 + .short 0x9c3 + .short 0x9c3 + .short 0x9c3 + .short 0x10f + .short 0x10f + .short 0x10f + .short 0x10f + .short 0x5a + .short 0x5a + .short 0x5a + .short 0x5a + .short 0x355 + .short 0x355 + .short 0x355 + .short 0x355 + .short 0x744 + .short 0x744 + .short 0x744 + .short 0x744 + .short 0xc83 + .short 0xc83 + .short 0xc83 + .short 0xc83 + .short 0x48a + .short 0x48a + .short 0x48a + .short 0x48a + .short 0x652 + .short 0x652 + .short 0x652 + .short 0x652 + .short 0x29a + .short 0x29a + .short 0x29a + .short 0x29a + .short 0x140 + .short 0x140 + .short 0x140 + .short 0x140 + .short 0x8 + .short 0x8 + .short 0x8 + .short 0x8 + .short 0xafd + .short 0xafd + .short 0xafd + .short 0xafd + .short 0x608 + .short 0x608 + .short 0x608 + .short 0x608 + .short 0x11a + .short 0x11a + .short 0x11a + .short 0x11a + .short 0x72e + .short 0x72e + .short 0x72e + .short 0x72e + .short 0x50d + .short 0x50d + .short 0x50d + .short 0x50d + .short 0x90a + .short 0x90a + .short 0x90a + .short 0x90a + .short 0x228 + .short 0x228 + .short 0x228 + .short 0x228 + .short 0xa75 + .short 0xa75 + .short 0xa75 + .short 0xa75 + .short 0x83a + .short 0x83a + .short 0x83a + .short 0x83a + .short 0x623 + .short 0xcd + .short 0xb66 + .short 0x606 + .short 0xaa1 + .short 0xa25 + .short 0x908 + .short 0x2a9 + .short 0x82 + .short 0x642 + .short 0x74f + .short 0x33d + .short 0xb82 + .short 0xbf9 + .short 0x52d + .short 0xac4 + .short 0x745 + .short 0x5c2 + .short 0x4b2 + .short 0x93f + .short 0xc4b + .short 0x6d8 + .short 0xa93 + .short 0xab + .short 0xc37 + .short 0xbe2 + .short 0x773 + .short 0x72c + .short 0x5ed + .short 0x167 + .short 0x2f6 + .short 0x5a1 +#ifndef __APPLE__ + .text + .type L_kyber_aarch64_zetas_inv_qinv, %object + .section .rodata + .size L_kyber_aarch64_zetas_inv_qinv, 576 +#else + .section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ + .align 2 +#else + .p2align 2 +#endif /* __APPLE__ */ +L_kyber_aarch64_zetas_inv_qinv: + .short 0xa5a5 + .short 0xa5a5 + .short 0x440f + .short 0x440f + .short 0xe1b4 + .short 0xe1b4 + .short 0xa243 + .short 0xa243 + .short 0x4f22 + .short 0x4f22 + .short 0x901d + .short 0x901d + .short 0x5d34 + .short 0x5d34 + .short 0x846c + .short 0x846c + .short 0x4423 + .short 0x4423 + .short 0xd566 + .short 0xd566 + .short 0xa556 + .short 0xa556 + .short 0x57e6 + .short 0x57e6 + .short 0x4ee7 + .short 0x4ee7 + .short 0x1efe + .short 0x1efe + .short 0x53fa + .short 0x53fa + .short 0xd7a1 + .short 0xd7a1 + .short 0xc77b + .short 0xc77b + .short 0xbda3 + .short 0xbda3 + .short 0x2b25 + .short 0x2b25 + .short 0xa16a + .short 0xa16a + .short 0x3a37 + .short 0x3a37 + .short 0xd53f + .short 0xd53f + .short 0x1888 + .short 0x1888 + .short 0x51bf + 
.short 0x51bf + .short 0x7e81 + .short 0x7e81 + .short 0xa0b9 + .short 0xa0b9 + .short 0xc405 + .short 0xc405 + .short 0x1cd7 + .short 0x1cd7 + .short 0xf79f + .short 0xf79f + .short 0x9ca6 + .short 0x9ca6 + .short 0xb0b8 + .short 0xb0b8 + .short 0x79d0 + .short 0x79d0 + .short 0x314b + .short 0x314b + .short 0x149c + .short 0x149c + .short 0xb3b8 + .short 0xb3b8 + .short 0x385f + .short 0x385f + .short 0xb7a4 + .short 0xb7a4 + .short 0xbb68 + .short 0xbb68 + .short 0xb17d + .short 0xb17d + .short 0x4836 + .short 0x4836 + .short 0xcea2 + .short 0xcea2 + .short 0x705a + .short 0x705a + .short 0x4936 + .short 0x4936 + .short 0x8e09 + .short 0x8e09 + .short 0x8993 + .short 0x8993 + .short 0xd67a + .short 0xd67a + .short 0x7ef7 + .short 0x7ef7 + .short 0x82f6 + .short 0x82f6 + .short 0xea8c + .short 0xea8c + .short 0xe7db + .short 0xe7db + .short 0xa5cc + .short 0xa5cc + .short 0x3a23 + .short 0x3a23 + .short 0x11eb + .short 0x11eb + .short 0xfc50 + .short 0xfc50 + .short 0xccb6 + .short 0xccb6 + .short 0x6c5b + .short 0x6c5b + .short 0x5498 + .short 0x5498 + .short 0xaff3 + .short 0xaff3 + .short 0x379a + .short 0x379a + .short 0x7de3 + .short 0x7de3 + .short 0xcbb6 + .short 0xcbb6 + .short 0x2cd6 + .short 0x2cd6 + .short 0xd453 + .short 0xd453 + .short 0x14f + .short 0x14f + .short 0x45fb + .short 0x45fb + .short 0x45fb + .short 0x45fb + .short 0x5e5c + .short 0x5e5c + .short 0x5e5c + .short 0x5e5c + .short 0xef29 + .short 0xef29 + .short 0xef29 + .short 0xef29 + .short 0xbe41 + .short 0xbe41 + .short 0xbe41 + .short 0xbe41 + .short 0x31d5 + .short 0x31d5 + .short 0x31d5 + .short 0x31d5 + .short 0x71e4 + .short 0x71e4 + .short 0x71e4 + .short 0x71e4 + .short 0xc940 + .short 0xc940 + .short 0xc940 + .short 0xc940 + .short 0xcb8e + .short 0xcb8e + .short 0xcb8e + .short 0xcb8e + .short 0xb8b7 + .short 0xb8b7 + .short 0xb8b7 + .short 0xb8b7 + .short 0x75f7 + .short 0x75f7 + .short 0x75f7 + .short 0x75f7 + .short 0xdc8d + .short 0xdc8d + .short 0xdc8d + .short 0xdc8d + .short 0x6e96 + .short 0x6e96 + .short 0x6e96 + .short 0x6e96 + .short 0x22c3 + .short 0x22c3 + .short 0x22c3 + .short 0x22c3 + .short 0x3e0f + .short 0x3e0f + .short 0x3e0f + .short 0x3e0f + .short 0x6e5a + .short 0x6e5a + .short 0x6e5a + .short 0x6e5a + .short 0xb255 + .short 0xb255 + .short 0xb255 + .short 0xb255 + .short 0x9344 + .short 0x9344 + .short 0x9344 + .short 0x9344 + .short 0x6583 + .short 0x6583 + .short 0x6583 + .short 0x6583 + .short 0x28a + .short 0x28a + .short 0x28a + .short 0x28a + .short 0xdc52 + .short 0xdc52 + .short 0xdc52 + .short 0xdc52 + .short 0x309a + .short 0x309a + .short 0x309a + .short 0x309a + .short 0xc140 + .short 0xc140 + .short 0xc140 + .short 0xc140 + .short 0x9808 + .short 0x9808 + .short 0x9808 + .short 0x9808 + .short 0x31fd + .short 0x31fd + .short 0x31fd + .short 0x31fd + .short 0x9e08 + .short 0x9e08 + .short 0x9e08 + .short 0x9e08 + .short 0xaf1a + .short 0xaf1a + .short 0xaf1a + .short 0xaf1a + .short 0xb12e + .short 0xb12e + .short 0xb12e + .short 0xb12e + .short 0x5c0d + .short 0x5c0d + .short 0x5c0d + .short 0x5c0d + .short 0x870a + .short 0x870a + .short 0x870a + .short 0x870a + .short 0xfa28 + .short 0xfa28 + .short 0xfa28 + .short 0xfa28 + .short 0x1975 + .short 0x1975 + .short 0x1975 + .short 0x1975 + .short 0x163a + .short 0x163a + .short 0x163a + .short 0x163a + .short 0x3f23 + .short 0x97cd + .short 0xdd66 + .short 0xb806 + .short 0xdda1 + .short 0x2925 + .short 0xa108 + .short 0x6da9 + .short 0x6682 + .short 0xac42 + .short 0x44f + .short 0xea3d + .short 0x7182 + .short 
0x66f9 + .short 0xbc2d + .short 0x16c4 + .short 0x8645 + .short 0x2bc2 + .short 0xfab2 + .short 0xd63f + .short 0x3d4b + .short 0xed8 + .short 0x9393 + .short 0x51ab + .short 0x4137 + .short 0x91e2 + .short 0x3073 + .short 0xcb2c + .short 0xfced + .short 0xc667 + .short 0x84f6 + .short 0xd8a1 +#ifndef __APPLE__ +.text +.globl kyber_invntt +.type kyber_invntt,@function +.align 2 +kyber_invntt: +#else +.section __TEXT,__text +.globl _kyber_invntt +.p2align 2 +_kyber_invntt: +#endif /* __APPLE__ */ + stp x29, x30, [sp, #-80]! + add x29, sp, #0 + stp d8, d9, [x29, #16] + stp d10, d11, [x29, #32] + stp d12, d13, [x29, #48] + stp d14, d15, [x29, #64] +#ifndef __APPLE__ + adrp x2, L_kyber_aarch64_zetas_inv + add x2, x2, :lo12:L_kyber_aarch64_zetas_inv +#else + adrp x2, L_kyber_aarch64_zetas_inv@PAGE + add x2, x2, :lo12:L_kyber_aarch64_zetas_inv@PAGEOFF +#endif /* __APPLE__ */ +#ifndef __APPLE__ + adrp x3, L_kyber_aarch64_zetas_inv_qinv + add x3, x3, :lo12:L_kyber_aarch64_zetas_inv_qinv +#else + adrp x3, L_kyber_aarch64_zetas_inv_qinv@PAGE + add x3, x3, :lo12:L_kyber_aarch64_zetas_inv_qinv@PAGEOFF +#endif /* __APPLE__ */ +#ifndef __APPLE__ + adrp x4, L_kyber_aarch64_consts + add x4, x4, :lo12:L_kyber_aarch64_consts +#else + adrp x4, L_kyber_aarch64_consts@PAGE + add x4, x4, :lo12:L_kyber_aarch64_consts@PAGEOFF +#endif /* __APPLE__ */ + add x1, x0, #0x100 + ldr q8, [x4] + ldp q9, q10, [x0] + ldp q11, q12, [x0, #32] + ldp q13, q14, [x0, #64] + ldp q15, q16, [x0, #96] + ldp q17, q18, [x0, #128] + ldp q19, q20, [x0, #160] + ldp q21, q22, [x0, #192] + ldp q23, q24, [x0, #224] + mov v25.16b, v9.16b + trn1 v9.2d, v9.2d, v10.2d + trn2 v10.2d, v25.2d, v10.2d + mov v25.16b, v9.16b + trn1 v9.4s, v9.4s, v10.4s + trn2 v10.4s, v25.4s, v10.4s + mov v25.16b, v11.16b + trn1 v11.2d, v11.2d, v12.2d + trn2 v12.2d, v25.2d, v12.2d + mov v25.16b, v11.16b + trn1 v11.4s, v11.4s, v12.4s + trn2 v12.4s, v25.4s, v12.4s + mov v25.16b, v13.16b + trn1 v13.2d, v13.2d, v14.2d + trn2 v14.2d, v25.2d, v14.2d + mov v25.16b, v13.16b + trn1 v13.4s, v13.4s, v14.4s + trn2 v14.4s, v25.4s, v14.4s + mov v25.16b, v15.16b + trn1 v15.2d, v15.2d, v16.2d + trn2 v16.2d, v25.2d, v16.2d + mov v25.16b, v15.16b + trn1 v15.4s, v15.4s, v16.4s + trn2 v16.4s, v25.4s, v16.4s + mov v25.16b, v17.16b + trn1 v17.2d, v17.2d, v18.2d + trn2 v18.2d, v25.2d, v18.2d + mov v25.16b, v17.16b + trn1 v17.4s, v17.4s, v18.4s + trn2 v18.4s, v25.4s, v18.4s + mov v25.16b, v19.16b + trn1 v19.2d, v19.2d, v20.2d + trn2 v20.2d, v25.2d, v20.2d + mov v25.16b, v19.16b + trn1 v19.4s, v19.4s, v20.4s + trn2 v20.4s, v25.4s, v20.4s + mov v25.16b, v21.16b + trn1 v21.2d, v21.2d, v22.2d + trn2 v22.2d, v25.2d, v22.2d + mov v25.16b, v21.16b + trn1 v21.4s, v21.4s, v22.4s + trn2 v22.4s, v25.4s, v22.4s + mov v25.16b, v23.16b + trn1 v23.2d, v23.2d, v24.2d + trn2 v24.2d, v25.2d, v24.2d + mov v25.16b, v23.16b + trn1 v23.4s, v23.4s, v24.4s + trn2 v24.4s, v25.4s, v24.4s + ldr q0, [x2] + ldr q1, [x2, #16] + ldr q2, [x3] + ldr q3, [x3, #16] + sub v26.8h, v9.8h, v10.8h + sub v28.8h, v11.8h, v12.8h + add v9.8h, v9.8h, v10.8h + add v11.8h, v11.8h, v12.8h + mul v25.8h, v26.8h, v2.8h + mul v27.8h, v28.8h, v3.8h + sqrdmulh v10.8h, v26.8h, v0.8h + sqrdmulh v12.8h, v28.8h, v1.8h + sqrdmlsh v10.8h, v25.8h, v8.h[0] + sqrdmlsh v12.8h, v27.8h, v8.h[0] + sshr v10.8h, v10.8h, #1 + sshr v12.8h, v12.8h, #1 + ldr q0, [x2, #32] + ldr q1, [x2, #48] + ldr q2, [x3, #32] + ldr q3, [x3, #48] + sub v26.8h, v13.8h, v14.8h + sub v28.8h, v15.8h, v16.8h + add v13.8h, v13.8h, v14.8h + add v15.8h, v15.8h, v16.8h + mul v25.8h, 
v26.8h, v2.8h + mul v27.8h, v28.8h, v3.8h + sqrdmulh v14.8h, v26.8h, v0.8h + sqrdmulh v16.8h, v28.8h, v1.8h + sqrdmlsh v14.8h, v25.8h, v8.h[0] + sqrdmlsh v16.8h, v27.8h, v8.h[0] + sshr v14.8h, v14.8h, #1 + sshr v16.8h, v16.8h, #1 + ldr q0, [x2, #64] + ldr q1, [x2, #80] + ldr q2, [x3, #64] + ldr q3, [x3, #80] + sub v26.8h, v17.8h, v18.8h + sub v28.8h, v19.8h, v20.8h + add v17.8h, v17.8h, v18.8h + add v19.8h, v19.8h, v20.8h + mul v25.8h, v26.8h, v2.8h + mul v27.8h, v28.8h, v3.8h + sqrdmulh v18.8h, v26.8h, v0.8h + sqrdmulh v20.8h, v28.8h, v1.8h + sqrdmlsh v18.8h, v25.8h, v8.h[0] + sqrdmlsh v20.8h, v27.8h, v8.h[0] + sshr v18.8h, v18.8h, #1 + sshr v20.8h, v20.8h, #1 + ldr q0, [x2, #96] + ldr q1, [x2, #112] + ldr q2, [x3, #96] + ldr q3, [x3, #112] + sub v26.8h, v21.8h, v22.8h + sub v28.8h, v23.8h, v24.8h + add v21.8h, v21.8h, v22.8h + add v23.8h, v23.8h, v24.8h + mul v25.8h, v26.8h, v2.8h + mul v27.8h, v28.8h, v3.8h + sqrdmulh v22.8h, v26.8h, v0.8h + sqrdmulh v24.8h, v28.8h, v1.8h + sqrdmlsh v22.8h, v25.8h, v8.h[0] + sqrdmlsh v24.8h, v27.8h, v8.h[0] + sshr v22.8h, v22.8h, #1 + sshr v24.8h, v24.8h, #1 + ldr q0, [x2, #256] + ldr q1, [x2, #272] + ldr q2, [x3, #256] + ldr q3, [x3, #272] + mov v25.16b, v9.16b + mov v26.16b, v11.16b + trn1 v9.4s, v9.4s, v10.4s + trn1 v11.4s, v11.4s, v12.4s + trn2 v10.4s, v25.4s, v10.4s + trn2 v12.4s, v26.4s, v12.4s + sub v26.8h, v9.8h, v10.8h + sub v28.8h, v11.8h, v12.8h + add v9.8h, v9.8h, v10.8h + add v11.8h, v11.8h, v12.8h + mul v25.8h, v26.8h, v2.8h + mul v27.8h, v28.8h, v3.8h + sqrdmulh v10.8h, v26.8h, v0.8h + sqrdmulh v12.8h, v28.8h, v1.8h + sqrdmlsh v10.8h, v25.8h, v8.h[0] + sqrdmlsh v12.8h, v27.8h, v8.h[0] + sshr v10.8h, v10.8h, #1 + sshr v12.8h, v12.8h, #1 + ldr q0, [x2, #288] + ldr q1, [x2, #304] + ldr q2, [x3, #288] + ldr q3, [x3, #304] + mov v25.16b, v13.16b + mov v26.16b, v15.16b + trn1 v13.4s, v13.4s, v14.4s + trn1 v15.4s, v15.4s, v16.4s + trn2 v14.4s, v25.4s, v14.4s + trn2 v16.4s, v26.4s, v16.4s + sub v26.8h, v13.8h, v14.8h + sub v28.8h, v15.8h, v16.8h + add v13.8h, v13.8h, v14.8h + add v15.8h, v15.8h, v16.8h + mul v25.8h, v26.8h, v2.8h + mul v27.8h, v28.8h, v3.8h + sqrdmulh v14.8h, v26.8h, v0.8h + sqrdmulh v16.8h, v28.8h, v1.8h + sqrdmlsh v14.8h, v25.8h, v8.h[0] + sqrdmlsh v16.8h, v27.8h, v8.h[0] + sshr v14.8h, v14.8h, #1 + sshr v16.8h, v16.8h, #1 + ldr q0, [x2, #320] + ldr q1, [x2, #336] + ldr q2, [x3, #320] + ldr q3, [x3, #336] + mov v25.16b, v17.16b + mov v26.16b, v19.16b + trn1 v17.4s, v17.4s, v18.4s + trn1 v19.4s, v19.4s, v20.4s + trn2 v18.4s, v25.4s, v18.4s + trn2 v20.4s, v26.4s, v20.4s + sub v26.8h, v17.8h, v18.8h + sub v28.8h, v19.8h, v20.8h + add v17.8h, v17.8h, v18.8h + add v19.8h, v19.8h, v20.8h + mul v25.8h, v26.8h, v2.8h + mul v27.8h, v28.8h, v3.8h + sqrdmulh v18.8h, v26.8h, v0.8h + sqrdmulh v20.8h, v28.8h, v1.8h + sqrdmlsh v18.8h, v25.8h, v8.h[0] + sqrdmlsh v20.8h, v27.8h, v8.h[0] + sshr v18.8h, v18.8h, #1 + sshr v20.8h, v20.8h, #1 + ldr q0, [x2, #352] + ldr q1, [x2, #368] + ldr q2, [x3, #352] + ldr q3, [x3, #368] + mov v25.16b, v21.16b + mov v26.16b, v23.16b + trn1 v21.4s, v21.4s, v22.4s + trn1 v23.4s, v23.4s, v24.4s + trn2 v22.4s, v25.4s, v22.4s + trn2 v24.4s, v26.4s, v24.4s + sub v26.8h, v21.8h, v22.8h + sub v28.8h, v23.8h, v24.8h + add v21.8h, v21.8h, v22.8h + add v23.8h, v23.8h, v24.8h + mul v25.8h, v26.8h, v2.8h + mul v27.8h, v28.8h, v3.8h + sqrdmulh v22.8h, v26.8h, v0.8h + sqrdmulh v24.8h, v28.8h, v1.8h + sqrdmlsh v22.8h, v25.8h, v8.h[0] + sqrdmlsh v24.8h, v27.8h, v8.h[0] + sshr v22.8h, v22.8h, #1 + sshr v24.8h, v24.8h, #1 + ldr 
q0, [x2, #512] + ldr q2, [x3, #512] + mov v25.16b, v9.16b + mov v26.16b, v11.16b + trn1 v9.2d, v9.2d, v10.2d + trn1 v11.2d, v11.2d, v12.2d + trn2 v10.2d, v25.2d, v10.2d + trn2 v12.2d, v26.2d, v12.2d + sub v26.8h, v9.8h, v10.8h + sub v28.8h, v11.8h, v12.8h + add v9.8h, v9.8h, v10.8h + add v11.8h, v11.8h, v12.8h + mul v25.8h, v26.8h, v2.h[0] + mul v27.8h, v28.8h, v2.h[1] + sqrdmulh v10.8h, v26.8h, v0.h[0] + sqrdmulh v12.8h, v28.8h, v0.h[1] + sqrdmlsh v10.8h, v25.8h, v8.h[0] + sqrdmlsh v12.8h, v27.8h, v8.h[0] + sshr v10.8h, v10.8h, #1 + sshr v12.8h, v12.8h, #1 + mov v25.16b, v13.16b + mov v26.16b, v15.16b + trn1 v13.2d, v13.2d, v14.2d + trn1 v15.2d, v15.2d, v16.2d + trn2 v14.2d, v25.2d, v14.2d + trn2 v16.2d, v26.2d, v16.2d + sub v26.8h, v13.8h, v14.8h + sub v28.8h, v15.8h, v16.8h + add v13.8h, v13.8h, v14.8h + add v15.8h, v15.8h, v16.8h + mul v25.8h, v26.8h, v2.h[2] + mul v27.8h, v28.8h, v2.h[3] + sqrdmulh v14.8h, v26.8h, v0.h[2] + sqrdmulh v16.8h, v28.8h, v0.h[3] + sqrdmlsh v14.8h, v25.8h, v8.h[0] + sqrdmlsh v16.8h, v27.8h, v8.h[0] + sshr v14.8h, v14.8h, #1 + sshr v16.8h, v16.8h, #1 + mov v25.16b, v17.16b + mov v26.16b, v19.16b + trn1 v17.2d, v17.2d, v18.2d + trn1 v19.2d, v19.2d, v20.2d + trn2 v18.2d, v25.2d, v18.2d + trn2 v20.2d, v26.2d, v20.2d + sub v26.8h, v17.8h, v18.8h + sub v28.8h, v19.8h, v20.8h + add v17.8h, v17.8h, v18.8h + add v19.8h, v19.8h, v20.8h + mul v25.8h, v26.8h, v2.h[4] + mul v27.8h, v28.8h, v2.h[5] + sqrdmulh v18.8h, v26.8h, v0.h[4] + sqrdmulh v20.8h, v28.8h, v0.h[5] + sqrdmlsh v18.8h, v25.8h, v8.h[0] + sqrdmlsh v20.8h, v27.8h, v8.h[0] + sshr v18.8h, v18.8h, #1 + sshr v20.8h, v20.8h, #1 + mov v25.16b, v21.16b + mov v26.16b, v23.16b + trn1 v21.2d, v21.2d, v22.2d + trn1 v23.2d, v23.2d, v24.2d + trn2 v22.2d, v25.2d, v22.2d + trn2 v24.2d, v26.2d, v24.2d + sub v26.8h, v21.8h, v22.8h + sub v28.8h, v23.8h, v24.8h + add v21.8h, v21.8h, v22.8h + add v23.8h, v23.8h, v24.8h + mul v25.8h, v26.8h, v2.h[6] + mul v27.8h, v28.8h, v2.h[7] + sqrdmulh v22.8h, v26.8h, v0.h[6] + sqrdmulh v24.8h, v28.8h, v0.h[7] + sqrdmlsh v22.8h, v25.8h, v8.h[0] + sqrdmlsh v24.8h, v27.8h, v8.h[0] + sshr v22.8h, v22.8h, #1 + sshr v24.8h, v24.8h, #1 + sqdmulh v25.8h, v9.8h, v8.h[2] + sqdmulh v26.8h, v11.8h, v8.h[2] + sshr v25.8h, v25.8h, #11 + sshr v26.8h, v26.8h, #11 + mls v9.8h, v25.8h, v8.h[0] + mls v11.8h, v26.8h, v8.h[0] + sqdmulh v25.8h, v13.8h, v8.h[2] + sqdmulh v26.8h, v15.8h, v8.h[2] + sshr v25.8h, v25.8h, #11 + sshr v26.8h, v26.8h, #11 + mls v13.8h, v25.8h, v8.h[0] + mls v15.8h, v26.8h, v8.h[0] + sqdmulh v25.8h, v17.8h, v8.h[2] + sqdmulh v26.8h, v19.8h, v8.h[2] + sshr v25.8h, v25.8h, #11 + sshr v26.8h, v26.8h, #11 + mls v17.8h, v25.8h, v8.h[0] + mls v19.8h, v26.8h, v8.h[0] + sqdmulh v25.8h, v21.8h, v8.h[2] + sqdmulh v26.8h, v23.8h, v8.h[2] + sshr v25.8h, v25.8h, #11 + sshr v26.8h, v26.8h, #11 + mls v21.8h, v25.8h, v8.h[0] + mls v23.8h, v26.8h, v8.h[0] + stp q9, q10, [x0] + stp q11, q12, [x0, #32] + stp q13, q14, [x0, #64] + stp q15, q16, [x0, #96] + stp q17, q18, [x0, #128] + stp q19, q20, [x0, #160] + stp q21, q22, [x0, #192] + stp q23, q24, [x0, #224] + ldp q9, q10, [x1] + ldp q11, q12, [x1, #32] + ldp q13, q14, [x1, #64] + ldp q15, q16, [x1, #96] + ldp q17, q18, [x1, #128] + ldp q19, q20, [x1, #160] + ldp q21, q22, [x1, #192] + ldp q23, q24, [x1, #224] + mov v25.16b, v9.16b + trn1 v9.2d, v9.2d, v10.2d + trn2 v10.2d, v25.2d, v10.2d + mov v25.16b, v9.16b + trn1 v9.4s, v9.4s, v10.4s + trn2 v10.4s, v25.4s, v10.4s + mov v25.16b, v11.16b + trn1 v11.2d, v11.2d, v12.2d + trn2 v12.2d, v25.2d, v12.2d + mov 
v25.16b, v11.16b + trn1 v11.4s, v11.4s, v12.4s + trn2 v12.4s, v25.4s, v12.4s + mov v25.16b, v13.16b + trn1 v13.2d, v13.2d, v14.2d + trn2 v14.2d, v25.2d, v14.2d + mov v25.16b, v13.16b + trn1 v13.4s, v13.4s, v14.4s + trn2 v14.4s, v25.4s, v14.4s + mov v25.16b, v15.16b + trn1 v15.2d, v15.2d, v16.2d + trn2 v16.2d, v25.2d, v16.2d + mov v25.16b, v15.16b + trn1 v15.4s, v15.4s, v16.4s + trn2 v16.4s, v25.4s, v16.4s + mov v25.16b, v17.16b + trn1 v17.2d, v17.2d, v18.2d + trn2 v18.2d, v25.2d, v18.2d + mov v25.16b, v17.16b + trn1 v17.4s, v17.4s, v18.4s + trn2 v18.4s, v25.4s, v18.4s + mov v25.16b, v19.16b + trn1 v19.2d, v19.2d, v20.2d + trn2 v20.2d, v25.2d, v20.2d + mov v25.16b, v19.16b + trn1 v19.4s, v19.4s, v20.4s + trn2 v20.4s, v25.4s, v20.4s + mov v25.16b, v21.16b + trn1 v21.2d, v21.2d, v22.2d + trn2 v22.2d, v25.2d, v22.2d + mov v25.16b, v21.16b + trn1 v21.4s, v21.4s, v22.4s + trn2 v22.4s, v25.4s, v22.4s + mov v25.16b, v23.16b + trn1 v23.2d, v23.2d, v24.2d + trn2 v24.2d, v25.2d, v24.2d + mov v25.16b, v23.16b + trn1 v23.4s, v23.4s, v24.4s + trn2 v24.4s, v25.4s, v24.4s + ldr q0, [x2, #128] + ldr q1, [x2, #144] + ldr q2, [x3, #128] + ldr q3, [x3, #144] + sub v26.8h, v9.8h, v10.8h + sub v28.8h, v11.8h, v12.8h + add v9.8h, v9.8h, v10.8h + add v11.8h, v11.8h, v12.8h + mul v25.8h, v26.8h, v2.8h + mul v27.8h, v28.8h, v3.8h + sqrdmulh v10.8h, v26.8h, v0.8h + sqrdmulh v12.8h, v28.8h, v1.8h + sqrdmlsh v10.8h, v25.8h, v8.h[0] + sqrdmlsh v12.8h, v27.8h, v8.h[0] + sshr v10.8h, v10.8h, #1 + sshr v12.8h, v12.8h, #1 + ldr q0, [x2, #160] + ldr q1, [x2, #176] + ldr q2, [x3, #160] + ldr q3, [x3, #176] + sub v26.8h, v13.8h, v14.8h + sub v28.8h, v15.8h, v16.8h + add v13.8h, v13.8h, v14.8h + add v15.8h, v15.8h, v16.8h + mul v25.8h, v26.8h, v2.8h + mul v27.8h, v28.8h, v3.8h + sqrdmulh v14.8h, v26.8h, v0.8h + sqrdmulh v16.8h, v28.8h, v1.8h + sqrdmlsh v14.8h, v25.8h, v8.h[0] + sqrdmlsh v16.8h, v27.8h, v8.h[0] + sshr v14.8h, v14.8h, #1 + sshr v16.8h, v16.8h, #1 + ldr q0, [x2, #192] + ldr q1, [x2, #208] + ldr q2, [x3, #192] + ldr q3, [x3, #208] + sub v26.8h, v17.8h, v18.8h + sub v28.8h, v19.8h, v20.8h + add v17.8h, v17.8h, v18.8h + add v19.8h, v19.8h, v20.8h + mul v25.8h, v26.8h, v2.8h + mul v27.8h, v28.8h, v3.8h + sqrdmulh v18.8h, v26.8h, v0.8h + sqrdmulh v20.8h, v28.8h, v1.8h + sqrdmlsh v18.8h, v25.8h, v8.h[0] + sqrdmlsh v20.8h, v27.8h, v8.h[0] + sshr v18.8h, v18.8h, #1 + sshr v20.8h, v20.8h, #1 + ldr q0, [x2, #224] + ldr q1, [x2, #240] + ldr q2, [x3, #224] + ldr q3, [x3, #240] + sub v26.8h, v21.8h, v22.8h + sub v28.8h, v23.8h, v24.8h + add v21.8h, v21.8h, v22.8h + add v23.8h, v23.8h, v24.8h + mul v25.8h, v26.8h, v2.8h + mul v27.8h, v28.8h, v3.8h + sqrdmulh v22.8h, v26.8h, v0.8h + sqrdmulh v24.8h, v28.8h, v1.8h + sqrdmlsh v22.8h, v25.8h, v8.h[0] + sqrdmlsh v24.8h, v27.8h, v8.h[0] + sshr v22.8h, v22.8h, #1 + sshr v24.8h, v24.8h, #1 + ldr q0, [x2, #384] + ldr q1, [x2, #400] + ldr q2, [x3, #384] + ldr q3, [x3, #400] + mov v25.16b, v9.16b + mov v26.16b, v11.16b + trn1 v9.4s, v9.4s, v10.4s + trn1 v11.4s, v11.4s, v12.4s + trn2 v10.4s, v25.4s, v10.4s + trn2 v12.4s, v26.4s, v12.4s + sub v26.8h, v9.8h, v10.8h + sub v28.8h, v11.8h, v12.8h + add v9.8h, v9.8h, v10.8h + add v11.8h, v11.8h, v12.8h + mul v25.8h, v26.8h, v2.8h + mul v27.8h, v28.8h, v3.8h + sqrdmulh v10.8h, v26.8h, v0.8h + sqrdmulh v12.8h, v28.8h, v1.8h + sqrdmlsh v10.8h, v25.8h, v8.h[0] + sqrdmlsh v12.8h, v27.8h, v8.h[0] + sshr v10.8h, v10.8h, #1 + sshr v12.8h, v12.8h, #1 + ldr q0, [x2, #416] + ldr q1, [x2, #432] + ldr q2, [x3, #416] + ldr q3, [x3, #432] + mov v25.16b, 
v13.16b + mov v26.16b, v15.16b + trn1 v13.4s, v13.4s, v14.4s + trn1 v15.4s, v15.4s, v16.4s + trn2 v14.4s, v25.4s, v14.4s + trn2 v16.4s, v26.4s, v16.4s + sub v26.8h, v13.8h, v14.8h + sub v28.8h, v15.8h, v16.8h + add v13.8h, v13.8h, v14.8h + add v15.8h, v15.8h, v16.8h + mul v25.8h, v26.8h, v2.8h + mul v27.8h, v28.8h, v3.8h + sqrdmulh v14.8h, v26.8h, v0.8h + sqrdmulh v16.8h, v28.8h, v1.8h + sqrdmlsh v14.8h, v25.8h, v8.h[0] + sqrdmlsh v16.8h, v27.8h, v8.h[0] + sshr v14.8h, v14.8h, #1 + sshr v16.8h, v16.8h, #1 + ldr q0, [x2, #448] + ldr q1, [x2, #464] + ldr q2, [x3, #448] + ldr q3, [x3, #464] + mov v25.16b, v17.16b + mov v26.16b, v19.16b + trn1 v17.4s, v17.4s, v18.4s + trn1 v19.4s, v19.4s, v20.4s + trn2 v18.4s, v25.4s, v18.4s + trn2 v20.4s, v26.4s, v20.4s + sub v26.8h, v17.8h, v18.8h + sub v28.8h, v19.8h, v20.8h + add v17.8h, v17.8h, v18.8h + add v19.8h, v19.8h, v20.8h + mul v25.8h, v26.8h, v2.8h + mul v27.8h, v28.8h, v3.8h + sqrdmulh v18.8h, v26.8h, v0.8h + sqrdmulh v20.8h, v28.8h, v1.8h + sqrdmlsh v18.8h, v25.8h, v8.h[0] + sqrdmlsh v20.8h, v27.8h, v8.h[0] + sshr v18.8h, v18.8h, #1 + sshr v20.8h, v20.8h, #1 + ldr q0, [x2, #480] + ldr q1, [x2, #496] + ldr q2, [x3, #480] + ldr q3, [x3, #496] + mov v25.16b, v21.16b + mov v26.16b, v23.16b + trn1 v21.4s, v21.4s, v22.4s + trn1 v23.4s, v23.4s, v24.4s + trn2 v22.4s, v25.4s, v22.4s + trn2 v24.4s, v26.4s, v24.4s + sub v26.8h, v21.8h, v22.8h + sub v28.8h, v23.8h, v24.8h + add v21.8h, v21.8h, v22.8h + add v23.8h, v23.8h, v24.8h + mul v25.8h, v26.8h, v2.8h + mul v27.8h, v28.8h, v3.8h + sqrdmulh v22.8h, v26.8h, v0.8h + sqrdmulh v24.8h, v28.8h, v1.8h + sqrdmlsh v22.8h, v25.8h, v8.h[0] + sqrdmlsh v24.8h, v27.8h, v8.h[0] + sshr v22.8h, v22.8h, #1 + sshr v24.8h, v24.8h, #1 + ldr q0, [x2, #528] + ldr q2, [x3, #528] + mov v25.16b, v9.16b + mov v26.16b, v11.16b + trn1 v9.2d, v9.2d, v10.2d + trn1 v11.2d, v11.2d, v12.2d + trn2 v10.2d, v25.2d, v10.2d + trn2 v12.2d, v26.2d, v12.2d + sub v26.8h, v9.8h, v10.8h + sub v28.8h, v11.8h, v12.8h + add v9.8h, v9.8h, v10.8h + add v11.8h, v11.8h, v12.8h + mul v25.8h, v26.8h, v2.h[0] + mul v27.8h, v28.8h, v2.h[1] + sqrdmulh v10.8h, v26.8h, v0.h[0] + sqrdmulh v12.8h, v28.8h, v0.h[1] + sqrdmlsh v10.8h, v25.8h, v8.h[0] + sqrdmlsh v12.8h, v27.8h, v8.h[0] + sshr v10.8h, v10.8h, #1 + sshr v12.8h, v12.8h, #1 + mov v25.16b, v13.16b + mov v26.16b, v15.16b + trn1 v13.2d, v13.2d, v14.2d + trn1 v15.2d, v15.2d, v16.2d + trn2 v14.2d, v25.2d, v14.2d + trn2 v16.2d, v26.2d, v16.2d + sub v26.8h, v13.8h, v14.8h + sub v28.8h, v15.8h, v16.8h + add v13.8h, v13.8h, v14.8h + add v15.8h, v15.8h, v16.8h + mul v25.8h, v26.8h, v2.h[2] + mul v27.8h, v28.8h, v2.h[3] + sqrdmulh v14.8h, v26.8h, v0.h[2] + sqrdmulh v16.8h, v28.8h, v0.h[3] + sqrdmlsh v14.8h, v25.8h, v8.h[0] + sqrdmlsh v16.8h, v27.8h, v8.h[0] + sshr v14.8h, v14.8h, #1 + sshr v16.8h, v16.8h, #1 + mov v25.16b, v17.16b + mov v26.16b, v19.16b + trn1 v17.2d, v17.2d, v18.2d + trn1 v19.2d, v19.2d, v20.2d + trn2 v18.2d, v25.2d, v18.2d + trn2 v20.2d, v26.2d, v20.2d + sub v26.8h, v17.8h, v18.8h + sub v28.8h, v19.8h, v20.8h + add v17.8h, v17.8h, v18.8h + add v19.8h, v19.8h, v20.8h + mul v25.8h, v26.8h, v2.h[4] + mul v27.8h, v28.8h, v2.h[5] + sqrdmulh v18.8h, v26.8h, v0.h[4] + sqrdmulh v20.8h, v28.8h, v0.h[5] + sqrdmlsh v18.8h, v25.8h, v8.h[0] + sqrdmlsh v20.8h, v27.8h, v8.h[0] + sshr v18.8h, v18.8h, #1 + sshr v20.8h, v20.8h, #1 + mov v25.16b, v21.16b + mov v26.16b, v23.16b + trn1 v21.2d, v21.2d, v22.2d + trn1 v23.2d, v23.2d, v24.2d + trn2 v22.2d, v25.2d, v22.2d + trn2 v24.2d, v26.2d, v24.2d + sub v26.8h, 
v21.8h, v22.8h + sub v28.8h, v23.8h, v24.8h + add v21.8h, v21.8h, v22.8h + add v23.8h, v23.8h, v24.8h + mul v25.8h, v26.8h, v2.h[6] + mul v27.8h, v28.8h, v2.h[7] + sqrdmulh v22.8h, v26.8h, v0.h[6] + sqrdmulh v24.8h, v28.8h, v0.h[7] + sqrdmlsh v22.8h, v25.8h, v8.h[0] + sqrdmlsh v24.8h, v27.8h, v8.h[0] + sshr v22.8h, v22.8h, #1 + sshr v24.8h, v24.8h, #1 + sqdmulh v25.8h, v9.8h, v8.h[2] + sqdmulh v26.8h, v11.8h, v8.h[2] + sshr v25.8h, v25.8h, #11 + sshr v26.8h, v26.8h, #11 + mls v9.8h, v25.8h, v8.h[0] + mls v11.8h, v26.8h, v8.h[0] + sqdmulh v25.8h, v13.8h, v8.h[2] + sqdmulh v26.8h, v15.8h, v8.h[2] + sshr v25.8h, v25.8h, #11 + sshr v26.8h, v26.8h, #11 + mls v13.8h, v25.8h, v8.h[0] + mls v15.8h, v26.8h, v8.h[0] + sqdmulh v25.8h, v17.8h, v8.h[2] + sqdmulh v26.8h, v19.8h, v8.h[2] + sshr v25.8h, v25.8h, #11 + sshr v26.8h, v26.8h, #11 + mls v17.8h, v25.8h, v8.h[0] + mls v19.8h, v26.8h, v8.h[0] + sqdmulh v25.8h, v21.8h, v8.h[2] + sqdmulh v26.8h, v23.8h, v8.h[2] + sshr v25.8h, v25.8h, #11 + sshr v26.8h, v26.8h, #11 + mls v21.8h, v25.8h, v8.h[0] + mls v23.8h, v26.8h, v8.h[0] + stp q9, q10, [x1] + stp q11, q12, [x1, #32] + stp q13, q14, [x1, #64] + stp q15, q16, [x1, #96] + stp q17, q18, [x1, #128] + stp q19, q20, [x1, #160] + stp q21, q22, [x1, #192] + stp q23, q24, [x1, #224] + ldr q4, [x2, #544] + ldr q5, [x2, #560] + ldr q6, [x3, #544] + ldr q7, [x3, #560] + ldr q9, [x0] + ldr q10, [x0, #32] + ldr q11, [x0, #64] + ldr q12, [x0, #96] + ldr q13, [x0, #128] + ldr q14, [x0, #160] + ldr q15, [x0, #192] + ldr q16, [x0, #224] + ldr q17, [x1] + ldr q18, [x1, #32] + ldr q19, [x1, #64] + ldr q20, [x1, #96] + ldr q21, [x1, #128] + ldr q22, [x1, #160] + ldr q23, [x1, #192] + ldr q24, [x1, #224] + sub v26.8h, v9.8h, v10.8h + sub v28.8h, v11.8h, v12.8h + add v9.8h, v9.8h, v10.8h + add v11.8h, v11.8h, v12.8h + mul v25.8h, v26.8h, v6.h[0] + mul v27.8h, v28.8h, v6.h[1] + sqrdmulh v10.8h, v26.8h, v4.h[0] + sqrdmulh v12.8h, v28.8h, v4.h[1] + sqrdmlsh v10.8h, v25.8h, v8.h[0] + sqrdmlsh v12.8h, v27.8h, v8.h[0] + sshr v10.8h, v10.8h, #1 + sshr v12.8h, v12.8h, #1 + sub v26.8h, v13.8h, v14.8h + sub v28.8h, v15.8h, v16.8h + add v13.8h, v13.8h, v14.8h + add v15.8h, v15.8h, v16.8h + mul v25.8h, v26.8h, v6.h[2] + mul v27.8h, v28.8h, v6.h[3] + sqrdmulh v14.8h, v26.8h, v4.h[2] + sqrdmulh v16.8h, v28.8h, v4.h[3] + sqrdmlsh v14.8h, v25.8h, v8.h[0] + sqrdmlsh v16.8h, v27.8h, v8.h[0] + sshr v14.8h, v14.8h, #1 + sshr v16.8h, v16.8h, #1 + sub v26.8h, v17.8h, v18.8h + sub v28.8h, v19.8h, v20.8h + add v17.8h, v17.8h, v18.8h + add v19.8h, v19.8h, v20.8h + mul v25.8h, v26.8h, v6.h[4] + mul v27.8h, v28.8h, v6.h[5] + sqrdmulh v18.8h, v26.8h, v4.h[4] + sqrdmulh v20.8h, v28.8h, v4.h[5] + sqrdmlsh v18.8h, v25.8h, v8.h[0] + sqrdmlsh v20.8h, v27.8h, v8.h[0] + sshr v18.8h, v18.8h, #1 + sshr v20.8h, v20.8h, #1 + sub v26.8h, v21.8h, v22.8h + sub v28.8h, v23.8h, v24.8h + add v21.8h, v21.8h, v22.8h + add v23.8h, v23.8h, v24.8h + mul v25.8h, v26.8h, v6.h[6] + mul v27.8h, v28.8h, v6.h[7] + sqrdmulh v22.8h, v26.8h, v4.h[6] + sqrdmulh v24.8h, v28.8h, v4.h[7] + sqrdmlsh v22.8h, v25.8h, v8.h[0] + sqrdmlsh v24.8h, v27.8h, v8.h[0] + sshr v22.8h, v22.8h, #1 + sshr v24.8h, v24.8h, #1 + sub v26.8h, v9.8h, v11.8h + sub v28.8h, v10.8h, v12.8h + add v9.8h, v9.8h, v11.8h + add v10.8h, v10.8h, v12.8h + mul v25.8h, v26.8h, v7.h[0] + mul v27.8h, v28.8h, v7.h[0] + sqrdmulh v11.8h, v26.8h, v5.h[0] + sqrdmulh v12.8h, v28.8h, v5.h[0] + sqrdmlsh v11.8h, v25.8h, v8.h[0] + sqrdmlsh v12.8h, v27.8h, v8.h[0] + sshr v11.8h, v11.8h, #1 + sshr v12.8h, v12.8h, #1 + sub v26.8h, 
v13.8h, v15.8h + sub v28.8h, v14.8h, v16.8h + add v13.8h, v13.8h, v15.8h + add v14.8h, v14.8h, v16.8h + mul v25.8h, v26.8h, v7.h[1] + mul v27.8h, v28.8h, v7.h[1] + sqrdmulh v15.8h, v26.8h, v5.h[1] + sqrdmulh v16.8h, v28.8h, v5.h[1] + sqrdmlsh v15.8h, v25.8h, v8.h[0] + sqrdmlsh v16.8h, v27.8h, v8.h[0] + sshr v15.8h, v15.8h, #1 + sshr v16.8h, v16.8h, #1 + sub v26.8h, v17.8h, v19.8h + sub v28.8h, v18.8h, v20.8h + add v17.8h, v17.8h, v19.8h + add v18.8h, v18.8h, v20.8h + mul v25.8h, v26.8h, v7.h[2] + mul v27.8h, v28.8h, v7.h[2] + sqrdmulh v19.8h, v26.8h, v5.h[2] + sqrdmulh v20.8h, v28.8h, v5.h[2] + sqrdmlsh v19.8h, v25.8h, v8.h[0] + sqrdmlsh v20.8h, v27.8h, v8.h[0] + sshr v19.8h, v19.8h, #1 + sshr v20.8h, v20.8h, #1 + sub v26.8h, v21.8h, v23.8h + sub v28.8h, v22.8h, v24.8h + add v21.8h, v21.8h, v23.8h + add v22.8h, v22.8h, v24.8h + mul v25.8h, v26.8h, v7.h[3] + mul v27.8h, v28.8h, v7.h[3] + sqrdmulh v23.8h, v26.8h, v5.h[3] + sqrdmulh v24.8h, v28.8h, v5.h[3] + sqrdmlsh v23.8h, v25.8h, v8.h[0] + sqrdmlsh v24.8h, v27.8h, v8.h[0] + sshr v23.8h, v23.8h, #1 + sshr v24.8h, v24.8h, #1 + sub v26.8h, v9.8h, v13.8h + sub v28.8h, v10.8h, v14.8h + add v9.8h, v9.8h, v13.8h + add v10.8h, v10.8h, v14.8h + mul v25.8h, v26.8h, v7.h[4] + mul v27.8h, v28.8h, v7.h[4] + sqrdmulh v13.8h, v26.8h, v5.h[4] + sqrdmulh v14.8h, v28.8h, v5.h[4] + sqrdmlsh v13.8h, v25.8h, v8.h[0] + sqrdmlsh v14.8h, v27.8h, v8.h[0] + sshr v13.8h, v13.8h, #1 + sshr v14.8h, v14.8h, #1 + sub v26.8h, v11.8h, v15.8h + sub v28.8h, v12.8h, v16.8h + add v11.8h, v11.8h, v15.8h + add v12.8h, v12.8h, v16.8h + mul v25.8h, v26.8h, v7.h[4] + mul v27.8h, v28.8h, v7.h[4] + sqrdmulh v15.8h, v26.8h, v5.h[4] + sqrdmulh v16.8h, v28.8h, v5.h[4] + sqrdmlsh v15.8h, v25.8h, v8.h[0] + sqrdmlsh v16.8h, v27.8h, v8.h[0] + sshr v15.8h, v15.8h, #1 + sshr v16.8h, v16.8h, #1 + sub v26.8h, v17.8h, v21.8h + sub v28.8h, v18.8h, v22.8h + add v17.8h, v17.8h, v21.8h + add v18.8h, v18.8h, v22.8h + mul v25.8h, v26.8h, v7.h[5] + mul v27.8h, v28.8h, v7.h[5] + sqrdmulh v21.8h, v26.8h, v5.h[5] + sqrdmulh v22.8h, v28.8h, v5.h[5] + sqrdmlsh v21.8h, v25.8h, v8.h[0] + sqrdmlsh v22.8h, v27.8h, v8.h[0] + sshr v21.8h, v21.8h, #1 + sshr v22.8h, v22.8h, #1 + sub v26.8h, v19.8h, v23.8h + sub v28.8h, v20.8h, v24.8h + add v19.8h, v19.8h, v23.8h + add v20.8h, v20.8h, v24.8h + mul v25.8h, v26.8h, v7.h[5] + mul v27.8h, v28.8h, v7.h[5] + sqrdmulh v23.8h, v26.8h, v5.h[5] + sqrdmulh v24.8h, v28.8h, v5.h[5] + sqrdmlsh v23.8h, v25.8h, v8.h[0] + sqrdmlsh v24.8h, v27.8h, v8.h[0] + sshr v23.8h, v23.8h, #1 + sshr v24.8h, v24.8h, #1 + sqdmulh v25.8h, v9.8h, v8.h[2] + sqdmulh v26.8h, v10.8h, v8.h[2] + sshr v25.8h, v25.8h, #11 + sshr v26.8h, v26.8h, #11 + mls v9.8h, v25.8h, v8.h[0] + mls v10.8h, v26.8h, v8.h[0] + sqdmulh v25.8h, v11.8h, v8.h[2] + sqdmulh v26.8h, v12.8h, v8.h[2] + sshr v25.8h, v25.8h, #11 + sshr v26.8h, v26.8h, #11 + mls v11.8h, v25.8h, v8.h[0] + mls v12.8h, v26.8h, v8.h[0] + sqdmulh v25.8h, v17.8h, v8.h[2] + sqdmulh v26.8h, v18.8h, v8.h[2] + sshr v25.8h, v25.8h, #11 + sshr v26.8h, v26.8h, #11 + mls v17.8h, v25.8h, v8.h[0] + mls v18.8h, v26.8h, v8.h[0] + sqdmulh v25.8h, v19.8h, v8.h[2] + sqdmulh v26.8h, v20.8h, v8.h[2] + sshr v25.8h, v25.8h, #11 + sshr v26.8h, v26.8h, #11 + mls v19.8h, v25.8h, v8.h[0] + mls v20.8h, v26.8h, v8.h[0] + sub v26.8h, v9.8h, v17.8h + sub v28.8h, v10.8h, v18.8h + add v9.8h, v9.8h, v17.8h + add v10.8h, v10.8h, v18.8h + mul v25.8h, v26.8h, v7.h[6] + mul v27.8h, v28.8h, v7.h[6] + sqrdmulh v17.8h, v26.8h, v5.h[6] + sqrdmulh v18.8h, v28.8h, v5.h[6] + sqrdmlsh v17.8h, v25.8h, 
v8.h[0] + sqrdmlsh v18.8h, v27.8h, v8.h[0] + sshr v17.8h, v17.8h, #1 + sshr v18.8h, v18.8h, #1 + sub v26.8h, v11.8h, v19.8h + sub v28.8h, v12.8h, v20.8h + add v11.8h, v11.8h, v19.8h + add v12.8h, v12.8h, v20.8h + mul v25.8h, v26.8h, v7.h[6] + mul v27.8h, v28.8h, v7.h[6] + sqrdmulh v19.8h, v26.8h, v5.h[6] + sqrdmulh v20.8h, v28.8h, v5.h[6] + sqrdmlsh v19.8h, v25.8h, v8.h[0] + sqrdmlsh v20.8h, v27.8h, v8.h[0] + sshr v19.8h, v19.8h, #1 + sshr v20.8h, v20.8h, #1 + sub v26.8h, v13.8h, v21.8h + sub v28.8h, v14.8h, v22.8h + add v13.8h, v13.8h, v21.8h + add v14.8h, v14.8h, v22.8h + mul v25.8h, v26.8h, v7.h[6] + mul v27.8h, v28.8h, v7.h[6] + sqrdmulh v21.8h, v26.8h, v5.h[6] + sqrdmulh v22.8h, v28.8h, v5.h[6] + sqrdmlsh v21.8h, v25.8h, v8.h[0] + sqrdmlsh v22.8h, v27.8h, v8.h[0] + sshr v21.8h, v21.8h, #1 + sshr v22.8h, v22.8h, #1 + sub v26.8h, v15.8h, v23.8h + sub v28.8h, v16.8h, v24.8h + add v15.8h, v15.8h, v23.8h + add v16.8h, v16.8h, v24.8h + mul v25.8h, v26.8h, v7.h[6] + mul v27.8h, v28.8h, v7.h[6] + sqrdmulh v23.8h, v26.8h, v5.h[6] + sqrdmulh v24.8h, v28.8h, v5.h[6] + sqrdmlsh v23.8h, v25.8h, v8.h[0] + sqrdmlsh v24.8h, v27.8h, v8.h[0] + sshr v23.8h, v23.8h, #1 + sshr v24.8h, v24.8h, #1 + mul v25.8h, v9.8h, v7.h[7] + mul v26.8h, v10.8h, v7.h[7] + sqrdmulh v9.8h, v9.8h, v5.h[7] + sqrdmulh v10.8h, v10.8h, v5.h[7] + sqrdmlsh v9.8h, v25.8h, v8.h[0] + sqrdmlsh v10.8h, v26.8h, v8.h[0] + sshr v9.8h, v9.8h, #1 + sshr v10.8h, v10.8h, #1 + mul v25.8h, v11.8h, v7.h[7] + mul v26.8h, v12.8h, v7.h[7] + sqrdmulh v11.8h, v11.8h, v5.h[7] + sqrdmulh v12.8h, v12.8h, v5.h[7] + sqrdmlsh v11.8h, v25.8h, v8.h[0] + sqrdmlsh v12.8h, v26.8h, v8.h[0] + sshr v11.8h, v11.8h, #1 + sshr v12.8h, v12.8h, #1 + mul v25.8h, v13.8h, v7.h[7] + mul v26.8h, v14.8h, v7.h[7] + sqrdmulh v13.8h, v13.8h, v5.h[7] + sqrdmulh v14.8h, v14.8h, v5.h[7] + sqrdmlsh v13.8h, v25.8h, v8.h[0] + sqrdmlsh v14.8h, v26.8h, v8.h[0] + sshr v13.8h, v13.8h, #1 + sshr v14.8h, v14.8h, #1 + mul v25.8h, v15.8h, v7.h[7] + mul v26.8h, v16.8h, v7.h[7] + sqrdmulh v15.8h, v15.8h, v5.h[7] + sqrdmulh v16.8h, v16.8h, v5.h[7] + sqrdmlsh v15.8h, v25.8h, v8.h[0] + sqrdmlsh v16.8h, v26.8h, v8.h[0] + sshr v15.8h, v15.8h, #1 + sshr v16.8h, v16.8h, #1 + mul v25.8h, v17.8h, v7.h[7] + mul v26.8h, v18.8h, v7.h[7] + sqrdmulh v17.8h, v17.8h, v5.h[7] + sqrdmulh v18.8h, v18.8h, v5.h[7] + sqrdmlsh v17.8h, v25.8h, v8.h[0] + sqrdmlsh v18.8h, v26.8h, v8.h[0] + sshr v17.8h, v17.8h, #1 + sshr v18.8h, v18.8h, #1 + mul v25.8h, v19.8h, v7.h[7] + mul v26.8h, v20.8h, v7.h[7] + sqrdmulh v19.8h, v19.8h, v5.h[7] + sqrdmulh v20.8h, v20.8h, v5.h[7] + sqrdmlsh v19.8h, v25.8h, v8.h[0] + sqrdmlsh v20.8h, v26.8h, v8.h[0] + sshr v19.8h, v19.8h, #1 + sshr v20.8h, v20.8h, #1 + mul v25.8h, v21.8h, v7.h[7] + mul v26.8h, v22.8h, v7.h[7] + sqrdmulh v21.8h, v21.8h, v5.h[7] + sqrdmulh v22.8h, v22.8h, v5.h[7] + sqrdmlsh v21.8h, v25.8h, v8.h[0] + sqrdmlsh v22.8h, v26.8h, v8.h[0] + sshr v21.8h, v21.8h, #1 + sshr v22.8h, v22.8h, #1 + mul v25.8h, v23.8h, v7.h[7] + mul v26.8h, v24.8h, v7.h[7] + sqrdmulh v23.8h, v23.8h, v5.h[7] + sqrdmulh v24.8h, v24.8h, v5.h[7] + sqrdmlsh v23.8h, v25.8h, v8.h[0] + sqrdmlsh v24.8h, v26.8h, v8.h[0] + sshr v23.8h, v23.8h, #1 + sshr v24.8h, v24.8h, #1 + str q9, [x0] + str q10, [x0, #32] + str q11, [x0, #64] + str q12, [x0, #96] + str q13, [x0, #128] + str q14, [x0, #160] + str q15, [x0, #192] + str q16, [x0, #224] + str q17, [x1] + str q18, [x1, #32] + str q19, [x1, #64] + str q20, [x1, #96] + str q21, [x1, #128] + str q22, [x1, #160] + str q23, [x1, #192] + str q24, [x1, #224] + ldr q9, 
[x0, #16] + ldr q10, [x0, #48] + ldr q11, [x0, #80] + ldr q12, [x0, #112] + ldr q13, [x0, #144] + ldr q14, [x0, #176] + ldr q15, [x0, #208] + ldr q16, [x0, #240] + ldr q17, [x1, #16] + ldr q18, [x1, #48] + ldr q19, [x1, #80] + ldr q20, [x1, #112] + ldr q21, [x1, #144] + ldr q22, [x1, #176] + ldr q23, [x1, #208] + ldr q24, [x1, #240] + sub v26.8h, v9.8h, v10.8h + sub v28.8h, v11.8h, v12.8h + add v9.8h, v9.8h, v10.8h + add v11.8h, v11.8h, v12.8h + mul v25.8h, v26.8h, v6.h[0] + mul v27.8h, v28.8h, v6.h[1] + sqrdmulh v10.8h, v26.8h, v4.h[0] + sqrdmulh v12.8h, v28.8h, v4.h[1] + sqrdmlsh v10.8h, v25.8h, v8.h[0] + sqrdmlsh v12.8h, v27.8h, v8.h[0] + sshr v10.8h, v10.8h, #1 + sshr v12.8h, v12.8h, #1 + sub v26.8h, v13.8h, v14.8h + sub v28.8h, v15.8h, v16.8h + add v13.8h, v13.8h, v14.8h + add v15.8h, v15.8h, v16.8h + mul v25.8h, v26.8h, v6.h[2] + mul v27.8h, v28.8h, v6.h[3] + sqrdmulh v14.8h, v26.8h, v4.h[2] + sqrdmulh v16.8h, v28.8h, v4.h[3] + sqrdmlsh v14.8h, v25.8h, v8.h[0] + sqrdmlsh v16.8h, v27.8h, v8.h[0] + sshr v14.8h, v14.8h, #1 + sshr v16.8h, v16.8h, #1 + sub v26.8h, v17.8h, v18.8h + sub v28.8h, v19.8h, v20.8h + add v17.8h, v17.8h, v18.8h + add v19.8h, v19.8h, v20.8h + mul v25.8h, v26.8h, v6.h[4] + mul v27.8h, v28.8h, v6.h[5] + sqrdmulh v18.8h, v26.8h, v4.h[4] + sqrdmulh v20.8h, v28.8h, v4.h[5] + sqrdmlsh v18.8h, v25.8h, v8.h[0] + sqrdmlsh v20.8h, v27.8h, v8.h[0] + sshr v18.8h, v18.8h, #1 + sshr v20.8h, v20.8h, #1 + sub v26.8h, v21.8h, v22.8h + sub v28.8h, v23.8h, v24.8h + add v21.8h, v21.8h, v22.8h + add v23.8h, v23.8h, v24.8h + mul v25.8h, v26.8h, v6.h[6] + mul v27.8h, v28.8h, v6.h[7] + sqrdmulh v22.8h, v26.8h, v4.h[6] + sqrdmulh v24.8h, v28.8h, v4.h[7] + sqrdmlsh v22.8h, v25.8h, v8.h[0] + sqrdmlsh v24.8h, v27.8h, v8.h[0] + sshr v22.8h, v22.8h, #1 + sshr v24.8h, v24.8h, #1 + sub v26.8h, v9.8h, v11.8h + sub v28.8h, v10.8h, v12.8h + add v9.8h, v9.8h, v11.8h + add v10.8h, v10.8h, v12.8h + mul v25.8h, v26.8h, v7.h[0] + mul v27.8h, v28.8h, v7.h[0] + sqrdmulh v11.8h, v26.8h, v5.h[0] + sqrdmulh v12.8h, v28.8h, v5.h[0] + sqrdmlsh v11.8h, v25.8h, v8.h[0] + sqrdmlsh v12.8h, v27.8h, v8.h[0] + sshr v11.8h, v11.8h, #1 + sshr v12.8h, v12.8h, #1 + sub v26.8h, v13.8h, v15.8h + sub v28.8h, v14.8h, v16.8h + add v13.8h, v13.8h, v15.8h + add v14.8h, v14.8h, v16.8h + mul v25.8h, v26.8h, v7.h[1] + mul v27.8h, v28.8h, v7.h[1] + sqrdmulh v15.8h, v26.8h, v5.h[1] + sqrdmulh v16.8h, v28.8h, v5.h[1] + sqrdmlsh v15.8h, v25.8h, v8.h[0] + sqrdmlsh v16.8h, v27.8h, v8.h[0] + sshr v15.8h, v15.8h, #1 + sshr v16.8h, v16.8h, #1 + sub v26.8h, v17.8h, v19.8h + sub v28.8h, v18.8h, v20.8h + add v17.8h, v17.8h, v19.8h + add v18.8h, v18.8h, v20.8h + mul v25.8h, v26.8h, v7.h[2] + mul v27.8h, v28.8h, v7.h[2] + sqrdmulh v19.8h, v26.8h, v5.h[2] + sqrdmulh v20.8h, v28.8h, v5.h[2] + sqrdmlsh v19.8h, v25.8h, v8.h[0] + sqrdmlsh v20.8h, v27.8h, v8.h[0] + sshr v19.8h, v19.8h, #1 + sshr v20.8h, v20.8h, #1 + sub v26.8h, v21.8h, v23.8h + sub v28.8h, v22.8h, v24.8h + add v21.8h, v21.8h, v23.8h + add v22.8h, v22.8h, v24.8h + mul v25.8h, v26.8h, v7.h[3] + mul v27.8h, v28.8h, v7.h[3] + sqrdmulh v23.8h, v26.8h, v5.h[3] + sqrdmulh v24.8h, v28.8h, v5.h[3] + sqrdmlsh v23.8h, v25.8h, v8.h[0] + sqrdmlsh v24.8h, v27.8h, v8.h[0] + sshr v23.8h, v23.8h, #1 + sshr v24.8h, v24.8h, #1 + sub v26.8h, v9.8h, v13.8h + sub v28.8h, v10.8h, v14.8h + add v9.8h, v9.8h, v13.8h + add v10.8h, v10.8h, v14.8h + mul v25.8h, v26.8h, v7.h[4] + mul v27.8h, v28.8h, v7.h[4] + sqrdmulh v13.8h, v26.8h, v5.h[4] + sqrdmulh v14.8h, v28.8h, v5.h[4] + sqrdmlsh v13.8h, v25.8h, v8.h[0] + 
sqrdmlsh v14.8h, v27.8h, v8.h[0] + sshr v13.8h, v13.8h, #1 + sshr v14.8h, v14.8h, #1 + sub v26.8h, v11.8h, v15.8h + sub v28.8h, v12.8h, v16.8h + add v11.8h, v11.8h, v15.8h + add v12.8h, v12.8h, v16.8h + mul v25.8h, v26.8h, v7.h[4] + mul v27.8h, v28.8h, v7.h[4] + sqrdmulh v15.8h, v26.8h, v5.h[4] + sqrdmulh v16.8h, v28.8h, v5.h[4] + sqrdmlsh v15.8h, v25.8h, v8.h[0] + sqrdmlsh v16.8h, v27.8h, v8.h[0] + sshr v15.8h, v15.8h, #1 + sshr v16.8h, v16.8h, #1 + sub v26.8h, v17.8h, v21.8h + sub v28.8h, v18.8h, v22.8h + add v17.8h, v17.8h, v21.8h + add v18.8h, v18.8h, v22.8h + mul v25.8h, v26.8h, v7.h[5] + mul v27.8h, v28.8h, v7.h[5] + sqrdmulh v21.8h, v26.8h, v5.h[5] + sqrdmulh v22.8h, v28.8h, v5.h[5] + sqrdmlsh v21.8h, v25.8h, v8.h[0] + sqrdmlsh v22.8h, v27.8h, v8.h[0] + sshr v21.8h, v21.8h, #1 + sshr v22.8h, v22.8h, #1 + sub v26.8h, v19.8h, v23.8h + sub v28.8h, v20.8h, v24.8h + add v19.8h, v19.8h, v23.8h + add v20.8h, v20.8h, v24.8h + mul v25.8h, v26.8h, v7.h[5] + mul v27.8h, v28.8h, v7.h[5] + sqrdmulh v23.8h, v26.8h, v5.h[5] + sqrdmulh v24.8h, v28.8h, v5.h[5] + sqrdmlsh v23.8h, v25.8h, v8.h[0] + sqrdmlsh v24.8h, v27.8h, v8.h[0] + sshr v23.8h, v23.8h, #1 + sshr v24.8h, v24.8h, #1 + sqdmulh v25.8h, v9.8h, v8.h[2] + sqdmulh v26.8h, v10.8h, v8.h[2] + sshr v25.8h, v25.8h, #11 + sshr v26.8h, v26.8h, #11 + mls v9.8h, v25.8h, v8.h[0] + mls v10.8h, v26.8h, v8.h[0] + sqdmulh v25.8h, v11.8h, v8.h[2] + sqdmulh v26.8h, v12.8h, v8.h[2] + sshr v25.8h, v25.8h, #11 + sshr v26.8h, v26.8h, #11 + mls v11.8h, v25.8h, v8.h[0] + mls v12.8h, v26.8h, v8.h[0] + sqdmulh v25.8h, v17.8h, v8.h[2] + sqdmulh v26.8h, v18.8h, v8.h[2] + sshr v25.8h, v25.8h, #11 + sshr v26.8h, v26.8h, #11 + mls v17.8h, v25.8h, v8.h[0] + mls v18.8h, v26.8h, v8.h[0] + sqdmulh v25.8h, v19.8h, v8.h[2] + sqdmulh v26.8h, v20.8h, v8.h[2] + sshr v25.8h, v25.8h, #11 + sshr v26.8h, v26.8h, #11 + mls v19.8h, v25.8h, v8.h[0] + mls v20.8h, v26.8h, v8.h[0] + sub v26.8h, v9.8h, v17.8h + sub v28.8h, v10.8h, v18.8h + add v9.8h, v9.8h, v17.8h + add v10.8h, v10.8h, v18.8h + mul v25.8h, v26.8h, v7.h[6] + mul v27.8h, v28.8h, v7.h[6] + sqrdmulh v17.8h, v26.8h, v5.h[6] + sqrdmulh v18.8h, v28.8h, v5.h[6] + sqrdmlsh v17.8h, v25.8h, v8.h[0] + sqrdmlsh v18.8h, v27.8h, v8.h[0] + sshr v17.8h, v17.8h, #1 + sshr v18.8h, v18.8h, #1 + sub v26.8h, v11.8h, v19.8h + sub v28.8h, v12.8h, v20.8h + add v11.8h, v11.8h, v19.8h + add v12.8h, v12.8h, v20.8h + mul v25.8h, v26.8h, v7.h[6] + mul v27.8h, v28.8h, v7.h[6] + sqrdmulh v19.8h, v26.8h, v5.h[6] + sqrdmulh v20.8h, v28.8h, v5.h[6] + sqrdmlsh v19.8h, v25.8h, v8.h[0] + sqrdmlsh v20.8h, v27.8h, v8.h[0] + sshr v19.8h, v19.8h, #1 + sshr v20.8h, v20.8h, #1 + sub v26.8h, v13.8h, v21.8h + sub v28.8h, v14.8h, v22.8h + add v13.8h, v13.8h, v21.8h + add v14.8h, v14.8h, v22.8h + mul v25.8h, v26.8h, v7.h[6] + mul v27.8h, v28.8h, v7.h[6] + sqrdmulh v21.8h, v26.8h, v5.h[6] + sqrdmulh v22.8h, v28.8h, v5.h[6] + sqrdmlsh v21.8h, v25.8h, v8.h[0] + sqrdmlsh v22.8h, v27.8h, v8.h[0] + sshr v21.8h, v21.8h, #1 + sshr v22.8h, v22.8h, #1 + sub v26.8h, v15.8h, v23.8h + sub v28.8h, v16.8h, v24.8h + add v15.8h, v15.8h, v23.8h + add v16.8h, v16.8h, v24.8h + mul v25.8h, v26.8h, v7.h[6] + mul v27.8h, v28.8h, v7.h[6] + sqrdmulh v23.8h, v26.8h, v5.h[6] + sqrdmulh v24.8h, v28.8h, v5.h[6] + sqrdmlsh v23.8h, v25.8h, v8.h[0] + sqrdmlsh v24.8h, v27.8h, v8.h[0] + sshr v23.8h, v23.8h, #1 + sshr v24.8h, v24.8h, #1 + mul v25.8h, v9.8h, v7.h[7] + mul v26.8h, v10.8h, v7.h[7] + sqrdmulh v9.8h, v9.8h, v5.h[7] + sqrdmulh v10.8h, v10.8h, v5.h[7] + sqrdmlsh v9.8h, v25.8h, v8.h[0] + 
sqrdmlsh v10.8h, v26.8h, v8.h[0] + sshr v9.8h, v9.8h, #1 + sshr v10.8h, v10.8h, #1 + mul v25.8h, v11.8h, v7.h[7] + mul v26.8h, v12.8h, v7.h[7] + sqrdmulh v11.8h, v11.8h, v5.h[7] + sqrdmulh v12.8h, v12.8h, v5.h[7] + sqrdmlsh v11.8h, v25.8h, v8.h[0] + sqrdmlsh v12.8h, v26.8h, v8.h[0] + sshr v11.8h, v11.8h, #1 + sshr v12.8h, v12.8h, #1 + mul v25.8h, v13.8h, v7.h[7] + mul v26.8h, v14.8h, v7.h[7] + sqrdmulh v13.8h, v13.8h, v5.h[7] + sqrdmulh v14.8h, v14.8h, v5.h[7] + sqrdmlsh v13.8h, v25.8h, v8.h[0] + sqrdmlsh v14.8h, v26.8h, v8.h[0] + sshr v13.8h, v13.8h, #1 + sshr v14.8h, v14.8h, #1 + mul v25.8h, v15.8h, v7.h[7] + mul v26.8h, v16.8h, v7.h[7] + sqrdmulh v15.8h, v15.8h, v5.h[7] + sqrdmulh v16.8h, v16.8h, v5.h[7] + sqrdmlsh v15.8h, v25.8h, v8.h[0] + sqrdmlsh v16.8h, v26.8h, v8.h[0] + sshr v15.8h, v15.8h, #1 + sshr v16.8h, v16.8h, #1 + mul v25.8h, v17.8h, v7.h[7] + mul v26.8h, v18.8h, v7.h[7] + sqrdmulh v17.8h, v17.8h, v5.h[7] + sqrdmulh v18.8h, v18.8h, v5.h[7] + sqrdmlsh v17.8h, v25.8h, v8.h[0] + sqrdmlsh v18.8h, v26.8h, v8.h[0] + sshr v17.8h, v17.8h, #1 + sshr v18.8h, v18.8h, #1 + mul v25.8h, v19.8h, v7.h[7] + mul v26.8h, v20.8h, v7.h[7] + sqrdmulh v19.8h, v19.8h, v5.h[7] + sqrdmulh v20.8h, v20.8h, v5.h[7] + sqrdmlsh v19.8h, v25.8h, v8.h[0] + sqrdmlsh v20.8h, v26.8h, v8.h[0] + sshr v19.8h, v19.8h, #1 + sshr v20.8h, v20.8h, #1 + mul v25.8h, v21.8h, v7.h[7] + mul v26.8h, v22.8h, v7.h[7] + sqrdmulh v21.8h, v21.8h, v5.h[7] + sqrdmulh v22.8h, v22.8h, v5.h[7] + sqrdmlsh v21.8h, v25.8h, v8.h[0] + sqrdmlsh v22.8h, v26.8h, v8.h[0] + sshr v21.8h, v21.8h, #1 + sshr v22.8h, v22.8h, #1 + mul v25.8h, v23.8h, v7.h[7] + mul v26.8h, v24.8h, v7.h[7] + sqrdmulh v23.8h, v23.8h, v5.h[7] + sqrdmulh v24.8h, v24.8h, v5.h[7] + sqrdmlsh v23.8h, v25.8h, v8.h[0] + sqrdmlsh v24.8h, v26.8h, v8.h[0] + sshr v23.8h, v23.8h, #1 + sshr v24.8h, v24.8h, #1 + str q9, [x0, #16] + str q10, [x0, #48] + str q11, [x0, #80] + str q12, [x0, #112] + str q13, [x0, #144] + str q14, [x0, #176] + str q15, [x0, #208] + str q16, [x0, #240] + str q17, [x1, #16] + str q18, [x1, #48] + str q19, [x1, #80] + str q20, [x1, #112] + str q21, [x1, #144] + str q22, [x1, #176] + str q23, [x1, #208] + str q24, [x1, #240] + ldp d8, d9, [x29, #16] + ldp d10, d11, [x29, #32] + ldp d12, d13, [x29, #48] + ldp d14, d15, [x29, #64] + ldp x29, x30, [sp], #0x50 + ret +#ifndef __APPLE__ + .size kyber_invntt,.-kyber_invntt +#endif /* __APPLE__ */ +#ifndef __APPLE__ + .text + .type L_kyber_aarch64_zetas_mul, %object + .section .rodata + .size L_kyber_aarch64_zetas_mul, 256 +#else + .section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ + .align 2 +#else + .p2align 2 +#endif /* __APPLE__ */ +L_kyber_aarch64_zetas_mul: + .short 0x8b2 + .short 0xf74e + .short 0x1ae + .short 0xfe52 + .short 0x22b + .short 0xfdd5 + .short 0x34b + .short 0xfcb5 + .short 0x81e + .short 0xf7e2 + .short 0x367 + .short 0xfc99 + .short 0x60e + .short 0xf9f2 + .short 0x69 + .short 0xff97 + .short 0x1a6 + .short 0xfe5a + .short 0x24b + .short 0xfdb5 + .short 0xb1 + .short 0xff4f + .short 0xc16 + .short 0xf3ea + .short 0xbde + .short 0xf422 + .short 0xb35 + .short 0xf4cb + .short 0x626 + .short 0xf9da + .short 0x675 + .short 0xf98b + .short 0xc0b + .short 0xf3f5 + .short 0x30a + .short 0xfcf6 + .short 0x487 + .short 0xfb79 + .short 0xc6e + .short 0xf392 + .short 0x9f8 + .short 0xf608 + .short 0x5cb + .short 0xfa35 + .short 0xaa7 + .short 0xf559 + .short 0x45f + .short 0xfba1 + .short 0x6cb + .short 0xf935 + .short 0x284 + .short 0xfd7c + .short 0x999 + .short 0xf667 + .short 0x15d + 
.short 0xfea3 + .short 0x1a2 + .short 0xfe5e + .short 0x149 + .short 0xfeb7 + .short 0xc65 + .short 0xf39b + .short 0xcb6 + .short 0xf34a + .short 0x331 + .short 0xfccf + .short 0x449 + .short 0xfbb7 + .short 0x25b + .short 0xfda5 + .short 0x262 + .short 0xfd9e + .short 0x52a + .short 0xfad6 + .short 0x7fc + .short 0xf804 + .short 0x748 + .short 0xf8b8 + .short 0x180 + .short 0xfe80 + .short 0x842 + .short 0xf7be + .short 0xc79 + .short 0xf387 + .short 0x4c2 + .short 0xfb3e + .short 0x7ca + .short 0xf836 + .short 0x997 + .short 0xf669 + .short 0xdc + .short 0xff24 + .short 0x85e + .short 0xf7a2 + .short 0x686 + .short 0xf97a + .short 0x860 + .short 0xf7a0 + .short 0x707 + .short 0xf8f9 + .short 0x803 + .short 0xf7fd + .short 0x31a + .short 0xfce6 + .short 0x71b + .short 0xf8e5 + .short 0x9ab + .short 0xf655 + .short 0x99b + .short 0xf665 + .short 0x1de + .short 0xfe22 + .short 0xc95 + .short 0xf36b + .short 0xbcd + .short 0xf433 + .short 0x3e4 + .short 0xfc1c + .short 0x3df + .short 0xfc21 + .short 0x3be + .short 0xfc42 + .short 0x74d + .short 0xf8b3 + .short 0x5f2 + .short 0xfa0e + .short 0x65c + .short 0xf9a4 +#ifndef __APPLE__ +.text +.globl kyber_basemul_mont +.type kyber_basemul_mont,@function +.align 2 +kyber_basemul_mont: +#else +.section __TEXT,__text +.globl _kyber_basemul_mont +.p2align 2 +_kyber_basemul_mont: +#endif /* __APPLE__ */ + stp x29, x30, [sp, #-80]! + add x29, sp, #0 + stp d8, d9, [x29, #16] + stp d10, d11, [x29, #32] + stp d12, d13, [x29, #48] + stp d14, d15, [x29, #64] +#ifndef __APPLE__ + adrp x3, L_kyber_aarch64_zetas_mul + add x3, x3, :lo12:L_kyber_aarch64_zetas_mul +#else + adrp x3, L_kyber_aarch64_zetas_mul@PAGE + add x3, x3, :lo12:L_kyber_aarch64_zetas_mul@PAGEOFF +#endif /* __APPLE__ */ +#ifndef __APPLE__ + adrp x4, L_kyber_aarch64_consts + add x4, x4, :lo12:L_kyber_aarch64_consts +#else + adrp x4, L_kyber_aarch64_consts@PAGE + add x4, x4, :lo12:L_kyber_aarch64_consts@PAGEOFF +#endif /* __APPLE__ */ + ldr q1, [x4] + ldp q2, q3, [x1] + ldp q4, q5, [x1, #32] + ldp q6, q7, [x1, #64] + ldp q8, q9, [x1, #96] + ldp q10, q11, [x2] + ldp q12, q13, [x2, #32] + ldp q14, q15, [x2, #64] + ldp q16, q17, [x2, #96] + ldr q0, [x3] + uzp1 v18.8h, v2.8h, v3.8h + uzp2 v19.8h, v2.8h, v3.8h + uzp1 v20.8h, v10.8h, v11.8h + uzp2 v21.8h, v10.8h, v11.8h + smull v26.4s, v18.4h, v20.4h + smull2 v27.4s, v18.8h, v20.8h + smull v23.4s, v19.4h, v21.4h + smull2 v24.4s, v19.8h, v21.8h + xtn v25.4h, v23.4s + xtn2 v25.8h, v24.4s + mul v25.8h, v25.8h, v1.h[1] + smlsl v23.4s, v25.4h, v1.h[0] + smlsl2 v24.4s, v25.8h, v1.h[0] + shrn v22.4h, v23.4s, #16 + shrn2 v22.8h, v24.4s, #16 + smlal v26.4s, v22.4h, v0.4h + smlal2 v27.4s, v22.8h, v0.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v22.4h, v26.4s, #16 + shrn2 v22.8h, v27.4s, #16 + smull v26.4s, v18.4h, v21.4h + smull2 v27.4s, v18.8h, v21.8h + smlal v26.4s, v19.4h, v20.4h + smlal2 v27.4s, v19.8h, v20.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v23.4h, v26.4s, #16 + shrn2 v23.8h, v27.4s, #16 + zip1 v24.8h, v22.8h, v23.8h + zip2 v25.8h, v22.8h, v23.8h + stp q24, q25, [x0] + ldr q0, [x3, #16] + uzp1 v18.8h, v4.8h, v5.8h + uzp2 v19.8h, v4.8h, v5.8h + uzp1 v20.8h, v12.8h, v13.8h + uzp2 v21.8h, v12.8h, v13.8h + smull v26.4s, v18.4h, v20.4h + smull2 v27.4s, v18.8h, v20.8h + smull v23.4s, v19.4h, v21.4h + smull2 v24.4s, v19.8h, v21.8h + xtn v25.4h, 
v23.4s + xtn2 v25.8h, v24.4s + mul v25.8h, v25.8h, v1.h[1] + smlsl v23.4s, v25.4h, v1.h[0] + smlsl2 v24.4s, v25.8h, v1.h[0] + shrn v22.4h, v23.4s, #16 + shrn2 v22.8h, v24.4s, #16 + smlal v26.4s, v22.4h, v0.4h + smlal2 v27.4s, v22.8h, v0.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v22.4h, v26.4s, #16 + shrn2 v22.8h, v27.4s, #16 + smull v26.4s, v18.4h, v21.4h + smull2 v27.4s, v18.8h, v21.8h + smlal v26.4s, v19.4h, v20.4h + smlal2 v27.4s, v19.8h, v20.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v23.4h, v26.4s, #16 + shrn2 v23.8h, v27.4s, #16 + zip1 v24.8h, v22.8h, v23.8h + zip2 v25.8h, v22.8h, v23.8h + stp q24, q25, [x0, #32] + ldr q0, [x3, #32] + uzp1 v18.8h, v6.8h, v7.8h + uzp2 v19.8h, v6.8h, v7.8h + uzp1 v20.8h, v14.8h, v15.8h + uzp2 v21.8h, v14.8h, v15.8h + smull v26.4s, v18.4h, v20.4h + smull2 v27.4s, v18.8h, v20.8h + smull v23.4s, v19.4h, v21.4h + smull2 v24.4s, v19.8h, v21.8h + xtn v25.4h, v23.4s + xtn2 v25.8h, v24.4s + mul v25.8h, v25.8h, v1.h[1] + smlsl v23.4s, v25.4h, v1.h[0] + smlsl2 v24.4s, v25.8h, v1.h[0] + shrn v22.4h, v23.4s, #16 + shrn2 v22.8h, v24.4s, #16 + smlal v26.4s, v22.4h, v0.4h + smlal2 v27.4s, v22.8h, v0.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v22.4h, v26.4s, #16 + shrn2 v22.8h, v27.4s, #16 + smull v26.4s, v18.4h, v21.4h + smull2 v27.4s, v18.8h, v21.8h + smlal v26.4s, v19.4h, v20.4h + smlal2 v27.4s, v19.8h, v20.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v23.4h, v26.4s, #16 + shrn2 v23.8h, v27.4s, #16 + zip1 v24.8h, v22.8h, v23.8h + zip2 v25.8h, v22.8h, v23.8h + stp q24, q25, [x0, #64] + ldr q0, [x3, #48] + uzp1 v18.8h, v8.8h, v9.8h + uzp2 v19.8h, v8.8h, v9.8h + uzp1 v20.8h, v16.8h, v17.8h + uzp2 v21.8h, v16.8h, v17.8h + smull v26.4s, v18.4h, v20.4h + smull2 v27.4s, v18.8h, v20.8h + smull v23.4s, v19.4h, v21.4h + smull2 v24.4s, v19.8h, v21.8h + xtn v25.4h, v23.4s + xtn2 v25.8h, v24.4s + mul v25.8h, v25.8h, v1.h[1] + smlsl v23.4s, v25.4h, v1.h[0] + smlsl2 v24.4s, v25.8h, v1.h[0] + shrn v22.4h, v23.4s, #16 + shrn2 v22.8h, v24.4s, #16 + smlal v26.4s, v22.4h, v0.4h + smlal2 v27.4s, v22.8h, v0.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v22.4h, v26.4s, #16 + shrn2 v22.8h, v27.4s, #16 + smull v26.4s, v18.4h, v21.4h + smull2 v27.4s, v18.8h, v21.8h + smlal v26.4s, v19.4h, v20.4h + smlal2 v27.4s, v19.8h, v20.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v23.4h, v26.4s, #16 + shrn2 v23.8h, v27.4s, #16 + zip1 v24.8h, v22.8h, v23.8h + zip2 v25.8h, v22.8h, v23.8h + stp q24, q25, [x0, #96] + ldp q2, q3, [x1, #128] + ldp q4, q5, [x1, #160] + ldp q6, q7, [x1, #192] + ldp q8, q9, [x1, #224] + ldp q10, q11, [x2, #128] + ldp q12, q13, [x2, #160] + ldp q14, q15, [x2, #192] + ldp q16, q17, [x2, #224] + ldr q0, [x3, #64] + uzp1 v18.8h, v2.8h, v3.8h + uzp2 v19.8h, v2.8h, v3.8h + uzp1 v20.8h, v10.8h, v11.8h + uzp2 v21.8h, v10.8h, v11.8h + smull v26.4s, v18.4h, v20.4h + smull2 v27.4s, v18.8h, v20.8h + smull v23.4s, v19.4h, v21.4h + smull2 v24.4s, v19.8h, v21.8h + xtn v25.4h, 
v23.4s + xtn2 v25.8h, v24.4s + mul v25.8h, v25.8h, v1.h[1] + smlsl v23.4s, v25.4h, v1.h[0] + smlsl2 v24.4s, v25.8h, v1.h[0] + shrn v22.4h, v23.4s, #16 + shrn2 v22.8h, v24.4s, #16 + smlal v26.4s, v22.4h, v0.4h + smlal2 v27.4s, v22.8h, v0.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v22.4h, v26.4s, #16 + shrn2 v22.8h, v27.4s, #16 + smull v26.4s, v18.4h, v21.4h + smull2 v27.4s, v18.8h, v21.8h + smlal v26.4s, v19.4h, v20.4h + smlal2 v27.4s, v19.8h, v20.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v23.4h, v26.4s, #16 + shrn2 v23.8h, v27.4s, #16 + zip1 v24.8h, v22.8h, v23.8h + zip2 v25.8h, v22.8h, v23.8h + stp q24, q25, [x0, #128] + ldr q0, [x3, #80] + uzp1 v18.8h, v4.8h, v5.8h + uzp2 v19.8h, v4.8h, v5.8h + uzp1 v20.8h, v12.8h, v13.8h + uzp2 v21.8h, v12.8h, v13.8h + smull v26.4s, v18.4h, v20.4h + smull2 v27.4s, v18.8h, v20.8h + smull v23.4s, v19.4h, v21.4h + smull2 v24.4s, v19.8h, v21.8h + xtn v25.4h, v23.4s + xtn2 v25.8h, v24.4s + mul v25.8h, v25.8h, v1.h[1] + smlsl v23.4s, v25.4h, v1.h[0] + smlsl2 v24.4s, v25.8h, v1.h[0] + shrn v22.4h, v23.4s, #16 + shrn2 v22.8h, v24.4s, #16 + smlal v26.4s, v22.4h, v0.4h + smlal2 v27.4s, v22.8h, v0.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v22.4h, v26.4s, #16 + shrn2 v22.8h, v27.4s, #16 + smull v26.4s, v18.4h, v21.4h + smull2 v27.4s, v18.8h, v21.8h + smlal v26.4s, v19.4h, v20.4h + smlal2 v27.4s, v19.8h, v20.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v23.4h, v26.4s, #16 + shrn2 v23.8h, v27.4s, #16 + zip1 v24.8h, v22.8h, v23.8h + zip2 v25.8h, v22.8h, v23.8h + stp q24, q25, [x0, #160] + ldr q0, [x3, #96] + uzp1 v18.8h, v6.8h, v7.8h + uzp2 v19.8h, v6.8h, v7.8h + uzp1 v20.8h, v14.8h, v15.8h + uzp2 v21.8h, v14.8h, v15.8h + smull v26.4s, v18.4h, v20.4h + smull2 v27.4s, v18.8h, v20.8h + smull v23.4s, v19.4h, v21.4h + smull2 v24.4s, v19.8h, v21.8h + xtn v25.4h, v23.4s + xtn2 v25.8h, v24.4s + mul v25.8h, v25.8h, v1.h[1] + smlsl v23.4s, v25.4h, v1.h[0] + smlsl2 v24.4s, v25.8h, v1.h[0] + shrn v22.4h, v23.4s, #16 + shrn2 v22.8h, v24.4s, #16 + smlal v26.4s, v22.4h, v0.4h + smlal2 v27.4s, v22.8h, v0.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v22.4h, v26.4s, #16 + shrn2 v22.8h, v27.4s, #16 + smull v26.4s, v18.4h, v21.4h + smull2 v27.4s, v18.8h, v21.8h + smlal v26.4s, v19.4h, v20.4h + smlal2 v27.4s, v19.8h, v20.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v23.4h, v26.4s, #16 + shrn2 v23.8h, v27.4s, #16 + zip1 v24.8h, v22.8h, v23.8h + zip2 v25.8h, v22.8h, v23.8h + stp q24, q25, [x0, #192] + ldr q0, [x3, #112] + uzp1 v18.8h, v8.8h, v9.8h + uzp2 v19.8h, v8.8h, v9.8h + uzp1 v20.8h, v16.8h, v17.8h + uzp2 v21.8h, v16.8h, v17.8h + smull v26.4s, v18.4h, v20.4h + smull2 v27.4s, v18.8h, v20.8h + smull v23.4s, v19.4h, v21.4h + smull2 v24.4s, v19.8h, v21.8h + xtn v25.4h, v23.4s + xtn2 v25.8h, v24.4s + mul v25.8h, v25.8h, v1.h[1] + smlsl v23.4s, v25.4h, v1.h[0] + smlsl2 v24.4s, v25.8h, v1.h[0] + shrn v22.4h, v23.4s, #16 + shrn2 v22.8h, v24.4s, #16 + smlal v26.4s, v22.4h, v0.4h 
+ smlal2 v27.4s, v22.8h, v0.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v22.4h, v26.4s, #16 + shrn2 v22.8h, v27.4s, #16 + smull v26.4s, v18.4h, v21.4h + smull2 v27.4s, v18.8h, v21.8h + smlal v26.4s, v19.4h, v20.4h + smlal2 v27.4s, v19.8h, v20.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v23.4h, v26.4s, #16 + shrn2 v23.8h, v27.4s, #16 + zip1 v24.8h, v22.8h, v23.8h + zip2 v25.8h, v22.8h, v23.8h + stp q24, q25, [x0, #224] + ldp q2, q3, [x1, #256] + ldp q4, q5, [x1, #288] + ldp q6, q7, [x1, #320] + ldp q8, q9, [x1, #352] + ldp q10, q11, [x2, #256] + ldp q12, q13, [x2, #288] + ldp q14, q15, [x2, #320] + ldp q16, q17, [x2, #352] + ldr q0, [x3, #128] + uzp1 v18.8h, v2.8h, v3.8h + uzp2 v19.8h, v2.8h, v3.8h + uzp1 v20.8h, v10.8h, v11.8h + uzp2 v21.8h, v10.8h, v11.8h + smull v26.4s, v18.4h, v20.4h + smull2 v27.4s, v18.8h, v20.8h + smull v23.4s, v19.4h, v21.4h + smull2 v24.4s, v19.8h, v21.8h + xtn v25.4h, v23.4s + xtn2 v25.8h, v24.4s + mul v25.8h, v25.8h, v1.h[1] + smlsl v23.4s, v25.4h, v1.h[0] + smlsl2 v24.4s, v25.8h, v1.h[0] + shrn v22.4h, v23.4s, #16 + shrn2 v22.8h, v24.4s, #16 + smlal v26.4s, v22.4h, v0.4h + smlal2 v27.4s, v22.8h, v0.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v22.4h, v26.4s, #16 + shrn2 v22.8h, v27.4s, #16 + smull v26.4s, v18.4h, v21.4h + smull2 v27.4s, v18.8h, v21.8h + smlal v26.4s, v19.4h, v20.4h + smlal2 v27.4s, v19.8h, v20.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v23.4h, v26.4s, #16 + shrn2 v23.8h, v27.4s, #16 + zip1 v24.8h, v22.8h, v23.8h + zip2 v25.8h, v22.8h, v23.8h + stp q24, q25, [x0, #256] + ldr q0, [x3, #144] + uzp1 v18.8h, v4.8h, v5.8h + uzp2 v19.8h, v4.8h, v5.8h + uzp1 v20.8h, v12.8h, v13.8h + uzp2 v21.8h, v12.8h, v13.8h + smull v26.4s, v18.4h, v20.4h + smull2 v27.4s, v18.8h, v20.8h + smull v23.4s, v19.4h, v21.4h + smull2 v24.4s, v19.8h, v21.8h + xtn v25.4h, v23.4s + xtn2 v25.8h, v24.4s + mul v25.8h, v25.8h, v1.h[1] + smlsl v23.4s, v25.4h, v1.h[0] + smlsl2 v24.4s, v25.8h, v1.h[0] + shrn v22.4h, v23.4s, #16 + shrn2 v22.8h, v24.4s, #16 + smlal v26.4s, v22.4h, v0.4h + smlal2 v27.4s, v22.8h, v0.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v22.4h, v26.4s, #16 + shrn2 v22.8h, v27.4s, #16 + smull v26.4s, v18.4h, v21.4h + smull2 v27.4s, v18.8h, v21.8h + smlal v26.4s, v19.4h, v20.4h + smlal2 v27.4s, v19.8h, v20.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v23.4h, v26.4s, #16 + shrn2 v23.8h, v27.4s, #16 + zip1 v24.8h, v22.8h, v23.8h + zip2 v25.8h, v22.8h, v23.8h + stp q24, q25, [x0, #288] + ldr q0, [x3, #160] + uzp1 v18.8h, v6.8h, v7.8h + uzp2 v19.8h, v6.8h, v7.8h + uzp1 v20.8h, v14.8h, v15.8h + uzp2 v21.8h, v14.8h, v15.8h + smull v26.4s, v18.4h, v20.4h + smull2 v27.4s, v18.8h, v20.8h + smull v23.4s, v19.4h, v21.4h + smull2 v24.4s, v19.8h, v21.8h + xtn v25.4h, v23.4s + xtn2 v25.8h, v24.4s + mul v25.8h, v25.8h, v1.h[1] + smlsl v23.4s, v25.4h, v1.h[0] + smlsl2 v24.4s, v25.8h, v1.h[0] + shrn v22.4h, v23.4s, #16 + shrn2 v22.8h, v24.4s, #16 + smlal v26.4s, v22.4h, 
v0.4h + smlal2 v27.4s, v22.8h, v0.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v22.4h, v26.4s, #16 + shrn2 v22.8h, v27.4s, #16 + smull v26.4s, v18.4h, v21.4h + smull2 v27.4s, v18.8h, v21.8h + smlal v26.4s, v19.4h, v20.4h + smlal2 v27.4s, v19.8h, v20.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v23.4h, v26.4s, #16 + shrn2 v23.8h, v27.4s, #16 + zip1 v24.8h, v22.8h, v23.8h + zip2 v25.8h, v22.8h, v23.8h + stp q24, q25, [x0, #320] + ldr q0, [x3, #176] + uzp1 v18.8h, v8.8h, v9.8h + uzp2 v19.8h, v8.8h, v9.8h + uzp1 v20.8h, v16.8h, v17.8h + uzp2 v21.8h, v16.8h, v17.8h + smull v26.4s, v18.4h, v20.4h + smull2 v27.4s, v18.8h, v20.8h + smull v23.4s, v19.4h, v21.4h + smull2 v24.4s, v19.8h, v21.8h + xtn v25.4h, v23.4s + xtn2 v25.8h, v24.4s + mul v25.8h, v25.8h, v1.h[1] + smlsl v23.4s, v25.4h, v1.h[0] + smlsl2 v24.4s, v25.8h, v1.h[0] + shrn v22.4h, v23.4s, #16 + shrn2 v22.8h, v24.4s, #16 + smlal v26.4s, v22.4h, v0.4h + smlal2 v27.4s, v22.8h, v0.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v22.4h, v26.4s, #16 + shrn2 v22.8h, v27.4s, #16 + smull v26.4s, v18.4h, v21.4h + smull2 v27.4s, v18.8h, v21.8h + smlal v26.4s, v19.4h, v20.4h + smlal2 v27.4s, v19.8h, v20.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v23.4h, v26.4s, #16 + shrn2 v23.8h, v27.4s, #16 + zip1 v24.8h, v22.8h, v23.8h + zip2 v25.8h, v22.8h, v23.8h + stp q24, q25, [x0, #352] + ldp q2, q3, [x1, #384] + ldp q4, q5, [x1, #416] + ldp q6, q7, [x1, #448] + ldp q8, q9, [x1, #480] + ldp q10, q11, [x2, #384] + ldp q12, q13, [x2, #416] + ldp q14, q15, [x2, #448] + ldp q16, q17, [x2, #480] + ldr q0, [x3, #192] + uzp1 v18.8h, v2.8h, v3.8h + uzp2 v19.8h, v2.8h, v3.8h + uzp1 v20.8h, v10.8h, v11.8h + uzp2 v21.8h, v10.8h, v11.8h + smull v26.4s, v18.4h, v20.4h + smull2 v27.4s, v18.8h, v20.8h + smull v23.4s, v19.4h, v21.4h + smull2 v24.4s, v19.8h, v21.8h + xtn v25.4h, v23.4s + xtn2 v25.8h, v24.4s + mul v25.8h, v25.8h, v1.h[1] + smlsl v23.4s, v25.4h, v1.h[0] + smlsl2 v24.4s, v25.8h, v1.h[0] + shrn v22.4h, v23.4s, #16 + shrn2 v22.8h, v24.4s, #16 + smlal v26.4s, v22.4h, v0.4h + smlal2 v27.4s, v22.8h, v0.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v22.4h, v26.4s, #16 + shrn2 v22.8h, v27.4s, #16 + smull v26.4s, v18.4h, v21.4h + smull2 v27.4s, v18.8h, v21.8h + smlal v26.4s, v19.4h, v20.4h + smlal2 v27.4s, v19.8h, v20.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v23.4h, v26.4s, #16 + shrn2 v23.8h, v27.4s, #16 + zip1 v24.8h, v22.8h, v23.8h + zip2 v25.8h, v22.8h, v23.8h + stp q24, q25, [x0, #384] + ldr q0, [x3, #208] + uzp1 v18.8h, v4.8h, v5.8h + uzp2 v19.8h, v4.8h, v5.8h + uzp1 v20.8h, v12.8h, v13.8h + uzp2 v21.8h, v12.8h, v13.8h + smull v26.4s, v18.4h, v20.4h + smull2 v27.4s, v18.8h, v20.8h + smull v23.4s, v19.4h, v21.4h + smull2 v24.4s, v19.8h, v21.8h + xtn v25.4h, v23.4s + xtn2 v25.8h, v24.4s + mul v25.8h, v25.8h, v1.h[1] + smlsl v23.4s, v25.4h, v1.h[0] + smlsl2 v24.4s, v25.8h, v1.h[0] + shrn v22.4h, v23.4s, #16 + shrn2 v22.8h, v24.4s, #16 + smlal v26.4s, 
v22.4h, v0.4h + smlal2 v27.4s, v22.8h, v0.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v22.4h, v26.4s, #16 + shrn2 v22.8h, v27.4s, #16 + smull v26.4s, v18.4h, v21.4h + smull2 v27.4s, v18.8h, v21.8h + smlal v26.4s, v19.4h, v20.4h + smlal2 v27.4s, v19.8h, v20.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v23.4h, v26.4s, #16 + shrn2 v23.8h, v27.4s, #16 + zip1 v24.8h, v22.8h, v23.8h + zip2 v25.8h, v22.8h, v23.8h + stp q24, q25, [x0, #416] + ldr q0, [x3, #224] + uzp1 v18.8h, v6.8h, v7.8h + uzp2 v19.8h, v6.8h, v7.8h + uzp1 v20.8h, v14.8h, v15.8h + uzp2 v21.8h, v14.8h, v15.8h + smull v26.4s, v18.4h, v20.4h + smull2 v27.4s, v18.8h, v20.8h + smull v23.4s, v19.4h, v21.4h + smull2 v24.4s, v19.8h, v21.8h + xtn v25.4h, v23.4s + xtn2 v25.8h, v24.4s + mul v25.8h, v25.8h, v1.h[1] + smlsl v23.4s, v25.4h, v1.h[0] + smlsl2 v24.4s, v25.8h, v1.h[0] + shrn v22.4h, v23.4s, #16 + shrn2 v22.8h, v24.4s, #16 + smlal v26.4s, v22.4h, v0.4h + smlal2 v27.4s, v22.8h, v0.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v22.4h, v26.4s, #16 + shrn2 v22.8h, v27.4s, #16 + smull v26.4s, v18.4h, v21.4h + smull2 v27.4s, v18.8h, v21.8h + smlal v26.4s, v19.4h, v20.4h + smlal2 v27.4s, v19.8h, v20.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v23.4h, v26.4s, #16 + shrn2 v23.8h, v27.4s, #16 + zip1 v24.8h, v22.8h, v23.8h + zip2 v25.8h, v22.8h, v23.8h + stp q24, q25, [x0, #448] + ldr q0, [x3, #240] + uzp1 v18.8h, v8.8h, v9.8h + uzp2 v19.8h, v8.8h, v9.8h + uzp1 v20.8h, v16.8h, v17.8h + uzp2 v21.8h, v16.8h, v17.8h + smull v26.4s, v18.4h, v20.4h + smull2 v27.4s, v18.8h, v20.8h + smull v23.4s, v19.4h, v21.4h + smull2 v24.4s, v19.8h, v21.8h + xtn v25.4h, v23.4s + xtn2 v25.8h, v24.4s + mul v25.8h, v25.8h, v1.h[1] + smlsl v23.4s, v25.4h, v1.h[0] + smlsl2 v24.4s, v25.8h, v1.h[0] + shrn v22.4h, v23.4s, #16 + shrn2 v22.8h, v24.4s, #16 + smlal v26.4s, v22.4h, v0.4h + smlal2 v27.4s, v22.8h, v0.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v22.4h, v26.4s, #16 + shrn2 v22.8h, v27.4s, #16 + smull v26.4s, v18.4h, v21.4h + smull2 v27.4s, v18.8h, v21.8h + smlal v26.4s, v19.4h, v20.4h + smlal2 v27.4s, v19.8h, v20.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v23.4h, v26.4s, #16 + shrn2 v23.8h, v27.4s, #16 + zip1 v24.8h, v22.8h, v23.8h + zip2 v25.8h, v22.8h, v23.8h + stp q24, q25, [x0, #480] + ldp d8, d9, [x29, #16] + ldp d10, d11, [x29, #32] + ldp d12, d13, [x29, #48] + ldp d14, d15, [x29, #64] + ldp x29, x30, [sp], #0x50 + ret +#ifndef __APPLE__ + .size kyber_basemul_mont,.-kyber_basemul_mont +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl kyber_basemul_mont_add +.type kyber_basemul_mont_add,@function +.align 2 +kyber_basemul_mont_add: +#else +.section __TEXT,__text +.globl _kyber_basemul_mont_add +.p2align 2 +_kyber_basemul_mont_add: +#endif /* __APPLE__ */ + stp x29, x30, [sp, #-80]! 
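The kyber_basemul_mont routine above, and the kyber_basemul_mont_add variant whose body follows, perform the NTT-domain pointwise multiplication: coefficients are handled as pairs modulo (X^2 - zeta), with the zetas streamed from L_kyber_aarch64_zetas_mul, and each smull/xtn/mul/smlsl/shrn group is a signed Montgomery reduction using the modulus and q^-1 mod 2^16 held in L_kyber_aarch64_consts. A scalar C sketch of one coefficient pair, written with the usual reference-implementation names (montgomery_reduce, fqmul and basemul are illustrative here, not identifiers from this patch); the assembly interleaves and fuses these reductions, but this is the operation being computed:

    #include <stdint.h>

    #define KYBER_Q    3329
    #define KYBER_QINV (-3327)   /* q^-1 mod 2^16, as a signed 16-bit value */

    /* Signed Montgomery reduction: for |a| < q * 2^15, returns a * 2^-16 mod q. */
    static int16_t montgomery_reduce(int32_t a)
    {
        int16_t t = (int16_t)a * KYBER_QINV;                 /* the xtn / mul-by-qinv step */
        return (int16_t)((a - (int32_t)t * KYBER_Q) >> 16);  /* the smlsl-by-q / shrn step */
    }

    static int16_t fqmul(int16_t a, int16_t b)
    {
        return montgomery_reduce((int32_t)a * b);
    }

    /* One coefficient pair of the base multiplication modulo (X^2 - zeta). */
    static void basemul(int16_t r[2], const int16_t a[2], const int16_t b[2], int16_t zeta)
    {
        r[0]  = fqmul(a[1], b[1]);   /* reduce a1*b1 first ...   */
        r[0]  = fqmul(r[0], zeta);   /* ... multiply by zeta ... */
        r[0] += fqmul(a[0], b[0]);   /* ... and add a0*b0        */
        r[1]  = fqmul(a[0], b[1]);
        r[1] += fqmul(a[1], b[0]);
    }

kyber_basemul_mont_add differs only in that the zipped results are accumulated into the destination polynomial (the extra ldp/add/stp on q28/q29) instead of overwriting it.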
+ add x29, sp, #0 + stp d8, d9, [x29, #16] + stp d10, d11, [x29, #32] + stp d12, d13, [x29, #48] + stp d14, d15, [x29, #64] +#ifndef __APPLE__ + adrp x3, L_kyber_aarch64_zetas_mul + add x3, x3, :lo12:L_kyber_aarch64_zetas_mul +#else + adrp x3, L_kyber_aarch64_zetas_mul@PAGE + add x3, x3, :lo12:L_kyber_aarch64_zetas_mul@PAGEOFF +#endif /* __APPLE__ */ +#ifndef __APPLE__ + adrp x4, L_kyber_aarch64_consts + add x4, x4, :lo12:L_kyber_aarch64_consts +#else + adrp x4, L_kyber_aarch64_consts@PAGE + add x4, x4, :lo12:L_kyber_aarch64_consts@PAGEOFF +#endif /* __APPLE__ */ + ldr q1, [x4] + ldp q2, q3, [x1] + ldp q4, q5, [x1, #32] + ldp q6, q7, [x1, #64] + ldp q8, q9, [x1, #96] + ldp q10, q11, [x2] + ldp q12, q13, [x2, #32] + ldp q14, q15, [x2, #64] + ldp q16, q17, [x2, #96] + ldp q28, q29, [x0] + ldr q0, [x3] + uzp1 v18.8h, v2.8h, v3.8h + uzp2 v19.8h, v2.8h, v3.8h + uzp1 v20.8h, v10.8h, v11.8h + uzp2 v21.8h, v10.8h, v11.8h + smull v26.4s, v18.4h, v20.4h + smull2 v27.4s, v18.8h, v20.8h + smull v23.4s, v19.4h, v21.4h + smull2 v24.4s, v19.8h, v21.8h + xtn v25.4h, v23.4s + xtn2 v25.8h, v24.4s + mul v25.8h, v25.8h, v1.h[1] + smlsl v23.4s, v25.4h, v1.h[0] + smlsl2 v24.4s, v25.8h, v1.h[0] + shrn v22.4h, v23.4s, #16 + shrn2 v22.8h, v24.4s, #16 + smlal v26.4s, v22.4h, v0.4h + smlal2 v27.4s, v22.8h, v0.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v22.4h, v26.4s, #16 + shrn2 v22.8h, v27.4s, #16 + smull v26.4s, v18.4h, v21.4h + smull2 v27.4s, v18.8h, v21.8h + smlal v26.4s, v19.4h, v20.4h + smlal2 v27.4s, v19.8h, v20.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v23.4h, v26.4s, #16 + shrn2 v23.8h, v27.4s, #16 + zip1 v24.8h, v22.8h, v23.8h + zip2 v25.8h, v22.8h, v23.8h + add v28.8h, v28.8h, v24.8h + add v29.8h, v29.8h, v25.8h + stp q28, q29, [x0] + ldp q28, q29, [x0, #32] + ldr q0, [x3, #16] + uzp1 v18.8h, v4.8h, v5.8h + uzp2 v19.8h, v4.8h, v5.8h + uzp1 v20.8h, v12.8h, v13.8h + uzp2 v21.8h, v12.8h, v13.8h + smull v26.4s, v18.4h, v20.4h + smull2 v27.4s, v18.8h, v20.8h + smull v23.4s, v19.4h, v21.4h + smull2 v24.4s, v19.8h, v21.8h + xtn v25.4h, v23.4s + xtn2 v25.8h, v24.4s + mul v25.8h, v25.8h, v1.h[1] + smlsl v23.4s, v25.4h, v1.h[0] + smlsl2 v24.4s, v25.8h, v1.h[0] + shrn v22.4h, v23.4s, #16 + shrn2 v22.8h, v24.4s, #16 + smlal v26.4s, v22.4h, v0.4h + smlal2 v27.4s, v22.8h, v0.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v22.4h, v26.4s, #16 + shrn2 v22.8h, v27.4s, #16 + smull v26.4s, v18.4h, v21.4h + smull2 v27.4s, v18.8h, v21.8h + smlal v26.4s, v19.4h, v20.4h + smlal2 v27.4s, v19.8h, v20.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v23.4h, v26.4s, #16 + shrn2 v23.8h, v27.4s, #16 + zip1 v24.8h, v22.8h, v23.8h + zip2 v25.8h, v22.8h, v23.8h + add v28.8h, v28.8h, v24.8h + add v29.8h, v29.8h, v25.8h + stp q28, q29, [x0, #32] + ldp q28, q29, [x0, #64] + ldr q0, [x3, #32] + uzp1 v18.8h, v6.8h, v7.8h + uzp2 v19.8h, v6.8h, v7.8h + uzp1 v20.8h, v14.8h, v15.8h + uzp2 v21.8h, v14.8h, v15.8h + smull v26.4s, v18.4h, v20.4h + smull2 v27.4s, v18.8h, v20.8h + smull v23.4s, v19.4h, v21.4h + smull2 v24.4s, v19.8h, v21.8h + xtn v25.4h, v23.4s + xtn2 v25.8h, v24.4s + mul v25.8h, v25.8h, v1.h[1] + smlsl v23.4s, v25.4h, v1.h[0] + 
smlsl2 v24.4s, v25.8h, v1.h[0] + shrn v22.4h, v23.4s, #16 + shrn2 v22.8h, v24.4s, #16 + smlal v26.4s, v22.4h, v0.4h + smlal2 v27.4s, v22.8h, v0.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v22.4h, v26.4s, #16 + shrn2 v22.8h, v27.4s, #16 + smull v26.4s, v18.4h, v21.4h + smull2 v27.4s, v18.8h, v21.8h + smlal v26.4s, v19.4h, v20.4h + smlal2 v27.4s, v19.8h, v20.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v23.4h, v26.4s, #16 + shrn2 v23.8h, v27.4s, #16 + zip1 v24.8h, v22.8h, v23.8h + zip2 v25.8h, v22.8h, v23.8h + add v28.8h, v28.8h, v24.8h + add v29.8h, v29.8h, v25.8h + stp q28, q29, [x0, #64] + ldp q28, q29, [x0, #96] + ldr q0, [x3, #48] + uzp1 v18.8h, v8.8h, v9.8h + uzp2 v19.8h, v8.8h, v9.8h + uzp1 v20.8h, v16.8h, v17.8h + uzp2 v21.8h, v16.8h, v17.8h + smull v26.4s, v18.4h, v20.4h + smull2 v27.4s, v18.8h, v20.8h + smull v23.4s, v19.4h, v21.4h + smull2 v24.4s, v19.8h, v21.8h + xtn v25.4h, v23.4s + xtn2 v25.8h, v24.4s + mul v25.8h, v25.8h, v1.h[1] + smlsl v23.4s, v25.4h, v1.h[0] + smlsl2 v24.4s, v25.8h, v1.h[0] + shrn v22.4h, v23.4s, #16 + shrn2 v22.8h, v24.4s, #16 + smlal v26.4s, v22.4h, v0.4h + smlal2 v27.4s, v22.8h, v0.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v22.4h, v26.4s, #16 + shrn2 v22.8h, v27.4s, #16 + smull v26.4s, v18.4h, v21.4h + smull2 v27.4s, v18.8h, v21.8h + smlal v26.4s, v19.4h, v20.4h + smlal2 v27.4s, v19.8h, v20.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v23.4h, v26.4s, #16 + shrn2 v23.8h, v27.4s, #16 + zip1 v24.8h, v22.8h, v23.8h + zip2 v25.8h, v22.8h, v23.8h + add v28.8h, v28.8h, v24.8h + add v29.8h, v29.8h, v25.8h + stp q28, q29, [x0, #96] + ldp q2, q3, [x1, #128] + ldp q4, q5, [x1, #160] + ldp q6, q7, [x1, #192] + ldp q8, q9, [x1, #224] + ldp q10, q11, [x2, #128] + ldp q12, q13, [x2, #160] + ldp q14, q15, [x2, #192] + ldp q16, q17, [x2, #224] + ldp q28, q29, [x0, #128] + ldr q0, [x3, #64] + uzp1 v18.8h, v2.8h, v3.8h + uzp2 v19.8h, v2.8h, v3.8h + uzp1 v20.8h, v10.8h, v11.8h + uzp2 v21.8h, v10.8h, v11.8h + smull v26.4s, v18.4h, v20.4h + smull2 v27.4s, v18.8h, v20.8h + smull v23.4s, v19.4h, v21.4h + smull2 v24.4s, v19.8h, v21.8h + xtn v25.4h, v23.4s + xtn2 v25.8h, v24.4s + mul v25.8h, v25.8h, v1.h[1] + smlsl v23.4s, v25.4h, v1.h[0] + smlsl2 v24.4s, v25.8h, v1.h[0] + shrn v22.4h, v23.4s, #16 + shrn2 v22.8h, v24.4s, #16 + smlal v26.4s, v22.4h, v0.4h + smlal2 v27.4s, v22.8h, v0.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v22.4h, v26.4s, #16 + shrn2 v22.8h, v27.4s, #16 + smull v26.4s, v18.4h, v21.4h + smull2 v27.4s, v18.8h, v21.8h + smlal v26.4s, v19.4h, v20.4h + smlal2 v27.4s, v19.8h, v20.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v23.4h, v26.4s, #16 + shrn2 v23.8h, v27.4s, #16 + zip1 v24.8h, v22.8h, v23.8h + zip2 v25.8h, v22.8h, v23.8h + add v28.8h, v28.8h, v24.8h + add v29.8h, v29.8h, v25.8h + stp q28, q29, [x0, #128] + ldp q28, q29, [x0, #160] + ldr q0, [x3, #80] + uzp1 v18.8h, v4.8h, v5.8h + uzp2 v19.8h, v4.8h, v5.8h + uzp1 v20.8h, v12.8h, v13.8h + uzp2 
v21.8h, v12.8h, v13.8h + smull v26.4s, v18.4h, v20.4h + smull2 v27.4s, v18.8h, v20.8h + smull v23.4s, v19.4h, v21.4h + smull2 v24.4s, v19.8h, v21.8h + xtn v25.4h, v23.4s + xtn2 v25.8h, v24.4s + mul v25.8h, v25.8h, v1.h[1] + smlsl v23.4s, v25.4h, v1.h[0] + smlsl2 v24.4s, v25.8h, v1.h[0] + shrn v22.4h, v23.4s, #16 + shrn2 v22.8h, v24.4s, #16 + smlal v26.4s, v22.4h, v0.4h + smlal2 v27.4s, v22.8h, v0.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v22.4h, v26.4s, #16 + shrn2 v22.8h, v27.4s, #16 + smull v26.4s, v18.4h, v21.4h + smull2 v27.4s, v18.8h, v21.8h + smlal v26.4s, v19.4h, v20.4h + smlal2 v27.4s, v19.8h, v20.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v23.4h, v26.4s, #16 + shrn2 v23.8h, v27.4s, #16 + zip1 v24.8h, v22.8h, v23.8h + zip2 v25.8h, v22.8h, v23.8h + add v28.8h, v28.8h, v24.8h + add v29.8h, v29.8h, v25.8h + stp q28, q29, [x0, #160] + ldp q28, q29, [x0, #192] + ldr q0, [x3, #96] + uzp1 v18.8h, v6.8h, v7.8h + uzp2 v19.8h, v6.8h, v7.8h + uzp1 v20.8h, v14.8h, v15.8h + uzp2 v21.8h, v14.8h, v15.8h + smull v26.4s, v18.4h, v20.4h + smull2 v27.4s, v18.8h, v20.8h + smull v23.4s, v19.4h, v21.4h + smull2 v24.4s, v19.8h, v21.8h + xtn v25.4h, v23.4s + xtn2 v25.8h, v24.4s + mul v25.8h, v25.8h, v1.h[1] + smlsl v23.4s, v25.4h, v1.h[0] + smlsl2 v24.4s, v25.8h, v1.h[0] + shrn v22.4h, v23.4s, #16 + shrn2 v22.8h, v24.4s, #16 + smlal v26.4s, v22.4h, v0.4h + smlal2 v27.4s, v22.8h, v0.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v22.4h, v26.4s, #16 + shrn2 v22.8h, v27.4s, #16 + smull v26.4s, v18.4h, v21.4h + smull2 v27.4s, v18.8h, v21.8h + smlal v26.4s, v19.4h, v20.4h + smlal2 v27.4s, v19.8h, v20.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v23.4h, v26.4s, #16 + shrn2 v23.8h, v27.4s, #16 + zip1 v24.8h, v22.8h, v23.8h + zip2 v25.8h, v22.8h, v23.8h + add v28.8h, v28.8h, v24.8h + add v29.8h, v29.8h, v25.8h + stp q28, q29, [x0, #192] + ldp q28, q29, [x0, #224] + ldr q0, [x3, #112] + uzp1 v18.8h, v8.8h, v9.8h + uzp2 v19.8h, v8.8h, v9.8h + uzp1 v20.8h, v16.8h, v17.8h + uzp2 v21.8h, v16.8h, v17.8h + smull v26.4s, v18.4h, v20.4h + smull2 v27.4s, v18.8h, v20.8h + smull v23.4s, v19.4h, v21.4h + smull2 v24.4s, v19.8h, v21.8h + xtn v25.4h, v23.4s + xtn2 v25.8h, v24.4s + mul v25.8h, v25.8h, v1.h[1] + smlsl v23.4s, v25.4h, v1.h[0] + smlsl2 v24.4s, v25.8h, v1.h[0] + shrn v22.4h, v23.4s, #16 + shrn2 v22.8h, v24.4s, #16 + smlal v26.4s, v22.4h, v0.4h + smlal2 v27.4s, v22.8h, v0.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v22.4h, v26.4s, #16 + shrn2 v22.8h, v27.4s, #16 + smull v26.4s, v18.4h, v21.4h + smull2 v27.4s, v18.8h, v21.8h + smlal v26.4s, v19.4h, v20.4h + smlal2 v27.4s, v19.8h, v20.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v23.4h, v26.4s, #16 + shrn2 v23.8h, v27.4s, #16 + zip1 v24.8h, v22.8h, v23.8h + zip2 v25.8h, v22.8h, v23.8h + add v28.8h, v28.8h, v24.8h + add v29.8h, v29.8h, v25.8h + stp q28, q29, [x0, #224] + ldp q2, q3, [x1, #256] + ldp q4, q5, [x1, #288] + ldp q6, q7, [x1, #320] + ldp q8, q9, [x1, 
#352] + ldp q10, q11, [x2, #256] + ldp q12, q13, [x2, #288] + ldp q14, q15, [x2, #320] + ldp q16, q17, [x2, #352] + ldp q28, q29, [x0, #256] + ldr q0, [x3, #128] + uzp1 v18.8h, v2.8h, v3.8h + uzp2 v19.8h, v2.8h, v3.8h + uzp1 v20.8h, v10.8h, v11.8h + uzp2 v21.8h, v10.8h, v11.8h + smull v26.4s, v18.4h, v20.4h + smull2 v27.4s, v18.8h, v20.8h + smull v23.4s, v19.4h, v21.4h + smull2 v24.4s, v19.8h, v21.8h + xtn v25.4h, v23.4s + xtn2 v25.8h, v24.4s + mul v25.8h, v25.8h, v1.h[1] + smlsl v23.4s, v25.4h, v1.h[0] + smlsl2 v24.4s, v25.8h, v1.h[0] + shrn v22.4h, v23.4s, #16 + shrn2 v22.8h, v24.4s, #16 + smlal v26.4s, v22.4h, v0.4h + smlal2 v27.4s, v22.8h, v0.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v22.4h, v26.4s, #16 + shrn2 v22.8h, v27.4s, #16 + smull v26.4s, v18.4h, v21.4h + smull2 v27.4s, v18.8h, v21.8h + smlal v26.4s, v19.4h, v20.4h + smlal2 v27.4s, v19.8h, v20.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v23.4h, v26.4s, #16 + shrn2 v23.8h, v27.4s, #16 + zip1 v24.8h, v22.8h, v23.8h + zip2 v25.8h, v22.8h, v23.8h + add v28.8h, v28.8h, v24.8h + add v29.8h, v29.8h, v25.8h + stp q28, q29, [x0, #256] + ldp q28, q29, [x0, #288] + ldr q0, [x3, #144] + uzp1 v18.8h, v4.8h, v5.8h + uzp2 v19.8h, v4.8h, v5.8h + uzp1 v20.8h, v12.8h, v13.8h + uzp2 v21.8h, v12.8h, v13.8h + smull v26.4s, v18.4h, v20.4h + smull2 v27.4s, v18.8h, v20.8h + smull v23.4s, v19.4h, v21.4h + smull2 v24.4s, v19.8h, v21.8h + xtn v25.4h, v23.4s + xtn2 v25.8h, v24.4s + mul v25.8h, v25.8h, v1.h[1] + smlsl v23.4s, v25.4h, v1.h[0] + smlsl2 v24.4s, v25.8h, v1.h[0] + shrn v22.4h, v23.4s, #16 + shrn2 v22.8h, v24.4s, #16 + smlal v26.4s, v22.4h, v0.4h + smlal2 v27.4s, v22.8h, v0.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v22.4h, v26.4s, #16 + shrn2 v22.8h, v27.4s, #16 + smull v26.4s, v18.4h, v21.4h + smull2 v27.4s, v18.8h, v21.8h + smlal v26.4s, v19.4h, v20.4h + smlal2 v27.4s, v19.8h, v20.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v23.4h, v26.4s, #16 + shrn2 v23.8h, v27.4s, #16 + zip1 v24.8h, v22.8h, v23.8h + zip2 v25.8h, v22.8h, v23.8h + add v28.8h, v28.8h, v24.8h + add v29.8h, v29.8h, v25.8h + stp q28, q29, [x0, #288] + ldp q28, q29, [x0, #320] + ldr q0, [x3, #160] + uzp1 v18.8h, v6.8h, v7.8h + uzp2 v19.8h, v6.8h, v7.8h + uzp1 v20.8h, v14.8h, v15.8h + uzp2 v21.8h, v14.8h, v15.8h + smull v26.4s, v18.4h, v20.4h + smull2 v27.4s, v18.8h, v20.8h + smull v23.4s, v19.4h, v21.4h + smull2 v24.4s, v19.8h, v21.8h + xtn v25.4h, v23.4s + xtn2 v25.8h, v24.4s + mul v25.8h, v25.8h, v1.h[1] + smlsl v23.4s, v25.4h, v1.h[0] + smlsl2 v24.4s, v25.8h, v1.h[0] + shrn v22.4h, v23.4s, #16 + shrn2 v22.8h, v24.4s, #16 + smlal v26.4s, v22.4h, v0.4h + smlal2 v27.4s, v22.8h, v0.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v22.4h, v26.4s, #16 + shrn2 v22.8h, v27.4s, #16 + smull v26.4s, v18.4h, v21.4h + smull2 v27.4s, v18.8h, v21.8h + smlal v26.4s, v19.4h, v20.4h + smlal2 v27.4s, v19.8h, v20.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v23.4h, v26.4s, #16 + shrn2 
v23.8h, v27.4s, #16 + zip1 v24.8h, v22.8h, v23.8h + zip2 v25.8h, v22.8h, v23.8h + add v28.8h, v28.8h, v24.8h + add v29.8h, v29.8h, v25.8h + stp q28, q29, [x0, #320] + ldp q28, q29, [x0, #352] + ldr q0, [x3, #176] + uzp1 v18.8h, v8.8h, v9.8h + uzp2 v19.8h, v8.8h, v9.8h + uzp1 v20.8h, v16.8h, v17.8h + uzp2 v21.8h, v16.8h, v17.8h + smull v26.4s, v18.4h, v20.4h + smull2 v27.4s, v18.8h, v20.8h + smull v23.4s, v19.4h, v21.4h + smull2 v24.4s, v19.8h, v21.8h + xtn v25.4h, v23.4s + xtn2 v25.8h, v24.4s + mul v25.8h, v25.8h, v1.h[1] + smlsl v23.4s, v25.4h, v1.h[0] + smlsl2 v24.4s, v25.8h, v1.h[0] + shrn v22.4h, v23.4s, #16 + shrn2 v22.8h, v24.4s, #16 + smlal v26.4s, v22.4h, v0.4h + smlal2 v27.4s, v22.8h, v0.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v22.4h, v26.4s, #16 + shrn2 v22.8h, v27.4s, #16 + smull v26.4s, v18.4h, v21.4h + smull2 v27.4s, v18.8h, v21.8h + smlal v26.4s, v19.4h, v20.4h + smlal2 v27.4s, v19.8h, v20.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v23.4h, v26.4s, #16 + shrn2 v23.8h, v27.4s, #16 + zip1 v24.8h, v22.8h, v23.8h + zip2 v25.8h, v22.8h, v23.8h + add v28.8h, v28.8h, v24.8h + add v29.8h, v29.8h, v25.8h + stp q28, q29, [x0, #352] + ldp q2, q3, [x1, #384] + ldp q4, q5, [x1, #416] + ldp q6, q7, [x1, #448] + ldp q8, q9, [x1, #480] + ldp q10, q11, [x2, #384] + ldp q12, q13, [x2, #416] + ldp q14, q15, [x2, #448] + ldp q16, q17, [x2, #480] + ldp q28, q29, [x0, #384] + ldr q0, [x3, #192] + uzp1 v18.8h, v2.8h, v3.8h + uzp2 v19.8h, v2.8h, v3.8h + uzp1 v20.8h, v10.8h, v11.8h + uzp2 v21.8h, v10.8h, v11.8h + smull v26.4s, v18.4h, v20.4h + smull2 v27.4s, v18.8h, v20.8h + smull v23.4s, v19.4h, v21.4h + smull2 v24.4s, v19.8h, v21.8h + xtn v25.4h, v23.4s + xtn2 v25.8h, v24.4s + mul v25.8h, v25.8h, v1.h[1] + smlsl v23.4s, v25.4h, v1.h[0] + smlsl2 v24.4s, v25.8h, v1.h[0] + shrn v22.4h, v23.4s, #16 + shrn2 v22.8h, v24.4s, #16 + smlal v26.4s, v22.4h, v0.4h + smlal2 v27.4s, v22.8h, v0.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v22.4h, v26.4s, #16 + shrn2 v22.8h, v27.4s, #16 + smull v26.4s, v18.4h, v21.4h + smull2 v27.4s, v18.8h, v21.8h + smlal v26.4s, v19.4h, v20.4h + smlal2 v27.4s, v19.8h, v20.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v23.4h, v26.4s, #16 + shrn2 v23.8h, v27.4s, #16 + zip1 v24.8h, v22.8h, v23.8h + zip2 v25.8h, v22.8h, v23.8h + add v28.8h, v28.8h, v24.8h + add v29.8h, v29.8h, v25.8h + stp q28, q29, [x0, #384] + ldp q28, q29, [x0, #416] + ldr q0, [x3, #208] + uzp1 v18.8h, v4.8h, v5.8h + uzp2 v19.8h, v4.8h, v5.8h + uzp1 v20.8h, v12.8h, v13.8h + uzp2 v21.8h, v12.8h, v13.8h + smull v26.4s, v18.4h, v20.4h + smull2 v27.4s, v18.8h, v20.8h + smull v23.4s, v19.4h, v21.4h + smull2 v24.4s, v19.8h, v21.8h + xtn v25.4h, v23.4s + xtn2 v25.8h, v24.4s + mul v25.8h, v25.8h, v1.h[1] + smlsl v23.4s, v25.4h, v1.h[0] + smlsl2 v24.4s, v25.8h, v1.h[0] + shrn v22.4h, v23.4s, #16 + shrn2 v22.8h, v24.4s, #16 + smlal v26.4s, v22.4h, v0.4h + smlal2 v27.4s, v22.8h, v0.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v22.4h, v26.4s, #16 + shrn2 v22.8h, v27.4s, #16 + smull v26.4s, v18.4h, v21.4h + smull2 
v27.4s, v18.8h, v21.8h + smlal v26.4s, v19.4h, v20.4h + smlal2 v27.4s, v19.8h, v20.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v23.4h, v26.4s, #16 + shrn2 v23.8h, v27.4s, #16 + zip1 v24.8h, v22.8h, v23.8h + zip2 v25.8h, v22.8h, v23.8h + add v28.8h, v28.8h, v24.8h + add v29.8h, v29.8h, v25.8h + stp q28, q29, [x0, #416] + ldp q28, q29, [x0, #448] + ldr q0, [x3, #224] + uzp1 v18.8h, v6.8h, v7.8h + uzp2 v19.8h, v6.8h, v7.8h + uzp1 v20.8h, v14.8h, v15.8h + uzp2 v21.8h, v14.8h, v15.8h + smull v26.4s, v18.4h, v20.4h + smull2 v27.4s, v18.8h, v20.8h + smull v23.4s, v19.4h, v21.4h + smull2 v24.4s, v19.8h, v21.8h + xtn v25.4h, v23.4s + xtn2 v25.8h, v24.4s + mul v25.8h, v25.8h, v1.h[1] + smlsl v23.4s, v25.4h, v1.h[0] + smlsl2 v24.4s, v25.8h, v1.h[0] + shrn v22.4h, v23.4s, #16 + shrn2 v22.8h, v24.4s, #16 + smlal v26.4s, v22.4h, v0.4h + smlal2 v27.4s, v22.8h, v0.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v22.4h, v26.4s, #16 + shrn2 v22.8h, v27.4s, #16 + smull v26.4s, v18.4h, v21.4h + smull2 v27.4s, v18.8h, v21.8h + smlal v26.4s, v19.4h, v20.4h + smlal2 v27.4s, v19.8h, v20.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v23.4h, v26.4s, #16 + shrn2 v23.8h, v27.4s, #16 + zip1 v24.8h, v22.8h, v23.8h + zip2 v25.8h, v22.8h, v23.8h + add v28.8h, v28.8h, v24.8h + add v29.8h, v29.8h, v25.8h + stp q28, q29, [x0, #448] + ldp q28, q29, [x0, #480] + ldr q0, [x3, #240] + uzp1 v18.8h, v8.8h, v9.8h + uzp2 v19.8h, v8.8h, v9.8h + uzp1 v20.8h, v16.8h, v17.8h + uzp2 v21.8h, v16.8h, v17.8h + smull v26.4s, v18.4h, v20.4h + smull2 v27.4s, v18.8h, v20.8h + smull v23.4s, v19.4h, v21.4h + smull2 v24.4s, v19.8h, v21.8h + xtn v25.4h, v23.4s + xtn2 v25.8h, v24.4s + mul v25.8h, v25.8h, v1.h[1] + smlsl v23.4s, v25.4h, v1.h[0] + smlsl2 v24.4s, v25.8h, v1.h[0] + shrn v22.4h, v23.4s, #16 + shrn2 v22.8h, v24.4s, #16 + smlal v26.4s, v22.4h, v0.4h + smlal2 v27.4s, v22.8h, v0.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v22.4h, v26.4s, #16 + shrn2 v22.8h, v27.4s, #16 + smull v26.4s, v18.4h, v21.4h + smull2 v27.4s, v18.8h, v21.8h + smlal v26.4s, v19.4h, v20.4h + smlal2 v27.4s, v19.8h, v20.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v23.4h, v26.4s, #16 + shrn2 v23.8h, v27.4s, #16 + zip1 v24.8h, v22.8h, v23.8h + zip2 v25.8h, v22.8h, v23.8h + add v28.8h, v28.8h, v24.8h + add v29.8h, v29.8h, v25.8h + stp q28, q29, [x0, #480] + ldp d8, d9, [x29, #16] + ldp d10, d11, [x29, #32] + ldp d12, d13, [x29, #48] + ldp d14, d15, [x29, #64] + ldp x29, x30, [sp], #0x50 + ret +#ifndef __APPLE__ + .size kyber_basemul_mont_add,.-kyber_basemul_mont_add +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl kyber_csubq_neon +.type kyber_csubq_neon,@function +.align 2 +kyber_csubq_neon: +#else +.section __TEXT,__text +.globl _kyber_csubq_neon +.p2align 2 +_kyber_csubq_neon: +#endif /* __APPLE__ */ + stp x29, x30, [sp, #-80]! 
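kyber_csubq_neon, which follows, applies the standard constant-time conditional subtraction of q to all 256 coefficients: subtract q (loaded from L_kyber_aarch64_q), then add q back wherever the result went negative, which is what the sshr #15 / and / add triplets implement. A scalar equivalent, with illustrative names:

    #include <stdint.h>

    #define KYBER_Q 3329

    /* For a in [0, 2q): returns a - q if a >= q, else a, without branching. */
    static int16_t csubq(int16_t a)
    {
        a -= KYBER_Q;
        a += (a >> 15) & KYBER_Q;  /* (a >> 15) is all ones exactly when the subtraction went negative */
        return a;
    }

    /* kyber_csubq_neon applies this to every coefficient of a 256-entry polynomial. */
    static void poly_csubq(int16_t r[256])
    {
        for (int i = 0; i < 256; i++)
            r[i] = csubq(r[i]);
    }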
+ add x29, sp, #0 + stp d8, d9, [x29, #16] + stp d10, d11, [x29, #32] + stp d12, d13, [x29, #48] + stp d14, d15, [x29, #64] +#ifndef __APPLE__ + adrp x1, L_kyber_aarch64_q + add x1, x1, :lo12:L_kyber_aarch64_q +#else + adrp x1, L_kyber_aarch64_q@PAGE + add x1, x1, :lo12:L_kyber_aarch64_q@PAGEOFF +#endif /* __APPLE__ */ + ldr q20, [x1] + ld4 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #0x40 + ld4 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #0x40 + ld4 {v8.8h, v9.8h, v10.8h, v11.8h}, [x0], #0x40 + ld4 {v12.8h, v13.8h, v14.8h, v15.8h}, [x0], #0x40 + sub x0, x0, #0x100 + sub v0.8h, v0.8h, v20.8h + sub v1.8h, v1.8h, v20.8h + sub v2.8h, v2.8h, v20.8h + sub v3.8h, v3.8h, v20.8h + sub v4.8h, v4.8h, v20.8h + sub v5.8h, v5.8h, v20.8h + sub v6.8h, v6.8h, v20.8h + sub v7.8h, v7.8h, v20.8h + sub v8.8h, v8.8h, v20.8h + sub v9.8h, v9.8h, v20.8h + sub v10.8h, v10.8h, v20.8h + sub v11.8h, v11.8h, v20.8h + sub v12.8h, v12.8h, v20.8h + sub v13.8h, v13.8h, v20.8h + sub v14.8h, v14.8h, v20.8h + sub v15.8h, v15.8h, v20.8h + sshr v16.8h, v0.8h, #15 + sshr v17.8h, v1.8h, #15 + sshr v18.8h, v2.8h, #15 + sshr v19.8h, v3.8h, #15 + and v16.16b, v16.16b, v20.16b + and v17.16b, v17.16b, v20.16b + and v18.16b, v18.16b, v20.16b + and v19.16b, v19.16b, v20.16b + add v0.8h, v0.8h, v16.8h + add v1.8h, v1.8h, v17.8h + add v2.8h, v2.8h, v18.8h + add v3.8h, v3.8h, v19.8h + sshr v16.8h, v4.8h, #15 + sshr v17.8h, v5.8h, #15 + sshr v18.8h, v6.8h, #15 + sshr v19.8h, v7.8h, #15 + and v16.16b, v16.16b, v20.16b + and v17.16b, v17.16b, v20.16b + and v18.16b, v18.16b, v20.16b + and v19.16b, v19.16b, v20.16b + add v4.8h, v4.8h, v16.8h + add v5.8h, v5.8h, v17.8h + add v6.8h, v6.8h, v18.8h + add v7.8h, v7.8h, v19.8h + sshr v16.8h, v8.8h, #15 + sshr v17.8h, v9.8h, #15 + sshr v18.8h, v10.8h, #15 + sshr v19.8h, v11.8h, #15 + and v16.16b, v16.16b, v20.16b + and v17.16b, v17.16b, v20.16b + and v18.16b, v18.16b, v20.16b + and v19.16b, v19.16b, v20.16b + add v8.8h, v8.8h, v16.8h + add v9.8h, v9.8h, v17.8h + add v10.8h, v10.8h, v18.8h + add v11.8h, v11.8h, v19.8h + sshr v16.8h, v12.8h, #15 + sshr v17.8h, v13.8h, #15 + sshr v18.8h, v14.8h, #15 + sshr v19.8h, v15.8h, #15 + and v16.16b, v16.16b, v20.16b + and v17.16b, v17.16b, v20.16b + and v18.16b, v18.16b, v20.16b + and v19.16b, v19.16b, v20.16b + add v12.8h, v12.8h, v16.8h + add v13.8h, v13.8h, v17.8h + add v14.8h, v14.8h, v18.8h + add v15.8h, v15.8h, v19.8h + st4 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #0x40 + st4 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #0x40 + st4 {v8.8h, v9.8h, v10.8h, v11.8h}, [x0], #0x40 + st4 {v12.8h, v13.8h, v14.8h, v15.8h}, [x0], #0x40 + ld4 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #0x40 + ld4 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #0x40 + ld4 {v8.8h, v9.8h, v10.8h, v11.8h}, [x0], #0x40 + ld4 {v12.8h, v13.8h, v14.8h, v15.8h}, [x0], #0x40 + sub x0, x0, #0x100 + sub v0.8h, v0.8h, v20.8h + sub v1.8h, v1.8h, v20.8h + sub v2.8h, v2.8h, v20.8h + sub v3.8h, v3.8h, v20.8h + sub v4.8h, v4.8h, v20.8h + sub v5.8h, v5.8h, v20.8h + sub v6.8h, v6.8h, v20.8h + sub v7.8h, v7.8h, v20.8h + sub v8.8h, v8.8h, v20.8h + sub v9.8h, v9.8h, v20.8h + sub v10.8h, v10.8h, v20.8h + sub v11.8h, v11.8h, v20.8h + sub v12.8h, v12.8h, v20.8h + sub v13.8h, v13.8h, v20.8h + sub v14.8h, v14.8h, v20.8h + sub v15.8h, v15.8h, v20.8h + sshr v16.8h, v0.8h, #15 + sshr v17.8h, v1.8h, #15 + sshr v18.8h, v2.8h, #15 + sshr v19.8h, v3.8h, #15 + and v16.16b, v16.16b, v20.16b + and v17.16b, v17.16b, v20.16b + and v18.16b, v18.16b, v20.16b + and v19.16b, v19.16b, v20.16b + add v0.8h, v0.8h, v16.8h + add v1.8h, v1.8h, v17.8h + add v2.8h, v2.8h, v18.8h + 
add v3.8h, v3.8h, v19.8h + sshr v16.8h, v4.8h, #15 + sshr v17.8h, v5.8h, #15 + sshr v18.8h, v6.8h, #15 + sshr v19.8h, v7.8h, #15 + and v16.16b, v16.16b, v20.16b + and v17.16b, v17.16b, v20.16b + and v18.16b, v18.16b, v20.16b + and v19.16b, v19.16b, v20.16b + add v4.8h, v4.8h, v16.8h + add v5.8h, v5.8h, v17.8h + add v6.8h, v6.8h, v18.8h + add v7.8h, v7.8h, v19.8h + sshr v16.8h, v8.8h, #15 + sshr v17.8h, v9.8h, #15 + sshr v18.8h, v10.8h, #15 + sshr v19.8h, v11.8h, #15 + and v16.16b, v16.16b, v20.16b + and v17.16b, v17.16b, v20.16b + and v18.16b, v18.16b, v20.16b + and v19.16b, v19.16b, v20.16b + add v8.8h, v8.8h, v16.8h + add v9.8h, v9.8h, v17.8h + add v10.8h, v10.8h, v18.8h + add v11.8h, v11.8h, v19.8h + sshr v16.8h, v12.8h, #15 + sshr v17.8h, v13.8h, #15 + sshr v18.8h, v14.8h, #15 + sshr v19.8h, v15.8h, #15 + and v16.16b, v16.16b, v20.16b + and v17.16b, v17.16b, v20.16b + and v18.16b, v18.16b, v20.16b + and v19.16b, v19.16b, v20.16b + add v12.8h, v12.8h, v16.8h + add v13.8h, v13.8h, v17.8h + add v14.8h, v14.8h, v18.8h + add v15.8h, v15.8h, v19.8h + st4 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #0x40 + st4 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #0x40 + st4 {v8.8h, v9.8h, v10.8h, v11.8h}, [x0], #0x40 + st4 {v12.8h, v13.8h, v14.8h, v15.8h}, [x0], #0x40 + ldp d8, d9, [x29, #16] + ldp d10, d11, [x29, #32] + ldp d12, d13, [x29, #48] + ldp d14, d15, [x29, #64] + ldp x29, x30, [sp], #0x50 + ret +#ifndef __APPLE__ + .size kyber_csubq_neon,.-kyber_csubq_neon +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl kyber_add_reduce +.type kyber_add_reduce,@function +.align 2 +kyber_add_reduce: +#else +.section __TEXT,__text +.globl _kyber_add_reduce +.p2align 2 +_kyber_add_reduce: +#endif /* __APPLE__ */ + stp x29, x30, [sp, #-80]! + add x29, sp, #0 + stp d8, d9, [x29, #16] + stp d10, d11, [x29, #32] + stp d12, d13, [x29, #48] + stp d14, d15, [x29, #64] +#ifndef __APPLE__ + adrp x2, L_kyber_aarch64_consts + add x2, x2, :lo12:L_kyber_aarch64_consts +#else + adrp x2, L_kyber_aarch64_consts@PAGE + add x2, x2, :lo12:L_kyber_aarch64_consts@PAGEOFF +#endif /* __APPLE__ */ + ldr q0, [x2] + ld4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x0], #0x40 + ld4 {v5.8h, v6.8h, v7.8h, v8.8h}, [x0], #0x40 + ld4 {v9.8h, v10.8h, v11.8h, v12.8h}, [x1], #0x40 + ld4 {v13.8h, v14.8h, v15.8h, v16.8h}, [x1], #0x40 + sub x0, x0, #0x80 + add v1.8h, v1.8h, v9.8h + add v2.8h, v2.8h, v10.8h + add v3.8h, v3.8h, v11.8h + add v4.8h, v4.8h, v12.8h + add v5.8h, v5.8h, v13.8h + add v6.8h, v6.8h, v14.8h + add v7.8h, v7.8h, v15.8h + add v8.8h, v8.8h, v16.8h + sqdmulh v17.8h, v1.8h, v0.h[2] + sqdmulh v18.8h, v2.8h, v0.h[2] + sshr v17.8h, v17.8h, #11 + sshr v18.8h, v18.8h, #11 + mls v1.8h, v17.8h, v0.h[0] + mls v2.8h, v18.8h, v0.h[0] + sqdmulh v17.8h, v3.8h, v0.h[2] + sqdmulh v18.8h, v4.8h, v0.h[2] + sshr v17.8h, v17.8h, #11 + sshr v18.8h, v18.8h, #11 + mls v3.8h, v17.8h, v0.h[0] + mls v4.8h, v18.8h, v0.h[0] + sqdmulh v17.8h, v5.8h, v0.h[2] + sqdmulh v18.8h, v6.8h, v0.h[2] + sshr v17.8h, v17.8h, #11 + sshr v18.8h, v18.8h, #11 + mls v5.8h, v17.8h, v0.h[0] + mls v6.8h, v18.8h, v0.h[0] + sqdmulh v17.8h, v7.8h, v0.h[2] + sqdmulh v18.8h, v8.8h, v0.h[2] + sshr v17.8h, v17.8h, #11 + sshr v18.8h, v18.8h, #11 + mls v7.8h, v17.8h, v0.h[0] + mls v8.8h, v18.8h, v0.h[0] + st4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x0], #0x40 + st4 {v5.8h, v6.8h, v7.8h, v8.8h}, [x0], #0x40 + ld4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x0], #0x40 + ld4 {v5.8h, v6.8h, v7.8h, v8.8h}, [x0], #0x40 + ld4 {v9.8h, v10.8h, v11.8h, v12.8h}, [x1], #0x40 + ld4 {v13.8h, v14.8h, v15.8h, v16.8h}, [x1], #0x40 + sub x0, x0, 
#0x80 + add v1.8h, v1.8h, v9.8h + add v2.8h, v2.8h, v10.8h + add v3.8h, v3.8h, v11.8h + add v4.8h, v4.8h, v12.8h + add v5.8h, v5.8h, v13.8h + add v6.8h, v6.8h, v14.8h + add v7.8h, v7.8h, v15.8h + add v8.8h, v8.8h, v16.8h + sqdmulh v17.8h, v1.8h, v0.h[2] + sqdmulh v18.8h, v2.8h, v0.h[2] + sshr v17.8h, v17.8h, #11 + sshr v18.8h, v18.8h, #11 + mls v1.8h, v17.8h, v0.h[0] + mls v2.8h, v18.8h, v0.h[0] + sqdmulh v17.8h, v3.8h, v0.h[2] + sqdmulh v18.8h, v4.8h, v0.h[2] + sshr v17.8h, v17.8h, #11 + sshr v18.8h, v18.8h, #11 + mls v3.8h, v17.8h, v0.h[0] + mls v4.8h, v18.8h, v0.h[0] + sqdmulh v17.8h, v5.8h, v0.h[2] + sqdmulh v18.8h, v6.8h, v0.h[2] + sshr v17.8h, v17.8h, #11 + sshr v18.8h, v18.8h, #11 + mls v5.8h, v17.8h, v0.h[0] + mls v6.8h, v18.8h, v0.h[0] + sqdmulh v17.8h, v7.8h, v0.h[2] + sqdmulh v18.8h, v8.8h, v0.h[2] + sshr v17.8h, v17.8h, #11 + sshr v18.8h, v18.8h, #11 + mls v7.8h, v17.8h, v0.h[0] + mls v8.8h, v18.8h, v0.h[0] + st4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x0], #0x40 + st4 {v5.8h, v6.8h, v7.8h, v8.8h}, [x0], #0x40 + ld4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x0], #0x40 + ld4 {v5.8h, v6.8h, v7.8h, v8.8h}, [x0], #0x40 + ld4 {v9.8h, v10.8h, v11.8h, v12.8h}, [x1], #0x40 + ld4 {v13.8h, v14.8h, v15.8h, v16.8h}, [x1], #0x40 + sub x0, x0, #0x80 + add v1.8h, v1.8h, v9.8h + add v2.8h, v2.8h, v10.8h + add v3.8h, v3.8h, v11.8h + add v4.8h, v4.8h, v12.8h + add v5.8h, v5.8h, v13.8h + add v6.8h, v6.8h, v14.8h + add v7.8h, v7.8h, v15.8h + add v8.8h, v8.8h, v16.8h + sqdmulh v17.8h, v1.8h, v0.h[2] + sqdmulh v18.8h, v2.8h, v0.h[2] + sshr v17.8h, v17.8h, #11 + sshr v18.8h, v18.8h, #11 + mls v1.8h, v17.8h, v0.h[0] + mls v2.8h, v18.8h, v0.h[0] + sqdmulh v17.8h, v3.8h, v0.h[2] + sqdmulh v18.8h, v4.8h, v0.h[2] + sshr v17.8h, v17.8h, #11 + sshr v18.8h, v18.8h, #11 + mls v3.8h, v17.8h, v0.h[0] + mls v4.8h, v18.8h, v0.h[0] + sqdmulh v17.8h, v5.8h, v0.h[2] + sqdmulh v18.8h, v6.8h, v0.h[2] + sshr v17.8h, v17.8h, #11 + sshr v18.8h, v18.8h, #11 + mls v5.8h, v17.8h, v0.h[0] + mls v6.8h, v18.8h, v0.h[0] + sqdmulh v17.8h, v7.8h, v0.h[2] + sqdmulh v18.8h, v8.8h, v0.h[2] + sshr v17.8h, v17.8h, #11 + sshr v18.8h, v18.8h, #11 + mls v7.8h, v17.8h, v0.h[0] + mls v8.8h, v18.8h, v0.h[0] + st4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x0], #0x40 + st4 {v5.8h, v6.8h, v7.8h, v8.8h}, [x0], #0x40 + ld4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x0], #0x40 + ld4 {v5.8h, v6.8h, v7.8h, v8.8h}, [x0], #0x40 + ld4 {v9.8h, v10.8h, v11.8h, v12.8h}, [x1], #0x40 + ld4 {v13.8h, v14.8h, v15.8h, v16.8h}, [x1], #0x40 + sub x0, x0, #0x80 + add v1.8h, v1.8h, v9.8h + add v2.8h, v2.8h, v10.8h + add v3.8h, v3.8h, v11.8h + add v4.8h, v4.8h, v12.8h + add v5.8h, v5.8h, v13.8h + add v6.8h, v6.8h, v14.8h + add v7.8h, v7.8h, v15.8h + add v8.8h, v8.8h, v16.8h + sqdmulh v17.8h, v1.8h, v0.h[2] + sqdmulh v18.8h, v2.8h, v0.h[2] + sshr v17.8h, v17.8h, #11 + sshr v18.8h, v18.8h, #11 + mls v1.8h, v17.8h, v0.h[0] + mls v2.8h, v18.8h, v0.h[0] + sqdmulh v17.8h, v3.8h, v0.h[2] + sqdmulh v18.8h, v4.8h, v0.h[2] + sshr v17.8h, v17.8h, #11 + sshr v18.8h, v18.8h, #11 + mls v3.8h, v17.8h, v0.h[0] + mls v4.8h, v18.8h, v0.h[0] + sqdmulh v17.8h, v5.8h, v0.h[2] + sqdmulh v18.8h, v6.8h, v0.h[2] + sshr v17.8h, v17.8h, #11 + sshr v18.8h, v18.8h, #11 + mls v5.8h, v17.8h, v0.h[0] + mls v6.8h, v18.8h, v0.h[0] + sqdmulh v17.8h, v7.8h, v0.h[2] + sqdmulh v18.8h, v8.8h, v0.h[2] + sshr v17.8h, v17.8h, #11 + sshr v18.8h, v18.8h, #11 + mls v7.8h, v17.8h, v0.h[0] + mls v8.8h, v18.8h, v0.h[0] + st4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x0], #0x40 + st4 {v5.8h, v6.8h, v7.8h, v8.8h}, [x0], #0x40 + ldp d8, d9, [x29, #16] + ldp d10, d11, 
[x29, #32] + ldp d12, d13, [x29, #48] + ldp d14, d15, [x29, #64] + ldp x29, x30, [sp], #0x50 + ret +#ifndef __APPLE__ + .size kyber_add_reduce,.-kyber_add_reduce +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl kyber_add3_reduce +.type kyber_add3_reduce,@function +.align 2 +kyber_add3_reduce: +#else +.section __TEXT,__text +.globl _kyber_add3_reduce +.p2align 2 +_kyber_add3_reduce: +#endif /* __APPLE__ */ + stp x29, x30, [sp, #-80]! + add x29, sp, #0 + stp d8, d9, [x29, #16] + stp d10, d11, [x29, #32] + stp d12, d13, [x29, #48] + stp d14, d15, [x29, #64] +#ifndef __APPLE__ + adrp x3, L_kyber_aarch64_consts + add x3, x3, :lo12:L_kyber_aarch64_consts +#else + adrp x3, L_kyber_aarch64_consts@PAGE + add x3, x3, :lo12:L_kyber_aarch64_consts@PAGEOFF +#endif /* __APPLE__ */ + ldr q0, [x3] + ld4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x0], #0x40 + ld4 {v5.8h, v6.8h, v7.8h, v8.8h}, [x0], #0x40 + ld4 {v9.8h, v10.8h, v11.8h, v12.8h}, [x1], #0x40 + ld4 {v13.8h, v14.8h, v15.8h, v16.8h}, [x1], #0x40 + ld4 {v17.8h, v18.8h, v19.8h, v20.8h}, [x2], #0x40 + ld4 {v21.8h, v22.8h, v23.8h, v24.8h}, [x2], #0x40 + sub x0, x0, #0x80 + add v1.8h, v1.8h, v9.8h + add v2.8h, v2.8h, v10.8h + add v3.8h, v3.8h, v11.8h + add v4.8h, v4.8h, v12.8h + add v5.8h, v5.8h, v13.8h + add v6.8h, v6.8h, v14.8h + add v7.8h, v7.8h, v15.8h + add v8.8h, v8.8h, v16.8h + add v1.8h, v1.8h, v17.8h + add v2.8h, v2.8h, v18.8h + add v3.8h, v3.8h, v19.8h + add v4.8h, v4.8h, v20.8h + add v5.8h, v5.8h, v21.8h + add v6.8h, v6.8h, v22.8h + add v7.8h, v7.8h, v23.8h + add v8.8h, v8.8h, v24.8h + sqdmulh v25.8h, v1.8h, v0.h[2] + sqdmulh v26.8h, v2.8h, v0.h[2] + sshr v25.8h, v25.8h, #11 + sshr v26.8h, v26.8h, #11 + mls v1.8h, v25.8h, v0.h[0] + mls v2.8h, v26.8h, v0.h[0] + sqdmulh v25.8h, v3.8h, v0.h[2] + sqdmulh v26.8h, v4.8h, v0.h[2] + sshr v25.8h, v25.8h, #11 + sshr v26.8h, v26.8h, #11 + mls v3.8h, v25.8h, v0.h[0] + mls v4.8h, v26.8h, v0.h[0] + sqdmulh v25.8h, v5.8h, v0.h[2] + sqdmulh v26.8h, v6.8h, v0.h[2] + sshr v25.8h, v25.8h, #11 + sshr v26.8h, v26.8h, #11 + mls v5.8h, v25.8h, v0.h[0] + mls v6.8h, v26.8h, v0.h[0] + sqdmulh v25.8h, v7.8h, v0.h[2] + sqdmulh v26.8h, v8.8h, v0.h[2] + sshr v25.8h, v25.8h, #11 + sshr v26.8h, v26.8h, #11 + mls v7.8h, v25.8h, v0.h[0] + mls v8.8h, v26.8h, v0.h[0] + st4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x0], #0x40 + st4 {v5.8h, v6.8h, v7.8h, v8.8h}, [x0], #0x40 + ld4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x0], #0x40 + ld4 {v5.8h, v6.8h, v7.8h, v8.8h}, [x0], #0x40 + ld4 {v9.8h, v10.8h, v11.8h, v12.8h}, [x1], #0x40 + ld4 {v13.8h, v14.8h, v15.8h, v16.8h}, [x1], #0x40 + ld4 {v17.8h, v18.8h, v19.8h, v20.8h}, [x2], #0x40 + ld4 {v21.8h, v22.8h, v23.8h, v24.8h}, [x2], #0x40 + sub x0, x0, #0x80 + add v1.8h, v1.8h, v9.8h + add v2.8h, v2.8h, v10.8h + add v3.8h, v3.8h, v11.8h + add v4.8h, v4.8h, v12.8h + add v5.8h, v5.8h, v13.8h + add v6.8h, v6.8h, v14.8h + add v7.8h, v7.8h, v15.8h + add v8.8h, v8.8h, v16.8h + add v1.8h, v1.8h, v17.8h + add v2.8h, v2.8h, v18.8h + add v3.8h, v3.8h, v19.8h + add v4.8h, v4.8h, v20.8h + add v5.8h, v5.8h, v21.8h + add v6.8h, v6.8h, v22.8h + add v7.8h, v7.8h, v23.8h + add v8.8h, v8.8h, v24.8h + sqdmulh v25.8h, v1.8h, v0.h[2] + sqdmulh v26.8h, v2.8h, v0.h[2] + sshr v25.8h, v25.8h, #11 + sshr v26.8h, v26.8h, #11 + mls v1.8h, v25.8h, v0.h[0] + mls v2.8h, v26.8h, v0.h[0] + sqdmulh v25.8h, v3.8h, v0.h[2] + sqdmulh v26.8h, v4.8h, v0.h[2] + sshr v25.8h, v25.8h, #11 + sshr v26.8h, v26.8h, #11 + mls v3.8h, v25.8h, v0.h[0] + mls v4.8h, v26.8h, v0.h[0] + sqdmulh v25.8h, v5.8h, v0.h[2] + sqdmulh v26.8h, v6.8h, v0.h[2] + sshr v25.8h, 
v25.8h, #11 + sshr v26.8h, v26.8h, #11 + mls v5.8h, v25.8h, v0.h[0] + mls v6.8h, v26.8h, v0.h[0] + sqdmulh v25.8h, v7.8h, v0.h[2] + sqdmulh v26.8h, v8.8h, v0.h[2] + sshr v25.8h, v25.8h, #11 + sshr v26.8h, v26.8h, #11 + mls v7.8h, v25.8h, v0.h[0] + mls v8.8h, v26.8h, v0.h[0] + st4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x0], #0x40 + st4 {v5.8h, v6.8h, v7.8h, v8.8h}, [x0], #0x40 + ld4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x0], #0x40 + ld4 {v5.8h, v6.8h, v7.8h, v8.8h}, [x0], #0x40 + ld4 {v9.8h, v10.8h, v11.8h, v12.8h}, [x1], #0x40 + ld4 {v13.8h, v14.8h, v15.8h, v16.8h}, [x1], #0x40 + ld4 {v17.8h, v18.8h, v19.8h, v20.8h}, [x2], #0x40 + ld4 {v21.8h, v22.8h, v23.8h, v24.8h}, [x2], #0x40 + sub x0, x0, #0x80 + add v1.8h, v1.8h, v9.8h + add v2.8h, v2.8h, v10.8h + add v3.8h, v3.8h, v11.8h + add v4.8h, v4.8h, v12.8h + add v5.8h, v5.8h, v13.8h + add v6.8h, v6.8h, v14.8h + add v7.8h, v7.8h, v15.8h + add v8.8h, v8.8h, v16.8h + add v1.8h, v1.8h, v17.8h + add v2.8h, v2.8h, v18.8h + add v3.8h, v3.8h, v19.8h + add v4.8h, v4.8h, v20.8h + add v5.8h, v5.8h, v21.8h + add v6.8h, v6.8h, v22.8h + add v7.8h, v7.8h, v23.8h + add v8.8h, v8.8h, v24.8h + sqdmulh v25.8h, v1.8h, v0.h[2] + sqdmulh v26.8h, v2.8h, v0.h[2] + sshr v25.8h, v25.8h, #11 + sshr v26.8h, v26.8h, #11 + mls v1.8h, v25.8h, v0.h[0] + mls v2.8h, v26.8h, v0.h[0] + sqdmulh v25.8h, v3.8h, v0.h[2] + sqdmulh v26.8h, v4.8h, v0.h[2] + sshr v25.8h, v25.8h, #11 + sshr v26.8h, v26.8h, #11 + mls v3.8h, v25.8h, v0.h[0] + mls v4.8h, v26.8h, v0.h[0] + sqdmulh v25.8h, v5.8h, v0.h[2] + sqdmulh v26.8h, v6.8h, v0.h[2] + sshr v25.8h, v25.8h, #11 + sshr v26.8h, v26.8h, #11 + mls v5.8h, v25.8h, v0.h[0] + mls v6.8h, v26.8h, v0.h[0] + sqdmulh v25.8h, v7.8h, v0.h[2] + sqdmulh v26.8h, v8.8h, v0.h[2] + sshr v25.8h, v25.8h, #11 + sshr v26.8h, v26.8h, #11 + mls v7.8h, v25.8h, v0.h[0] + mls v8.8h, v26.8h, v0.h[0] + st4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x0], #0x40 + st4 {v5.8h, v6.8h, v7.8h, v8.8h}, [x0], #0x40 + ld4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x0], #0x40 + ld4 {v5.8h, v6.8h, v7.8h, v8.8h}, [x0], #0x40 + ld4 {v9.8h, v10.8h, v11.8h, v12.8h}, [x1], #0x40 + ld4 {v13.8h, v14.8h, v15.8h, v16.8h}, [x1], #0x40 + ld4 {v17.8h, v18.8h, v19.8h, v20.8h}, [x2], #0x40 + ld4 {v21.8h, v22.8h, v23.8h, v24.8h}, [x2], #0x40 + sub x0, x0, #0x80 + add v1.8h, v1.8h, v9.8h + add v2.8h, v2.8h, v10.8h + add v3.8h, v3.8h, v11.8h + add v4.8h, v4.8h, v12.8h + add v5.8h, v5.8h, v13.8h + add v6.8h, v6.8h, v14.8h + add v7.8h, v7.8h, v15.8h + add v8.8h, v8.8h, v16.8h + add v1.8h, v1.8h, v17.8h + add v2.8h, v2.8h, v18.8h + add v3.8h, v3.8h, v19.8h + add v4.8h, v4.8h, v20.8h + add v5.8h, v5.8h, v21.8h + add v6.8h, v6.8h, v22.8h + add v7.8h, v7.8h, v23.8h + add v8.8h, v8.8h, v24.8h + sqdmulh v25.8h, v1.8h, v0.h[2] + sqdmulh v26.8h, v2.8h, v0.h[2] + sshr v25.8h, v25.8h, #11 + sshr v26.8h, v26.8h, #11 + mls v1.8h, v25.8h, v0.h[0] + mls v2.8h, v26.8h, v0.h[0] + sqdmulh v25.8h, v3.8h, v0.h[2] + sqdmulh v26.8h, v4.8h, v0.h[2] + sshr v25.8h, v25.8h, #11 + sshr v26.8h, v26.8h, #11 + mls v3.8h, v25.8h, v0.h[0] + mls v4.8h, v26.8h, v0.h[0] + sqdmulh v25.8h, v5.8h, v0.h[2] + sqdmulh v26.8h, v6.8h, v0.h[2] + sshr v25.8h, v25.8h, #11 + sshr v26.8h, v26.8h, #11 + mls v5.8h, v25.8h, v0.h[0] + mls v6.8h, v26.8h, v0.h[0] + sqdmulh v25.8h, v7.8h, v0.h[2] + sqdmulh v26.8h, v8.8h, v0.h[2] + sshr v25.8h, v25.8h, #11 + sshr v26.8h, v26.8h, #11 + mls v7.8h, v25.8h, v0.h[0] + mls v8.8h, v26.8h, v0.h[0] + st4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x0], #0x40 + st4 {v5.8h, v6.8h, v7.8h, v8.8h}, [x0], #0x40 + ldp d8, d9, [x29, #16] + ldp d10, d11, [x29, #32] + ldp 
d12, d13, [x29, #48] + ldp d14, d15, [x29, #64] + ldp x29, x30, [sp], #0x50 + ret +#ifndef __APPLE__ + .size kyber_add3_reduce,.-kyber_add3_reduce +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl kyber_rsub_reduce +.type kyber_rsub_reduce,@function +.align 2 +kyber_rsub_reduce: +#else +.section __TEXT,__text +.globl _kyber_rsub_reduce +.p2align 2 +_kyber_rsub_reduce: +#endif /* __APPLE__ */ + stp x29, x30, [sp, #-80]! + add x29, sp, #0 + stp d8, d9, [x29, #16] + stp d10, d11, [x29, #32] + stp d12, d13, [x29, #48] + stp d14, d15, [x29, #64] +#ifndef __APPLE__ + adrp x2, L_kyber_aarch64_consts + add x2, x2, :lo12:L_kyber_aarch64_consts +#else + adrp x2, L_kyber_aarch64_consts@PAGE + add x2, x2, :lo12:L_kyber_aarch64_consts@PAGEOFF +#endif /* __APPLE__ */ + ldr q0, [x2] + ld4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x0], #0x40 + ld4 {v5.8h, v6.8h, v7.8h, v8.8h}, [x0], #0x40 + ld4 {v9.8h, v10.8h, v11.8h, v12.8h}, [x1], #0x40 + ld4 {v13.8h, v14.8h, v15.8h, v16.8h}, [x1], #0x40 + sub x0, x0, #0x80 + sub v1.8h, v9.8h, v1.8h + sub v2.8h, v10.8h, v2.8h + sub v3.8h, v11.8h, v3.8h + sub v4.8h, v12.8h, v4.8h + sub v5.8h, v13.8h, v5.8h + sub v6.8h, v14.8h, v6.8h + sub v7.8h, v15.8h, v7.8h + sub v8.8h, v16.8h, v8.8h + sqdmulh v17.8h, v1.8h, v0.h[2] + sqdmulh v18.8h, v2.8h, v0.h[2] + sshr v17.8h, v17.8h, #11 + sshr v18.8h, v18.8h, #11 + mls v1.8h, v17.8h, v0.h[0] + mls v2.8h, v18.8h, v0.h[0] + sqdmulh v17.8h, v3.8h, v0.h[2] + sqdmulh v18.8h, v4.8h, v0.h[2] + sshr v17.8h, v17.8h, #11 + sshr v18.8h, v18.8h, #11 + mls v3.8h, v17.8h, v0.h[0] + mls v4.8h, v18.8h, v0.h[0] + sqdmulh v17.8h, v5.8h, v0.h[2] + sqdmulh v18.8h, v6.8h, v0.h[2] + sshr v17.8h, v17.8h, #11 + sshr v18.8h, v18.8h, #11 + mls v5.8h, v17.8h, v0.h[0] + mls v6.8h, v18.8h, v0.h[0] + sqdmulh v17.8h, v7.8h, v0.h[2] + sqdmulh v18.8h, v8.8h, v0.h[2] + sshr v17.8h, v17.8h, #11 + sshr v18.8h, v18.8h, #11 + mls v7.8h, v17.8h, v0.h[0] + mls v8.8h, v18.8h, v0.h[0] + st4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x0], #0x40 + st4 {v5.8h, v6.8h, v7.8h, v8.8h}, [x0], #0x40 + ld4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x0], #0x40 + ld4 {v5.8h, v6.8h, v7.8h, v8.8h}, [x0], #0x40 + ld4 {v9.8h, v10.8h, v11.8h, v12.8h}, [x1], #0x40 + ld4 {v13.8h, v14.8h, v15.8h, v16.8h}, [x1], #0x40 + sub x0, x0, #0x80 + sub v1.8h, v9.8h, v1.8h + sub v2.8h, v10.8h, v2.8h + sub v3.8h, v11.8h, v3.8h + sub v4.8h, v12.8h, v4.8h + sub v5.8h, v13.8h, v5.8h + sub v6.8h, v14.8h, v6.8h + sub v7.8h, v15.8h, v7.8h + sub v8.8h, v16.8h, v8.8h + sqdmulh v17.8h, v1.8h, v0.h[2] + sqdmulh v18.8h, v2.8h, v0.h[2] + sshr v17.8h, v17.8h, #11 + sshr v18.8h, v18.8h, #11 + mls v1.8h, v17.8h, v0.h[0] + mls v2.8h, v18.8h, v0.h[0] + sqdmulh v17.8h, v3.8h, v0.h[2] + sqdmulh v18.8h, v4.8h, v0.h[2] + sshr v17.8h, v17.8h, #11 + sshr v18.8h, v18.8h, #11 + mls v3.8h, v17.8h, v0.h[0] + mls v4.8h, v18.8h, v0.h[0] + sqdmulh v17.8h, v5.8h, v0.h[2] + sqdmulh v18.8h, v6.8h, v0.h[2] + sshr v17.8h, v17.8h, #11 + sshr v18.8h, v18.8h, #11 + mls v5.8h, v17.8h, v0.h[0] + mls v6.8h, v18.8h, v0.h[0] + sqdmulh v17.8h, v7.8h, v0.h[2] + sqdmulh v18.8h, v8.8h, v0.h[2] + sshr v17.8h, v17.8h, #11 + sshr v18.8h, v18.8h, #11 + mls v7.8h, v17.8h, v0.h[0] + mls v8.8h, v18.8h, v0.h[0] + st4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x0], #0x40 + st4 {v5.8h, v6.8h, v7.8h, v8.8h}, [x0], #0x40 + ld4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x0], #0x40 + ld4 {v5.8h, v6.8h, v7.8h, v8.8h}, [x0], #0x40 + ld4 {v9.8h, v10.8h, v11.8h, v12.8h}, [x1], #0x40 + ld4 {v13.8h, v14.8h, v15.8h, v16.8h}, [x1], #0x40 + sub x0, x0, #0x80 + sub v1.8h, v9.8h, v1.8h + sub v2.8h, v10.8h, v2.8h + sub 
v3.8h, v11.8h, v3.8h + sub v4.8h, v12.8h, v4.8h + sub v5.8h, v13.8h, v5.8h + sub v6.8h, v14.8h, v6.8h + sub v7.8h, v15.8h, v7.8h + sub v8.8h, v16.8h, v8.8h + sqdmulh v17.8h, v1.8h, v0.h[2] + sqdmulh v18.8h, v2.8h, v0.h[2] + sshr v17.8h, v17.8h, #11 + sshr v18.8h, v18.8h, #11 + mls v1.8h, v17.8h, v0.h[0] + mls v2.8h, v18.8h, v0.h[0] + sqdmulh v17.8h, v3.8h, v0.h[2] + sqdmulh v18.8h, v4.8h, v0.h[2] + sshr v17.8h, v17.8h, #11 + sshr v18.8h, v18.8h, #11 + mls v3.8h, v17.8h, v0.h[0] + mls v4.8h, v18.8h, v0.h[0] + sqdmulh v17.8h, v5.8h, v0.h[2] + sqdmulh v18.8h, v6.8h, v0.h[2] + sshr v17.8h, v17.8h, #11 + sshr v18.8h, v18.8h, #11 + mls v5.8h, v17.8h, v0.h[0] + mls v6.8h, v18.8h, v0.h[0] + sqdmulh v17.8h, v7.8h, v0.h[2] + sqdmulh v18.8h, v8.8h, v0.h[2] + sshr v17.8h, v17.8h, #11 + sshr v18.8h, v18.8h, #11 + mls v7.8h, v17.8h, v0.h[0] + mls v8.8h, v18.8h, v0.h[0] + st4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x0], #0x40 + st4 {v5.8h, v6.8h, v7.8h, v8.8h}, [x0], #0x40 + ld4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x0], #0x40 + ld4 {v5.8h, v6.8h, v7.8h, v8.8h}, [x0], #0x40 + ld4 {v9.8h, v10.8h, v11.8h, v12.8h}, [x1], #0x40 + ld4 {v13.8h, v14.8h, v15.8h, v16.8h}, [x1], #0x40 + sub x0, x0, #0x80 + sub v1.8h, v9.8h, v1.8h + sub v2.8h, v10.8h, v2.8h + sub v3.8h, v11.8h, v3.8h + sub v4.8h, v12.8h, v4.8h + sub v5.8h, v13.8h, v5.8h + sub v6.8h, v14.8h, v6.8h + sub v7.8h, v15.8h, v7.8h + sub v8.8h, v16.8h, v8.8h + sqdmulh v17.8h, v1.8h, v0.h[2] + sqdmulh v18.8h, v2.8h, v0.h[2] + sshr v17.8h, v17.8h, #11 + sshr v18.8h, v18.8h, #11 + mls v1.8h, v17.8h, v0.h[0] + mls v2.8h, v18.8h, v0.h[0] + sqdmulh v17.8h, v3.8h, v0.h[2] + sqdmulh v18.8h, v4.8h, v0.h[2] + sshr v17.8h, v17.8h, #11 + sshr v18.8h, v18.8h, #11 + mls v3.8h, v17.8h, v0.h[0] + mls v4.8h, v18.8h, v0.h[0] + sqdmulh v17.8h, v5.8h, v0.h[2] + sqdmulh v18.8h, v6.8h, v0.h[2] + sshr v17.8h, v17.8h, #11 + sshr v18.8h, v18.8h, #11 + mls v5.8h, v17.8h, v0.h[0] + mls v6.8h, v18.8h, v0.h[0] + sqdmulh v17.8h, v7.8h, v0.h[2] + sqdmulh v18.8h, v8.8h, v0.h[2] + sshr v17.8h, v17.8h, #11 + sshr v18.8h, v18.8h, #11 + mls v7.8h, v17.8h, v0.h[0] + mls v8.8h, v18.8h, v0.h[0] + st4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x0], #0x40 + st4 {v5.8h, v6.8h, v7.8h, v8.8h}, [x0], #0x40 + ldp d8, d9, [x29, #16] + ldp d10, d11, [x29, #32] + ldp d12, d13, [x29, #48] + ldp d14, d15, [x29, #64] + ldp x29, x30, [sp], #0x50 + ret +#ifndef __APPLE__ + .size kyber_rsub_reduce,.-kyber_rsub_reduce +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl kyber_to_mont +.type kyber_to_mont,@function +.align 2 +kyber_to_mont: +#else +.section __TEXT,__text +.globl _kyber_to_mont +.p2align 2 +_kyber_to_mont: +#endif /* __APPLE__ */ + stp x29, x30, [sp, #-80]! 
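The three routines above, kyber_add_reduce, kyber_add3_reduce and kyber_rsub_reduce, add two polynomials, add three, or compute b - a respectively, then Barrett-reduce every coefficient. The sqdmulh / sshr #11 / mls pattern multiplies by what appears to be the Barrett constant in lane 2 of L_kyber_aarch64_consts (the reference value is 20159, roughly 2^26/q) and subtracts the resulting multiple of q. A scalar sketch of the add-and-reduce case in the reference formulation (names are illustrative; the vector code reaches the same congruence class without the explicit rounding term):

    #include <stdint.h>

    #define KYBER_Q 3329
    #define KYBER_V 20159   /* Barrett constant: ((1 << 26) + KYBER_Q / 2) / KYBER_Q */

    /* Barrett reduction: returns a small representative congruent to a mod q. */
    static int16_t barrett_reduce(int16_t a)
    {
        int16_t t = (int16_t)(((int32_t)a * KYBER_V + (1 << 25)) >> 26);
        return (int16_t)(a - t * KYBER_Q);
    }

    /* What kyber_add_reduce does, one coefficient at a time; the add3/rsub
     * variants change only how the sum is formed before the reduction. */
    static void poly_add_reduce(int16_t r[256], const int16_t a[256])
    {
        for (int i = 0; i < 256; i++)
            r[i] = barrett_reduce((int16_t)(r[i] + a[i]));
    }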
+ add x29, sp, #0 + stp d8, d9, [x29, #16] + stp d10, d11, [x29, #32] + stp d12, d13, [x29, #48] + stp d14, d15, [x29, #64] +#ifndef __APPLE__ + adrp x1, L_kyber_aarch64_consts + add x1, x1, :lo12:L_kyber_aarch64_consts +#else + adrp x1, L_kyber_aarch64_consts@PAGE + add x1, x1, :lo12:L_kyber_aarch64_consts@PAGEOFF +#endif /* __APPLE__ */ + ldr q0, [x1] + ld4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x0], #0x40 + ld4 {v5.8h, v6.8h, v7.8h, v8.8h}, [x0], #0x40 + ld4 {v9.8h, v10.8h, v11.8h, v12.8h}, [x0], #0x40 + ld4 {v13.8h, v14.8h, v15.8h, v16.8h}, [x0], #0x40 + sub x0, x0, #0x100 + mul v17.8h, v1.8h, v0.h[4] + mul v18.8h, v2.8h, v0.h[4] + sqrdmulh v1.8h, v1.8h, v0.h[3] + sqrdmulh v2.8h, v2.8h, v0.h[3] + sqrdmlsh v1.8h, v17.8h, v0.h[0] + sqrdmlsh v2.8h, v18.8h, v0.h[0] + sshr v1.8h, v1.8h, #1 + sshr v2.8h, v2.8h, #1 + mul v17.8h, v3.8h, v0.h[4] + mul v18.8h, v4.8h, v0.h[4] + sqrdmulh v3.8h, v3.8h, v0.h[3] + sqrdmulh v4.8h, v4.8h, v0.h[3] + sqrdmlsh v3.8h, v17.8h, v0.h[0] + sqrdmlsh v4.8h, v18.8h, v0.h[0] + sshr v3.8h, v3.8h, #1 + sshr v4.8h, v4.8h, #1 + mul v17.8h, v5.8h, v0.h[4] + mul v18.8h, v6.8h, v0.h[4] + sqrdmulh v5.8h, v5.8h, v0.h[3] + sqrdmulh v6.8h, v6.8h, v0.h[3] + sqrdmlsh v5.8h, v17.8h, v0.h[0] + sqrdmlsh v6.8h, v18.8h, v0.h[0] + sshr v5.8h, v5.8h, #1 + sshr v6.8h, v6.8h, #1 + mul v17.8h, v7.8h, v0.h[4] + mul v18.8h, v8.8h, v0.h[4] + sqrdmulh v7.8h, v7.8h, v0.h[3] + sqrdmulh v8.8h, v8.8h, v0.h[3] + sqrdmlsh v7.8h, v17.8h, v0.h[0] + sqrdmlsh v8.8h, v18.8h, v0.h[0] + sshr v7.8h, v7.8h, #1 + sshr v8.8h, v8.8h, #1 + mul v17.8h, v9.8h, v0.h[4] + mul v18.8h, v10.8h, v0.h[4] + sqrdmulh v9.8h, v9.8h, v0.h[3] + sqrdmulh v10.8h, v10.8h, v0.h[3] + sqrdmlsh v9.8h, v17.8h, v0.h[0] + sqrdmlsh v10.8h, v18.8h, v0.h[0] + sshr v9.8h, v9.8h, #1 + sshr v10.8h, v10.8h, #1 + mul v17.8h, v11.8h, v0.h[4] + mul v18.8h, v12.8h, v0.h[4] + sqrdmulh v11.8h, v11.8h, v0.h[3] + sqrdmulh v12.8h, v12.8h, v0.h[3] + sqrdmlsh v11.8h, v17.8h, v0.h[0] + sqrdmlsh v12.8h, v18.8h, v0.h[0] + sshr v11.8h, v11.8h, #1 + sshr v12.8h, v12.8h, #1 + mul v17.8h, v13.8h, v0.h[4] + mul v18.8h, v14.8h, v0.h[4] + sqrdmulh v13.8h, v13.8h, v0.h[3] + sqrdmulh v14.8h, v14.8h, v0.h[3] + sqrdmlsh v13.8h, v17.8h, v0.h[0] + sqrdmlsh v14.8h, v18.8h, v0.h[0] + sshr v13.8h, v13.8h, #1 + sshr v14.8h, v14.8h, #1 + mul v17.8h, v15.8h, v0.h[4] + mul v18.8h, v16.8h, v0.h[4] + sqrdmulh v15.8h, v15.8h, v0.h[3] + sqrdmulh v16.8h, v16.8h, v0.h[3] + sqrdmlsh v15.8h, v17.8h, v0.h[0] + sqrdmlsh v16.8h, v18.8h, v0.h[0] + sshr v15.8h, v15.8h, #1 + sshr v16.8h, v16.8h, #1 + st4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x0], #0x40 + st4 {v5.8h, v6.8h, v7.8h, v8.8h}, [x0], #0x40 + st4 {v9.8h, v10.8h, v11.8h, v12.8h}, [x0], #0x40 + st4 {v13.8h, v14.8h, v15.8h, v16.8h}, [x0], #0x40 + ld4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x0], #0x40 + ld4 {v5.8h, v6.8h, v7.8h, v8.8h}, [x0], #0x40 + ld4 {v9.8h, v10.8h, v11.8h, v12.8h}, [x0], #0x40 + ld4 {v13.8h, v14.8h, v15.8h, v16.8h}, [x0], #0x40 + sub x0, x0, #0x100 + mul v17.8h, v1.8h, v0.h[4] + mul v18.8h, v2.8h, v0.h[4] + sqrdmulh v1.8h, v1.8h, v0.h[3] + sqrdmulh v2.8h, v2.8h, v0.h[3] + sqrdmlsh v1.8h, v17.8h, v0.h[0] + sqrdmlsh v2.8h, v18.8h, v0.h[0] + sshr v1.8h, v1.8h, #1 + sshr v2.8h, v2.8h, #1 + mul v17.8h, v3.8h, v0.h[4] + mul v18.8h, v4.8h, v0.h[4] + sqrdmulh v3.8h, v3.8h, v0.h[3] + sqrdmulh v4.8h, v4.8h, v0.h[3] + sqrdmlsh v3.8h, v17.8h, v0.h[0] + sqrdmlsh v4.8h, v18.8h, v0.h[0] + sshr v3.8h, v3.8h, #1 + sshr v4.8h, v4.8h, #1 + mul v17.8h, v5.8h, v0.h[4] + mul v18.8h, v6.8h, v0.h[4] + sqrdmulh v5.8h, v5.8h, v0.h[3] + sqrdmulh v6.8h, v6.8h, 
v0.h[3] + sqrdmlsh v5.8h, v17.8h, v0.h[0] + sqrdmlsh v6.8h, v18.8h, v0.h[0] + sshr v5.8h, v5.8h, #1 + sshr v6.8h, v6.8h, #1 + mul v17.8h, v7.8h, v0.h[4] + mul v18.8h, v8.8h, v0.h[4] + sqrdmulh v7.8h, v7.8h, v0.h[3] + sqrdmulh v8.8h, v8.8h, v0.h[3] + sqrdmlsh v7.8h, v17.8h, v0.h[0] + sqrdmlsh v8.8h, v18.8h, v0.h[0] + sshr v7.8h, v7.8h, #1 + sshr v8.8h, v8.8h, #1 + mul v17.8h, v9.8h, v0.h[4] + mul v18.8h, v10.8h, v0.h[4] + sqrdmulh v9.8h, v9.8h, v0.h[3] + sqrdmulh v10.8h, v10.8h, v0.h[3] + sqrdmlsh v9.8h, v17.8h, v0.h[0] + sqrdmlsh v10.8h, v18.8h, v0.h[0] + sshr v9.8h, v9.8h, #1 + sshr v10.8h, v10.8h, #1 + mul v17.8h, v11.8h, v0.h[4] + mul v18.8h, v12.8h, v0.h[4] + sqrdmulh v11.8h, v11.8h, v0.h[3] + sqrdmulh v12.8h, v12.8h, v0.h[3] + sqrdmlsh v11.8h, v17.8h, v0.h[0] + sqrdmlsh v12.8h, v18.8h, v0.h[0] + sshr v11.8h, v11.8h, #1 + sshr v12.8h, v12.8h, #1 + mul v17.8h, v13.8h, v0.h[4] + mul v18.8h, v14.8h, v0.h[4] + sqrdmulh v13.8h, v13.8h, v0.h[3] + sqrdmulh v14.8h, v14.8h, v0.h[3] + sqrdmlsh v13.8h, v17.8h, v0.h[0] + sqrdmlsh v14.8h, v18.8h, v0.h[0] + sshr v13.8h, v13.8h, #1 + sshr v14.8h, v14.8h, #1 + mul v17.8h, v15.8h, v0.h[4] + mul v18.8h, v16.8h, v0.h[4] + sqrdmulh v15.8h, v15.8h, v0.h[3] + sqrdmulh v16.8h, v16.8h, v0.h[3] + sqrdmlsh v15.8h, v17.8h, v0.h[0] + sqrdmlsh v16.8h, v18.8h, v0.h[0] + sshr v15.8h, v15.8h, #1 + sshr v16.8h, v16.8h, #1 + st4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x0], #0x40 + st4 {v5.8h, v6.8h, v7.8h, v8.8h}, [x0], #0x40 + st4 {v9.8h, v10.8h, v11.8h, v12.8h}, [x0], #0x40 + st4 {v13.8h, v14.8h, v15.8h, v16.8h}, [x0], #0x40 + ldp d8, d9, [x29, #16] + ldp d10, d11, [x29, #32] + ldp d12, d13, [x29, #48] + ldp d14, d15, [x29, #64] + ldp x29, x30, [sp], #0x50 + ret +#ifndef __APPLE__ + .size kyber_to_mont,.-kyber_to_mont +#endif /* __APPLE__ */ +#ifndef __APPLE__ + .text + .type L_kyber_aarch64_to_msg_neon_low, %object + .section .rodata + .size L_kyber_aarch64_to_msg_neon_low, 16 +#else + .section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ + .align 2 +#else + .p2align 2 +#endif /* __APPLE__ */ +L_kyber_aarch64_to_msg_neon_low: + .short 0x373 + .short 0x373 + .short 0x373 + .short 0x373 + .short 0x373 + .short 0x373 + .short 0x373 + .short 0x373 +#ifndef __APPLE__ + .text + .type L_kyber_aarch64_to_msg_neon_high, %object + .section .rodata + .size L_kyber_aarch64_to_msg_neon_high, 16 +#else + .section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ + .align 2 +#else + .p2align 2 +#endif /* __APPLE__ */ +L_kyber_aarch64_to_msg_neon_high: + .short 0x9c0 + .short 0x9c0 + .short 0x9c0 + .short 0x9c0 + .short 0x9c0 + .short 0x9c0 + .short 0x9c0 + .short 0x9c0 +#ifndef __APPLE__ + .text + .type L_kyber_aarch64_to_msg_neon_bits, %object + .section .rodata + .size L_kyber_aarch64_to_msg_neon_bits, 16 +#else + .section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ + .align 2 +#else + .p2align 2 +#endif /* __APPLE__ */ +L_kyber_aarch64_to_msg_neon_bits: + .short 0x1 + .short 0x2 + .short 0x4 + .short 0x8 + .short 0x10 + .short 0x20 + .short 0x40 + .short 0x80 +#ifndef __APPLE__ +.text +.globl kyber_to_msg_neon +.type kyber_to_msg_neon,@function +.align 2 +kyber_to_msg_neon: +#else +.section __TEXT,__text +.globl _kyber_to_msg_neon +.p2align 2 +_kyber_to_msg_neon: +#endif /* __APPLE__ */ + stp x29, x30, [sp, #-80]! 
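kyber_to_mont, just above, scales every coefficient into the Montgomery domain using the mul / sqrdmulh / sqrdmlsh / sshr #1 constant-multiply idiom, with the constant pair apparently taken from lanes 3 and 4 of L_kyber_aarch64_consts. In reference terms this is a Montgomery multiplication by f = 2^32 mod q = 1353, which leaves each coefficient multiplied by 2^16 mod q; a minimal scalar sketch (illustrative names):

    #include <stdint.h>

    #define KYBER_Q 3329

    /* Same signed Montgomery reduction as in the basemul sketch above. */
    static int16_t montgomery_reduce(int32_t a)
    {
        int16_t t = (int16_t)a * -3327;                      /* -3327 == q^-1 mod 2^16 */
        return (int16_t)((a - (int32_t)t * KYBER_Q) >> 16);
    }

    /* Reference view of kyber_to_mont: multiply by f = 2^32 mod q = 1353 and reduce,
     * which scales every coefficient by 2^16 mod q (the Montgomery domain). */
    static void poly_tomont(int16_t r[256])
    {
        for (int i = 0; i < 256; i++)
            r[i] = montgomery_reduce((int32_t)r[i] * 1353);
    }

The kyber_to_msg_neon routine that follows performs Kyber's 1-bit compression: the _low/_high constants bound the band of coefficient values that round to 1, the cmge/and steps turn that band test into a per-coefficient mask, and the bit table plus addv collapse eight masks into one message byte. Its inverse, kyber_from_msg_neon further below, expands each message bit to 0 or (q+1)/2 = 1665 (the 0x681 constant) with cmtst/and.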
+ add x29, sp, #0 + stp d8, d9, [x29, #16] + stp d10, d11, [x29, #32] + stp d12, d13, [x29, #48] + stp d14, d15, [x29, #64] +#ifndef __APPLE__ + adrp x2, L_kyber_aarch64_to_msg_neon_low + add x2, x2, :lo12:L_kyber_aarch64_to_msg_neon_low +#else + adrp x2, L_kyber_aarch64_to_msg_neon_low@PAGE + add x2, x2, :lo12:L_kyber_aarch64_to_msg_neon_low@PAGEOFF +#endif /* __APPLE__ */ +#ifndef __APPLE__ + adrp x3, L_kyber_aarch64_to_msg_neon_high + add x3, x3, :lo12:L_kyber_aarch64_to_msg_neon_high +#else + adrp x3, L_kyber_aarch64_to_msg_neon_high@PAGE + add x3, x3, :lo12:L_kyber_aarch64_to_msg_neon_high@PAGEOFF +#endif /* __APPLE__ */ +#ifndef __APPLE__ + adrp x4, L_kyber_aarch64_to_msg_neon_bits + add x4, x4, :lo12:L_kyber_aarch64_to_msg_neon_bits +#else + adrp x4, L_kyber_aarch64_to_msg_neon_bits@PAGE + add x4, x4, :lo12:L_kyber_aarch64_to_msg_neon_bits@PAGEOFF +#endif /* __APPLE__ */ + ldr q0, [x2] + ldr q1, [x3] + ldr q26, [x4] + ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [x1], #0x40 + ld1 {v6.8h, v7.8h, v8.8h, v9.8h}, [x1], #0x40 + cmge v10.8h, v2.8h, v0.8h + cmge v18.8h, v1.8h, v2.8h + cmge v11.8h, v3.8h, v0.8h + cmge v19.8h, v1.8h, v3.8h + cmge v12.8h, v4.8h, v0.8h + cmge v20.8h, v1.8h, v4.8h + cmge v13.8h, v5.8h, v0.8h + cmge v21.8h, v1.8h, v5.8h + cmge v14.8h, v6.8h, v0.8h + cmge v22.8h, v1.8h, v6.8h + cmge v15.8h, v7.8h, v0.8h + cmge v23.8h, v1.8h, v7.8h + cmge v16.8h, v8.8h, v0.8h + cmge v24.8h, v1.8h, v8.8h + cmge v17.8h, v9.8h, v0.8h + cmge v25.8h, v1.8h, v9.8h + and v18.16b, v18.16b, v10.16b + and v19.16b, v19.16b, v11.16b + and v20.16b, v20.16b, v12.16b + and v21.16b, v21.16b, v13.16b + and v22.16b, v22.16b, v14.16b + and v23.16b, v23.16b, v15.16b + and v24.16b, v24.16b, v16.16b + and v25.16b, v25.16b, v17.16b + and v18.16b, v18.16b, v26.16b + and v19.16b, v19.16b, v26.16b + and v20.16b, v20.16b, v26.16b + and v21.16b, v21.16b, v26.16b + and v22.16b, v22.16b, v26.16b + and v23.16b, v23.16b, v26.16b + and v24.16b, v24.16b, v26.16b + and v25.16b, v25.16b, v26.16b + addv h18, v18.8h + addv h19, v19.8h + addv h20, v20.8h + addv h21, v21.8h + addv h22, v22.8h + addv h23, v23.8h + addv h24, v24.8h + addv h25, v25.8h + ins v18.b[1], v19.b[0] + ins v18.b[2], v20.b[0] + ins v18.b[3], v21.b[0] + ins v18.b[4], v22.b[0] + ins v18.b[5], v23.b[0] + ins v18.b[6], v24.b[0] + ins v18.b[7], v25.b[0] + st1 {v18.8b}, [x0], #8 + ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [x1], #0x40 + ld1 {v6.8h, v7.8h, v8.8h, v9.8h}, [x1], #0x40 + cmge v10.8h, v2.8h, v0.8h + cmge v18.8h, v1.8h, v2.8h + cmge v11.8h, v3.8h, v0.8h + cmge v19.8h, v1.8h, v3.8h + cmge v12.8h, v4.8h, v0.8h + cmge v20.8h, v1.8h, v4.8h + cmge v13.8h, v5.8h, v0.8h + cmge v21.8h, v1.8h, v5.8h + cmge v14.8h, v6.8h, v0.8h + cmge v22.8h, v1.8h, v6.8h + cmge v15.8h, v7.8h, v0.8h + cmge v23.8h, v1.8h, v7.8h + cmge v16.8h, v8.8h, v0.8h + cmge v24.8h, v1.8h, v8.8h + cmge v17.8h, v9.8h, v0.8h + cmge v25.8h, v1.8h, v9.8h + and v18.16b, v18.16b, v10.16b + and v19.16b, v19.16b, v11.16b + and v20.16b, v20.16b, v12.16b + and v21.16b, v21.16b, v13.16b + and v22.16b, v22.16b, v14.16b + and v23.16b, v23.16b, v15.16b + and v24.16b, v24.16b, v16.16b + and v25.16b, v25.16b, v17.16b + and v18.16b, v18.16b, v26.16b + and v19.16b, v19.16b, v26.16b + and v20.16b, v20.16b, v26.16b + and v21.16b, v21.16b, v26.16b + and v22.16b, v22.16b, v26.16b + and v23.16b, v23.16b, v26.16b + and v24.16b, v24.16b, v26.16b + and v25.16b, v25.16b, v26.16b + addv h18, v18.8h + addv h19, v19.8h + addv h20, v20.8h + addv h21, v21.8h + addv h22, v22.8h + addv h23, v23.8h + addv h24, v24.8h + addv h25, v25.8h + 
ins v18.b[1], v19.b[0] + ins v18.b[2], v20.b[0] + ins v18.b[3], v21.b[0] + ins v18.b[4], v22.b[0] + ins v18.b[5], v23.b[0] + ins v18.b[6], v24.b[0] + ins v18.b[7], v25.b[0] + st1 {v18.8b}, [x0], #8 + ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [x1], #0x40 + ld1 {v6.8h, v7.8h, v8.8h, v9.8h}, [x1], #0x40 + cmge v10.8h, v2.8h, v0.8h + cmge v18.8h, v1.8h, v2.8h + cmge v11.8h, v3.8h, v0.8h + cmge v19.8h, v1.8h, v3.8h + cmge v12.8h, v4.8h, v0.8h + cmge v20.8h, v1.8h, v4.8h + cmge v13.8h, v5.8h, v0.8h + cmge v21.8h, v1.8h, v5.8h + cmge v14.8h, v6.8h, v0.8h + cmge v22.8h, v1.8h, v6.8h + cmge v15.8h, v7.8h, v0.8h + cmge v23.8h, v1.8h, v7.8h + cmge v16.8h, v8.8h, v0.8h + cmge v24.8h, v1.8h, v8.8h + cmge v17.8h, v9.8h, v0.8h + cmge v25.8h, v1.8h, v9.8h + and v18.16b, v18.16b, v10.16b + and v19.16b, v19.16b, v11.16b + and v20.16b, v20.16b, v12.16b + and v21.16b, v21.16b, v13.16b + and v22.16b, v22.16b, v14.16b + and v23.16b, v23.16b, v15.16b + and v24.16b, v24.16b, v16.16b + and v25.16b, v25.16b, v17.16b + and v18.16b, v18.16b, v26.16b + and v19.16b, v19.16b, v26.16b + and v20.16b, v20.16b, v26.16b + and v21.16b, v21.16b, v26.16b + and v22.16b, v22.16b, v26.16b + and v23.16b, v23.16b, v26.16b + and v24.16b, v24.16b, v26.16b + and v25.16b, v25.16b, v26.16b + addv h18, v18.8h + addv h19, v19.8h + addv h20, v20.8h + addv h21, v21.8h + addv h22, v22.8h + addv h23, v23.8h + addv h24, v24.8h + addv h25, v25.8h + ins v18.b[1], v19.b[0] + ins v18.b[2], v20.b[0] + ins v18.b[3], v21.b[0] + ins v18.b[4], v22.b[0] + ins v18.b[5], v23.b[0] + ins v18.b[6], v24.b[0] + ins v18.b[7], v25.b[0] + st1 {v18.8b}, [x0], #8 + ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [x1], #0x40 + ld1 {v6.8h, v7.8h, v8.8h, v9.8h}, [x1], #0x40 + cmge v10.8h, v2.8h, v0.8h + cmge v18.8h, v1.8h, v2.8h + cmge v11.8h, v3.8h, v0.8h + cmge v19.8h, v1.8h, v3.8h + cmge v12.8h, v4.8h, v0.8h + cmge v20.8h, v1.8h, v4.8h + cmge v13.8h, v5.8h, v0.8h + cmge v21.8h, v1.8h, v5.8h + cmge v14.8h, v6.8h, v0.8h + cmge v22.8h, v1.8h, v6.8h + cmge v15.8h, v7.8h, v0.8h + cmge v23.8h, v1.8h, v7.8h + cmge v16.8h, v8.8h, v0.8h + cmge v24.8h, v1.8h, v8.8h + cmge v17.8h, v9.8h, v0.8h + cmge v25.8h, v1.8h, v9.8h + and v18.16b, v18.16b, v10.16b + and v19.16b, v19.16b, v11.16b + and v20.16b, v20.16b, v12.16b + and v21.16b, v21.16b, v13.16b + and v22.16b, v22.16b, v14.16b + and v23.16b, v23.16b, v15.16b + and v24.16b, v24.16b, v16.16b + and v25.16b, v25.16b, v17.16b + and v18.16b, v18.16b, v26.16b + and v19.16b, v19.16b, v26.16b + and v20.16b, v20.16b, v26.16b + and v21.16b, v21.16b, v26.16b + and v22.16b, v22.16b, v26.16b + and v23.16b, v23.16b, v26.16b + and v24.16b, v24.16b, v26.16b + and v25.16b, v25.16b, v26.16b + addv h18, v18.8h + addv h19, v19.8h + addv h20, v20.8h + addv h21, v21.8h + addv h22, v22.8h + addv h23, v23.8h + addv h24, v24.8h + addv h25, v25.8h + ins v18.b[1], v19.b[0] + ins v18.b[2], v20.b[0] + ins v18.b[3], v21.b[0] + ins v18.b[4], v22.b[0] + ins v18.b[5], v23.b[0] + ins v18.b[6], v24.b[0] + ins v18.b[7], v25.b[0] + st1 {v18.8b}, [x0], #8 + ldp d8, d9, [x29, #16] + ldp d10, d11, [x29, #32] + ldp d12, d13, [x29, #48] + ldp d14, d15, [x29, #64] + ldp x29, x30, [sp], #0x50 + ret +#ifndef __APPLE__ + .size kyber_to_msg_neon,.-kyber_to_msg_neon +#endif /* __APPLE__ */ +#ifndef __APPLE__ + .text + .type L_kyber_aarch64_from_msg_neon_q1half, %object + .section .rodata + .size L_kyber_aarch64_from_msg_neon_q1half, 16 +#else + .section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ + .align 2 +#else + .p2align 2 +#endif /* __APPLE__ */ 
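+ # 0x681 = 1665 = (KYBER_Q + 1) / 2: the coefficient value a set message bit expands to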
+L_kyber_aarch64_from_msg_neon_q1half: + .short 0x681 + .short 0x681 + .short 0x681 + .short 0x681 + .short 0x681 + .short 0x681 + .short 0x681 + .short 0x681 +#ifndef __APPLE__ + .text + .type L_kyber_aarch64_from_msg_neon_bits, %object + .section .rodata + .size L_kyber_aarch64_from_msg_neon_bits, 16 +#else + .section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ + .align 1 +#else + .p2align 1 +#endif /* __APPLE__ */ +L_kyber_aarch64_from_msg_neon_bits: + .byte 0x1 + .byte 0x2 + .byte 0x4 + .byte 0x8 + .byte 0x10 + .byte 0x20 + .byte 0x40 + .byte 0x80 + .byte 0x1 + .byte 0x2 + .byte 0x4 + .byte 0x8 + .byte 0x10 + .byte 0x20 + .byte 0x40 + .byte 0x80 +#ifndef __APPLE__ +.text +.globl kyber_from_msg_neon +.type kyber_from_msg_neon,@function +.align 2 +kyber_from_msg_neon: +#else +.section __TEXT,__text +.globl _kyber_from_msg_neon +.p2align 2 +_kyber_from_msg_neon: +#endif /* __APPLE__ */ + stp x29, x30, [sp, #-48]! + add x29, sp, #0 + stp d8, d9, [x29, #16] + stp d10, d11, [x29, #32] +#ifndef __APPLE__ + adrp x2, L_kyber_aarch64_from_msg_neon_q1half + add x2, x2, :lo12:L_kyber_aarch64_from_msg_neon_q1half +#else + adrp x2, L_kyber_aarch64_from_msg_neon_q1half@PAGE + add x2, x2, :lo12:L_kyber_aarch64_from_msg_neon_q1half@PAGEOFF +#endif /* __APPLE__ */ +#ifndef __APPLE__ + adrp x3, L_kyber_aarch64_from_msg_neon_bits + add x3, x3, :lo12:L_kyber_aarch64_from_msg_neon_bits +#else + adrp x3, L_kyber_aarch64_from_msg_neon_bits@PAGE + add x3, x3, :lo12:L_kyber_aarch64_from_msg_neon_bits@PAGEOFF +#endif /* __APPLE__ */ + ld1 {v2.16b, v3.16b}, [x1] + ldr q1, [x2] + ldr q0, [x3] + dup v4.8b, v2.b[0] + dup v5.8b, v2.b[1] + dup v6.8b, v2.b[2] + dup v7.8b, v2.b[3] + cmtst v4.8b, v4.8b, v0.8b + cmtst v5.8b, v5.8b, v0.8b + cmtst v6.8b, v6.8b, v0.8b + cmtst v7.8b, v7.8b, v0.8b + zip1 v4.16b, v4.16b, v4.16b + zip1 v5.16b, v5.16b, v5.16b + zip1 v6.16b, v6.16b, v6.16b + zip1 v7.16b, v7.16b, v7.16b + and v4.16b, v4.16b, v1.16b + and v5.16b, v5.16b, v1.16b + and v6.16b, v6.16b, v1.16b + and v7.16b, v7.16b, v1.16b + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #0x40 + dup v4.8b, v2.b[4] + dup v5.8b, v2.b[5] + dup v6.8b, v2.b[6] + dup v7.8b, v2.b[7] + cmtst v4.8b, v4.8b, v0.8b + cmtst v5.8b, v5.8b, v0.8b + cmtst v6.8b, v6.8b, v0.8b + cmtst v7.8b, v7.8b, v0.8b + zip1 v4.16b, v4.16b, v4.16b + zip1 v5.16b, v5.16b, v5.16b + zip1 v6.16b, v6.16b, v6.16b + zip1 v7.16b, v7.16b, v7.16b + and v4.16b, v4.16b, v1.16b + and v5.16b, v5.16b, v1.16b + and v6.16b, v6.16b, v1.16b + and v7.16b, v7.16b, v1.16b + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #0x40 + dup v4.8b, v2.b[8] + dup v5.8b, v2.b[9] + dup v6.8b, v2.b[10] + dup v7.8b, v2.b[11] + cmtst v4.8b, v4.8b, v0.8b + cmtst v5.8b, v5.8b, v0.8b + cmtst v6.8b, v6.8b, v0.8b + cmtst v7.8b, v7.8b, v0.8b + zip1 v4.16b, v4.16b, v4.16b + zip1 v5.16b, v5.16b, v5.16b + zip1 v6.16b, v6.16b, v6.16b + zip1 v7.16b, v7.16b, v7.16b + and v4.16b, v4.16b, v1.16b + and v5.16b, v5.16b, v1.16b + and v6.16b, v6.16b, v1.16b + and v7.16b, v7.16b, v1.16b + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #0x40 + dup v4.8b, v2.b[12] + dup v5.8b, v2.b[13] + dup v6.8b, v2.b[14] + dup v7.8b, v2.b[15] + cmtst v4.8b, v4.8b, v0.8b + cmtst v5.8b, v5.8b, v0.8b + cmtst v6.8b, v6.8b, v0.8b + cmtst v7.8b, v7.8b, v0.8b + zip1 v4.16b, v4.16b, v4.16b + zip1 v5.16b, v5.16b, v5.16b + zip1 v6.16b, v6.16b, v6.16b + zip1 v7.16b, v7.16b, v7.16b + and v4.16b, v4.16b, v1.16b + and v5.16b, v5.16b, v1.16b + and v6.16b, v6.16b, v1.16b + and v7.16b, v7.16b, v1.16b + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #0x40 + dup v4.8b, v3.b[0] 
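+ # remaining message bytes come from v3: broadcast each byte, cmtst against the bit table, zip1 to 16-bit lanes, mask with (q + 1) / 2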
+ dup v5.8b, v3.b[1] + dup v6.8b, v3.b[2] + dup v7.8b, v3.b[3] + cmtst v4.8b, v4.8b, v0.8b + cmtst v5.8b, v5.8b, v0.8b + cmtst v6.8b, v6.8b, v0.8b + cmtst v7.8b, v7.8b, v0.8b + zip1 v4.16b, v4.16b, v4.16b + zip1 v5.16b, v5.16b, v5.16b + zip1 v6.16b, v6.16b, v6.16b + zip1 v7.16b, v7.16b, v7.16b + and v4.16b, v4.16b, v1.16b + and v5.16b, v5.16b, v1.16b + and v6.16b, v6.16b, v1.16b + and v7.16b, v7.16b, v1.16b + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #0x40 + dup v4.8b, v3.b[4] + dup v5.8b, v3.b[5] + dup v6.8b, v3.b[6] + dup v7.8b, v3.b[7] + cmtst v4.8b, v4.8b, v0.8b + cmtst v5.8b, v5.8b, v0.8b + cmtst v6.8b, v6.8b, v0.8b + cmtst v7.8b, v7.8b, v0.8b + zip1 v4.16b, v4.16b, v4.16b + zip1 v5.16b, v5.16b, v5.16b + zip1 v6.16b, v6.16b, v6.16b + zip1 v7.16b, v7.16b, v7.16b + and v4.16b, v4.16b, v1.16b + and v5.16b, v5.16b, v1.16b + and v6.16b, v6.16b, v1.16b + and v7.16b, v7.16b, v1.16b + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #0x40 + dup v4.8b, v3.b[8] + dup v5.8b, v3.b[9] + dup v6.8b, v3.b[10] + dup v7.8b, v3.b[11] + cmtst v4.8b, v4.8b, v0.8b + cmtst v5.8b, v5.8b, v0.8b + cmtst v6.8b, v6.8b, v0.8b + cmtst v7.8b, v7.8b, v0.8b + zip1 v4.16b, v4.16b, v4.16b + zip1 v5.16b, v5.16b, v5.16b + zip1 v6.16b, v6.16b, v6.16b + zip1 v7.16b, v7.16b, v7.16b + and v4.16b, v4.16b, v1.16b + and v5.16b, v5.16b, v1.16b + and v6.16b, v6.16b, v1.16b + and v7.16b, v7.16b, v1.16b + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #0x40 + dup v4.8b, v3.b[12] + dup v5.8b, v3.b[13] + dup v6.8b, v3.b[14] + dup v7.8b, v3.b[15] + cmtst v4.8b, v4.8b, v0.8b + cmtst v5.8b, v5.8b, v0.8b + cmtst v6.8b, v6.8b, v0.8b + cmtst v7.8b, v7.8b, v0.8b + zip1 v4.16b, v4.16b, v4.16b + zip1 v5.16b, v5.16b, v5.16b + zip1 v6.16b, v6.16b, v6.16b + zip1 v7.16b, v7.16b, v7.16b + and v4.16b, v4.16b, v1.16b + and v5.16b, v5.16b, v1.16b + and v6.16b, v6.16b, v1.16b + and v7.16b, v7.16b, v1.16b + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #0x40 + ldp d8, d9, [x29, #16] + ldp d10, d11, [x29, #32] + ldp x29, x30, [sp], #48 + ret +#ifndef __APPLE__ + .size kyber_from_msg_neon,.-kyber_from_msg_neon +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl kyber_cmp_neon +.type kyber_cmp_neon,@function +.align 2 +kyber_cmp_neon: +#else +.section __TEXT,__text +.globl _kyber_cmp_neon +.p2align 2 +_kyber_cmp_neon: +#endif /* __APPLE__ */ + stp x29, x30, [sp, #-48]! 
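+ # kyber_cmp_neon: XOR 64-byte blocks of the two buffers and OR the differences into v8-v11; returns 0 if no difference was accumulated, -1 otherwise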
+ add x29, sp, #0 + stp d8, d9, [x29, #16] + stp d10, d11, [x29, #32] + ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #0x40 + ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [x1], #0x40 + eor v8.16b, v0.16b, v4.16b + eor v9.16b, v1.16b, v5.16b + eor v10.16b, v2.16b, v6.16b + eor v11.16b, v3.16b, v7.16b + ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #0x40 + ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [x1], #0x40 + eor v0.16b, v0.16b, v4.16b + eor v1.16b, v1.16b, v5.16b + eor v2.16b, v2.16b, v6.16b + eor v3.16b, v3.16b, v7.16b + orr v8.16b, v8.16b, v0.16b + orr v9.16b, v9.16b, v1.16b + orr v10.16b, v10.16b, v2.16b + orr v11.16b, v11.16b, v3.16b + ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #0x40 + ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [x1], #0x40 + eor v0.16b, v0.16b, v4.16b + eor v1.16b, v1.16b, v5.16b + eor v2.16b, v2.16b, v6.16b + eor v3.16b, v3.16b, v7.16b + orr v8.16b, v8.16b, v0.16b + orr v9.16b, v9.16b, v1.16b + orr v10.16b, v10.16b, v2.16b + orr v11.16b, v11.16b, v3.16b + ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #0x40 + ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [x1], #0x40 + eor v0.16b, v0.16b, v4.16b + eor v1.16b, v1.16b, v5.16b + eor v2.16b, v2.16b, v6.16b + eor v3.16b, v3.16b, v7.16b + orr v8.16b, v8.16b, v0.16b + orr v9.16b, v9.16b, v1.16b + orr v10.16b, v10.16b, v2.16b + orr v11.16b, v11.16b, v3.16b + ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #0x40 + ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [x1], #0x40 + eor v0.16b, v0.16b, v4.16b + eor v1.16b, v1.16b, v5.16b + eor v2.16b, v2.16b, v6.16b + eor v3.16b, v3.16b, v7.16b + orr v8.16b, v8.16b, v0.16b + orr v9.16b, v9.16b, v1.16b + orr v10.16b, v10.16b, v2.16b + orr v11.16b, v11.16b, v3.16b + ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #0x40 + ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [x1], #0x40 + eor v0.16b, v0.16b, v4.16b + eor v1.16b, v1.16b, v5.16b + eor v2.16b, v2.16b, v6.16b + eor v3.16b, v3.16b, v7.16b + orr v8.16b, v8.16b, v0.16b + orr v9.16b, v9.16b, v1.16b + orr v10.16b, v10.16b, v2.16b + orr v11.16b, v11.16b, v3.16b + ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #0x40 + ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [x1], #0x40 + eor v0.16b, v0.16b, v4.16b + eor v1.16b, v1.16b, v5.16b + eor v2.16b, v2.16b, v6.16b + eor v3.16b, v3.16b, v7.16b + orr v8.16b, v8.16b, v0.16b + orr v9.16b, v9.16b, v1.16b + orr v10.16b, v10.16b, v2.16b + orr v11.16b, v11.16b, v3.16b + ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #0x40 + ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [x1], #0x40 + eor v0.16b, v0.16b, v4.16b + eor v1.16b, v1.16b, v5.16b + eor v2.16b, v2.16b, v6.16b + eor v3.16b, v3.16b, v7.16b + orr v8.16b, v8.16b, v0.16b + orr v9.16b, v9.16b, v1.16b + orr v10.16b, v10.16b, v2.16b + orr v11.16b, v11.16b, v3.16b + ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #0x40 + ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [x1], #0x40 + eor v0.16b, v0.16b, v4.16b + eor v1.16b, v1.16b, v5.16b + eor v2.16b, v2.16b, v6.16b + eor v3.16b, v3.16b, v7.16b + orr v8.16b, v8.16b, v0.16b + orr v9.16b, v9.16b, v1.16b + orr v10.16b, v10.16b, v2.16b + orr v11.16b, v11.16b, v3.16b + ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #0x40 + ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [x1], #0x40 + eor v0.16b, v0.16b, v4.16b + eor v1.16b, v1.16b, v5.16b + eor v2.16b, v2.16b, v6.16b + eor v3.16b, v3.16b, v7.16b + orr v8.16b, v8.16b, v0.16b + orr v9.16b, v9.16b, v1.16b + orr v10.16b, v10.16b, v2.16b + orr v11.16b, v11.16b, v3.16b + ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #0x40 + ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [x1], #0x40 + eor v0.16b, v0.16b, v4.16b + eor v1.16b, v1.16b, v5.16b + eor v2.16b, v2.16b, v6.16b + eor 
v3.16b, v3.16b, v7.16b + orr v8.16b, v8.16b, v0.16b + orr v9.16b, v9.16b, v1.16b + orr v10.16b, v10.16b, v2.16b + orr v11.16b, v11.16b, v3.16b + ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #0x40 + ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [x1], #0x40 + eor v0.16b, v0.16b, v4.16b + eor v1.16b, v1.16b, v5.16b + eor v2.16b, v2.16b, v6.16b + eor v3.16b, v3.16b, v7.16b + orr v8.16b, v8.16b, v0.16b + orr v9.16b, v9.16b, v1.16b + orr v10.16b, v10.16b, v2.16b + orr v11.16b, v11.16b, v3.16b + subs w2, w2, #0x300 + beq L_kyber_aarch64_cmp_neon_done + ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #0x40 + ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [x1], #0x40 + eor v0.16b, v0.16b, v4.16b + eor v1.16b, v1.16b, v5.16b + eor v2.16b, v2.16b, v6.16b + eor v3.16b, v3.16b, v7.16b + orr v8.16b, v8.16b, v0.16b + orr v9.16b, v9.16b, v1.16b + orr v10.16b, v10.16b, v2.16b + orr v11.16b, v11.16b, v3.16b + ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #0x40 + ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [x1], #0x40 + eor v0.16b, v0.16b, v4.16b + eor v1.16b, v1.16b, v5.16b + eor v2.16b, v2.16b, v6.16b + eor v3.16b, v3.16b, v7.16b + orr v8.16b, v8.16b, v0.16b + orr v9.16b, v9.16b, v1.16b + orr v10.16b, v10.16b, v2.16b + orr v11.16b, v11.16b, v3.16b + ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #0x40 + ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [x1], #0x40 + eor v0.16b, v0.16b, v4.16b + eor v1.16b, v1.16b, v5.16b + eor v2.16b, v2.16b, v6.16b + eor v3.16b, v3.16b, v7.16b + orr v8.16b, v8.16b, v0.16b + orr v9.16b, v9.16b, v1.16b + orr v10.16b, v10.16b, v2.16b + orr v11.16b, v11.16b, v3.16b + ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #0x40 + ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [x1], #0x40 + eor v0.16b, v0.16b, v4.16b + eor v1.16b, v1.16b, v5.16b + eor v2.16b, v2.16b, v6.16b + eor v3.16b, v3.16b, v7.16b + orr v8.16b, v8.16b, v0.16b + orr v9.16b, v9.16b, v1.16b + orr v10.16b, v10.16b, v2.16b + orr v11.16b, v11.16b, v3.16b + ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #0x40 + ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [x1], #0x40 + eor v0.16b, v0.16b, v4.16b + eor v1.16b, v1.16b, v5.16b + eor v2.16b, v2.16b, v6.16b + eor v3.16b, v3.16b, v7.16b + orr v8.16b, v8.16b, v0.16b + orr v9.16b, v9.16b, v1.16b + orr v10.16b, v10.16b, v2.16b + orr v11.16b, v11.16b, v3.16b + subs w2, w2, #0x140 + beq L_kyber_aarch64_cmp_neon_done + ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #0x40 + ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [x1], #0x40 + eor v0.16b, v0.16b, v4.16b + eor v1.16b, v1.16b, v5.16b + eor v2.16b, v2.16b, v6.16b + eor v3.16b, v3.16b, v7.16b + orr v8.16b, v8.16b, v0.16b + orr v9.16b, v9.16b, v1.16b + orr v10.16b, v10.16b, v2.16b + orr v11.16b, v11.16b, v3.16b + ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #0x40 + ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [x1], #0x40 + eor v0.16b, v0.16b, v4.16b + eor v1.16b, v1.16b, v5.16b + eor v2.16b, v2.16b, v6.16b + eor v3.16b, v3.16b, v7.16b + orr v8.16b, v8.16b, v0.16b + orr v9.16b, v9.16b, v1.16b + orr v10.16b, v10.16b, v2.16b + orr v11.16b, v11.16b, v3.16b + ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #0x40 + ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [x1], #0x40 + eor v0.16b, v0.16b, v4.16b + eor v1.16b, v1.16b, v5.16b + eor v2.16b, v2.16b, v6.16b + eor v3.16b, v3.16b, v7.16b + orr v8.16b, v8.16b, v0.16b + orr v9.16b, v9.16b, v1.16b + orr v10.16b, v10.16b, v2.16b + orr v11.16b, v11.16b, v3.16b + ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #0x40 + ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [x1], #0x40 + eor v0.16b, v0.16b, v4.16b + eor v1.16b, v1.16b, v5.16b + eor v2.16b, v2.16b, v6.16b + eor v3.16b, v3.16b, v7.16b + orr v8.16b, 
v8.16b, v0.16b + orr v9.16b, v9.16b, v1.16b + orr v10.16b, v10.16b, v2.16b + orr v11.16b, v11.16b, v3.16b + ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #0x40 + ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [x1], #0x40 + eor v0.16b, v0.16b, v4.16b + eor v1.16b, v1.16b, v5.16b + eor v2.16b, v2.16b, v6.16b + eor v3.16b, v3.16b, v7.16b + orr v8.16b, v8.16b, v0.16b + orr v9.16b, v9.16b, v1.16b + orr v10.16b, v10.16b, v2.16b + orr v11.16b, v11.16b, v3.16b + ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #0x40 + ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [x1], #0x40 + eor v0.16b, v0.16b, v4.16b + eor v1.16b, v1.16b, v5.16b + eor v2.16b, v2.16b, v6.16b + eor v3.16b, v3.16b, v7.16b + orr v8.16b, v8.16b, v0.16b + orr v9.16b, v9.16b, v1.16b + orr v10.16b, v10.16b, v2.16b + orr v11.16b, v11.16b, v3.16b + ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #0x40 + ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [x1], #0x40 + eor v0.16b, v0.16b, v4.16b + eor v1.16b, v1.16b, v5.16b + eor v2.16b, v2.16b, v6.16b + eor v3.16b, v3.16b, v7.16b + orr v8.16b, v8.16b, v0.16b + orr v9.16b, v9.16b, v1.16b + orr v10.16b, v10.16b, v2.16b + orr v11.16b, v11.16b, v3.16b + ld2 {v0.16b, v1.16b}, [x0] + ld2 {v4.16b, v5.16b}, [x1] + eor v0.16b, v0.16b, v4.16b + eor v1.16b, v1.16b, v5.16b + orr v8.16b, v8.16b, v0.16b + orr v9.16b, v9.16b, v1.16b +L_kyber_aarch64_cmp_neon_done: + orr v8.16b, v8.16b, v9.16b + orr v10.16b, v10.16b, v11.16b + orr v8.16b, v8.16b, v10.16b + ins v9.b[0], v8.b[1] + orr v8.16b, v8.16b, v9.16b + mov x0, v8.d[0] + subs x0, x0, xzr + csetm w0, ne + ldp d8, d9, [x29, #16] + ldp d10, d11, [x29, #32] + ldp x29, x30, [sp], #48 + ret +#ifndef __APPLE__ + .size kyber_cmp_neon,.-kyber_cmp_neon +#endif /* __APPLE__ */ +#ifndef __APPLE__ + .text + .type L_kyber_aarch64_rej_uniform_neon_mask, %object + .section .rodata + .size L_kyber_aarch64_rej_uniform_neon_mask, 16 +#else + .section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ + .align 2 +#else + .p2align 2 +#endif /* __APPLE__ */ +L_kyber_aarch64_rej_uniform_neon_mask: + .short 0xfff + .short 0xfff + .short 0xfff + .short 0xfff + .short 0xfff + .short 0xfff + .short 0xfff + .short 0xfff +#ifndef __APPLE__ + .text + .type L_kyber_aarch64_rej_uniform_neon_bits, %object + .section .rodata + .size L_kyber_aarch64_rej_uniform_neon_bits, 16 +#else + .section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ + .align 2 +#else + .p2align 2 +#endif /* __APPLE__ */ +L_kyber_aarch64_rej_uniform_neon_bits: + .short 0x1 + .short 0x2 + .short 0x4 + .short 0x8 + .short 0x10 + .short 0x20 + .short 0x40 + .short 0x80 +#ifndef __APPLE__ + .text + .type L_kyber_aarch64_rej_uniform_neon_indeces, %object + .section .rodata + .size L_kyber_aarch64_rej_uniform_neon_indeces, 4096 +#else + .section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ + .align 1 +#else + .p2align 1 +#endif /* __APPLE__ */ +L_kyber_aarch64_rej_uniform_neon_indeces: + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x2 + .byte 0x3 + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0x2 + 
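# each 16-byte row of this 256-entry table lists, for one 8-bit selection mask, the byte offsets of the accepted 16-bit lanes, padded with 0xff +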
.byte 0x3 + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x4 + .byte 0x5 + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0x4 + .byte 0x5 + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x2 + .byte 0x3 + .byte 0x4 + .byte 0x5 + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0x2 + .byte 0x3 + .byte 0x4 + .byte 0x5 + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x6 + .byte 0x7 + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0x6 + .byte 0x7 + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x2 + .byte 0x3 + .byte 0x6 + .byte 0x7 + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0x2 + .byte 0x3 + .byte 0x6 + .byte 0x7 + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x4 + .byte 0x5 + .byte 0x6 + .byte 0x7 + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0x4 + .byte 0x5 + .byte 0x6 + .byte 0x7 + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x2 + .byte 0x3 + .byte 0x4 + .byte 0x5 + .byte 0x6 + .byte 0x7 + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0x2 + .byte 0x3 + .byte 0x4 + .byte 0x5 + .byte 0x6 + .byte 0x7 + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x8 + .byte 0x9 + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0x8 + .byte 0x9 + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x2 + .byte 0x3 + .byte 0x8 + .byte 0x9 + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0x2 + .byte 0x3 + .byte 0x8 + .byte 0x9 + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x4 + .byte 0x5 + .byte 0x8 + .byte 0x9 + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + 
.byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0x4 + .byte 0x5 + .byte 0x8 + .byte 0x9 + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x2 + .byte 0x3 + .byte 0x4 + .byte 0x5 + .byte 0x8 + .byte 0x9 + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0x2 + .byte 0x3 + .byte 0x4 + .byte 0x5 + .byte 0x8 + .byte 0x9 + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x6 + .byte 0x7 + .byte 0x8 + .byte 0x9 + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0x6 + .byte 0x7 + .byte 0x8 + .byte 0x9 + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x2 + .byte 0x3 + .byte 0x6 + .byte 0x7 + .byte 0x8 + .byte 0x9 + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0x2 + .byte 0x3 + .byte 0x6 + .byte 0x7 + .byte 0x8 + .byte 0x9 + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x4 + .byte 0x5 + .byte 0x6 + .byte 0x7 + .byte 0x8 + .byte 0x9 + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0x4 + .byte 0x5 + .byte 0x6 + .byte 0x7 + .byte 0x8 + .byte 0x9 + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x2 + .byte 0x3 + .byte 0x4 + .byte 0x5 + .byte 0x6 + .byte 0x7 + .byte 0x8 + .byte 0x9 + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0x2 + .byte 0x3 + .byte 0x4 + .byte 0x5 + .byte 0x6 + .byte 0x7 + .byte 0x8 + .byte 0x9 + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xa + .byte 0xb + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0xa + .byte 0xb + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x2 + .byte 0x3 + .byte 0xa + .byte 0xb + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0x2 + .byte 0x3 + .byte 0xa + .byte 0xb + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x4 + .byte 0x5 + .byte 0xa + .byte 0xb + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0x4 + .byte 0x5 + .byte 0xa + .byte 0xb + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x2 + .byte 0x3 + .byte 0x4 + .byte 
0x5 + .byte 0xa + .byte 0xb + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0x2 + .byte 0x3 + .byte 0x4 + .byte 0x5 + .byte 0xa + .byte 0xb + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x6 + .byte 0x7 + .byte 0xa + .byte 0xb + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0x6 + .byte 0x7 + .byte 0xa + .byte 0xb + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x2 + .byte 0x3 + .byte 0x6 + .byte 0x7 + .byte 0xa + .byte 0xb + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0x2 + .byte 0x3 + .byte 0x6 + .byte 0x7 + .byte 0xa + .byte 0xb + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x4 + .byte 0x5 + .byte 0x6 + .byte 0x7 + .byte 0xa + .byte 0xb + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0x4 + .byte 0x5 + .byte 0x6 + .byte 0x7 + .byte 0xa + .byte 0xb + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x2 + .byte 0x3 + .byte 0x4 + .byte 0x5 + .byte 0x6 + .byte 0x7 + .byte 0xa + .byte 0xb + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0x2 + .byte 0x3 + .byte 0x4 + .byte 0x5 + .byte 0x6 + .byte 0x7 + .byte 0xa + .byte 0xb + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x8 + .byte 0x9 + .byte 0xa + .byte 0xb + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0x8 + .byte 0x9 + .byte 0xa + .byte 0xb + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x2 + .byte 0x3 + .byte 0x8 + .byte 0x9 + .byte 0xa + .byte 0xb + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0x2 + .byte 0x3 + .byte 0x8 + .byte 0x9 + .byte 0xa + .byte 0xb + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x4 + .byte 0x5 + .byte 0x8 + .byte 0x9 + .byte 0xa + .byte 0xb + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0x4 + .byte 0x5 + .byte 0x8 + .byte 0x9 + .byte 0xa + .byte 0xb + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x2 + .byte 0x3 + .byte 0x4 + .byte 0x5 + .byte 0x8 + .byte 0x9 + .byte 0xa + .byte 0xb + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0x2 + .byte 0x3 + .byte 0x4 + .byte 0x5 + .byte 0x8 + .byte 0x9 + .byte 0xa + .byte 0xb + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + 
.byte 0xff + .byte 0xff + .byte 0x6 + .byte 0x7 + .byte 0x8 + .byte 0x9 + .byte 0xa + .byte 0xb + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0x6 + .byte 0x7 + .byte 0x8 + .byte 0x9 + .byte 0xa + .byte 0xb + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x2 + .byte 0x3 + .byte 0x6 + .byte 0x7 + .byte 0x8 + .byte 0x9 + .byte 0xa + .byte 0xb + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0x2 + .byte 0x3 + .byte 0x6 + .byte 0x7 + .byte 0x8 + .byte 0x9 + .byte 0xa + .byte 0xb + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x4 + .byte 0x5 + .byte 0x6 + .byte 0x7 + .byte 0x8 + .byte 0x9 + .byte 0xa + .byte 0xb + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0x4 + .byte 0x5 + .byte 0x6 + .byte 0x7 + .byte 0x8 + .byte 0x9 + .byte 0xa + .byte 0xb + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x2 + .byte 0x3 + .byte 0x4 + .byte 0x5 + .byte 0x6 + .byte 0x7 + .byte 0x8 + .byte 0x9 + .byte 0xa + .byte 0xb + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0x2 + .byte 0x3 + .byte 0x4 + .byte 0x5 + .byte 0x6 + .byte 0x7 + .byte 0x8 + .byte 0x9 + .byte 0xa + .byte 0xb + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xc + .byte 0xd + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0xc + .byte 0xd + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x2 + .byte 0x3 + .byte 0xc + .byte 0xd + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0x2 + .byte 0x3 + .byte 0xc + .byte 0xd + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x4 + .byte 0x5 + .byte 0xc + .byte 0xd + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0x4 + .byte 0x5 + .byte 0xc + .byte 0xd + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x2 + .byte 0x3 + .byte 0x4 + .byte 0x5 + .byte 0xc + .byte 0xd + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0x2 + .byte 0x3 + .byte 0x4 + .byte 0x5 + .byte 0xc + .byte 0xd + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x6 + .byte 0x7 + .byte 0xc + .byte 0xd + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0x6 + .byte 0x7 + .byte 0xc + .byte 0xd + .byte 0xff + .byte 0xff + .byte 0xff 
+ .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x2 + .byte 0x3 + .byte 0x6 + .byte 0x7 + .byte 0xc + .byte 0xd + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0x2 + .byte 0x3 + .byte 0x6 + .byte 0x7 + .byte 0xc + .byte 0xd + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x4 + .byte 0x5 + .byte 0x6 + .byte 0x7 + .byte 0xc + .byte 0xd + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0x4 + .byte 0x5 + .byte 0x6 + .byte 0x7 + .byte 0xc + .byte 0xd + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x2 + .byte 0x3 + .byte 0x4 + .byte 0x5 + .byte 0x6 + .byte 0x7 + .byte 0xc + .byte 0xd + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0x2 + .byte 0x3 + .byte 0x4 + .byte 0x5 + .byte 0x6 + .byte 0x7 + .byte 0xc + .byte 0xd + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x8 + .byte 0x9 + .byte 0xc + .byte 0xd + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0x8 + .byte 0x9 + .byte 0xc + .byte 0xd + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x2 + .byte 0x3 + .byte 0x8 + .byte 0x9 + .byte 0xc + .byte 0xd + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0x2 + .byte 0x3 + .byte 0x8 + .byte 0x9 + .byte 0xc + .byte 0xd + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x4 + .byte 0x5 + .byte 0x8 + .byte 0x9 + .byte 0xc + .byte 0xd + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0x4 + .byte 0x5 + .byte 0x8 + .byte 0x9 + .byte 0xc + .byte 0xd + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x2 + .byte 0x3 + .byte 0x4 + .byte 0x5 + .byte 0x8 + .byte 0x9 + .byte 0xc + .byte 0xd + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0x2 + .byte 0x3 + .byte 0x4 + .byte 0x5 + .byte 0x8 + .byte 0x9 + .byte 0xc + .byte 0xd + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x6 + .byte 0x7 + .byte 0x8 + .byte 0x9 + .byte 0xc + .byte 0xd + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0x6 + .byte 0x7 + .byte 0x8 + .byte 0x9 + .byte 0xc + .byte 0xd + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x2 + .byte 0x3 + .byte 0x6 + .byte 0x7 + .byte 0x8 + .byte 0x9 + .byte 0xc + .byte 0xd + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0x2 + .byte 0x3 + 
.byte 0x6 + .byte 0x7 + .byte 0x8 + .byte 0x9 + .byte 0xc + .byte 0xd + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x4 + .byte 0x5 + .byte 0x6 + .byte 0x7 + .byte 0x8 + .byte 0x9 + .byte 0xc + .byte 0xd + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0x4 + .byte 0x5 + .byte 0x6 + .byte 0x7 + .byte 0x8 + .byte 0x9 + .byte 0xc + .byte 0xd + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x2 + .byte 0x3 + .byte 0x4 + .byte 0x5 + .byte 0x6 + .byte 0x7 + .byte 0x8 + .byte 0x9 + .byte 0xc + .byte 0xd + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0x2 + .byte 0x3 + .byte 0x4 + .byte 0x5 + .byte 0x6 + .byte 0x7 + .byte 0x8 + .byte 0x9 + .byte 0xc + .byte 0xd + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xa + .byte 0xb + .byte 0xc + .byte 0xd + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0xa + .byte 0xb + .byte 0xc + .byte 0xd + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x2 + .byte 0x3 + .byte 0xa + .byte 0xb + .byte 0xc + .byte 0xd + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0x2 + .byte 0x3 + .byte 0xa + .byte 0xb + .byte 0xc + .byte 0xd + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x4 + .byte 0x5 + .byte 0xa + .byte 0xb + .byte 0xc + .byte 0xd + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0x4 + .byte 0x5 + .byte 0xa + .byte 0xb + .byte 0xc + .byte 0xd + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x2 + .byte 0x3 + .byte 0x4 + .byte 0x5 + .byte 0xa + .byte 0xb + .byte 0xc + .byte 0xd + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0x2 + .byte 0x3 + .byte 0x4 + .byte 0x5 + .byte 0xa + .byte 0xb + .byte 0xc + .byte 0xd + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x6 + .byte 0x7 + .byte 0xa + .byte 0xb + .byte 0xc + .byte 0xd + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0x6 + .byte 0x7 + .byte 0xa + .byte 0xb + .byte 0xc + .byte 0xd + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x2 + .byte 0x3 + .byte 0x6 + .byte 0x7 + .byte 0xa + .byte 0xb + .byte 0xc + .byte 0xd + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0x2 + .byte 0x3 + .byte 0x6 + .byte 0x7 + .byte 0xa + .byte 0xb + .byte 0xc + .byte 0xd + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x4 + .byte 0x5 + .byte 0x6 + .byte 0x7 + .byte 0xa + .byte 0xb + .byte 0xc + .byte 0xd + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + 
.byte 0x0 + .byte 0x1 + .byte 0x4 + .byte 0x5 + .byte 0x6 + .byte 0x7 + .byte 0xa + .byte 0xb + .byte 0xc + .byte 0xd + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x2 + .byte 0x3 + .byte 0x4 + .byte 0x5 + .byte 0x6 + .byte 0x7 + .byte 0xa + .byte 0xb + .byte 0xc + .byte 0xd + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0x2 + .byte 0x3 + .byte 0x4 + .byte 0x5 + .byte 0x6 + .byte 0x7 + .byte 0xa + .byte 0xb + .byte 0xc + .byte 0xd + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x8 + .byte 0x9 + .byte 0xa + .byte 0xb + .byte 0xc + .byte 0xd + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0x8 + .byte 0x9 + .byte 0xa + .byte 0xb + .byte 0xc + .byte 0xd + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x2 + .byte 0x3 + .byte 0x8 + .byte 0x9 + .byte 0xa + .byte 0xb + .byte 0xc + .byte 0xd + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0x2 + .byte 0x3 + .byte 0x8 + .byte 0x9 + .byte 0xa + .byte 0xb + .byte 0xc + .byte 0xd + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x4 + .byte 0x5 + .byte 0x8 + .byte 0x9 + .byte 0xa + .byte 0xb + .byte 0xc + .byte 0xd + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0x4 + .byte 0x5 + .byte 0x8 + .byte 0x9 + .byte 0xa + .byte 0xb + .byte 0xc + .byte 0xd + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x2 + .byte 0x3 + .byte 0x4 + .byte 0x5 + .byte 0x8 + .byte 0x9 + .byte 0xa + .byte 0xb + .byte 0xc + .byte 0xd + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0x2 + .byte 0x3 + .byte 0x4 + .byte 0x5 + .byte 0x8 + .byte 0x9 + .byte 0xa + .byte 0xb + .byte 0xc + .byte 0xd + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x6 + .byte 0x7 + .byte 0x8 + .byte 0x9 + .byte 0xa + .byte 0xb + .byte 0xc + .byte 0xd + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0x6 + .byte 0x7 + .byte 0x8 + .byte 0x9 + .byte 0xa + .byte 0xb + .byte 0xc + .byte 0xd + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x2 + .byte 0x3 + .byte 0x6 + .byte 0x7 + .byte 0x8 + .byte 0x9 + .byte 0xa + .byte 0xb + .byte 0xc + .byte 0xd + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0x2 + .byte 0x3 + .byte 0x6 + .byte 0x7 + .byte 0x8 + .byte 0x9 + .byte 0xa + .byte 0xb + .byte 0xc + .byte 0xd + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x4 + .byte 0x5 + .byte 0x6 + .byte 0x7 + .byte 0x8 + .byte 0x9 + .byte 0xa + .byte 0xb + .byte 0xc + .byte 0xd + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0x4 + .byte 0x5 + .byte 0x6 + .byte 0x7 + .byte 0x8 + .byte 0x9 + .byte 0xa + .byte 0xb + .byte 0xc + .byte 0xd + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x2 + .byte 0x3 + .byte 0x4 + .byte 0x5 + .byte 0x6 + .byte 0x7 + .byte 0x8 + .byte 0x9 + .byte 0xa + .byte 0xb + .byte 0xc + .byte 0xd + .byte 0xff + .byte 0xff + .byte 0xff + 
.byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0x2 + .byte 0x3 + .byte 0x4 + .byte 0x5 + .byte 0x6 + .byte 0x7 + .byte 0x8 + .byte 0x9 + .byte 0xa + .byte 0xb + .byte 0xc + .byte 0xd + .byte 0xff + .byte 0xff + .byte 0xe + .byte 0xf + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0xe + .byte 0xf + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x2 + .byte 0x3 + .byte 0xe + .byte 0xf + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0x2 + .byte 0x3 + .byte 0xe + .byte 0xf + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x4 + .byte 0x5 + .byte 0xe + .byte 0xf + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0x4 + .byte 0x5 + .byte 0xe + .byte 0xf + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x2 + .byte 0x3 + .byte 0x4 + .byte 0x5 + .byte 0xe + .byte 0xf + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0x2 + .byte 0x3 + .byte 0x4 + .byte 0x5 + .byte 0xe + .byte 0xf + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x6 + .byte 0x7 + .byte 0xe + .byte 0xf + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0x6 + .byte 0x7 + .byte 0xe + .byte 0xf + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x2 + .byte 0x3 + .byte 0x6 + .byte 0x7 + .byte 0xe + .byte 0xf + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0x2 + .byte 0x3 + .byte 0x6 + .byte 0x7 + .byte 0xe + .byte 0xf + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x4 + .byte 0x5 + .byte 0x6 + .byte 0x7 + .byte 0xe + .byte 0xf + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0x4 + .byte 0x5 + .byte 0x6 + .byte 0x7 + .byte 0xe + .byte 0xf + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x2 + .byte 0x3 + .byte 0x4 + .byte 0x5 + .byte 0x6 + .byte 0x7 + .byte 0xe + .byte 0xf + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0x2 + .byte 0x3 + .byte 0x4 + .byte 0x5 + .byte 0x6 + .byte 0x7 + .byte 0xe + .byte 0xf + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x8 + .byte 0x9 + .byte 0xe + .byte 0xf + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + 
.byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0x8 + .byte 0x9 + .byte 0xe + .byte 0xf + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x2 + .byte 0x3 + .byte 0x8 + .byte 0x9 + .byte 0xe + .byte 0xf + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0x2 + .byte 0x3 + .byte 0x8 + .byte 0x9 + .byte 0xe + .byte 0xf + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x4 + .byte 0x5 + .byte 0x8 + .byte 0x9 + .byte 0xe + .byte 0xf + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0x4 + .byte 0x5 + .byte 0x8 + .byte 0x9 + .byte 0xe + .byte 0xf + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x2 + .byte 0x3 + .byte 0x4 + .byte 0x5 + .byte 0x8 + .byte 0x9 + .byte 0xe + .byte 0xf + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0x2 + .byte 0x3 + .byte 0x4 + .byte 0x5 + .byte 0x8 + .byte 0x9 + .byte 0xe + .byte 0xf + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x6 + .byte 0x7 + .byte 0x8 + .byte 0x9 + .byte 0xe + .byte 0xf + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0x6 + .byte 0x7 + .byte 0x8 + .byte 0x9 + .byte 0xe + .byte 0xf + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x2 + .byte 0x3 + .byte 0x6 + .byte 0x7 + .byte 0x8 + .byte 0x9 + .byte 0xe + .byte 0xf + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0x2 + .byte 0x3 + .byte 0x6 + .byte 0x7 + .byte 0x8 + .byte 0x9 + .byte 0xe + .byte 0xf + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x4 + .byte 0x5 + .byte 0x6 + .byte 0x7 + .byte 0x8 + .byte 0x9 + .byte 0xe + .byte 0xf + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0x4 + .byte 0x5 + .byte 0x6 + .byte 0x7 + .byte 0x8 + .byte 0x9 + .byte 0xe + .byte 0xf + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x2 + .byte 0x3 + .byte 0x4 + .byte 0x5 + .byte 0x6 + .byte 0x7 + .byte 0x8 + .byte 0x9 + .byte 0xe + .byte 0xf + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0x2 + .byte 0x3 + .byte 0x4 + .byte 0x5 + .byte 0x6 + .byte 0x7 + .byte 0x8 + .byte 0x9 + .byte 0xe + .byte 0xf + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xa + .byte 0xb + .byte 0xe + .byte 0xf + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0xa + .byte 0xb + .byte 0xe + .byte 0xf + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x2 + .byte 0x3 + .byte 0xa + .byte 0xb + .byte 0xe + 
.byte 0xf + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0x2 + .byte 0x3 + .byte 0xa + .byte 0xb + .byte 0xe + .byte 0xf + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x4 + .byte 0x5 + .byte 0xa + .byte 0xb + .byte 0xe + .byte 0xf + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0x4 + .byte 0x5 + .byte 0xa + .byte 0xb + .byte 0xe + .byte 0xf + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x2 + .byte 0x3 + .byte 0x4 + .byte 0x5 + .byte 0xa + .byte 0xb + .byte 0xe + .byte 0xf + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0x2 + .byte 0x3 + .byte 0x4 + .byte 0x5 + .byte 0xa + .byte 0xb + .byte 0xe + .byte 0xf + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x6 + .byte 0x7 + .byte 0xa + .byte 0xb + .byte 0xe + .byte 0xf + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0x6 + .byte 0x7 + .byte 0xa + .byte 0xb + .byte 0xe + .byte 0xf + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x2 + .byte 0x3 + .byte 0x6 + .byte 0x7 + .byte 0xa + .byte 0xb + .byte 0xe + .byte 0xf + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0x2 + .byte 0x3 + .byte 0x6 + .byte 0x7 + .byte 0xa + .byte 0xb + .byte 0xe + .byte 0xf + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x4 + .byte 0x5 + .byte 0x6 + .byte 0x7 + .byte 0xa + .byte 0xb + .byte 0xe + .byte 0xf + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0x4 + .byte 0x5 + .byte 0x6 + .byte 0x7 + .byte 0xa + .byte 0xb + .byte 0xe + .byte 0xf + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x2 + .byte 0x3 + .byte 0x4 + .byte 0x5 + .byte 0x6 + .byte 0x7 + .byte 0xa + .byte 0xb + .byte 0xe + .byte 0xf + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0x2 + .byte 0x3 + .byte 0x4 + .byte 0x5 + .byte 0x6 + .byte 0x7 + .byte 0xa + .byte 0xb + .byte 0xe + .byte 0xf + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x8 + .byte 0x9 + .byte 0xa + .byte 0xb + .byte 0xe + .byte 0xf + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0x8 + .byte 0x9 + .byte 0xa + .byte 0xb + .byte 0xe + .byte 0xf + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x2 + .byte 0x3 + .byte 0x8 + .byte 0x9 + .byte 0xa + .byte 0xb + .byte 0xe + .byte 0xf + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0x2 + .byte 0x3 + .byte 0x8 + .byte 0x9 + .byte 0xa + .byte 0xb + .byte 0xe + .byte 0xf + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x4 + .byte 
0x5 + .byte 0x8 + .byte 0x9 + .byte 0xa + .byte 0xb + .byte 0xe + .byte 0xf + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0x4 + .byte 0x5 + .byte 0x8 + .byte 0x9 + .byte 0xa + .byte 0xb + .byte 0xe + .byte 0xf + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x2 + .byte 0x3 + .byte 0x4 + .byte 0x5 + .byte 0x8 + .byte 0x9 + .byte 0xa + .byte 0xb + .byte 0xe + .byte 0xf + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0x2 + .byte 0x3 + .byte 0x4 + .byte 0x5 + .byte 0x8 + .byte 0x9 + .byte 0xa + .byte 0xb + .byte 0xe + .byte 0xf + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x6 + .byte 0x7 + .byte 0x8 + .byte 0x9 + .byte 0xa + .byte 0xb + .byte 0xe + .byte 0xf + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0x6 + .byte 0x7 + .byte 0x8 + .byte 0x9 + .byte 0xa + .byte 0xb + .byte 0xe + .byte 0xf + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x2 + .byte 0x3 + .byte 0x6 + .byte 0x7 + .byte 0x8 + .byte 0x9 + .byte 0xa + .byte 0xb + .byte 0xe + .byte 0xf + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0x2 + .byte 0x3 + .byte 0x6 + .byte 0x7 + .byte 0x8 + .byte 0x9 + .byte 0xa + .byte 0xb + .byte 0xe + .byte 0xf + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x4 + .byte 0x5 + .byte 0x6 + .byte 0x7 + .byte 0x8 + .byte 0x9 + .byte 0xa + .byte 0xb + .byte 0xe + .byte 0xf + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0x4 + .byte 0x5 + .byte 0x6 + .byte 0x7 + .byte 0x8 + .byte 0x9 + .byte 0xa + .byte 0xb + .byte 0xe + .byte 0xf + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x2 + .byte 0x3 + .byte 0x4 + .byte 0x5 + .byte 0x6 + .byte 0x7 + .byte 0x8 + .byte 0x9 + .byte 0xa + .byte 0xb + .byte 0xe + .byte 0xf + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0x2 + .byte 0x3 + .byte 0x4 + .byte 0x5 + .byte 0x6 + .byte 0x7 + .byte 0x8 + .byte 0x9 + .byte 0xa + .byte 0xb + .byte 0xe + .byte 0xf + .byte 0xff + .byte 0xff + .byte 0xc + .byte 0xd + .byte 0xe + .byte 0xf + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0xc + .byte 0xd + .byte 0xe + .byte 0xf + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x2 + .byte 0x3 + .byte 0xc + .byte 0xd + .byte 0xe + .byte 0xf + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0x2 + .byte 0x3 + .byte 0xc + .byte 0xd + .byte 0xe + .byte 0xf + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x4 + .byte 0x5 + .byte 0xc + .byte 0xd + .byte 0xe + .byte 0xf + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0x4 + .byte 0x5 + .byte 0xc + .byte 0xd + .byte 0xe + .byte 0xf + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 
0xff + .byte 0x2 + .byte 0x3 + .byte 0x4 + .byte 0x5 + .byte 0xc + .byte 0xd + .byte 0xe + .byte 0xf + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0x2 + .byte 0x3 + .byte 0x4 + .byte 0x5 + .byte 0xc + .byte 0xd + .byte 0xe + .byte 0xf + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x6 + .byte 0x7 + .byte 0xc + .byte 0xd + .byte 0xe + .byte 0xf + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0x6 + .byte 0x7 + .byte 0xc + .byte 0xd + .byte 0xe + .byte 0xf + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x2 + .byte 0x3 + .byte 0x6 + .byte 0x7 + .byte 0xc + .byte 0xd + .byte 0xe + .byte 0xf + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0x2 + .byte 0x3 + .byte 0x6 + .byte 0x7 + .byte 0xc + .byte 0xd + .byte 0xe + .byte 0xf + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x4 + .byte 0x5 + .byte 0x6 + .byte 0x7 + .byte 0xc + .byte 0xd + .byte 0xe + .byte 0xf + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0x4 + .byte 0x5 + .byte 0x6 + .byte 0x7 + .byte 0xc + .byte 0xd + .byte 0xe + .byte 0xf + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x2 + .byte 0x3 + .byte 0x4 + .byte 0x5 + .byte 0x6 + .byte 0x7 + .byte 0xc + .byte 0xd + .byte 0xe + .byte 0xf + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0x2 + .byte 0x3 + .byte 0x4 + .byte 0x5 + .byte 0x6 + .byte 0x7 + .byte 0xc + .byte 0xd + .byte 0xe + .byte 0xf + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x8 + .byte 0x9 + .byte 0xc + .byte 0xd + .byte 0xe + .byte 0xf + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0x8 + .byte 0x9 + .byte 0xc + .byte 0xd + .byte 0xe + .byte 0xf + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x2 + .byte 0x3 + .byte 0x8 + .byte 0x9 + .byte 0xc + .byte 0xd + .byte 0xe + .byte 0xf + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0x2 + .byte 0x3 + .byte 0x8 + .byte 0x9 + .byte 0xc + .byte 0xd + .byte 0xe + .byte 0xf + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x4 + .byte 0x5 + .byte 0x8 + .byte 0x9 + .byte 0xc + .byte 0xd + .byte 0xe + .byte 0xf + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0x4 + .byte 0x5 + .byte 0x8 + .byte 0x9 + .byte 0xc + .byte 0xd + .byte 0xe + .byte 0xf + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x2 + .byte 0x3 + .byte 0x4 + .byte 0x5 + .byte 0x8 + .byte 0x9 + .byte 0xc + .byte 0xd + .byte 0xe + .byte 0xf + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0x2 + .byte 0x3 + .byte 0x4 + .byte 0x5 + .byte 0x8 + .byte 0x9 + .byte 0xc + .byte 0xd + .byte 0xe + .byte 0xf + .byte 0xff + 
.byte 0xff + .byte 0xff + .byte 0xff + .byte 0x6 + .byte 0x7 + .byte 0x8 + .byte 0x9 + .byte 0xc + .byte 0xd + .byte 0xe + .byte 0xf + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0x6 + .byte 0x7 + .byte 0x8 + .byte 0x9 + .byte 0xc + .byte 0xd + .byte 0xe + .byte 0xf + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x2 + .byte 0x3 + .byte 0x6 + .byte 0x7 + .byte 0x8 + .byte 0x9 + .byte 0xc + .byte 0xd + .byte 0xe + .byte 0xf + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0x2 + .byte 0x3 + .byte 0x6 + .byte 0x7 + .byte 0x8 + .byte 0x9 + .byte 0xc + .byte 0xd + .byte 0xe + .byte 0xf + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x4 + .byte 0x5 + .byte 0x6 + .byte 0x7 + .byte 0x8 + .byte 0x9 + .byte 0xc + .byte 0xd + .byte 0xe + .byte 0xf + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0x4 + .byte 0x5 + .byte 0x6 + .byte 0x7 + .byte 0x8 + .byte 0x9 + .byte 0xc + .byte 0xd + .byte 0xe + .byte 0xf + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x2 + .byte 0x3 + .byte 0x4 + .byte 0x5 + .byte 0x6 + .byte 0x7 + .byte 0x8 + .byte 0x9 + .byte 0xc + .byte 0xd + .byte 0xe + .byte 0xf + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0x2 + .byte 0x3 + .byte 0x4 + .byte 0x5 + .byte 0x6 + .byte 0x7 + .byte 0x8 + .byte 0x9 + .byte 0xc + .byte 0xd + .byte 0xe + .byte 0xf + .byte 0xff + .byte 0xff + .byte 0xa + .byte 0xb + .byte 0xc + .byte 0xd + .byte 0xe + .byte 0xf + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0xa + .byte 0xb + .byte 0xc + .byte 0xd + .byte 0xe + .byte 0xf + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x2 + .byte 0x3 + .byte 0xa + .byte 0xb + .byte 0xc + .byte 0xd + .byte 0xe + .byte 0xf + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0x2 + .byte 0x3 + .byte 0xa + .byte 0xb + .byte 0xc + .byte 0xd + .byte 0xe + .byte 0xf + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x4 + .byte 0x5 + .byte 0xa + .byte 0xb + .byte 0xc + .byte 0xd + .byte 0xe + .byte 0xf + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0x4 + .byte 0x5 + .byte 0xa + .byte 0xb + .byte 0xc + .byte 0xd + .byte 0xe + .byte 0xf + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x2 + .byte 0x3 + .byte 0x4 + .byte 0x5 + .byte 0xa + .byte 0xb + .byte 0xc + .byte 0xd + .byte 0xe + .byte 0xf + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0x2 + .byte 0x3 + .byte 0x4 + .byte 0x5 + .byte 0xa + .byte 0xb + .byte 0xc + .byte 0xd + .byte 0xe + .byte 0xf + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x6 + .byte 0x7 + .byte 0xa + .byte 0xb + .byte 0xc + .byte 0xd + .byte 0xe + .byte 0xf + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0x6 + .byte 0x7 + .byte 0xa + .byte 0xb + .byte 0xc + .byte 0xd + .byte 0xe + .byte 0xf + .byte 0xff + .byte 0xff + 
.byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x2 + .byte 0x3 + .byte 0x6 + .byte 0x7 + .byte 0xa + .byte 0xb + .byte 0xc + .byte 0xd + .byte 0xe + .byte 0xf + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0x2 + .byte 0x3 + .byte 0x6 + .byte 0x7 + .byte 0xa + .byte 0xb + .byte 0xc + .byte 0xd + .byte 0xe + .byte 0xf + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x4 + .byte 0x5 + .byte 0x6 + .byte 0x7 + .byte 0xa + .byte 0xb + .byte 0xc + .byte 0xd + .byte 0xe + .byte 0xf + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0x4 + .byte 0x5 + .byte 0x6 + .byte 0x7 + .byte 0xa + .byte 0xb + .byte 0xc + .byte 0xd + .byte 0xe + .byte 0xf + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x2 + .byte 0x3 + .byte 0x4 + .byte 0x5 + .byte 0x6 + .byte 0x7 + .byte 0xa + .byte 0xb + .byte 0xc + .byte 0xd + .byte 0xe + .byte 0xf + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0x2 + .byte 0x3 + .byte 0x4 + .byte 0x5 + .byte 0x6 + .byte 0x7 + .byte 0xa + .byte 0xb + .byte 0xc + .byte 0xd + .byte 0xe + .byte 0xf + .byte 0xff + .byte 0xff + .byte 0x8 + .byte 0x9 + .byte 0xa + .byte 0xb + .byte 0xc + .byte 0xd + .byte 0xe + .byte 0xf + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0x8 + .byte 0x9 + .byte 0xa + .byte 0xb + .byte 0xc + .byte 0xd + .byte 0xe + .byte 0xf + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x2 + .byte 0x3 + .byte 0x8 + .byte 0x9 + .byte 0xa + .byte 0xb + .byte 0xc + .byte 0xd + .byte 0xe + .byte 0xf + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0x2 + .byte 0x3 + .byte 0x8 + .byte 0x9 + .byte 0xa + .byte 0xb + .byte 0xc + .byte 0xd + .byte 0xe + .byte 0xf + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x4 + .byte 0x5 + .byte 0x8 + .byte 0x9 + .byte 0xa + .byte 0xb + .byte 0xc + .byte 0xd + .byte 0xe + .byte 0xf + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0x4 + .byte 0x5 + .byte 0x8 + .byte 0x9 + .byte 0xa + .byte 0xb + .byte 0xc + .byte 0xd + .byte 0xe + .byte 0xf + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x2 + .byte 0x3 + .byte 0x4 + .byte 0x5 + .byte 0x8 + .byte 0x9 + .byte 0xa + .byte 0xb + .byte 0xc + .byte 0xd + .byte 0xe + .byte 0xf + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0x2 + .byte 0x3 + .byte 0x4 + .byte 0x5 + .byte 0x8 + .byte 0x9 + .byte 0xa + .byte 0xb + .byte 0xc + .byte 0xd + .byte 0xe + .byte 0xf + .byte 0xff + .byte 0xff + .byte 0x6 + .byte 0x7 + .byte 0x8 + .byte 0x9 + .byte 0xa + .byte 0xb + .byte 0xc + .byte 0xd + .byte 0xe + .byte 0xf + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0x6 + .byte 0x7 + .byte 0x8 + .byte 0x9 + .byte 0xa + .byte 0xb + .byte 0xc + .byte 0xd + .byte 0xe + .byte 0xf + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x2 + .byte 0x3 + .byte 0x6 + .byte 0x7 + .byte 0x8 + .byte 0x9 + .byte 0xa + .byte 0xb + .byte 0xc + .byte 0xd + .byte 0xe + .byte 0xf + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0x2 + .byte 0x3 + .byte 0x6 + .byte 0x7 + .byte 0x8 + .byte 0x9 + .byte 0xa + .byte 0xb + .byte 0xc + .byte 0xd + .byte 0xe + 
.byte 0xf + .byte 0xff + .byte 0xff + .byte 0x4 + .byte 0x5 + .byte 0x6 + .byte 0x7 + .byte 0x8 + .byte 0x9 + .byte 0xa + .byte 0xb + .byte 0xc + .byte 0xd + .byte 0xe + .byte 0xf + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0x4 + .byte 0x5 + .byte 0x6 + .byte 0x7 + .byte 0x8 + .byte 0x9 + .byte 0xa + .byte 0xb + .byte 0xc + .byte 0xd + .byte 0xe + .byte 0xf + .byte 0xff + .byte 0xff + .byte 0x2 + .byte 0x3 + .byte 0x4 + .byte 0x5 + .byte 0x6 + .byte 0x7 + .byte 0x8 + .byte 0x9 + .byte 0xa + .byte 0xb + .byte 0xc + .byte 0xd + .byte 0xe + .byte 0xf + .byte 0xff + .byte 0xff + .byte 0x0 + .byte 0x1 + .byte 0x2 + .byte 0x3 + .byte 0x4 + .byte 0x5 + .byte 0x6 + .byte 0x7 + .byte 0x8 + .byte 0x9 + .byte 0xa + .byte 0xb + .byte 0xc + .byte 0xd + .byte 0xe + .byte 0xf +#ifndef __APPLE__ +.text +.globl kyber_rej_uniform_neon +.type kyber_rej_uniform_neon,@function +.align 2 +kyber_rej_uniform_neon: +#else +.section __TEXT,__text +.globl _kyber_rej_uniform_neon +.p2align 2 +_kyber_rej_uniform_neon: +#endif /* __APPLE__ */ + stp x29, x30, [sp, #-64]! + add x29, sp, #0 + stp d8, d9, [x29, #16] + stp d10, d11, [x29, #32] + stp d12, d13, [x29, #48] +#ifndef __APPLE__ + adrp x4, L_kyber_aarch64_rej_uniform_neon_mask + add x4, x4, :lo12:L_kyber_aarch64_rej_uniform_neon_mask +#else + adrp x4, L_kyber_aarch64_rej_uniform_neon_mask@PAGE + add x4, x4, :lo12:L_kyber_aarch64_rej_uniform_neon_mask@PAGEOFF +#endif /* __APPLE__ */ +#ifndef __APPLE__ + adrp x5, L_kyber_aarch64_q + add x5, x5, :lo12:L_kyber_aarch64_q +#else + adrp x5, L_kyber_aarch64_q@PAGE + add x5, x5, :lo12:L_kyber_aarch64_q@PAGEOFF +#endif /* __APPLE__ */ +#ifndef __APPLE__ + adrp x6, L_kyber_aarch64_rej_uniform_neon_bits + add x6, x6, :lo12:L_kyber_aarch64_rej_uniform_neon_bits +#else + adrp x6, L_kyber_aarch64_rej_uniform_neon_bits@PAGE + add x6, x6, :lo12:L_kyber_aarch64_rej_uniform_neon_bits@PAGEOFF +#endif /* __APPLE__ */ +#ifndef __APPLE__ + adrp x7, L_kyber_aarch64_rej_uniform_neon_indeces + add x7, x7, :lo12:L_kyber_aarch64_rej_uniform_neon_indeces +#else + adrp x7, L_kyber_aarch64_rej_uniform_neon_indeces@PAGE + add x7, x7, :lo12:L_kyber_aarch64_rej_uniform_neon_indeces@PAGEOFF +#endif /* __APPLE__ */ + eor v1.16b, v1.16b, v1.16b + eor v12.16b, v12.16b, v12.16b + eor v13.16b, v13.16b, v13.16b + eor x12, x12, x12 + eor v10.16b, v10.16b, v10.16b + eor v11.16b, v11.16b, v11.16b + mov x13, #0xd01 + ldr q0, [x4] + ldr q3, [x5] + ldr q2, [x6] + subs wzr, w1, #0 + beq L_kyber_aarch64_rej_uniform_neon_done + subs wzr, w1, #16 + blt L_kyber_aarch64_rej_uniform_neon_loop_4 +L_kyber_aarch64_rej_uniform_neon_loop_16: + ld3 {v4.8b, v5.8b, v6.8b}, [x2], #24 + zip1 v4.16b, v4.16b, v1.16b + zip1 v5.16b, v5.16b, v1.16b + zip1 v6.16b, v6.16b, v1.16b + shl v7.8h, v5.8h, #8 + ushr v8.8h, v5.8h, #4 + shl v6.8h, v6.8h, #4 + orr v4.16b, v4.16b, v7.16b + orr v5.16b, v8.16b, v6.16b + and v7.16b, v4.16b, v0.16b + and v8.16b, v5.16b, v0.16b + zip1 v4.8h, v7.8h, v8.8h + zip2 v5.8h, v7.8h, v8.8h + cmgt v7.8h, v3.8h, v4.8h + cmgt v8.8h, v3.8h, v5.8h + ushr v12.8h, v7.8h, #15 + ushr v13.8h, v8.8h, #15 + addv h12, v12.8h + addv h13, v13.8h + mov x10, v12.d[0] + mov x11, v13.d[0] + and v10.16b, v7.16b, v2.16b + and v11.16b, v8.16b, v2.16b + addv h10, v10.8h + addv h11, v11.8h + mov w8, v10.s[0] + mov w9, v11.s[0] + lsl w8, w8, #4 + lsl w9, w9, #4 + ldr q10, [x7, x8] + ldr q11, [x7, x9] + tbl v7.16b, {v4.16b}, v10.16b + tbl v8.16b, {v5.16b}, v11.16b + str q7, [x0] + add x0, x0, x10, lsl 1 + add x12, x12, x10 + str q8, [x0] + add 
x0, x0, x11, lsl 1 + add x12, x12, x11 + subs w3, w3, #24 + beq L_kyber_aarch64_rej_uniform_neon_done + sub w10, w1, w12 + subs x10, x10, #16 + blt L_kyber_aarch64_rej_uniform_neon_loop_4 + b L_kyber_aarch64_rej_uniform_neon_loop_16 +L_kyber_aarch64_rej_uniform_neon_loop_4: + subs w10, w1, w12 + beq L_kyber_aarch64_rej_uniform_neon_done + subs x10, x10, #4 + blt L_kyber_aarch64_rej_uniform_neon_loop_lt_4 + ldr x4, [x2], #6 + lsr x5, x4, #12 + lsr x6, x4, #24 + lsr x7, x4, #36 + and x4, x4, #0xfff + and x5, x5, #0xfff + and x6, x6, #0xfff + and x7, x7, #0xfff + strh w4, [x0] + subs xzr, x4, x13 + cinc x0, x0, lt + cinc x0, x0, lt + cinc x12, x12, lt + strh w5, [x0] + subs xzr, x5, x13 + cinc x0, x0, lt + cinc x0, x0, lt + cinc x12, x12, lt + strh w6, [x0] + subs xzr, x6, x13 + cinc x0, x0, lt + cinc x0, x0, lt + cinc x12, x12, lt + strh w7, [x0] + subs xzr, x7, x13 + cinc x0, x0, lt + cinc x0, x0, lt + cinc x12, x12, lt + subs w3, w3, #6 + beq L_kyber_aarch64_rej_uniform_neon_done + b L_kyber_aarch64_rej_uniform_neon_loop_4 +L_kyber_aarch64_rej_uniform_neon_loop_lt_4: + ldr x4, [x2], #6 + lsr x5, x4, #12 + lsr x6, x4, #24 + lsr x7, x4, #36 + and x4, x4, #0xfff + and x5, x5, #0xfff + and x6, x6, #0xfff + and x7, x7, #0xfff + strh w4, [x0] + subs xzr, x4, x13 + cinc x0, x0, lt + cinc x0, x0, lt + cinc x12, x12, lt + subs wzr, w1, w12 + beq L_kyber_aarch64_rej_uniform_neon_done + strh w5, [x0] + subs xzr, x5, x13 + cinc x0, x0, lt + cinc x0, x0, lt + cinc x12, x12, lt + subs wzr, w1, w12 + beq L_kyber_aarch64_rej_uniform_neon_done + strh w6, [x0] + subs xzr, x6, x13 + cinc x0, x0, lt + cinc x0, x0, lt + cinc x12, x12, lt + subs wzr, w1, w12 + beq L_kyber_aarch64_rej_uniform_neon_done + strh w7, [x0] + subs xzr, x7, x13 + cinc x0, x0, lt + cinc x0, x0, lt + cinc x12, x12, lt + subs wzr, w1, w12 + beq L_kyber_aarch64_rej_uniform_neon_done + subs w3, w3, #6 + beq L_kyber_aarch64_rej_uniform_neon_done + b L_kyber_aarch64_rej_uniform_neon_loop_lt_4 +L_kyber_aarch64_rej_uniform_neon_done: + mov x0, x12 + ldp d8, d9, [x29, #16] + ldp d10, d11, [x29, #32] + ldp d12, d13, [x29, #48] + ldp x29, x30, [sp], #0x40 + ret +#ifndef __APPLE__ + .size kyber_rej_uniform_neon,.-kyber_rej_uniform_neon +#endif /* __APPLE__ */ +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 +#ifndef __APPLE__ + .text + .type L_SHA3_transform_blocksx3_neon_r, %object + .section .rodata + .size L_SHA3_transform_blocksx3_neon_r, 192 +#else + .section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ + .align 3 +#else + .p2align 3 +#endif /* __APPLE__ */ +L_SHA3_transform_blocksx3_neon_r: + .xword 0x1 + .xword 0x8082 + .xword 0x800000000000808a + .xword 0x8000000080008000 + .xword 0x808b + .xword 0x80000001 + .xword 0x8000000080008081 + .xword 0x8000000000008009 + .xword 0x8a + .xword 0x88 + .xword 0x80008009 + .xword 0x8000000a + .xword 0x8000808b + .xword 0x800000000000008b + .xword 0x8000000000008089 + .xword 0x8000000000008003 + .xword 0x8000000000008002 + .xword 0x8000000000000080 + .xword 0x800a + .xword 0x800000008000000a + .xword 0x8000000080008081 + .xword 0x8000000000008080 + .xword 0x80000001 + .xword 0x8000000080008008 +#ifndef __APPLE__ +.text +.globl kyber_sha3_blocksx3_neon +.type kyber_sha3_blocksx3_neon,@function +.align 2 +kyber_sha3_blocksx3_neon: +#else +.section __TEXT,__text +.globl _kyber_sha3_blocksx3_neon +.p2align 2 +_kyber_sha3_blocksx3_neon: +#endif /* __APPLE__ */ + stp x29, x30, [sp, #-224]! 
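+ # Process three Keccak-f[1600] states at once: states 0 and 1 are interleaved in
+ # the 64-bit lanes of v0-v24, state 2 is kept in general-purpose registers.
+ # Uses the Armv8.2-A SHA-3 extension instructions EOR3, RAX1, XAR and BCAX.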
+ add x29, sp, #0 + stp x17, x19, [x29, #72] + stp x20, x21, [x29, #88] + stp x22, x23, [x29, #104] + stp x24, x25, [x29, #120] + stp x26, x27, [x29, #136] + str x28, [x29, #152] + stp d8, d9, [x29, #160] + stp d10, d11, [x29, #176] + stp d12, d13, [x29, #192] + stp d14, d15, [x29, #208] +#ifndef __APPLE__ + adrp x27, L_SHA3_transform_blocksx3_neon_r + add x27, x27, :lo12:L_SHA3_transform_blocksx3_neon_r +#else + adrp x27, L_SHA3_transform_blocksx3_neon_r@PAGE + add x27, x27, :lo12:L_SHA3_transform_blocksx3_neon_r@PAGEOFF +#endif /* __APPLE__ */ + str x0, [x29, #40] + ld4 {v0.d, v1.d, v2.d, v3.d}[0], [x0], #32 + ld4 {v4.d, v5.d, v6.d, v7.d}[0], [x0], #32 + ld4 {v8.d, v9.d, v10.d, v11.d}[0], [x0], #32 + ld4 {v12.d, v13.d, v14.d, v15.d}[0], [x0], #32 + ld4 {v16.d, v17.d, v18.d, v19.d}[0], [x0], #32 + ld4 {v20.d, v21.d, v22.d, v23.d}[0], [x0], #32 + ld1 {v24.d}[0], [x0] + add x0, x0, #8 + ld4 {v0.d, v1.d, v2.d, v3.d}[1], [x0], #32 + ld4 {v4.d, v5.d, v6.d, v7.d}[1], [x0], #32 + ld4 {v8.d, v9.d, v10.d, v11.d}[1], [x0], #32 + ld4 {v12.d, v13.d, v14.d, v15.d}[1], [x0], #32 + ld4 {v16.d, v17.d, v18.d, v19.d}[1], [x0], #32 + ld4 {v20.d, v21.d, v22.d, v23.d}[1], [x0], #32 + ld1 {v24.d}[1], [x0] + add x0, x0, #8 + ldp x1, x2, [x0] + ldp x3, x4, [x0, #16] + ldp x5, x6, [x0, #32] + ldp x7, x8, [x0, #48] + ldp x9, x10, [x0, #64] + ldp x11, x12, [x0, #80] + ldp x13, x14, [x0, #96] + ldp x15, x16, [x0, #112] + ldp x17, x19, [x0, #128] + ldp x20, x21, [x0, #144] + ldp x22, x23, [x0, #160] + ldp x24, x25, [x0, #176] + ldr x26, [x0, #192] + mov x28, #24 + # Start of 24 rounds +L_SHA3_transform_blocksx3_neon_begin: + stp x27, x28, [x29, #48] + # Col Mix + eor3 v31.16b, v0.16b, v5.16b, v10.16b + eor x0, x5, x10 + eor3 v27.16b, v1.16b, v6.16b, v11.16b + eor x30, x1, x6 + eor3 v28.16b, v2.16b, v7.16b, v12.16b + eor x28, x3, x8 + eor3 v29.16b, v3.16b, v8.16b, v13.16b + eor x0, x0, x15 + eor3 v30.16b, v4.16b, v9.16b, v14.16b + eor x30, x30, x11 + eor3 v31.16b, v31.16b, v15.16b, v20.16b + eor x28, x28, x13 + eor3 v27.16b, v27.16b, v16.16b, v21.16b + eor x0, x0, x21 + eor3 v28.16b, v28.16b, v17.16b, v22.16b + eor x30, x30, x16 + eor3 v29.16b, v29.16b, v18.16b, v23.16b + eor x28, x28, x19 + eor3 v30.16b, v30.16b, v19.16b, v24.16b + eor x0, x0, x26 + rax1 v25.2d, v30.2d, v27.2d + eor x30, x30, x22 + rax1 v26.2d, v31.2d, v28.2d + eor x28, x28, x24 + rax1 v27.2d, v27.2d, v29.2d + str x0, [x29, #32] + rax1 v28.2d, v28.2d, v30.2d + str x28, [x29, #24] + rax1 v29.2d, v29.2d, v31.2d + eor x27, x2, x7 + eor v0.16b, v0.16b, v25.16b + xar v30.2d, v1.2d, v26.2d, #63 + eor x28, x4, x9 + xar v1.2d, v6.2d, v26.2d, #20 + eor x27, x27, x12 + xar v6.2d, v9.2d, v29.2d, #44 + eor x28, x28, x14 + xar v9.2d, v22.2d, v27.2d, #3 + eor x27, x27, x17 + xar v22.2d, v14.2d, v29.2d, #25 + eor x28, x28, x20 + xar v14.2d, v20.2d, v25.2d, #46 + eor x27, x27, x23 + xar v20.2d, v2.2d, v27.2d, #2 + eor x28, x28, x25 + xar v2.2d, v12.2d, v27.2d, #21 + eor x0, x0, x27, ror 63 + xar v12.2d, v13.2d, v28.2d, #39 + eor x27, x27, x28, ror 63 + xar v13.2d, v19.2d, v29.2d, #56 + eor x1, x1, x0 + xar v19.2d, v23.2d, v28.2d, #8 + eor x6, x6, x0 + xar v23.2d, v15.2d, v25.2d, #23 + eor x11, x11, x0 + xar v15.2d, v4.2d, v29.2d, #37 + eor x16, x16, x0 + xar v4.2d, v24.2d, v29.2d, #50 + eor x22, x22, x0 + xar v24.2d, v21.2d, v26.2d, #62 + eor x3, x3, x27 + xar v21.2d, v8.2d, v28.2d, #9 + eor x8, x8, x27 + xar v8.2d, v16.2d, v26.2d, #19 + eor x13, x13, x27 + xar v16.2d, v5.2d, v25.2d, #28 + eor x19, x19, x27 + xar v5.2d, v3.2d, v28.2d, #36 + eor x24, x24, x27 + xar 
v3.2d, v18.2d, v28.2d, #43 + ldr x0, [x29, #32] + xar v18.2d, v17.2d, v27.2d, #49 + ldr x27, [x29, #24] + xar v17.2d, v11.2d, v26.2d, #54 + eor x28, x28, x30, ror 63 + xar v11.2d, v7.2d, v27.2d, #58 + eor x30, x30, x27, ror 63 + xar v7.2d, v10.2d, v25.2d, #61 + eor x27, x27, x0, ror 63 + # Row Mix + mov v25.16b, v0.16b + eor x5, x5, x28 + mov v26.16b, v1.16b + eor x10, x10, x28 + bcax v0.16b, v25.16b, v2.16b, v26.16b + eor x15, x15, x28 + bcax v1.16b, v26.16b, v3.16b, v2.16b + eor x21, x21, x28 + bcax v2.16b, v2.16b, v4.16b, v3.16b + eor x26, x26, x28 + bcax v3.16b, v3.16b, v25.16b, v4.16b + eor x2, x2, x30 + bcax v4.16b, v4.16b, v26.16b, v25.16b + eor x7, x7, x30 + mov v25.16b, v5.16b + eor x12, x12, x30 + mov v26.16b, v6.16b + eor x17, x17, x30 + bcax v5.16b, v25.16b, v7.16b, v26.16b + eor x23, x23, x30 + bcax v6.16b, v26.16b, v8.16b, v7.16b + eor x4, x4, x27 + bcax v7.16b, v7.16b, v9.16b, v8.16b + eor x9, x9, x27 + bcax v8.16b, v8.16b, v25.16b, v9.16b + eor x14, x14, x27 + bcax v9.16b, v9.16b, v26.16b, v25.16b + eor x20, x20, x27 + mov v26.16b, v11.16b + eor x25, x25, x27 + # Swap Rotate Base + bcax v10.16b, v30.16b, v12.16b, v26.16b + ror x0, x2, #63 + bcax v11.16b, v26.16b, v13.16b, v12.16b + ror x2, x7, #20 + bcax v12.16b, v12.16b, v14.16b, v13.16b + ror x7, x10, #44 + bcax v13.16b, v13.16b, v30.16b, v14.16b + ror x10, x24, #3 + bcax v14.16b, v14.16b, v26.16b, v30.16b + ror x24, x15, #25 + mov v25.16b, v15.16b + ror x15, x22, #46 + mov v26.16b, v16.16b + ror x22, x3, #2 + bcax v15.16b, v25.16b, v17.16b, v26.16b + ror x3, x13, #21 + bcax v16.16b, v26.16b, v18.16b, v17.16b + ror x13, x14, #39 + bcax v17.16b, v17.16b, v19.16b, v18.16b + ror x14, x21, #56 + bcax v18.16b, v18.16b, v25.16b, v19.16b + ror x21, x25, #8 + bcax v19.16b, v19.16b, v26.16b, v25.16b + ror x25, x16, #23 + mov v25.16b, v20.16b + ror x16, x5, #37 + mov v26.16b, v21.16b + ror x5, x26, #50 + bcax v20.16b, v25.16b, v22.16b, v26.16b + ror x26, x23, #62 + bcax v21.16b, v26.16b, v23.16b, v22.16b + ror x23, x9, #9 + bcax v22.16b, v22.16b, v24.16b, v23.16b + ror x9, x17, #19 + bcax v23.16b, v23.16b, v25.16b, v24.16b + ror x17, x6, #28 + bcax v24.16b, v24.16b, v26.16b, v25.16b + ror x6, x4, #36 + ror x4, x20, #43 + ror x20, x19, #49 + ror x19, x12, #54 + ror x12, x8, #58 + ror x8, x11, #61 + # Row Mix Base + bic x11, x3, x2 + bic x27, x4, x3 + bic x28, x1, x5 + bic x30, x2, x1 + eor x1, x1, x11 + eor x2, x2, x27 + bic x11, x5, x4 + eor x4, x4, x28 + eor x3, x3, x11 + eor x5, x5, x30 + bic x11, x8, x7 + bic x27, x9, x8 + bic x28, x6, x10 + bic x30, x7, x6 + eor x6, x6, x11 + eor x7, x7, x27 + bic x11, x10, x9 + eor x9, x9, x28 + eor x8, x8, x11 + eor x10, x10, x30 + bic x11, x13, x12 + bic x27, x14, x13 + bic x28, x0, x15 + bic x30, x12, x0 + eor x11, x0, x11 + eor x12, x12, x27 + bic x0, x15, x14 + eor x14, x14, x28 + eor x13, x13, x0 + eor x15, x15, x30 + bic x0, x19, x17 + bic x27, x20, x19 + bic x28, x16, x21 + bic x30, x17, x16 + eor x16, x16, x0 + eor x17, x17, x27 + bic x0, x21, x20 + eor x20, x20, x28 + eor x19, x19, x0 + eor x21, x21, x30 + bic x0, x24, x23 + bic x27, x25, x24 + bic x28, x22, x26 + bic x30, x23, x22 + eor x22, x22, x0 + eor x23, x23, x27 + bic x0, x26, x25 + eor x25, x25, x28 + eor x24, x24, x0 + eor x26, x26, x30 + # Done tranforming + ldp x27, x28, [x29, #48] + ldr x0, [x27], #8 + subs x28, x28, #1 + mov v30.d[0], x0 + mov v30.d[1], x0 + eor x1, x1, x0 + eor v0.16b, v0.16b, v30.16b + bne L_SHA3_transform_blocksx3_neon_begin + ldr x0, [x29, #40] + st4 {v0.d, v1.d, v2.d, v3.d}[0], [x0], #32 + st4 
{v4.d, v5.d, v6.d, v7.d}[0], [x0], #32 + st4 {v8.d, v9.d, v10.d, v11.d}[0], [x0], #32 + st4 {v12.d, v13.d, v14.d, v15.d}[0], [x0], #32 + st4 {v16.d, v17.d, v18.d, v19.d}[0], [x0], #32 + st4 {v20.d, v21.d, v22.d, v23.d}[0], [x0], #32 + st1 {v24.d}[0], [x0] + add x0, x0, #8 + st4 {v0.d, v1.d, v2.d, v3.d}[1], [x0], #32 + st4 {v4.d, v5.d, v6.d, v7.d}[1], [x0], #32 + st4 {v8.d, v9.d, v10.d, v11.d}[1], [x0], #32 + st4 {v12.d, v13.d, v14.d, v15.d}[1], [x0], #32 + st4 {v16.d, v17.d, v18.d, v19.d}[1], [x0], #32 + st4 {v20.d, v21.d, v22.d, v23.d}[1], [x0], #32 + st1 {v24.d}[1], [x0] + add x0, x0, #8 + stp x1, x2, [x0] + stp x3, x4, [x0, #16] + stp x5, x6, [x0, #32] + stp x7, x8, [x0, #48] + stp x9, x10, [x0, #64] + stp x11, x12, [x0, #80] + stp x13, x14, [x0, #96] + stp x15, x16, [x0, #112] + stp x17, x19, [x0, #128] + stp x20, x21, [x0, #144] + stp x22, x23, [x0, #160] + stp x24, x25, [x0, #176] + str x26, [x0, #192] + ldp x17, x19, [x29, #72] + ldp x20, x21, [x29, #88] + ldp x22, x23, [x29, #104] + ldp x24, x25, [x29, #120] + ldp x26, x27, [x29, #136] + ldr x28, [x29, #152] + ldp d8, d9, [x29, #160] + ldp d10, d11, [x29, #176] + ldp d12, d13, [x29, #192] + ldp d14, d15, [x29, #208] + ldp x29, x30, [sp], #0xe0 + ret +#ifndef __APPLE__ + .size kyber_sha3_blocksx3_neon,.-kyber_sha3_blocksx3_neon +#endif /* __APPLE__ */ +#ifndef __APPLE__ + .text + .type L_SHA3_shake128_blocksx3_seed_neon_r, %object + .section .rodata + .size L_SHA3_shake128_blocksx3_seed_neon_r, 192 +#else + .section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ + .align 3 +#else + .p2align 3 +#endif /* __APPLE__ */ +L_SHA3_shake128_blocksx3_seed_neon_r: + .xword 0x1 + .xword 0x8082 + .xword 0x800000000000808a + .xword 0x8000000080008000 + .xword 0x808b + .xword 0x80000001 + .xword 0x8000000080008081 + .xword 0x8000000000008009 + .xword 0x8a + .xword 0x88 + .xword 0x80008009 + .xword 0x8000000a + .xword 0x8000808b + .xword 0x800000000000008b + .xword 0x8000000000008089 + .xword 0x8000000000008003 + .xword 0x8000000000008002 + .xword 0x8000000000000080 + .xword 0x800a + .xword 0x800000008000000a + .xword 0x8000000080008081 + .xword 0x8000000000008080 + .xword 0x80000001 + .xword 0x8000000080008008 +#ifndef __APPLE__ +.text +.globl kyber_shake128_blocksx3_seed_neon +.type kyber_shake128_blocksx3_seed_neon,@function +.align 2 +kyber_shake128_blocksx3_seed_neon: +#else +.section __TEXT,__text +.globl _kyber_shake128_blocksx3_seed_neon +.p2align 2 +_kyber_shake128_blocksx3_seed_neon: +#endif /* __APPLE__ */ + stp x29, x30, [sp, #-224]! 
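+ # Absorb a 32-byte seed into three SHAKE-128 states: words 0-3 are shared by all
+ # three states, word 4 is read per state from the state buffers, and word 20
+ # holds the final pad bit of the 168-byte rate.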
+ add x29, sp, #0 + stp x17, x19, [x29, #72] + stp x20, x21, [x29, #88] + stp x22, x23, [x29, #104] + stp x24, x25, [x29, #120] + stp x26, x27, [x29, #136] + str x28, [x29, #152] + stp d8, d9, [x29, #160] + stp d10, d11, [x29, #176] + stp d12, d13, [x29, #192] + stp d14, d15, [x29, #208] +#ifndef __APPLE__ + adrp x28, L_SHA3_shake128_blocksx3_seed_neon_r + add x28, x28, :lo12:L_SHA3_shake128_blocksx3_seed_neon_r +#else + adrp x28, L_SHA3_shake128_blocksx3_seed_neon_r@PAGE + add x28, x28, :lo12:L_SHA3_shake128_blocksx3_seed_neon_r@PAGEOFF +#endif /* __APPLE__ */ + str x0, [x29, #40] + add x0, x0, #32 + ld1 {v4.d}[0], [x0] + ldp x2, x3, [x1], #16 + add x0, x0, #0xc8 + ld1 {v4.d}[1], [x0] + ldp x4, x5, [x1], #16 + ldr x6, [x0, #200] + eor v5.16b, v5.16b, v5.16b + eor x7, x7, x7 + eor v6.16b, v6.16b, v6.16b + eor x8, x8, x8 + eor v7.16b, v7.16b, v7.16b + eor x9, x9, x9 + eor v8.16b, v8.16b, v8.16b + eor x10, x10, x10 + eor v9.16b, v9.16b, v9.16b + eor x11, x11, x11 + eor v10.16b, v10.16b, v10.16b + eor x12, x12, x12 + eor v11.16b, v11.16b, v11.16b + eor x13, x13, x13 + eor v12.16b, v12.16b, v12.16b + eor x14, x14, x14 + eor v13.16b, v13.16b, v13.16b + eor x15, x15, x15 + eor v14.16b, v14.16b, v14.16b + eor x16, x16, x16 + eor v15.16b, v15.16b, v15.16b + eor x17, x17, x17 + eor v16.16b, v16.16b, v16.16b + eor x19, x19, x19 + eor v17.16b, v17.16b, v17.16b + eor x20, x20, x20 + eor v18.16b, v18.16b, v18.16b + eor x21, x21, x21 + eor v19.16b, v19.16b, v19.16b + eor x22, x22, x22 + movz x23, #0x8000, lsl 48 + eor v21.16b, v21.16b, v21.16b + eor x24, x24, x24 + eor v22.16b, v22.16b, v22.16b + eor x25, x25, x25 + eor v23.16b, v23.16b, v23.16b + eor x26, x26, x26 + eor v24.16b, v24.16b, v24.16b + eor x27, x27, x27 + dup v0.2d, x2 + dup v1.2d, x3 + dup v2.2d, x4 + dup v3.2d, x5 + dup v20.2d, x23 + mov x1, #24 + # Start of 24 rounds +L_SHA3_shake128_blocksx3_seed_neon_begin: + stp x28, x1, [x29, #48] + # Col Mix + eor3 v31.16b, v0.16b, v5.16b, v10.16b + eor x0, x6, x11 + eor3 v27.16b, v1.16b, v6.16b, v11.16b + eor x30, x2, x7 + eor3 v28.16b, v2.16b, v7.16b, v12.16b + eor x28, x4, x9 + eor3 v29.16b, v3.16b, v8.16b, v13.16b + eor x0, x0, x16 + eor3 v30.16b, v4.16b, v9.16b, v14.16b + eor x30, x30, x12 + eor3 v31.16b, v31.16b, v15.16b, v20.16b + eor x28, x28, x14 + eor3 v27.16b, v27.16b, v16.16b, v21.16b + eor x0, x0, x22 + eor3 v28.16b, v28.16b, v17.16b, v22.16b + eor x30, x30, x17 + eor3 v29.16b, v29.16b, v18.16b, v23.16b + eor x28, x28, x20 + eor3 v30.16b, v30.16b, v19.16b, v24.16b + eor x0, x0, x27 + rax1 v25.2d, v30.2d, v27.2d + eor x30, x30, x23 + rax1 v26.2d, v31.2d, v28.2d + eor x28, x28, x25 + rax1 v27.2d, v27.2d, v29.2d + str x0, [x29, #32] + rax1 v28.2d, v28.2d, v30.2d + str x28, [x29, #24] + rax1 v29.2d, v29.2d, v31.2d + eor x1, x3, x8 + eor v0.16b, v0.16b, v25.16b + xar v30.2d, v1.2d, v26.2d, #63 + eor x28, x5, x10 + xar v1.2d, v6.2d, v26.2d, #20 + eor x1, x1, x13 + xar v6.2d, v9.2d, v29.2d, #44 + eor x28, x28, x15 + xar v9.2d, v22.2d, v27.2d, #3 + eor x1, x1, x19 + xar v22.2d, v14.2d, v29.2d, #25 + eor x28, x28, x21 + xar v14.2d, v20.2d, v25.2d, #46 + eor x1, x1, x24 + xar v20.2d, v2.2d, v27.2d, #2 + eor x28, x28, x26 + xar v2.2d, v12.2d, v27.2d, #21 + eor x0, x0, x1, ror 63 + xar v12.2d, v13.2d, v28.2d, #39 + eor x1, x1, x28, ror 63 + xar v13.2d, v19.2d, v29.2d, #56 + eor x2, x2, x0 + xar v19.2d, v23.2d, v28.2d, #8 + eor x7, x7, x0 + xar v23.2d, v15.2d, v25.2d, #23 + eor x12, x12, x0 + xar v15.2d, v4.2d, v29.2d, #37 + eor x17, x17, x0 + xar v4.2d, v24.2d, v29.2d, #50 + eor x23, x23, x0 + xar 
v24.2d, v21.2d, v26.2d, #62 + eor x4, x4, x1 + xar v21.2d, v8.2d, v28.2d, #9 + eor x9, x9, x1 + xar v8.2d, v16.2d, v26.2d, #19 + eor x14, x14, x1 + xar v16.2d, v5.2d, v25.2d, #28 + eor x20, x20, x1 + xar v5.2d, v3.2d, v28.2d, #36 + eor x25, x25, x1 + xar v3.2d, v18.2d, v28.2d, #43 + ldr x0, [x29, #32] + xar v18.2d, v17.2d, v27.2d, #49 + ldr x1, [x29, #24] + xar v17.2d, v11.2d, v26.2d, #54 + eor x28, x28, x30, ror 63 + xar v11.2d, v7.2d, v27.2d, #58 + eor x30, x30, x1, ror 63 + xar v7.2d, v10.2d, v25.2d, #61 + eor x1, x1, x0, ror 63 + # Row Mix + mov v25.16b, v0.16b + eor x6, x6, x28 + mov v26.16b, v1.16b + eor x11, x11, x28 + bcax v0.16b, v25.16b, v2.16b, v26.16b + eor x16, x16, x28 + bcax v1.16b, v26.16b, v3.16b, v2.16b + eor x22, x22, x28 + bcax v2.16b, v2.16b, v4.16b, v3.16b + eor x27, x27, x28 + bcax v3.16b, v3.16b, v25.16b, v4.16b + eor x3, x3, x30 + bcax v4.16b, v4.16b, v26.16b, v25.16b + eor x8, x8, x30 + mov v25.16b, v5.16b + eor x13, x13, x30 + mov v26.16b, v6.16b + eor x19, x19, x30 + bcax v5.16b, v25.16b, v7.16b, v26.16b + eor x24, x24, x30 + bcax v6.16b, v26.16b, v8.16b, v7.16b + eor x5, x5, x1 + bcax v7.16b, v7.16b, v9.16b, v8.16b + eor x10, x10, x1 + bcax v8.16b, v8.16b, v25.16b, v9.16b + eor x15, x15, x1 + bcax v9.16b, v9.16b, v26.16b, v25.16b + eor x21, x21, x1 + mov v26.16b, v11.16b + eor x26, x26, x1 + # Swap Rotate Base + bcax v10.16b, v30.16b, v12.16b, v26.16b + ror x0, x3, #63 + bcax v11.16b, v26.16b, v13.16b, v12.16b + ror x3, x8, #20 + bcax v12.16b, v12.16b, v14.16b, v13.16b + ror x8, x11, #44 + bcax v13.16b, v13.16b, v30.16b, v14.16b + ror x11, x25, #3 + bcax v14.16b, v14.16b, v26.16b, v30.16b + ror x25, x16, #25 + mov v25.16b, v15.16b + ror x16, x23, #46 + mov v26.16b, v16.16b + ror x23, x4, #2 + bcax v15.16b, v25.16b, v17.16b, v26.16b + ror x4, x14, #21 + bcax v16.16b, v26.16b, v18.16b, v17.16b + ror x14, x15, #39 + bcax v17.16b, v17.16b, v19.16b, v18.16b + ror x15, x22, #56 + bcax v18.16b, v18.16b, v25.16b, v19.16b + ror x22, x26, #8 + bcax v19.16b, v19.16b, v26.16b, v25.16b + ror x26, x17, #23 + mov v25.16b, v20.16b + ror x17, x6, #37 + mov v26.16b, v21.16b + ror x6, x27, #50 + bcax v20.16b, v25.16b, v22.16b, v26.16b + ror x27, x24, #62 + bcax v21.16b, v26.16b, v23.16b, v22.16b + ror x24, x10, #9 + bcax v22.16b, v22.16b, v24.16b, v23.16b + ror x10, x19, #19 + bcax v23.16b, v23.16b, v25.16b, v24.16b + ror x19, x7, #28 + bcax v24.16b, v24.16b, v26.16b, v25.16b + ror x7, x5, #36 + ror x5, x21, #43 + ror x21, x20, #49 + ror x20, x13, #54 + ror x13, x9, #58 + ror x9, x12, #61 + # Row Mix Base + bic x12, x4, x3 + bic x1, x5, x4 + bic x28, x2, x6 + bic x30, x3, x2 + eor x2, x2, x12 + eor x3, x3, x1 + bic x12, x6, x5 + eor x5, x5, x28 + eor x4, x4, x12 + eor x6, x6, x30 + bic x12, x9, x8 + bic x1, x10, x9 + bic x28, x7, x11 + bic x30, x8, x7 + eor x7, x7, x12 + eor x8, x8, x1 + bic x12, x11, x10 + eor x10, x10, x28 + eor x9, x9, x12 + eor x11, x11, x30 + bic x12, x14, x13 + bic x1, x15, x14 + bic x28, x0, x16 + bic x30, x13, x0 + eor x12, x0, x12 + eor x13, x13, x1 + bic x0, x16, x15 + eor x15, x15, x28 + eor x14, x14, x0 + eor x16, x16, x30 + bic x0, x20, x19 + bic x1, x21, x20 + bic x28, x17, x22 + bic x30, x19, x17 + eor x17, x17, x0 + eor x19, x19, x1 + bic x0, x22, x21 + eor x21, x21, x28 + eor x20, x20, x0 + eor x22, x22, x30 + bic x0, x25, x24 + bic x1, x26, x25 + bic x28, x23, x27 + bic x30, x24, x23 + eor x23, x23, x0 + eor x24, x24, x1 + bic x0, x27, x26 + eor x26, x26, x28 + eor x25, x25, x0 + eor x27, x27, x30 + # Done tranforming + ldp x28, x1, [x29, #48] + 
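+ # Iota: load the next round constant and XOR it into word 0 of all three states,
+ # then loop until the 24 rounds are complete.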
ldr x0, [x28], #8 + subs x1, x1, #1 + mov v30.d[0], x0 + mov v30.d[1], x0 + eor x2, x2, x0 + eor v0.16b, v0.16b, v30.16b + bne L_SHA3_shake128_blocksx3_seed_neon_begin + ldr x0, [x29, #40] + st4 {v0.d, v1.d, v2.d, v3.d}[0], [x0], #32 + st4 {v4.d, v5.d, v6.d, v7.d}[0], [x0], #32 + st4 {v8.d, v9.d, v10.d, v11.d}[0], [x0], #32 + st4 {v12.d, v13.d, v14.d, v15.d}[0], [x0], #32 + st4 {v16.d, v17.d, v18.d, v19.d}[0], [x0], #32 + st4 {v20.d, v21.d, v22.d, v23.d}[0], [x0], #32 + st1 {v24.d}[0], [x0] + add x0, x0, #8 + st4 {v0.d, v1.d, v2.d, v3.d}[1], [x0], #32 + st4 {v4.d, v5.d, v6.d, v7.d}[1], [x0], #32 + st4 {v8.d, v9.d, v10.d, v11.d}[1], [x0], #32 + st4 {v12.d, v13.d, v14.d, v15.d}[1], [x0], #32 + st4 {v16.d, v17.d, v18.d, v19.d}[1], [x0], #32 + st4 {v20.d, v21.d, v22.d, v23.d}[1], [x0], #32 + st1 {v24.d}[1], [x0] + add x0, x0, #8 + stp x2, x3, [x0] + stp x4, x5, [x0, #16] + stp x6, x7, [x0, #32] + stp x8, x9, [x0, #48] + stp x10, x11, [x0, #64] + stp x12, x13, [x0, #80] + stp x14, x15, [x0, #96] + stp x16, x17, [x0, #112] + stp x19, x20, [x0, #128] + stp x21, x22, [x0, #144] + stp x23, x24, [x0, #160] + stp x25, x26, [x0, #176] + str x27, [x0, #192] + ldp x17, x19, [x29, #72] + ldp x20, x21, [x29, #88] + ldp x22, x23, [x29, #104] + ldp x24, x25, [x29, #120] + ldp x26, x27, [x29, #136] + ldr x28, [x29, #152] + ldp d8, d9, [x29, #160] + ldp d10, d11, [x29, #176] + ldp d12, d13, [x29, #192] + ldp d14, d15, [x29, #208] + ldp x29, x30, [sp], #0xe0 + ret +#ifndef __APPLE__ + .size kyber_shake128_blocksx3_seed_neon,.-kyber_shake128_blocksx3_seed_neon +#endif /* __APPLE__ */ +#ifndef __APPLE__ + .text + .type L_SHA3_shake256_blocksx3_seed_neon_r, %object + .section .rodata + .size L_SHA3_shake256_blocksx3_seed_neon_r, 192 +#else + .section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ + .align 3 +#else + .p2align 3 +#endif /* __APPLE__ */ +L_SHA3_shake256_blocksx3_seed_neon_r: + .xword 0x1 + .xword 0x8082 + .xword 0x800000000000808a + .xword 0x8000000080008000 + .xword 0x808b + .xword 0x80000001 + .xword 0x8000000080008081 + .xword 0x8000000000008009 + .xword 0x8a + .xword 0x88 + .xword 0x80008009 + .xword 0x8000000a + .xword 0x8000808b + .xword 0x800000000000008b + .xword 0x8000000000008089 + .xword 0x8000000000008003 + .xword 0x8000000000008002 + .xword 0x8000000000000080 + .xword 0x800a + .xword 0x800000008000000a + .xword 0x8000000080008081 + .xword 0x8000000000008080 + .xword 0x80000001 + .xword 0x8000000080008008 +#ifndef __APPLE__ +.text +.globl kyber_shake256_blocksx3_seed_neon +.type kyber_shake256_blocksx3_seed_neon,@function +.align 2 +kyber_shake256_blocksx3_seed_neon: +#else +.section __TEXT,__text +.globl _kyber_shake256_blocksx3_seed_neon +.p2align 2 +_kyber_shake256_blocksx3_seed_neon: +#endif /* __APPLE__ */ + stp x29, x30, [sp, #-224]! 
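+ # Same layout as the SHAKE-128 variant above, but for SHAKE-256: words 0-3 carry
+ # the shared 32-byte seed, word 4 is read per state from the state buffers, and
+ # word 16 holds the final pad bit of the 136-byte rate.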
+ add x29, sp, #0 + stp x17, x19, [x29, #72] + stp x20, x21, [x29, #88] + stp x22, x23, [x29, #104] + stp x24, x25, [x29, #120] + stp x26, x27, [x29, #136] + str x28, [x29, #152] + stp d8, d9, [x29, #160] + stp d10, d11, [x29, #176] + stp d12, d13, [x29, #192] + stp d14, d15, [x29, #208] +#ifndef __APPLE__ + adrp x28, L_SHA3_shake256_blocksx3_seed_neon_r + add x28, x28, :lo12:L_SHA3_shake256_blocksx3_seed_neon_r +#else + adrp x28, L_SHA3_shake256_blocksx3_seed_neon_r@PAGE + add x28, x28, :lo12:L_SHA3_shake256_blocksx3_seed_neon_r@PAGEOFF +#endif /* __APPLE__ */ + str x0, [x29, #40] + add x0, x0, #32 + ld1 {v4.d}[0], [x0] + ldp x2, x3, [x1], #16 + add x0, x0, #0xc8 + ld1 {v4.d}[1], [x0] + ldp x4, x5, [x1], #16 + ldr x6, [x0, #200] + eor v5.16b, v5.16b, v5.16b + eor x7, x7, x7 + eor v6.16b, v6.16b, v6.16b + eor x8, x8, x8 + eor v7.16b, v7.16b, v7.16b + eor x9, x9, x9 + eor v8.16b, v8.16b, v8.16b + eor x10, x10, x10 + eor v9.16b, v9.16b, v9.16b + eor x11, x11, x11 + eor v10.16b, v10.16b, v10.16b + eor x12, x12, x12 + eor v11.16b, v11.16b, v11.16b + eor x13, x13, x13 + eor v12.16b, v12.16b, v12.16b + eor x14, x14, x14 + eor v13.16b, v13.16b, v13.16b + eor x15, x15, x15 + eor v14.16b, v14.16b, v14.16b + eor x16, x16, x16 + eor v15.16b, v15.16b, v15.16b + eor x17, x17, x17 + movz x19, #0x8000, lsl 48 + eor v17.16b, v17.16b, v17.16b + eor x20, x20, x20 + eor v18.16b, v18.16b, v18.16b + eor x21, x21, x21 + eor v19.16b, v19.16b, v19.16b + eor x22, x22, x22 + eor v20.16b, v20.16b, v20.16b + eor x23, x23, x23 + eor v21.16b, v21.16b, v21.16b + eor x24, x24, x24 + eor v22.16b, v22.16b, v22.16b + eor x25, x25, x25 + eor v23.16b, v23.16b, v23.16b + eor x26, x26, x26 + eor v24.16b, v24.16b, v24.16b + eor x27, x27, x27 + dup v0.2d, x2 + dup v1.2d, x3 + dup v2.2d, x4 + dup v3.2d, x5 + dup v16.2d, x19 + mov x1, #24 + # Start of 24 rounds +L_SHA3_shake256_blocksx3_seed_neon_begin: + stp x28, x1, [x29, #48] + # Col Mix + eor3 v31.16b, v0.16b, v5.16b, v10.16b + eor x0, x6, x11 + eor3 v27.16b, v1.16b, v6.16b, v11.16b + eor x30, x2, x7 + eor3 v28.16b, v2.16b, v7.16b, v12.16b + eor x28, x4, x9 + eor3 v29.16b, v3.16b, v8.16b, v13.16b + eor x0, x0, x16 + eor3 v30.16b, v4.16b, v9.16b, v14.16b + eor x30, x30, x12 + eor3 v31.16b, v31.16b, v15.16b, v20.16b + eor x28, x28, x14 + eor3 v27.16b, v27.16b, v16.16b, v21.16b + eor x0, x0, x22 + eor3 v28.16b, v28.16b, v17.16b, v22.16b + eor x30, x30, x17 + eor3 v29.16b, v29.16b, v18.16b, v23.16b + eor x28, x28, x20 + eor3 v30.16b, v30.16b, v19.16b, v24.16b + eor x0, x0, x27 + rax1 v25.2d, v30.2d, v27.2d + eor x30, x30, x23 + rax1 v26.2d, v31.2d, v28.2d + eor x28, x28, x25 + rax1 v27.2d, v27.2d, v29.2d + str x0, [x29, #32] + rax1 v28.2d, v28.2d, v30.2d + str x28, [x29, #24] + rax1 v29.2d, v29.2d, v31.2d + eor x1, x3, x8 + eor v0.16b, v0.16b, v25.16b + xar v30.2d, v1.2d, v26.2d, #63 + eor x28, x5, x10 + xar v1.2d, v6.2d, v26.2d, #20 + eor x1, x1, x13 + xar v6.2d, v9.2d, v29.2d, #44 + eor x28, x28, x15 + xar v9.2d, v22.2d, v27.2d, #3 + eor x1, x1, x19 + xar v22.2d, v14.2d, v29.2d, #25 + eor x28, x28, x21 + xar v14.2d, v20.2d, v25.2d, #46 + eor x1, x1, x24 + xar v20.2d, v2.2d, v27.2d, #2 + eor x28, x28, x26 + xar v2.2d, v12.2d, v27.2d, #21 + eor x0, x0, x1, ror 63 + xar v12.2d, v13.2d, v28.2d, #39 + eor x1, x1, x28, ror 63 + xar v13.2d, v19.2d, v29.2d, #56 + eor x2, x2, x0 + xar v19.2d, v23.2d, v28.2d, #8 + eor x7, x7, x0 + xar v23.2d, v15.2d, v25.2d, #23 + eor x12, x12, x0 + xar v15.2d, v4.2d, v29.2d, #37 + eor x17, x17, x0 + xar v4.2d, v24.2d, v29.2d, #50 + eor x23, x23, x0 + xar 
v24.2d, v21.2d, v26.2d, #62 + eor x4, x4, x1 + xar v21.2d, v8.2d, v28.2d, #9 + eor x9, x9, x1 + xar v8.2d, v16.2d, v26.2d, #19 + eor x14, x14, x1 + xar v16.2d, v5.2d, v25.2d, #28 + eor x20, x20, x1 + xar v5.2d, v3.2d, v28.2d, #36 + eor x25, x25, x1 + xar v3.2d, v18.2d, v28.2d, #43 + ldr x0, [x29, #32] + xar v18.2d, v17.2d, v27.2d, #49 + ldr x1, [x29, #24] + xar v17.2d, v11.2d, v26.2d, #54 + eor x28, x28, x30, ror 63 + xar v11.2d, v7.2d, v27.2d, #58 + eor x30, x30, x1, ror 63 + xar v7.2d, v10.2d, v25.2d, #61 + eor x1, x1, x0, ror 63 + # Row Mix + mov v25.16b, v0.16b + eor x6, x6, x28 + mov v26.16b, v1.16b + eor x11, x11, x28 + bcax v0.16b, v25.16b, v2.16b, v26.16b + eor x16, x16, x28 + bcax v1.16b, v26.16b, v3.16b, v2.16b + eor x22, x22, x28 + bcax v2.16b, v2.16b, v4.16b, v3.16b + eor x27, x27, x28 + bcax v3.16b, v3.16b, v25.16b, v4.16b + eor x3, x3, x30 + bcax v4.16b, v4.16b, v26.16b, v25.16b + eor x8, x8, x30 + mov v25.16b, v5.16b + eor x13, x13, x30 + mov v26.16b, v6.16b + eor x19, x19, x30 + bcax v5.16b, v25.16b, v7.16b, v26.16b + eor x24, x24, x30 + bcax v6.16b, v26.16b, v8.16b, v7.16b + eor x5, x5, x1 + bcax v7.16b, v7.16b, v9.16b, v8.16b + eor x10, x10, x1 + bcax v8.16b, v8.16b, v25.16b, v9.16b + eor x15, x15, x1 + bcax v9.16b, v9.16b, v26.16b, v25.16b + eor x21, x21, x1 + mov v26.16b, v11.16b + eor x26, x26, x1 + # Swap Rotate Base + bcax v10.16b, v30.16b, v12.16b, v26.16b + ror x0, x3, #63 + bcax v11.16b, v26.16b, v13.16b, v12.16b + ror x3, x8, #20 + bcax v12.16b, v12.16b, v14.16b, v13.16b + ror x8, x11, #44 + bcax v13.16b, v13.16b, v30.16b, v14.16b + ror x11, x25, #3 + bcax v14.16b, v14.16b, v26.16b, v30.16b + ror x25, x16, #25 + mov v25.16b, v15.16b + ror x16, x23, #46 + mov v26.16b, v16.16b + ror x23, x4, #2 + bcax v15.16b, v25.16b, v17.16b, v26.16b + ror x4, x14, #21 + bcax v16.16b, v26.16b, v18.16b, v17.16b + ror x14, x15, #39 + bcax v17.16b, v17.16b, v19.16b, v18.16b + ror x15, x22, #56 + bcax v18.16b, v18.16b, v25.16b, v19.16b + ror x22, x26, #8 + bcax v19.16b, v19.16b, v26.16b, v25.16b + ror x26, x17, #23 + mov v25.16b, v20.16b + ror x17, x6, #37 + mov v26.16b, v21.16b + ror x6, x27, #50 + bcax v20.16b, v25.16b, v22.16b, v26.16b + ror x27, x24, #62 + bcax v21.16b, v26.16b, v23.16b, v22.16b + ror x24, x10, #9 + bcax v22.16b, v22.16b, v24.16b, v23.16b + ror x10, x19, #19 + bcax v23.16b, v23.16b, v25.16b, v24.16b + ror x19, x7, #28 + bcax v24.16b, v24.16b, v26.16b, v25.16b + ror x7, x5, #36 + ror x5, x21, #43 + ror x21, x20, #49 + ror x20, x13, #54 + ror x13, x9, #58 + ror x9, x12, #61 + # Row Mix Base + bic x12, x4, x3 + bic x1, x5, x4 + bic x28, x2, x6 + bic x30, x3, x2 + eor x2, x2, x12 + eor x3, x3, x1 + bic x12, x6, x5 + eor x5, x5, x28 + eor x4, x4, x12 + eor x6, x6, x30 + bic x12, x9, x8 + bic x1, x10, x9 + bic x28, x7, x11 + bic x30, x8, x7 + eor x7, x7, x12 + eor x8, x8, x1 + bic x12, x11, x10 + eor x10, x10, x28 + eor x9, x9, x12 + eor x11, x11, x30 + bic x12, x14, x13 + bic x1, x15, x14 + bic x28, x0, x16 + bic x30, x13, x0 + eor x12, x0, x12 + eor x13, x13, x1 + bic x0, x16, x15 + eor x15, x15, x28 + eor x14, x14, x0 + eor x16, x16, x30 + bic x0, x20, x19 + bic x1, x21, x20 + bic x28, x17, x22 + bic x30, x19, x17 + eor x17, x17, x0 + eor x19, x19, x1 + bic x0, x22, x21 + eor x21, x21, x28 + eor x20, x20, x0 + eor x22, x22, x30 + bic x0, x25, x24 + bic x1, x26, x25 + bic x28, x23, x27 + bic x30, x24, x23 + eor x23, x23, x0 + eor x24, x24, x1 + bic x0, x27, x26 + eor x26, x26, x28 + eor x25, x25, x0 + eor x27, x27, x30 + # Done tranforming + ldp x28, x1, [x29, #48] + 
ldr x0, [x28], #8 + subs x1, x1, #1 + mov v30.d[0], x0 + mov v30.d[1], x0 + eor x2, x2, x0 + eor v0.16b, v0.16b, v30.16b + bne L_SHA3_shake256_blocksx3_seed_neon_begin + ldr x0, [x29, #40] + st4 {v0.d, v1.d, v2.d, v3.d}[0], [x0], #32 + st4 {v4.d, v5.d, v6.d, v7.d}[0], [x0], #32 + st4 {v8.d, v9.d, v10.d, v11.d}[0], [x0], #32 + st4 {v12.d, v13.d, v14.d, v15.d}[0], [x0], #32 + st4 {v16.d, v17.d, v18.d, v19.d}[0], [x0], #32 + st4 {v20.d, v21.d, v22.d, v23.d}[0], [x0], #32 + st1 {v24.d}[0], [x0] + add x0, x0, #8 + st4 {v0.d, v1.d, v2.d, v3.d}[1], [x0], #32 + st4 {v4.d, v5.d, v6.d, v7.d}[1], [x0], #32 + st4 {v8.d, v9.d, v10.d, v11.d}[1], [x0], #32 + st4 {v12.d, v13.d, v14.d, v15.d}[1], [x0], #32 + st4 {v16.d, v17.d, v18.d, v19.d}[1], [x0], #32 + st4 {v20.d, v21.d, v22.d, v23.d}[1], [x0], #32 + st1 {v24.d}[1], [x0] + add x0, x0, #8 + stp x2, x3, [x0] + stp x4, x5, [x0, #16] + stp x6, x7, [x0, #32] + stp x8, x9, [x0, #48] + stp x10, x11, [x0, #64] + stp x12, x13, [x0, #80] + stp x14, x15, [x0, #96] + stp x16, x17, [x0, #112] + stp x19, x20, [x0, #128] + stp x21, x22, [x0, #144] + stp x23, x24, [x0, #160] + stp x25, x26, [x0, #176] + str x27, [x0, #192] + ldp x17, x19, [x29, #72] + ldp x20, x21, [x29, #88] + ldp x22, x23, [x29, #104] + ldp x24, x25, [x29, #120] + ldp x26, x27, [x29, #136] + ldr x28, [x29, #152] + ldp d8, d9, [x29, #160] + ldp d10, d11, [x29, #176] + ldp d12, d13, [x29, #192] + ldp d14, d15, [x29, #208] + ldp x29, x30, [sp], #0xe0 + ret +#ifndef __APPLE__ + .size kyber_shake256_blocksx3_seed_neon,.-kyber_shake256_blocksx3_seed_neon +#endif /* __APPLE__ */ +#else +#ifndef __APPLE__ + .text + .type L_SHA3_transform_blocksx3_neon_r, %object + .section .rodata + .size L_SHA3_transform_blocksx3_neon_r, 192 +#else + .section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ + .align 3 +#else + .p2align 3 +#endif /* __APPLE__ */ +L_SHA3_transform_blocksx3_neon_r: + .xword 0x1 + .xword 0x8082 + .xword 0x800000000000808a + .xword 0x8000000080008000 + .xword 0x808b + .xword 0x80000001 + .xword 0x8000000080008081 + .xword 0x8000000000008009 + .xword 0x8a + .xword 0x88 + .xword 0x80008009 + .xword 0x8000000a + .xword 0x8000808b + .xword 0x800000000000008b + .xword 0x8000000000008089 + .xword 0x8000000000008003 + .xword 0x8000000000008002 + .xword 0x8000000000000080 + .xword 0x800a + .xword 0x800000008000000a + .xword 0x8000000080008081 + .xword 0x8000000000008080 + .xword 0x80000001 + .xword 0x8000000080008008 +#ifndef __APPLE__ +.text +.globl kyber_sha3_blocksx3_neon +.type kyber_sha3_blocksx3_neon,@function +.align 2 +kyber_sha3_blocksx3_neon: +#else +.section __TEXT,__text +.globl _kyber_sha3_blocksx3_neon +.p2align 2 +_kyber_sha3_blocksx3_neon: +#endif /* __APPLE__ */ + stp x29, x30, [sp, #-224]! 
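+ # Fallback three-way Keccak-f[1600] for cores without the SHA-3 extension: the
+ # rotates use USHR/SLI pairs and the chi step uses BIC/EOR in place of RAX1,
+ # XAR and BCAX.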
+ add x29, sp, #0 + stp x17, x19, [x29, #72] + stp x20, x21, [x29, #88] + stp x22, x23, [x29, #104] + stp x24, x25, [x29, #120] + stp x26, x27, [x29, #136] + str x28, [x29, #152] + stp d8, d9, [x29, #160] + stp d10, d11, [x29, #176] + stp d12, d13, [x29, #192] + stp d14, d15, [x29, #208] +#ifndef __APPLE__ + adrp x27, L_SHA3_transform_blocksx3_neon_r + add x27, x27, :lo12:L_SHA3_transform_blocksx3_neon_r +#else + adrp x27, L_SHA3_transform_blocksx3_neon_r@PAGE + add x27, x27, :lo12:L_SHA3_transform_blocksx3_neon_r@PAGEOFF +#endif /* __APPLE__ */ + str x0, [x29, #40] + ld4 {v0.d, v1.d, v2.d, v3.d}[0], [x0], #32 + ld4 {v4.d, v5.d, v6.d, v7.d}[0], [x0], #32 + ld4 {v8.d, v9.d, v10.d, v11.d}[0], [x0], #32 + ld4 {v12.d, v13.d, v14.d, v15.d}[0], [x0], #32 + ld4 {v16.d, v17.d, v18.d, v19.d}[0], [x0], #32 + ld4 {v20.d, v21.d, v22.d, v23.d}[0], [x0], #32 + ld1 {v24.d}[0], [x0] + add x0, x0, #8 + ld4 {v0.d, v1.d, v2.d, v3.d}[1], [x0], #32 + ld4 {v4.d, v5.d, v6.d, v7.d}[1], [x0], #32 + ld4 {v8.d, v9.d, v10.d, v11.d}[1], [x0], #32 + ld4 {v12.d, v13.d, v14.d, v15.d}[1], [x0], #32 + ld4 {v16.d, v17.d, v18.d, v19.d}[1], [x0], #32 + ld4 {v20.d, v21.d, v22.d, v23.d}[1], [x0], #32 + ld1 {v24.d}[1], [x0] + add x0, x0, #8 + ldp x1, x2, [x0] + ldp x3, x4, [x0, #16] + ldp x5, x6, [x0, #32] + ldp x7, x8, [x0, #48] + ldp x9, x10, [x0, #64] + ldp x11, x12, [x0, #80] + ldp x13, x14, [x0, #96] + ldp x15, x16, [x0, #112] + ldp x17, x19, [x0, #128] + ldp x20, x21, [x0, #144] + ldp x22, x23, [x0, #160] + ldp x24, x25, [x0, #176] + ldr x26, [x0, #192] + mov x28, #24 + # Start of 24 rounds +L_SHA3_transform_blocksx3_neon_begin: + stp x27, x28, [x29, #48] + # Col Mix NEON + eor v30.16b, v4.16b, v9.16b + eor x0, x5, x10 + eor v27.16b, v1.16b, v6.16b + eor x30, x1, x6 + eor v30.16b, v30.16b, v14.16b + eor x28, x3, x8 + eor v27.16b, v27.16b, v11.16b + eor x0, x0, x15 + eor v30.16b, v30.16b, v19.16b + eor x30, x30, x11 + eor v27.16b, v27.16b, v16.16b + eor x28, x28, x13 + eor v30.16b, v30.16b, v24.16b + eor x0, x0, x21 + eor v27.16b, v27.16b, v21.16b + eor x30, x30, x16 + ushr v25.2d, v27.2d, #63 + eor x28, x28, x19 + sli v25.2d, v27.2d, #1 + eor x0, x0, x26 + eor v25.16b, v25.16b, v30.16b + eor x30, x30, x22 + eor v31.16b, v0.16b, v5.16b + eor x28, x28, x24 + eor v28.16b, v2.16b, v7.16b + str x0, [x29, #32] + eor v31.16b, v31.16b, v10.16b + str x28, [x29, #24] + eor v28.16b, v28.16b, v12.16b + eor x27, x2, x7 + eor v31.16b, v31.16b, v15.16b + eor x28, x4, x9 + eor v28.16b, v28.16b, v17.16b + eor x27, x27, x12 + eor v31.16b, v31.16b, v20.16b + eor x28, x28, x14 + eor v28.16b, v28.16b, v22.16b + eor x27, x27, x17 + ushr v29.2d, v30.2d, #63 + eor x28, x28, x20 + ushr v26.2d, v28.2d, #63 + eor x27, x27, x23 + sli v29.2d, v30.2d, #1 + eor x28, x28, x25 + sli v26.2d, v28.2d, #1 + eor x0, x0, x27, ror 63 + eor v28.16b, v28.16b, v29.16b + eor x27, x27, x28, ror 63 + eor v29.16b, v3.16b, v8.16b + eor x1, x1, x0 + eor v26.16b, v26.16b, v31.16b + eor x6, x6, x0 + eor v29.16b, v29.16b, v13.16b + eor x11, x11, x0 + eor v29.16b, v29.16b, v18.16b + eor x16, x16, x0 + eor v29.16b, v29.16b, v23.16b + eor x22, x22, x0 + ushr v30.2d, v29.2d, #63 + eor x3, x3, x27 + sli v30.2d, v29.2d, #1 + eor x8, x8, x27 + eor v27.16b, v27.16b, v30.16b + eor x13, x13, x27 + ushr v30.2d, v31.2d, #63 + eor x19, x19, x27 + sli v30.2d, v31.2d, #1 + eor x24, x24, x27 + eor v29.16b, v29.16b, v30.16b + ldr x0, [x29, #32] + # Swap Rotate NEON + eor v0.16b, v0.16b, v25.16b + eor v31.16b, v1.16b, v26.16b + ldr x27, [x29, #24] + eor v6.16b, v6.16b, v26.16b + eor x28, 
x28, x30, ror 63 + ushr v30.2d, v31.2d, #63 + eor x30, x30, x27, ror 63 + ushr v1.2d, v6.2d, #20 + eor x27, x27, x0, ror 63 + sli v30.2d, v31.2d, #1 + eor x5, x5, x28 + sli v1.2d, v6.2d, #44 + eor x10, x10, x28 + eor v31.16b, v9.16b, v29.16b + eor x15, x15, x28 + eor v22.16b, v22.16b, v27.16b + eor x21, x21, x28 + ushr v6.2d, v31.2d, #44 + eor x26, x26, x28 + ushr v9.2d, v22.2d, #3 + eor x2, x2, x30 + sli v6.2d, v31.2d, #20 + eor x7, x7, x30 + sli v9.2d, v22.2d, #61 + eor x12, x12, x30 + eor v31.16b, v14.16b, v29.16b + eor x17, x17, x30 + eor v20.16b, v20.16b, v25.16b + eor x23, x23, x30 + ushr v22.2d, v31.2d, #25 + eor x4, x4, x27 + ushr v14.2d, v20.2d, #46 + eor x9, x9, x27 + sli v22.2d, v31.2d, #39 + eor x14, x14, x27 + sli v14.2d, v20.2d, #18 + eor x20, x20, x27 + eor v31.16b, v2.16b, v27.16b + eor x25, x25, x27 + # Swap Rotate Base + eor v12.16b, v12.16b, v27.16b + ror x0, x2, #63 + ushr v20.2d, v31.2d, #2 + ror x2, x7, #20 + ushr v2.2d, v12.2d, #21 + ror x7, x10, #44 + sli v20.2d, v31.2d, #62 + ror x10, x24, #3 + sli v2.2d, v12.2d, #43 + ror x24, x15, #25 + eor v31.16b, v13.16b, v28.16b + ror x15, x22, #46 + eor v19.16b, v19.16b, v29.16b + ror x22, x3, #2 + ushr v12.2d, v31.2d, #39 + ror x3, x13, #21 + ushr v13.2d, v19.2d, #56 + ror x13, x14, #39 + sli v12.2d, v31.2d, #25 + ror x14, x21, #56 + sli v13.2d, v19.2d, #8 + ror x21, x25, #8 + eor v31.16b, v23.16b, v28.16b + ror x25, x16, #23 + eor v15.16b, v15.16b, v25.16b + ror x16, x5, #37 + ushr v19.2d, v31.2d, #8 + ror x5, x26, #50 + ushr v23.2d, v15.2d, #23 + ror x26, x23, #62 + sli v19.2d, v31.2d, #56 + ror x23, x9, #9 + sli v23.2d, v15.2d, #41 + ror x9, x17, #19 + eor v31.16b, v4.16b, v29.16b + ror x17, x6, #28 + eor v24.16b, v24.16b, v29.16b + ror x6, x4, #36 + ushr v15.2d, v31.2d, #37 + ror x4, x20, #43 + ushr v4.2d, v24.2d, #50 + ror x20, x19, #49 + sli v15.2d, v31.2d, #27 + ror x19, x12, #54 + sli v4.2d, v24.2d, #14 + ror x12, x8, #58 + eor v31.16b, v21.16b, v26.16b + ror x8, x11, #61 + # Row Mix Base + eor v8.16b, v8.16b, v28.16b + bic x11, x3, x2 + ushr v24.2d, v31.2d, #62 + bic x27, x4, x3 + ushr v21.2d, v8.2d, #9 + bic x28, x1, x5 + sli v24.2d, v31.2d, #2 + bic x30, x2, x1 + sli v21.2d, v8.2d, #55 + eor x1, x1, x11 + eor v31.16b, v16.16b, v26.16b + eor x2, x2, x27 + eor v5.16b, v5.16b, v25.16b + bic x11, x5, x4 + ushr v8.2d, v31.2d, #19 + eor x4, x4, x28 + ushr v16.2d, v5.2d, #28 + eor x3, x3, x11 + sli v8.2d, v31.2d, #45 + eor x5, x5, x30 + sli v16.2d, v5.2d, #36 + bic x11, x8, x7 + eor v31.16b, v3.16b, v28.16b + bic x27, x9, x8 + eor v18.16b, v18.16b, v28.16b + bic x28, x6, x10 + ushr v5.2d, v31.2d, #36 + bic x30, x7, x6 + ushr v3.2d, v18.2d, #43 + eor x6, x6, x11 + sli v5.2d, v31.2d, #28 + eor x7, x7, x27 + sli v3.2d, v18.2d, #21 + bic x11, x10, x9 + eor v31.16b, v17.16b, v27.16b + eor x9, x9, x28 + eor v11.16b, v11.16b, v26.16b + eor x8, x8, x11 + ushr v18.2d, v31.2d, #49 + eor x10, x10, x30 + ushr v17.2d, v11.2d, #54 + bic x11, x13, x12 + sli v18.2d, v31.2d, #15 + bic x27, x14, x13 + sli v17.2d, v11.2d, #10 + bic x28, x0, x15 + eor v31.16b, v7.16b, v27.16b + bic x30, x12, x0 + eor v10.16b, v10.16b, v25.16b + eor x11, x0, x11 + ushr v11.2d, v31.2d, #58 + eor x12, x12, x27 + ushr v7.2d, v10.2d, #61 + bic x0, x15, x14 + sli v11.2d, v31.2d, #6 + eor x14, x14, x28 + sli v7.2d, v10.2d, #3 + eor x13, x13, x0 + # Row Mix NEON + bic v25.16b, v2.16b, v1.16b + eor x15, x15, x30 + bic v26.16b, v3.16b, v2.16b + bic x0, x19, x17 + bic v27.16b, v4.16b, v3.16b + bic x27, x20, x19 + bic v28.16b, v0.16b, v4.16b + bic x28, x16, x21 + bic 
v29.16b, v1.16b, v0.16b + bic x30, x17, x16 + eor v0.16b, v0.16b, v25.16b + eor x16, x16, x0 + eor v1.16b, v1.16b, v26.16b + eor x17, x17, x27 + eor v2.16b, v2.16b, v27.16b + bic x0, x21, x20 + eor v3.16b, v3.16b, v28.16b + eor x20, x20, x28 + eor v4.16b, v4.16b, v29.16b + eor x19, x19, x0 + bic v25.16b, v7.16b, v6.16b + eor x21, x21, x30 + bic v26.16b, v8.16b, v7.16b + bic x0, x24, x23 + bic v27.16b, v9.16b, v8.16b + bic x27, x25, x24 + bic v28.16b, v5.16b, v9.16b + bic x28, x22, x26 + bic v29.16b, v6.16b, v5.16b + bic x30, x23, x22 + eor v5.16b, v5.16b, v25.16b + eor x22, x22, x0 + eor v6.16b, v6.16b, v26.16b + eor x23, x23, x27 + eor v7.16b, v7.16b, v27.16b + bic x0, x26, x25 + eor v8.16b, v8.16b, v28.16b + eor x25, x25, x28 + eor v9.16b, v9.16b, v29.16b + eor x24, x24, x0 + bic v25.16b, v12.16b, v11.16b + eor x26, x26, x30 + bic v26.16b, v13.16b, v12.16b + bic v27.16b, v14.16b, v13.16b + bic v28.16b, v30.16b, v14.16b + bic v29.16b, v11.16b, v30.16b + eor v10.16b, v30.16b, v25.16b + eor v11.16b, v11.16b, v26.16b + eor v12.16b, v12.16b, v27.16b + eor v13.16b, v13.16b, v28.16b + eor v14.16b, v14.16b, v29.16b + bic v25.16b, v17.16b, v16.16b + bic v26.16b, v18.16b, v17.16b + bic v27.16b, v19.16b, v18.16b + bic v28.16b, v15.16b, v19.16b + bic v29.16b, v16.16b, v15.16b + eor v15.16b, v15.16b, v25.16b + eor v16.16b, v16.16b, v26.16b + eor v17.16b, v17.16b, v27.16b + eor v18.16b, v18.16b, v28.16b + eor v19.16b, v19.16b, v29.16b + bic v25.16b, v22.16b, v21.16b + bic v26.16b, v23.16b, v22.16b + bic v27.16b, v24.16b, v23.16b + bic v28.16b, v20.16b, v24.16b + bic v29.16b, v21.16b, v20.16b + eor v20.16b, v20.16b, v25.16b + eor v21.16b, v21.16b, v26.16b + eor v22.16b, v22.16b, v27.16b + eor v23.16b, v23.16b, v28.16b + eor v24.16b, v24.16b, v29.16b + # Done tranforming + ldp x27, x28, [x29, #48] + ldr x0, [x27], #8 + subs x28, x28, #1 + mov v30.d[0], x0 + mov v30.d[1], x0 + eor x1, x1, x0 + eor v0.16b, v0.16b, v30.16b + bne L_SHA3_transform_blocksx3_neon_begin + ldr x0, [x29, #40] + st4 {v0.d, v1.d, v2.d, v3.d}[0], [x0], #32 + st4 {v4.d, v5.d, v6.d, v7.d}[0], [x0], #32 + st4 {v8.d, v9.d, v10.d, v11.d}[0], [x0], #32 + st4 {v12.d, v13.d, v14.d, v15.d}[0], [x0], #32 + st4 {v16.d, v17.d, v18.d, v19.d}[0], [x0], #32 + st4 {v20.d, v21.d, v22.d, v23.d}[0], [x0], #32 + st1 {v24.d}[0], [x0] + add x0, x0, #8 + st4 {v0.d, v1.d, v2.d, v3.d}[1], [x0], #32 + st4 {v4.d, v5.d, v6.d, v7.d}[1], [x0], #32 + st4 {v8.d, v9.d, v10.d, v11.d}[1], [x0], #32 + st4 {v12.d, v13.d, v14.d, v15.d}[1], [x0], #32 + st4 {v16.d, v17.d, v18.d, v19.d}[1], [x0], #32 + st4 {v20.d, v21.d, v22.d, v23.d}[1], [x0], #32 + st1 {v24.d}[1], [x0] + add x0, x0, #8 + stp x1, x2, [x0] + stp x3, x4, [x0, #16] + stp x5, x6, [x0, #32] + stp x7, x8, [x0, #48] + stp x9, x10, [x0, #64] + stp x11, x12, [x0, #80] + stp x13, x14, [x0, #96] + stp x15, x16, [x0, #112] + stp x17, x19, [x0, #128] + stp x20, x21, [x0, #144] + stp x22, x23, [x0, #160] + stp x24, x25, [x0, #176] + str x26, [x0, #192] + ldp x17, x19, [x29, #72] + ldp x20, x21, [x29, #88] + ldp x22, x23, [x29, #104] + ldp x24, x25, [x29, #120] + ldp x26, x27, [x29, #136] + ldr x28, [x29, #152] + ldp d8, d9, [x29, #160] + ldp d10, d11, [x29, #176] + ldp d12, d13, [x29, #192] + ldp d14, d15, [x29, #208] + ldp x29, x30, [sp], #0xe0 + ret +#ifndef __APPLE__ + .size kyber_sha3_blocksx3_neon,.-kyber_sha3_blocksx3_neon +#endif /* __APPLE__ */ +#ifndef __APPLE__ + .text + .type L_SHA3_shake128_blocksx3_seed_neon_r, %object + .section .rodata + .size L_SHA3_shake128_blocksx3_seed_neon_r, 192 +#else + .section 
__DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ + .align 3 +#else + .p2align 3 +#endif /* __APPLE__ */ +L_SHA3_shake128_blocksx3_seed_neon_r: + .xword 0x1 + .xword 0x8082 + .xword 0x800000000000808a + .xword 0x8000000080008000 + .xword 0x808b + .xword 0x80000001 + .xword 0x8000000080008081 + .xword 0x8000000000008009 + .xword 0x8a + .xword 0x88 + .xword 0x80008009 + .xword 0x8000000a + .xword 0x8000808b + .xword 0x800000000000008b + .xword 0x8000000000008089 + .xword 0x8000000000008003 + .xword 0x8000000000008002 + .xword 0x8000000000000080 + .xword 0x800a + .xword 0x800000008000000a + .xword 0x8000000080008081 + .xword 0x8000000000008080 + .xword 0x80000001 + .xword 0x8000000080008008 +#ifndef __APPLE__ +.text +.globl kyber_shake128_blocksx3_seed_neon +.type kyber_shake128_blocksx3_seed_neon,@function +.align 2 +kyber_shake128_blocksx3_seed_neon: +#else +.section __TEXT,__text +.globl _kyber_shake128_blocksx3_seed_neon +.p2align 2 +_kyber_shake128_blocksx3_seed_neon: +#endif /* __APPLE__ */ + stp x29, x30, [sp, #-224]! + add x29, sp, #0 + stp x17, x19, [x29, #72] + stp x20, x21, [x29, #88] + stp x22, x23, [x29, #104] + stp x24, x25, [x29, #120] + stp x26, x27, [x29, #136] + str x28, [x29, #152] + stp d8, d9, [x29, #160] + stp d10, d11, [x29, #176] + stp d12, d13, [x29, #192] + stp d14, d15, [x29, #208] +#ifndef __APPLE__ + adrp x28, L_SHA3_shake128_blocksx3_seed_neon_r + add x28, x28, :lo12:L_SHA3_shake128_blocksx3_seed_neon_r +#else + adrp x28, L_SHA3_shake128_blocksx3_seed_neon_r@PAGE + add x28, x28, :lo12:L_SHA3_shake128_blocksx3_seed_neon_r@PAGEOFF +#endif /* __APPLE__ */ + str x0, [x29, #40] + add x0, x0, #32 + ld1 {v4.d}[0], [x0] + ldp x2, x3, [x1], #16 + add x0, x0, #0xc8 + ld1 {v4.d}[1], [x0] + ldp x4, x5, [x1], #16 + ldr x6, [x0, #200] + eor v5.16b, v5.16b, v5.16b + eor x7, x7, x7 + eor v6.16b, v6.16b, v6.16b + eor x8, x8, x8 + eor v7.16b, v7.16b, v7.16b + eor x9, x9, x9 + eor v8.16b, v8.16b, v8.16b + eor x10, x10, x10 + eor v9.16b, v9.16b, v9.16b + eor x11, x11, x11 + eor v10.16b, v10.16b, v10.16b + eor x12, x12, x12 + eor v11.16b, v11.16b, v11.16b + eor x13, x13, x13 + eor v12.16b, v12.16b, v12.16b + eor x14, x14, x14 + eor v13.16b, v13.16b, v13.16b + eor x15, x15, x15 + eor v14.16b, v14.16b, v14.16b + eor x16, x16, x16 + eor v15.16b, v15.16b, v15.16b + eor x17, x17, x17 + eor v16.16b, v16.16b, v16.16b + eor x19, x19, x19 + eor v17.16b, v17.16b, v17.16b + eor x20, x20, x20 + eor v18.16b, v18.16b, v18.16b + eor x21, x21, x21 + eor v19.16b, v19.16b, v19.16b + eor x22, x22, x22 + movz x23, #0x8000, lsl 48 + eor v21.16b, v21.16b, v21.16b + eor x24, x24, x24 + eor v22.16b, v22.16b, v22.16b + eor x25, x25, x25 + eor v23.16b, v23.16b, v23.16b + eor x26, x26, x26 + eor v24.16b, v24.16b, v24.16b + eor x27, x27, x27 + dup v0.2d, x2 + dup v1.2d, x3 + dup v2.2d, x4 + dup v3.2d, x5 + dup v20.2d, x23 + mov x1, #24 + # Start of 24 rounds +L_SHA3_shake128_blocksx3_seed_neon_begin: + stp x28, x1, [x29, #48] + # Col Mix NEON + eor v30.16b, v4.16b, v9.16b + eor x0, x6, x11 + eor v27.16b, v1.16b, v6.16b + eor x30, x2, x7 + eor v30.16b, v30.16b, v14.16b + eor x28, x4, x9 + eor v27.16b, v27.16b, v11.16b + eor x0, x0, x16 + eor v30.16b, v30.16b, v19.16b + eor x30, x30, x12 + eor v27.16b, v27.16b, v16.16b + eor x28, x28, x14 + eor v30.16b, v30.16b, v24.16b + eor x0, x0, x22 + eor v27.16b, v27.16b, v21.16b + eor x30, x30, x17 + ushr v25.2d, v27.2d, #63 + eor x28, x28, x20 + sli v25.2d, v27.2d, #1 + eor x0, x0, x27 + eor v25.16b, v25.16b, v30.16b + eor x30, x30, x23 + eor v31.16b, v0.16b, 
v5.16b + eor x28, x28, x25 + eor v28.16b, v2.16b, v7.16b + str x0, [x29, #32] + eor v31.16b, v31.16b, v10.16b + str x28, [x29, #24] + eor v28.16b, v28.16b, v12.16b + eor x1, x3, x8 + eor v31.16b, v31.16b, v15.16b + eor x28, x5, x10 + eor v28.16b, v28.16b, v17.16b + eor x1, x1, x13 + eor v31.16b, v31.16b, v20.16b + eor x28, x28, x15 + eor v28.16b, v28.16b, v22.16b + eor x1, x1, x19 + ushr v29.2d, v30.2d, #63 + eor x28, x28, x21 + ushr v26.2d, v28.2d, #63 + eor x1, x1, x24 + sli v29.2d, v30.2d, #1 + eor x28, x28, x26 + sli v26.2d, v28.2d, #1 + eor x0, x0, x1, ror 63 + eor v28.16b, v28.16b, v29.16b + eor x1, x1, x28, ror 63 + eor v29.16b, v3.16b, v8.16b + eor x2, x2, x0 + eor v26.16b, v26.16b, v31.16b + eor x7, x7, x0 + eor v29.16b, v29.16b, v13.16b + eor x12, x12, x0 + eor v29.16b, v29.16b, v18.16b + eor x17, x17, x0 + eor v29.16b, v29.16b, v23.16b + eor x23, x23, x0 + ushr v30.2d, v29.2d, #63 + eor x4, x4, x1 + sli v30.2d, v29.2d, #1 + eor x9, x9, x1 + eor v27.16b, v27.16b, v30.16b + eor x14, x14, x1 + ushr v30.2d, v31.2d, #63 + eor x20, x20, x1 + sli v30.2d, v31.2d, #1 + eor x25, x25, x1 + eor v29.16b, v29.16b, v30.16b + ldr x0, [x29, #32] + # Swap Rotate NEON + eor v0.16b, v0.16b, v25.16b + eor v31.16b, v1.16b, v26.16b + ldr x1, [x29, #24] + eor v6.16b, v6.16b, v26.16b + eor x28, x28, x30, ror 63 + ushr v30.2d, v31.2d, #63 + eor x30, x30, x1, ror 63 + ushr v1.2d, v6.2d, #20 + eor x1, x1, x0, ror 63 + sli v30.2d, v31.2d, #1 + eor x6, x6, x28 + sli v1.2d, v6.2d, #44 + eor x11, x11, x28 + eor v31.16b, v9.16b, v29.16b + eor x16, x16, x28 + eor v22.16b, v22.16b, v27.16b + eor x22, x22, x28 + ushr v6.2d, v31.2d, #44 + eor x27, x27, x28 + ushr v9.2d, v22.2d, #3 + eor x3, x3, x30 + sli v6.2d, v31.2d, #20 + eor x8, x8, x30 + sli v9.2d, v22.2d, #61 + eor x13, x13, x30 + eor v31.16b, v14.16b, v29.16b + eor x19, x19, x30 + eor v20.16b, v20.16b, v25.16b + eor x24, x24, x30 + ushr v22.2d, v31.2d, #25 + eor x5, x5, x1 + ushr v14.2d, v20.2d, #46 + eor x10, x10, x1 + sli v22.2d, v31.2d, #39 + eor x15, x15, x1 + sli v14.2d, v20.2d, #18 + eor x21, x21, x1 + eor v31.16b, v2.16b, v27.16b + eor x26, x26, x1 + # Swap Rotate Base + eor v12.16b, v12.16b, v27.16b + ror x0, x3, #63 + ushr v20.2d, v31.2d, #2 + ror x3, x8, #20 + ushr v2.2d, v12.2d, #21 + ror x8, x11, #44 + sli v20.2d, v31.2d, #62 + ror x11, x25, #3 + sli v2.2d, v12.2d, #43 + ror x25, x16, #25 + eor v31.16b, v13.16b, v28.16b + ror x16, x23, #46 + eor v19.16b, v19.16b, v29.16b + ror x23, x4, #2 + ushr v12.2d, v31.2d, #39 + ror x4, x14, #21 + ushr v13.2d, v19.2d, #56 + ror x14, x15, #39 + sli v12.2d, v31.2d, #25 + ror x15, x22, #56 + sli v13.2d, v19.2d, #8 + ror x22, x26, #8 + eor v31.16b, v23.16b, v28.16b + ror x26, x17, #23 + eor v15.16b, v15.16b, v25.16b + ror x17, x6, #37 + ushr v19.2d, v31.2d, #8 + ror x6, x27, #50 + ushr v23.2d, v15.2d, #23 + ror x27, x24, #62 + sli v19.2d, v31.2d, #56 + ror x24, x10, #9 + sli v23.2d, v15.2d, #41 + ror x10, x19, #19 + eor v31.16b, v4.16b, v29.16b + ror x19, x7, #28 + eor v24.16b, v24.16b, v29.16b + ror x7, x5, #36 + ushr v15.2d, v31.2d, #37 + ror x5, x21, #43 + ushr v4.2d, v24.2d, #50 + ror x21, x20, #49 + sli v15.2d, v31.2d, #27 + ror x20, x13, #54 + sli v4.2d, v24.2d, #14 + ror x13, x9, #58 + eor v31.16b, v21.16b, v26.16b + ror x9, x12, #61 + # Row Mix Base + eor v8.16b, v8.16b, v28.16b + bic x12, x4, x3 + ushr v24.2d, v31.2d, #62 + bic x1, x5, x4 + ushr v21.2d, v8.2d, #9 + bic x28, x2, x6 + sli v24.2d, v31.2d, #2 + bic x30, x3, x2 + sli v21.2d, v8.2d, #55 + eor x2, x2, x12 + eor v31.16b, v16.16b, v26.16b + eor 
x3, x3, x1 + eor v5.16b, v5.16b, v25.16b + bic x12, x6, x5 + ushr v8.2d, v31.2d, #19 + eor x5, x5, x28 + ushr v16.2d, v5.2d, #28 + eor x4, x4, x12 + sli v8.2d, v31.2d, #45 + eor x6, x6, x30 + sli v16.2d, v5.2d, #36 + bic x12, x9, x8 + eor v31.16b, v3.16b, v28.16b + bic x1, x10, x9 + eor v18.16b, v18.16b, v28.16b + bic x28, x7, x11 + ushr v5.2d, v31.2d, #36 + bic x30, x8, x7 + ushr v3.2d, v18.2d, #43 + eor x7, x7, x12 + sli v5.2d, v31.2d, #28 + eor x8, x8, x1 + sli v3.2d, v18.2d, #21 + bic x12, x11, x10 + eor v31.16b, v17.16b, v27.16b + eor x10, x10, x28 + eor v11.16b, v11.16b, v26.16b + eor x9, x9, x12 + ushr v18.2d, v31.2d, #49 + eor x11, x11, x30 + ushr v17.2d, v11.2d, #54 + bic x12, x14, x13 + sli v18.2d, v31.2d, #15 + bic x1, x15, x14 + sli v17.2d, v11.2d, #10 + bic x28, x0, x16 + eor v31.16b, v7.16b, v27.16b + bic x30, x13, x0 + eor v10.16b, v10.16b, v25.16b + eor x12, x0, x12 + ushr v11.2d, v31.2d, #58 + eor x13, x13, x1 + ushr v7.2d, v10.2d, #61 + bic x0, x16, x15 + sli v11.2d, v31.2d, #6 + eor x15, x15, x28 + sli v7.2d, v10.2d, #3 + eor x14, x14, x0 + # Row Mix NEON + bic v25.16b, v2.16b, v1.16b + eor x16, x16, x30 + bic v26.16b, v3.16b, v2.16b + bic x0, x20, x19 + bic v27.16b, v4.16b, v3.16b + bic x1, x21, x20 + bic v28.16b, v0.16b, v4.16b + bic x28, x17, x22 + bic v29.16b, v1.16b, v0.16b + bic x30, x19, x17 + eor v0.16b, v0.16b, v25.16b + eor x17, x17, x0 + eor v1.16b, v1.16b, v26.16b + eor x19, x19, x1 + eor v2.16b, v2.16b, v27.16b + bic x0, x22, x21 + eor v3.16b, v3.16b, v28.16b + eor x21, x21, x28 + eor v4.16b, v4.16b, v29.16b + eor x20, x20, x0 + bic v25.16b, v7.16b, v6.16b + eor x22, x22, x30 + bic v26.16b, v8.16b, v7.16b + bic x0, x25, x24 + bic v27.16b, v9.16b, v8.16b + bic x1, x26, x25 + bic v28.16b, v5.16b, v9.16b + bic x28, x23, x27 + bic v29.16b, v6.16b, v5.16b + bic x30, x24, x23 + eor v5.16b, v5.16b, v25.16b + eor x23, x23, x0 + eor v6.16b, v6.16b, v26.16b + eor x24, x24, x1 + eor v7.16b, v7.16b, v27.16b + bic x0, x27, x26 + eor v8.16b, v8.16b, v28.16b + eor x26, x26, x28 + eor v9.16b, v9.16b, v29.16b + eor x25, x25, x0 + bic v25.16b, v12.16b, v11.16b + eor x27, x27, x30 + bic v26.16b, v13.16b, v12.16b + bic v27.16b, v14.16b, v13.16b + bic v28.16b, v30.16b, v14.16b + bic v29.16b, v11.16b, v30.16b + eor v10.16b, v30.16b, v25.16b + eor v11.16b, v11.16b, v26.16b + eor v12.16b, v12.16b, v27.16b + eor v13.16b, v13.16b, v28.16b + eor v14.16b, v14.16b, v29.16b + bic v25.16b, v17.16b, v16.16b + bic v26.16b, v18.16b, v17.16b + bic v27.16b, v19.16b, v18.16b + bic v28.16b, v15.16b, v19.16b + bic v29.16b, v16.16b, v15.16b + eor v15.16b, v15.16b, v25.16b + eor v16.16b, v16.16b, v26.16b + eor v17.16b, v17.16b, v27.16b + eor v18.16b, v18.16b, v28.16b + eor v19.16b, v19.16b, v29.16b + bic v25.16b, v22.16b, v21.16b + bic v26.16b, v23.16b, v22.16b + bic v27.16b, v24.16b, v23.16b + bic v28.16b, v20.16b, v24.16b + bic v29.16b, v21.16b, v20.16b + eor v20.16b, v20.16b, v25.16b + eor v21.16b, v21.16b, v26.16b + eor v22.16b, v22.16b, v27.16b + eor v23.16b, v23.16b, v28.16b + eor v24.16b, v24.16b, v29.16b + # Done tranforming + ldp x28, x1, [x29, #48] + ldr x0, [x28], #8 + subs x1, x1, #1 + mov v30.d[0], x0 + mov v30.d[1], x0 + eor x2, x2, x0 + eor v0.16b, v0.16b, v30.16b + bne L_SHA3_shake128_blocksx3_seed_neon_begin + ldr x0, [x29, #40] + st4 {v0.d, v1.d, v2.d, v3.d}[0], [x0], #32 + st4 {v4.d, v5.d, v6.d, v7.d}[0], [x0], #32 + st4 {v8.d, v9.d, v10.d, v11.d}[0], [x0], #32 + st4 {v12.d, v13.d, v14.d, v15.d}[0], [x0], #32 + st4 {v16.d, v17.d, v18.d, v19.d}[0], [x0], #32 + st4 {v20.d, v21.d, 
v22.d, v23.d}[0], [x0], #32 + st1 {v24.d}[0], [x0] + add x0, x0, #8 + st4 {v0.d, v1.d, v2.d, v3.d}[1], [x0], #32 + st4 {v4.d, v5.d, v6.d, v7.d}[1], [x0], #32 + st4 {v8.d, v9.d, v10.d, v11.d}[1], [x0], #32 + st4 {v12.d, v13.d, v14.d, v15.d}[1], [x0], #32 + st4 {v16.d, v17.d, v18.d, v19.d}[1], [x0], #32 + st4 {v20.d, v21.d, v22.d, v23.d}[1], [x0], #32 + st1 {v24.d}[1], [x0] + add x0, x0, #8 + stp x2, x3, [x0] + stp x4, x5, [x0, #16] + stp x6, x7, [x0, #32] + stp x8, x9, [x0, #48] + stp x10, x11, [x0, #64] + stp x12, x13, [x0, #80] + stp x14, x15, [x0, #96] + stp x16, x17, [x0, #112] + stp x19, x20, [x0, #128] + stp x21, x22, [x0, #144] + stp x23, x24, [x0, #160] + stp x25, x26, [x0, #176] + str x27, [x0, #192] + ldp x17, x19, [x29, #72] + ldp x20, x21, [x29, #88] + ldp x22, x23, [x29, #104] + ldp x24, x25, [x29, #120] + ldp x26, x27, [x29, #136] + ldr x28, [x29, #152] + ldp d8, d9, [x29, #160] + ldp d10, d11, [x29, #176] + ldp d12, d13, [x29, #192] + ldp d14, d15, [x29, #208] + ldp x29, x30, [sp], #0xe0 + ret +#ifndef __APPLE__ + .size kyber_shake128_blocksx3_seed_neon,.-kyber_shake128_blocksx3_seed_neon +#endif /* __APPLE__ */ +#ifndef __APPLE__ + .text + .type L_SHA3_shake256_blocksx3_seed_neon_r, %object + .section .rodata + .size L_SHA3_shake256_blocksx3_seed_neon_r, 192 +#else + .section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ + .align 3 +#else + .p2align 3 +#endif /* __APPLE__ */ +L_SHA3_shake256_blocksx3_seed_neon_r: + .xword 0x1 + .xword 0x8082 + .xword 0x800000000000808a + .xword 0x8000000080008000 + .xword 0x808b + .xword 0x80000001 + .xword 0x8000000080008081 + .xword 0x8000000000008009 + .xword 0x8a + .xword 0x88 + .xword 0x80008009 + .xword 0x8000000a + .xword 0x8000808b + .xword 0x800000000000008b + .xword 0x8000000000008089 + .xword 0x8000000000008003 + .xword 0x8000000000008002 + .xword 0x8000000000000080 + .xword 0x800a + .xword 0x800000008000000a + .xword 0x8000000080008081 + .xword 0x8000000000008080 + .xword 0x80000001 + .xword 0x8000000080008008 +#ifndef __APPLE__ +.text +.globl kyber_shake256_blocksx3_seed_neon +.type kyber_shake256_blocksx3_seed_neon,@function +.align 2 +kyber_shake256_blocksx3_seed_neon: +#else +.section __TEXT,__text +.globl _kyber_shake256_blocksx3_seed_neon +.p2align 2 +_kyber_shake256_blocksx3_seed_neon: +#endif /* __APPLE__ */ + stp x29, x30, [sp, #-224]! 
+ add x29, sp, #0 + stp x17, x19, [x29, #72] + stp x20, x21, [x29, #88] + stp x22, x23, [x29, #104] + stp x24, x25, [x29, #120] + stp x26, x27, [x29, #136] + str x28, [x29, #152] + stp d8, d9, [x29, #160] + stp d10, d11, [x29, #176] + stp d12, d13, [x29, #192] + stp d14, d15, [x29, #208] +#ifndef __APPLE__ + adrp x28, L_SHA3_shake256_blocksx3_seed_neon_r + add x28, x28, :lo12:L_SHA3_shake256_blocksx3_seed_neon_r +#else + adrp x28, L_SHA3_shake256_blocksx3_seed_neon_r@PAGE + add x28, x28, :lo12:L_SHA3_shake256_blocksx3_seed_neon_r@PAGEOFF +#endif /* __APPLE__ */ + str x0, [x29, #40] + add x0, x0, #32 + ld1 {v4.d}[0], [x0] + ldp x2, x3, [x1], #16 + add x0, x0, #0xc8 + ld1 {v4.d}[1], [x0] + ldp x4, x5, [x1], #16 + ldr x6, [x0, #200] + eor v5.16b, v5.16b, v5.16b + eor x7, x7, x7 + eor v6.16b, v6.16b, v6.16b + eor x8, x8, x8 + eor v7.16b, v7.16b, v7.16b + eor x9, x9, x9 + eor v8.16b, v8.16b, v8.16b + eor x10, x10, x10 + eor v9.16b, v9.16b, v9.16b + eor x11, x11, x11 + eor v10.16b, v10.16b, v10.16b + eor x12, x12, x12 + eor v11.16b, v11.16b, v11.16b + eor x13, x13, x13 + eor v12.16b, v12.16b, v12.16b + eor x14, x14, x14 + eor v13.16b, v13.16b, v13.16b + eor x15, x15, x15 + eor v14.16b, v14.16b, v14.16b + eor x16, x16, x16 + eor v15.16b, v15.16b, v15.16b + eor x17, x17, x17 + movz x19, #0x8000, lsl 48 + eor v17.16b, v17.16b, v17.16b + eor x20, x20, x20 + eor v18.16b, v18.16b, v18.16b + eor x21, x21, x21 + eor v19.16b, v19.16b, v19.16b + eor x22, x22, x22 + eor v20.16b, v20.16b, v20.16b + eor x23, x23, x23 + eor v21.16b, v21.16b, v21.16b + eor x24, x24, x24 + eor v22.16b, v22.16b, v22.16b + eor x25, x25, x25 + eor v23.16b, v23.16b, v23.16b + eor x26, x26, x26 + eor v24.16b, v24.16b, v24.16b + eor x27, x27, x27 + dup v0.2d, x2 + dup v1.2d, x3 + dup v2.2d, x4 + dup v3.2d, x5 + dup v16.2d, x19 + mov x1, #24 + # Start of 24 rounds +L_SHA3_shake256_blocksx3_seed_neon_begin: + stp x28, x1, [x29, #48] + # Col Mix NEON + eor v30.16b, v4.16b, v9.16b + eor x0, x6, x11 + eor v27.16b, v1.16b, v6.16b + eor x30, x2, x7 + eor v30.16b, v30.16b, v14.16b + eor x28, x4, x9 + eor v27.16b, v27.16b, v11.16b + eor x0, x0, x16 + eor v30.16b, v30.16b, v19.16b + eor x30, x30, x12 + eor v27.16b, v27.16b, v16.16b + eor x28, x28, x14 + eor v30.16b, v30.16b, v24.16b + eor x0, x0, x22 + eor v27.16b, v27.16b, v21.16b + eor x30, x30, x17 + ushr v25.2d, v27.2d, #63 + eor x28, x28, x20 + sli v25.2d, v27.2d, #1 + eor x0, x0, x27 + eor v25.16b, v25.16b, v30.16b + eor x30, x30, x23 + eor v31.16b, v0.16b, v5.16b + eor x28, x28, x25 + eor v28.16b, v2.16b, v7.16b + str x0, [x29, #32] + eor v31.16b, v31.16b, v10.16b + str x28, [x29, #24] + eor v28.16b, v28.16b, v12.16b + eor x1, x3, x8 + eor v31.16b, v31.16b, v15.16b + eor x28, x5, x10 + eor v28.16b, v28.16b, v17.16b + eor x1, x1, x13 + eor v31.16b, v31.16b, v20.16b + eor x28, x28, x15 + eor v28.16b, v28.16b, v22.16b + eor x1, x1, x19 + ushr v29.2d, v30.2d, #63 + eor x28, x28, x21 + ushr v26.2d, v28.2d, #63 + eor x1, x1, x24 + sli v29.2d, v30.2d, #1 + eor x28, x28, x26 + sli v26.2d, v28.2d, #1 + eor x0, x0, x1, ror 63 + eor v28.16b, v28.16b, v29.16b + eor x1, x1, x28, ror 63 + eor v29.16b, v3.16b, v8.16b + eor x2, x2, x0 + eor v26.16b, v26.16b, v31.16b + eor x7, x7, x0 + eor v29.16b, v29.16b, v13.16b + eor x12, x12, x0 + eor v29.16b, v29.16b, v18.16b + eor x17, x17, x0 + eor v29.16b, v29.16b, v23.16b + eor x23, x23, x0 + ushr v30.2d, v29.2d, #63 + eor x4, x4, x1 + sli v30.2d, v29.2d, #1 + eor x9, x9, x1 + eor v27.16b, v27.16b, v30.16b + eor x14, x14, x1 + ushr v30.2d, v31.2d, #63 + eor 
x20, x20, x1 + sli v30.2d, v31.2d, #1 + eor x25, x25, x1 + eor v29.16b, v29.16b, v30.16b + ldr x0, [x29, #32] + # Swap Rotate NEON + eor v0.16b, v0.16b, v25.16b + eor v31.16b, v1.16b, v26.16b + ldr x1, [x29, #24] + eor v6.16b, v6.16b, v26.16b + eor x28, x28, x30, ror 63 + ushr v30.2d, v31.2d, #63 + eor x30, x30, x1, ror 63 + ushr v1.2d, v6.2d, #20 + eor x1, x1, x0, ror 63 + sli v30.2d, v31.2d, #1 + eor x6, x6, x28 + sli v1.2d, v6.2d, #44 + eor x11, x11, x28 + eor v31.16b, v9.16b, v29.16b + eor x16, x16, x28 + eor v22.16b, v22.16b, v27.16b + eor x22, x22, x28 + ushr v6.2d, v31.2d, #44 + eor x27, x27, x28 + ushr v9.2d, v22.2d, #3 + eor x3, x3, x30 + sli v6.2d, v31.2d, #20 + eor x8, x8, x30 + sli v9.2d, v22.2d, #61 + eor x13, x13, x30 + eor v31.16b, v14.16b, v29.16b + eor x19, x19, x30 + eor v20.16b, v20.16b, v25.16b + eor x24, x24, x30 + ushr v22.2d, v31.2d, #25 + eor x5, x5, x1 + ushr v14.2d, v20.2d, #46 + eor x10, x10, x1 + sli v22.2d, v31.2d, #39 + eor x15, x15, x1 + sli v14.2d, v20.2d, #18 + eor x21, x21, x1 + eor v31.16b, v2.16b, v27.16b + eor x26, x26, x1 + # Swap Rotate Base + eor v12.16b, v12.16b, v27.16b + ror x0, x3, #63 + ushr v20.2d, v31.2d, #2 + ror x3, x8, #20 + ushr v2.2d, v12.2d, #21 + ror x8, x11, #44 + sli v20.2d, v31.2d, #62 + ror x11, x25, #3 + sli v2.2d, v12.2d, #43 + ror x25, x16, #25 + eor v31.16b, v13.16b, v28.16b + ror x16, x23, #46 + eor v19.16b, v19.16b, v29.16b + ror x23, x4, #2 + ushr v12.2d, v31.2d, #39 + ror x4, x14, #21 + ushr v13.2d, v19.2d, #56 + ror x14, x15, #39 + sli v12.2d, v31.2d, #25 + ror x15, x22, #56 + sli v13.2d, v19.2d, #8 + ror x22, x26, #8 + eor v31.16b, v23.16b, v28.16b + ror x26, x17, #23 + eor v15.16b, v15.16b, v25.16b + ror x17, x6, #37 + ushr v19.2d, v31.2d, #8 + ror x6, x27, #50 + ushr v23.2d, v15.2d, #23 + ror x27, x24, #62 + sli v19.2d, v31.2d, #56 + ror x24, x10, #9 + sli v23.2d, v15.2d, #41 + ror x10, x19, #19 + eor v31.16b, v4.16b, v29.16b + ror x19, x7, #28 + eor v24.16b, v24.16b, v29.16b + ror x7, x5, #36 + ushr v15.2d, v31.2d, #37 + ror x5, x21, #43 + ushr v4.2d, v24.2d, #50 + ror x21, x20, #49 + sli v15.2d, v31.2d, #27 + ror x20, x13, #54 + sli v4.2d, v24.2d, #14 + ror x13, x9, #58 + eor v31.16b, v21.16b, v26.16b + ror x9, x12, #61 + # Row Mix Base + eor v8.16b, v8.16b, v28.16b + bic x12, x4, x3 + ushr v24.2d, v31.2d, #62 + bic x1, x5, x4 + ushr v21.2d, v8.2d, #9 + bic x28, x2, x6 + sli v24.2d, v31.2d, #2 + bic x30, x3, x2 + sli v21.2d, v8.2d, #55 + eor x2, x2, x12 + eor v31.16b, v16.16b, v26.16b + eor x3, x3, x1 + eor v5.16b, v5.16b, v25.16b + bic x12, x6, x5 + ushr v8.2d, v31.2d, #19 + eor x5, x5, x28 + ushr v16.2d, v5.2d, #28 + eor x4, x4, x12 + sli v8.2d, v31.2d, #45 + eor x6, x6, x30 + sli v16.2d, v5.2d, #36 + bic x12, x9, x8 + eor v31.16b, v3.16b, v28.16b + bic x1, x10, x9 + eor v18.16b, v18.16b, v28.16b + bic x28, x7, x11 + ushr v5.2d, v31.2d, #36 + bic x30, x8, x7 + ushr v3.2d, v18.2d, #43 + eor x7, x7, x12 + sli v5.2d, v31.2d, #28 + eor x8, x8, x1 + sli v3.2d, v18.2d, #21 + bic x12, x11, x10 + eor v31.16b, v17.16b, v27.16b + eor x10, x10, x28 + eor v11.16b, v11.16b, v26.16b + eor x9, x9, x12 + ushr v18.2d, v31.2d, #49 + eor x11, x11, x30 + ushr v17.2d, v11.2d, #54 + bic x12, x14, x13 + sli v18.2d, v31.2d, #15 + bic x1, x15, x14 + sli v17.2d, v11.2d, #10 + bic x28, x0, x16 + eor v31.16b, v7.16b, v27.16b + bic x30, x13, x0 + eor v10.16b, v10.16b, v25.16b + eor x12, x0, x12 + ushr v11.2d, v31.2d, #58 + eor x13, x13, x1 + ushr v7.2d, v10.2d, #61 + bic x0, x16, x15 + sli v11.2d, v31.2d, #6 + eor x15, x15, x28 + sli v7.2d, 
v10.2d, #3 + eor x14, x14, x0 + # Row Mix NEON + bic v25.16b, v2.16b, v1.16b + eor x16, x16, x30 + bic v26.16b, v3.16b, v2.16b + bic x0, x20, x19 + bic v27.16b, v4.16b, v3.16b + bic x1, x21, x20 + bic v28.16b, v0.16b, v4.16b + bic x28, x17, x22 + bic v29.16b, v1.16b, v0.16b + bic x30, x19, x17 + eor v0.16b, v0.16b, v25.16b + eor x17, x17, x0 + eor v1.16b, v1.16b, v26.16b + eor x19, x19, x1 + eor v2.16b, v2.16b, v27.16b + bic x0, x22, x21 + eor v3.16b, v3.16b, v28.16b + eor x21, x21, x28 + eor v4.16b, v4.16b, v29.16b + eor x20, x20, x0 + bic v25.16b, v7.16b, v6.16b + eor x22, x22, x30 + bic v26.16b, v8.16b, v7.16b + bic x0, x25, x24 + bic v27.16b, v9.16b, v8.16b + bic x1, x26, x25 + bic v28.16b, v5.16b, v9.16b + bic x28, x23, x27 + bic v29.16b, v6.16b, v5.16b + bic x30, x24, x23 + eor v5.16b, v5.16b, v25.16b + eor x23, x23, x0 + eor v6.16b, v6.16b, v26.16b + eor x24, x24, x1 + eor v7.16b, v7.16b, v27.16b + bic x0, x27, x26 + eor v8.16b, v8.16b, v28.16b + eor x26, x26, x28 + eor v9.16b, v9.16b, v29.16b + eor x25, x25, x0 + bic v25.16b, v12.16b, v11.16b + eor x27, x27, x30 + bic v26.16b, v13.16b, v12.16b + bic v27.16b, v14.16b, v13.16b + bic v28.16b, v30.16b, v14.16b + bic v29.16b, v11.16b, v30.16b + eor v10.16b, v30.16b, v25.16b + eor v11.16b, v11.16b, v26.16b + eor v12.16b, v12.16b, v27.16b + eor v13.16b, v13.16b, v28.16b + eor v14.16b, v14.16b, v29.16b + bic v25.16b, v17.16b, v16.16b + bic v26.16b, v18.16b, v17.16b + bic v27.16b, v19.16b, v18.16b + bic v28.16b, v15.16b, v19.16b + bic v29.16b, v16.16b, v15.16b + eor v15.16b, v15.16b, v25.16b + eor v16.16b, v16.16b, v26.16b + eor v17.16b, v17.16b, v27.16b + eor v18.16b, v18.16b, v28.16b + eor v19.16b, v19.16b, v29.16b + bic v25.16b, v22.16b, v21.16b + bic v26.16b, v23.16b, v22.16b + bic v27.16b, v24.16b, v23.16b + bic v28.16b, v20.16b, v24.16b + bic v29.16b, v21.16b, v20.16b + eor v20.16b, v20.16b, v25.16b + eor v21.16b, v21.16b, v26.16b + eor v22.16b, v22.16b, v27.16b + eor v23.16b, v23.16b, v28.16b + eor v24.16b, v24.16b, v29.16b + # Done tranforming + ldp x28, x1, [x29, #48] + ldr x0, [x28], #8 + subs x1, x1, #1 + mov v30.d[0], x0 + mov v30.d[1], x0 + eor x2, x2, x0 + eor v0.16b, v0.16b, v30.16b + bne L_SHA3_shake256_blocksx3_seed_neon_begin + ldr x0, [x29, #40] + st4 {v0.d, v1.d, v2.d, v3.d}[0], [x0], #32 + st4 {v4.d, v5.d, v6.d, v7.d}[0], [x0], #32 + st4 {v8.d, v9.d, v10.d, v11.d}[0], [x0], #32 + st4 {v12.d, v13.d, v14.d, v15.d}[0], [x0], #32 + st4 {v16.d, v17.d, v18.d, v19.d}[0], [x0], #32 + st4 {v20.d, v21.d, v22.d, v23.d}[0], [x0], #32 + st1 {v24.d}[0], [x0] + add x0, x0, #8 + st4 {v0.d, v1.d, v2.d, v3.d}[1], [x0], #32 + st4 {v4.d, v5.d, v6.d, v7.d}[1], [x0], #32 + st4 {v8.d, v9.d, v10.d, v11.d}[1], [x0], #32 + st4 {v12.d, v13.d, v14.d, v15.d}[1], [x0], #32 + st4 {v16.d, v17.d, v18.d, v19.d}[1], [x0], #32 + st4 {v20.d, v21.d, v22.d, v23.d}[1], [x0], #32 + st1 {v24.d}[1], [x0] + add x0, x0, #8 + stp x2, x3, [x0] + stp x4, x5, [x0, #16] + stp x6, x7, [x0, #32] + stp x8, x9, [x0, #48] + stp x10, x11, [x0, #64] + stp x12, x13, [x0, #80] + stp x14, x15, [x0, #96] + stp x16, x17, [x0, #112] + stp x19, x20, [x0, #128] + stp x21, x22, [x0, #144] + stp x23, x24, [x0, #160] + stp x25, x26, [x0, #176] + str x27, [x0, #192] + ldp x17, x19, [x29, #72] + ldp x20, x21, [x29, #88] + ldp x22, x23, [x29, #104] + ldp x24, x25, [x29, #120] + ldp x26, x27, [x29, #136] + ldr x28, [x29, #152] + ldp d8, d9, [x29, #160] + ldp d10, d11, [x29, #176] + ldp d12, d13, [x29, #192] + ldp d14, d15, [x29, #208] + ldp x29, x30, [sp], #0xe0 + ret +#ifndef __APPLE__ + 
.size kyber_shake256_blocksx3_seed_neon,.-kyber_shake256_blocksx3_seed_neon
+#endif /* __APPLE__ */
+#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */
+#endif /* WOLFSSL_WC_KYBER */
+#endif /* __aarch64__ */
+#endif /* WOLFSSL_ARMASM */
+
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
+#endif /* !WOLFSSL_ARMASM_INLINE */
diff --git a/wolfcrypt/src/port/arm/armv8-kyber-asm_c.c b/wolfcrypt/src/port/arm/armv8-kyber-asm_c.c
new file mode 100644
index 0000000000..3680aa63e1
--- /dev/null
+++ b/wolfcrypt/src/port/arm/armv8-kyber-asm_c.c
@@ -0,0 +1,13430 @@
+/* armv8-kyber-asm
+ *
+ * Copyright (C) 2006-2024 wolfSSL Inc.
+ *
+ * This file is part of wolfSSL.
+ *
+ * wolfSSL is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * wolfSSL is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
+ */
+
+#ifdef HAVE_CONFIG_H
+    #include
+#endif /* HAVE_CONFIG_H */
+#include
+#include
+
+/* Generated using (from wolfssl):
+ * cd ../scripts
+ * ruby ./kyber/kyber.rb arm64 ../wolfssl/wolfcrypt/src/port/arm/armv8-kyber-asm.c
+ */
+#ifdef WOLFSSL_ARMASM
+#ifdef __aarch64__
+#ifdef WOLFSSL_ARMASM_INLINE
+static const uint16_t L_kyber_aarch64_q[] = {
+    0xd01,
+    0xd01,
+    0xd01,
+    0xd01,
+    0xd01,
+    0xd01,
+    0xd01,
+    0xd01,
+};
+
+static const uint16_t L_kyber_aarch64_consts[] = {
+    0xd01,
+    0xf301,
+    0x4ebf,
+    0x549,
+    0x5049,
+    0x0,
+    0x0,
+    0x0,
+};
+
+#include
+
+#ifdef WOLFSSL_WC_KYBER
+static const uint16_t L_kyber_aarch64_zetas[] = {
+    0x8ed,
+    0xa0b,
+    0xb9a,
+    0x714,
+    0x5d5,
+    0x58e,
+    0x11f,
+    0xca,
+    0xc56,
+    0x26e,
+    0x629,
+    0xb6,
+    0x3c2,
+    0x84f,
+    0x73f,
+    0x5bc,
+    0x23d,
+    0x7d4,
+    0x108,
+    0x17f,
+    0x9c4,
+    0x5b2,
+    0x6bf,
+    0xc7f,
+    0xa58,
+    0x3f9,
+    0x2dc,
+    0x260,
+    0x6fb,
+    0x19b,
+    0xc34,
+    0x6de,
+    0x4c7,
+    0x4c7,
+    0x4c7,
+    0x4c7,
+    0x28c,
+    0x28c,
+    0x28c,
+    0x28c,
+    0xad9,
+    0xad9,
+    0xad9,
+    0xad9,
+    0x3f7,
+    0x3f7,
+    0x3f7,
+    0x3f7,
+    0x7f4,
+    0x7f4,
+    0x7f4,
+    0x7f4,
+    0x5d3,
+    0x5d3,
+    0x5d3,
+    0x5d3,
+    0xbe7,
+    0xbe7,
+    0xbe7,
+    0xbe7,
+    0x6f9,
+    0x6f9,
+    0x6f9,
+    0x6f9,
+    0x204,
+    0x204,
+    0x204,
+    0x204,
+    0xcf9,
+    0xcf9,
+    0xcf9,
+    0xcf9,
+    0xbc1,
+    0xbc1,
+    0xbc1,
+    0xbc1,
+    0xa67,
+    0xa67,
+    0xa67,
+    0xa67,
+    0x6af,
+    0x6af,
+    0x6af,
+    0x6af,
+    0x877,
+    0x877,
+    0x877,
+    0x877,
+    0x7e,
+    0x7e,
+    0x7e,
+    0x7e,
+    0x5bd,
+    0x5bd,
+    0x5bd,
+    0x5bd,
+    0x9ac,
+    0x9ac,
+    0x9ac,
+    0x9ac,
+    0xca7,
+    0xca7,
+    0xca7,
+    0xca7,
+    0xbf2,
+    0xbf2,
+    0xbf2,
+    0xbf2,
+    0x33e,
+    0x33e,
+    0x33e,
+    0x33e,
+    0x6b,
+    0x6b,
+    0x6b,
+    0x6b,
+    0x774,
+    0x774,
+    0x774,
+    0x774,
+    0xc0a,
+    0xc0a,
+    0xc0a,
+    0xc0a,
+    0x94a,
+    0x94a,
+    0x94a,
+    0x94a,
+    0xb73,
+    0xb73,
+    0xb73,
+    0xb73,
+    0x3c1,
+    0x3c1,
+    0x3c1,
+    0x3c1,
+    0x71d,
+    0x71d,
+    0x71d,
+    0x71d,
+    0xa2c,
+    0xa2c,
+    0xa2c,
+    0xa2c,
+    0x1c0,
+    0x1c0,
+    0x1c0,
+    0x1c0,
+    0x8d8,
+    0x8d8,
+    0x8d8,
+    0x8d8,
+    0x2a5,
+    0x2a5,
+    0x2a5,
+    0x2a5,
+    0x806,
+    0x806,
+    0x806,
+    0x806,
+    0x8b2,
+    0x8b2,
+    0x1ae,
+    0x1ae,
+    0x22b,
+    0x22b,
+
0x34b, + 0x34b, + 0x81e, + 0x81e, + 0x367, + 0x367, + 0x60e, + 0x60e, + 0x69, + 0x69, + 0x1a6, + 0x1a6, + 0x24b, + 0x24b, + 0xb1, + 0xb1, + 0xc16, + 0xc16, + 0xbde, + 0xbde, + 0xb35, + 0xb35, + 0x626, + 0x626, + 0x675, + 0x675, + 0xc0b, + 0xc0b, + 0x30a, + 0x30a, + 0x487, + 0x487, + 0xc6e, + 0xc6e, + 0x9f8, + 0x9f8, + 0x5cb, + 0x5cb, + 0xaa7, + 0xaa7, + 0x45f, + 0x45f, + 0x6cb, + 0x6cb, + 0x284, + 0x284, + 0x999, + 0x999, + 0x15d, + 0x15d, + 0x1a2, + 0x1a2, + 0x149, + 0x149, + 0xc65, + 0xc65, + 0xcb6, + 0xcb6, + 0x331, + 0x331, + 0x449, + 0x449, + 0x25b, + 0x25b, + 0x262, + 0x262, + 0x52a, + 0x52a, + 0x7fc, + 0x7fc, + 0x748, + 0x748, + 0x180, + 0x180, + 0x842, + 0x842, + 0xc79, + 0xc79, + 0x4c2, + 0x4c2, + 0x7ca, + 0x7ca, + 0x997, + 0x997, + 0xdc, + 0xdc, + 0x85e, + 0x85e, + 0x686, + 0x686, + 0x860, + 0x860, + 0x707, + 0x707, + 0x803, + 0x803, + 0x31a, + 0x31a, + 0x71b, + 0x71b, + 0x9ab, + 0x9ab, + 0x99b, + 0x99b, + 0x1de, + 0x1de, + 0xc95, + 0xc95, + 0xbcd, + 0xbcd, + 0x3e4, + 0x3e4, + 0x3df, + 0x3df, + 0x3be, + 0x3be, + 0x74d, + 0x74d, + 0x5f2, + 0x5f2, + 0x65c, + 0x65c, +}; + +static const uint16_t L_kyber_aarch64_zetas_qinv[] = { + 0xffed, + 0x7b0b, + 0x399a, + 0x314, + 0x34d5, + 0xcf8e, + 0x6e1f, + 0xbeca, + 0xae56, + 0x6c6e, + 0xf129, + 0xc2b6, + 0x29c2, + 0x54f, + 0xd43f, + 0x79bc, + 0xe93d, + 0x43d4, + 0x9908, + 0x8e7f, + 0x15c4, + 0xfbb2, + 0x53bf, + 0x997f, + 0x9258, + 0x5ef9, + 0xd6dc, + 0x2260, + 0x47fb, + 0x229b, + 0x6834, + 0xc0de, + 0xe9c7, + 0xe9c7, + 0xe9c7, + 0xe9c7, + 0xe68c, + 0xe68c, + 0xe68c, + 0xe68c, + 0x5d9, + 0x5d9, + 0x5d9, + 0x5d9, + 0x78f7, + 0x78f7, + 0x78f7, + 0x78f7, + 0xa3f4, + 0xa3f4, + 0xa3f4, + 0xa3f4, + 0x4ed3, + 0x4ed3, + 0x4ed3, + 0x4ed3, + 0x50e7, + 0x50e7, + 0x50e7, + 0x50e7, + 0x61f9, + 0x61f9, + 0x61f9, + 0x61f9, + 0xce04, + 0xce04, + 0xce04, + 0xce04, + 0x67f9, + 0x67f9, + 0x67f9, + 0x67f9, + 0x3ec1, + 0x3ec1, + 0x3ec1, + 0x3ec1, + 0xcf67, + 0xcf67, + 0xcf67, + 0xcf67, + 0x23af, + 0x23af, + 0x23af, + 0x23af, + 0xfd77, + 0xfd77, + 0xfd77, + 0xfd77, + 0x9a7e, + 0x9a7e, + 0x9a7e, + 0x9a7e, + 0x6cbd, + 0x6cbd, + 0x6cbd, + 0x6cbd, + 0x4dac, + 0x4dac, + 0x4dac, + 0x4dac, + 0x91a7, + 0x91a7, + 0x91a7, + 0x91a7, + 0xc1f2, + 0xc1f2, + 0xc1f2, + 0xc1f2, + 0xdd3e, + 0xdd3e, + 0xdd3e, + 0xdd3e, + 0x916b, + 0x916b, + 0x916b, + 0x916b, + 0x2374, + 0x2374, + 0x2374, + 0x2374, + 0x8a0a, + 0x8a0a, + 0x8a0a, + 0x8a0a, + 0x474a, + 0x474a, + 0x474a, + 0x474a, + 0x3473, + 0x3473, + 0x3473, + 0x3473, + 0x36c1, + 0x36c1, + 0x36c1, + 0x36c1, + 0x8e1d, + 0x8e1d, + 0x8e1d, + 0x8e1d, + 0xce2c, + 0xce2c, + 0xce2c, + 0xce2c, + 0x41c0, + 0x41c0, + 0x41c0, + 0x41c0, + 0x10d8, + 0x10d8, + 0x10d8, + 0x10d8, + 0xa1a5, + 0xa1a5, + 0xa1a5, + 0xa1a5, + 0xba06, + 0xba06, + 0xba06, + 0xba06, + 0xfeb2, + 0xfeb2, + 0x2bae, + 0x2bae, + 0xd32b, + 0xd32b, + 0x344b, + 0x344b, + 0x821e, + 0x821e, + 0xc867, + 0xc867, + 0x500e, + 0x500e, + 0xab69, + 0xab69, + 0x93a6, + 0x93a6, + 0x334b, + 0x334b, + 0x3b1, + 0x3b1, + 0xee16, + 0xee16, + 0xc5de, + 0xc5de, + 0x5a35, + 0x5a35, + 0x1826, + 0x1826, + 0x1575, + 0x1575, + 0x7d0b, + 0x7d0b, + 0x810a, + 0x810a, + 0x2987, + 0x2987, + 0x766e, + 0x766e, + 0x71f8, + 0x71f8, + 0xb6cb, + 0xb6cb, + 0x8fa7, + 0x8fa7, + 0x315f, + 0x315f, + 0xb7cb, + 0xb7cb, + 0x4e84, + 0x4e84, + 0x4499, + 0x4499, + 0x485d, + 0x485d, + 0xc7a2, + 0xc7a2, + 0x4c49, + 0x4c49, + 0xeb65, + 0xeb65, + 0xceb6, + 0xceb6, + 0x8631, + 0x8631, + 0x4f49, + 0x4f49, + 0x635b, + 0x635b, + 0x862, + 0x862, + 0xe32a, + 0xe32a, + 0x3bfc, + 0x3bfc, + 0x5f48, + 0x5f48, + 0x8180, + 0x8180, + 0xae42, + 
0xae42, + 0xe779, + 0xe779, + 0x2ac2, + 0x2ac2, + 0xc5ca, + 0xc5ca, + 0x5e97, + 0x5e97, + 0xd4dc, + 0xd4dc, + 0x425e, + 0x425e, + 0x3886, + 0x3886, + 0x2860, + 0x2860, + 0xac07, + 0xac07, + 0xe103, + 0xe103, + 0xb11a, + 0xb11a, + 0xa81b, + 0xa81b, + 0x5aab, + 0x5aab, + 0x2a9b, + 0x2a9b, + 0xbbde, + 0xbbde, + 0x7b95, + 0x7b95, + 0xa2cd, + 0xa2cd, + 0x6fe4, + 0x6fe4, + 0xb0df, + 0xb0df, + 0x5dbe, + 0x5dbe, + 0x1e4d, + 0x1e4d, + 0xbbf2, + 0xbbf2, + 0x5a5c, + 0x5a5c, +}; + +void kyber_ntt(sword16* r) +{ + __asm__ __volatile__ ( +#ifndef __APPLE__ + "adrp x2, %[L_kyber_aarch64_zetas]\n\t" + "add x2, x2, :lo12:%[L_kyber_aarch64_zetas]\n\t" +#else + "adrp x2, %[L_kyber_aarch64_zetas]@PAGE\n\t" + "add x2, x2, %[L_kyber_aarch64_zetas]@PAGEOFF\n\t" +#endif /* __APPLE__ */ +#ifndef __APPLE__ + "adrp x3, %[L_kyber_aarch64_zetas_qinv]\n\t" + "add x3, x3, :lo12:%[L_kyber_aarch64_zetas_qinv]\n\t" +#else + "adrp x3, %[L_kyber_aarch64_zetas_qinv]@PAGE\n\t" + "add x3, x3, %[L_kyber_aarch64_zetas_qinv]@PAGEOFF\n\t" +#endif /* __APPLE__ */ +#ifndef __APPLE__ + "adrp x4, %[L_kyber_aarch64_consts]\n\t" + "add x4, x4, :lo12:%[L_kyber_aarch64_consts]\n\t" +#else + "adrp x4, %[L_kyber_aarch64_consts]@PAGE\n\t" + "add x4, x4, %[L_kyber_aarch64_consts]@PAGEOFF\n\t" +#endif /* __APPLE__ */ + "add x1, %x[r], #0x100\n\t" + "ldr q4, [x4]\n\t" + "ldr q5, [%x[r]]\n\t" + "ldr q6, [%x[r], #32]\n\t" + "ldr q7, [%x[r], #64]\n\t" + "ldr q8, [%x[r], #96]\n\t" + "ldr q9, [%x[r], #128]\n\t" + "ldr q10, [%x[r], #160]\n\t" + "ldr q11, [%x[r], #192]\n\t" + "ldr q12, [%x[r], #224]\n\t" + "ldr q13, [x1]\n\t" + "ldr q14, [x1, #32]\n\t" + "ldr q15, [x1, #64]\n\t" + "ldr q16, [x1, #96]\n\t" + "ldr q17, [x1, #128]\n\t" + "ldr q18, [x1, #160]\n\t" + "ldr q19, [x1, #192]\n\t" + "ldr q20, [x1, #224]\n\t" + "ldr q0, [x2]\n\t" + "ldr q1, [x3]\n\t" + "mul v29.8h, v13.8h, v1.h[1]\n\t" + "mul v30.8h, v14.8h, v1.h[1]\n\t" + "sqrdmulh v21.8h, v13.8h, v0.h[1]\n\t" + "sqrdmulh v22.8h, v14.8h, v0.h[1]\n\t" + "sqrdmlsh v21.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v22.8h, v30.8h, v4.h[0]\n\t" + "sshr v21.8h, v21.8h, #1\n\t" + "sshr v22.8h, v22.8h, #1\n\t" + "mul v29.8h, v15.8h, v1.h[1]\n\t" + "mul v30.8h, v16.8h, v1.h[1]\n\t" + "sqrdmulh v23.8h, v15.8h, v0.h[1]\n\t" + "sqrdmulh v24.8h, v16.8h, v0.h[1]\n\t" + "sqrdmlsh v23.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v24.8h, v30.8h, v4.h[0]\n\t" + "sshr v23.8h, v23.8h, #1\n\t" + "sshr v24.8h, v24.8h, #1\n\t" + "mul v29.8h, v17.8h, v1.h[1]\n\t" + "mul v30.8h, v18.8h, v1.h[1]\n\t" + "sqrdmulh v25.8h, v17.8h, v0.h[1]\n\t" + "sqrdmulh v26.8h, v18.8h, v0.h[1]\n\t" + "sqrdmlsh v25.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v26.8h, v30.8h, v4.h[0]\n\t" + "sshr v25.8h, v25.8h, #1\n\t" + "sshr v26.8h, v26.8h, #1\n\t" + "mul v29.8h, v19.8h, v1.h[1]\n\t" + "mul v30.8h, v20.8h, v1.h[1]\n\t" + "sqrdmulh v27.8h, v19.8h, v0.h[1]\n\t" + "sqrdmulh v28.8h, v20.8h, v0.h[1]\n\t" + "sqrdmlsh v27.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v28.8h, v30.8h, v4.h[0]\n\t" + "sshr v27.8h, v27.8h, #1\n\t" + "sshr v28.8h, v28.8h, #1\n\t" + "sub v13.8h, v5.8h, v21.8h\n\t" + "add v5.8h, v5.8h, v21.8h\n\t" + "sub v14.8h, v6.8h, v22.8h\n\t" + "add v6.8h, v6.8h, v22.8h\n\t" + "sub v15.8h, v7.8h, v23.8h\n\t" + "add v7.8h, v7.8h, v23.8h\n\t" + "sub v16.8h, v8.8h, v24.8h\n\t" + "add v8.8h, v8.8h, v24.8h\n\t" + "sub v17.8h, v9.8h, v25.8h\n\t" + "add v9.8h, v9.8h, v25.8h\n\t" + "sub v18.8h, v10.8h, v26.8h\n\t" + "add v10.8h, v10.8h, v26.8h\n\t" + "sub v19.8h, v11.8h, v27.8h\n\t" + "add v11.8h, v11.8h, v27.8h\n\t" + "sub v20.8h, v12.8h, v28.8h\n\t" + "add v12.8h, 
v12.8h, v28.8h\n\t" + "mul v29.8h, v9.8h, v1.h[2]\n\t" + "mul v30.8h, v10.8h, v1.h[2]\n\t" + "sqrdmulh v21.8h, v9.8h, v0.h[2]\n\t" + "sqrdmulh v22.8h, v10.8h, v0.h[2]\n\t" + "sqrdmlsh v21.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v22.8h, v30.8h, v4.h[0]\n\t" + "sshr v21.8h, v21.8h, #1\n\t" + "sshr v22.8h, v22.8h, #1\n\t" + "mul v29.8h, v11.8h, v1.h[2]\n\t" + "mul v30.8h, v12.8h, v1.h[2]\n\t" + "sqrdmulh v23.8h, v11.8h, v0.h[2]\n\t" + "sqrdmulh v24.8h, v12.8h, v0.h[2]\n\t" + "sqrdmlsh v23.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v24.8h, v30.8h, v4.h[0]\n\t" + "sshr v23.8h, v23.8h, #1\n\t" + "sshr v24.8h, v24.8h, #1\n\t" + "mul v29.8h, v17.8h, v1.h[3]\n\t" + "mul v30.8h, v18.8h, v1.h[3]\n\t" + "sqrdmulh v25.8h, v17.8h, v0.h[3]\n\t" + "sqrdmulh v26.8h, v18.8h, v0.h[3]\n\t" + "sqrdmlsh v25.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v26.8h, v30.8h, v4.h[0]\n\t" + "sshr v25.8h, v25.8h, #1\n\t" + "sshr v26.8h, v26.8h, #1\n\t" + "mul v29.8h, v19.8h, v1.h[3]\n\t" + "mul v30.8h, v20.8h, v1.h[3]\n\t" + "sqrdmulh v27.8h, v19.8h, v0.h[3]\n\t" + "sqrdmulh v28.8h, v20.8h, v0.h[3]\n\t" + "sqrdmlsh v27.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v28.8h, v30.8h, v4.h[0]\n\t" + "sshr v27.8h, v27.8h, #1\n\t" + "sshr v28.8h, v28.8h, #1\n\t" + "sub v9.8h, v5.8h, v21.8h\n\t" + "add v5.8h, v5.8h, v21.8h\n\t" + "sub v10.8h, v6.8h, v22.8h\n\t" + "add v6.8h, v6.8h, v22.8h\n\t" + "sub v11.8h, v7.8h, v23.8h\n\t" + "add v7.8h, v7.8h, v23.8h\n\t" + "sub v12.8h, v8.8h, v24.8h\n\t" + "add v8.8h, v8.8h, v24.8h\n\t" + "sub v17.8h, v13.8h, v25.8h\n\t" + "add v13.8h, v13.8h, v25.8h\n\t" + "sub v18.8h, v14.8h, v26.8h\n\t" + "add v14.8h, v14.8h, v26.8h\n\t" + "sub v19.8h, v15.8h, v27.8h\n\t" + "add v15.8h, v15.8h, v27.8h\n\t" + "sub v20.8h, v16.8h, v28.8h\n\t" + "add v16.8h, v16.8h, v28.8h\n\t" + "mul v29.8h, v7.8h, v1.h[4]\n\t" + "mul v30.8h, v8.8h, v1.h[4]\n\t" + "sqrdmulh v21.8h, v7.8h, v0.h[4]\n\t" + "sqrdmulh v22.8h, v8.8h, v0.h[4]\n\t" + "sqrdmlsh v21.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v22.8h, v30.8h, v4.h[0]\n\t" + "sshr v21.8h, v21.8h, #1\n\t" + "sshr v22.8h, v22.8h, #1\n\t" + "mul v29.8h, v11.8h, v1.h[5]\n\t" + "mul v30.8h, v12.8h, v1.h[5]\n\t" + "sqrdmulh v23.8h, v11.8h, v0.h[5]\n\t" + "sqrdmulh v24.8h, v12.8h, v0.h[5]\n\t" + "sqrdmlsh v23.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v24.8h, v30.8h, v4.h[0]\n\t" + "sshr v23.8h, v23.8h, #1\n\t" + "sshr v24.8h, v24.8h, #1\n\t" + "mul v29.8h, v15.8h, v1.h[6]\n\t" + "mul v30.8h, v16.8h, v1.h[6]\n\t" + "sqrdmulh v25.8h, v15.8h, v0.h[6]\n\t" + "sqrdmulh v26.8h, v16.8h, v0.h[6]\n\t" + "sqrdmlsh v25.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v26.8h, v30.8h, v4.h[0]\n\t" + "sshr v25.8h, v25.8h, #1\n\t" + "sshr v26.8h, v26.8h, #1\n\t" + "mul v29.8h, v19.8h, v1.h[7]\n\t" + "mul v30.8h, v20.8h, v1.h[7]\n\t" + "sqrdmulh v27.8h, v19.8h, v0.h[7]\n\t" + "sqrdmulh v28.8h, v20.8h, v0.h[7]\n\t" + "sqrdmlsh v27.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v28.8h, v30.8h, v4.h[0]\n\t" + "sshr v27.8h, v27.8h, #1\n\t" + "sshr v28.8h, v28.8h, #1\n\t" + "sub v7.8h, v5.8h, v21.8h\n\t" + "add v5.8h, v5.8h, v21.8h\n\t" + "sub v8.8h, v6.8h, v22.8h\n\t" + "add v6.8h, v6.8h, v22.8h\n\t" + "sub v11.8h, v9.8h, v23.8h\n\t" + "add v9.8h, v9.8h, v23.8h\n\t" + "sub v12.8h, v10.8h, v24.8h\n\t" + "add v10.8h, v10.8h, v24.8h\n\t" + "sub v15.8h, v13.8h, v25.8h\n\t" + "add v13.8h, v13.8h, v25.8h\n\t" + "sub v16.8h, v14.8h, v26.8h\n\t" + "add v14.8h, v14.8h, v26.8h\n\t" + "sub v19.8h, v17.8h, v27.8h\n\t" + "add v17.8h, v17.8h, v27.8h\n\t" + "sub v20.8h, v18.8h, v28.8h\n\t" + "add v18.8h, v18.8h, v28.8h\n\t" + "ldr q0, [x2, #16]\n\t" + "ldr q1, [x3, 
#16]\n\t" + "mul v29.8h, v6.8h, v1.h[0]\n\t" + "mul v30.8h, v8.8h, v1.h[1]\n\t" + "sqrdmulh v21.8h, v6.8h, v0.h[0]\n\t" + "sqrdmulh v22.8h, v8.8h, v0.h[1]\n\t" + "sqrdmlsh v21.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v22.8h, v30.8h, v4.h[0]\n\t" + "sshr v21.8h, v21.8h, #1\n\t" + "sshr v22.8h, v22.8h, #1\n\t" + "mul v29.8h, v10.8h, v1.h[2]\n\t" + "mul v30.8h, v12.8h, v1.h[3]\n\t" + "sqrdmulh v23.8h, v10.8h, v0.h[2]\n\t" + "sqrdmulh v24.8h, v12.8h, v0.h[3]\n\t" + "sqrdmlsh v23.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v24.8h, v30.8h, v4.h[0]\n\t" + "sshr v23.8h, v23.8h, #1\n\t" + "sshr v24.8h, v24.8h, #1\n\t" + "mul v29.8h, v14.8h, v1.h[4]\n\t" + "mul v30.8h, v16.8h, v1.h[5]\n\t" + "sqrdmulh v25.8h, v14.8h, v0.h[4]\n\t" + "sqrdmulh v26.8h, v16.8h, v0.h[5]\n\t" + "sqrdmlsh v25.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v26.8h, v30.8h, v4.h[0]\n\t" + "sshr v25.8h, v25.8h, #1\n\t" + "sshr v26.8h, v26.8h, #1\n\t" + "mul v29.8h, v18.8h, v1.h[6]\n\t" + "mul v30.8h, v20.8h, v1.h[7]\n\t" + "sqrdmulh v27.8h, v18.8h, v0.h[6]\n\t" + "sqrdmulh v28.8h, v20.8h, v0.h[7]\n\t" + "sqrdmlsh v27.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v28.8h, v30.8h, v4.h[0]\n\t" + "sshr v27.8h, v27.8h, #1\n\t" + "sshr v28.8h, v28.8h, #1\n\t" + "sub v6.8h, v5.8h, v21.8h\n\t" + "add v5.8h, v5.8h, v21.8h\n\t" + "sub v8.8h, v7.8h, v22.8h\n\t" + "add v7.8h, v7.8h, v22.8h\n\t" + "sub v10.8h, v9.8h, v23.8h\n\t" + "add v9.8h, v9.8h, v23.8h\n\t" + "sub v12.8h, v11.8h, v24.8h\n\t" + "add v11.8h, v11.8h, v24.8h\n\t" + "sub v14.8h, v13.8h, v25.8h\n\t" + "add v13.8h, v13.8h, v25.8h\n\t" + "sub v16.8h, v15.8h, v26.8h\n\t" + "add v15.8h, v15.8h, v26.8h\n\t" + "sub v18.8h, v17.8h, v27.8h\n\t" + "add v17.8h, v17.8h, v27.8h\n\t" + "sub v20.8h, v19.8h, v28.8h\n\t" + "add v19.8h, v19.8h, v28.8h\n\t" + "str q5, [%x[r]]\n\t" + "str q6, [%x[r], #32]\n\t" + "str q7, [%x[r], #64]\n\t" + "str q8, [%x[r], #96]\n\t" + "str q9, [%x[r], #128]\n\t" + "str q10, [%x[r], #160]\n\t" + "str q11, [%x[r], #192]\n\t" + "str q12, [%x[r], #224]\n\t" + "str q13, [x1]\n\t" + "str q14, [x1, #32]\n\t" + "str q15, [x1, #64]\n\t" + "str q16, [x1, #96]\n\t" + "str q17, [x1, #128]\n\t" + "str q18, [x1, #160]\n\t" + "str q19, [x1, #192]\n\t" + "str q20, [x1, #224]\n\t" + "ldr q5, [%x[r], #16]\n\t" + "ldr q6, [%x[r], #48]\n\t" + "ldr q7, [%x[r], #80]\n\t" + "ldr q8, [%x[r], #112]\n\t" + "ldr q9, [%x[r], #144]\n\t" + "ldr q10, [%x[r], #176]\n\t" + "ldr q11, [%x[r], #208]\n\t" + "ldr q12, [%x[r], #240]\n\t" + "ldr q13, [x1, #16]\n\t" + "ldr q14, [x1, #48]\n\t" + "ldr q15, [x1, #80]\n\t" + "ldr q16, [x1, #112]\n\t" + "ldr q17, [x1, #144]\n\t" + "ldr q18, [x1, #176]\n\t" + "ldr q19, [x1, #208]\n\t" + "ldr q20, [x1, #240]\n\t" + "ldr q0, [x2]\n\t" + "ldr q1, [x3]\n\t" + "mul v29.8h, v13.8h, v1.h[1]\n\t" + "mul v30.8h, v14.8h, v1.h[1]\n\t" + "sqrdmulh v21.8h, v13.8h, v0.h[1]\n\t" + "sqrdmulh v22.8h, v14.8h, v0.h[1]\n\t" + "sqrdmlsh v21.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v22.8h, v30.8h, v4.h[0]\n\t" + "sshr v21.8h, v21.8h, #1\n\t" + "sshr v22.8h, v22.8h, #1\n\t" + "mul v29.8h, v15.8h, v1.h[1]\n\t" + "mul v30.8h, v16.8h, v1.h[1]\n\t" + "sqrdmulh v23.8h, v15.8h, v0.h[1]\n\t" + "sqrdmulh v24.8h, v16.8h, v0.h[1]\n\t" + "sqrdmlsh v23.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v24.8h, v30.8h, v4.h[0]\n\t" + "sshr v23.8h, v23.8h, #1\n\t" + "sshr v24.8h, v24.8h, #1\n\t" + "mul v29.8h, v17.8h, v1.h[1]\n\t" + "mul v30.8h, v18.8h, v1.h[1]\n\t" + "sqrdmulh v25.8h, v17.8h, v0.h[1]\n\t" + "sqrdmulh v26.8h, v18.8h, v0.h[1]\n\t" + "sqrdmlsh v25.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v26.8h, v30.8h, v4.h[0]\n\t" + "sshr 
v25.8h, v25.8h, #1\n\t" + "sshr v26.8h, v26.8h, #1\n\t" + "mul v29.8h, v19.8h, v1.h[1]\n\t" + "mul v30.8h, v20.8h, v1.h[1]\n\t" + "sqrdmulh v27.8h, v19.8h, v0.h[1]\n\t" + "sqrdmulh v28.8h, v20.8h, v0.h[1]\n\t" + "sqrdmlsh v27.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v28.8h, v30.8h, v4.h[0]\n\t" + "sshr v27.8h, v27.8h, #1\n\t" + "sshr v28.8h, v28.8h, #1\n\t" + "sub v13.8h, v5.8h, v21.8h\n\t" + "add v5.8h, v5.8h, v21.8h\n\t" + "sub v14.8h, v6.8h, v22.8h\n\t" + "add v6.8h, v6.8h, v22.8h\n\t" + "sub v15.8h, v7.8h, v23.8h\n\t" + "add v7.8h, v7.8h, v23.8h\n\t" + "sub v16.8h, v8.8h, v24.8h\n\t" + "add v8.8h, v8.8h, v24.8h\n\t" + "sub v17.8h, v9.8h, v25.8h\n\t" + "add v9.8h, v9.8h, v25.8h\n\t" + "sub v18.8h, v10.8h, v26.8h\n\t" + "add v10.8h, v10.8h, v26.8h\n\t" + "sub v19.8h, v11.8h, v27.8h\n\t" + "add v11.8h, v11.8h, v27.8h\n\t" + "sub v20.8h, v12.8h, v28.8h\n\t" + "add v12.8h, v12.8h, v28.8h\n\t" + "mul v29.8h, v9.8h, v1.h[2]\n\t" + "mul v30.8h, v10.8h, v1.h[2]\n\t" + "sqrdmulh v21.8h, v9.8h, v0.h[2]\n\t" + "sqrdmulh v22.8h, v10.8h, v0.h[2]\n\t" + "sqrdmlsh v21.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v22.8h, v30.8h, v4.h[0]\n\t" + "sshr v21.8h, v21.8h, #1\n\t" + "sshr v22.8h, v22.8h, #1\n\t" + "mul v29.8h, v11.8h, v1.h[2]\n\t" + "mul v30.8h, v12.8h, v1.h[2]\n\t" + "sqrdmulh v23.8h, v11.8h, v0.h[2]\n\t" + "sqrdmulh v24.8h, v12.8h, v0.h[2]\n\t" + "sqrdmlsh v23.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v24.8h, v30.8h, v4.h[0]\n\t" + "sshr v23.8h, v23.8h, #1\n\t" + "sshr v24.8h, v24.8h, #1\n\t" + "mul v29.8h, v17.8h, v1.h[3]\n\t" + "mul v30.8h, v18.8h, v1.h[3]\n\t" + "sqrdmulh v25.8h, v17.8h, v0.h[3]\n\t" + "sqrdmulh v26.8h, v18.8h, v0.h[3]\n\t" + "sqrdmlsh v25.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v26.8h, v30.8h, v4.h[0]\n\t" + "sshr v25.8h, v25.8h, #1\n\t" + "sshr v26.8h, v26.8h, #1\n\t" + "mul v29.8h, v19.8h, v1.h[3]\n\t" + "mul v30.8h, v20.8h, v1.h[3]\n\t" + "sqrdmulh v27.8h, v19.8h, v0.h[3]\n\t" + "sqrdmulh v28.8h, v20.8h, v0.h[3]\n\t" + "sqrdmlsh v27.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v28.8h, v30.8h, v4.h[0]\n\t" + "sshr v27.8h, v27.8h, #1\n\t" + "sshr v28.8h, v28.8h, #1\n\t" + "sub v9.8h, v5.8h, v21.8h\n\t" + "add v5.8h, v5.8h, v21.8h\n\t" + "sub v10.8h, v6.8h, v22.8h\n\t" + "add v6.8h, v6.8h, v22.8h\n\t" + "sub v11.8h, v7.8h, v23.8h\n\t" + "add v7.8h, v7.8h, v23.8h\n\t" + "sub v12.8h, v8.8h, v24.8h\n\t" + "add v8.8h, v8.8h, v24.8h\n\t" + "sub v17.8h, v13.8h, v25.8h\n\t" + "add v13.8h, v13.8h, v25.8h\n\t" + "sub v18.8h, v14.8h, v26.8h\n\t" + "add v14.8h, v14.8h, v26.8h\n\t" + "sub v19.8h, v15.8h, v27.8h\n\t" + "add v15.8h, v15.8h, v27.8h\n\t" + "sub v20.8h, v16.8h, v28.8h\n\t" + "add v16.8h, v16.8h, v28.8h\n\t" + "mul v29.8h, v7.8h, v1.h[4]\n\t" + "mul v30.8h, v8.8h, v1.h[4]\n\t" + "sqrdmulh v21.8h, v7.8h, v0.h[4]\n\t" + "sqrdmulh v22.8h, v8.8h, v0.h[4]\n\t" + "sqrdmlsh v21.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v22.8h, v30.8h, v4.h[0]\n\t" + "sshr v21.8h, v21.8h, #1\n\t" + "sshr v22.8h, v22.8h, #1\n\t" + "mul v29.8h, v11.8h, v1.h[5]\n\t" + "mul v30.8h, v12.8h, v1.h[5]\n\t" + "sqrdmulh v23.8h, v11.8h, v0.h[5]\n\t" + "sqrdmulh v24.8h, v12.8h, v0.h[5]\n\t" + "sqrdmlsh v23.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v24.8h, v30.8h, v4.h[0]\n\t" + "sshr v23.8h, v23.8h, #1\n\t" + "sshr v24.8h, v24.8h, #1\n\t" + "mul v29.8h, v15.8h, v1.h[6]\n\t" + "mul v30.8h, v16.8h, v1.h[6]\n\t" + "sqrdmulh v25.8h, v15.8h, v0.h[6]\n\t" + "sqrdmulh v26.8h, v16.8h, v0.h[6]\n\t" + "sqrdmlsh v25.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v26.8h, v30.8h, v4.h[0]\n\t" + "sshr v25.8h, v25.8h, #1\n\t" + "sshr v26.8h, v26.8h, #1\n\t" + "mul 
v29.8h, v19.8h, v1.h[7]\n\t" + "mul v30.8h, v20.8h, v1.h[7]\n\t" + "sqrdmulh v27.8h, v19.8h, v0.h[7]\n\t" + "sqrdmulh v28.8h, v20.8h, v0.h[7]\n\t" + "sqrdmlsh v27.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v28.8h, v30.8h, v4.h[0]\n\t" + "sshr v27.8h, v27.8h, #1\n\t" + "sshr v28.8h, v28.8h, #1\n\t" + "sub v7.8h, v5.8h, v21.8h\n\t" + "add v5.8h, v5.8h, v21.8h\n\t" + "sub v8.8h, v6.8h, v22.8h\n\t" + "add v6.8h, v6.8h, v22.8h\n\t" + "sub v11.8h, v9.8h, v23.8h\n\t" + "add v9.8h, v9.8h, v23.8h\n\t" + "sub v12.8h, v10.8h, v24.8h\n\t" + "add v10.8h, v10.8h, v24.8h\n\t" + "sub v15.8h, v13.8h, v25.8h\n\t" + "add v13.8h, v13.8h, v25.8h\n\t" + "sub v16.8h, v14.8h, v26.8h\n\t" + "add v14.8h, v14.8h, v26.8h\n\t" + "sub v19.8h, v17.8h, v27.8h\n\t" + "add v17.8h, v17.8h, v27.8h\n\t" + "sub v20.8h, v18.8h, v28.8h\n\t" + "add v18.8h, v18.8h, v28.8h\n\t" + "ldr q0, [x2, #16]\n\t" + "ldr q1, [x3, #16]\n\t" + "mul v29.8h, v6.8h, v1.h[0]\n\t" + "mul v30.8h, v8.8h, v1.h[1]\n\t" + "sqrdmulh v21.8h, v6.8h, v0.h[0]\n\t" + "sqrdmulh v22.8h, v8.8h, v0.h[1]\n\t" + "sqrdmlsh v21.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v22.8h, v30.8h, v4.h[0]\n\t" + "sshr v21.8h, v21.8h, #1\n\t" + "sshr v22.8h, v22.8h, #1\n\t" + "mul v29.8h, v10.8h, v1.h[2]\n\t" + "mul v30.8h, v12.8h, v1.h[3]\n\t" + "sqrdmulh v23.8h, v10.8h, v0.h[2]\n\t" + "sqrdmulh v24.8h, v12.8h, v0.h[3]\n\t" + "sqrdmlsh v23.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v24.8h, v30.8h, v4.h[0]\n\t" + "sshr v23.8h, v23.8h, #1\n\t" + "sshr v24.8h, v24.8h, #1\n\t" + "mul v29.8h, v14.8h, v1.h[4]\n\t" + "mul v30.8h, v16.8h, v1.h[5]\n\t" + "sqrdmulh v25.8h, v14.8h, v0.h[4]\n\t" + "sqrdmulh v26.8h, v16.8h, v0.h[5]\n\t" + "sqrdmlsh v25.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v26.8h, v30.8h, v4.h[0]\n\t" + "sshr v25.8h, v25.8h, #1\n\t" + "sshr v26.8h, v26.8h, #1\n\t" + "mul v29.8h, v18.8h, v1.h[6]\n\t" + "mul v30.8h, v20.8h, v1.h[7]\n\t" + "sqrdmulh v27.8h, v18.8h, v0.h[6]\n\t" + "sqrdmulh v28.8h, v20.8h, v0.h[7]\n\t" + "sqrdmlsh v27.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v28.8h, v30.8h, v4.h[0]\n\t" + "sshr v27.8h, v27.8h, #1\n\t" + "sshr v28.8h, v28.8h, #1\n\t" + "sub v6.8h, v5.8h, v21.8h\n\t" + "add v5.8h, v5.8h, v21.8h\n\t" + "sub v8.8h, v7.8h, v22.8h\n\t" + "add v7.8h, v7.8h, v22.8h\n\t" + "sub v10.8h, v9.8h, v23.8h\n\t" + "add v9.8h, v9.8h, v23.8h\n\t" + "sub v12.8h, v11.8h, v24.8h\n\t" + "add v11.8h, v11.8h, v24.8h\n\t" + "sub v14.8h, v13.8h, v25.8h\n\t" + "add v13.8h, v13.8h, v25.8h\n\t" + "sub v16.8h, v15.8h, v26.8h\n\t" + "add v15.8h, v15.8h, v26.8h\n\t" + "sub v18.8h, v17.8h, v27.8h\n\t" + "add v17.8h, v17.8h, v27.8h\n\t" + "sub v20.8h, v19.8h, v28.8h\n\t" + "add v19.8h, v19.8h, v28.8h\n\t" + "str q5, [%x[r], #16]\n\t" + "str q6, [%x[r], #48]\n\t" + "str q7, [%x[r], #80]\n\t" + "str q8, [%x[r], #112]\n\t" + "str q9, [%x[r], #144]\n\t" + "str q10, [%x[r], #176]\n\t" + "str q11, [%x[r], #208]\n\t" + "str q12, [%x[r], #240]\n\t" + "str q13, [x1, #16]\n\t" + "str q14, [x1, #48]\n\t" + "str q15, [x1, #80]\n\t" + "str q16, [x1, #112]\n\t" + "str q17, [x1, #144]\n\t" + "str q18, [x1, #176]\n\t" + "str q19, [x1, #208]\n\t" + "str q20, [x1, #240]\n\t" + "ldp q5, q6, [%x[r]]\n\t" + "ldp q7, q8, [%x[r], #32]\n\t" + "ldp q9, q10, [%x[r], #64]\n\t" + "ldp q11, q12, [%x[r], #96]\n\t" + "ldp q13, q14, [%x[r], #128]\n\t" + "ldp q15, q16, [%x[r], #160]\n\t" + "ldp q17, q18, [%x[r], #192]\n\t" + "ldp q19, q20, [%x[r], #224]\n\t" + "ldr q0, [x2, #32]\n\t" + "ldr q1, [x3, #32]\n\t" + "mul v29.8h, v6.8h, v1.h[0]\n\t" + "mul v30.8h, v8.8h, v1.h[1]\n\t" + "sqrdmulh v21.8h, v6.8h, v0.h[0]\n\t" + "sqrdmulh 
v22.8h, v8.8h, v0.h[1]\n\t" + "sqrdmlsh v21.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v22.8h, v30.8h, v4.h[0]\n\t" + "sshr v21.8h, v21.8h, #1\n\t" + "sshr v22.8h, v22.8h, #1\n\t" + "mul v29.8h, v10.8h, v1.h[2]\n\t" + "mul v30.8h, v12.8h, v1.h[3]\n\t" + "sqrdmulh v23.8h, v10.8h, v0.h[2]\n\t" + "sqrdmulh v24.8h, v12.8h, v0.h[3]\n\t" + "sqrdmlsh v23.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v24.8h, v30.8h, v4.h[0]\n\t" + "sshr v23.8h, v23.8h, #1\n\t" + "sshr v24.8h, v24.8h, #1\n\t" + "mul v29.8h, v14.8h, v1.h[4]\n\t" + "mul v30.8h, v16.8h, v1.h[5]\n\t" + "sqrdmulh v25.8h, v14.8h, v0.h[4]\n\t" + "sqrdmulh v26.8h, v16.8h, v0.h[5]\n\t" + "sqrdmlsh v25.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v26.8h, v30.8h, v4.h[0]\n\t" + "sshr v25.8h, v25.8h, #1\n\t" + "sshr v26.8h, v26.8h, #1\n\t" + "mul v29.8h, v18.8h, v1.h[6]\n\t" + "mul v30.8h, v20.8h, v1.h[7]\n\t" + "sqrdmulh v27.8h, v18.8h, v0.h[6]\n\t" + "sqrdmulh v28.8h, v20.8h, v0.h[7]\n\t" + "sqrdmlsh v27.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v28.8h, v30.8h, v4.h[0]\n\t" + "sshr v27.8h, v27.8h, #1\n\t" + "sshr v28.8h, v28.8h, #1\n\t" + "sub v6.8h, v5.8h, v21.8h\n\t" + "add v5.8h, v5.8h, v21.8h\n\t" + "sub v8.8h, v7.8h, v22.8h\n\t" + "add v7.8h, v7.8h, v22.8h\n\t" + "sub v10.8h, v9.8h, v23.8h\n\t" + "add v9.8h, v9.8h, v23.8h\n\t" + "sub v12.8h, v11.8h, v24.8h\n\t" + "add v11.8h, v11.8h, v24.8h\n\t" + "sub v14.8h, v13.8h, v25.8h\n\t" + "add v13.8h, v13.8h, v25.8h\n\t" + "sub v16.8h, v15.8h, v26.8h\n\t" + "add v15.8h, v15.8h, v26.8h\n\t" + "sub v18.8h, v17.8h, v27.8h\n\t" + "add v17.8h, v17.8h, v27.8h\n\t" + "sub v20.8h, v19.8h, v28.8h\n\t" + "add v19.8h, v19.8h, v28.8h\n\t" + "ldr q0, [x2, #64]\n\t" + "ldr q2, [x2, #80]\n\t" + "ldr q1, [x3, #64]\n\t" + "ldr q3, [x3, #80]\n\t" + "mov v29.16b, v5.16b\n\t" + "mov v30.16b, v7.16b\n\t" + "trn1 v5.2d, v5.2d, v6.2d\n\t" + "trn1 v7.2d, v7.2d, v8.2d\n\t" + "trn2 v6.2d, v29.2d, v6.2d\n\t" + "trn2 v8.2d, v30.2d, v8.2d\n\t" + "mul v29.8h, v6.8h, v1.8h\n\t" + "mul v30.8h, v8.8h, v3.8h\n\t" + "sqrdmulh v21.8h, v6.8h, v0.8h\n\t" + "sqrdmulh v22.8h, v8.8h, v2.8h\n\t" + "sqrdmlsh v21.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v22.8h, v30.8h, v4.h[0]\n\t" + "sshr v21.8h, v21.8h, #1\n\t" + "sshr v22.8h, v22.8h, #1\n\t" + "ldr q0, [x2, #96]\n\t" + "ldr q2, [x2, #112]\n\t" + "ldr q1, [x3, #96]\n\t" + "ldr q3, [x3, #112]\n\t" + "mov v29.16b, v9.16b\n\t" + "mov v30.16b, v11.16b\n\t" + "trn1 v9.2d, v9.2d, v10.2d\n\t" + "trn1 v11.2d, v11.2d, v12.2d\n\t" + "trn2 v10.2d, v29.2d, v10.2d\n\t" + "trn2 v12.2d, v30.2d, v12.2d\n\t" + "mul v29.8h, v10.8h, v1.8h\n\t" + "mul v30.8h, v12.8h, v3.8h\n\t" + "sqrdmulh v23.8h, v10.8h, v0.8h\n\t" + "sqrdmulh v24.8h, v12.8h, v2.8h\n\t" + "sqrdmlsh v23.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v24.8h, v30.8h, v4.h[0]\n\t" + "sshr v23.8h, v23.8h, #1\n\t" + "sshr v24.8h, v24.8h, #1\n\t" + "ldr q0, [x2, #128]\n\t" + "ldr q2, [x2, #144]\n\t" + "ldr q1, [x3, #128]\n\t" + "ldr q3, [x3, #144]\n\t" + "mov v29.16b, v13.16b\n\t" + "mov v30.16b, v15.16b\n\t" + "trn1 v13.2d, v13.2d, v14.2d\n\t" + "trn1 v15.2d, v15.2d, v16.2d\n\t" + "trn2 v14.2d, v29.2d, v14.2d\n\t" + "trn2 v16.2d, v30.2d, v16.2d\n\t" + "mul v29.8h, v14.8h, v1.8h\n\t" + "mul v30.8h, v16.8h, v3.8h\n\t" + "sqrdmulh v25.8h, v14.8h, v0.8h\n\t" + "sqrdmulh v26.8h, v16.8h, v2.8h\n\t" + "sqrdmlsh v25.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v26.8h, v30.8h, v4.h[0]\n\t" + "sshr v25.8h, v25.8h, #1\n\t" + "sshr v26.8h, v26.8h, #1\n\t" + "ldr q0, [x2, #160]\n\t" + "ldr q2, [x2, #176]\n\t" + "ldr q1, [x3, #160]\n\t" + "ldr q3, [x3, #176]\n\t" + "mov v29.16b, v17.16b\n\t" + "mov 
v30.16b, v19.16b\n\t" + "trn1 v17.2d, v17.2d, v18.2d\n\t" + "trn1 v19.2d, v19.2d, v20.2d\n\t" + "trn2 v18.2d, v29.2d, v18.2d\n\t" + "trn2 v20.2d, v30.2d, v20.2d\n\t" + "mul v29.8h, v18.8h, v1.8h\n\t" + "mul v30.8h, v20.8h, v3.8h\n\t" + "sqrdmulh v27.8h, v18.8h, v0.8h\n\t" + "sqrdmulh v28.8h, v20.8h, v2.8h\n\t" + "sqrdmlsh v27.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v28.8h, v30.8h, v4.h[0]\n\t" + "sshr v27.8h, v27.8h, #1\n\t" + "sshr v28.8h, v28.8h, #1\n\t" + "sub v6.8h, v5.8h, v21.8h\n\t" + "add v5.8h, v5.8h, v21.8h\n\t" + "sub v8.8h, v7.8h, v22.8h\n\t" + "add v7.8h, v7.8h, v22.8h\n\t" + "sub v10.8h, v9.8h, v23.8h\n\t" + "add v9.8h, v9.8h, v23.8h\n\t" + "sub v12.8h, v11.8h, v24.8h\n\t" + "add v11.8h, v11.8h, v24.8h\n\t" + "sub v14.8h, v13.8h, v25.8h\n\t" + "add v13.8h, v13.8h, v25.8h\n\t" + "sub v16.8h, v15.8h, v26.8h\n\t" + "add v15.8h, v15.8h, v26.8h\n\t" + "sub v18.8h, v17.8h, v27.8h\n\t" + "add v17.8h, v17.8h, v27.8h\n\t" + "sub v20.8h, v19.8h, v28.8h\n\t" + "add v19.8h, v19.8h, v28.8h\n\t" + "ldr q0, [x2, #320]\n\t" + "ldr q2, [x2, #336]\n\t" + "ldr q1, [x3, #320]\n\t" + "ldr q3, [x3, #336]\n\t" + "mov v29.16b, v5.16b\n\t" + "mov v30.16b, v7.16b\n\t" + "trn1 v5.4s, v5.4s, v6.4s\n\t" + "trn1 v7.4s, v7.4s, v8.4s\n\t" + "trn2 v6.4s, v29.4s, v6.4s\n\t" + "trn2 v8.4s, v30.4s, v8.4s\n\t" + "mul v29.8h, v6.8h, v1.8h\n\t" + "mul v30.8h, v8.8h, v3.8h\n\t" + "sqrdmulh v21.8h, v6.8h, v0.8h\n\t" + "sqrdmulh v22.8h, v8.8h, v2.8h\n\t" + "sqrdmlsh v21.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v22.8h, v30.8h, v4.h[0]\n\t" + "sshr v21.8h, v21.8h, #1\n\t" + "sshr v22.8h, v22.8h, #1\n\t" + "ldr q0, [x2, #352]\n\t" + "ldr q2, [x2, #368]\n\t" + "ldr q1, [x3, #352]\n\t" + "ldr q3, [x3, #368]\n\t" + "mov v29.16b, v9.16b\n\t" + "mov v30.16b, v11.16b\n\t" + "trn1 v9.4s, v9.4s, v10.4s\n\t" + "trn1 v11.4s, v11.4s, v12.4s\n\t" + "trn2 v10.4s, v29.4s, v10.4s\n\t" + "trn2 v12.4s, v30.4s, v12.4s\n\t" + "mul v29.8h, v10.8h, v1.8h\n\t" + "mul v30.8h, v12.8h, v3.8h\n\t" + "sqrdmulh v23.8h, v10.8h, v0.8h\n\t" + "sqrdmulh v24.8h, v12.8h, v2.8h\n\t" + "sqrdmlsh v23.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v24.8h, v30.8h, v4.h[0]\n\t" + "sshr v23.8h, v23.8h, #1\n\t" + "sshr v24.8h, v24.8h, #1\n\t" + "ldr q0, [x2, #384]\n\t" + "ldr q2, [x2, #400]\n\t" + "ldr q1, [x3, #384]\n\t" + "ldr q3, [x3, #400]\n\t" + "mov v29.16b, v13.16b\n\t" + "mov v30.16b, v15.16b\n\t" + "trn1 v13.4s, v13.4s, v14.4s\n\t" + "trn1 v15.4s, v15.4s, v16.4s\n\t" + "trn2 v14.4s, v29.4s, v14.4s\n\t" + "trn2 v16.4s, v30.4s, v16.4s\n\t" + "mul v29.8h, v14.8h, v1.8h\n\t" + "mul v30.8h, v16.8h, v3.8h\n\t" + "sqrdmulh v25.8h, v14.8h, v0.8h\n\t" + "sqrdmulh v26.8h, v16.8h, v2.8h\n\t" + "sqrdmlsh v25.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v26.8h, v30.8h, v4.h[0]\n\t" + "sshr v25.8h, v25.8h, #1\n\t" + "sshr v26.8h, v26.8h, #1\n\t" + "ldr q0, [x2, #416]\n\t" + "ldr q2, [x2, #432]\n\t" + "ldr q1, [x3, #416]\n\t" + "ldr q3, [x3, #432]\n\t" + "mov v29.16b, v17.16b\n\t" + "mov v30.16b, v19.16b\n\t" + "trn1 v17.4s, v17.4s, v18.4s\n\t" + "trn1 v19.4s, v19.4s, v20.4s\n\t" + "trn2 v18.4s, v29.4s, v18.4s\n\t" + "trn2 v20.4s, v30.4s, v20.4s\n\t" + "mul v29.8h, v18.8h, v1.8h\n\t" + "mul v30.8h, v20.8h, v3.8h\n\t" + "sqrdmulh v27.8h, v18.8h, v0.8h\n\t" + "sqrdmulh v28.8h, v20.8h, v2.8h\n\t" + "sqrdmlsh v27.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v28.8h, v30.8h, v4.h[0]\n\t" + "sshr v27.8h, v27.8h, #1\n\t" + "sshr v28.8h, v28.8h, #1\n\t" + "sub v6.8h, v5.8h, v21.8h\n\t" + "add v5.8h, v5.8h, v21.8h\n\t" + "sub v8.8h, v7.8h, v22.8h\n\t" + "add v7.8h, v7.8h, v22.8h\n\t" + "sub v10.8h, v9.8h, 
v23.8h\n\t" + "add v9.8h, v9.8h, v23.8h\n\t" + "sub v12.8h, v11.8h, v24.8h\n\t" + "add v11.8h, v11.8h, v24.8h\n\t" + "sub v14.8h, v13.8h, v25.8h\n\t" + "add v13.8h, v13.8h, v25.8h\n\t" + "sub v16.8h, v15.8h, v26.8h\n\t" + "add v15.8h, v15.8h, v26.8h\n\t" + "sub v18.8h, v17.8h, v27.8h\n\t" + "add v17.8h, v17.8h, v27.8h\n\t" + "sub v20.8h, v19.8h, v28.8h\n\t" + "add v19.8h, v19.8h, v28.8h\n\t" + "sqdmulh v21.8h, v5.8h, v4.h[2]\n\t" + "sqdmulh v22.8h, v6.8h, v4.h[2]\n\t" + "sshr v21.8h, v21.8h, #11\n\t" + "sshr v22.8h, v22.8h, #11\n\t" + "mls v5.8h, v21.8h, v4.h[0]\n\t" + "mls v6.8h, v22.8h, v4.h[0]\n\t" + "sqdmulh v21.8h, v7.8h, v4.h[2]\n\t" + "sqdmulh v22.8h, v8.8h, v4.h[2]\n\t" + "sshr v21.8h, v21.8h, #11\n\t" + "sshr v22.8h, v22.8h, #11\n\t" + "mls v7.8h, v21.8h, v4.h[0]\n\t" + "mls v8.8h, v22.8h, v4.h[0]\n\t" + "sqdmulh v21.8h, v9.8h, v4.h[2]\n\t" + "sqdmulh v22.8h, v10.8h, v4.h[2]\n\t" + "sshr v21.8h, v21.8h, #11\n\t" + "sshr v22.8h, v22.8h, #11\n\t" + "mls v9.8h, v21.8h, v4.h[0]\n\t" + "mls v10.8h, v22.8h, v4.h[0]\n\t" + "sqdmulh v21.8h, v11.8h, v4.h[2]\n\t" + "sqdmulh v22.8h, v12.8h, v4.h[2]\n\t" + "sshr v21.8h, v21.8h, #11\n\t" + "sshr v22.8h, v22.8h, #11\n\t" + "mls v11.8h, v21.8h, v4.h[0]\n\t" + "mls v12.8h, v22.8h, v4.h[0]\n\t" + "sqdmulh v21.8h, v13.8h, v4.h[2]\n\t" + "sqdmulh v22.8h, v14.8h, v4.h[2]\n\t" + "sshr v21.8h, v21.8h, #11\n\t" + "sshr v22.8h, v22.8h, #11\n\t" + "mls v13.8h, v21.8h, v4.h[0]\n\t" + "mls v14.8h, v22.8h, v4.h[0]\n\t" + "sqdmulh v21.8h, v15.8h, v4.h[2]\n\t" + "sqdmulh v22.8h, v16.8h, v4.h[2]\n\t" + "sshr v21.8h, v21.8h, #11\n\t" + "sshr v22.8h, v22.8h, #11\n\t" + "mls v15.8h, v21.8h, v4.h[0]\n\t" + "mls v16.8h, v22.8h, v4.h[0]\n\t" + "sqdmulh v21.8h, v17.8h, v4.h[2]\n\t" + "sqdmulh v22.8h, v18.8h, v4.h[2]\n\t" + "sshr v21.8h, v21.8h, #11\n\t" + "sshr v22.8h, v22.8h, #11\n\t" + "mls v17.8h, v21.8h, v4.h[0]\n\t" + "mls v18.8h, v22.8h, v4.h[0]\n\t" + "sqdmulh v21.8h, v19.8h, v4.h[2]\n\t" + "sqdmulh v22.8h, v20.8h, v4.h[2]\n\t" + "sshr v21.8h, v21.8h, #11\n\t" + "sshr v22.8h, v22.8h, #11\n\t" + "mls v19.8h, v21.8h, v4.h[0]\n\t" + "mls v20.8h, v22.8h, v4.h[0]\n\t" + "mov v29.16b, v5.16b\n\t" + "trn1 v5.4s, v5.4s, v6.4s\n\t" + "trn2 v6.4s, v29.4s, v6.4s\n\t" + "mov v29.16b, v5.16b\n\t" + "trn1 v5.2d, v5.2d, v6.2d\n\t" + "trn2 v6.2d, v29.2d, v6.2d\n\t" + "mov v29.16b, v7.16b\n\t" + "trn1 v7.4s, v7.4s, v8.4s\n\t" + "trn2 v8.4s, v29.4s, v8.4s\n\t" + "mov v29.16b, v7.16b\n\t" + "trn1 v7.2d, v7.2d, v8.2d\n\t" + "trn2 v8.2d, v29.2d, v8.2d\n\t" + "mov v29.16b, v9.16b\n\t" + "trn1 v9.4s, v9.4s, v10.4s\n\t" + "trn2 v10.4s, v29.4s, v10.4s\n\t" + "mov v29.16b, v9.16b\n\t" + "trn1 v9.2d, v9.2d, v10.2d\n\t" + "trn2 v10.2d, v29.2d, v10.2d\n\t" + "mov v29.16b, v11.16b\n\t" + "trn1 v11.4s, v11.4s, v12.4s\n\t" + "trn2 v12.4s, v29.4s, v12.4s\n\t" + "mov v29.16b, v11.16b\n\t" + "trn1 v11.2d, v11.2d, v12.2d\n\t" + "trn2 v12.2d, v29.2d, v12.2d\n\t" + "mov v29.16b, v13.16b\n\t" + "trn1 v13.4s, v13.4s, v14.4s\n\t" + "trn2 v14.4s, v29.4s, v14.4s\n\t" + "mov v29.16b, v13.16b\n\t" + "trn1 v13.2d, v13.2d, v14.2d\n\t" + "trn2 v14.2d, v29.2d, v14.2d\n\t" + "mov v29.16b, v15.16b\n\t" + "trn1 v15.4s, v15.4s, v16.4s\n\t" + "trn2 v16.4s, v29.4s, v16.4s\n\t" + "mov v29.16b, v15.16b\n\t" + "trn1 v15.2d, v15.2d, v16.2d\n\t" + "trn2 v16.2d, v29.2d, v16.2d\n\t" + "mov v29.16b, v17.16b\n\t" + "trn1 v17.4s, v17.4s, v18.4s\n\t" + "trn2 v18.4s, v29.4s, v18.4s\n\t" + "mov v29.16b, v17.16b\n\t" + "trn1 v17.2d, v17.2d, v18.2d\n\t" + "trn2 v18.2d, v29.2d, v18.2d\n\t" + "mov v29.16b, v19.16b\n\t" + "trn1 
v19.4s, v19.4s, v20.4s\n\t" + "trn2 v20.4s, v29.4s, v20.4s\n\t" + "mov v29.16b, v19.16b\n\t" + "trn1 v19.2d, v19.2d, v20.2d\n\t" + "trn2 v20.2d, v29.2d, v20.2d\n\t" + "stp q5, q6, [%x[r]]\n\t" + "stp q7, q8, [%x[r], #32]\n\t" + "stp q9, q10, [%x[r], #64]\n\t" + "stp q11, q12, [%x[r], #96]\n\t" + "stp q13, q14, [%x[r], #128]\n\t" + "stp q15, q16, [%x[r], #160]\n\t" + "stp q17, q18, [%x[r], #192]\n\t" + "stp q19, q20, [%x[r], #224]\n\t" + "ldp q5, q6, [x1]\n\t" + "ldp q7, q8, [x1, #32]\n\t" + "ldp q9, q10, [x1, #64]\n\t" + "ldp q11, q12, [x1, #96]\n\t" + "ldp q13, q14, [x1, #128]\n\t" + "ldp q15, q16, [x1, #160]\n\t" + "ldp q17, q18, [x1, #192]\n\t" + "ldp q19, q20, [x1, #224]\n\t" + "ldr q0, [x2, #48]\n\t" + "ldr q1, [x3, #48]\n\t" + "mul v29.8h, v6.8h, v1.h[0]\n\t" + "mul v30.8h, v8.8h, v1.h[1]\n\t" + "sqrdmulh v21.8h, v6.8h, v0.h[0]\n\t" + "sqrdmulh v22.8h, v8.8h, v0.h[1]\n\t" + "sqrdmlsh v21.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v22.8h, v30.8h, v4.h[0]\n\t" + "sshr v21.8h, v21.8h, #1\n\t" + "sshr v22.8h, v22.8h, #1\n\t" + "mul v29.8h, v10.8h, v1.h[2]\n\t" + "mul v30.8h, v12.8h, v1.h[3]\n\t" + "sqrdmulh v23.8h, v10.8h, v0.h[2]\n\t" + "sqrdmulh v24.8h, v12.8h, v0.h[3]\n\t" + "sqrdmlsh v23.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v24.8h, v30.8h, v4.h[0]\n\t" + "sshr v23.8h, v23.8h, #1\n\t" + "sshr v24.8h, v24.8h, #1\n\t" + "mul v29.8h, v14.8h, v1.h[4]\n\t" + "mul v30.8h, v16.8h, v1.h[5]\n\t" + "sqrdmulh v25.8h, v14.8h, v0.h[4]\n\t" + "sqrdmulh v26.8h, v16.8h, v0.h[5]\n\t" + "sqrdmlsh v25.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v26.8h, v30.8h, v4.h[0]\n\t" + "sshr v25.8h, v25.8h, #1\n\t" + "sshr v26.8h, v26.8h, #1\n\t" + "mul v29.8h, v18.8h, v1.h[6]\n\t" + "mul v30.8h, v20.8h, v1.h[7]\n\t" + "sqrdmulh v27.8h, v18.8h, v0.h[6]\n\t" + "sqrdmulh v28.8h, v20.8h, v0.h[7]\n\t" + "sqrdmlsh v27.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v28.8h, v30.8h, v4.h[0]\n\t" + "sshr v27.8h, v27.8h, #1\n\t" + "sshr v28.8h, v28.8h, #1\n\t" + "sub v6.8h, v5.8h, v21.8h\n\t" + "add v5.8h, v5.8h, v21.8h\n\t" + "sub v8.8h, v7.8h, v22.8h\n\t" + "add v7.8h, v7.8h, v22.8h\n\t" + "sub v10.8h, v9.8h, v23.8h\n\t" + "add v9.8h, v9.8h, v23.8h\n\t" + "sub v12.8h, v11.8h, v24.8h\n\t" + "add v11.8h, v11.8h, v24.8h\n\t" + "sub v14.8h, v13.8h, v25.8h\n\t" + "add v13.8h, v13.8h, v25.8h\n\t" + "sub v16.8h, v15.8h, v26.8h\n\t" + "add v15.8h, v15.8h, v26.8h\n\t" + "sub v18.8h, v17.8h, v27.8h\n\t" + "add v17.8h, v17.8h, v27.8h\n\t" + "sub v20.8h, v19.8h, v28.8h\n\t" + "add v19.8h, v19.8h, v28.8h\n\t" + "ldr q0, [x2, #192]\n\t" + "ldr q2, [x2, #208]\n\t" + "ldr q1, [x3, #192]\n\t" + "ldr q3, [x3, #208]\n\t" + "mov v29.16b, v5.16b\n\t" + "mov v30.16b, v7.16b\n\t" + "trn1 v5.2d, v5.2d, v6.2d\n\t" + "trn1 v7.2d, v7.2d, v8.2d\n\t" + "trn2 v6.2d, v29.2d, v6.2d\n\t" + "trn2 v8.2d, v30.2d, v8.2d\n\t" + "mul v29.8h, v6.8h, v1.8h\n\t" + "mul v30.8h, v8.8h, v3.8h\n\t" + "sqrdmulh v21.8h, v6.8h, v0.8h\n\t" + "sqrdmulh v22.8h, v8.8h, v2.8h\n\t" + "sqrdmlsh v21.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v22.8h, v30.8h, v4.h[0]\n\t" + "sshr v21.8h, v21.8h, #1\n\t" + "sshr v22.8h, v22.8h, #1\n\t" + "ldr q0, [x2, #224]\n\t" + "ldr q2, [x2, #240]\n\t" + "ldr q1, [x3, #224]\n\t" + "ldr q3, [x3, #240]\n\t" + "mov v29.16b, v9.16b\n\t" + "mov v30.16b, v11.16b\n\t" + "trn1 v9.2d, v9.2d, v10.2d\n\t" + "trn1 v11.2d, v11.2d, v12.2d\n\t" + "trn2 v10.2d, v29.2d, v10.2d\n\t" + "trn2 v12.2d, v30.2d, v12.2d\n\t" + "mul v29.8h, v10.8h, v1.8h\n\t" + "mul v30.8h, v12.8h, v3.8h\n\t" + "sqrdmulh v23.8h, v10.8h, v0.8h\n\t" + "sqrdmulh v24.8h, v12.8h, v2.8h\n\t" + "sqrdmlsh v23.8h, v29.8h, 
v4.h[0]\n\t" + "sqrdmlsh v24.8h, v30.8h, v4.h[0]\n\t" + "sshr v23.8h, v23.8h, #1\n\t" + "sshr v24.8h, v24.8h, #1\n\t" + "ldr q0, [x2, #256]\n\t" + "ldr q2, [x2, #272]\n\t" + "ldr q1, [x3, #256]\n\t" + "ldr q3, [x3, #272]\n\t" + "mov v29.16b, v13.16b\n\t" + "mov v30.16b, v15.16b\n\t" + "trn1 v13.2d, v13.2d, v14.2d\n\t" + "trn1 v15.2d, v15.2d, v16.2d\n\t" + "trn2 v14.2d, v29.2d, v14.2d\n\t" + "trn2 v16.2d, v30.2d, v16.2d\n\t" + "mul v29.8h, v14.8h, v1.8h\n\t" + "mul v30.8h, v16.8h, v3.8h\n\t" + "sqrdmulh v25.8h, v14.8h, v0.8h\n\t" + "sqrdmulh v26.8h, v16.8h, v2.8h\n\t" + "sqrdmlsh v25.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v26.8h, v30.8h, v4.h[0]\n\t" + "sshr v25.8h, v25.8h, #1\n\t" + "sshr v26.8h, v26.8h, #1\n\t" + "ldr q0, [x2, #288]\n\t" + "ldr q2, [x2, #304]\n\t" + "ldr q1, [x3, #288]\n\t" + "ldr q3, [x3, #304]\n\t" + "mov v29.16b, v17.16b\n\t" + "mov v30.16b, v19.16b\n\t" + "trn1 v17.2d, v17.2d, v18.2d\n\t" + "trn1 v19.2d, v19.2d, v20.2d\n\t" + "trn2 v18.2d, v29.2d, v18.2d\n\t" + "trn2 v20.2d, v30.2d, v20.2d\n\t" + "mul v29.8h, v18.8h, v1.8h\n\t" + "mul v30.8h, v20.8h, v3.8h\n\t" + "sqrdmulh v27.8h, v18.8h, v0.8h\n\t" + "sqrdmulh v28.8h, v20.8h, v2.8h\n\t" + "sqrdmlsh v27.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v28.8h, v30.8h, v4.h[0]\n\t" + "sshr v27.8h, v27.8h, #1\n\t" + "sshr v28.8h, v28.8h, #1\n\t" + "sub v6.8h, v5.8h, v21.8h\n\t" + "add v5.8h, v5.8h, v21.8h\n\t" + "sub v8.8h, v7.8h, v22.8h\n\t" + "add v7.8h, v7.8h, v22.8h\n\t" + "sub v10.8h, v9.8h, v23.8h\n\t" + "add v9.8h, v9.8h, v23.8h\n\t" + "sub v12.8h, v11.8h, v24.8h\n\t" + "add v11.8h, v11.8h, v24.8h\n\t" + "sub v14.8h, v13.8h, v25.8h\n\t" + "add v13.8h, v13.8h, v25.8h\n\t" + "sub v16.8h, v15.8h, v26.8h\n\t" + "add v15.8h, v15.8h, v26.8h\n\t" + "sub v18.8h, v17.8h, v27.8h\n\t" + "add v17.8h, v17.8h, v27.8h\n\t" + "sub v20.8h, v19.8h, v28.8h\n\t" + "add v19.8h, v19.8h, v28.8h\n\t" + "ldr q0, [x2, #448]\n\t" + "ldr q2, [x2, #464]\n\t" + "ldr q1, [x3, #448]\n\t" + "ldr q3, [x3, #464]\n\t" + "mov v29.16b, v5.16b\n\t" + "mov v30.16b, v7.16b\n\t" + "trn1 v5.4s, v5.4s, v6.4s\n\t" + "trn1 v7.4s, v7.4s, v8.4s\n\t" + "trn2 v6.4s, v29.4s, v6.4s\n\t" + "trn2 v8.4s, v30.4s, v8.4s\n\t" + "mul v29.8h, v6.8h, v1.8h\n\t" + "mul v30.8h, v8.8h, v3.8h\n\t" + "sqrdmulh v21.8h, v6.8h, v0.8h\n\t" + "sqrdmulh v22.8h, v8.8h, v2.8h\n\t" + "sqrdmlsh v21.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v22.8h, v30.8h, v4.h[0]\n\t" + "sshr v21.8h, v21.8h, #1\n\t" + "sshr v22.8h, v22.8h, #1\n\t" + "ldr q0, [x2, #480]\n\t" + "ldr q2, [x2, #496]\n\t" + "ldr q1, [x3, #480]\n\t" + "ldr q3, [x3, #496]\n\t" + "mov v29.16b, v9.16b\n\t" + "mov v30.16b, v11.16b\n\t" + "trn1 v9.4s, v9.4s, v10.4s\n\t" + "trn1 v11.4s, v11.4s, v12.4s\n\t" + "trn2 v10.4s, v29.4s, v10.4s\n\t" + "trn2 v12.4s, v30.4s, v12.4s\n\t" + "mul v29.8h, v10.8h, v1.8h\n\t" + "mul v30.8h, v12.8h, v3.8h\n\t" + "sqrdmulh v23.8h, v10.8h, v0.8h\n\t" + "sqrdmulh v24.8h, v12.8h, v2.8h\n\t" + "sqrdmlsh v23.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v24.8h, v30.8h, v4.h[0]\n\t" + "sshr v23.8h, v23.8h, #1\n\t" + "sshr v24.8h, v24.8h, #1\n\t" + "ldr q0, [x2, #512]\n\t" + "ldr q2, [x2, #528]\n\t" + "ldr q1, [x3, #512]\n\t" + "ldr q3, [x3, #528]\n\t" + "mov v29.16b, v13.16b\n\t" + "mov v30.16b, v15.16b\n\t" + "trn1 v13.4s, v13.4s, v14.4s\n\t" + "trn1 v15.4s, v15.4s, v16.4s\n\t" + "trn2 v14.4s, v29.4s, v14.4s\n\t" + "trn2 v16.4s, v30.4s, v16.4s\n\t" + "mul v29.8h, v14.8h, v1.8h\n\t" + "mul v30.8h, v16.8h, v3.8h\n\t" + "sqrdmulh v25.8h, v14.8h, v0.8h\n\t" + "sqrdmulh v26.8h, v16.8h, v2.8h\n\t" + "sqrdmlsh v25.8h, v29.8h, v4.h[0]\n\t" + 
"sqrdmlsh v26.8h, v30.8h, v4.h[0]\n\t" + "sshr v25.8h, v25.8h, #1\n\t" + "sshr v26.8h, v26.8h, #1\n\t" + "ldr q0, [x2, #544]\n\t" + "ldr q2, [x2, #560]\n\t" + "ldr q1, [x3, #544]\n\t" + "ldr q3, [x3, #560]\n\t" + "mov v29.16b, v17.16b\n\t" + "mov v30.16b, v19.16b\n\t" + "trn1 v17.4s, v17.4s, v18.4s\n\t" + "trn1 v19.4s, v19.4s, v20.4s\n\t" + "trn2 v18.4s, v29.4s, v18.4s\n\t" + "trn2 v20.4s, v30.4s, v20.4s\n\t" + "mul v29.8h, v18.8h, v1.8h\n\t" + "mul v30.8h, v20.8h, v3.8h\n\t" + "sqrdmulh v27.8h, v18.8h, v0.8h\n\t" + "sqrdmulh v28.8h, v20.8h, v2.8h\n\t" + "sqrdmlsh v27.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v28.8h, v30.8h, v4.h[0]\n\t" + "sshr v27.8h, v27.8h, #1\n\t" + "sshr v28.8h, v28.8h, #1\n\t" + "sub v6.8h, v5.8h, v21.8h\n\t" + "add v5.8h, v5.8h, v21.8h\n\t" + "sub v8.8h, v7.8h, v22.8h\n\t" + "add v7.8h, v7.8h, v22.8h\n\t" + "sub v10.8h, v9.8h, v23.8h\n\t" + "add v9.8h, v9.8h, v23.8h\n\t" + "sub v12.8h, v11.8h, v24.8h\n\t" + "add v11.8h, v11.8h, v24.8h\n\t" + "sub v14.8h, v13.8h, v25.8h\n\t" + "add v13.8h, v13.8h, v25.8h\n\t" + "sub v16.8h, v15.8h, v26.8h\n\t" + "add v15.8h, v15.8h, v26.8h\n\t" + "sub v18.8h, v17.8h, v27.8h\n\t" + "add v17.8h, v17.8h, v27.8h\n\t" + "sub v20.8h, v19.8h, v28.8h\n\t" + "add v19.8h, v19.8h, v28.8h\n\t" + "sqdmulh v21.8h, v5.8h, v4.h[2]\n\t" + "sqdmulh v22.8h, v6.8h, v4.h[2]\n\t" + "sshr v21.8h, v21.8h, #11\n\t" + "sshr v22.8h, v22.8h, #11\n\t" + "mls v5.8h, v21.8h, v4.h[0]\n\t" + "mls v6.8h, v22.8h, v4.h[0]\n\t" + "sqdmulh v21.8h, v7.8h, v4.h[2]\n\t" + "sqdmulh v22.8h, v8.8h, v4.h[2]\n\t" + "sshr v21.8h, v21.8h, #11\n\t" + "sshr v22.8h, v22.8h, #11\n\t" + "mls v7.8h, v21.8h, v4.h[0]\n\t" + "mls v8.8h, v22.8h, v4.h[0]\n\t" + "sqdmulh v21.8h, v9.8h, v4.h[2]\n\t" + "sqdmulh v22.8h, v10.8h, v4.h[2]\n\t" + "sshr v21.8h, v21.8h, #11\n\t" + "sshr v22.8h, v22.8h, #11\n\t" + "mls v9.8h, v21.8h, v4.h[0]\n\t" + "mls v10.8h, v22.8h, v4.h[0]\n\t" + "sqdmulh v21.8h, v11.8h, v4.h[2]\n\t" + "sqdmulh v22.8h, v12.8h, v4.h[2]\n\t" + "sshr v21.8h, v21.8h, #11\n\t" + "sshr v22.8h, v22.8h, #11\n\t" + "mls v11.8h, v21.8h, v4.h[0]\n\t" + "mls v12.8h, v22.8h, v4.h[0]\n\t" + "sqdmulh v21.8h, v13.8h, v4.h[2]\n\t" + "sqdmulh v22.8h, v14.8h, v4.h[2]\n\t" + "sshr v21.8h, v21.8h, #11\n\t" + "sshr v22.8h, v22.8h, #11\n\t" + "mls v13.8h, v21.8h, v4.h[0]\n\t" + "mls v14.8h, v22.8h, v4.h[0]\n\t" + "sqdmulh v21.8h, v15.8h, v4.h[2]\n\t" + "sqdmulh v22.8h, v16.8h, v4.h[2]\n\t" + "sshr v21.8h, v21.8h, #11\n\t" + "sshr v22.8h, v22.8h, #11\n\t" + "mls v15.8h, v21.8h, v4.h[0]\n\t" + "mls v16.8h, v22.8h, v4.h[0]\n\t" + "sqdmulh v21.8h, v17.8h, v4.h[2]\n\t" + "sqdmulh v22.8h, v18.8h, v4.h[2]\n\t" + "sshr v21.8h, v21.8h, #11\n\t" + "sshr v22.8h, v22.8h, #11\n\t" + "mls v17.8h, v21.8h, v4.h[0]\n\t" + "mls v18.8h, v22.8h, v4.h[0]\n\t" + "sqdmulh v21.8h, v19.8h, v4.h[2]\n\t" + "sqdmulh v22.8h, v20.8h, v4.h[2]\n\t" + "sshr v21.8h, v21.8h, #11\n\t" + "sshr v22.8h, v22.8h, #11\n\t" + "mls v19.8h, v21.8h, v4.h[0]\n\t" + "mls v20.8h, v22.8h, v4.h[0]\n\t" + "mov v29.16b, v5.16b\n\t" + "trn1 v5.4s, v5.4s, v6.4s\n\t" + "trn2 v6.4s, v29.4s, v6.4s\n\t" + "mov v29.16b, v5.16b\n\t" + "trn1 v5.2d, v5.2d, v6.2d\n\t" + "trn2 v6.2d, v29.2d, v6.2d\n\t" + "mov v29.16b, v7.16b\n\t" + "trn1 v7.4s, v7.4s, v8.4s\n\t" + "trn2 v8.4s, v29.4s, v8.4s\n\t" + "mov v29.16b, v7.16b\n\t" + "trn1 v7.2d, v7.2d, v8.2d\n\t" + "trn2 v8.2d, v29.2d, v8.2d\n\t" + "mov v29.16b, v9.16b\n\t" + "trn1 v9.4s, v9.4s, v10.4s\n\t" + "trn2 v10.4s, v29.4s, v10.4s\n\t" + "mov v29.16b, v9.16b\n\t" + "trn1 v9.2d, v9.2d, v10.2d\n\t" + "trn2 v10.2d, v29.2d, 
v10.2d\n\t" + "mov v29.16b, v11.16b\n\t" + "trn1 v11.4s, v11.4s, v12.4s\n\t" + "trn2 v12.4s, v29.4s, v12.4s\n\t" + "mov v29.16b, v11.16b\n\t" + "trn1 v11.2d, v11.2d, v12.2d\n\t" + "trn2 v12.2d, v29.2d, v12.2d\n\t" + "mov v29.16b, v13.16b\n\t" + "trn1 v13.4s, v13.4s, v14.4s\n\t" + "trn2 v14.4s, v29.4s, v14.4s\n\t" + "mov v29.16b, v13.16b\n\t" + "trn1 v13.2d, v13.2d, v14.2d\n\t" + "trn2 v14.2d, v29.2d, v14.2d\n\t" + "mov v29.16b, v15.16b\n\t" + "trn1 v15.4s, v15.4s, v16.4s\n\t" + "trn2 v16.4s, v29.4s, v16.4s\n\t" + "mov v29.16b, v15.16b\n\t" + "trn1 v15.2d, v15.2d, v16.2d\n\t" + "trn2 v16.2d, v29.2d, v16.2d\n\t" + "mov v29.16b, v17.16b\n\t" + "trn1 v17.4s, v17.4s, v18.4s\n\t" + "trn2 v18.4s, v29.4s, v18.4s\n\t" + "mov v29.16b, v17.16b\n\t" + "trn1 v17.2d, v17.2d, v18.2d\n\t" + "trn2 v18.2d, v29.2d, v18.2d\n\t" + "mov v29.16b, v19.16b\n\t" + "trn1 v19.4s, v19.4s, v20.4s\n\t" + "trn2 v20.4s, v29.4s, v20.4s\n\t" + "mov v29.16b, v19.16b\n\t" + "trn1 v19.2d, v19.2d, v20.2d\n\t" + "trn2 v20.2d, v29.2d, v20.2d\n\t" + "stp q5, q6, [x1]\n\t" + "stp q7, q8, [x1, #32]\n\t" + "stp q9, q10, [x1, #64]\n\t" + "stp q11, q12, [x1, #96]\n\t" + "stp q13, q14, [x1, #128]\n\t" + "stp q15, q16, [x1, #160]\n\t" + "stp q17, q18, [x1, #192]\n\t" + "stp q19, q20, [x1, #224]\n\t" + : [r] "+r" (r) + : [L_kyber_aarch64_q] "S" (L_kyber_aarch64_q), [L_kyber_aarch64_consts] "S" (L_kyber_aarch64_consts), [L_kyber_aarch64_zetas] "S" (L_kyber_aarch64_zetas), [L_kyber_aarch64_zetas_qinv] "S" (L_kyber_aarch64_zetas_qinv) + : "memory", "x1", "x2", "x3", "x4", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "cc" + ); +} + +static const uint16_t L_kyber_aarch64_zetas_inv[] = { + 0x6a5, + 0x6a5, + 0x70f, + 0x70f, + 0x5b4, + 0x5b4, + 0x943, + 0x943, + 0x922, + 0x922, + 0x91d, + 0x91d, + 0x134, + 0x134, + 0x6c, + 0x6c, + 0xb23, + 0xb23, + 0x366, + 0x366, + 0x356, + 0x356, + 0x5e6, + 0x5e6, + 0x9e7, + 0x9e7, + 0x4fe, + 0x4fe, + 0x5fa, + 0x5fa, + 0x4a1, + 0x4a1, + 0x67b, + 0x67b, + 0x4a3, + 0x4a3, + 0xc25, + 0xc25, + 0x36a, + 0x36a, + 0x537, + 0x537, + 0x83f, + 0x83f, + 0x88, + 0x88, + 0x4bf, + 0x4bf, + 0xb81, + 0xb81, + 0x5b9, + 0x5b9, + 0x505, + 0x505, + 0x7d7, + 0x7d7, + 0xa9f, + 0xa9f, + 0xaa6, + 0xaa6, + 0x8b8, + 0x8b8, + 0x9d0, + 0x9d0, + 0x4b, + 0x4b, + 0x9c, + 0x9c, + 0xbb8, + 0xbb8, + 0xb5f, + 0xb5f, + 0xba4, + 0xba4, + 0x368, + 0x368, + 0xa7d, + 0xa7d, + 0x636, + 0x636, + 0x8a2, + 0x8a2, + 0x25a, + 0x25a, + 0x736, + 0x736, + 0x309, + 0x309, + 0x93, + 0x93, + 0x87a, + 0x87a, + 0x9f7, + 0x9f7, + 0xf6, + 0xf6, + 0x68c, + 0x68c, + 0x6db, + 0x6db, + 0x1cc, + 0x1cc, + 0x123, + 0x123, + 0xeb, + 0xeb, + 0xc50, + 0xc50, + 0xab6, + 0xab6, + 0xb5b, + 0xb5b, + 0xc98, + 0xc98, + 0x6f3, + 0x6f3, + 0x99a, + 0x99a, + 0x4e3, + 0x4e3, + 0x9b6, + 0x9b6, + 0xad6, + 0xad6, + 0xb53, + 0xb53, + 0x44f, + 0x44f, + 0x4fb, + 0x4fb, + 0x4fb, + 0x4fb, + 0xa5c, + 0xa5c, + 0xa5c, + 0xa5c, + 0x429, + 0x429, + 0x429, + 0x429, + 0xb41, + 0xb41, + 0xb41, + 0xb41, + 0x2d5, + 0x2d5, + 0x2d5, + 0x2d5, + 0x5e4, + 0x5e4, + 0x5e4, + 0x5e4, + 0x940, + 0x940, + 0x940, + 0x940, + 0x18e, + 0x18e, + 0x18e, + 0x18e, + 0x3b7, + 0x3b7, + 0x3b7, + 0x3b7, + 0xf7, + 0xf7, + 0xf7, + 0xf7, + 0x58d, + 0x58d, + 0x58d, + 0x58d, + 0xc96, + 0xc96, + 0xc96, + 0xc96, + 0x9c3, + 0x9c3, + 0x9c3, + 0x9c3, + 0x10f, + 0x10f, + 0x10f, + 0x10f, + 0x5a, + 0x5a, + 0x5a, + 0x5a, + 0x355, + 0x355, + 0x355, + 0x355, + 0x744, + 0x744, + 
0x744, + 0x744, + 0xc83, + 0xc83, + 0xc83, + 0xc83, + 0x48a, + 0x48a, + 0x48a, + 0x48a, + 0x652, + 0x652, + 0x652, + 0x652, + 0x29a, + 0x29a, + 0x29a, + 0x29a, + 0x140, + 0x140, + 0x140, + 0x140, + 0x8, + 0x8, + 0x8, + 0x8, + 0xafd, + 0xafd, + 0xafd, + 0xafd, + 0x608, + 0x608, + 0x608, + 0x608, + 0x11a, + 0x11a, + 0x11a, + 0x11a, + 0x72e, + 0x72e, + 0x72e, + 0x72e, + 0x50d, + 0x50d, + 0x50d, + 0x50d, + 0x90a, + 0x90a, + 0x90a, + 0x90a, + 0x228, + 0x228, + 0x228, + 0x228, + 0xa75, + 0xa75, + 0xa75, + 0xa75, + 0x83a, + 0x83a, + 0x83a, + 0x83a, + 0x623, + 0xcd, + 0xb66, + 0x606, + 0xaa1, + 0xa25, + 0x908, + 0x2a9, + 0x82, + 0x642, + 0x74f, + 0x33d, + 0xb82, + 0xbf9, + 0x52d, + 0xac4, + 0x745, + 0x5c2, + 0x4b2, + 0x93f, + 0xc4b, + 0x6d8, + 0xa93, + 0xab, + 0xc37, + 0xbe2, + 0x773, + 0x72c, + 0x5ed, + 0x167, + 0x2f6, + 0x5a1, +}; + +static const uint16_t L_kyber_aarch64_zetas_inv_qinv[] = { + 0xa5a5, + 0xa5a5, + 0x440f, + 0x440f, + 0xe1b4, + 0xe1b4, + 0xa243, + 0xa243, + 0x4f22, + 0x4f22, + 0x901d, + 0x901d, + 0x5d34, + 0x5d34, + 0x846c, + 0x846c, + 0x4423, + 0x4423, + 0xd566, + 0xd566, + 0xa556, + 0xa556, + 0x57e6, + 0x57e6, + 0x4ee7, + 0x4ee7, + 0x1efe, + 0x1efe, + 0x53fa, + 0x53fa, + 0xd7a1, + 0xd7a1, + 0xc77b, + 0xc77b, + 0xbda3, + 0xbda3, + 0x2b25, + 0x2b25, + 0xa16a, + 0xa16a, + 0x3a37, + 0x3a37, + 0xd53f, + 0xd53f, + 0x1888, + 0x1888, + 0x51bf, + 0x51bf, + 0x7e81, + 0x7e81, + 0xa0b9, + 0xa0b9, + 0xc405, + 0xc405, + 0x1cd7, + 0x1cd7, + 0xf79f, + 0xf79f, + 0x9ca6, + 0x9ca6, + 0xb0b8, + 0xb0b8, + 0x79d0, + 0x79d0, + 0x314b, + 0x314b, + 0x149c, + 0x149c, + 0xb3b8, + 0xb3b8, + 0x385f, + 0x385f, + 0xb7a4, + 0xb7a4, + 0xbb68, + 0xbb68, + 0xb17d, + 0xb17d, + 0x4836, + 0x4836, + 0xcea2, + 0xcea2, + 0x705a, + 0x705a, + 0x4936, + 0x4936, + 0x8e09, + 0x8e09, + 0x8993, + 0x8993, + 0xd67a, + 0xd67a, + 0x7ef7, + 0x7ef7, + 0x82f6, + 0x82f6, + 0xea8c, + 0xea8c, + 0xe7db, + 0xe7db, + 0xa5cc, + 0xa5cc, + 0x3a23, + 0x3a23, + 0x11eb, + 0x11eb, + 0xfc50, + 0xfc50, + 0xccb6, + 0xccb6, + 0x6c5b, + 0x6c5b, + 0x5498, + 0x5498, + 0xaff3, + 0xaff3, + 0x379a, + 0x379a, + 0x7de3, + 0x7de3, + 0xcbb6, + 0xcbb6, + 0x2cd6, + 0x2cd6, + 0xd453, + 0xd453, + 0x14f, + 0x14f, + 0x45fb, + 0x45fb, + 0x45fb, + 0x45fb, + 0x5e5c, + 0x5e5c, + 0x5e5c, + 0x5e5c, + 0xef29, + 0xef29, + 0xef29, + 0xef29, + 0xbe41, + 0xbe41, + 0xbe41, + 0xbe41, + 0x31d5, + 0x31d5, + 0x31d5, + 0x31d5, + 0x71e4, + 0x71e4, + 0x71e4, + 0x71e4, + 0xc940, + 0xc940, + 0xc940, + 0xc940, + 0xcb8e, + 0xcb8e, + 0xcb8e, + 0xcb8e, + 0xb8b7, + 0xb8b7, + 0xb8b7, + 0xb8b7, + 0x75f7, + 0x75f7, + 0x75f7, + 0x75f7, + 0xdc8d, + 0xdc8d, + 0xdc8d, + 0xdc8d, + 0x6e96, + 0x6e96, + 0x6e96, + 0x6e96, + 0x22c3, + 0x22c3, + 0x22c3, + 0x22c3, + 0x3e0f, + 0x3e0f, + 0x3e0f, + 0x3e0f, + 0x6e5a, + 0x6e5a, + 0x6e5a, + 0x6e5a, + 0xb255, + 0xb255, + 0xb255, + 0xb255, + 0x9344, + 0x9344, + 0x9344, + 0x9344, + 0x6583, + 0x6583, + 0x6583, + 0x6583, + 0x28a, + 0x28a, + 0x28a, + 0x28a, + 0xdc52, + 0xdc52, + 0xdc52, + 0xdc52, + 0x309a, + 0x309a, + 0x309a, + 0x309a, + 0xc140, + 0xc140, + 0xc140, + 0xc140, + 0x9808, + 0x9808, + 0x9808, + 0x9808, + 0x31fd, + 0x31fd, + 0x31fd, + 0x31fd, + 0x9e08, + 0x9e08, + 0x9e08, + 0x9e08, + 0xaf1a, + 0xaf1a, + 0xaf1a, + 0xaf1a, + 0xb12e, + 0xb12e, + 0xb12e, + 0xb12e, + 0x5c0d, + 0x5c0d, + 0x5c0d, + 0x5c0d, + 0x870a, + 0x870a, + 0x870a, + 0x870a, + 0xfa28, + 0xfa28, + 0xfa28, + 0xfa28, + 0x1975, + 0x1975, + 0x1975, + 0x1975, + 0x163a, + 0x163a, + 0x163a, + 0x163a, + 0x3f23, + 0x97cd, + 0xdd66, + 0xb806, + 0xdda1, + 0x2925, + 0xa108, + 0x6da9, + 0x6682, + 0xac42, + 
0x44f, + 0xea3d, + 0x7182, + 0x66f9, + 0xbc2d, + 0x16c4, + 0x8645, + 0x2bc2, + 0xfab2, + 0xd63f, + 0x3d4b, + 0xed8, + 0x9393, + 0x51ab, + 0x4137, + 0x91e2, + 0x3073, + 0xcb2c, + 0xfced, + 0xc667, + 0x84f6, + 0xd8a1, +}; + +void kyber_invntt(sword16* r) +{ + __asm__ __volatile__ ( +#ifndef __APPLE__ + "adrp x2, %[L_kyber_aarch64_zetas_inv]\n\t" + "add x2, x2, :lo12:%[L_kyber_aarch64_zetas_inv]\n\t" +#else + "adrp x2, %[L_kyber_aarch64_zetas_inv]@PAGE\n\t" + "add x2, x2, %[L_kyber_aarch64_zetas_inv]@PAGEOFF\n\t" +#endif /* __APPLE__ */ +#ifndef __APPLE__ + "adrp x3, %[L_kyber_aarch64_zetas_inv_qinv]\n\t" + "add x3, x3, :lo12:%[L_kyber_aarch64_zetas_inv_qinv]\n\t" +#else + "adrp x3, %[L_kyber_aarch64_zetas_inv_qinv]@PAGE\n\t" + "add x3, x3, %[L_kyber_aarch64_zetas_inv_qinv]@PAGEOFF\n\t" +#endif /* __APPLE__ */ +#ifndef __APPLE__ + "adrp x4, %[L_kyber_aarch64_consts]\n\t" + "add x4, x4, :lo12:%[L_kyber_aarch64_consts]\n\t" +#else + "adrp x4, %[L_kyber_aarch64_consts]@PAGE\n\t" + "add x4, x4, %[L_kyber_aarch64_consts]@PAGEOFF\n\t" +#endif /* __APPLE__ */ + "add x1, %x[r], #0x100\n\t" + "ldr q8, [x4]\n\t" + "ldp q9, q10, [%x[r]]\n\t" + "ldp q11, q12, [%x[r], #32]\n\t" + "ldp q13, q14, [%x[r], #64]\n\t" + "ldp q15, q16, [%x[r], #96]\n\t" + "ldp q17, q18, [%x[r], #128]\n\t" + "ldp q19, q20, [%x[r], #160]\n\t" + "ldp q21, q22, [%x[r], #192]\n\t" + "ldp q23, q24, [%x[r], #224]\n\t" + "mov v25.16b, v9.16b\n\t" + "trn1 v9.2d, v9.2d, v10.2d\n\t" + "trn2 v10.2d, v25.2d, v10.2d\n\t" + "mov v25.16b, v9.16b\n\t" + "trn1 v9.4s, v9.4s, v10.4s\n\t" + "trn2 v10.4s, v25.4s, v10.4s\n\t" + "mov v25.16b, v11.16b\n\t" + "trn1 v11.2d, v11.2d, v12.2d\n\t" + "trn2 v12.2d, v25.2d, v12.2d\n\t" + "mov v25.16b, v11.16b\n\t" + "trn1 v11.4s, v11.4s, v12.4s\n\t" + "trn2 v12.4s, v25.4s, v12.4s\n\t" + "mov v25.16b, v13.16b\n\t" + "trn1 v13.2d, v13.2d, v14.2d\n\t" + "trn2 v14.2d, v25.2d, v14.2d\n\t" + "mov v25.16b, v13.16b\n\t" + "trn1 v13.4s, v13.4s, v14.4s\n\t" + "trn2 v14.4s, v25.4s, v14.4s\n\t" + "mov v25.16b, v15.16b\n\t" + "trn1 v15.2d, v15.2d, v16.2d\n\t" + "trn2 v16.2d, v25.2d, v16.2d\n\t" + "mov v25.16b, v15.16b\n\t" + "trn1 v15.4s, v15.4s, v16.4s\n\t" + "trn2 v16.4s, v25.4s, v16.4s\n\t" + "mov v25.16b, v17.16b\n\t" + "trn1 v17.2d, v17.2d, v18.2d\n\t" + "trn2 v18.2d, v25.2d, v18.2d\n\t" + "mov v25.16b, v17.16b\n\t" + "trn1 v17.4s, v17.4s, v18.4s\n\t" + "trn2 v18.4s, v25.4s, v18.4s\n\t" + "mov v25.16b, v19.16b\n\t" + "trn1 v19.2d, v19.2d, v20.2d\n\t" + "trn2 v20.2d, v25.2d, v20.2d\n\t" + "mov v25.16b, v19.16b\n\t" + "trn1 v19.4s, v19.4s, v20.4s\n\t" + "trn2 v20.4s, v25.4s, v20.4s\n\t" + "mov v25.16b, v21.16b\n\t" + "trn1 v21.2d, v21.2d, v22.2d\n\t" + "trn2 v22.2d, v25.2d, v22.2d\n\t" + "mov v25.16b, v21.16b\n\t" + "trn1 v21.4s, v21.4s, v22.4s\n\t" + "trn2 v22.4s, v25.4s, v22.4s\n\t" + "mov v25.16b, v23.16b\n\t" + "trn1 v23.2d, v23.2d, v24.2d\n\t" + "trn2 v24.2d, v25.2d, v24.2d\n\t" + "mov v25.16b, v23.16b\n\t" + "trn1 v23.4s, v23.4s, v24.4s\n\t" + "trn2 v24.4s, v25.4s, v24.4s\n\t" + "ldr q0, [x2]\n\t" + "ldr q1, [x2, #16]\n\t" + "ldr q2, [x3]\n\t" + "ldr q3, [x3, #16]\n\t" + "sub v26.8h, v9.8h, v10.8h\n\t" + "sub v28.8h, v11.8h, v12.8h\n\t" + "add v9.8h, v9.8h, v10.8h\n\t" + "add v11.8h, v11.8h, v12.8h\n\t" + "mul v25.8h, v26.8h, v2.8h\n\t" + "mul v27.8h, v28.8h, v3.8h\n\t" + "sqrdmulh v10.8h, v26.8h, v0.8h\n\t" + "sqrdmulh v12.8h, v28.8h, v1.8h\n\t" + "sqrdmlsh v10.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v12.8h, v27.8h, v8.h[0]\n\t" + "sshr v10.8h, v10.8h, #1\n\t" + "sshr v12.8h, v12.8h, #1\n\t" + "ldr q0, [x2, #32]\n\t" 
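+ /* Inverse NTT (Gentleman-Sande) butterflies: each pair first takes the difference (sub) and keeps the sum in place (add), then Montgomery-multiplies the difference by the inverse zeta via the mul/sqrdmulh/sqrdmlsh/sshr #1 sequence, with L_kyber_aarch64_zetas_inv supplying zeta and L_kyber_aarch64_zetas_inv_qinv supplying zeta*qinv for the low-product step (q = 3329 in v8.h[0]). */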
+ "ldr q1, [x2, #48]\n\t" + "ldr q2, [x3, #32]\n\t" + "ldr q3, [x3, #48]\n\t" + "sub v26.8h, v13.8h, v14.8h\n\t" + "sub v28.8h, v15.8h, v16.8h\n\t" + "add v13.8h, v13.8h, v14.8h\n\t" + "add v15.8h, v15.8h, v16.8h\n\t" + "mul v25.8h, v26.8h, v2.8h\n\t" + "mul v27.8h, v28.8h, v3.8h\n\t" + "sqrdmulh v14.8h, v26.8h, v0.8h\n\t" + "sqrdmulh v16.8h, v28.8h, v1.8h\n\t" + "sqrdmlsh v14.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v16.8h, v27.8h, v8.h[0]\n\t" + "sshr v14.8h, v14.8h, #1\n\t" + "sshr v16.8h, v16.8h, #1\n\t" + "ldr q0, [x2, #64]\n\t" + "ldr q1, [x2, #80]\n\t" + "ldr q2, [x3, #64]\n\t" + "ldr q3, [x3, #80]\n\t" + "sub v26.8h, v17.8h, v18.8h\n\t" + "sub v28.8h, v19.8h, v20.8h\n\t" + "add v17.8h, v17.8h, v18.8h\n\t" + "add v19.8h, v19.8h, v20.8h\n\t" + "mul v25.8h, v26.8h, v2.8h\n\t" + "mul v27.8h, v28.8h, v3.8h\n\t" + "sqrdmulh v18.8h, v26.8h, v0.8h\n\t" + "sqrdmulh v20.8h, v28.8h, v1.8h\n\t" + "sqrdmlsh v18.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v20.8h, v27.8h, v8.h[0]\n\t" + "sshr v18.8h, v18.8h, #1\n\t" + "sshr v20.8h, v20.8h, #1\n\t" + "ldr q0, [x2, #96]\n\t" + "ldr q1, [x2, #112]\n\t" + "ldr q2, [x3, #96]\n\t" + "ldr q3, [x3, #112]\n\t" + "sub v26.8h, v21.8h, v22.8h\n\t" + "sub v28.8h, v23.8h, v24.8h\n\t" + "add v21.8h, v21.8h, v22.8h\n\t" + "add v23.8h, v23.8h, v24.8h\n\t" + "mul v25.8h, v26.8h, v2.8h\n\t" + "mul v27.8h, v28.8h, v3.8h\n\t" + "sqrdmulh v22.8h, v26.8h, v0.8h\n\t" + "sqrdmulh v24.8h, v28.8h, v1.8h\n\t" + "sqrdmlsh v22.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v24.8h, v27.8h, v8.h[0]\n\t" + "sshr v22.8h, v22.8h, #1\n\t" + "sshr v24.8h, v24.8h, #1\n\t" + "ldr q0, [x2, #256]\n\t" + "ldr q1, [x2, #272]\n\t" + "ldr q2, [x3, #256]\n\t" + "ldr q3, [x3, #272]\n\t" + "mov v25.16b, v9.16b\n\t" + "mov v26.16b, v11.16b\n\t" + "trn1 v9.4s, v9.4s, v10.4s\n\t" + "trn1 v11.4s, v11.4s, v12.4s\n\t" + "trn2 v10.4s, v25.4s, v10.4s\n\t" + "trn2 v12.4s, v26.4s, v12.4s\n\t" + "sub v26.8h, v9.8h, v10.8h\n\t" + "sub v28.8h, v11.8h, v12.8h\n\t" + "add v9.8h, v9.8h, v10.8h\n\t" + "add v11.8h, v11.8h, v12.8h\n\t" + "mul v25.8h, v26.8h, v2.8h\n\t" + "mul v27.8h, v28.8h, v3.8h\n\t" + "sqrdmulh v10.8h, v26.8h, v0.8h\n\t" + "sqrdmulh v12.8h, v28.8h, v1.8h\n\t" + "sqrdmlsh v10.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v12.8h, v27.8h, v8.h[0]\n\t" + "sshr v10.8h, v10.8h, #1\n\t" + "sshr v12.8h, v12.8h, #1\n\t" + "ldr q0, [x2, #288]\n\t" + "ldr q1, [x2, #304]\n\t" + "ldr q2, [x3, #288]\n\t" + "ldr q3, [x3, #304]\n\t" + "mov v25.16b, v13.16b\n\t" + "mov v26.16b, v15.16b\n\t" + "trn1 v13.4s, v13.4s, v14.4s\n\t" + "trn1 v15.4s, v15.4s, v16.4s\n\t" + "trn2 v14.4s, v25.4s, v14.4s\n\t" + "trn2 v16.4s, v26.4s, v16.4s\n\t" + "sub v26.8h, v13.8h, v14.8h\n\t" + "sub v28.8h, v15.8h, v16.8h\n\t" + "add v13.8h, v13.8h, v14.8h\n\t" + "add v15.8h, v15.8h, v16.8h\n\t" + "mul v25.8h, v26.8h, v2.8h\n\t" + "mul v27.8h, v28.8h, v3.8h\n\t" + "sqrdmulh v14.8h, v26.8h, v0.8h\n\t" + "sqrdmulh v16.8h, v28.8h, v1.8h\n\t" + "sqrdmlsh v14.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v16.8h, v27.8h, v8.h[0]\n\t" + "sshr v14.8h, v14.8h, #1\n\t" + "sshr v16.8h, v16.8h, #1\n\t" + "ldr q0, [x2, #320]\n\t" + "ldr q1, [x2, #336]\n\t" + "ldr q2, [x3, #320]\n\t" + "ldr q3, [x3, #336]\n\t" + "mov v25.16b, v17.16b\n\t" + "mov v26.16b, v19.16b\n\t" + "trn1 v17.4s, v17.4s, v18.4s\n\t" + "trn1 v19.4s, v19.4s, v20.4s\n\t" + "trn2 v18.4s, v25.4s, v18.4s\n\t" + "trn2 v20.4s, v26.4s, v20.4s\n\t" + "sub v26.8h, v17.8h, v18.8h\n\t" + "sub v28.8h, v19.8h, v20.8h\n\t" + "add v17.8h, v17.8h, v18.8h\n\t" + "add v19.8h, v19.8h, v20.8h\n\t" + "mul v25.8h, v26.8h, v2.8h\n\t" + "mul 
v27.8h, v28.8h, v3.8h\n\t" + "sqrdmulh v18.8h, v26.8h, v0.8h\n\t" + "sqrdmulh v20.8h, v28.8h, v1.8h\n\t" + "sqrdmlsh v18.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v20.8h, v27.8h, v8.h[0]\n\t" + "sshr v18.8h, v18.8h, #1\n\t" + "sshr v20.8h, v20.8h, #1\n\t" + "ldr q0, [x2, #352]\n\t" + "ldr q1, [x2, #368]\n\t" + "ldr q2, [x3, #352]\n\t" + "ldr q3, [x3, #368]\n\t" + "mov v25.16b, v21.16b\n\t" + "mov v26.16b, v23.16b\n\t" + "trn1 v21.4s, v21.4s, v22.4s\n\t" + "trn1 v23.4s, v23.4s, v24.4s\n\t" + "trn2 v22.4s, v25.4s, v22.4s\n\t" + "trn2 v24.4s, v26.4s, v24.4s\n\t" + "sub v26.8h, v21.8h, v22.8h\n\t" + "sub v28.8h, v23.8h, v24.8h\n\t" + "add v21.8h, v21.8h, v22.8h\n\t" + "add v23.8h, v23.8h, v24.8h\n\t" + "mul v25.8h, v26.8h, v2.8h\n\t" + "mul v27.8h, v28.8h, v3.8h\n\t" + "sqrdmulh v22.8h, v26.8h, v0.8h\n\t" + "sqrdmulh v24.8h, v28.8h, v1.8h\n\t" + "sqrdmlsh v22.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v24.8h, v27.8h, v8.h[0]\n\t" + "sshr v22.8h, v22.8h, #1\n\t" + "sshr v24.8h, v24.8h, #1\n\t" + "ldr q0, [x2, #512]\n\t" + "ldr q2, [x3, #512]\n\t" + "mov v25.16b, v9.16b\n\t" + "mov v26.16b, v11.16b\n\t" + "trn1 v9.2d, v9.2d, v10.2d\n\t" + "trn1 v11.2d, v11.2d, v12.2d\n\t" + "trn2 v10.2d, v25.2d, v10.2d\n\t" + "trn2 v12.2d, v26.2d, v12.2d\n\t" + "sub v26.8h, v9.8h, v10.8h\n\t" + "sub v28.8h, v11.8h, v12.8h\n\t" + "add v9.8h, v9.8h, v10.8h\n\t" + "add v11.8h, v11.8h, v12.8h\n\t" + "mul v25.8h, v26.8h, v2.h[0]\n\t" + "mul v27.8h, v28.8h, v2.h[1]\n\t" + "sqrdmulh v10.8h, v26.8h, v0.h[0]\n\t" + "sqrdmulh v12.8h, v28.8h, v0.h[1]\n\t" + "sqrdmlsh v10.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v12.8h, v27.8h, v8.h[0]\n\t" + "sshr v10.8h, v10.8h, #1\n\t" + "sshr v12.8h, v12.8h, #1\n\t" + "mov v25.16b, v13.16b\n\t" + "mov v26.16b, v15.16b\n\t" + "trn1 v13.2d, v13.2d, v14.2d\n\t" + "trn1 v15.2d, v15.2d, v16.2d\n\t" + "trn2 v14.2d, v25.2d, v14.2d\n\t" + "trn2 v16.2d, v26.2d, v16.2d\n\t" + "sub v26.8h, v13.8h, v14.8h\n\t" + "sub v28.8h, v15.8h, v16.8h\n\t" + "add v13.8h, v13.8h, v14.8h\n\t" + "add v15.8h, v15.8h, v16.8h\n\t" + "mul v25.8h, v26.8h, v2.h[2]\n\t" + "mul v27.8h, v28.8h, v2.h[3]\n\t" + "sqrdmulh v14.8h, v26.8h, v0.h[2]\n\t" + "sqrdmulh v16.8h, v28.8h, v0.h[3]\n\t" + "sqrdmlsh v14.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v16.8h, v27.8h, v8.h[0]\n\t" + "sshr v14.8h, v14.8h, #1\n\t" + "sshr v16.8h, v16.8h, #1\n\t" + "mov v25.16b, v17.16b\n\t" + "mov v26.16b, v19.16b\n\t" + "trn1 v17.2d, v17.2d, v18.2d\n\t" + "trn1 v19.2d, v19.2d, v20.2d\n\t" + "trn2 v18.2d, v25.2d, v18.2d\n\t" + "trn2 v20.2d, v26.2d, v20.2d\n\t" + "sub v26.8h, v17.8h, v18.8h\n\t" + "sub v28.8h, v19.8h, v20.8h\n\t" + "add v17.8h, v17.8h, v18.8h\n\t" + "add v19.8h, v19.8h, v20.8h\n\t" + "mul v25.8h, v26.8h, v2.h[4]\n\t" + "mul v27.8h, v28.8h, v2.h[5]\n\t" + "sqrdmulh v18.8h, v26.8h, v0.h[4]\n\t" + "sqrdmulh v20.8h, v28.8h, v0.h[5]\n\t" + "sqrdmlsh v18.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v20.8h, v27.8h, v8.h[0]\n\t" + "sshr v18.8h, v18.8h, #1\n\t" + "sshr v20.8h, v20.8h, #1\n\t" + "mov v25.16b, v21.16b\n\t" + "mov v26.16b, v23.16b\n\t" + "trn1 v21.2d, v21.2d, v22.2d\n\t" + "trn1 v23.2d, v23.2d, v24.2d\n\t" + "trn2 v22.2d, v25.2d, v22.2d\n\t" + "trn2 v24.2d, v26.2d, v24.2d\n\t" + "sub v26.8h, v21.8h, v22.8h\n\t" + "sub v28.8h, v23.8h, v24.8h\n\t" + "add v21.8h, v21.8h, v22.8h\n\t" + "add v23.8h, v23.8h, v24.8h\n\t" + "mul v25.8h, v26.8h, v2.h[6]\n\t" + "mul v27.8h, v28.8h, v2.h[7]\n\t" + "sqrdmulh v22.8h, v26.8h, v0.h[6]\n\t" + "sqrdmulh v24.8h, v28.8h, v0.h[7]\n\t" + "sqrdmlsh v22.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v24.8h, v27.8h, v8.h[0]\n\t" + "sshr 
v22.8h, v22.8h, #1\n\t" + "sshr v24.8h, v24.8h, #1\n\t" + "sqdmulh v25.8h, v9.8h, v8.h[2]\n\t" + "sqdmulh v26.8h, v11.8h, v8.h[2]\n\t" + "sshr v25.8h, v25.8h, #11\n\t" + "sshr v26.8h, v26.8h, #11\n\t" + "mls v9.8h, v25.8h, v8.h[0]\n\t" + "mls v11.8h, v26.8h, v8.h[0]\n\t" + "sqdmulh v25.8h, v13.8h, v8.h[2]\n\t" + "sqdmulh v26.8h, v15.8h, v8.h[2]\n\t" + "sshr v25.8h, v25.8h, #11\n\t" + "sshr v26.8h, v26.8h, #11\n\t" + "mls v13.8h, v25.8h, v8.h[0]\n\t" + "mls v15.8h, v26.8h, v8.h[0]\n\t" + "sqdmulh v25.8h, v17.8h, v8.h[2]\n\t" + "sqdmulh v26.8h, v19.8h, v8.h[2]\n\t" + "sshr v25.8h, v25.8h, #11\n\t" + "sshr v26.8h, v26.8h, #11\n\t" + "mls v17.8h, v25.8h, v8.h[0]\n\t" + "mls v19.8h, v26.8h, v8.h[0]\n\t" + "sqdmulh v25.8h, v21.8h, v8.h[2]\n\t" + "sqdmulh v26.8h, v23.8h, v8.h[2]\n\t" + "sshr v25.8h, v25.8h, #11\n\t" + "sshr v26.8h, v26.8h, #11\n\t" + "mls v21.8h, v25.8h, v8.h[0]\n\t" + "mls v23.8h, v26.8h, v8.h[0]\n\t" + "stp q9, q10, [%x[r]]\n\t" + "stp q11, q12, [%x[r], #32]\n\t" + "stp q13, q14, [%x[r], #64]\n\t" + "stp q15, q16, [%x[r], #96]\n\t" + "stp q17, q18, [%x[r], #128]\n\t" + "stp q19, q20, [%x[r], #160]\n\t" + "stp q21, q22, [%x[r], #192]\n\t" + "stp q23, q24, [%x[r], #224]\n\t" + "ldp q9, q10, [x1]\n\t" + "ldp q11, q12, [x1, #32]\n\t" + "ldp q13, q14, [x1, #64]\n\t" + "ldp q15, q16, [x1, #96]\n\t" + "ldp q17, q18, [x1, #128]\n\t" + "ldp q19, q20, [x1, #160]\n\t" + "ldp q21, q22, [x1, #192]\n\t" + "ldp q23, q24, [x1, #224]\n\t" + "mov v25.16b, v9.16b\n\t" + "trn1 v9.2d, v9.2d, v10.2d\n\t" + "trn2 v10.2d, v25.2d, v10.2d\n\t" + "mov v25.16b, v9.16b\n\t" + "trn1 v9.4s, v9.4s, v10.4s\n\t" + "trn2 v10.4s, v25.4s, v10.4s\n\t" + "mov v25.16b, v11.16b\n\t" + "trn1 v11.2d, v11.2d, v12.2d\n\t" + "trn2 v12.2d, v25.2d, v12.2d\n\t" + "mov v25.16b, v11.16b\n\t" + "trn1 v11.4s, v11.4s, v12.4s\n\t" + "trn2 v12.4s, v25.4s, v12.4s\n\t" + "mov v25.16b, v13.16b\n\t" + "trn1 v13.2d, v13.2d, v14.2d\n\t" + "trn2 v14.2d, v25.2d, v14.2d\n\t" + "mov v25.16b, v13.16b\n\t" + "trn1 v13.4s, v13.4s, v14.4s\n\t" + "trn2 v14.4s, v25.4s, v14.4s\n\t" + "mov v25.16b, v15.16b\n\t" + "trn1 v15.2d, v15.2d, v16.2d\n\t" + "trn2 v16.2d, v25.2d, v16.2d\n\t" + "mov v25.16b, v15.16b\n\t" + "trn1 v15.4s, v15.4s, v16.4s\n\t" + "trn2 v16.4s, v25.4s, v16.4s\n\t" + "mov v25.16b, v17.16b\n\t" + "trn1 v17.2d, v17.2d, v18.2d\n\t" + "trn2 v18.2d, v25.2d, v18.2d\n\t" + "mov v25.16b, v17.16b\n\t" + "trn1 v17.4s, v17.4s, v18.4s\n\t" + "trn2 v18.4s, v25.4s, v18.4s\n\t" + "mov v25.16b, v19.16b\n\t" + "trn1 v19.2d, v19.2d, v20.2d\n\t" + "trn2 v20.2d, v25.2d, v20.2d\n\t" + "mov v25.16b, v19.16b\n\t" + "trn1 v19.4s, v19.4s, v20.4s\n\t" + "trn2 v20.4s, v25.4s, v20.4s\n\t" + "mov v25.16b, v21.16b\n\t" + "trn1 v21.2d, v21.2d, v22.2d\n\t" + "trn2 v22.2d, v25.2d, v22.2d\n\t" + "mov v25.16b, v21.16b\n\t" + "trn1 v21.4s, v21.4s, v22.4s\n\t" + "trn2 v22.4s, v25.4s, v22.4s\n\t" + "mov v25.16b, v23.16b\n\t" + "trn1 v23.2d, v23.2d, v24.2d\n\t" + "trn2 v24.2d, v25.2d, v24.2d\n\t" + "mov v25.16b, v23.16b\n\t" + "trn1 v23.4s, v23.4s, v24.4s\n\t" + "trn2 v24.4s, v25.4s, v24.4s\n\t" + "ldr q0, [x2, #128]\n\t" + "ldr q1, [x2, #144]\n\t" + "ldr q2, [x3, #128]\n\t" + "ldr q3, [x3, #144]\n\t" + "sub v26.8h, v9.8h, v10.8h\n\t" + "sub v28.8h, v11.8h, v12.8h\n\t" + "add v9.8h, v9.8h, v10.8h\n\t" + "add v11.8h, v11.8h, v12.8h\n\t" + "mul v25.8h, v26.8h, v2.8h\n\t" + "mul v27.8h, v28.8h, v3.8h\n\t" + "sqrdmulh v10.8h, v26.8h, v0.8h\n\t" + "sqrdmulh v12.8h, v28.8h, v1.8h\n\t" + "sqrdmlsh v10.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v12.8h, v27.8h, v8.h[0]\n\t" + "sshr 
v10.8h, v10.8h, #1\n\t" + "sshr v12.8h, v12.8h, #1\n\t" + "ldr q0, [x2, #160]\n\t" + "ldr q1, [x2, #176]\n\t" + "ldr q2, [x3, #160]\n\t" + "ldr q3, [x3, #176]\n\t" + "sub v26.8h, v13.8h, v14.8h\n\t" + "sub v28.8h, v15.8h, v16.8h\n\t" + "add v13.8h, v13.8h, v14.8h\n\t" + "add v15.8h, v15.8h, v16.8h\n\t" + "mul v25.8h, v26.8h, v2.8h\n\t" + "mul v27.8h, v28.8h, v3.8h\n\t" + "sqrdmulh v14.8h, v26.8h, v0.8h\n\t" + "sqrdmulh v16.8h, v28.8h, v1.8h\n\t" + "sqrdmlsh v14.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v16.8h, v27.8h, v8.h[0]\n\t" + "sshr v14.8h, v14.8h, #1\n\t" + "sshr v16.8h, v16.8h, #1\n\t" + "ldr q0, [x2, #192]\n\t" + "ldr q1, [x2, #208]\n\t" + "ldr q2, [x3, #192]\n\t" + "ldr q3, [x3, #208]\n\t" + "sub v26.8h, v17.8h, v18.8h\n\t" + "sub v28.8h, v19.8h, v20.8h\n\t" + "add v17.8h, v17.8h, v18.8h\n\t" + "add v19.8h, v19.8h, v20.8h\n\t" + "mul v25.8h, v26.8h, v2.8h\n\t" + "mul v27.8h, v28.8h, v3.8h\n\t" + "sqrdmulh v18.8h, v26.8h, v0.8h\n\t" + "sqrdmulh v20.8h, v28.8h, v1.8h\n\t" + "sqrdmlsh v18.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v20.8h, v27.8h, v8.h[0]\n\t" + "sshr v18.8h, v18.8h, #1\n\t" + "sshr v20.8h, v20.8h, #1\n\t" + "ldr q0, [x2, #224]\n\t" + "ldr q1, [x2, #240]\n\t" + "ldr q2, [x3, #224]\n\t" + "ldr q3, [x3, #240]\n\t" + "sub v26.8h, v21.8h, v22.8h\n\t" + "sub v28.8h, v23.8h, v24.8h\n\t" + "add v21.8h, v21.8h, v22.8h\n\t" + "add v23.8h, v23.8h, v24.8h\n\t" + "mul v25.8h, v26.8h, v2.8h\n\t" + "mul v27.8h, v28.8h, v3.8h\n\t" + "sqrdmulh v22.8h, v26.8h, v0.8h\n\t" + "sqrdmulh v24.8h, v28.8h, v1.8h\n\t" + "sqrdmlsh v22.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v24.8h, v27.8h, v8.h[0]\n\t" + "sshr v22.8h, v22.8h, #1\n\t" + "sshr v24.8h, v24.8h, #1\n\t" + "ldr q0, [x2, #384]\n\t" + "ldr q1, [x2, #400]\n\t" + "ldr q2, [x3, #384]\n\t" + "ldr q3, [x3, #400]\n\t" + "mov v25.16b, v9.16b\n\t" + "mov v26.16b, v11.16b\n\t" + "trn1 v9.4s, v9.4s, v10.4s\n\t" + "trn1 v11.4s, v11.4s, v12.4s\n\t" + "trn2 v10.4s, v25.4s, v10.4s\n\t" + "trn2 v12.4s, v26.4s, v12.4s\n\t" + "sub v26.8h, v9.8h, v10.8h\n\t" + "sub v28.8h, v11.8h, v12.8h\n\t" + "add v9.8h, v9.8h, v10.8h\n\t" + "add v11.8h, v11.8h, v12.8h\n\t" + "mul v25.8h, v26.8h, v2.8h\n\t" + "mul v27.8h, v28.8h, v3.8h\n\t" + "sqrdmulh v10.8h, v26.8h, v0.8h\n\t" + "sqrdmulh v12.8h, v28.8h, v1.8h\n\t" + "sqrdmlsh v10.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v12.8h, v27.8h, v8.h[0]\n\t" + "sshr v10.8h, v10.8h, #1\n\t" + "sshr v12.8h, v12.8h, #1\n\t" + "ldr q0, [x2, #416]\n\t" + "ldr q1, [x2, #432]\n\t" + "ldr q2, [x3, #416]\n\t" + "ldr q3, [x3, #432]\n\t" + "mov v25.16b, v13.16b\n\t" + "mov v26.16b, v15.16b\n\t" + "trn1 v13.4s, v13.4s, v14.4s\n\t" + "trn1 v15.4s, v15.4s, v16.4s\n\t" + "trn2 v14.4s, v25.4s, v14.4s\n\t" + "trn2 v16.4s, v26.4s, v16.4s\n\t" + "sub v26.8h, v13.8h, v14.8h\n\t" + "sub v28.8h, v15.8h, v16.8h\n\t" + "add v13.8h, v13.8h, v14.8h\n\t" + "add v15.8h, v15.8h, v16.8h\n\t" + "mul v25.8h, v26.8h, v2.8h\n\t" + "mul v27.8h, v28.8h, v3.8h\n\t" + "sqrdmulh v14.8h, v26.8h, v0.8h\n\t" + "sqrdmulh v16.8h, v28.8h, v1.8h\n\t" + "sqrdmlsh v14.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v16.8h, v27.8h, v8.h[0]\n\t" + "sshr v14.8h, v14.8h, #1\n\t" + "sshr v16.8h, v16.8h, #1\n\t" + "ldr q0, [x2, #448]\n\t" + "ldr q1, [x2, #464]\n\t" + "ldr q2, [x3, #448]\n\t" + "ldr q3, [x3, #464]\n\t" + "mov v25.16b, v17.16b\n\t" + "mov v26.16b, v19.16b\n\t" + "trn1 v17.4s, v17.4s, v18.4s\n\t" + "trn1 v19.4s, v19.4s, v20.4s\n\t" + "trn2 v18.4s, v25.4s, v18.4s\n\t" + "trn2 v20.4s, v26.4s, v20.4s\n\t" + "sub v26.8h, v17.8h, v18.8h\n\t" + "sub v28.8h, v19.8h, v20.8h\n\t" + "add v17.8h, 
v17.8h, v18.8h\n\t" + "add v19.8h, v19.8h, v20.8h\n\t" + "mul v25.8h, v26.8h, v2.8h\n\t" + "mul v27.8h, v28.8h, v3.8h\n\t" + "sqrdmulh v18.8h, v26.8h, v0.8h\n\t" + "sqrdmulh v20.8h, v28.8h, v1.8h\n\t" + "sqrdmlsh v18.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v20.8h, v27.8h, v8.h[0]\n\t" + "sshr v18.8h, v18.8h, #1\n\t" + "sshr v20.8h, v20.8h, #1\n\t" + "ldr q0, [x2, #480]\n\t" + "ldr q1, [x2, #496]\n\t" + "ldr q2, [x3, #480]\n\t" + "ldr q3, [x3, #496]\n\t" + "mov v25.16b, v21.16b\n\t" + "mov v26.16b, v23.16b\n\t" + "trn1 v21.4s, v21.4s, v22.4s\n\t" + "trn1 v23.4s, v23.4s, v24.4s\n\t" + "trn2 v22.4s, v25.4s, v22.4s\n\t" + "trn2 v24.4s, v26.4s, v24.4s\n\t" + "sub v26.8h, v21.8h, v22.8h\n\t" + "sub v28.8h, v23.8h, v24.8h\n\t" + "add v21.8h, v21.8h, v22.8h\n\t" + "add v23.8h, v23.8h, v24.8h\n\t" + "mul v25.8h, v26.8h, v2.8h\n\t" + "mul v27.8h, v28.8h, v3.8h\n\t" + "sqrdmulh v22.8h, v26.8h, v0.8h\n\t" + "sqrdmulh v24.8h, v28.8h, v1.8h\n\t" + "sqrdmlsh v22.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v24.8h, v27.8h, v8.h[0]\n\t" + "sshr v22.8h, v22.8h, #1\n\t" + "sshr v24.8h, v24.8h, #1\n\t" + "ldr q0, [x2, #528]\n\t" + "ldr q2, [x3, #528]\n\t" + "mov v25.16b, v9.16b\n\t" + "mov v26.16b, v11.16b\n\t" + "trn1 v9.2d, v9.2d, v10.2d\n\t" + "trn1 v11.2d, v11.2d, v12.2d\n\t" + "trn2 v10.2d, v25.2d, v10.2d\n\t" + "trn2 v12.2d, v26.2d, v12.2d\n\t" + "sub v26.8h, v9.8h, v10.8h\n\t" + "sub v28.8h, v11.8h, v12.8h\n\t" + "add v9.8h, v9.8h, v10.8h\n\t" + "add v11.8h, v11.8h, v12.8h\n\t" + "mul v25.8h, v26.8h, v2.h[0]\n\t" + "mul v27.8h, v28.8h, v2.h[1]\n\t" + "sqrdmulh v10.8h, v26.8h, v0.h[0]\n\t" + "sqrdmulh v12.8h, v28.8h, v0.h[1]\n\t" + "sqrdmlsh v10.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v12.8h, v27.8h, v8.h[0]\n\t" + "sshr v10.8h, v10.8h, #1\n\t" + "sshr v12.8h, v12.8h, #1\n\t" + "mov v25.16b, v13.16b\n\t" + "mov v26.16b, v15.16b\n\t" + "trn1 v13.2d, v13.2d, v14.2d\n\t" + "trn1 v15.2d, v15.2d, v16.2d\n\t" + "trn2 v14.2d, v25.2d, v14.2d\n\t" + "trn2 v16.2d, v26.2d, v16.2d\n\t" + "sub v26.8h, v13.8h, v14.8h\n\t" + "sub v28.8h, v15.8h, v16.8h\n\t" + "add v13.8h, v13.8h, v14.8h\n\t" + "add v15.8h, v15.8h, v16.8h\n\t" + "mul v25.8h, v26.8h, v2.h[2]\n\t" + "mul v27.8h, v28.8h, v2.h[3]\n\t" + "sqrdmulh v14.8h, v26.8h, v0.h[2]\n\t" + "sqrdmulh v16.8h, v28.8h, v0.h[3]\n\t" + "sqrdmlsh v14.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v16.8h, v27.8h, v8.h[0]\n\t" + "sshr v14.8h, v14.8h, #1\n\t" + "sshr v16.8h, v16.8h, #1\n\t" + "mov v25.16b, v17.16b\n\t" + "mov v26.16b, v19.16b\n\t" + "trn1 v17.2d, v17.2d, v18.2d\n\t" + "trn1 v19.2d, v19.2d, v20.2d\n\t" + "trn2 v18.2d, v25.2d, v18.2d\n\t" + "trn2 v20.2d, v26.2d, v20.2d\n\t" + "sub v26.8h, v17.8h, v18.8h\n\t" + "sub v28.8h, v19.8h, v20.8h\n\t" + "add v17.8h, v17.8h, v18.8h\n\t" + "add v19.8h, v19.8h, v20.8h\n\t" + "mul v25.8h, v26.8h, v2.h[4]\n\t" + "mul v27.8h, v28.8h, v2.h[5]\n\t" + "sqrdmulh v18.8h, v26.8h, v0.h[4]\n\t" + "sqrdmulh v20.8h, v28.8h, v0.h[5]\n\t" + "sqrdmlsh v18.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v20.8h, v27.8h, v8.h[0]\n\t" + "sshr v18.8h, v18.8h, #1\n\t" + "sshr v20.8h, v20.8h, #1\n\t" + "mov v25.16b, v21.16b\n\t" + "mov v26.16b, v23.16b\n\t" + "trn1 v21.2d, v21.2d, v22.2d\n\t" + "trn1 v23.2d, v23.2d, v24.2d\n\t" + "trn2 v22.2d, v25.2d, v22.2d\n\t" + "trn2 v24.2d, v26.2d, v24.2d\n\t" + "sub v26.8h, v21.8h, v22.8h\n\t" + "sub v28.8h, v23.8h, v24.8h\n\t" + "add v21.8h, v21.8h, v22.8h\n\t" + "add v23.8h, v23.8h, v24.8h\n\t" + "mul v25.8h, v26.8h, v2.h[6]\n\t" + "mul v27.8h, v28.8h, v2.h[7]\n\t" + "sqrdmulh v22.8h, v26.8h, v0.h[6]\n\t" + "sqrdmulh v24.8h, v28.8h, 
v0.h[7]\n\t" + "sqrdmlsh v22.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v24.8h, v27.8h, v8.h[0]\n\t" + "sshr v22.8h, v22.8h, #1\n\t" + "sshr v24.8h, v24.8h, #1\n\t" + "sqdmulh v25.8h, v9.8h, v8.h[2]\n\t" + "sqdmulh v26.8h, v11.8h, v8.h[2]\n\t" + "sshr v25.8h, v25.8h, #11\n\t" + "sshr v26.8h, v26.8h, #11\n\t" + "mls v9.8h, v25.8h, v8.h[0]\n\t" + "mls v11.8h, v26.8h, v8.h[0]\n\t" + "sqdmulh v25.8h, v13.8h, v8.h[2]\n\t" + "sqdmulh v26.8h, v15.8h, v8.h[2]\n\t" + "sshr v25.8h, v25.8h, #11\n\t" + "sshr v26.8h, v26.8h, #11\n\t" + "mls v13.8h, v25.8h, v8.h[0]\n\t" + "mls v15.8h, v26.8h, v8.h[0]\n\t" + "sqdmulh v25.8h, v17.8h, v8.h[2]\n\t" + "sqdmulh v26.8h, v19.8h, v8.h[2]\n\t" + "sshr v25.8h, v25.8h, #11\n\t" + "sshr v26.8h, v26.8h, #11\n\t" + "mls v17.8h, v25.8h, v8.h[0]\n\t" + "mls v19.8h, v26.8h, v8.h[0]\n\t" + "sqdmulh v25.8h, v21.8h, v8.h[2]\n\t" + "sqdmulh v26.8h, v23.8h, v8.h[2]\n\t" + "sshr v25.8h, v25.8h, #11\n\t" + "sshr v26.8h, v26.8h, #11\n\t" + "mls v21.8h, v25.8h, v8.h[0]\n\t" + "mls v23.8h, v26.8h, v8.h[0]\n\t" + "stp q9, q10, [x1]\n\t" + "stp q11, q12, [x1, #32]\n\t" + "stp q13, q14, [x1, #64]\n\t" + "stp q15, q16, [x1, #96]\n\t" + "stp q17, q18, [x1, #128]\n\t" + "stp q19, q20, [x1, #160]\n\t" + "stp q21, q22, [x1, #192]\n\t" + "stp q23, q24, [x1, #224]\n\t" + "ldr q4, [x2, #544]\n\t" + "ldr q5, [x2, #560]\n\t" + "ldr q6, [x3, #544]\n\t" + "ldr q7, [x3, #560]\n\t" + "ldr q9, [%x[r]]\n\t" + "ldr q10, [%x[r], #32]\n\t" + "ldr q11, [%x[r], #64]\n\t" + "ldr q12, [%x[r], #96]\n\t" + "ldr q13, [%x[r], #128]\n\t" + "ldr q14, [%x[r], #160]\n\t" + "ldr q15, [%x[r], #192]\n\t" + "ldr q16, [%x[r], #224]\n\t" + "ldr q17, [x1]\n\t" + "ldr q18, [x1, #32]\n\t" + "ldr q19, [x1, #64]\n\t" + "ldr q20, [x1, #96]\n\t" + "ldr q21, [x1, #128]\n\t" + "ldr q22, [x1, #160]\n\t" + "ldr q23, [x1, #192]\n\t" + "ldr q24, [x1, #224]\n\t" + "sub v26.8h, v9.8h, v10.8h\n\t" + "sub v28.8h, v11.8h, v12.8h\n\t" + "add v9.8h, v9.8h, v10.8h\n\t" + "add v11.8h, v11.8h, v12.8h\n\t" + "mul v25.8h, v26.8h, v6.h[0]\n\t" + "mul v27.8h, v28.8h, v6.h[1]\n\t" + "sqrdmulh v10.8h, v26.8h, v4.h[0]\n\t" + "sqrdmulh v12.8h, v28.8h, v4.h[1]\n\t" + "sqrdmlsh v10.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v12.8h, v27.8h, v8.h[0]\n\t" + "sshr v10.8h, v10.8h, #1\n\t" + "sshr v12.8h, v12.8h, #1\n\t" + "sub v26.8h, v13.8h, v14.8h\n\t" + "sub v28.8h, v15.8h, v16.8h\n\t" + "add v13.8h, v13.8h, v14.8h\n\t" + "add v15.8h, v15.8h, v16.8h\n\t" + "mul v25.8h, v26.8h, v6.h[2]\n\t" + "mul v27.8h, v28.8h, v6.h[3]\n\t" + "sqrdmulh v14.8h, v26.8h, v4.h[2]\n\t" + "sqrdmulh v16.8h, v28.8h, v4.h[3]\n\t" + "sqrdmlsh v14.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v16.8h, v27.8h, v8.h[0]\n\t" + "sshr v14.8h, v14.8h, #1\n\t" + "sshr v16.8h, v16.8h, #1\n\t" + "sub v26.8h, v17.8h, v18.8h\n\t" + "sub v28.8h, v19.8h, v20.8h\n\t" + "add v17.8h, v17.8h, v18.8h\n\t" + "add v19.8h, v19.8h, v20.8h\n\t" + "mul v25.8h, v26.8h, v6.h[4]\n\t" + "mul v27.8h, v28.8h, v6.h[5]\n\t" + "sqrdmulh v18.8h, v26.8h, v4.h[4]\n\t" + "sqrdmulh v20.8h, v28.8h, v4.h[5]\n\t" + "sqrdmlsh v18.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v20.8h, v27.8h, v8.h[0]\n\t" + "sshr v18.8h, v18.8h, #1\n\t" + "sshr v20.8h, v20.8h, #1\n\t" + "sub v26.8h, v21.8h, v22.8h\n\t" + "sub v28.8h, v23.8h, v24.8h\n\t" + "add v21.8h, v21.8h, v22.8h\n\t" + "add v23.8h, v23.8h, v24.8h\n\t" + "mul v25.8h, v26.8h, v6.h[6]\n\t" + "mul v27.8h, v28.8h, v6.h[7]\n\t" + "sqrdmulh v22.8h, v26.8h, v4.h[6]\n\t" + "sqrdmulh v24.8h, v28.8h, v4.h[7]\n\t" + "sqrdmlsh v22.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v24.8h, v27.8h, v8.h[0]\n\t" + "sshr v22.8h, 
v22.8h, #1\n\t" + "sshr v24.8h, v24.8h, #1\n\t" + "sub v26.8h, v9.8h, v11.8h\n\t" + "sub v28.8h, v10.8h, v12.8h\n\t" + "add v9.8h, v9.8h, v11.8h\n\t" + "add v10.8h, v10.8h, v12.8h\n\t" + "mul v25.8h, v26.8h, v7.h[0]\n\t" + "mul v27.8h, v28.8h, v7.h[0]\n\t" + "sqrdmulh v11.8h, v26.8h, v5.h[0]\n\t" + "sqrdmulh v12.8h, v28.8h, v5.h[0]\n\t" + "sqrdmlsh v11.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v12.8h, v27.8h, v8.h[0]\n\t" + "sshr v11.8h, v11.8h, #1\n\t" + "sshr v12.8h, v12.8h, #1\n\t" + "sub v26.8h, v13.8h, v15.8h\n\t" + "sub v28.8h, v14.8h, v16.8h\n\t" + "add v13.8h, v13.8h, v15.8h\n\t" + "add v14.8h, v14.8h, v16.8h\n\t" + "mul v25.8h, v26.8h, v7.h[1]\n\t" + "mul v27.8h, v28.8h, v7.h[1]\n\t" + "sqrdmulh v15.8h, v26.8h, v5.h[1]\n\t" + "sqrdmulh v16.8h, v28.8h, v5.h[1]\n\t" + "sqrdmlsh v15.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v16.8h, v27.8h, v8.h[0]\n\t" + "sshr v15.8h, v15.8h, #1\n\t" + "sshr v16.8h, v16.8h, #1\n\t" + "sub v26.8h, v17.8h, v19.8h\n\t" + "sub v28.8h, v18.8h, v20.8h\n\t" + "add v17.8h, v17.8h, v19.8h\n\t" + "add v18.8h, v18.8h, v20.8h\n\t" + "mul v25.8h, v26.8h, v7.h[2]\n\t" + "mul v27.8h, v28.8h, v7.h[2]\n\t" + "sqrdmulh v19.8h, v26.8h, v5.h[2]\n\t" + "sqrdmulh v20.8h, v28.8h, v5.h[2]\n\t" + "sqrdmlsh v19.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v20.8h, v27.8h, v8.h[0]\n\t" + "sshr v19.8h, v19.8h, #1\n\t" + "sshr v20.8h, v20.8h, #1\n\t" + "sub v26.8h, v21.8h, v23.8h\n\t" + "sub v28.8h, v22.8h, v24.8h\n\t" + "add v21.8h, v21.8h, v23.8h\n\t" + "add v22.8h, v22.8h, v24.8h\n\t" + "mul v25.8h, v26.8h, v7.h[3]\n\t" + "mul v27.8h, v28.8h, v7.h[3]\n\t" + "sqrdmulh v23.8h, v26.8h, v5.h[3]\n\t" + "sqrdmulh v24.8h, v28.8h, v5.h[3]\n\t" + "sqrdmlsh v23.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v24.8h, v27.8h, v8.h[0]\n\t" + "sshr v23.8h, v23.8h, #1\n\t" + "sshr v24.8h, v24.8h, #1\n\t" + "sub v26.8h, v9.8h, v13.8h\n\t" + "sub v28.8h, v10.8h, v14.8h\n\t" + "add v9.8h, v9.8h, v13.8h\n\t" + "add v10.8h, v10.8h, v14.8h\n\t" + "mul v25.8h, v26.8h, v7.h[4]\n\t" + "mul v27.8h, v28.8h, v7.h[4]\n\t" + "sqrdmulh v13.8h, v26.8h, v5.h[4]\n\t" + "sqrdmulh v14.8h, v28.8h, v5.h[4]\n\t" + "sqrdmlsh v13.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v14.8h, v27.8h, v8.h[0]\n\t" + "sshr v13.8h, v13.8h, #1\n\t" + "sshr v14.8h, v14.8h, #1\n\t" + "sub v26.8h, v11.8h, v15.8h\n\t" + "sub v28.8h, v12.8h, v16.8h\n\t" + "add v11.8h, v11.8h, v15.8h\n\t" + "add v12.8h, v12.8h, v16.8h\n\t" + "mul v25.8h, v26.8h, v7.h[4]\n\t" + "mul v27.8h, v28.8h, v7.h[4]\n\t" + "sqrdmulh v15.8h, v26.8h, v5.h[4]\n\t" + "sqrdmulh v16.8h, v28.8h, v5.h[4]\n\t" + "sqrdmlsh v15.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v16.8h, v27.8h, v8.h[0]\n\t" + "sshr v15.8h, v15.8h, #1\n\t" + "sshr v16.8h, v16.8h, #1\n\t" + "sub v26.8h, v17.8h, v21.8h\n\t" + "sub v28.8h, v18.8h, v22.8h\n\t" + "add v17.8h, v17.8h, v21.8h\n\t" + "add v18.8h, v18.8h, v22.8h\n\t" + "mul v25.8h, v26.8h, v7.h[5]\n\t" + "mul v27.8h, v28.8h, v7.h[5]\n\t" + "sqrdmulh v21.8h, v26.8h, v5.h[5]\n\t" + "sqrdmulh v22.8h, v28.8h, v5.h[5]\n\t" + "sqrdmlsh v21.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v22.8h, v27.8h, v8.h[0]\n\t" + "sshr v21.8h, v21.8h, #1\n\t" + "sshr v22.8h, v22.8h, #1\n\t" + "sub v26.8h, v19.8h, v23.8h\n\t" + "sub v28.8h, v20.8h, v24.8h\n\t" + "add v19.8h, v19.8h, v23.8h\n\t" + "add v20.8h, v20.8h, v24.8h\n\t" + "mul v25.8h, v26.8h, v7.h[5]\n\t" + "mul v27.8h, v28.8h, v7.h[5]\n\t" + "sqrdmulh v23.8h, v26.8h, v5.h[5]\n\t" + "sqrdmulh v24.8h, v28.8h, v5.h[5]\n\t" + "sqrdmlsh v23.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v24.8h, v27.8h, v8.h[0]\n\t" + "sshr v23.8h, v23.8h, #1\n\t" + "sshr v24.8h, v24.8h, 
#1\n\t" + "sqdmulh v25.8h, v9.8h, v8.h[2]\n\t" + "sqdmulh v26.8h, v10.8h, v8.h[2]\n\t" + "sshr v25.8h, v25.8h, #11\n\t" + "sshr v26.8h, v26.8h, #11\n\t" + "mls v9.8h, v25.8h, v8.h[0]\n\t" + "mls v10.8h, v26.8h, v8.h[0]\n\t" + "sqdmulh v25.8h, v11.8h, v8.h[2]\n\t" + "sqdmulh v26.8h, v12.8h, v8.h[2]\n\t" + "sshr v25.8h, v25.8h, #11\n\t" + "sshr v26.8h, v26.8h, #11\n\t" + "mls v11.8h, v25.8h, v8.h[0]\n\t" + "mls v12.8h, v26.8h, v8.h[0]\n\t" + "sqdmulh v25.8h, v17.8h, v8.h[2]\n\t" + "sqdmulh v26.8h, v18.8h, v8.h[2]\n\t" + "sshr v25.8h, v25.8h, #11\n\t" + "sshr v26.8h, v26.8h, #11\n\t" + "mls v17.8h, v25.8h, v8.h[0]\n\t" + "mls v18.8h, v26.8h, v8.h[0]\n\t" + "sqdmulh v25.8h, v19.8h, v8.h[2]\n\t" + "sqdmulh v26.8h, v20.8h, v8.h[2]\n\t" + "sshr v25.8h, v25.8h, #11\n\t" + "sshr v26.8h, v26.8h, #11\n\t" + "mls v19.8h, v25.8h, v8.h[0]\n\t" + "mls v20.8h, v26.8h, v8.h[0]\n\t" + "sub v26.8h, v9.8h, v17.8h\n\t" + "sub v28.8h, v10.8h, v18.8h\n\t" + "add v9.8h, v9.8h, v17.8h\n\t" + "add v10.8h, v10.8h, v18.8h\n\t" + "mul v25.8h, v26.8h, v7.h[6]\n\t" + "mul v27.8h, v28.8h, v7.h[6]\n\t" + "sqrdmulh v17.8h, v26.8h, v5.h[6]\n\t" + "sqrdmulh v18.8h, v28.8h, v5.h[6]\n\t" + "sqrdmlsh v17.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v18.8h, v27.8h, v8.h[0]\n\t" + "sshr v17.8h, v17.8h, #1\n\t" + "sshr v18.8h, v18.8h, #1\n\t" + "sub v26.8h, v11.8h, v19.8h\n\t" + "sub v28.8h, v12.8h, v20.8h\n\t" + "add v11.8h, v11.8h, v19.8h\n\t" + "add v12.8h, v12.8h, v20.8h\n\t" + "mul v25.8h, v26.8h, v7.h[6]\n\t" + "mul v27.8h, v28.8h, v7.h[6]\n\t" + "sqrdmulh v19.8h, v26.8h, v5.h[6]\n\t" + "sqrdmulh v20.8h, v28.8h, v5.h[6]\n\t" + "sqrdmlsh v19.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v20.8h, v27.8h, v8.h[0]\n\t" + "sshr v19.8h, v19.8h, #1\n\t" + "sshr v20.8h, v20.8h, #1\n\t" + "sub v26.8h, v13.8h, v21.8h\n\t" + "sub v28.8h, v14.8h, v22.8h\n\t" + "add v13.8h, v13.8h, v21.8h\n\t" + "add v14.8h, v14.8h, v22.8h\n\t" + "mul v25.8h, v26.8h, v7.h[6]\n\t" + "mul v27.8h, v28.8h, v7.h[6]\n\t" + "sqrdmulh v21.8h, v26.8h, v5.h[6]\n\t" + "sqrdmulh v22.8h, v28.8h, v5.h[6]\n\t" + "sqrdmlsh v21.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v22.8h, v27.8h, v8.h[0]\n\t" + "sshr v21.8h, v21.8h, #1\n\t" + "sshr v22.8h, v22.8h, #1\n\t" + "sub v26.8h, v15.8h, v23.8h\n\t" + "sub v28.8h, v16.8h, v24.8h\n\t" + "add v15.8h, v15.8h, v23.8h\n\t" + "add v16.8h, v16.8h, v24.8h\n\t" + "mul v25.8h, v26.8h, v7.h[6]\n\t" + "mul v27.8h, v28.8h, v7.h[6]\n\t" + "sqrdmulh v23.8h, v26.8h, v5.h[6]\n\t" + "sqrdmulh v24.8h, v28.8h, v5.h[6]\n\t" + "sqrdmlsh v23.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v24.8h, v27.8h, v8.h[0]\n\t" + "sshr v23.8h, v23.8h, #1\n\t" + "sshr v24.8h, v24.8h, #1\n\t" + "mul v25.8h, v9.8h, v7.h[7]\n\t" + "mul v26.8h, v10.8h, v7.h[7]\n\t" + "sqrdmulh v9.8h, v9.8h, v5.h[7]\n\t" + "sqrdmulh v10.8h, v10.8h, v5.h[7]\n\t" + "sqrdmlsh v9.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v10.8h, v26.8h, v8.h[0]\n\t" + "sshr v9.8h, v9.8h, #1\n\t" + "sshr v10.8h, v10.8h, #1\n\t" + "mul v25.8h, v11.8h, v7.h[7]\n\t" + "mul v26.8h, v12.8h, v7.h[7]\n\t" + "sqrdmulh v11.8h, v11.8h, v5.h[7]\n\t" + "sqrdmulh v12.8h, v12.8h, v5.h[7]\n\t" + "sqrdmlsh v11.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v12.8h, v26.8h, v8.h[0]\n\t" + "sshr v11.8h, v11.8h, #1\n\t" + "sshr v12.8h, v12.8h, #1\n\t" + "mul v25.8h, v13.8h, v7.h[7]\n\t" + "mul v26.8h, v14.8h, v7.h[7]\n\t" + "sqrdmulh v13.8h, v13.8h, v5.h[7]\n\t" + "sqrdmulh v14.8h, v14.8h, v5.h[7]\n\t" + "sqrdmlsh v13.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v14.8h, v26.8h, v8.h[0]\n\t" + "sshr v13.8h, v13.8h, #1\n\t" + "sshr v14.8h, v14.8h, #1\n\t" + "mul v25.8h, v15.8h, 
v7.h[7]\n\t" + "mul v26.8h, v16.8h, v7.h[7]\n\t" + "sqrdmulh v15.8h, v15.8h, v5.h[7]\n\t" + "sqrdmulh v16.8h, v16.8h, v5.h[7]\n\t" + "sqrdmlsh v15.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v16.8h, v26.8h, v8.h[0]\n\t" + "sshr v15.8h, v15.8h, #1\n\t" + "sshr v16.8h, v16.8h, #1\n\t" + "mul v25.8h, v17.8h, v7.h[7]\n\t" + "mul v26.8h, v18.8h, v7.h[7]\n\t" + "sqrdmulh v17.8h, v17.8h, v5.h[7]\n\t" + "sqrdmulh v18.8h, v18.8h, v5.h[7]\n\t" + "sqrdmlsh v17.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v18.8h, v26.8h, v8.h[0]\n\t" + "sshr v17.8h, v17.8h, #1\n\t" + "sshr v18.8h, v18.8h, #1\n\t" + "mul v25.8h, v19.8h, v7.h[7]\n\t" + "mul v26.8h, v20.8h, v7.h[7]\n\t" + "sqrdmulh v19.8h, v19.8h, v5.h[7]\n\t" + "sqrdmulh v20.8h, v20.8h, v5.h[7]\n\t" + "sqrdmlsh v19.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v20.8h, v26.8h, v8.h[0]\n\t" + "sshr v19.8h, v19.8h, #1\n\t" + "sshr v20.8h, v20.8h, #1\n\t" + "mul v25.8h, v21.8h, v7.h[7]\n\t" + "mul v26.8h, v22.8h, v7.h[7]\n\t" + "sqrdmulh v21.8h, v21.8h, v5.h[7]\n\t" + "sqrdmulh v22.8h, v22.8h, v5.h[7]\n\t" + "sqrdmlsh v21.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v22.8h, v26.8h, v8.h[0]\n\t" + "sshr v21.8h, v21.8h, #1\n\t" + "sshr v22.8h, v22.8h, #1\n\t" + "mul v25.8h, v23.8h, v7.h[7]\n\t" + "mul v26.8h, v24.8h, v7.h[7]\n\t" + "sqrdmulh v23.8h, v23.8h, v5.h[7]\n\t" + "sqrdmulh v24.8h, v24.8h, v5.h[7]\n\t" + "sqrdmlsh v23.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v24.8h, v26.8h, v8.h[0]\n\t" + "sshr v23.8h, v23.8h, #1\n\t" + "sshr v24.8h, v24.8h, #1\n\t" + "str q9, [%x[r]]\n\t" + "str q10, [%x[r], #32]\n\t" + "str q11, [%x[r], #64]\n\t" + "str q12, [%x[r], #96]\n\t" + "str q13, [%x[r], #128]\n\t" + "str q14, [%x[r], #160]\n\t" + "str q15, [%x[r], #192]\n\t" + "str q16, [%x[r], #224]\n\t" + "str q17, [x1]\n\t" + "str q18, [x1, #32]\n\t" + "str q19, [x1, #64]\n\t" + "str q20, [x1, #96]\n\t" + "str q21, [x1, #128]\n\t" + "str q22, [x1, #160]\n\t" + "str q23, [x1, #192]\n\t" + "str q24, [x1, #224]\n\t" + "ldr q9, [%x[r], #16]\n\t" + "ldr q10, [%x[r], #48]\n\t" + "ldr q11, [%x[r], #80]\n\t" + "ldr q12, [%x[r], #112]\n\t" + "ldr q13, [%x[r], #144]\n\t" + "ldr q14, [%x[r], #176]\n\t" + "ldr q15, [%x[r], #208]\n\t" + "ldr q16, [%x[r], #240]\n\t" + "ldr q17, [x1, #16]\n\t" + "ldr q18, [x1, #48]\n\t" + "ldr q19, [x1, #80]\n\t" + "ldr q20, [x1, #112]\n\t" + "ldr q21, [x1, #144]\n\t" + "ldr q22, [x1, #176]\n\t" + "ldr q23, [x1, #208]\n\t" + "ldr q24, [x1, #240]\n\t" + "sub v26.8h, v9.8h, v10.8h\n\t" + "sub v28.8h, v11.8h, v12.8h\n\t" + "add v9.8h, v9.8h, v10.8h\n\t" + "add v11.8h, v11.8h, v12.8h\n\t" + "mul v25.8h, v26.8h, v6.h[0]\n\t" + "mul v27.8h, v28.8h, v6.h[1]\n\t" + "sqrdmulh v10.8h, v26.8h, v4.h[0]\n\t" + "sqrdmulh v12.8h, v28.8h, v4.h[1]\n\t" + "sqrdmlsh v10.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v12.8h, v27.8h, v8.h[0]\n\t" + "sshr v10.8h, v10.8h, #1\n\t" + "sshr v12.8h, v12.8h, #1\n\t" + "sub v26.8h, v13.8h, v14.8h\n\t" + "sub v28.8h, v15.8h, v16.8h\n\t" + "add v13.8h, v13.8h, v14.8h\n\t" + "add v15.8h, v15.8h, v16.8h\n\t" + "mul v25.8h, v26.8h, v6.h[2]\n\t" + "mul v27.8h, v28.8h, v6.h[3]\n\t" + "sqrdmulh v14.8h, v26.8h, v4.h[2]\n\t" + "sqrdmulh v16.8h, v28.8h, v4.h[3]\n\t" + "sqrdmlsh v14.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v16.8h, v27.8h, v8.h[0]\n\t" + "sshr v14.8h, v14.8h, #1\n\t" + "sshr v16.8h, v16.8h, #1\n\t" + "sub v26.8h, v17.8h, v18.8h\n\t" + "sub v28.8h, v19.8h, v20.8h\n\t" + "add v17.8h, v17.8h, v18.8h\n\t" + "add v19.8h, v19.8h, v20.8h\n\t" + "mul v25.8h, v26.8h, v6.h[4]\n\t" + "mul v27.8h, v28.8h, v6.h[5]\n\t" + "sqrdmulh v18.8h, v26.8h, v4.h[4]\n\t" + "sqrdmulh v20.8h, v28.8h, 
v4.h[5]\n\t" + "sqrdmlsh v18.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v20.8h, v27.8h, v8.h[0]\n\t" + "sshr v18.8h, v18.8h, #1\n\t" + "sshr v20.8h, v20.8h, #1\n\t" + "sub v26.8h, v21.8h, v22.8h\n\t" + "sub v28.8h, v23.8h, v24.8h\n\t" + "add v21.8h, v21.8h, v22.8h\n\t" + "add v23.8h, v23.8h, v24.8h\n\t" + "mul v25.8h, v26.8h, v6.h[6]\n\t" + "mul v27.8h, v28.8h, v6.h[7]\n\t" + "sqrdmulh v22.8h, v26.8h, v4.h[6]\n\t" + "sqrdmulh v24.8h, v28.8h, v4.h[7]\n\t" + "sqrdmlsh v22.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v24.8h, v27.8h, v8.h[0]\n\t" + "sshr v22.8h, v22.8h, #1\n\t" + "sshr v24.8h, v24.8h, #1\n\t" + "sub v26.8h, v9.8h, v11.8h\n\t" + "sub v28.8h, v10.8h, v12.8h\n\t" + "add v9.8h, v9.8h, v11.8h\n\t" + "add v10.8h, v10.8h, v12.8h\n\t" + "mul v25.8h, v26.8h, v7.h[0]\n\t" + "mul v27.8h, v28.8h, v7.h[0]\n\t" + "sqrdmulh v11.8h, v26.8h, v5.h[0]\n\t" + "sqrdmulh v12.8h, v28.8h, v5.h[0]\n\t" + "sqrdmlsh v11.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v12.8h, v27.8h, v8.h[0]\n\t" + "sshr v11.8h, v11.8h, #1\n\t" + "sshr v12.8h, v12.8h, #1\n\t" + "sub v26.8h, v13.8h, v15.8h\n\t" + "sub v28.8h, v14.8h, v16.8h\n\t" + "add v13.8h, v13.8h, v15.8h\n\t" + "add v14.8h, v14.8h, v16.8h\n\t" + "mul v25.8h, v26.8h, v7.h[1]\n\t" + "mul v27.8h, v28.8h, v7.h[1]\n\t" + "sqrdmulh v15.8h, v26.8h, v5.h[1]\n\t" + "sqrdmulh v16.8h, v28.8h, v5.h[1]\n\t" + "sqrdmlsh v15.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v16.8h, v27.8h, v8.h[0]\n\t" + "sshr v15.8h, v15.8h, #1\n\t" + "sshr v16.8h, v16.8h, #1\n\t" + "sub v26.8h, v17.8h, v19.8h\n\t" + "sub v28.8h, v18.8h, v20.8h\n\t" + "add v17.8h, v17.8h, v19.8h\n\t" + "add v18.8h, v18.8h, v20.8h\n\t" + "mul v25.8h, v26.8h, v7.h[2]\n\t" + "mul v27.8h, v28.8h, v7.h[2]\n\t" + "sqrdmulh v19.8h, v26.8h, v5.h[2]\n\t" + "sqrdmulh v20.8h, v28.8h, v5.h[2]\n\t" + "sqrdmlsh v19.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v20.8h, v27.8h, v8.h[0]\n\t" + "sshr v19.8h, v19.8h, #1\n\t" + "sshr v20.8h, v20.8h, #1\n\t" + "sub v26.8h, v21.8h, v23.8h\n\t" + "sub v28.8h, v22.8h, v24.8h\n\t" + "add v21.8h, v21.8h, v23.8h\n\t" + "add v22.8h, v22.8h, v24.8h\n\t" + "mul v25.8h, v26.8h, v7.h[3]\n\t" + "mul v27.8h, v28.8h, v7.h[3]\n\t" + "sqrdmulh v23.8h, v26.8h, v5.h[3]\n\t" + "sqrdmulh v24.8h, v28.8h, v5.h[3]\n\t" + "sqrdmlsh v23.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v24.8h, v27.8h, v8.h[0]\n\t" + "sshr v23.8h, v23.8h, #1\n\t" + "sshr v24.8h, v24.8h, #1\n\t" + "sub v26.8h, v9.8h, v13.8h\n\t" + "sub v28.8h, v10.8h, v14.8h\n\t" + "add v9.8h, v9.8h, v13.8h\n\t" + "add v10.8h, v10.8h, v14.8h\n\t" + "mul v25.8h, v26.8h, v7.h[4]\n\t" + "mul v27.8h, v28.8h, v7.h[4]\n\t" + "sqrdmulh v13.8h, v26.8h, v5.h[4]\n\t" + "sqrdmulh v14.8h, v28.8h, v5.h[4]\n\t" + "sqrdmlsh v13.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v14.8h, v27.8h, v8.h[0]\n\t" + "sshr v13.8h, v13.8h, #1\n\t" + "sshr v14.8h, v14.8h, #1\n\t" + "sub v26.8h, v11.8h, v15.8h\n\t" + "sub v28.8h, v12.8h, v16.8h\n\t" + "add v11.8h, v11.8h, v15.8h\n\t" + "add v12.8h, v12.8h, v16.8h\n\t" + "mul v25.8h, v26.8h, v7.h[4]\n\t" + "mul v27.8h, v28.8h, v7.h[4]\n\t" + "sqrdmulh v15.8h, v26.8h, v5.h[4]\n\t" + "sqrdmulh v16.8h, v28.8h, v5.h[4]\n\t" + "sqrdmlsh v15.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v16.8h, v27.8h, v8.h[0]\n\t" + "sshr v15.8h, v15.8h, #1\n\t" + "sshr v16.8h, v16.8h, #1\n\t" + "sub v26.8h, v17.8h, v21.8h\n\t" + "sub v28.8h, v18.8h, v22.8h\n\t" + "add v17.8h, v17.8h, v21.8h\n\t" + "add v18.8h, v18.8h, v22.8h\n\t" + "mul v25.8h, v26.8h, v7.h[5]\n\t" + "mul v27.8h, v28.8h, v7.h[5]\n\t" + "sqrdmulh v21.8h, v26.8h, v5.h[5]\n\t" + "sqrdmulh v22.8h, v28.8h, v5.h[5]\n\t" + "sqrdmlsh v21.8h, v25.8h, 
v8.h[0]\n\t" + "sqrdmlsh v22.8h, v27.8h, v8.h[0]\n\t" + "sshr v21.8h, v21.8h, #1\n\t" + "sshr v22.8h, v22.8h, #1\n\t" + "sub v26.8h, v19.8h, v23.8h\n\t" + "sub v28.8h, v20.8h, v24.8h\n\t" + "add v19.8h, v19.8h, v23.8h\n\t" + "add v20.8h, v20.8h, v24.8h\n\t" + "mul v25.8h, v26.8h, v7.h[5]\n\t" + "mul v27.8h, v28.8h, v7.h[5]\n\t" + "sqrdmulh v23.8h, v26.8h, v5.h[5]\n\t" + "sqrdmulh v24.8h, v28.8h, v5.h[5]\n\t" + "sqrdmlsh v23.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v24.8h, v27.8h, v8.h[0]\n\t" + "sshr v23.8h, v23.8h, #1\n\t" + "sshr v24.8h, v24.8h, #1\n\t" + "sqdmulh v25.8h, v9.8h, v8.h[2]\n\t" + "sqdmulh v26.8h, v10.8h, v8.h[2]\n\t" + "sshr v25.8h, v25.8h, #11\n\t" + "sshr v26.8h, v26.8h, #11\n\t" + "mls v9.8h, v25.8h, v8.h[0]\n\t" + "mls v10.8h, v26.8h, v8.h[0]\n\t" + "sqdmulh v25.8h, v11.8h, v8.h[2]\n\t" + "sqdmulh v26.8h, v12.8h, v8.h[2]\n\t" + "sshr v25.8h, v25.8h, #11\n\t" + "sshr v26.8h, v26.8h, #11\n\t" + "mls v11.8h, v25.8h, v8.h[0]\n\t" + "mls v12.8h, v26.8h, v8.h[0]\n\t" + "sqdmulh v25.8h, v17.8h, v8.h[2]\n\t" + "sqdmulh v26.8h, v18.8h, v8.h[2]\n\t" + "sshr v25.8h, v25.8h, #11\n\t" + "sshr v26.8h, v26.8h, #11\n\t" + "mls v17.8h, v25.8h, v8.h[0]\n\t" + "mls v18.8h, v26.8h, v8.h[0]\n\t" + "sqdmulh v25.8h, v19.8h, v8.h[2]\n\t" + "sqdmulh v26.8h, v20.8h, v8.h[2]\n\t" + "sshr v25.8h, v25.8h, #11\n\t" + "sshr v26.8h, v26.8h, #11\n\t" + "mls v19.8h, v25.8h, v8.h[0]\n\t" + "mls v20.8h, v26.8h, v8.h[0]\n\t" + "sub v26.8h, v9.8h, v17.8h\n\t" + "sub v28.8h, v10.8h, v18.8h\n\t" + "add v9.8h, v9.8h, v17.8h\n\t" + "add v10.8h, v10.8h, v18.8h\n\t" + "mul v25.8h, v26.8h, v7.h[6]\n\t" + "mul v27.8h, v28.8h, v7.h[6]\n\t" + "sqrdmulh v17.8h, v26.8h, v5.h[6]\n\t" + "sqrdmulh v18.8h, v28.8h, v5.h[6]\n\t" + "sqrdmlsh v17.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v18.8h, v27.8h, v8.h[0]\n\t" + "sshr v17.8h, v17.8h, #1\n\t" + "sshr v18.8h, v18.8h, #1\n\t" + "sub v26.8h, v11.8h, v19.8h\n\t" + "sub v28.8h, v12.8h, v20.8h\n\t" + "add v11.8h, v11.8h, v19.8h\n\t" + "add v12.8h, v12.8h, v20.8h\n\t" + "mul v25.8h, v26.8h, v7.h[6]\n\t" + "mul v27.8h, v28.8h, v7.h[6]\n\t" + "sqrdmulh v19.8h, v26.8h, v5.h[6]\n\t" + "sqrdmulh v20.8h, v28.8h, v5.h[6]\n\t" + "sqrdmlsh v19.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v20.8h, v27.8h, v8.h[0]\n\t" + "sshr v19.8h, v19.8h, #1\n\t" + "sshr v20.8h, v20.8h, #1\n\t" + "sub v26.8h, v13.8h, v21.8h\n\t" + "sub v28.8h, v14.8h, v22.8h\n\t" + "add v13.8h, v13.8h, v21.8h\n\t" + "add v14.8h, v14.8h, v22.8h\n\t" + "mul v25.8h, v26.8h, v7.h[6]\n\t" + "mul v27.8h, v28.8h, v7.h[6]\n\t" + "sqrdmulh v21.8h, v26.8h, v5.h[6]\n\t" + "sqrdmulh v22.8h, v28.8h, v5.h[6]\n\t" + "sqrdmlsh v21.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v22.8h, v27.8h, v8.h[0]\n\t" + "sshr v21.8h, v21.8h, #1\n\t" + "sshr v22.8h, v22.8h, #1\n\t" + "sub v26.8h, v15.8h, v23.8h\n\t" + "sub v28.8h, v16.8h, v24.8h\n\t" + "add v15.8h, v15.8h, v23.8h\n\t" + "add v16.8h, v16.8h, v24.8h\n\t" + "mul v25.8h, v26.8h, v7.h[6]\n\t" + "mul v27.8h, v28.8h, v7.h[6]\n\t" + "sqrdmulh v23.8h, v26.8h, v5.h[6]\n\t" + "sqrdmulh v24.8h, v28.8h, v5.h[6]\n\t" + "sqrdmlsh v23.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v24.8h, v27.8h, v8.h[0]\n\t" + "sshr v23.8h, v23.8h, #1\n\t" + "sshr v24.8h, v24.8h, #1\n\t" + "mul v25.8h, v9.8h, v7.h[7]\n\t" + "mul v26.8h, v10.8h, v7.h[7]\n\t" + "sqrdmulh v9.8h, v9.8h, v5.h[7]\n\t" + "sqrdmulh v10.8h, v10.8h, v5.h[7]\n\t" + "sqrdmlsh v9.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v10.8h, v26.8h, v8.h[0]\n\t" + "sshr v9.8h, v9.8h, #1\n\t" + "sshr v10.8h, v10.8h, #1\n\t" + "mul v25.8h, v11.8h, v7.h[7]\n\t" + "mul v26.8h, v12.8h, v7.h[7]\n\t" + 
"sqrdmulh v11.8h, v11.8h, v5.h[7]\n\t" + "sqrdmulh v12.8h, v12.8h, v5.h[7]\n\t" + "sqrdmlsh v11.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v12.8h, v26.8h, v8.h[0]\n\t" + "sshr v11.8h, v11.8h, #1\n\t" + "sshr v12.8h, v12.8h, #1\n\t" + "mul v25.8h, v13.8h, v7.h[7]\n\t" + "mul v26.8h, v14.8h, v7.h[7]\n\t" + "sqrdmulh v13.8h, v13.8h, v5.h[7]\n\t" + "sqrdmulh v14.8h, v14.8h, v5.h[7]\n\t" + "sqrdmlsh v13.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v14.8h, v26.8h, v8.h[0]\n\t" + "sshr v13.8h, v13.8h, #1\n\t" + "sshr v14.8h, v14.8h, #1\n\t" + "mul v25.8h, v15.8h, v7.h[7]\n\t" + "mul v26.8h, v16.8h, v7.h[7]\n\t" + "sqrdmulh v15.8h, v15.8h, v5.h[7]\n\t" + "sqrdmulh v16.8h, v16.8h, v5.h[7]\n\t" + "sqrdmlsh v15.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v16.8h, v26.8h, v8.h[0]\n\t" + "sshr v15.8h, v15.8h, #1\n\t" + "sshr v16.8h, v16.8h, #1\n\t" + "mul v25.8h, v17.8h, v7.h[7]\n\t" + "mul v26.8h, v18.8h, v7.h[7]\n\t" + "sqrdmulh v17.8h, v17.8h, v5.h[7]\n\t" + "sqrdmulh v18.8h, v18.8h, v5.h[7]\n\t" + "sqrdmlsh v17.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v18.8h, v26.8h, v8.h[0]\n\t" + "sshr v17.8h, v17.8h, #1\n\t" + "sshr v18.8h, v18.8h, #1\n\t" + "mul v25.8h, v19.8h, v7.h[7]\n\t" + "mul v26.8h, v20.8h, v7.h[7]\n\t" + "sqrdmulh v19.8h, v19.8h, v5.h[7]\n\t" + "sqrdmulh v20.8h, v20.8h, v5.h[7]\n\t" + "sqrdmlsh v19.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v20.8h, v26.8h, v8.h[0]\n\t" + "sshr v19.8h, v19.8h, #1\n\t" + "sshr v20.8h, v20.8h, #1\n\t" + "mul v25.8h, v21.8h, v7.h[7]\n\t" + "mul v26.8h, v22.8h, v7.h[7]\n\t" + "sqrdmulh v21.8h, v21.8h, v5.h[7]\n\t" + "sqrdmulh v22.8h, v22.8h, v5.h[7]\n\t" + "sqrdmlsh v21.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v22.8h, v26.8h, v8.h[0]\n\t" + "sshr v21.8h, v21.8h, #1\n\t" + "sshr v22.8h, v22.8h, #1\n\t" + "mul v25.8h, v23.8h, v7.h[7]\n\t" + "mul v26.8h, v24.8h, v7.h[7]\n\t" + "sqrdmulh v23.8h, v23.8h, v5.h[7]\n\t" + "sqrdmulh v24.8h, v24.8h, v5.h[7]\n\t" + "sqrdmlsh v23.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v24.8h, v26.8h, v8.h[0]\n\t" + "sshr v23.8h, v23.8h, #1\n\t" + "sshr v24.8h, v24.8h, #1\n\t" + "str q9, [%x[r], #16]\n\t" + "str q10, [%x[r], #48]\n\t" + "str q11, [%x[r], #80]\n\t" + "str q12, [%x[r], #112]\n\t" + "str q13, [%x[r], #144]\n\t" + "str q14, [%x[r], #176]\n\t" + "str q15, [%x[r], #208]\n\t" + "str q16, [%x[r], #240]\n\t" + "str q17, [x1, #16]\n\t" + "str q18, [x1, #48]\n\t" + "str q19, [x1, #80]\n\t" + "str q20, [x1, #112]\n\t" + "str q21, [x1, #144]\n\t" + "str q22, [x1, #176]\n\t" + "str q23, [x1, #208]\n\t" + "str q24, [x1, #240]\n\t" + : [r] "+r" (r) + : [L_kyber_aarch64_q] "S" (L_kyber_aarch64_q), [L_kyber_aarch64_consts] "S" (L_kyber_aarch64_consts), [L_kyber_aarch64_zetas] "S" (L_kyber_aarch64_zetas), [L_kyber_aarch64_zetas_qinv] "S" (L_kyber_aarch64_zetas_qinv), [L_kyber_aarch64_zetas_inv] "S" (L_kyber_aarch64_zetas_inv), [L_kyber_aarch64_zetas_inv_qinv] "S" (L_kyber_aarch64_zetas_inv_qinv) + : "memory", "x1", "x2", "x3", "x4", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "cc" + ); +} + +static const uint16_t L_kyber_aarch64_zetas_mul[] = { + 0x8b2, + 0xf74e, + 0x1ae, + 0xfe52, + 0x22b, + 0xfdd5, + 0x34b, + 0xfcb5, + 0x81e, + 0xf7e2, + 0x367, + 0xfc99, + 0x60e, + 0xf9f2, + 0x69, + 0xff97, + 0x1a6, + 0xfe5a, + 0x24b, + 0xfdb5, + 0xb1, + 0xff4f, + 0xc16, + 0xf3ea, + 0xbde, + 0xf422, + 0xb35, + 0xf4cb, + 0x626, + 0xf9da, + 0x675, + 0xf98b, + 0xc0b, + 0xf3f5, + 0x30a, + 0xfcf6, + 0x487, + 0xfb79, + 0xc6e, + 0xf392, + 0x9f8, + 
0xf608, + 0x5cb, + 0xfa35, + 0xaa7, + 0xf559, + 0x45f, + 0xfba1, + 0x6cb, + 0xf935, + 0x284, + 0xfd7c, + 0x999, + 0xf667, + 0x15d, + 0xfea3, + 0x1a2, + 0xfe5e, + 0x149, + 0xfeb7, + 0xc65, + 0xf39b, + 0xcb6, + 0xf34a, + 0x331, + 0xfccf, + 0x449, + 0xfbb7, + 0x25b, + 0xfda5, + 0x262, + 0xfd9e, + 0x52a, + 0xfad6, + 0x7fc, + 0xf804, + 0x748, + 0xf8b8, + 0x180, + 0xfe80, + 0x842, + 0xf7be, + 0xc79, + 0xf387, + 0x4c2, + 0xfb3e, + 0x7ca, + 0xf836, + 0x997, + 0xf669, + 0xdc, + 0xff24, + 0x85e, + 0xf7a2, + 0x686, + 0xf97a, + 0x860, + 0xf7a0, + 0x707, + 0xf8f9, + 0x803, + 0xf7fd, + 0x31a, + 0xfce6, + 0x71b, + 0xf8e5, + 0x9ab, + 0xf655, + 0x99b, + 0xf665, + 0x1de, + 0xfe22, + 0xc95, + 0xf36b, + 0xbcd, + 0xf433, + 0x3e4, + 0xfc1c, + 0x3df, + 0xfc21, + 0x3be, + 0xfc42, + 0x74d, + 0xf8b3, + 0x5f2, + 0xfa0e, + 0x65c, + 0xf9a4, +}; + +void kyber_basemul_mont(sword16* r, const sword16* a, const sword16* b) +{ + __asm__ __volatile__ ( +#ifndef __APPLE__ + "adrp x3, %[L_kyber_aarch64_zetas_mul]\n\t" + "add x3, x3, :lo12:%[L_kyber_aarch64_zetas_mul]\n\t" +#else + "adrp x3, %[L_kyber_aarch64_zetas_mul]@PAGE\n\t" + "add x3, x3, %[L_kyber_aarch64_zetas_mul]@PAGEOFF\n\t" +#endif /* __APPLE__ */ +#ifndef __APPLE__ + "adrp x4, %[L_kyber_aarch64_consts]\n\t" + "add x4, x4, :lo12:%[L_kyber_aarch64_consts]\n\t" +#else + "adrp x4, %[L_kyber_aarch64_consts]@PAGE\n\t" + "add x4, x4, %[L_kyber_aarch64_consts]@PAGEOFF\n\t" +#endif /* __APPLE__ */ + "ldr q1, [x4]\n\t" + "ldp q2, q3, [%x[a]]\n\t" + "ldp q4, q5, [%x[a], #32]\n\t" + "ldp q6, q7, [%x[a], #64]\n\t" + "ldp q8, q9, [%x[a], #96]\n\t" + "ldp q10, q11, [%x[b]]\n\t" + "ldp q12, q13, [%x[b], #32]\n\t" + "ldp q14, q15, [%x[b], #64]\n\t" + "ldp q16, q17, [%x[b], #96]\n\t" + "ldr q0, [x3]\n\t" + "uzp1 v18.8h, v2.8h, v3.8h\n\t" + "uzp2 v19.8h, v2.8h, v3.8h\n\t" + "uzp1 v20.8h, v10.8h, v11.8h\n\t" + "uzp2 v21.8h, v10.8h, v11.8h\n\t" + "smull v26.4s, v18.4h, v20.4h\n\t" + "smull2 v27.4s, v18.8h, v20.8h\n\t" + "smull v23.4s, v19.4h, v21.4h\n\t" + "smull2 v24.4s, v19.8h, v21.8h\n\t" + "xtn v25.4h, v23.4s\n\t" + "xtn2 v25.8h, v24.4s\n\t" + "mul v25.8h, v25.8h, v1.h[1]\n\t" + "smlsl v23.4s, v25.4h, v1.h[0]\n\t" + "smlsl2 v24.4s, v25.8h, v1.h[0]\n\t" + "shrn v22.4h, v23.4s, #16\n\t" + "shrn2 v22.8h, v24.4s, #16\n\t" + "smlal v26.4s, v22.4h, v0.4h\n\t" + "smlal2 v27.4s, v22.8h, v0.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + "shrn v22.4h, v26.4s, #16\n\t" + "shrn2 v22.8h, v27.4s, #16\n\t" + "smull v26.4s, v18.4h, v21.4h\n\t" + "smull2 v27.4s, v18.8h, v21.8h\n\t" + "smlal v26.4s, v19.4h, v20.4h\n\t" + "smlal2 v27.4s, v19.8h, v20.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + "shrn v23.4h, v26.4s, #16\n\t" + "shrn2 v23.8h, v27.4s, #16\n\t" + "zip1 v24.8h, v22.8h, v23.8h\n\t" + "zip2 v25.8h, v22.8h, v23.8h\n\t" + "stp q24, q25, [%x[r]]\n\t" + "ldr q0, [x3, #16]\n\t" + "uzp1 v18.8h, v4.8h, v5.8h\n\t" + "uzp2 v19.8h, v4.8h, v5.8h\n\t" + "uzp1 v20.8h, v12.8h, v13.8h\n\t" + "uzp2 v21.8h, v12.8h, v13.8h\n\t" + "smull v26.4s, v18.4h, v20.4h\n\t" + "smull2 v27.4s, v18.8h, v20.8h\n\t" + "smull v23.4s, v19.4h, v21.4h\n\t" + "smull2 v24.4s, v19.8h, v21.8h\n\t" + "xtn v25.4h, v23.4s\n\t" + "xtn2 v25.8h, v24.4s\n\t" + "mul v25.8h, v25.8h, v1.h[1]\n\t" + "smlsl v23.4s, v25.4h, v1.h[0]\n\t" + "smlsl2 v24.4s, v25.8h, v1.h[0]\n\t" + "shrn 
v22.4h, v23.4s, #16\n\t" + "shrn2 v22.8h, v24.4s, #16\n\t" + "smlal v26.4s, v22.4h, v0.4h\n\t" + "smlal2 v27.4s, v22.8h, v0.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + "shrn v22.4h, v26.4s, #16\n\t" + "shrn2 v22.8h, v27.4s, #16\n\t" + "smull v26.4s, v18.4h, v21.4h\n\t" + "smull2 v27.4s, v18.8h, v21.8h\n\t" + "smlal v26.4s, v19.4h, v20.4h\n\t" + "smlal2 v27.4s, v19.8h, v20.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + "shrn v23.4h, v26.4s, #16\n\t" + "shrn2 v23.8h, v27.4s, #16\n\t" + "zip1 v24.8h, v22.8h, v23.8h\n\t" + "zip2 v25.8h, v22.8h, v23.8h\n\t" + "stp q24, q25, [%x[r], #32]\n\t" + "ldr q0, [x3, #32]\n\t" + "uzp1 v18.8h, v6.8h, v7.8h\n\t" + "uzp2 v19.8h, v6.8h, v7.8h\n\t" + "uzp1 v20.8h, v14.8h, v15.8h\n\t" + "uzp2 v21.8h, v14.8h, v15.8h\n\t" + "smull v26.4s, v18.4h, v20.4h\n\t" + "smull2 v27.4s, v18.8h, v20.8h\n\t" + "smull v23.4s, v19.4h, v21.4h\n\t" + "smull2 v24.4s, v19.8h, v21.8h\n\t" + "xtn v25.4h, v23.4s\n\t" + "xtn2 v25.8h, v24.4s\n\t" + "mul v25.8h, v25.8h, v1.h[1]\n\t" + "smlsl v23.4s, v25.4h, v1.h[0]\n\t" + "smlsl2 v24.4s, v25.8h, v1.h[0]\n\t" + "shrn v22.4h, v23.4s, #16\n\t" + "shrn2 v22.8h, v24.4s, #16\n\t" + "smlal v26.4s, v22.4h, v0.4h\n\t" + "smlal2 v27.4s, v22.8h, v0.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + "shrn v22.4h, v26.4s, #16\n\t" + "shrn2 v22.8h, v27.4s, #16\n\t" + "smull v26.4s, v18.4h, v21.4h\n\t" + "smull2 v27.4s, v18.8h, v21.8h\n\t" + "smlal v26.4s, v19.4h, v20.4h\n\t" + "smlal2 v27.4s, v19.8h, v20.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + "shrn v23.4h, v26.4s, #16\n\t" + "shrn2 v23.8h, v27.4s, #16\n\t" + "zip1 v24.8h, v22.8h, v23.8h\n\t" + "zip2 v25.8h, v22.8h, v23.8h\n\t" + "stp q24, q25, [%x[r], #64]\n\t" + "ldr q0, [x3, #48]\n\t" + "uzp1 v18.8h, v8.8h, v9.8h\n\t" + "uzp2 v19.8h, v8.8h, v9.8h\n\t" + "uzp1 v20.8h, v16.8h, v17.8h\n\t" + "uzp2 v21.8h, v16.8h, v17.8h\n\t" + "smull v26.4s, v18.4h, v20.4h\n\t" + "smull2 v27.4s, v18.8h, v20.8h\n\t" + "smull v23.4s, v19.4h, v21.4h\n\t" + "smull2 v24.4s, v19.8h, v21.8h\n\t" + "xtn v25.4h, v23.4s\n\t" + "xtn2 v25.8h, v24.4s\n\t" + "mul v25.8h, v25.8h, v1.h[1]\n\t" + "smlsl v23.4s, v25.4h, v1.h[0]\n\t" + "smlsl2 v24.4s, v25.8h, v1.h[0]\n\t" + "shrn v22.4h, v23.4s, #16\n\t" + "shrn2 v22.8h, v24.4s, #16\n\t" + "smlal v26.4s, v22.4h, v0.4h\n\t" + "smlal2 v27.4s, v22.8h, v0.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + "shrn v22.4h, v26.4s, #16\n\t" + "shrn2 v22.8h, v27.4s, #16\n\t" + "smull v26.4s, v18.4h, v21.4h\n\t" + "smull2 v27.4s, v18.8h, v21.8h\n\t" + "smlal v26.4s, v19.4h, v20.4h\n\t" + "smlal2 v27.4s, v19.8h, v20.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + "shrn v23.4h, v26.4s, #16\n\t" + "shrn2 v23.8h, v27.4s, #16\n\t" + "zip1 v24.8h, v22.8h, v23.8h\n\t" + "zip2 v25.8h, v22.8h, v23.8h\n\t" + "stp q24, q25, [%x[r], 
#96]\n\t" + "ldp q2, q3, [%x[a], #128]\n\t" + "ldp q4, q5, [%x[a], #160]\n\t" + "ldp q6, q7, [%x[a], #192]\n\t" + "ldp q8, q9, [%x[a], #224]\n\t" + "ldp q10, q11, [%x[b], #128]\n\t" + "ldp q12, q13, [%x[b], #160]\n\t" + "ldp q14, q15, [%x[b], #192]\n\t" + "ldp q16, q17, [%x[b], #224]\n\t" + "ldr q0, [x3, #64]\n\t" + "uzp1 v18.8h, v2.8h, v3.8h\n\t" + "uzp2 v19.8h, v2.8h, v3.8h\n\t" + "uzp1 v20.8h, v10.8h, v11.8h\n\t" + "uzp2 v21.8h, v10.8h, v11.8h\n\t" + "smull v26.4s, v18.4h, v20.4h\n\t" + "smull2 v27.4s, v18.8h, v20.8h\n\t" + "smull v23.4s, v19.4h, v21.4h\n\t" + "smull2 v24.4s, v19.8h, v21.8h\n\t" + "xtn v25.4h, v23.4s\n\t" + "xtn2 v25.8h, v24.4s\n\t" + "mul v25.8h, v25.8h, v1.h[1]\n\t" + "smlsl v23.4s, v25.4h, v1.h[0]\n\t" + "smlsl2 v24.4s, v25.8h, v1.h[0]\n\t" + "shrn v22.4h, v23.4s, #16\n\t" + "shrn2 v22.8h, v24.4s, #16\n\t" + "smlal v26.4s, v22.4h, v0.4h\n\t" + "smlal2 v27.4s, v22.8h, v0.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + "shrn v22.4h, v26.4s, #16\n\t" + "shrn2 v22.8h, v27.4s, #16\n\t" + "smull v26.4s, v18.4h, v21.4h\n\t" + "smull2 v27.4s, v18.8h, v21.8h\n\t" + "smlal v26.4s, v19.4h, v20.4h\n\t" + "smlal2 v27.4s, v19.8h, v20.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + "shrn v23.4h, v26.4s, #16\n\t" + "shrn2 v23.8h, v27.4s, #16\n\t" + "zip1 v24.8h, v22.8h, v23.8h\n\t" + "zip2 v25.8h, v22.8h, v23.8h\n\t" + "stp q24, q25, [%x[r], #128]\n\t" + "ldr q0, [x3, #80]\n\t" + "uzp1 v18.8h, v4.8h, v5.8h\n\t" + "uzp2 v19.8h, v4.8h, v5.8h\n\t" + "uzp1 v20.8h, v12.8h, v13.8h\n\t" + "uzp2 v21.8h, v12.8h, v13.8h\n\t" + "smull v26.4s, v18.4h, v20.4h\n\t" + "smull2 v27.4s, v18.8h, v20.8h\n\t" + "smull v23.4s, v19.4h, v21.4h\n\t" + "smull2 v24.4s, v19.8h, v21.8h\n\t" + "xtn v25.4h, v23.4s\n\t" + "xtn2 v25.8h, v24.4s\n\t" + "mul v25.8h, v25.8h, v1.h[1]\n\t" + "smlsl v23.4s, v25.4h, v1.h[0]\n\t" + "smlsl2 v24.4s, v25.8h, v1.h[0]\n\t" + "shrn v22.4h, v23.4s, #16\n\t" + "shrn2 v22.8h, v24.4s, #16\n\t" + "smlal v26.4s, v22.4h, v0.4h\n\t" + "smlal2 v27.4s, v22.8h, v0.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + "shrn v22.4h, v26.4s, #16\n\t" + "shrn2 v22.8h, v27.4s, #16\n\t" + "smull v26.4s, v18.4h, v21.4h\n\t" + "smull2 v27.4s, v18.8h, v21.8h\n\t" + "smlal v26.4s, v19.4h, v20.4h\n\t" + "smlal2 v27.4s, v19.8h, v20.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + "shrn v23.4h, v26.4s, #16\n\t" + "shrn2 v23.8h, v27.4s, #16\n\t" + "zip1 v24.8h, v22.8h, v23.8h\n\t" + "zip2 v25.8h, v22.8h, v23.8h\n\t" + "stp q24, q25, [%x[r], #160]\n\t" + "ldr q0, [x3, #96]\n\t" + "uzp1 v18.8h, v6.8h, v7.8h\n\t" + "uzp2 v19.8h, v6.8h, v7.8h\n\t" + "uzp1 v20.8h, v14.8h, v15.8h\n\t" + "uzp2 v21.8h, v14.8h, v15.8h\n\t" + "smull v26.4s, v18.4h, v20.4h\n\t" + "smull2 v27.4s, v18.8h, v20.8h\n\t" + "smull v23.4s, v19.4h, v21.4h\n\t" + "smull2 v24.4s, v19.8h, v21.8h\n\t" + "xtn v25.4h, v23.4s\n\t" + "xtn2 v25.8h, v24.4s\n\t" + "mul v25.8h, v25.8h, v1.h[1]\n\t" + "smlsl v23.4s, v25.4h, v1.h[0]\n\t" + "smlsl2 v24.4s, v25.8h, v1.h[0]\n\t" + "shrn v22.4h, v23.4s, #16\n\t" + "shrn2 v22.8h, v24.4s, #16\n\t" + "smlal 
v26.4s, v22.4h, v0.4h\n\t" + "smlal2 v27.4s, v22.8h, v0.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + "shrn v22.4h, v26.4s, #16\n\t" + "shrn2 v22.8h, v27.4s, #16\n\t" + "smull v26.4s, v18.4h, v21.4h\n\t" + "smull2 v27.4s, v18.8h, v21.8h\n\t" + "smlal v26.4s, v19.4h, v20.4h\n\t" + "smlal2 v27.4s, v19.8h, v20.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + "shrn v23.4h, v26.4s, #16\n\t" + "shrn2 v23.8h, v27.4s, #16\n\t" + "zip1 v24.8h, v22.8h, v23.8h\n\t" + "zip2 v25.8h, v22.8h, v23.8h\n\t" + "stp q24, q25, [%x[r], #192]\n\t" + "ldr q0, [x3, #112]\n\t" + "uzp1 v18.8h, v8.8h, v9.8h\n\t" + "uzp2 v19.8h, v8.8h, v9.8h\n\t" + "uzp1 v20.8h, v16.8h, v17.8h\n\t" + "uzp2 v21.8h, v16.8h, v17.8h\n\t" + "smull v26.4s, v18.4h, v20.4h\n\t" + "smull2 v27.4s, v18.8h, v20.8h\n\t" + "smull v23.4s, v19.4h, v21.4h\n\t" + "smull2 v24.4s, v19.8h, v21.8h\n\t" + "xtn v25.4h, v23.4s\n\t" + "xtn2 v25.8h, v24.4s\n\t" + "mul v25.8h, v25.8h, v1.h[1]\n\t" + "smlsl v23.4s, v25.4h, v1.h[0]\n\t" + "smlsl2 v24.4s, v25.8h, v1.h[0]\n\t" + "shrn v22.4h, v23.4s, #16\n\t" + "shrn2 v22.8h, v24.4s, #16\n\t" + "smlal v26.4s, v22.4h, v0.4h\n\t" + "smlal2 v27.4s, v22.8h, v0.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + "shrn v22.4h, v26.4s, #16\n\t" + "shrn2 v22.8h, v27.4s, #16\n\t" + "smull v26.4s, v18.4h, v21.4h\n\t" + "smull2 v27.4s, v18.8h, v21.8h\n\t" + "smlal v26.4s, v19.4h, v20.4h\n\t" + "smlal2 v27.4s, v19.8h, v20.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + "shrn v23.4h, v26.4s, #16\n\t" + "shrn2 v23.8h, v27.4s, #16\n\t" + "zip1 v24.8h, v22.8h, v23.8h\n\t" + "zip2 v25.8h, v22.8h, v23.8h\n\t" + "stp q24, q25, [%x[r], #224]\n\t" + "ldp q2, q3, [%x[a], #256]\n\t" + "ldp q4, q5, [%x[a], #288]\n\t" + "ldp q6, q7, [%x[a], #320]\n\t" + "ldp q8, q9, [%x[a], #352]\n\t" + "ldp q10, q11, [%x[b], #256]\n\t" + "ldp q12, q13, [%x[b], #288]\n\t" + "ldp q14, q15, [%x[b], #320]\n\t" + "ldp q16, q17, [%x[b], #352]\n\t" + "ldr q0, [x3, #128]\n\t" + "uzp1 v18.8h, v2.8h, v3.8h\n\t" + "uzp2 v19.8h, v2.8h, v3.8h\n\t" + "uzp1 v20.8h, v10.8h, v11.8h\n\t" + "uzp2 v21.8h, v10.8h, v11.8h\n\t" + "smull v26.4s, v18.4h, v20.4h\n\t" + "smull2 v27.4s, v18.8h, v20.8h\n\t" + "smull v23.4s, v19.4h, v21.4h\n\t" + "smull2 v24.4s, v19.8h, v21.8h\n\t" + "xtn v25.4h, v23.4s\n\t" + "xtn2 v25.8h, v24.4s\n\t" + "mul v25.8h, v25.8h, v1.h[1]\n\t" + "smlsl v23.4s, v25.4h, v1.h[0]\n\t" + "smlsl2 v24.4s, v25.8h, v1.h[0]\n\t" + "shrn v22.4h, v23.4s, #16\n\t" + "shrn2 v22.8h, v24.4s, #16\n\t" + "smlal v26.4s, v22.4h, v0.4h\n\t" + "smlal2 v27.4s, v22.8h, v0.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + "shrn v22.4h, v26.4s, #16\n\t" + "shrn2 v22.8h, v27.4s, #16\n\t" + "smull v26.4s, v18.4h, v21.4h\n\t" + "smull2 v27.4s, v18.8h, v21.8h\n\t" + "smlal v26.4s, v19.4h, v20.4h\n\t" + "smlal2 v27.4s, v19.8h, v20.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, 
v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + "shrn v23.4h, v26.4s, #16\n\t" + "shrn2 v23.8h, v27.4s, #16\n\t" + "zip1 v24.8h, v22.8h, v23.8h\n\t" + "zip2 v25.8h, v22.8h, v23.8h\n\t" + "stp q24, q25, [%x[r], #256]\n\t" + "ldr q0, [x3, #144]\n\t" + "uzp1 v18.8h, v4.8h, v5.8h\n\t" + "uzp2 v19.8h, v4.8h, v5.8h\n\t" + "uzp1 v20.8h, v12.8h, v13.8h\n\t" + "uzp2 v21.8h, v12.8h, v13.8h\n\t" + "smull v26.4s, v18.4h, v20.4h\n\t" + "smull2 v27.4s, v18.8h, v20.8h\n\t" + "smull v23.4s, v19.4h, v21.4h\n\t" + "smull2 v24.4s, v19.8h, v21.8h\n\t" + "xtn v25.4h, v23.4s\n\t" + "xtn2 v25.8h, v24.4s\n\t" + "mul v25.8h, v25.8h, v1.h[1]\n\t" + "smlsl v23.4s, v25.4h, v1.h[0]\n\t" + "smlsl2 v24.4s, v25.8h, v1.h[0]\n\t" + "shrn v22.4h, v23.4s, #16\n\t" + "shrn2 v22.8h, v24.4s, #16\n\t" + "smlal v26.4s, v22.4h, v0.4h\n\t" + "smlal2 v27.4s, v22.8h, v0.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + "shrn v22.4h, v26.4s, #16\n\t" + "shrn2 v22.8h, v27.4s, #16\n\t" + "smull v26.4s, v18.4h, v21.4h\n\t" + "smull2 v27.4s, v18.8h, v21.8h\n\t" + "smlal v26.4s, v19.4h, v20.4h\n\t" + "smlal2 v27.4s, v19.8h, v20.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + "shrn v23.4h, v26.4s, #16\n\t" + "shrn2 v23.8h, v27.4s, #16\n\t" + "zip1 v24.8h, v22.8h, v23.8h\n\t" + "zip2 v25.8h, v22.8h, v23.8h\n\t" + "stp q24, q25, [%x[r], #288]\n\t" + "ldr q0, [x3, #160]\n\t" + "uzp1 v18.8h, v6.8h, v7.8h\n\t" + "uzp2 v19.8h, v6.8h, v7.8h\n\t" + "uzp1 v20.8h, v14.8h, v15.8h\n\t" + "uzp2 v21.8h, v14.8h, v15.8h\n\t" + "smull v26.4s, v18.4h, v20.4h\n\t" + "smull2 v27.4s, v18.8h, v20.8h\n\t" + "smull v23.4s, v19.4h, v21.4h\n\t" + "smull2 v24.4s, v19.8h, v21.8h\n\t" + "xtn v25.4h, v23.4s\n\t" + "xtn2 v25.8h, v24.4s\n\t" + "mul v25.8h, v25.8h, v1.h[1]\n\t" + "smlsl v23.4s, v25.4h, v1.h[0]\n\t" + "smlsl2 v24.4s, v25.8h, v1.h[0]\n\t" + "shrn v22.4h, v23.4s, #16\n\t" + "shrn2 v22.8h, v24.4s, #16\n\t" + "smlal v26.4s, v22.4h, v0.4h\n\t" + "smlal2 v27.4s, v22.8h, v0.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + "shrn v22.4h, v26.4s, #16\n\t" + "shrn2 v22.8h, v27.4s, #16\n\t" + "smull v26.4s, v18.4h, v21.4h\n\t" + "smull2 v27.4s, v18.8h, v21.8h\n\t" + "smlal v26.4s, v19.4h, v20.4h\n\t" + "smlal2 v27.4s, v19.8h, v20.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + "shrn v23.4h, v26.4s, #16\n\t" + "shrn2 v23.8h, v27.4s, #16\n\t" + "zip1 v24.8h, v22.8h, v23.8h\n\t" + "zip2 v25.8h, v22.8h, v23.8h\n\t" + "stp q24, q25, [%x[r], #320]\n\t" + "ldr q0, [x3, #176]\n\t" + "uzp1 v18.8h, v8.8h, v9.8h\n\t" + "uzp2 v19.8h, v8.8h, v9.8h\n\t" + "uzp1 v20.8h, v16.8h, v17.8h\n\t" + "uzp2 v21.8h, v16.8h, v17.8h\n\t" + "smull v26.4s, v18.4h, v20.4h\n\t" + "smull2 v27.4s, v18.8h, v20.8h\n\t" + "smull v23.4s, v19.4h, v21.4h\n\t" + "smull2 v24.4s, v19.8h, v21.8h\n\t" + "xtn v25.4h, v23.4s\n\t" + "xtn2 v25.8h, v24.4s\n\t" + "mul v25.8h, v25.8h, v1.h[1]\n\t" + "smlsl v23.4s, v25.4h, v1.h[0]\n\t" + "smlsl2 v24.4s, v25.8h, v1.h[0]\n\t" + "shrn v22.4h, v23.4s, #16\n\t" + "shrn2 v22.8h, v24.4s, #16\n\t" + "smlal v26.4s, v22.4h, v0.4h\n\t" + "smlal2 v27.4s, v22.8h, 
v0.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + "shrn v22.4h, v26.4s, #16\n\t" + "shrn2 v22.8h, v27.4s, #16\n\t" + "smull v26.4s, v18.4h, v21.4h\n\t" + "smull2 v27.4s, v18.8h, v21.8h\n\t" + "smlal v26.4s, v19.4h, v20.4h\n\t" + "smlal2 v27.4s, v19.8h, v20.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + "shrn v23.4h, v26.4s, #16\n\t" + "shrn2 v23.8h, v27.4s, #16\n\t" + "zip1 v24.8h, v22.8h, v23.8h\n\t" + "zip2 v25.8h, v22.8h, v23.8h\n\t" + "stp q24, q25, [%x[r], #352]\n\t" + "ldp q2, q3, [%x[a], #384]\n\t" + "ldp q4, q5, [%x[a], #416]\n\t" + "ldp q6, q7, [%x[a], #448]\n\t" + "ldp q8, q9, [%x[a], #480]\n\t" + "ldp q10, q11, [%x[b], #384]\n\t" + "ldp q12, q13, [%x[b], #416]\n\t" + "ldp q14, q15, [%x[b], #448]\n\t" + "ldp q16, q17, [%x[b], #480]\n\t" + "ldr q0, [x3, #192]\n\t" + "uzp1 v18.8h, v2.8h, v3.8h\n\t" + "uzp2 v19.8h, v2.8h, v3.8h\n\t" + "uzp1 v20.8h, v10.8h, v11.8h\n\t" + "uzp2 v21.8h, v10.8h, v11.8h\n\t" + "smull v26.4s, v18.4h, v20.4h\n\t" + "smull2 v27.4s, v18.8h, v20.8h\n\t" + "smull v23.4s, v19.4h, v21.4h\n\t" + "smull2 v24.4s, v19.8h, v21.8h\n\t" + "xtn v25.4h, v23.4s\n\t" + "xtn2 v25.8h, v24.4s\n\t" + "mul v25.8h, v25.8h, v1.h[1]\n\t" + "smlsl v23.4s, v25.4h, v1.h[0]\n\t" + "smlsl2 v24.4s, v25.8h, v1.h[0]\n\t" + "shrn v22.4h, v23.4s, #16\n\t" + "shrn2 v22.8h, v24.4s, #16\n\t" + "smlal v26.4s, v22.4h, v0.4h\n\t" + "smlal2 v27.4s, v22.8h, v0.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + "shrn v22.4h, v26.4s, #16\n\t" + "shrn2 v22.8h, v27.4s, #16\n\t" + "smull v26.4s, v18.4h, v21.4h\n\t" + "smull2 v27.4s, v18.8h, v21.8h\n\t" + "smlal v26.4s, v19.4h, v20.4h\n\t" + "smlal2 v27.4s, v19.8h, v20.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + "shrn v23.4h, v26.4s, #16\n\t" + "shrn2 v23.8h, v27.4s, #16\n\t" + "zip1 v24.8h, v22.8h, v23.8h\n\t" + "zip2 v25.8h, v22.8h, v23.8h\n\t" + "stp q24, q25, [%x[r], #384]\n\t" + "ldr q0, [x3, #208]\n\t" + "uzp1 v18.8h, v4.8h, v5.8h\n\t" + "uzp2 v19.8h, v4.8h, v5.8h\n\t" + "uzp1 v20.8h, v12.8h, v13.8h\n\t" + "uzp2 v21.8h, v12.8h, v13.8h\n\t" + "smull v26.4s, v18.4h, v20.4h\n\t" + "smull2 v27.4s, v18.8h, v20.8h\n\t" + "smull v23.4s, v19.4h, v21.4h\n\t" + "smull2 v24.4s, v19.8h, v21.8h\n\t" + "xtn v25.4h, v23.4s\n\t" + "xtn2 v25.8h, v24.4s\n\t" + "mul v25.8h, v25.8h, v1.h[1]\n\t" + "smlsl v23.4s, v25.4h, v1.h[0]\n\t" + "smlsl2 v24.4s, v25.8h, v1.h[0]\n\t" + "shrn v22.4h, v23.4s, #16\n\t" + "shrn2 v22.8h, v24.4s, #16\n\t" + "smlal v26.4s, v22.4h, v0.4h\n\t" + "smlal2 v27.4s, v22.8h, v0.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + "shrn v22.4h, v26.4s, #16\n\t" + "shrn2 v22.8h, v27.4s, #16\n\t" + "smull v26.4s, v18.4h, v21.4h\n\t" + "smull2 v27.4s, v18.8h, v21.8h\n\t" + "smlal v26.4s, v19.4h, v20.4h\n\t" + "smlal2 v27.4s, v19.8h, v20.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + 
"shrn v23.4h, v26.4s, #16\n\t" + "shrn2 v23.8h, v27.4s, #16\n\t" + "zip1 v24.8h, v22.8h, v23.8h\n\t" + "zip2 v25.8h, v22.8h, v23.8h\n\t" + "stp q24, q25, [%x[r], #416]\n\t" + "ldr q0, [x3, #224]\n\t" + "uzp1 v18.8h, v6.8h, v7.8h\n\t" + "uzp2 v19.8h, v6.8h, v7.8h\n\t" + "uzp1 v20.8h, v14.8h, v15.8h\n\t" + "uzp2 v21.8h, v14.8h, v15.8h\n\t" + "smull v26.4s, v18.4h, v20.4h\n\t" + "smull2 v27.4s, v18.8h, v20.8h\n\t" + "smull v23.4s, v19.4h, v21.4h\n\t" + "smull2 v24.4s, v19.8h, v21.8h\n\t" + "xtn v25.4h, v23.4s\n\t" + "xtn2 v25.8h, v24.4s\n\t" + "mul v25.8h, v25.8h, v1.h[1]\n\t" + "smlsl v23.4s, v25.4h, v1.h[0]\n\t" + "smlsl2 v24.4s, v25.8h, v1.h[0]\n\t" + "shrn v22.4h, v23.4s, #16\n\t" + "shrn2 v22.8h, v24.4s, #16\n\t" + "smlal v26.4s, v22.4h, v0.4h\n\t" + "smlal2 v27.4s, v22.8h, v0.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + "shrn v22.4h, v26.4s, #16\n\t" + "shrn2 v22.8h, v27.4s, #16\n\t" + "smull v26.4s, v18.4h, v21.4h\n\t" + "smull2 v27.4s, v18.8h, v21.8h\n\t" + "smlal v26.4s, v19.4h, v20.4h\n\t" + "smlal2 v27.4s, v19.8h, v20.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + "shrn v23.4h, v26.4s, #16\n\t" + "shrn2 v23.8h, v27.4s, #16\n\t" + "zip1 v24.8h, v22.8h, v23.8h\n\t" + "zip2 v25.8h, v22.8h, v23.8h\n\t" + "stp q24, q25, [%x[r], #448]\n\t" + "ldr q0, [x3, #240]\n\t" + "uzp1 v18.8h, v8.8h, v9.8h\n\t" + "uzp2 v19.8h, v8.8h, v9.8h\n\t" + "uzp1 v20.8h, v16.8h, v17.8h\n\t" + "uzp2 v21.8h, v16.8h, v17.8h\n\t" + "smull v26.4s, v18.4h, v20.4h\n\t" + "smull2 v27.4s, v18.8h, v20.8h\n\t" + "smull v23.4s, v19.4h, v21.4h\n\t" + "smull2 v24.4s, v19.8h, v21.8h\n\t" + "xtn v25.4h, v23.4s\n\t" + "xtn2 v25.8h, v24.4s\n\t" + "mul v25.8h, v25.8h, v1.h[1]\n\t" + "smlsl v23.4s, v25.4h, v1.h[0]\n\t" + "smlsl2 v24.4s, v25.8h, v1.h[0]\n\t" + "shrn v22.4h, v23.4s, #16\n\t" + "shrn2 v22.8h, v24.4s, #16\n\t" + "smlal v26.4s, v22.4h, v0.4h\n\t" + "smlal2 v27.4s, v22.8h, v0.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + "shrn v22.4h, v26.4s, #16\n\t" + "shrn2 v22.8h, v27.4s, #16\n\t" + "smull v26.4s, v18.4h, v21.4h\n\t" + "smull2 v27.4s, v18.8h, v21.8h\n\t" + "smlal v26.4s, v19.4h, v20.4h\n\t" + "smlal2 v27.4s, v19.8h, v20.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + "shrn v23.4h, v26.4s, #16\n\t" + "shrn2 v23.8h, v27.4s, #16\n\t" + "zip1 v24.8h, v22.8h, v23.8h\n\t" + "zip2 v25.8h, v22.8h, v23.8h\n\t" + "stp q24, q25, [%x[r], #480]\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + : [L_kyber_aarch64_q] "S" (L_kyber_aarch64_q), [L_kyber_aarch64_consts] "S" (L_kyber_aarch64_consts), [L_kyber_aarch64_zetas] "S" (L_kyber_aarch64_zetas), [L_kyber_aarch64_zetas_qinv] "S" (L_kyber_aarch64_zetas_qinv), [L_kyber_aarch64_zetas_inv] "S" (L_kyber_aarch64_zetas_inv), [L_kyber_aarch64_zetas_inv_qinv] "S" (L_kyber_aarch64_zetas_inv_qinv), [L_kyber_aarch64_zetas_mul] "S" (L_kyber_aarch64_zetas_mul) + : "memory", "x3", "x4", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "cc" + 
); +} + +void kyber_basemul_mont_add(sword16* r, const sword16* a, const sword16* b) +{ + __asm__ __volatile__ ( +#ifndef __APPLE__ + "adrp x3, %[L_kyber_aarch64_zetas_mul]\n\t" + "add x3, x3, :lo12:%[L_kyber_aarch64_zetas_mul]\n\t" +#else + "adrp x3, %[L_kyber_aarch64_zetas_mul]@PAGE\n\t" + "add x3, x3, %[L_kyber_aarch64_zetas_mul]@PAGEOFF\n\t" +#endif /* __APPLE__ */ +#ifndef __APPLE__ + "adrp x4, %[L_kyber_aarch64_consts]\n\t" + "add x4, x4, :lo12:%[L_kyber_aarch64_consts]\n\t" +#else + "adrp x4, %[L_kyber_aarch64_consts]@PAGE\n\t" + "add x4, x4, %[L_kyber_aarch64_consts]@PAGEOFF\n\t" +#endif /* __APPLE__ */ + "ldr q1, [x4]\n\t" + "ldp q2, q3, [%x[a]]\n\t" + "ldp q4, q5, [%x[a], #32]\n\t" + "ldp q6, q7, [%x[a], #64]\n\t" + "ldp q8, q9, [%x[a], #96]\n\t" + "ldp q10, q11, [%x[b]]\n\t" + "ldp q12, q13, [%x[b], #32]\n\t" + "ldp q14, q15, [%x[b], #64]\n\t" + "ldp q16, q17, [%x[b], #96]\n\t" + "ldp q28, q29, [%x[r]]\n\t" + "ldr q0, [x3]\n\t" + "uzp1 v18.8h, v2.8h, v3.8h\n\t" + "uzp2 v19.8h, v2.8h, v3.8h\n\t" + "uzp1 v20.8h, v10.8h, v11.8h\n\t" + "uzp2 v21.8h, v10.8h, v11.8h\n\t" + "smull v26.4s, v18.4h, v20.4h\n\t" + "smull2 v27.4s, v18.8h, v20.8h\n\t" + "smull v23.4s, v19.4h, v21.4h\n\t" + "smull2 v24.4s, v19.8h, v21.8h\n\t" + "xtn v25.4h, v23.4s\n\t" + "xtn2 v25.8h, v24.4s\n\t" + "mul v25.8h, v25.8h, v1.h[1]\n\t" + "smlsl v23.4s, v25.4h, v1.h[0]\n\t" + "smlsl2 v24.4s, v25.8h, v1.h[0]\n\t" + "shrn v22.4h, v23.4s, #16\n\t" + "shrn2 v22.8h, v24.4s, #16\n\t" + "smlal v26.4s, v22.4h, v0.4h\n\t" + "smlal2 v27.4s, v22.8h, v0.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + "shrn v22.4h, v26.4s, #16\n\t" + "shrn2 v22.8h, v27.4s, #16\n\t" + "smull v26.4s, v18.4h, v21.4h\n\t" + "smull2 v27.4s, v18.8h, v21.8h\n\t" + "smlal v26.4s, v19.4h, v20.4h\n\t" + "smlal2 v27.4s, v19.8h, v20.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + "shrn v23.4h, v26.4s, #16\n\t" + "shrn2 v23.8h, v27.4s, #16\n\t" + "zip1 v24.8h, v22.8h, v23.8h\n\t" + "zip2 v25.8h, v22.8h, v23.8h\n\t" + "add v28.8h, v28.8h, v24.8h\n\t" + "add v29.8h, v29.8h, v25.8h\n\t" + "stp q28, q29, [%x[r]]\n\t" + "ldp q28, q29, [%x[r], #32]\n\t" + "ldr q0, [x3, #16]\n\t" + "uzp1 v18.8h, v4.8h, v5.8h\n\t" + "uzp2 v19.8h, v4.8h, v5.8h\n\t" + "uzp1 v20.8h, v12.8h, v13.8h\n\t" + "uzp2 v21.8h, v12.8h, v13.8h\n\t" + "smull v26.4s, v18.4h, v20.4h\n\t" + "smull2 v27.4s, v18.8h, v20.8h\n\t" + "smull v23.4s, v19.4h, v21.4h\n\t" + "smull2 v24.4s, v19.8h, v21.8h\n\t" + "xtn v25.4h, v23.4s\n\t" + "xtn2 v25.8h, v24.4s\n\t" + "mul v25.8h, v25.8h, v1.h[1]\n\t" + "smlsl v23.4s, v25.4h, v1.h[0]\n\t" + "smlsl2 v24.4s, v25.8h, v1.h[0]\n\t" + "shrn v22.4h, v23.4s, #16\n\t" + "shrn2 v22.8h, v24.4s, #16\n\t" + "smlal v26.4s, v22.4h, v0.4h\n\t" + "smlal2 v27.4s, v22.8h, v0.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + "shrn v22.4h, v26.4s, #16\n\t" + "shrn2 v22.8h, v27.4s, #16\n\t" + "smull v26.4s, v18.4h, v21.4h\n\t" + "smull2 v27.4s, v18.8h, v21.8h\n\t" + "smlal v26.4s, v19.4h, v20.4h\n\t" + "smlal2 v27.4s, v19.8h, v20.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" 
+ "shrn v23.4h, v26.4s, #16\n\t" + "shrn2 v23.8h, v27.4s, #16\n\t" + "zip1 v24.8h, v22.8h, v23.8h\n\t" + "zip2 v25.8h, v22.8h, v23.8h\n\t" + "add v28.8h, v28.8h, v24.8h\n\t" + "add v29.8h, v29.8h, v25.8h\n\t" + "stp q28, q29, [%x[r], #32]\n\t" + "ldp q28, q29, [%x[r], #64]\n\t" + "ldr q0, [x3, #32]\n\t" + "uzp1 v18.8h, v6.8h, v7.8h\n\t" + "uzp2 v19.8h, v6.8h, v7.8h\n\t" + "uzp1 v20.8h, v14.8h, v15.8h\n\t" + "uzp2 v21.8h, v14.8h, v15.8h\n\t" + "smull v26.4s, v18.4h, v20.4h\n\t" + "smull2 v27.4s, v18.8h, v20.8h\n\t" + "smull v23.4s, v19.4h, v21.4h\n\t" + "smull2 v24.4s, v19.8h, v21.8h\n\t" + "xtn v25.4h, v23.4s\n\t" + "xtn2 v25.8h, v24.4s\n\t" + "mul v25.8h, v25.8h, v1.h[1]\n\t" + "smlsl v23.4s, v25.4h, v1.h[0]\n\t" + "smlsl2 v24.4s, v25.8h, v1.h[0]\n\t" + "shrn v22.4h, v23.4s, #16\n\t" + "shrn2 v22.8h, v24.4s, #16\n\t" + "smlal v26.4s, v22.4h, v0.4h\n\t" + "smlal2 v27.4s, v22.8h, v0.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + "shrn v22.4h, v26.4s, #16\n\t" + "shrn2 v22.8h, v27.4s, #16\n\t" + "smull v26.4s, v18.4h, v21.4h\n\t" + "smull2 v27.4s, v18.8h, v21.8h\n\t" + "smlal v26.4s, v19.4h, v20.4h\n\t" + "smlal2 v27.4s, v19.8h, v20.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + "shrn v23.4h, v26.4s, #16\n\t" + "shrn2 v23.8h, v27.4s, #16\n\t" + "zip1 v24.8h, v22.8h, v23.8h\n\t" + "zip2 v25.8h, v22.8h, v23.8h\n\t" + "add v28.8h, v28.8h, v24.8h\n\t" + "add v29.8h, v29.8h, v25.8h\n\t" + "stp q28, q29, [%x[r], #64]\n\t" + "ldp q28, q29, [%x[r], #96]\n\t" + "ldr q0, [x3, #48]\n\t" + "uzp1 v18.8h, v8.8h, v9.8h\n\t" + "uzp2 v19.8h, v8.8h, v9.8h\n\t" + "uzp1 v20.8h, v16.8h, v17.8h\n\t" + "uzp2 v21.8h, v16.8h, v17.8h\n\t" + "smull v26.4s, v18.4h, v20.4h\n\t" + "smull2 v27.4s, v18.8h, v20.8h\n\t" + "smull v23.4s, v19.4h, v21.4h\n\t" + "smull2 v24.4s, v19.8h, v21.8h\n\t" + "xtn v25.4h, v23.4s\n\t" + "xtn2 v25.8h, v24.4s\n\t" + "mul v25.8h, v25.8h, v1.h[1]\n\t" + "smlsl v23.4s, v25.4h, v1.h[0]\n\t" + "smlsl2 v24.4s, v25.8h, v1.h[0]\n\t" + "shrn v22.4h, v23.4s, #16\n\t" + "shrn2 v22.8h, v24.4s, #16\n\t" + "smlal v26.4s, v22.4h, v0.4h\n\t" + "smlal2 v27.4s, v22.8h, v0.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + "shrn v22.4h, v26.4s, #16\n\t" + "shrn2 v22.8h, v27.4s, #16\n\t" + "smull v26.4s, v18.4h, v21.4h\n\t" + "smull2 v27.4s, v18.8h, v21.8h\n\t" + "smlal v26.4s, v19.4h, v20.4h\n\t" + "smlal2 v27.4s, v19.8h, v20.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + "shrn v23.4h, v26.4s, #16\n\t" + "shrn2 v23.8h, v27.4s, #16\n\t" + "zip1 v24.8h, v22.8h, v23.8h\n\t" + "zip2 v25.8h, v22.8h, v23.8h\n\t" + "add v28.8h, v28.8h, v24.8h\n\t" + "add v29.8h, v29.8h, v25.8h\n\t" + "stp q28, q29, [%x[r], #96]\n\t" + "ldp q2, q3, [%x[a], #128]\n\t" + "ldp q4, q5, [%x[a], #160]\n\t" + "ldp q6, q7, [%x[a], #192]\n\t" + "ldp q8, q9, [%x[a], #224]\n\t" + "ldp q10, q11, [%x[b], #128]\n\t" + "ldp q12, q13, [%x[b], #160]\n\t" + "ldp q14, q15, [%x[b], #192]\n\t" + "ldp q16, q17, [%x[b], #224]\n\t" + "ldp q28, q29, [%x[r], #128]\n\t" + "ldr q0, [x3, #64]\n\t" + "uzp1 v18.8h, v2.8h, v3.8h\n\t" + "uzp2 v19.8h, v2.8h, 
v3.8h\n\t" + "uzp1 v20.8h, v10.8h, v11.8h\n\t" + "uzp2 v21.8h, v10.8h, v11.8h\n\t" + "smull v26.4s, v18.4h, v20.4h\n\t" + "smull2 v27.4s, v18.8h, v20.8h\n\t" + "smull v23.4s, v19.4h, v21.4h\n\t" + "smull2 v24.4s, v19.8h, v21.8h\n\t" + "xtn v25.4h, v23.4s\n\t" + "xtn2 v25.8h, v24.4s\n\t" + "mul v25.8h, v25.8h, v1.h[1]\n\t" + "smlsl v23.4s, v25.4h, v1.h[0]\n\t" + "smlsl2 v24.4s, v25.8h, v1.h[0]\n\t" + "shrn v22.4h, v23.4s, #16\n\t" + "shrn2 v22.8h, v24.4s, #16\n\t" + "smlal v26.4s, v22.4h, v0.4h\n\t" + "smlal2 v27.4s, v22.8h, v0.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + "shrn v22.4h, v26.4s, #16\n\t" + "shrn2 v22.8h, v27.4s, #16\n\t" + "smull v26.4s, v18.4h, v21.4h\n\t" + "smull2 v27.4s, v18.8h, v21.8h\n\t" + "smlal v26.4s, v19.4h, v20.4h\n\t" + "smlal2 v27.4s, v19.8h, v20.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + "shrn v23.4h, v26.4s, #16\n\t" + "shrn2 v23.8h, v27.4s, #16\n\t" + "zip1 v24.8h, v22.8h, v23.8h\n\t" + "zip2 v25.8h, v22.8h, v23.8h\n\t" + "add v28.8h, v28.8h, v24.8h\n\t" + "add v29.8h, v29.8h, v25.8h\n\t" + "stp q28, q29, [%x[r], #128]\n\t" + "ldp q28, q29, [%x[r], #160]\n\t" + "ldr q0, [x3, #80]\n\t" + "uzp1 v18.8h, v4.8h, v5.8h\n\t" + "uzp2 v19.8h, v4.8h, v5.8h\n\t" + "uzp1 v20.8h, v12.8h, v13.8h\n\t" + "uzp2 v21.8h, v12.8h, v13.8h\n\t" + "smull v26.4s, v18.4h, v20.4h\n\t" + "smull2 v27.4s, v18.8h, v20.8h\n\t" + "smull v23.4s, v19.4h, v21.4h\n\t" + "smull2 v24.4s, v19.8h, v21.8h\n\t" + "xtn v25.4h, v23.4s\n\t" + "xtn2 v25.8h, v24.4s\n\t" + "mul v25.8h, v25.8h, v1.h[1]\n\t" + "smlsl v23.4s, v25.4h, v1.h[0]\n\t" + "smlsl2 v24.4s, v25.8h, v1.h[0]\n\t" + "shrn v22.4h, v23.4s, #16\n\t" + "shrn2 v22.8h, v24.4s, #16\n\t" + "smlal v26.4s, v22.4h, v0.4h\n\t" + "smlal2 v27.4s, v22.8h, v0.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + "shrn v22.4h, v26.4s, #16\n\t" + "shrn2 v22.8h, v27.4s, #16\n\t" + "smull v26.4s, v18.4h, v21.4h\n\t" + "smull2 v27.4s, v18.8h, v21.8h\n\t" + "smlal v26.4s, v19.4h, v20.4h\n\t" + "smlal2 v27.4s, v19.8h, v20.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + "shrn v23.4h, v26.4s, #16\n\t" + "shrn2 v23.8h, v27.4s, #16\n\t" + "zip1 v24.8h, v22.8h, v23.8h\n\t" + "zip2 v25.8h, v22.8h, v23.8h\n\t" + "add v28.8h, v28.8h, v24.8h\n\t" + "add v29.8h, v29.8h, v25.8h\n\t" + "stp q28, q29, [%x[r], #160]\n\t" + "ldp q28, q29, [%x[r], #192]\n\t" + "ldr q0, [x3, #96]\n\t" + "uzp1 v18.8h, v6.8h, v7.8h\n\t" + "uzp2 v19.8h, v6.8h, v7.8h\n\t" + "uzp1 v20.8h, v14.8h, v15.8h\n\t" + "uzp2 v21.8h, v14.8h, v15.8h\n\t" + "smull v26.4s, v18.4h, v20.4h\n\t" + "smull2 v27.4s, v18.8h, v20.8h\n\t" + "smull v23.4s, v19.4h, v21.4h\n\t" + "smull2 v24.4s, v19.8h, v21.8h\n\t" + "xtn v25.4h, v23.4s\n\t" + "xtn2 v25.8h, v24.4s\n\t" + "mul v25.8h, v25.8h, v1.h[1]\n\t" + "smlsl v23.4s, v25.4h, v1.h[0]\n\t" + "smlsl2 v24.4s, v25.8h, v1.h[0]\n\t" + "shrn v22.4h, v23.4s, #16\n\t" + "shrn2 v22.8h, v24.4s, #16\n\t" + "smlal v26.4s, v22.4h, v0.4h\n\t" + "smlal2 v27.4s, v22.8h, v0.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + 
"smlsl v26.4s, v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + "shrn v22.4h, v26.4s, #16\n\t" + "shrn2 v22.8h, v27.4s, #16\n\t" + "smull v26.4s, v18.4h, v21.4h\n\t" + "smull2 v27.4s, v18.8h, v21.8h\n\t" + "smlal v26.4s, v19.4h, v20.4h\n\t" + "smlal2 v27.4s, v19.8h, v20.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + "shrn v23.4h, v26.4s, #16\n\t" + "shrn2 v23.8h, v27.4s, #16\n\t" + "zip1 v24.8h, v22.8h, v23.8h\n\t" + "zip2 v25.8h, v22.8h, v23.8h\n\t" + "add v28.8h, v28.8h, v24.8h\n\t" + "add v29.8h, v29.8h, v25.8h\n\t" + "stp q28, q29, [%x[r], #192]\n\t" + "ldp q28, q29, [%x[r], #224]\n\t" + "ldr q0, [x3, #112]\n\t" + "uzp1 v18.8h, v8.8h, v9.8h\n\t" + "uzp2 v19.8h, v8.8h, v9.8h\n\t" + "uzp1 v20.8h, v16.8h, v17.8h\n\t" + "uzp2 v21.8h, v16.8h, v17.8h\n\t" + "smull v26.4s, v18.4h, v20.4h\n\t" + "smull2 v27.4s, v18.8h, v20.8h\n\t" + "smull v23.4s, v19.4h, v21.4h\n\t" + "smull2 v24.4s, v19.8h, v21.8h\n\t" + "xtn v25.4h, v23.4s\n\t" + "xtn2 v25.8h, v24.4s\n\t" + "mul v25.8h, v25.8h, v1.h[1]\n\t" + "smlsl v23.4s, v25.4h, v1.h[0]\n\t" + "smlsl2 v24.4s, v25.8h, v1.h[0]\n\t" + "shrn v22.4h, v23.4s, #16\n\t" + "shrn2 v22.8h, v24.4s, #16\n\t" + "smlal v26.4s, v22.4h, v0.4h\n\t" + "smlal2 v27.4s, v22.8h, v0.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + "shrn v22.4h, v26.4s, #16\n\t" + "shrn2 v22.8h, v27.4s, #16\n\t" + "smull v26.4s, v18.4h, v21.4h\n\t" + "smull2 v27.4s, v18.8h, v21.8h\n\t" + "smlal v26.4s, v19.4h, v20.4h\n\t" + "smlal2 v27.4s, v19.8h, v20.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + "shrn v23.4h, v26.4s, #16\n\t" + "shrn2 v23.8h, v27.4s, #16\n\t" + "zip1 v24.8h, v22.8h, v23.8h\n\t" + "zip2 v25.8h, v22.8h, v23.8h\n\t" + "add v28.8h, v28.8h, v24.8h\n\t" + "add v29.8h, v29.8h, v25.8h\n\t" + "stp q28, q29, [%x[r], #224]\n\t" + "ldp q2, q3, [%x[a], #256]\n\t" + "ldp q4, q5, [%x[a], #288]\n\t" + "ldp q6, q7, [%x[a], #320]\n\t" + "ldp q8, q9, [%x[a], #352]\n\t" + "ldp q10, q11, [%x[b], #256]\n\t" + "ldp q12, q13, [%x[b], #288]\n\t" + "ldp q14, q15, [%x[b], #320]\n\t" + "ldp q16, q17, [%x[b], #352]\n\t" + "ldp q28, q29, [%x[r], #256]\n\t" + "ldr q0, [x3, #128]\n\t" + "uzp1 v18.8h, v2.8h, v3.8h\n\t" + "uzp2 v19.8h, v2.8h, v3.8h\n\t" + "uzp1 v20.8h, v10.8h, v11.8h\n\t" + "uzp2 v21.8h, v10.8h, v11.8h\n\t" + "smull v26.4s, v18.4h, v20.4h\n\t" + "smull2 v27.4s, v18.8h, v20.8h\n\t" + "smull v23.4s, v19.4h, v21.4h\n\t" + "smull2 v24.4s, v19.8h, v21.8h\n\t" + "xtn v25.4h, v23.4s\n\t" + "xtn2 v25.8h, v24.4s\n\t" + "mul v25.8h, v25.8h, v1.h[1]\n\t" + "smlsl v23.4s, v25.4h, v1.h[0]\n\t" + "smlsl2 v24.4s, v25.8h, v1.h[0]\n\t" + "shrn v22.4h, v23.4s, #16\n\t" + "shrn2 v22.8h, v24.4s, #16\n\t" + "smlal v26.4s, v22.4h, v0.4h\n\t" + "smlal2 v27.4s, v22.8h, v0.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + "shrn v22.4h, v26.4s, #16\n\t" + "shrn2 v22.8h, v27.4s, #16\n\t" + "smull v26.4s, v18.4h, v21.4h\n\t" + "smull2 v27.4s, v18.8h, v21.8h\n\t" + "smlal v26.4s, v19.4h, v20.4h\n\t" + "smlal2 v27.4s, v19.8h, v20.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul 
v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + "shrn v23.4h, v26.4s, #16\n\t" + "shrn2 v23.8h, v27.4s, #16\n\t" + "zip1 v24.8h, v22.8h, v23.8h\n\t" + "zip2 v25.8h, v22.8h, v23.8h\n\t" + "add v28.8h, v28.8h, v24.8h\n\t" + "add v29.8h, v29.8h, v25.8h\n\t" + "stp q28, q29, [%x[r], #256]\n\t" + "ldp q28, q29, [%x[r], #288]\n\t" + "ldr q0, [x3, #144]\n\t" + "uzp1 v18.8h, v4.8h, v5.8h\n\t" + "uzp2 v19.8h, v4.8h, v5.8h\n\t" + "uzp1 v20.8h, v12.8h, v13.8h\n\t" + "uzp2 v21.8h, v12.8h, v13.8h\n\t" + "smull v26.4s, v18.4h, v20.4h\n\t" + "smull2 v27.4s, v18.8h, v20.8h\n\t" + "smull v23.4s, v19.4h, v21.4h\n\t" + "smull2 v24.4s, v19.8h, v21.8h\n\t" + "xtn v25.4h, v23.4s\n\t" + "xtn2 v25.8h, v24.4s\n\t" + "mul v25.8h, v25.8h, v1.h[1]\n\t" + "smlsl v23.4s, v25.4h, v1.h[0]\n\t" + "smlsl2 v24.4s, v25.8h, v1.h[0]\n\t" + "shrn v22.4h, v23.4s, #16\n\t" + "shrn2 v22.8h, v24.4s, #16\n\t" + "smlal v26.4s, v22.4h, v0.4h\n\t" + "smlal2 v27.4s, v22.8h, v0.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + "shrn v22.4h, v26.4s, #16\n\t" + "shrn2 v22.8h, v27.4s, #16\n\t" + "smull v26.4s, v18.4h, v21.4h\n\t" + "smull2 v27.4s, v18.8h, v21.8h\n\t" + "smlal v26.4s, v19.4h, v20.4h\n\t" + "smlal2 v27.4s, v19.8h, v20.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + "shrn v23.4h, v26.4s, #16\n\t" + "shrn2 v23.8h, v27.4s, #16\n\t" + "zip1 v24.8h, v22.8h, v23.8h\n\t" + "zip2 v25.8h, v22.8h, v23.8h\n\t" + "add v28.8h, v28.8h, v24.8h\n\t" + "add v29.8h, v29.8h, v25.8h\n\t" + "stp q28, q29, [%x[r], #288]\n\t" + "ldp q28, q29, [%x[r], #320]\n\t" + "ldr q0, [x3, #160]\n\t" + "uzp1 v18.8h, v6.8h, v7.8h\n\t" + "uzp2 v19.8h, v6.8h, v7.8h\n\t" + "uzp1 v20.8h, v14.8h, v15.8h\n\t" + "uzp2 v21.8h, v14.8h, v15.8h\n\t" + "smull v26.4s, v18.4h, v20.4h\n\t" + "smull2 v27.4s, v18.8h, v20.8h\n\t" + "smull v23.4s, v19.4h, v21.4h\n\t" + "smull2 v24.4s, v19.8h, v21.8h\n\t" + "xtn v25.4h, v23.4s\n\t" + "xtn2 v25.8h, v24.4s\n\t" + "mul v25.8h, v25.8h, v1.h[1]\n\t" + "smlsl v23.4s, v25.4h, v1.h[0]\n\t" + "smlsl2 v24.4s, v25.8h, v1.h[0]\n\t" + "shrn v22.4h, v23.4s, #16\n\t" + "shrn2 v22.8h, v24.4s, #16\n\t" + "smlal v26.4s, v22.4h, v0.4h\n\t" + "smlal2 v27.4s, v22.8h, v0.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + "shrn v22.4h, v26.4s, #16\n\t" + "shrn2 v22.8h, v27.4s, #16\n\t" + "smull v26.4s, v18.4h, v21.4h\n\t" + "smull2 v27.4s, v18.8h, v21.8h\n\t" + "smlal v26.4s, v19.4h, v20.4h\n\t" + "smlal2 v27.4s, v19.8h, v20.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + "shrn v23.4h, v26.4s, #16\n\t" + "shrn2 v23.8h, v27.4s, #16\n\t" + "zip1 v24.8h, v22.8h, v23.8h\n\t" + "zip2 v25.8h, v22.8h, v23.8h\n\t" + "add v28.8h, v28.8h, v24.8h\n\t" + "add v29.8h, v29.8h, v25.8h\n\t" + "stp q28, q29, [%x[r], #320]\n\t" + "ldp q28, q29, [%x[r], #352]\n\t" + "ldr q0, [x3, #176]\n\t" + "uzp1 v18.8h, v8.8h, v9.8h\n\t" + "uzp2 v19.8h, v8.8h, v9.8h\n\t" + "uzp1 v20.8h, v16.8h, v17.8h\n\t" + "uzp2 v21.8h, v16.8h, v17.8h\n\t" + "smull v26.4s, v18.4h, v20.4h\n\t" + "smull2 v27.4s, v18.8h, v20.8h\n\t" + 
"smull v23.4s, v19.4h, v21.4h\n\t" + "smull2 v24.4s, v19.8h, v21.8h\n\t" + "xtn v25.4h, v23.4s\n\t" + "xtn2 v25.8h, v24.4s\n\t" + "mul v25.8h, v25.8h, v1.h[1]\n\t" + "smlsl v23.4s, v25.4h, v1.h[0]\n\t" + "smlsl2 v24.4s, v25.8h, v1.h[0]\n\t" + "shrn v22.4h, v23.4s, #16\n\t" + "shrn2 v22.8h, v24.4s, #16\n\t" + "smlal v26.4s, v22.4h, v0.4h\n\t" + "smlal2 v27.4s, v22.8h, v0.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + "shrn v22.4h, v26.4s, #16\n\t" + "shrn2 v22.8h, v27.4s, #16\n\t" + "smull v26.4s, v18.4h, v21.4h\n\t" + "smull2 v27.4s, v18.8h, v21.8h\n\t" + "smlal v26.4s, v19.4h, v20.4h\n\t" + "smlal2 v27.4s, v19.8h, v20.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + "shrn v23.4h, v26.4s, #16\n\t" + "shrn2 v23.8h, v27.4s, #16\n\t" + "zip1 v24.8h, v22.8h, v23.8h\n\t" + "zip2 v25.8h, v22.8h, v23.8h\n\t" + "add v28.8h, v28.8h, v24.8h\n\t" + "add v29.8h, v29.8h, v25.8h\n\t" + "stp q28, q29, [%x[r], #352]\n\t" + "ldp q2, q3, [%x[a], #384]\n\t" + "ldp q4, q5, [%x[a], #416]\n\t" + "ldp q6, q7, [%x[a], #448]\n\t" + "ldp q8, q9, [%x[a], #480]\n\t" + "ldp q10, q11, [%x[b], #384]\n\t" + "ldp q12, q13, [%x[b], #416]\n\t" + "ldp q14, q15, [%x[b], #448]\n\t" + "ldp q16, q17, [%x[b], #480]\n\t" + "ldp q28, q29, [%x[r], #384]\n\t" + "ldr q0, [x3, #192]\n\t" + "uzp1 v18.8h, v2.8h, v3.8h\n\t" + "uzp2 v19.8h, v2.8h, v3.8h\n\t" + "uzp1 v20.8h, v10.8h, v11.8h\n\t" + "uzp2 v21.8h, v10.8h, v11.8h\n\t" + "smull v26.4s, v18.4h, v20.4h\n\t" + "smull2 v27.4s, v18.8h, v20.8h\n\t" + "smull v23.4s, v19.4h, v21.4h\n\t" + "smull2 v24.4s, v19.8h, v21.8h\n\t" + "xtn v25.4h, v23.4s\n\t" + "xtn2 v25.8h, v24.4s\n\t" + "mul v25.8h, v25.8h, v1.h[1]\n\t" + "smlsl v23.4s, v25.4h, v1.h[0]\n\t" + "smlsl2 v24.4s, v25.8h, v1.h[0]\n\t" + "shrn v22.4h, v23.4s, #16\n\t" + "shrn2 v22.8h, v24.4s, #16\n\t" + "smlal v26.4s, v22.4h, v0.4h\n\t" + "smlal2 v27.4s, v22.8h, v0.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + "shrn v22.4h, v26.4s, #16\n\t" + "shrn2 v22.8h, v27.4s, #16\n\t" + "smull v26.4s, v18.4h, v21.4h\n\t" + "smull2 v27.4s, v18.8h, v21.8h\n\t" + "smlal v26.4s, v19.4h, v20.4h\n\t" + "smlal2 v27.4s, v19.8h, v20.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + "shrn v23.4h, v26.4s, #16\n\t" + "shrn2 v23.8h, v27.4s, #16\n\t" + "zip1 v24.8h, v22.8h, v23.8h\n\t" + "zip2 v25.8h, v22.8h, v23.8h\n\t" + "add v28.8h, v28.8h, v24.8h\n\t" + "add v29.8h, v29.8h, v25.8h\n\t" + "stp q28, q29, [%x[r], #384]\n\t" + "ldp q28, q29, [%x[r], #416]\n\t" + "ldr q0, [x3, #208]\n\t" + "uzp1 v18.8h, v4.8h, v5.8h\n\t" + "uzp2 v19.8h, v4.8h, v5.8h\n\t" + "uzp1 v20.8h, v12.8h, v13.8h\n\t" + "uzp2 v21.8h, v12.8h, v13.8h\n\t" + "smull v26.4s, v18.4h, v20.4h\n\t" + "smull2 v27.4s, v18.8h, v20.8h\n\t" + "smull v23.4s, v19.4h, v21.4h\n\t" + "smull2 v24.4s, v19.8h, v21.8h\n\t" + "xtn v25.4h, v23.4s\n\t" + "xtn2 v25.8h, v24.4s\n\t" + "mul v25.8h, v25.8h, v1.h[1]\n\t" + "smlsl v23.4s, v25.4h, v1.h[0]\n\t" + "smlsl2 v24.4s, v25.8h, v1.h[0]\n\t" + "shrn v22.4h, v23.4s, #16\n\t" + "shrn2 v22.8h, v24.4s, #16\n\t" + "smlal v26.4s, v22.4h, v0.4h\n\t" + "smlal2 
v27.4s, v22.8h, v0.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + "shrn v22.4h, v26.4s, #16\n\t" + "shrn2 v22.8h, v27.4s, #16\n\t" + "smull v26.4s, v18.4h, v21.4h\n\t" + "smull2 v27.4s, v18.8h, v21.8h\n\t" + "smlal v26.4s, v19.4h, v20.4h\n\t" + "smlal2 v27.4s, v19.8h, v20.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + "shrn v23.4h, v26.4s, #16\n\t" + "shrn2 v23.8h, v27.4s, #16\n\t" + "zip1 v24.8h, v22.8h, v23.8h\n\t" + "zip2 v25.8h, v22.8h, v23.8h\n\t" + "add v28.8h, v28.8h, v24.8h\n\t" + "add v29.8h, v29.8h, v25.8h\n\t" + "stp q28, q29, [%x[r], #416]\n\t" + "ldp q28, q29, [%x[r], #448]\n\t" + "ldr q0, [x3, #224]\n\t" + "uzp1 v18.8h, v6.8h, v7.8h\n\t" + "uzp2 v19.8h, v6.8h, v7.8h\n\t" + "uzp1 v20.8h, v14.8h, v15.8h\n\t" + "uzp2 v21.8h, v14.8h, v15.8h\n\t" + "smull v26.4s, v18.4h, v20.4h\n\t" + "smull2 v27.4s, v18.8h, v20.8h\n\t" + "smull v23.4s, v19.4h, v21.4h\n\t" + "smull2 v24.4s, v19.8h, v21.8h\n\t" + "xtn v25.4h, v23.4s\n\t" + "xtn2 v25.8h, v24.4s\n\t" + "mul v25.8h, v25.8h, v1.h[1]\n\t" + "smlsl v23.4s, v25.4h, v1.h[0]\n\t" + "smlsl2 v24.4s, v25.8h, v1.h[0]\n\t" + "shrn v22.4h, v23.4s, #16\n\t" + "shrn2 v22.8h, v24.4s, #16\n\t" + "smlal v26.4s, v22.4h, v0.4h\n\t" + "smlal2 v27.4s, v22.8h, v0.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + "shrn v22.4h, v26.4s, #16\n\t" + "shrn2 v22.8h, v27.4s, #16\n\t" + "smull v26.4s, v18.4h, v21.4h\n\t" + "smull2 v27.4s, v18.8h, v21.8h\n\t" + "smlal v26.4s, v19.4h, v20.4h\n\t" + "smlal2 v27.4s, v19.8h, v20.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + "shrn v23.4h, v26.4s, #16\n\t" + "shrn2 v23.8h, v27.4s, #16\n\t" + "zip1 v24.8h, v22.8h, v23.8h\n\t" + "zip2 v25.8h, v22.8h, v23.8h\n\t" + "add v28.8h, v28.8h, v24.8h\n\t" + "add v29.8h, v29.8h, v25.8h\n\t" + "stp q28, q29, [%x[r], #448]\n\t" + "ldp q28, q29, [%x[r], #480]\n\t" + "ldr q0, [x3, #240]\n\t" + "uzp1 v18.8h, v8.8h, v9.8h\n\t" + "uzp2 v19.8h, v8.8h, v9.8h\n\t" + "uzp1 v20.8h, v16.8h, v17.8h\n\t" + "uzp2 v21.8h, v16.8h, v17.8h\n\t" + "smull v26.4s, v18.4h, v20.4h\n\t" + "smull2 v27.4s, v18.8h, v20.8h\n\t" + "smull v23.4s, v19.4h, v21.4h\n\t" + "smull2 v24.4s, v19.8h, v21.8h\n\t" + "xtn v25.4h, v23.4s\n\t" + "xtn2 v25.8h, v24.4s\n\t" + "mul v25.8h, v25.8h, v1.h[1]\n\t" + "smlsl v23.4s, v25.4h, v1.h[0]\n\t" + "smlsl2 v24.4s, v25.8h, v1.h[0]\n\t" + "shrn v22.4h, v23.4s, #16\n\t" + "shrn2 v22.8h, v24.4s, #16\n\t" + "smlal v26.4s, v22.4h, v0.4h\n\t" + "smlal2 v27.4s, v22.8h, v0.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + "shrn v22.4h, v26.4s, #16\n\t" + "shrn2 v22.8h, v27.4s, #16\n\t" + "smull v26.4s, v18.4h, v21.4h\n\t" + "smull2 v27.4s, v18.8h, v21.8h\n\t" + "smlal v26.4s, v19.4h, v20.4h\n\t" + "smlal2 v27.4s, v19.8h, v20.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + "shrn v23.4h, v26.4s, #16\n\t" + "shrn2 v23.8h, 
v27.4s, #16\n\t" + "zip1 v24.8h, v22.8h, v23.8h\n\t" + "zip2 v25.8h, v22.8h, v23.8h\n\t" + "add v28.8h, v28.8h, v24.8h\n\t" + "add v29.8h, v29.8h, v25.8h\n\t" + "stp q28, q29, [%x[r], #480]\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + : [L_kyber_aarch64_q] "S" (L_kyber_aarch64_q), [L_kyber_aarch64_consts] "S" (L_kyber_aarch64_consts), [L_kyber_aarch64_zetas] "S" (L_kyber_aarch64_zetas), [L_kyber_aarch64_zetas_qinv] "S" (L_kyber_aarch64_zetas_qinv), [L_kyber_aarch64_zetas_inv] "S" (L_kyber_aarch64_zetas_inv), [L_kyber_aarch64_zetas_inv_qinv] "S" (L_kyber_aarch64_zetas_inv_qinv), [L_kyber_aarch64_zetas_mul] "S" (L_kyber_aarch64_zetas_mul) + : "memory", "x3", "x4", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "cc" + ); +} + +void kyber_csubq_neon(sword16* p) +{ + __asm__ __volatile__ ( +#ifndef __APPLE__ + "adrp x1, %[L_kyber_aarch64_q]\n\t" + "add x1, x1, :lo12:%[L_kyber_aarch64_q]\n\t" +#else + "adrp x1, %[L_kyber_aarch64_q]@PAGE\n\t" + "add x1, x1, %[L_kyber_aarch64_q]@PAGEOFF\n\t" +#endif /* __APPLE__ */ + "ldr q20, [x1]\n\t" + "ld4 {v0.8h, v1.8h, v2.8h, v3.8h}, [%x[p]], #0x40\n\t" + "ld4 {v4.8h, v5.8h, v6.8h, v7.8h}, [%x[p]], #0x40\n\t" + "ld4 {v8.8h, v9.8h, v10.8h, v11.8h}, [%x[p]], #0x40\n\t" + "ld4 {v12.8h, v13.8h, v14.8h, v15.8h}, [%x[p]], #0x40\n\t" + "sub %x[p], %x[p], #0x100\n\t" + "sub v0.8h, v0.8h, v20.8h\n\t" + "sub v1.8h, v1.8h, v20.8h\n\t" + "sub v2.8h, v2.8h, v20.8h\n\t" + "sub v3.8h, v3.8h, v20.8h\n\t" + "sub v4.8h, v4.8h, v20.8h\n\t" + "sub v5.8h, v5.8h, v20.8h\n\t" + "sub v6.8h, v6.8h, v20.8h\n\t" + "sub v7.8h, v7.8h, v20.8h\n\t" + "sub v8.8h, v8.8h, v20.8h\n\t" + "sub v9.8h, v9.8h, v20.8h\n\t" + "sub v10.8h, v10.8h, v20.8h\n\t" + "sub v11.8h, v11.8h, v20.8h\n\t" + "sub v12.8h, v12.8h, v20.8h\n\t" + "sub v13.8h, v13.8h, v20.8h\n\t" + "sub v14.8h, v14.8h, v20.8h\n\t" + "sub v15.8h, v15.8h, v20.8h\n\t" + "sshr v16.8h, v0.8h, #15\n\t" + "sshr v17.8h, v1.8h, #15\n\t" + "sshr v18.8h, v2.8h, #15\n\t" + "sshr v19.8h, v3.8h, #15\n\t" + "and v16.16b, v16.16b, v20.16b\n\t" + "and v17.16b, v17.16b, v20.16b\n\t" + "and v18.16b, v18.16b, v20.16b\n\t" + "and v19.16b, v19.16b, v20.16b\n\t" + "add v0.8h, v0.8h, v16.8h\n\t" + "add v1.8h, v1.8h, v17.8h\n\t" + "add v2.8h, v2.8h, v18.8h\n\t" + "add v3.8h, v3.8h, v19.8h\n\t" + "sshr v16.8h, v4.8h, #15\n\t" + "sshr v17.8h, v5.8h, #15\n\t" + "sshr v18.8h, v6.8h, #15\n\t" + "sshr v19.8h, v7.8h, #15\n\t" + "and v16.16b, v16.16b, v20.16b\n\t" + "and v17.16b, v17.16b, v20.16b\n\t" + "and v18.16b, v18.16b, v20.16b\n\t" + "and v19.16b, v19.16b, v20.16b\n\t" + "add v4.8h, v4.8h, v16.8h\n\t" + "add v5.8h, v5.8h, v17.8h\n\t" + "add v6.8h, v6.8h, v18.8h\n\t" + "add v7.8h, v7.8h, v19.8h\n\t" + "sshr v16.8h, v8.8h, #15\n\t" + "sshr v17.8h, v9.8h, #15\n\t" + "sshr v18.8h, v10.8h, #15\n\t" + "sshr v19.8h, v11.8h, #15\n\t" + "and v16.16b, v16.16b, v20.16b\n\t" + "and v17.16b, v17.16b, v20.16b\n\t" + "and v18.16b, v18.16b, v20.16b\n\t" + "and v19.16b, v19.16b, v20.16b\n\t" + "add v8.8h, v8.8h, v16.8h\n\t" + "add v9.8h, v9.8h, v17.8h\n\t" + "add v10.8h, v10.8h, v18.8h\n\t" + "add v11.8h, v11.8h, v19.8h\n\t" + "sshr v16.8h, v12.8h, #15\n\t" + "sshr v17.8h, v13.8h, #15\n\t" + "sshr v18.8h, v14.8h, #15\n\t" + "sshr v19.8h, v15.8h, #15\n\t" + "and v16.16b, v16.16b, v20.16b\n\t" + "and v17.16b, v17.16b, v20.16b\n\t" + "and v18.16b, v18.16b, v20.16b\n\t" + "and v19.16b, v19.16b, v20.16b\n\t" + "add v12.8h, 
v12.8h, v16.8h\n\t" + "add v13.8h, v13.8h, v17.8h\n\t" + "add v14.8h, v14.8h, v18.8h\n\t" + "add v15.8h, v15.8h, v19.8h\n\t" + "st4 {v0.8h, v1.8h, v2.8h, v3.8h}, [%x[p]], #0x40\n\t" + "st4 {v4.8h, v5.8h, v6.8h, v7.8h}, [%x[p]], #0x40\n\t" + "st4 {v8.8h, v9.8h, v10.8h, v11.8h}, [%x[p]], #0x40\n\t" + "st4 {v12.8h, v13.8h, v14.8h, v15.8h}, [%x[p]], #0x40\n\t" + "ld4 {v0.8h, v1.8h, v2.8h, v3.8h}, [%x[p]], #0x40\n\t" + "ld4 {v4.8h, v5.8h, v6.8h, v7.8h}, [%x[p]], #0x40\n\t" + "ld4 {v8.8h, v9.8h, v10.8h, v11.8h}, [%x[p]], #0x40\n\t" + "ld4 {v12.8h, v13.8h, v14.8h, v15.8h}, [%x[p]], #0x40\n\t" + "sub %x[p], %x[p], #0x100\n\t" + "sub v0.8h, v0.8h, v20.8h\n\t" + "sub v1.8h, v1.8h, v20.8h\n\t" + "sub v2.8h, v2.8h, v20.8h\n\t" + "sub v3.8h, v3.8h, v20.8h\n\t" + "sub v4.8h, v4.8h, v20.8h\n\t" + "sub v5.8h, v5.8h, v20.8h\n\t" + "sub v6.8h, v6.8h, v20.8h\n\t" + "sub v7.8h, v7.8h, v20.8h\n\t" + "sub v8.8h, v8.8h, v20.8h\n\t" + "sub v9.8h, v9.8h, v20.8h\n\t" + "sub v10.8h, v10.8h, v20.8h\n\t" + "sub v11.8h, v11.8h, v20.8h\n\t" + "sub v12.8h, v12.8h, v20.8h\n\t" + "sub v13.8h, v13.8h, v20.8h\n\t" + "sub v14.8h, v14.8h, v20.8h\n\t" + "sub v15.8h, v15.8h, v20.8h\n\t" + "sshr v16.8h, v0.8h, #15\n\t" + "sshr v17.8h, v1.8h, #15\n\t" + "sshr v18.8h, v2.8h, #15\n\t" + "sshr v19.8h, v3.8h, #15\n\t" + "and v16.16b, v16.16b, v20.16b\n\t" + "and v17.16b, v17.16b, v20.16b\n\t" + "and v18.16b, v18.16b, v20.16b\n\t" + "and v19.16b, v19.16b, v20.16b\n\t" + "add v0.8h, v0.8h, v16.8h\n\t" + "add v1.8h, v1.8h, v17.8h\n\t" + "add v2.8h, v2.8h, v18.8h\n\t" + "add v3.8h, v3.8h, v19.8h\n\t" + "sshr v16.8h, v4.8h, #15\n\t" + "sshr v17.8h, v5.8h, #15\n\t" + "sshr v18.8h, v6.8h, #15\n\t" + "sshr v19.8h, v7.8h, #15\n\t" + "and v16.16b, v16.16b, v20.16b\n\t" + "and v17.16b, v17.16b, v20.16b\n\t" + "and v18.16b, v18.16b, v20.16b\n\t" + "and v19.16b, v19.16b, v20.16b\n\t" + "add v4.8h, v4.8h, v16.8h\n\t" + "add v5.8h, v5.8h, v17.8h\n\t" + "add v6.8h, v6.8h, v18.8h\n\t" + "add v7.8h, v7.8h, v19.8h\n\t" + "sshr v16.8h, v8.8h, #15\n\t" + "sshr v17.8h, v9.8h, #15\n\t" + "sshr v18.8h, v10.8h, #15\n\t" + "sshr v19.8h, v11.8h, #15\n\t" + "and v16.16b, v16.16b, v20.16b\n\t" + "and v17.16b, v17.16b, v20.16b\n\t" + "and v18.16b, v18.16b, v20.16b\n\t" + "and v19.16b, v19.16b, v20.16b\n\t" + "add v8.8h, v8.8h, v16.8h\n\t" + "add v9.8h, v9.8h, v17.8h\n\t" + "add v10.8h, v10.8h, v18.8h\n\t" + "add v11.8h, v11.8h, v19.8h\n\t" + "sshr v16.8h, v12.8h, #15\n\t" + "sshr v17.8h, v13.8h, #15\n\t" + "sshr v18.8h, v14.8h, #15\n\t" + "sshr v19.8h, v15.8h, #15\n\t" + "and v16.16b, v16.16b, v20.16b\n\t" + "and v17.16b, v17.16b, v20.16b\n\t" + "and v18.16b, v18.16b, v20.16b\n\t" + "and v19.16b, v19.16b, v20.16b\n\t" + "add v12.8h, v12.8h, v16.8h\n\t" + "add v13.8h, v13.8h, v17.8h\n\t" + "add v14.8h, v14.8h, v18.8h\n\t" + "add v15.8h, v15.8h, v19.8h\n\t" + "st4 {v0.8h, v1.8h, v2.8h, v3.8h}, [%x[p]], #0x40\n\t" + "st4 {v4.8h, v5.8h, v6.8h, v7.8h}, [%x[p]], #0x40\n\t" + "st4 {v8.8h, v9.8h, v10.8h, v11.8h}, [%x[p]], #0x40\n\t" + "st4 {v12.8h, v13.8h, v14.8h, v15.8h}, [%x[p]], #0x40\n\t" + : [p] "+r" (p) + : [L_kyber_aarch64_q] "S" (L_kyber_aarch64_q), [L_kyber_aarch64_consts] "S" (L_kyber_aarch64_consts), [L_kyber_aarch64_zetas] "S" (L_kyber_aarch64_zetas), [L_kyber_aarch64_zetas_qinv] "S" (L_kyber_aarch64_zetas_qinv), [L_kyber_aarch64_zetas_inv] "S" (L_kyber_aarch64_zetas_inv), [L_kyber_aarch64_zetas_inv_qinv] "S" (L_kyber_aarch64_zetas_inv_qinv), [L_kyber_aarch64_zetas_mul] "S" (L_kyber_aarch64_zetas_mul) + : "memory", "x1", "v0", "v1", "v2", "v3", "v4", "v5", 
"v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "cc" + ); +} + +void kyber_add_reduce(sword16* r, const sword16* a) +{ + __asm__ __volatile__ ( +#ifndef __APPLE__ + "adrp x2, %[L_kyber_aarch64_consts]\n\t" + "add x2, x2, :lo12:%[L_kyber_aarch64_consts]\n\t" +#else + "adrp x2, %[L_kyber_aarch64_consts]@PAGE\n\t" + "add x2, x2, %[L_kyber_aarch64_consts]@PAGEOFF\n\t" +#endif /* __APPLE__ */ + "ldr q0, [x2]\n\t" + "ld4 {v1.8h, v2.8h, v3.8h, v4.8h}, [%x[r]], #0x40\n\t" + "ld4 {v5.8h, v6.8h, v7.8h, v8.8h}, [%x[r]], #0x40\n\t" + "ld4 {v9.8h, v10.8h, v11.8h, v12.8h}, [%x[a]], #0x40\n\t" + "ld4 {v13.8h, v14.8h, v15.8h, v16.8h}, [%x[a]], #0x40\n\t" + "sub %x[r], %x[r], #0x80\n\t" + "add v1.8h, v1.8h, v9.8h\n\t" + "add v2.8h, v2.8h, v10.8h\n\t" + "add v3.8h, v3.8h, v11.8h\n\t" + "add v4.8h, v4.8h, v12.8h\n\t" + "add v5.8h, v5.8h, v13.8h\n\t" + "add v6.8h, v6.8h, v14.8h\n\t" + "add v7.8h, v7.8h, v15.8h\n\t" + "add v8.8h, v8.8h, v16.8h\n\t" + "sqdmulh v17.8h, v1.8h, v0.h[2]\n\t" + "sqdmulh v18.8h, v2.8h, v0.h[2]\n\t" + "sshr v17.8h, v17.8h, #11\n\t" + "sshr v18.8h, v18.8h, #11\n\t" + "mls v1.8h, v17.8h, v0.h[0]\n\t" + "mls v2.8h, v18.8h, v0.h[0]\n\t" + "sqdmulh v17.8h, v3.8h, v0.h[2]\n\t" + "sqdmulh v18.8h, v4.8h, v0.h[2]\n\t" + "sshr v17.8h, v17.8h, #11\n\t" + "sshr v18.8h, v18.8h, #11\n\t" + "mls v3.8h, v17.8h, v0.h[0]\n\t" + "mls v4.8h, v18.8h, v0.h[0]\n\t" + "sqdmulh v17.8h, v5.8h, v0.h[2]\n\t" + "sqdmulh v18.8h, v6.8h, v0.h[2]\n\t" + "sshr v17.8h, v17.8h, #11\n\t" + "sshr v18.8h, v18.8h, #11\n\t" + "mls v5.8h, v17.8h, v0.h[0]\n\t" + "mls v6.8h, v18.8h, v0.h[0]\n\t" + "sqdmulh v17.8h, v7.8h, v0.h[2]\n\t" + "sqdmulh v18.8h, v8.8h, v0.h[2]\n\t" + "sshr v17.8h, v17.8h, #11\n\t" + "sshr v18.8h, v18.8h, #11\n\t" + "mls v7.8h, v17.8h, v0.h[0]\n\t" + "mls v8.8h, v18.8h, v0.h[0]\n\t" + "st4 {v1.8h, v2.8h, v3.8h, v4.8h}, [%x[r]], #0x40\n\t" + "st4 {v5.8h, v6.8h, v7.8h, v8.8h}, [%x[r]], #0x40\n\t" + "ld4 {v1.8h, v2.8h, v3.8h, v4.8h}, [%x[r]], #0x40\n\t" + "ld4 {v5.8h, v6.8h, v7.8h, v8.8h}, [%x[r]], #0x40\n\t" + "ld4 {v9.8h, v10.8h, v11.8h, v12.8h}, [%x[a]], #0x40\n\t" + "ld4 {v13.8h, v14.8h, v15.8h, v16.8h}, [%x[a]], #0x40\n\t" + "sub %x[r], %x[r], #0x80\n\t" + "add v1.8h, v1.8h, v9.8h\n\t" + "add v2.8h, v2.8h, v10.8h\n\t" + "add v3.8h, v3.8h, v11.8h\n\t" + "add v4.8h, v4.8h, v12.8h\n\t" + "add v5.8h, v5.8h, v13.8h\n\t" + "add v6.8h, v6.8h, v14.8h\n\t" + "add v7.8h, v7.8h, v15.8h\n\t" + "add v8.8h, v8.8h, v16.8h\n\t" + "sqdmulh v17.8h, v1.8h, v0.h[2]\n\t" + "sqdmulh v18.8h, v2.8h, v0.h[2]\n\t" + "sshr v17.8h, v17.8h, #11\n\t" + "sshr v18.8h, v18.8h, #11\n\t" + "mls v1.8h, v17.8h, v0.h[0]\n\t" + "mls v2.8h, v18.8h, v0.h[0]\n\t" + "sqdmulh v17.8h, v3.8h, v0.h[2]\n\t" + "sqdmulh v18.8h, v4.8h, v0.h[2]\n\t" + "sshr v17.8h, v17.8h, #11\n\t" + "sshr v18.8h, v18.8h, #11\n\t" + "mls v3.8h, v17.8h, v0.h[0]\n\t" + "mls v4.8h, v18.8h, v0.h[0]\n\t" + "sqdmulh v17.8h, v5.8h, v0.h[2]\n\t" + "sqdmulh v18.8h, v6.8h, v0.h[2]\n\t" + "sshr v17.8h, v17.8h, #11\n\t" + "sshr v18.8h, v18.8h, #11\n\t" + "mls v5.8h, v17.8h, v0.h[0]\n\t" + "mls v6.8h, v18.8h, v0.h[0]\n\t" + "sqdmulh v17.8h, v7.8h, v0.h[2]\n\t" + "sqdmulh v18.8h, v8.8h, v0.h[2]\n\t" + "sshr v17.8h, v17.8h, #11\n\t" + "sshr v18.8h, v18.8h, #11\n\t" + "mls v7.8h, v17.8h, v0.h[0]\n\t" + "mls v8.8h, v18.8h, v0.h[0]\n\t" + "st4 {v1.8h, v2.8h, v3.8h, v4.8h}, [%x[r]], #0x40\n\t" + "st4 {v5.8h, v6.8h, v7.8h, v8.8h}, [%x[r]], #0x40\n\t" + "ld4 {v1.8h, v2.8h, v3.8h, v4.8h}, [%x[r]], #0x40\n\t" + "ld4 {v5.8h, v6.8h, v7.8h, 
v8.8h}, [%x[r]], #0x40\n\t" + "ld4 {v9.8h, v10.8h, v11.8h, v12.8h}, [%x[a]], #0x40\n\t" + "ld4 {v13.8h, v14.8h, v15.8h, v16.8h}, [%x[a]], #0x40\n\t" + "sub %x[r], %x[r], #0x80\n\t" + "add v1.8h, v1.8h, v9.8h\n\t" + "add v2.8h, v2.8h, v10.8h\n\t" + "add v3.8h, v3.8h, v11.8h\n\t" + "add v4.8h, v4.8h, v12.8h\n\t" + "add v5.8h, v5.8h, v13.8h\n\t" + "add v6.8h, v6.8h, v14.8h\n\t" + "add v7.8h, v7.8h, v15.8h\n\t" + "add v8.8h, v8.8h, v16.8h\n\t" + "sqdmulh v17.8h, v1.8h, v0.h[2]\n\t" + "sqdmulh v18.8h, v2.8h, v0.h[2]\n\t" + "sshr v17.8h, v17.8h, #11\n\t" + "sshr v18.8h, v18.8h, #11\n\t" + "mls v1.8h, v17.8h, v0.h[0]\n\t" + "mls v2.8h, v18.8h, v0.h[0]\n\t" + "sqdmulh v17.8h, v3.8h, v0.h[2]\n\t" + "sqdmulh v18.8h, v4.8h, v0.h[2]\n\t" + "sshr v17.8h, v17.8h, #11\n\t" + "sshr v18.8h, v18.8h, #11\n\t" + "mls v3.8h, v17.8h, v0.h[0]\n\t" + "mls v4.8h, v18.8h, v0.h[0]\n\t" + "sqdmulh v17.8h, v5.8h, v0.h[2]\n\t" + "sqdmulh v18.8h, v6.8h, v0.h[2]\n\t" + "sshr v17.8h, v17.8h, #11\n\t" + "sshr v18.8h, v18.8h, #11\n\t" + "mls v5.8h, v17.8h, v0.h[0]\n\t" + "mls v6.8h, v18.8h, v0.h[0]\n\t" + "sqdmulh v17.8h, v7.8h, v0.h[2]\n\t" + "sqdmulh v18.8h, v8.8h, v0.h[2]\n\t" + "sshr v17.8h, v17.8h, #11\n\t" + "sshr v18.8h, v18.8h, #11\n\t" + "mls v7.8h, v17.8h, v0.h[0]\n\t" + "mls v8.8h, v18.8h, v0.h[0]\n\t" + "st4 {v1.8h, v2.8h, v3.8h, v4.8h}, [%x[r]], #0x40\n\t" + "st4 {v5.8h, v6.8h, v7.8h, v8.8h}, [%x[r]], #0x40\n\t" + "ld4 {v1.8h, v2.8h, v3.8h, v4.8h}, [%x[r]], #0x40\n\t" + "ld4 {v5.8h, v6.8h, v7.8h, v8.8h}, [%x[r]], #0x40\n\t" + "ld4 {v9.8h, v10.8h, v11.8h, v12.8h}, [%x[a]], #0x40\n\t" + "ld4 {v13.8h, v14.8h, v15.8h, v16.8h}, [%x[a]], #0x40\n\t" + "sub %x[r], %x[r], #0x80\n\t" + "add v1.8h, v1.8h, v9.8h\n\t" + "add v2.8h, v2.8h, v10.8h\n\t" + "add v3.8h, v3.8h, v11.8h\n\t" + "add v4.8h, v4.8h, v12.8h\n\t" + "add v5.8h, v5.8h, v13.8h\n\t" + "add v6.8h, v6.8h, v14.8h\n\t" + "add v7.8h, v7.8h, v15.8h\n\t" + "add v8.8h, v8.8h, v16.8h\n\t" + "sqdmulh v17.8h, v1.8h, v0.h[2]\n\t" + "sqdmulh v18.8h, v2.8h, v0.h[2]\n\t" + "sshr v17.8h, v17.8h, #11\n\t" + "sshr v18.8h, v18.8h, #11\n\t" + "mls v1.8h, v17.8h, v0.h[0]\n\t" + "mls v2.8h, v18.8h, v0.h[0]\n\t" + "sqdmulh v17.8h, v3.8h, v0.h[2]\n\t" + "sqdmulh v18.8h, v4.8h, v0.h[2]\n\t" + "sshr v17.8h, v17.8h, #11\n\t" + "sshr v18.8h, v18.8h, #11\n\t" + "mls v3.8h, v17.8h, v0.h[0]\n\t" + "mls v4.8h, v18.8h, v0.h[0]\n\t" + "sqdmulh v17.8h, v5.8h, v0.h[2]\n\t" + "sqdmulh v18.8h, v6.8h, v0.h[2]\n\t" + "sshr v17.8h, v17.8h, #11\n\t" + "sshr v18.8h, v18.8h, #11\n\t" + "mls v5.8h, v17.8h, v0.h[0]\n\t" + "mls v6.8h, v18.8h, v0.h[0]\n\t" + "sqdmulh v17.8h, v7.8h, v0.h[2]\n\t" + "sqdmulh v18.8h, v8.8h, v0.h[2]\n\t" + "sshr v17.8h, v17.8h, #11\n\t" + "sshr v18.8h, v18.8h, #11\n\t" + "mls v7.8h, v17.8h, v0.h[0]\n\t" + "mls v8.8h, v18.8h, v0.h[0]\n\t" + "st4 {v1.8h, v2.8h, v3.8h, v4.8h}, [%x[r]], #0x40\n\t" + "st4 {v5.8h, v6.8h, v7.8h, v8.8h}, [%x[r]], #0x40\n\t" + : [r] "+r" (r), [a] "+r" (a) + : [L_kyber_aarch64_q] "S" (L_kyber_aarch64_q), [L_kyber_aarch64_consts] "S" (L_kyber_aarch64_consts), [L_kyber_aarch64_zetas] "S" (L_kyber_aarch64_zetas), [L_kyber_aarch64_zetas_qinv] "S" (L_kyber_aarch64_zetas_qinv), [L_kyber_aarch64_zetas_inv] "S" (L_kyber_aarch64_zetas_inv), [L_kyber_aarch64_zetas_inv_qinv] "S" (L_kyber_aarch64_zetas_inv_qinv), [L_kyber_aarch64_zetas_mul] "S" (L_kyber_aarch64_zetas_mul) + : "memory", "x2", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "cc" + ); +} + +void kyber_add3_reduce(sword16* r, 
const sword16* a, const sword16* b) +{ + __asm__ __volatile__ ( +#ifndef __APPLE__ + "adrp x3, %[L_kyber_aarch64_consts]\n\t" + "add x3, x3, :lo12:%[L_kyber_aarch64_consts]\n\t" +#else + "adrp x3, %[L_kyber_aarch64_consts]@PAGE\n\t" + "add x3, x3, %[L_kyber_aarch64_consts]@PAGEOFF\n\t" +#endif /* __APPLE__ */ + "ldr q0, [x3]\n\t" + "ld4 {v1.8h, v2.8h, v3.8h, v4.8h}, [%x[r]], #0x40\n\t" + "ld4 {v5.8h, v6.8h, v7.8h, v8.8h}, [%x[r]], #0x40\n\t" + "ld4 {v9.8h, v10.8h, v11.8h, v12.8h}, [%x[a]], #0x40\n\t" + "ld4 {v13.8h, v14.8h, v15.8h, v16.8h}, [%x[a]], #0x40\n\t" + "ld4 {v17.8h, v18.8h, v19.8h, v20.8h}, [%x[b]], #0x40\n\t" + "ld4 {v21.8h, v22.8h, v23.8h, v24.8h}, [%x[b]], #0x40\n\t" + "sub %x[r], %x[r], #0x80\n\t" + "add v1.8h, v1.8h, v9.8h\n\t" + "add v2.8h, v2.8h, v10.8h\n\t" + "add v3.8h, v3.8h, v11.8h\n\t" + "add v4.8h, v4.8h, v12.8h\n\t" + "add v5.8h, v5.8h, v13.8h\n\t" + "add v6.8h, v6.8h, v14.8h\n\t" + "add v7.8h, v7.8h, v15.8h\n\t" + "add v8.8h, v8.8h, v16.8h\n\t" + "add v1.8h, v1.8h, v17.8h\n\t" + "add v2.8h, v2.8h, v18.8h\n\t" + "add v3.8h, v3.8h, v19.8h\n\t" + "add v4.8h, v4.8h, v20.8h\n\t" + "add v5.8h, v5.8h, v21.8h\n\t" + "add v6.8h, v6.8h, v22.8h\n\t" + "add v7.8h, v7.8h, v23.8h\n\t" + "add v8.8h, v8.8h, v24.8h\n\t" + "sqdmulh v25.8h, v1.8h, v0.h[2]\n\t" + "sqdmulh v26.8h, v2.8h, v0.h[2]\n\t" + "sshr v25.8h, v25.8h, #11\n\t" + "sshr v26.8h, v26.8h, #11\n\t" + "mls v1.8h, v25.8h, v0.h[0]\n\t" + "mls v2.8h, v26.8h, v0.h[0]\n\t" + "sqdmulh v25.8h, v3.8h, v0.h[2]\n\t" + "sqdmulh v26.8h, v4.8h, v0.h[2]\n\t" + "sshr v25.8h, v25.8h, #11\n\t" + "sshr v26.8h, v26.8h, #11\n\t" + "mls v3.8h, v25.8h, v0.h[0]\n\t" + "mls v4.8h, v26.8h, v0.h[0]\n\t" + "sqdmulh v25.8h, v5.8h, v0.h[2]\n\t" + "sqdmulh v26.8h, v6.8h, v0.h[2]\n\t" + "sshr v25.8h, v25.8h, #11\n\t" + "sshr v26.8h, v26.8h, #11\n\t" + "mls v5.8h, v25.8h, v0.h[0]\n\t" + "mls v6.8h, v26.8h, v0.h[0]\n\t" + "sqdmulh v25.8h, v7.8h, v0.h[2]\n\t" + "sqdmulh v26.8h, v8.8h, v0.h[2]\n\t" + "sshr v25.8h, v25.8h, #11\n\t" + "sshr v26.8h, v26.8h, #11\n\t" + "mls v7.8h, v25.8h, v0.h[0]\n\t" + "mls v8.8h, v26.8h, v0.h[0]\n\t" + "st4 {v1.8h, v2.8h, v3.8h, v4.8h}, [%x[r]], #0x40\n\t" + "st4 {v5.8h, v6.8h, v7.8h, v8.8h}, [%x[r]], #0x40\n\t" + "ld4 {v1.8h, v2.8h, v3.8h, v4.8h}, [%x[r]], #0x40\n\t" + "ld4 {v5.8h, v6.8h, v7.8h, v8.8h}, [%x[r]], #0x40\n\t" + "ld4 {v9.8h, v10.8h, v11.8h, v12.8h}, [%x[a]], #0x40\n\t" + "ld4 {v13.8h, v14.8h, v15.8h, v16.8h}, [%x[a]], #0x40\n\t" + "ld4 {v17.8h, v18.8h, v19.8h, v20.8h}, [%x[b]], #0x40\n\t" + "ld4 {v21.8h, v22.8h, v23.8h, v24.8h}, [%x[b]], #0x40\n\t" + "sub %x[r], %x[r], #0x80\n\t" + "add v1.8h, v1.8h, v9.8h\n\t" + "add v2.8h, v2.8h, v10.8h\n\t" + "add v3.8h, v3.8h, v11.8h\n\t" + "add v4.8h, v4.8h, v12.8h\n\t" + "add v5.8h, v5.8h, v13.8h\n\t" + "add v6.8h, v6.8h, v14.8h\n\t" + "add v7.8h, v7.8h, v15.8h\n\t" + "add v8.8h, v8.8h, v16.8h\n\t" + "add v1.8h, v1.8h, v17.8h\n\t" + "add v2.8h, v2.8h, v18.8h\n\t" + "add v3.8h, v3.8h, v19.8h\n\t" + "add v4.8h, v4.8h, v20.8h\n\t" + "add v5.8h, v5.8h, v21.8h\n\t" + "add v6.8h, v6.8h, v22.8h\n\t" + "add v7.8h, v7.8h, v23.8h\n\t" + "add v8.8h, v8.8h, v24.8h\n\t" + "sqdmulh v25.8h, v1.8h, v0.h[2]\n\t" + "sqdmulh v26.8h, v2.8h, v0.h[2]\n\t" + "sshr v25.8h, v25.8h, #11\n\t" + "sshr v26.8h, v26.8h, #11\n\t" + "mls v1.8h, v25.8h, v0.h[0]\n\t" + "mls v2.8h, v26.8h, v0.h[0]\n\t" + "sqdmulh v25.8h, v3.8h, v0.h[2]\n\t" + "sqdmulh v26.8h, v4.8h, v0.h[2]\n\t" + "sshr v25.8h, v25.8h, #11\n\t" + "sshr v26.8h, v26.8h, #11\n\t" + "mls v3.8h, v25.8h, v0.h[0]\n\t" + "mls v4.8h, v26.8h, 
v0.h[0]\n\t" + "sqdmulh v25.8h, v5.8h, v0.h[2]\n\t" + "sqdmulh v26.8h, v6.8h, v0.h[2]\n\t" + "sshr v25.8h, v25.8h, #11\n\t" + "sshr v26.8h, v26.8h, #11\n\t" + "mls v5.8h, v25.8h, v0.h[0]\n\t" + "mls v6.8h, v26.8h, v0.h[0]\n\t" + "sqdmulh v25.8h, v7.8h, v0.h[2]\n\t" + "sqdmulh v26.8h, v8.8h, v0.h[2]\n\t" + "sshr v25.8h, v25.8h, #11\n\t" + "sshr v26.8h, v26.8h, #11\n\t" + "mls v7.8h, v25.8h, v0.h[0]\n\t" + "mls v8.8h, v26.8h, v0.h[0]\n\t" + "st4 {v1.8h, v2.8h, v3.8h, v4.8h}, [%x[r]], #0x40\n\t" + "st4 {v5.8h, v6.8h, v7.8h, v8.8h}, [%x[r]], #0x40\n\t" + "ld4 {v1.8h, v2.8h, v3.8h, v4.8h}, [%x[r]], #0x40\n\t" + "ld4 {v5.8h, v6.8h, v7.8h, v8.8h}, [%x[r]], #0x40\n\t" + "ld4 {v9.8h, v10.8h, v11.8h, v12.8h}, [%x[a]], #0x40\n\t" + "ld4 {v13.8h, v14.8h, v15.8h, v16.8h}, [%x[a]], #0x40\n\t" + "ld4 {v17.8h, v18.8h, v19.8h, v20.8h}, [%x[b]], #0x40\n\t" + "ld4 {v21.8h, v22.8h, v23.8h, v24.8h}, [%x[b]], #0x40\n\t" + "sub %x[r], %x[r], #0x80\n\t" + "add v1.8h, v1.8h, v9.8h\n\t" + "add v2.8h, v2.8h, v10.8h\n\t" + "add v3.8h, v3.8h, v11.8h\n\t" + "add v4.8h, v4.8h, v12.8h\n\t" + "add v5.8h, v5.8h, v13.8h\n\t" + "add v6.8h, v6.8h, v14.8h\n\t" + "add v7.8h, v7.8h, v15.8h\n\t" + "add v8.8h, v8.8h, v16.8h\n\t" + "add v1.8h, v1.8h, v17.8h\n\t" + "add v2.8h, v2.8h, v18.8h\n\t" + "add v3.8h, v3.8h, v19.8h\n\t" + "add v4.8h, v4.8h, v20.8h\n\t" + "add v5.8h, v5.8h, v21.8h\n\t" + "add v6.8h, v6.8h, v22.8h\n\t" + "add v7.8h, v7.8h, v23.8h\n\t" + "add v8.8h, v8.8h, v24.8h\n\t" + "sqdmulh v25.8h, v1.8h, v0.h[2]\n\t" + "sqdmulh v26.8h, v2.8h, v0.h[2]\n\t" + "sshr v25.8h, v25.8h, #11\n\t" + "sshr v26.8h, v26.8h, #11\n\t" + "mls v1.8h, v25.8h, v0.h[0]\n\t" + "mls v2.8h, v26.8h, v0.h[0]\n\t" + "sqdmulh v25.8h, v3.8h, v0.h[2]\n\t" + "sqdmulh v26.8h, v4.8h, v0.h[2]\n\t" + "sshr v25.8h, v25.8h, #11\n\t" + "sshr v26.8h, v26.8h, #11\n\t" + "mls v3.8h, v25.8h, v0.h[0]\n\t" + "mls v4.8h, v26.8h, v0.h[0]\n\t" + "sqdmulh v25.8h, v5.8h, v0.h[2]\n\t" + "sqdmulh v26.8h, v6.8h, v0.h[2]\n\t" + "sshr v25.8h, v25.8h, #11\n\t" + "sshr v26.8h, v26.8h, #11\n\t" + "mls v5.8h, v25.8h, v0.h[0]\n\t" + "mls v6.8h, v26.8h, v0.h[0]\n\t" + "sqdmulh v25.8h, v7.8h, v0.h[2]\n\t" + "sqdmulh v26.8h, v8.8h, v0.h[2]\n\t" + "sshr v25.8h, v25.8h, #11\n\t" + "sshr v26.8h, v26.8h, #11\n\t" + "mls v7.8h, v25.8h, v0.h[0]\n\t" + "mls v8.8h, v26.8h, v0.h[0]\n\t" + "st4 {v1.8h, v2.8h, v3.8h, v4.8h}, [%x[r]], #0x40\n\t" + "st4 {v5.8h, v6.8h, v7.8h, v8.8h}, [%x[r]], #0x40\n\t" + "ld4 {v1.8h, v2.8h, v3.8h, v4.8h}, [%x[r]], #0x40\n\t" + "ld4 {v5.8h, v6.8h, v7.8h, v8.8h}, [%x[r]], #0x40\n\t" + "ld4 {v9.8h, v10.8h, v11.8h, v12.8h}, [%x[a]], #0x40\n\t" + "ld4 {v13.8h, v14.8h, v15.8h, v16.8h}, [%x[a]], #0x40\n\t" + "ld4 {v17.8h, v18.8h, v19.8h, v20.8h}, [%x[b]], #0x40\n\t" + "ld4 {v21.8h, v22.8h, v23.8h, v24.8h}, [%x[b]], #0x40\n\t" + "sub %x[r], %x[r], #0x80\n\t" + "add v1.8h, v1.8h, v9.8h\n\t" + "add v2.8h, v2.8h, v10.8h\n\t" + "add v3.8h, v3.8h, v11.8h\n\t" + "add v4.8h, v4.8h, v12.8h\n\t" + "add v5.8h, v5.8h, v13.8h\n\t" + "add v6.8h, v6.8h, v14.8h\n\t" + "add v7.8h, v7.8h, v15.8h\n\t" + "add v8.8h, v8.8h, v16.8h\n\t" + "add v1.8h, v1.8h, v17.8h\n\t" + "add v2.8h, v2.8h, v18.8h\n\t" + "add v3.8h, v3.8h, v19.8h\n\t" + "add v4.8h, v4.8h, v20.8h\n\t" + "add v5.8h, v5.8h, v21.8h\n\t" + "add v6.8h, v6.8h, v22.8h\n\t" + "add v7.8h, v7.8h, v23.8h\n\t" + "add v8.8h, v8.8h, v24.8h\n\t" + "sqdmulh v25.8h, v1.8h, v0.h[2]\n\t" + "sqdmulh v26.8h, v2.8h, v0.h[2]\n\t" + "sshr v25.8h, v25.8h, #11\n\t" + "sshr v26.8h, v26.8h, #11\n\t" + "mls v1.8h, v25.8h, v0.h[0]\n\t" + "mls v2.8h, 
v26.8h, v0.h[0]\n\t" + "sqdmulh v25.8h, v3.8h, v0.h[2]\n\t" + "sqdmulh v26.8h, v4.8h, v0.h[2]\n\t" + "sshr v25.8h, v25.8h, #11\n\t" + "sshr v26.8h, v26.8h, #11\n\t" + "mls v3.8h, v25.8h, v0.h[0]\n\t" + "mls v4.8h, v26.8h, v0.h[0]\n\t" + "sqdmulh v25.8h, v5.8h, v0.h[2]\n\t" + "sqdmulh v26.8h, v6.8h, v0.h[2]\n\t" + "sshr v25.8h, v25.8h, #11\n\t" + "sshr v26.8h, v26.8h, #11\n\t" + "mls v5.8h, v25.8h, v0.h[0]\n\t" + "mls v6.8h, v26.8h, v0.h[0]\n\t" + "sqdmulh v25.8h, v7.8h, v0.h[2]\n\t" + "sqdmulh v26.8h, v8.8h, v0.h[2]\n\t" + "sshr v25.8h, v25.8h, #11\n\t" + "sshr v26.8h, v26.8h, #11\n\t" + "mls v7.8h, v25.8h, v0.h[0]\n\t" + "mls v8.8h, v26.8h, v0.h[0]\n\t" + "st4 {v1.8h, v2.8h, v3.8h, v4.8h}, [%x[r]], #0x40\n\t" + "st4 {v5.8h, v6.8h, v7.8h, v8.8h}, [%x[r]], #0x40\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + : [L_kyber_aarch64_q] "S" (L_kyber_aarch64_q), [L_kyber_aarch64_consts] "S" (L_kyber_aarch64_consts), [L_kyber_aarch64_zetas] "S" (L_kyber_aarch64_zetas), [L_kyber_aarch64_zetas_qinv] "S" (L_kyber_aarch64_zetas_qinv), [L_kyber_aarch64_zetas_inv] "S" (L_kyber_aarch64_zetas_inv), [L_kyber_aarch64_zetas_inv_qinv] "S" (L_kyber_aarch64_zetas_inv_qinv), [L_kyber_aarch64_zetas_mul] "S" (L_kyber_aarch64_zetas_mul) + : "memory", "x3", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "cc" + ); +} + +void kyber_rsub_reduce(sword16* r, const sword16* a) +{ + __asm__ __volatile__ ( +#ifndef __APPLE__ + "adrp x2, %[L_kyber_aarch64_consts]\n\t" + "add x2, x2, :lo12:%[L_kyber_aarch64_consts]\n\t" +#else + "adrp x2, %[L_kyber_aarch64_consts]@PAGE\n\t" + "add x2, x2, %[L_kyber_aarch64_consts]@PAGEOFF\n\t" +#endif /* __APPLE__ */ + "ldr q0, [x2]\n\t" + "ld4 {v1.8h, v2.8h, v3.8h, v4.8h}, [%x[r]], #0x40\n\t" + "ld4 {v5.8h, v6.8h, v7.8h, v8.8h}, [%x[r]], #0x40\n\t" + "ld4 {v9.8h, v10.8h, v11.8h, v12.8h}, [%x[a]], #0x40\n\t" + "ld4 {v13.8h, v14.8h, v15.8h, v16.8h}, [%x[a]], #0x40\n\t" + "sub %x[r], %x[r], #0x80\n\t" + "sub v1.8h, v9.8h, v1.8h\n\t" + "sub v2.8h, v10.8h, v2.8h\n\t" + "sub v3.8h, v11.8h, v3.8h\n\t" + "sub v4.8h, v12.8h, v4.8h\n\t" + "sub v5.8h, v13.8h, v5.8h\n\t" + "sub v6.8h, v14.8h, v6.8h\n\t" + "sub v7.8h, v15.8h, v7.8h\n\t" + "sub v8.8h, v16.8h, v8.8h\n\t" + "sqdmulh v17.8h, v1.8h, v0.h[2]\n\t" + "sqdmulh v18.8h, v2.8h, v0.h[2]\n\t" + "sshr v17.8h, v17.8h, #11\n\t" + "sshr v18.8h, v18.8h, #11\n\t" + "mls v1.8h, v17.8h, v0.h[0]\n\t" + "mls v2.8h, v18.8h, v0.h[0]\n\t" + "sqdmulh v17.8h, v3.8h, v0.h[2]\n\t" + "sqdmulh v18.8h, v4.8h, v0.h[2]\n\t" + "sshr v17.8h, v17.8h, #11\n\t" + "sshr v18.8h, v18.8h, #11\n\t" + "mls v3.8h, v17.8h, v0.h[0]\n\t" + "mls v4.8h, v18.8h, v0.h[0]\n\t" + "sqdmulh v17.8h, v5.8h, v0.h[2]\n\t" + "sqdmulh v18.8h, v6.8h, v0.h[2]\n\t" + "sshr v17.8h, v17.8h, #11\n\t" + "sshr v18.8h, v18.8h, #11\n\t" + "mls v5.8h, v17.8h, v0.h[0]\n\t" + "mls v6.8h, v18.8h, v0.h[0]\n\t" + "sqdmulh v17.8h, v7.8h, v0.h[2]\n\t" + "sqdmulh v18.8h, v8.8h, v0.h[2]\n\t" + "sshr v17.8h, v17.8h, #11\n\t" + "sshr v18.8h, v18.8h, #11\n\t" + "mls v7.8h, v17.8h, v0.h[0]\n\t" + "mls v8.8h, v18.8h, v0.h[0]\n\t" + "st4 {v1.8h, v2.8h, v3.8h, v4.8h}, [%x[r]], #0x40\n\t" + "st4 {v5.8h, v6.8h, v7.8h, v8.8h}, [%x[r]], #0x40\n\t" + "ld4 {v1.8h, v2.8h, v3.8h, v4.8h}, [%x[r]], #0x40\n\t" + "ld4 {v5.8h, v6.8h, v7.8h, v8.8h}, [%x[r]], #0x40\n\t" + "ld4 {v9.8h, v10.8h, v11.8h, v12.8h}, [%x[a]], #0x40\n\t" + "ld4 {v13.8h, v14.8h, v15.8h, v16.8h}, [%x[a]], #0x40\n\t" + "sub %x[r], %x[r], 
#0x80\n\t" + "sub v1.8h, v9.8h, v1.8h\n\t" + "sub v2.8h, v10.8h, v2.8h\n\t" + "sub v3.8h, v11.8h, v3.8h\n\t" + "sub v4.8h, v12.8h, v4.8h\n\t" + "sub v5.8h, v13.8h, v5.8h\n\t" + "sub v6.8h, v14.8h, v6.8h\n\t" + "sub v7.8h, v15.8h, v7.8h\n\t" + "sub v8.8h, v16.8h, v8.8h\n\t" + "sqdmulh v17.8h, v1.8h, v0.h[2]\n\t" + "sqdmulh v18.8h, v2.8h, v0.h[2]\n\t" + "sshr v17.8h, v17.8h, #11\n\t" + "sshr v18.8h, v18.8h, #11\n\t" + "mls v1.8h, v17.8h, v0.h[0]\n\t" + "mls v2.8h, v18.8h, v0.h[0]\n\t" + "sqdmulh v17.8h, v3.8h, v0.h[2]\n\t" + "sqdmulh v18.8h, v4.8h, v0.h[2]\n\t" + "sshr v17.8h, v17.8h, #11\n\t" + "sshr v18.8h, v18.8h, #11\n\t" + "mls v3.8h, v17.8h, v0.h[0]\n\t" + "mls v4.8h, v18.8h, v0.h[0]\n\t" + "sqdmulh v17.8h, v5.8h, v0.h[2]\n\t" + "sqdmulh v18.8h, v6.8h, v0.h[2]\n\t" + "sshr v17.8h, v17.8h, #11\n\t" + "sshr v18.8h, v18.8h, #11\n\t" + "mls v5.8h, v17.8h, v0.h[0]\n\t" + "mls v6.8h, v18.8h, v0.h[0]\n\t" + "sqdmulh v17.8h, v7.8h, v0.h[2]\n\t" + "sqdmulh v18.8h, v8.8h, v0.h[2]\n\t" + "sshr v17.8h, v17.8h, #11\n\t" + "sshr v18.8h, v18.8h, #11\n\t" + "mls v7.8h, v17.8h, v0.h[0]\n\t" + "mls v8.8h, v18.8h, v0.h[0]\n\t" + "st4 {v1.8h, v2.8h, v3.8h, v4.8h}, [%x[r]], #0x40\n\t" + "st4 {v5.8h, v6.8h, v7.8h, v8.8h}, [%x[r]], #0x40\n\t" + "ld4 {v1.8h, v2.8h, v3.8h, v4.8h}, [%x[r]], #0x40\n\t" + "ld4 {v5.8h, v6.8h, v7.8h, v8.8h}, [%x[r]], #0x40\n\t" + "ld4 {v9.8h, v10.8h, v11.8h, v12.8h}, [%x[a]], #0x40\n\t" + "ld4 {v13.8h, v14.8h, v15.8h, v16.8h}, [%x[a]], #0x40\n\t" + "sub %x[r], %x[r], #0x80\n\t" + "sub v1.8h, v9.8h, v1.8h\n\t" + "sub v2.8h, v10.8h, v2.8h\n\t" + "sub v3.8h, v11.8h, v3.8h\n\t" + "sub v4.8h, v12.8h, v4.8h\n\t" + "sub v5.8h, v13.8h, v5.8h\n\t" + "sub v6.8h, v14.8h, v6.8h\n\t" + "sub v7.8h, v15.8h, v7.8h\n\t" + "sub v8.8h, v16.8h, v8.8h\n\t" + "sqdmulh v17.8h, v1.8h, v0.h[2]\n\t" + "sqdmulh v18.8h, v2.8h, v0.h[2]\n\t" + "sshr v17.8h, v17.8h, #11\n\t" + "sshr v18.8h, v18.8h, #11\n\t" + "mls v1.8h, v17.8h, v0.h[0]\n\t" + "mls v2.8h, v18.8h, v0.h[0]\n\t" + "sqdmulh v17.8h, v3.8h, v0.h[2]\n\t" + "sqdmulh v18.8h, v4.8h, v0.h[2]\n\t" + "sshr v17.8h, v17.8h, #11\n\t" + "sshr v18.8h, v18.8h, #11\n\t" + "mls v3.8h, v17.8h, v0.h[0]\n\t" + "mls v4.8h, v18.8h, v0.h[0]\n\t" + "sqdmulh v17.8h, v5.8h, v0.h[2]\n\t" + "sqdmulh v18.8h, v6.8h, v0.h[2]\n\t" + "sshr v17.8h, v17.8h, #11\n\t" + "sshr v18.8h, v18.8h, #11\n\t" + "mls v5.8h, v17.8h, v0.h[0]\n\t" + "mls v6.8h, v18.8h, v0.h[0]\n\t" + "sqdmulh v17.8h, v7.8h, v0.h[2]\n\t" + "sqdmulh v18.8h, v8.8h, v0.h[2]\n\t" + "sshr v17.8h, v17.8h, #11\n\t" + "sshr v18.8h, v18.8h, #11\n\t" + "mls v7.8h, v17.8h, v0.h[0]\n\t" + "mls v8.8h, v18.8h, v0.h[0]\n\t" + "st4 {v1.8h, v2.8h, v3.8h, v4.8h}, [%x[r]], #0x40\n\t" + "st4 {v5.8h, v6.8h, v7.8h, v8.8h}, [%x[r]], #0x40\n\t" + "ld4 {v1.8h, v2.8h, v3.8h, v4.8h}, [%x[r]], #0x40\n\t" + "ld4 {v5.8h, v6.8h, v7.8h, v8.8h}, [%x[r]], #0x40\n\t" + "ld4 {v9.8h, v10.8h, v11.8h, v12.8h}, [%x[a]], #0x40\n\t" + "ld4 {v13.8h, v14.8h, v15.8h, v16.8h}, [%x[a]], #0x40\n\t" + "sub %x[r], %x[r], #0x80\n\t" + "sub v1.8h, v9.8h, v1.8h\n\t" + "sub v2.8h, v10.8h, v2.8h\n\t" + "sub v3.8h, v11.8h, v3.8h\n\t" + "sub v4.8h, v12.8h, v4.8h\n\t" + "sub v5.8h, v13.8h, v5.8h\n\t" + "sub v6.8h, v14.8h, v6.8h\n\t" + "sub v7.8h, v15.8h, v7.8h\n\t" + "sub v8.8h, v16.8h, v8.8h\n\t" + "sqdmulh v17.8h, v1.8h, v0.h[2]\n\t" + "sqdmulh v18.8h, v2.8h, v0.h[2]\n\t" + "sshr v17.8h, v17.8h, #11\n\t" + "sshr v18.8h, v18.8h, #11\n\t" + "mls v1.8h, v17.8h, v0.h[0]\n\t" + "mls v2.8h, v18.8h, v0.h[0]\n\t" + "sqdmulh v17.8h, v3.8h, v0.h[2]\n\t" + "sqdmulh v18.8h, v4.8h, 
v0.h[2]\n\t" + "sshr v17.8h, v17.8h, #11\n\t" + "sshr v18.8h, v18.8h, #11\n\t" + "mls v3.8h, v17.8h, v0.h[0]\n\t" + "mls v4.8h, v18.8h, v0.h[0]\n\t" + "sqdmulh v17.8h, v5.8h, v0.h[2]\n\t" + "sqdmulh v18.8h, v6.8h, v0.h[2]\n\t" + "sshr v17.8h, v17.8h, #11\n\t" + "sshr v18.8h, v18.8h, #11\n\t" + "mls v5.8h, v17.8h, v0.h[0]\n\t" + "mls v6.8h, v18.8h, v0.h[0]\n\t" + "sqdmulh v17.8h, v7.8h, v0.h[2]\n\t" + "sqdmulh v18.8h, v8.8h, v0.h[2]\n\t" + "sshr v17.8h, v17.8h, #11\n\t" + "sshr v18.8h, v18.8h, #11\n\t" + "mls v7.8h, v17.8h, v0.h[0]\n\t" + "mls v8.8h, v18.8h, v0.h[0]\n\t" + "st4 {v1.8h, v2.8h, v3.8h, v4.8h}, [%x[r]], #0x40\n\t" + "st4 {v5.8h, v6.8h, v7.8h, v8.8h}, [%x[r]], #0x40\n\t" + : [r] "+r" (r), [a] "+r" (a) + : [L_kyber_aarch64_q] "S" (L_kyber_aarch64_q), [L_kyber_aarch64_consts] "S" (L_kyber_aarch64_consts), [L_kyber_aarch64_zetas] "S" (L_kyber_aarch64_zetas), [L_kyber_aarch64_zetas_qinv] "S" (L_kyber_aarch64_zetas_qinv), [L_kyber_aarch64_zetas_inv] "S" (L_kyber_aarch64_zetas_inv), [L_kyber_aarch64_zetas_inv_qinv] "S" (L_kyber_aarch64_zetas_inv_qinv), [L_kyber_aarch64_zetas_mul] "S" (L_kyber_aarch64_zetas_mul) + : "memory", "x2", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "cc" + ); +} + +void kyber_to_mont(sword16* p) +{ + __asm__ __volatile__ ( +#ifndef __APPLE__ + "adrp x1, %[L_kyber_aarch64_consts]\n\t" + "add x1, x1, :lo12:%[L_kyber_aarch64_consts]\n\t" +#else + "adrp x1, %[L_kyber_aarch64_consts]@PAGE\n\t" + "add x1, x1, %[L_kyber_aarch64_consts]@PAGEOFF\n\t" +#endif /* __APPLE__ */ + "ldr q0, [x1]\n\t" + "ld4 {v1.8h, v2.8h, v3.8h, v4.8h}, [%x[p]], #0x40\n\t" + "ld4 {v5.8h, v6.8h, v7.8h, v8.8h}, [%x[p]], #0x40\n\t" + "ld4 {v9.8h, v10.8h, v11.8h, v12.8h}, [%x[p]], #0x40\n\t" + "ld4 {v13.8h, v14.8h, v15.8h, v16.8h}, [%x[p]], #0x40\n\t" + "sub %x[p], %x[p], #0x100\n\t" + "mul v17.8h, v1.8h, v0.h[4]\n\t" + "mul v18.8h, v2.8h, v0.h[4]\n\t" + "sqrdmulh v1.8h, v1.8h, v0.h[3]\n\t" + "sqrdmulh v2.8h, v2.8h, v0.h[3]\n\t" + "sqrdmlsh v1.8h, v17.8h, v0.h[0]\n\t" + "sqrdmlsh v2.8h, v18.8h, v0.h[0]\n\t" + "sshr v1.8h, v1.8h, #1\n\t" + "sshr v2.8h, v2.8h, #1\n\t" + "mul v17.8h, v3.8h, v0.h[4]\n\t" + "mul v18.8h, v4.8h, v0.h[4]\n\t" + "sqrdmulh v3.8h, v3.8h, v0.h[3]\n\t" + "sqrdmulh v4.8h, v4.8h, v0.h[3]\n\t" + "sqrdmlsh v3.8h, v17.8h, v0.h[0]\n\t" + "sqrdmlsh v4.8h, v18.8h, v0.h[0]\n\t" + "sshr v3.8h, v3.8h, #1\n\t" + "sshr v4.8h, v4.8h, #1\n\t" + "mul v17.8h, v5.8h, v0.h[4]\n\t" + "mul v18.8h, v6.8h, v0.h[4]\n\t" + "sqrdmulh v5.8h, v5.8h, v0.h[3]\n\t" + "sqrdmulh v6.8h, v6.8h, v0.h[3]\n\t" + "sqrdmlsh v5.8h, v17.8h, v0.h[0]\n\t" + "sqrdmlsh v6.8h, v18.8h, v0.h[0]\n\t" + "sshr v5.8h, v5.8h, #1\n\t" + "sshr v6.8h, v6.8h, #1\n\t" + "mul v17.8h, v7.8h, v0.h[4]\n\t" + "mul v18.8h, v8.8h, v0.h[4]\n\t" + "sqrdmulh v7.8h, v7.8h, v0.h[3]\n\t" + "sqrdmulh v8.8h, v8.8h, v0.h[3]\n\t" + "sqrdmlsh v7.8h, v17.8h, v0.h[0]\n\t" + "sqrdmlsh v8.8h, v18.8h, v0.h[0]\n\t" + "sshr v7.8h, v7.8h, #1\n\t" + "sshr v8.8h, v8.8h, #1\n\t" + "mul v17.8h, v9.8h, v0.h[4]\n\t" + "mul v18.8h, v10.8h, v0.h[4]\n\t" + "sqrdmulh v9.8h, v9.8h, v0.h[3]\n\t" + "sqrdmulh v10.8h, v10.8h, v0.h[3]\n\t" + "sqrdmlsh v9.8h, v17.8h, v0.h[0]\n\t" + "sqrdmlsh v10.8h, v18.8h, v0.h[0]\n\t" + "sshr v9.8h, v9.8h, #1\n\t" + "sshr v10.8h, v10.8h, #1\n\t" + "mul v17.8h, v11.8h, v0.h[4]\n\t" + "mul v18.8h, v12.8h, v0.h[4]\n\t" + "sqrdmulh v11.8h, v11.8h, v0.h[3]\n\t" + "sqrdmulh v12.8h, v12.8h, v0.h[3]\n\t" + "sqrdmlsh v11.8h, v17.8h, v0.h[0]\n\t" + "sqrdmlsh v12.8h, 
v18.8h, v0.h[0]\n\t" + "sshr v11.8h, v11.8h, #1\n\t" + "sshr v12.8h, v12.8h, #1\n\t" + "mul v17.8h, v13.8h, v0.h[4]\n\t" + "mul v18.8h, v14.8h, v0.h[4]\n\t" + "sqrdmulh v13.8h, v13.8h, v0.h[3]\n\t" + "sqrdmulh v14.8h, v14.8h, v0.h[3]\n\t" + "sqrdmlsh v13.8h, v17.8h, v0.h[0]\n\t" + "sqrdmlsh v14.8h, v18.8h, v0.h[0]\n\t" + "sshr v13.8h, v13.8h, #1\n\t" + "sshr v14.8h, v14.8h, #1\n\t" + "mul v17.8h, v15.8h, v0.h[4]\n\t" + "mul v18.8h, v16.8h, v0.h[4]\n\t" + "sqrdmulh v15.8h, v15.8h, v0.h[3]\n\t" + "sqrdmulh v16.8h, v16.8h, v0.h[3]\n\t" + "sqrdmlsh v15.8h, v17.8h, v0.h[0]\n\t" + "sqrdmlsh v16.8h, v18.8h, v0.h[0]\n\t" + "sshr v15.8h, v15.8h, #1\n\t" + "sshr v16.8h, v16.8h, #1\n\t" + "st4 {v1.8h, v2.8h, v3.8h, v4.8h}, [%x[p]], #0x40\n\t" + "st4 {v5.8h, v6.8h, v7.8h, v8.8h}, [%x[p]], #0x40\n\t" + "st4 {v9.8h, v10.8h, v11.8h, v12.8h}, [%x[p]], #0x40\n\t" + "st4 {v13.8h, v14.8h, v15.8h, v16.8h}, [%x[p]], #0x40\n\t" + "ld4 {v1.8h, v2.8h, v3.8h, v4.8h}, [%x[p]], #0x40\n\t" + "ld4 {v5.8h, v6.8h, v7.8h, v8.8h}, [%x[p]], #0x40\n\t" + "ld4 {v9.8h, v10.8h, v11.8h, v12.8h}, [%x[p]], #0x40\n\t" + "ld4 {v13.8h, v14.8h, v15.8h, v16.8h}, [%x[p]], #0x40\n\t" + "sub %x[p], %x[p], #0x100\n\t" + "mul v17.8h, v1.8h, v0.h[4]\n\t" + "mul v18.8h, v2.8h, v0.h[4]\n\t" + "sqrdmulh v1.8h, v1.8h, v0.h[3]\n\t" + "sqrdmulh v2.8h, v2.8h, v0.h[3]\n\t" + "sqrdmlsh v1.8h, v17.8h, v0.h[0]\n\t" + "sqrdmlsh v2.8h, v18.8h, v0.h[0]\n\t" + "sshr v1.8h, v1.8h, #1\n\t" + "sshr v2.8h, v2.8h, #1\n\t" + "mul v17.8h, v3.8h, v0.h[4]\n\t" + "mul v18.8h, v4.8h, v0.h[4]\n\t" + "sqrdmulh v3.8h, v3.8h, v0.h[3]\n\t" + "sqrdmulh v4.8h, v4.8h, v0.h[3]\n\t" + "sqrdmlsh v3.8h, v17.8h, v0.h[0]\n\t" + "sqrdmlsh v4.8h, v18.8h, v0.h[0]\n\t" + "sshr v3.8h, v3.8h, #1\n\t" + "sshr v4.8h, v4.8h, #1\n\t" + "mul v17.8h, v5.8h, v0.h[4]\n\t" + "mul v18.8h, v6.8h, v0.h[4]\n\t" + "sqrdmulh v5.8h, v5.8h, v0.h[3]\n\t" + "sqrdmulh v6.8h, v6.8h, v0.h[3]\n\t" + "sqrdmlsh v5.8h, v17.8h, v0.h[0]\n\t" + "sqrdmlsh v6.8h, v18.8h, v0.h[0]\n\t" + "sshr v5.8h, v5.8h, #1\n\t" + "sshr v6.8h, v6.8h, #1\n\t" + "mul v17.8h, v7.8h, v0.h[4]\n\t" + "mul v18.8h, v8.8h, v0.h[4]\n\t" + "sqrdmulh v7.8h, v7.8h, v0.h[3]\n\t" + "sqrdmulh v8.8h, v8.8h, v0.h[3]\n\t" + "sqrdmlsh v7.8h, v17.8h, v0.h[0]\n\t" + "sqrdmlsh v8.8h, v18.8h, v0.h[0]\n\t" + "sshr v7.8h, v7.8h, #1\n\t" + "sshr v8.8h, v8.8h, #1\n\t" + "mul v17.8h, v9.8h, v0.h[4]\n\t" + "mul v18.8h, v10.8h, v0.h[4]\n\t" + "sqrdmulh v9.8h, v9.8h, v0.h[3]\n\t" + "sqrdmulh v10.8h, v10.8h, v0.h[3]\n\t" + "sqrdmlsh v9.8h, v17.8h, v0.h[0]\n\t" + "sqrdmlsh v10.8h, v18.8h, v0.h[0]\n\t" + "sshr v9.8h, v9.8h, #1\n\t" + "sshr v10.8h, v10.8h, #1\n\t" + "mul v17.8h, v11.8h, v0.h[4]\n\t" + "mul v18.8h, v12.8h, v0.h[4]\n\t" + "sqrdmulh v11.8h, v11.8h, v0.h[3]\n\t" + "sqrdmulh v12.8h, v12.8h, v0.h[3]\n\t" + "sqrdmlsh v11.8h, v17.8h, v0.h[0]\n\t" + "sqrdmlsh v12.8h, v18.8h, v0.h[0]\n\t" + "sshr v11.8h, v11.8h, #1\n\t" + "sshr v12.8h, v12.8h, #1\n\t" + "mul v17.8h, v13.8h, v0.h[4]\n\t" + "mul v18.8h, v14.8h, v0.h[4]\n\t" + "sqrdmulh v13.8h, v13.8h, v0.h[3]\n\t" + "sqrdmulh v14.8h, v14.8h, v0.h[3]\n\t" + "sqrdmlsh v13.8h, v17.8h, v0.h[0]\n\t" + "sqrdmlsh v14.8h, v18.8h, v0.h[0]\n\t" + "sshr v13.8h, v13.8h, #1\n\t" + "sshr v14.8h, v14.8h, #1\n\t" + "mul v17.8h, v15.8h, v0.h[4]\n\t" + "mul v18.8h, v16.8h, v0.h[4]\n\t" + "sqrdmulh v15.8h, v15.8h, v0.h[3]\n\t" + "sqrdmulh v16.8h, v16.8h, v0.h[3]\n\t" + "sqrdmlsh v15.8h, v17.8h, v0.h[0]\n\t" + "sqrdmlsh v16.8h, v18.8h, v0.h[0]\n\t" + "sshr v15.8h, v15.8h, #1\n\t" + "sshr v16.8h, v16.8h, #1\n\t" + "st4 {v1.8h, 
v2.8h, v3.8h, v4.8h}, [%x[p]], #0x40\n\t" + "st4 {v5.8h, v6.8h, v7.8h, v8.8h}, [%x[p]], #0x40\n\t" + "st4 {v9.8h, v10.8h, v11.8h, v12.8h}, [%x[p]], #0x40\n\t" + "st4 {v13.8h, v14.8h, v15.8h, v16.8h}, [%x[p]], #0x40\n\t" + : [p] "+r" (p) + : [L_kyber_aarch64_q] "S" (L_kyber_aarch64_q), [L_kyber_aarch64_consts] "S" (L_kyber_aarch64_consts), [L_kyber_aarch64_zetas] "S" (L_kyber_aarch64_zetas), [L_kyber_aarch64_zetas_qinv] "S" (L_kyber_aarch64_zetas_qinv), [L_kyber_aarch64_zetas_inv] "S" (L_kyber_aarch64_zetas_inv), [L_kyber_aarch64_zetas_inv_qinv] "S" (L_kyber_aarch64_zetas_inv_qinv), [L_kyber_aarch64_zetas_mul] "S" (L_kyber_aarch64_zetas_mul) + : "memory", "x1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "cc" + ); +} + +static const uint16_t L_kyber_aarch64_to_msg_neon_low[] = { + 0x373, + 0x373, + 0x373, + 0x373, + 0x373, + 0x373, + 0x373, + 0x373, +}; + +static const uint16_t L_kyber_aarch64_to_msg_neon_high[] = { + 0x9c0, + 0x9c0, + 0x9c0, + 0x9c0, + 0x9c0, + 0x9c0, + 0x9c0, + 0x9c0, +}; + +static const uint16_t L_kyber_aarch64_to_msg_neon_bits[] = { + 0x1, + 0x2, + 0x4, + 0x8, + 0x10, + 0x20, + 0x40, + 0x80, +}; + +void kyber_to_msg_neon(byte* msg, sword16* p) +{ + __asm__ __volatile__ ( +#ifndef __APPLE__ + "adrp x2, %[L_kyber_aarch64_to_msg_neon_low]\n\t" + "add x2, x2, :lo12:%[L_kyber_aarch64_to_msg_neon_low]\n\t" +#else + "adrp x2, %[L_kyber_aarch64_to_msg_neon_low]@PAGE\n\t" + "add x2, x2, %[L_kyber_aarch64_to_msg_neon_low]@PAGEOFF\n\t" +#endif /* __APPLE__ */ +#ifndef __APPLE__ + "adrp x3, %[L_kyber_aarch64_to_msg_neon_high]\n\t" + "add x3, x3, :lo12:%[L_kyber_aarch64_to_msg_neon_high]\n\t" +#else + "adrp x3, %[L_kyber_aarch64_to_msg_neon_high]@PAGE\n\t" + "add x3, x3, %[L_kyber_aarch64_to_msg_neon_high]@PAGEOFF\n\t" +#endif /* __APPLE__ */ +#ifndef __APPLE__ + "adrp x4, %[L_kyber_aarch64_to_msg_neon_bits]\n\t" + "add x4, x4, :lo12:%[L_kyber_aarch64_to_msg_neon_bits]\n\t" +#else + "adrp x4, %[L_kyber_aarch64_to_msg_neon_bits]@PAGE\n\t" + "add x4, x4, %[L_kyber_aarch64_to_msg_neon_bits]@PAGEOFF\n\t" +#endif /* __APPLE__ */ + "ldr q0, [x2]\n\t" + "ldr q1, [x3]\n\t" + "ldr q26, [x4]\n\t" + "ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [%x[p]], #0x40\n\t" + "ld1 {v6.8h, v7.8h, v8.8h, v9.8h}, [%x[p]], #0x40\n\t" + "cmge v10.8h, v2.8h, v0.8h\n\t" + "cmge v18.8h, v1.8h, v2.8h\n\t" + "cmge v11.8h, v3.8h, v0.8h\n\t" + "cmge v19.8h, v1.8h, v3.8h\n\t" + "cmge v12.8h, v4.8h, v0.8h\n\t" + "cmge v20.8h, v1.8h, v4.8h\n\t" + "cmge v13.8h, v5.8h, v0.8h\n\t" + "cmge v21.8h, v1.8h, v5.8h\n\t" + "cmge v14.8h, v6.8h, v0.8h\n\t" + "cmge v22.8h, v1.8h, v6.8h\n\t" + "cmge v15.8h, v7.8h, v0.8h\n\t" + "cmge v23.8h, v1.8h, v7.8h\n\t" + "cmge v16.8h, v8.8h, v0.8h\n\t" + "cmge v24.8h, v1.8h, v8.8h\n\t" + "cmge v17.8h, v9.8h, v0.8h\n\t" + "cmge v25.8h, v1.8h, v9.8h\n\t" + "and v18.16b, v18.16b, v10.16b\n\t" + "and v19.16b, v19.16b, v11.16b\n\t" + "and v20.16b, v20.16b, v12.16b\n\t" + "and v21.16b, v21.16b, v13.16b\n\t" + "and v22.16b, v22.16b, v14.16b\n\t" + "and v23.16b, v23.16b, v15.16b\n\t" + "and v24.16b, v24.16b, v16.16b\n\t" + "and v25.16b, v25.16b, v17.16b\n\t" + "and v18.16b, v18.16b, v26.16b\n\t" + "and v19.16b, v19.16b, v26.16b\n\t" + "and v20.16b, v20.16b, v26.16b\n\t" + "and v21.16b, v21.16b, v26.16b\n\t" + "and v22.16b, v22.16b, v26.16b\n\t" + "and v23.16b, v23.16b, v26.16b\n\t" + "and v24.16b, v24.16b, v26.16b\n\t" + "and v25.16b, v25.16b, v26.16b\n\t" + "addv h18, v18.8h\n\t" + "addv h19, v19.8h\n\t" + "addv h20, v20.8h\n\t" + 
"addv h21, v21.8h\n\t" + "addv h22, v22.8h\n\t" + "addv h23, v23.8h\n\t" + "addv h24, v24.8h\n\t" + "addv h25, v25.8h\n\t" + "ins v18.b[1], v19.b[0]\n\t" + "ins v18.b[2], v20.b[0]\n\t" + "ins v18.b[3], v21.b[0]\n\t" + "ins v18.b[4], v22.b[0]\n\t" + "ins v18.b[5], v23.b[0]\n\t" + "ins v18.b[6], v24.b[0]\n\t" + "ins v18.b[7], v25.b[0]\n\t" + "st1 {v18.8b}, [%x[msg]], #8\n\t" + "ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [%x[p]], #0x40\n\t" + "ld1 {v6.8h, v7.8h, v8.8h, v9.8h}, [%x[p]], #0x40\n\t" + "cmge v10.8h, v2.8h, v0.8h\n\t" + "cmge v18.8h, v1.8h, v2.8h\n\t" + "cmge v11.8h, v3.8h, v0.8h\n\t" + "cmge v19.8h, v1.8h, v3.8h\n\t" + "cmge v12.8h, v4.8h, v0.8h\n\t" + "cmge v20.8h, v1.8h, v4.8h\n\t" + "cmge v13.8h, v5.8h, v0.8h\n\t" + "cmge v21.8h, v1.8h, v5.8h\n\t" + "cmge v14.8h, v6.8h, v0.8h\n\t" + "cmge v22.8h, v1.8h, v6.8h\n\t" + "cmge v15.8h, v7.8h, v0.8h\n\t" + "cmge v23.8h, v1.8h, v7.8h\n\t" + "cmge v16.8h, v8.8h, v0.8h\n\t" + "cmge v24.8h, v1.8h, v8.8h\n\t" + "cmge v17.8h, v9.8h, v0.8h\n\t" + "cmge v25.8h, v1.8h, v9.8h\n\t" + "and v18.16b, v18.16b, v10.16b\n\t" + "and v19.16b, v19.16b, v11.16b\n\t" + "and v20.16b, v20.16b, v12.16b\n\t" + "and v21.16b, v21.16b, v13.16b\n\t" + "and v22.16b, v22.16b, v14.16b\n\t" + "and v23.16b, v23.16b, v15.16b\n\t" + "and v24.16b, v24.16b, v16.16b\n\t" + "and v25.16b, v25.16b, v17.16b\n\t" + "and v18.16b, v18.16b, v26.16b\n\t" + "and v19.16b, v19.16b, v26.16b\n\t" + "and v20.16b, v20.16b, v26.16b\n\t" + "and v21.16b, v21.16b, v26.16b\n\t" + "and v22.16b, v22.16b, v26.16b\n\t" + "and v23.16b, v23.16b, v26.16b\n\t" + "and v24.16b, v24.16b, v26.16b\n\t" + "and v25.16b, v25.16b, v26.16b\n\t" + "addv h18, v18.8h\n\t" + "addv h19, v19.8h\n\t" + "addv h20, v20.8h\n\t" + "addv h21, v21.8h\n\t" + "addv h22, v22.8h\n\t" + "addv h23, v23.8h\n\t" + "addv h24, v24.8h\n\t" + "addv h25, v25.8h\n\t" + "ins v18.b[1], v19.b[0]\n\t" + "ins v18.b[2], v20.b[0]\n\t" + "ins v18.b[3], v21.b[0]\n\t" + "ins v18.b[4], v22.b[0]\n\t" + "ins v18.b[5], v23.b[0]\n\t" + "ins v18.b[6], v24.b[0]\n\t" + "ins v18.b[7], v25.b[0]\n\t" + "st1 {v18.8b}, [%x[msg]], #8\n\t" + "ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [%x[p]], #0x40\n\t" + "ld1 {v6.8h, v7.8h, v8.8h, v9.8h}, [%x[p]], #0x40\n\t" + "cmge v10.8h, v2.8h, v0.8h\n\t" + "cmge v18.8h, v1.8h, v2.8h\n\t" + "cmge v11.8h, v3.8h, v0.8h\n\t" + "cmge v19.8h, v1.8h, v3.8h\n\t" + "cmge v12.8h, v4.8h, v0.8h\n\t" + "cmge v20.8h, v1.8h, v4.8h\n\t" + "cmge v13.8h, v5.8h, v0.8h\n\t" + "cmge v21.8h, v1.8h, v5.8h\n\t" + "cmge v14.8h, v6.8h, v0.8h\n\t" + "cmge v22.8h, v1.8h, v6.8h\n\t" + "cmge v15.8h, v7.8h, v0.8h\n\t" + "cmge v23.8h, v1.8h, v7.8h\n\t" + "cmge v16.8h, v8.8h, v0.8h\n\t" + "cmge v24.8h, v1.8h, v8.8h\n\t" + "cmge v17.8h, v9.8h, v0.8h\n\t" + "cmge v25.8h, v1.8h, v9.8h\n\t" + "and v18.16b, v18.16b, v10.16b\n\t" + "and v19.16b, v19.16b, v11.16b\n\t" + "and v20.16b, v20.16b, v12.16b\n\t" + "and v21.16b, v21.16b, v13.16b\n\t" + "and v22.16b, v22.16b, v14.16b\n\t" + "and v23.16b, v23.16b, v15.16b\n\t" + "and v24.16b, v24.16b, v16.16b\n\t" + "and v25.16b, v25.16b, v17.16b\n\t" + "and v18.16b, v18.16b, v26.16b\n\t" + "and v19.16b, v19.16b, v26.16b\n\t" + "and v20.16b, v20.16b, v26.16b\n\t" + "and v21.16b, v21.16b, v26.16b\n\t" + "and v22.16b, v22.16b, v26.16b\n\t" + "and v23.16b, v23.16b, v26.16b\n\t" + "and v24.16b, v24.16b, v26.16b\n\t" + "and v25.16b, v25.16b, v26.16b\n\t" + "addv h18, v18.8h\n\t" + "addv h19, v19.8h\n\t" + "addv h20, v20.8h\n\t" + "addv h21, v21.8h\n\t" + "addv h22, v22.8h\n\t" + "addv h23, v23.8h\n\t" + "addv h24, v24.8h\n\t" + "addv h25, 
v25.8h\n\t" + "ins v18.b[1], v19.b[0]\n\t" + "ins v18.b[2], v20.b[0]\n\t" + "ins v18.b[3], v21.b[0]\n\t" + "ins v18.b[4], v22.b[0]\n\t" + "ins v18.b[5], v23.b[0]\n\t" + "ins v18.b[6], v24.b[0]\n\t" + "ins v18.b[7], v25.b[0]\n\t" + "st1 {v18.8b}, [%x[msg]], #8\n\t" + "ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [%x[p]], #0x40\n\t" + "ld1 {v6.8h, v7.8h, v8.8h, v9.8h}, [%x[p]], #0x40\n\t" + "cmge v10.8h, v2.8h, v0.8h\n\t" + "cmge v18.8h, v1.8h, v2.8h\n\t" + "cmge v11.8h, v3.8h, v0.8h\n\t" + "cmge v19.8h, v1.8h, v3.8h\n\t" + "cmge v12.8h, v4.8h, v0.8h\n\t" + "cmge v20.8h, v1.8h, v4.8h\n\t" + "cmge v13.8h, v5.8h, v0.8h\n\t" + "cmge v21.8h, v1.8h, v5.8h\n\t" + "cmge v14.8h, v6.8h, v0.8h\n\t" + "cmge v22.8h, v1.8h, v6.8h\n\t" + "cmge v15.8h, v7.8h, v0.8h\n\t" + "cmge v23.8h, v1.8h, v7.8h\n\t" + "cmge v16.8h, v8.8h, v0.8h\n\t" + "cmge v24.8h, v1.8h, v8.8h\n\t" + "cmge v17.8h, v9.8h, v0.8h\n\t" + "cmge v25.8h, v1.8h, v9.8h\n\t" + "and v18.16b, v18.16b, v10.16b\n\t" + "and v19.16b, v19.16b, v11.16b\n\t" + "and v20.16b, v20.16b, v12.16b\n\t" + "and v21.16b, v21.16b, v13.16b\n\t" + "and v22.16b, v22.16b, v14.16b\n\t" + "and v23.16b, v23.16b, v15.16b\n\t" + "and v24.16b, v24.16b, v16.16b\n\t" + "and v25.16b, v25.16b, v17.16b\n\t" + "and v18.16b, v18.16b, v26.16b\n\t" + "and v19.16b, v19.16b, v26.16b\n\t" + "and v20.16b, v20.16b, v26.16b\n\t" + "and v21.16b, v21.16b, v26.16b\n\t" + "and v22.16b, v22.16b, v26.16b\n\t" + "and v23.16b, v23.16b, v26.16b\n\t" + "and v24.16b, v24.16b, v26.16b\n\t" + "and v25.16b, v25.16b, v26.16b\n\t" + "addv h18, v18.8h\n\t" + "addv h19, v19.8h\n\t" + "addv h20, v20.8h\n\t" + "addv h21, v21.8h\n\t" + "addv h22, v22.8h\n\t" + "addv h23, v23.8h\n\t" + "addv h24, v24.8h\n\t" + "addv h25, v25.8h\n\t" + "ins v18.b[1], v19.b[0]\n\t" + "ins v18.b[2], v20.b[0]\n\t" + "ins v18.b[3], v21.b[0]\n\t" + "ins v18.b[4], v22.b[0]\n\t" + "ins v18.b[5], v23.b[0]\n\t" + "ins v18.b[6], v24.b[0]\n\t" + "ins v18.b[7], v25.b[0]\n\t" + "st1 {v18.8b}, [%x[msg]], #8\n\t" + : [msg] "+r" (msg), [p] "+r" (p) + : [L_kyber_aarch64_q] "S" (L_kyber_aarch64_q), [L_kyber_aarch64_consts] "S" (L_kyber_aarch64_consts), [L_kyber_aarch64_zetas] "S" (L_kyber_aarch64_zetas), [L_kyber_aarch64_zetas_qinv] "S" (L_kyber_aarch64_zetas_qinv), [L_kyber_aarch64_zetas_inv] "S" (L_kyber_aarch64_zetas_inv), [L_kyber_aarch64_zetas_inv_qinv] "S" (L_kyber_aarch64_zetas_inv_qinv), [L_kyber_aarch64_zetas_mul] "S" (L_kyber_aarch64_zetas_mul), [L_kyber_aarch64_to_msg_neon_low] "S" (L_kyber_aarch64_to_msg_neon_low), [L_kyber_aarch64_to_msg_neon_high] "S" (L_kyber_aarch64_to_msg_neon_high), [L_kyber_aarch64_to_msg_neon_bits] "S" (L_kyber_aarch64_to_msg_neon_bits) + : "memory", "x2", "x3", "x4", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "cc" + ); +} + +static const uint16_t L_kyber_aarch64_from_msg_neon_q1half[] = { + 0x681, + 0x681, + 0x681, + 0x681, + 0x681, + 0x681, + 0x681, + 0x681, +}; + +static const uint8_t L_kyber_aarch64_from_msg_neon_bits[] = { + 0x1, + 0x2, + 0x4, + 0x8, + 0x10, + 0x20, + 0x40, + 0x80, + 0x1, + 0x2, + 0x4, + 0x8, + 0x10, + 0x20, + 0x40, + 0x80, +}; + +void kyber_from_msg_neon(sword16* p, const byte* msg) +{ + __asm__ __volatile__ ( +#ifndef __APPLE__ + "adrp x2, %[L_kyber_aarch64_from_msg_neon_q1half]\n\t" + "add x2, x2, :lo12:%[L_kyber_aarch64_from_msg_neon_q1half]\n\t" +#else + "adrp x2, %[L_kyber_aarch64_from_msg_neon_q1half]@PAGE\n\t" + "add x2, x2, 
%[L_kyber_aarch64_from_msg_neon_q1half]@PAGEOFF\n\t" +#endif /* __APPLE__ */ +#ifndef __APPLE__ + "adrp x3, %[L_kyber_aarch64_from_msg_neon_bits]\n\t" + "add x3, x3, :lo12:%[L_kyber_aarch64_from_msg_neon_bits]\n\t" +#else + "adrp x3, %[L_kyber_aarch64_from_msg_neon_bits]@PAGE\n\t" + "add x3, x3, %[L_kyber_aarch64_from_msg_neon_bits]@PAGEOFF\n\t" +#endif /* __APPLE__ */ + "ld1 {v2.16b, v3.16b}, [%x[msg]]\n\t" + "ldr q1, [x2]\n\t" + "ldr q0, [x3]\n\t" + "dup v4.8b, v2.b[0]\n\t" + "dup v5.8b, v2.b[1]\n\t" + "dup v6.8b, v2.b[2]\n\t" + "dup v7.8b, v2.b[3]\n\t" + "cmtst v4.8b, v4.8b, v0.8b\n\t" + "cmtst v5.8b, v5.8b, v0.8b\n\t" + "cmtst v6.8b, v6.8b, v0.8b\n\t" + "cmtst v7.8b, v7.8b, v0.8b\n\t" + "zip1 v4.16b, v4.16b, v4.16b\n\t" + "zip1 v5.16b, v5.16b, v5.16b\n\t" + "zip1 v6.16b, v6.16b, v6.16b\n\t" + "zip1 v7.16b, v7.16b, v7.16b\n\t" + "and v4.16b, v4.16b, v1.16b\n\t" + "and v5.16b, v5.16b, v1.16b\n\t" + "and v6.16b, v6.16b, v1.16b\n\t" + "and v7.16b, v7.16b, v1.16b\n\t" + "st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [%x[p]], #0x40\n\t" + "dup v4.8b, v2.b[4]\n\t" + "dup v5.8b, v2.b[5]\n\t" + "dup v6.8b, v2.b[6]\n\t" + "dup v7.8b, v2.b[7]\n\t" + "cmtst v4.8b, v4.8b, v0.8b\n\t" + "cmtst v5.8b, v5.8b, v0.8b\n\t" + "cmtst v6.8b, v6.8b, v0.8b\n\t" + "cmtst v7.8b, v7.8b, v0.8b\n\t" + "zip1 v4.16b, v4.16b, v4.16b\n\t" + "zip1 v5.16b, v5.16b, v5.16b\n\t" + "zip1 v6.16b, v6.16b, v6.16b\n\t" + "zip1 v7.16b, v7.16b, v7.16b\n\t" + "and v4.16b, v4.16b, v1.16b\n\t" + "and v5.16b, v5.16b, v1.16b\n\t" + "and v6.16b, v6.16b, v1.16b\n\t" + "and v7.16b, v7.16b, v1.16b\n\t" + "st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [%x[p]], #0x40\n\t" + "dup v4.8b, v2.b[8]\n\t" + "dup v5.8b, v2.b[9]\n\t" + "dup v6.8b, v2.b[10]\n\t" + "dup v7.8b, v2.b[11]\n\t" + "cmtst v4.8b, v4.8b, v0.8b\n\t" + "cmtst v5.8b, v5.8b, v0.8b\n\t" + "cmtst v6.8b, v6.8b, v0.8b\n\t" + "cmtst v7.8b, v7.8b, v0.8b\n\t" + "zip1 v4.16b, v4.16b, v4.16b\n\t" + "zip1 v5.16b, v5.16b, v5.16b\n\t" + "zip1 v6.16b, v6.16b, v6.16b\n\t" + "zip1 v7.16b, v7.16b, v7.16b\n\t" + "and v4.16b, v4.16b, v1.16b\n\t" + "and v5.16b, v5.16b, v1.16b\n\t" + "and v6.16b, v6.16b, v1.16b\n\t" + "and v7.16b, v7.16b, v1.16b\n\t" + "st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [%x[p]], #0x40\n\t" + "dup v4.8b, v2.b[12]\n\t" + "dup v5.8b, v2.b[13]\n\t" + "dup v6.8b, v2.b[14]\n\t" + "dup v7.8b, v2.b[15]\n\t" + "cmtst v4.8b, v4.8b, v0.8b\n\t" + "cmtst v5.8b, v5.8b, v0.8b\n\t" + "cmtst v6.8b, v6.8b, v0.8b\n\t" + "cmtst v7.8b, v7.8b, v0.8b\n\t" + "zip1 v4.16b, v4.16b, v4.16b\n\t" + "zip1 v5.16b, v5.16b, v5.16b\n\t" + "zip1 v6.16b, v6.16b, v6.16b\n\t" + "zip1 v7.16b, v7.16b, v7.16b\n\t" + "and v4.16b, v4.16b, v1.16b\n\t" + "and v5.16b, v5.16b, v1.16b\n\t" + "and v6.16b, v6.16b, v1.16b\n\t" + "and v7.16b, v7.16b, v1.16b\n\t" + "st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [%x[p]], #0x40\n\t" + "dup v4.8b, v3.b[0]\n\t" + "dup v5.8b, v3.b[1]\n\t" + "dup v6.8b, v3.b[2]\n\t" + "dup v7.8b, v3.b[3]\n\t" + "cmtst v4.8b, v4.8b, v0.8b\n\t" + "cmtst v5.8b, v5.8b, v0.8b\n\t" + "cmtst v6.8b, v6.8b, v0.8b\n\t" + "cmtst v7.8b, v7.8b, v0.8b\n\t" + "zip1 v4.16b, v4.16b, v4.16b\n\t" + "zip1 v5.16b, v5.16b, v5.16b\n\t" + "zip1 v6.16b, v6.16b, v6.16b\n\t" + "zip1 v7.16b, v7.16b, v7.16b\n\t" + "and v4.16b, v4.16b, v1.16b\n\t" + "and v5.16b, v5.16b, v1.16b\n\t" + "and v6.16b, v6.16b, v1.16b\n\t" + "and v7.16b, v7.16b, v1.16b\n\t" + "st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [%x[p]], #0x40\n\t" + "dup v4.8b, v3.b[4]\n\t" + "dup v5.8b, v3.b[5]\n\t" + "dup v6.8b, v3.b[6]\n\t" + "dup v7.8b, v3.b[7]\n\t" + "cmtst v4.8b, v4.8b, v0.8b\n\t" + "cmtst v5.8b, v5.8b, 
v0.8b\n\t" + "cmtst v6.8b, v6.8b, v0.8b\n\t" + "cmtst v7.8b, v7.8b, v0.8b\n\t" + "zip1 v4.16b, v4.16b, v4.16b\n\t" + "zip1 v5.16b, v5.16b, v5.16b\n\t" + "zip1 v6.16b, v6.16b, v6.16b\n\t" + "zip1 v7.16b, v7.16b, v7.16b\n\t" + "and v4.16b, v4.16b, v1.16b\n\t" + "and v5.16b, v5.16b, v1.16b\n\t" + "and v6.16b, v6.16b, v1.16b\n\t" + "and v7.16b, v7.16b, v1.16b\n\t" + "st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [%x[p]], #0x40\n\t" + "dup v4.8b, v3.b[8]\n\t" + "dup v5.8b, v3.b[9]\n\t" + "dup v6.8b, v3.b[10]\n\t" + "dup v7.8b, v3.b[11]\n\t" + "cmtst v4.8b, v4.8b, v0.8b\n\t" + "cmtst v5.8b, v5.8b, v0.8b\n\t" + "cmtst v6.8b, v6.8b, v0.8b\n\t" + "cmtst v7.8b, v7.8b, v0.8b\n\t" + "zip1 v4.16b, v4.16b, v4.16b\n\t" + "zip1 v5.16b, v5.16b, v5.16b\n\t" + "zip1 v6.16b, v6.16b, v6.16b\n\t" + "zip1 v7.16b, v7.16b, v7.16b\n\t" + "and v4.16b, v4.16b, v1.16b\n\t" + "and v5.16b, v5.16b, v1.16b\n\t" + "and v6.16b, v6.16b, v1.16b\n\t" + "and v7.16b, v7.16b, v1.16b\n\t" + "st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [%x[p]], #0x40\n\t" + "dup v4.8b, v3.b[12]\n\t" + "dup v5.8b, v3.b[13]\n\t" + "dup v6.8b, v3.b[14]\n\t" + "dup v7.8b, v3.b[15]\n\t" + "cmtst v4.8b, v4.8b, v0.8b\n\t" + "cmtst v5.8b, v5.8b, v0.8b\n\t" + "cmtst v6.8b, v6.8b, v0.8b\n\t" + "cmtst v7.8b, v7.8b, v0.8b\n\t" + "zip1 v4.16b, v4.16b, v4.16b\n\t" + "zip1 v5.16b, v5.16b, v5.16b\n\t" + "zip1 v6.16b, v6.16b, v6.16b\n\t" + "zip1 v7.16b, v7.16b, v7.16b\n\t" + "and v4.16b, v4.16b, v1.16b\n\t" + "and v5.16b, v5.16b, v1.16b\n\t" + "and v6.16b, v6.16b, v1.16b\n\t" + "and v7.16b, v7.16b, v1.16b\n\t" + "st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [%x[p]], #0x40\n\t" + : [p] "+r" (p), [msg] "+r" (msg) + : [L_kyber_aarch64_q] "S" (L_kyber_aarch64_q), [L_kyber_aarch64_consts] "S" (L_kyber_aarch64_consts), [L_kyber_aarch64_zetas] "S" (L_kyber_aarch64_zetas), [L_kyber_aarch64_zetas_qinv] "S" (L_kyber_aarch64_zetas_qinv), [L_kyber_aarch64_zetas_inv] "S" (L_kyber_aarch64_zetas_inv), [L_kyber_aarch64_zetas_inv_qinv] "S" (L_kyber_aarch64_zetas_inv_qinv), [L_kyber_aarch64_zetas_mul] "S" (L_kyber_aarch64_zetas_mul), [L_kyber_aarch64_to_msg_neon_low] "S" (L_kyber_aarch64_to_msg_neon_low), [L_kyber_aarch64_to_msg_neon_high] "S" (L_kyber_aarch64_to_msg_neon_high), [L_kyber_aarch64_to_msg_neon_bits] "S" (L_kyber_aarch64_to_msg_neon_bits), [L_kyber_aarch64_from_msg_neon_q1half] "S" (L_kyber_aarch64_from_msg_neon_q1half), [L_kyber_aarch64_from_msg_neon_bits] "S" (L_kyber_aarch64_from_msg_neon_bits) + : "memory", "x2", "x3", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "cc" + ); +} + +int kyber_cmp_neon(const byte* a, const byte* b, int sz) +{ + __asm__ __volatile__ ( + "ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [%x[a]], #0x40\n\t" + "ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [%x[b]], #0x40\n\t" + "eor v8.16b, v0.16b, v4.16b\n\t" + "eor v9.16b, v1.16b, v5.16b\n\t" + "eor v10.16b, v2.16b, v6.16b\n\t" + "eor v11.16b, v3.16b, v7.16b\n\t" + "ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [%x[a]], #0x40\n\t" + "ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [%x[b]], #0x40\n\t" + "eor v0.16b, v0.16b, v4.16b\n\t" + "eor v1.16b, v1.16b, v5.16b\n\t" + "eor v2.16b, v2.16b, v6.16b\n\t" + "eor v3.16b, v3.16b, v7.16b\n\t" + "orr v8.16b, v8.16b, v0.16b\n\t" + "orr v9.16b, v9.16b, v1.16b\n\t" + "orr v10.16b, v10.16b, v2.16b\n\t" + "orr v11.16b, v11.16b, v3.16b\n\t" + "ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [%x[a]], #0x40\n\t" + "ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [%x[b]], #0x40\n\t" + "eor v0.16b, v0.16b, v4.16b\n\t" + "eor v1.16b, v1.16b, v5.16b\n\t" + "eor v2.16b, v2.16b, v6.16b\n\t" + "eor v3.16b, v3.16b, 
v7.16b\n\t" + "orr v8.16b, v8.16b, v0.16b\n\t" + "orr v9.16b, v9.16b, v1.16b\n\t" + "orr v10.16b, v10.16b, v2.16b\n\t" + "orr v11.16b, v11.16b, v3.16b\n\t" + "ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [%x[a]], #0x40\n\t" + "ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [%x[b]], #0x40\n\t" + "eor v0.16b, v0.16b, v4.16b\n\t" + "eor v1.16b, v1.16b, v5.16b\n\t" + "eor v2.16b, v2.16b, v6.16b\n\t" + "eor v3.16b, v3.16b, v7.16b\n\t" + "orr v8.16b, v8.16b, v0.16b\n\t" + "orr v9.16b, v9.16b, v1.16b\n\t" + "orr v10.16b, v10.16b, v2.16b\n\t" + "orr v11.16b, v11.16b, v3.16b\n\t" + "ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [%x[a]], #0x40\n\t" + "ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [%x[b]], #0x40\n\t" + "eor v0.16b, v0.16b, v4.16b\n\t" + "eor v1.16b, v1.16b, v5.16b\n\t" + "eor v2.16b, v2.16b, v6.16b\n\t" + "eor v3.16b, v3.16b, v7.16b\n\t" + "orr v8.16b, v8.16b, v0.16b\n\t" + "orr v9.16b, v9.16b, v1.16b\n\t" + "orr v10.16b, v10.16b, v2.16b\n\t" + "orr v11.16b, v11.16b, v3.16b\n\t" + "ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [%x[a]], #0x40\n\t" + "ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [%x[b]], #0x40\n\t" + "eor v0.16b, v0.16b, v4.16b\n\t" + "eor v1.16b, v1.16b, v5.16b\n\t" + "eor v2.16b, v2.16b, v6.16b\n\t" + "eor v3.16b, v3.16b, v7.16b\n\t" + "orr v8.16b, v8.16b, v0.16b\n\t" + "orr v9.16b, v9.16b, v1.16b\n\t" + "orr v10.16b, v10.16b, v2.16b\n\t" + "orr v11.16b, v11.16b, v3.16b\n\t" + "ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [%x[a]], #0x40\n\t" + "ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [%x[b]], #0x40\n\t" + "eor v0.16b, v0.16b, v4.16b\n\t" + "eor v1.16b, v1.16b, v5.16b\n\t" + "eor v2.16b, v2.16b, v6.16b\n\t" + "eor v3.16b, v3.16b, v7.16b\n\t" + "orr v8.16b, v8.16b, v0.16b\n\t" + "orr v9.16b, v9.16b, v1.16b\n\t" + "orr v10.16b, v10.16b, v2.16b\n\t" + "orr v11.16b, v11.16b, v3.16b\n\t" + "ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [%x[a]], #0x40\n\t" + "ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [%x[b]], #0x40\n\t" + "eor v0.16b, v0.16b, v4.16b\n\t" + "eor v1.16b, v1.16b, v5.16b\n\t" + "eor v2.16b, v2.16b, v6.16b\n\t" + "eor v3.16b, v3.16b, v7.16b\n\t" + "orr v8.16b, v8.16b, v0.16b\n\t" + "orr v9.16b, v9.16b, v1.16b\n\t" + "orr v10.16b, v10.16b, v2.16b\n\t" + "orr v11.16b, v11.16b, v3.16b\n\t" + "ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [%x[a]], #0x40\n\t" + "ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [%x[b]], #0x40\n\t" + "eor v0.16b, v0.16b, v4.16b\n\t" + "eor v1.16b, v1.16b, v5.16b\n\t" + "eor v2.16b, v2.16b, v6.16b\n\t" + "eor v3.16b, v3.16b, v7.16b\n\t" + "orr v8.16b, v8.16b, v0.16b\n\t" + "orr v9.16b, v9.16b, v1.16b\n\t" + "orr v10.16b, v10.16b, v2.16b\n\t" + "orr v11.16b, v11.16b, v3.16b\n\t" + "ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [%x[a]], #0x40\n\t" + "ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [%x[b]], #0x40\n\t" + "eor v0.16b, v0.16b, v4.16b\n\t" + "eor v1.16b, v1.16b, v5.16b\n\t" + "eor v2.16b, v2.16b, v6.16b\n\t" + "eor v3.16b, v3.16b, v7.16b\n\t" + "orr v8.16b, v8.16b, v0.16b\n\t" + "orr v9.16b, v9.16b, v1.16b\n\t" + "orr v10.16b, v10.16b, v2.16b\n\t" + "orr v11.16b, v11.16b, v3.16b\n\t" + "ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [%x[a]], #0x40\n\t" + "ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [%x[b]], #0x40\n\t" + "eor v0.16b, v0.16b, v4.16b\n\t" + "eor v1.16b, v1.16b, v5.16b\n\t" + "eor v2.16b, v2.16b, v6.16b\n\t" + "eor v3.16b, v3.16b, v7.16b\n\t" + "orr v8.16b, v8.16b, v0.16b\n\t" + "orr v9.16b, v9.16b, v1.16b\n\t" + "orr v10.16b, v10.16b, v2.16b\n\t" + "orr v11.16b, v11.16b, v3.16b\n\t" + "ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [%x[a]], #0x40\n\t" + "ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [%x[b]], #0x40\n\t" + "eor v0.16b, v0.16b, 
v4.16b\n\t" + "eor v1.16b, v1.16b, v5.16b\n\t" + "eor v2.16b, v2.16b, v6.16b\n\t" + "eor v3.16b, v3.16b, v7.16b\n\t" + "orr v8.16b, v8.16b, v0.16b\n\t" + "orr v9.16b, v9.16b, v1.16b\n\t" + "orr v10.16b, v10.16b, v2.16b\n\t" + "orr v11.16b, v11.16b, v3.16b\n\t" + "subs %w[sz], %w[sz], #0x300\n\t" + "beq L_kyber_aarch64_cmp_neon_done_%=\n\t" + "ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [%x[a]], #0x40\n\t" + "ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [%x[b]], #0x40\n\t" + "eor v0.16b, v0.16b, v4.16b\n\t" + "eor v1.16b, v1.16b, v5.16b\n\t" + "eor v2.16b, v2.16b, v6.16b\n\t" + "eor v3.16b, v3.16b, v7.16b\n\t" + "orr v8.16b, v8.16b, v0.16b\n\t" + "orr v9.16b, v9.16b, v1.16b\n\t" + "orr v10.16b, v10.16b, v2.16b\n\t" + "orr v11.16b, v11.16b, v3.16b\n\t" + "ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [%x[a]], #0x40\n\t" + "ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [%x[b]], #0x40\n\t" + "eor v0.16b, v0.16b, v4.16b\n\t" + "eor v1.16b, v1.16b, v5.16b\n\t" + "eor v2.16b, v2.16b, v6.16b\n\t" + "eor v3.16b, v3.16b, v7.16b\n\t" + "orr v8.16b, v8.16b, v0.16b\n\t" + "orr v9.16b, v9.16b, v1.16b\n\t" + "orr v10.16b, v10.16b, v2.16b\n\t" + "orr v11.16b, v11.16b, v3.16b\n\t" + "ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [%x[a]], #0x40\n\t" + "ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [%x[b]], #0x40\n\t" + "eor v0.16b, v0.16b, v4.16b\n\t" + "eor v1.16b, v1.16b, v5.16b\n\t" + "eor v2.16b, v2.16b, v6.16b\n\t" + "eor v3.16b, v3.16b, v7.16b\n\t" + "orr v8.16b, v8.16b, v0.16b\n\t" + "orr v9.16b, v9.16b, v1.16b\n\t" + "orr v10.16b, v10.16b, v2.16b\n\t" + "orr v11.16b, v11.16b, v3.16b\n\t" + "ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [%x[a]], #0x40\n\t" + "ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [%x[b]], #0x40\n\t" + "eor v0.16b, v0.16b, v4.16b\n\t" + "eor v1.16b, v1.16b, v5.16b\n\t" + "eor v2.16b, v2.16b, v6.16b\n\t" + "eor v3.16b, v3.16b, v7.16b\n\t" + "orr v8.16b, v8.16b, v0.16b\n\t" + "orr v9.16b, v9.16b, v1.16b\n\t" + "orr v10.16b, v10.16b, v2.16b\n\t" + "orr v11.16b, v11.16b, v3.16b\n\t" + "ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [%x[a]], #0x40\n\t" + "ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [%x[b]], #0x40\n\t" + "eor v0.16b, v0.16b, v4.16b\n\t" + "eor v1.16b, v1.16b, v5.16b\n\t" + "eor v2.16b, v2.16b, v6.16b\n\t" + "eor v3.16b, v3.16b, v7.16b\n\t" + "orr v8.16b, v8.16b, v0.16b\n\t" + "orr v9.16b, v9.16b, v1.16b\n\t" + "orr v10.16b, v10.16b, v2.16b\n\t" + "orr v11.16b, v11.16b, v3.16b\n\t" + "subs %w[sz], %w[sz], #0x140\n\t" + "beq L_kyber_aarch64_cmp_neon_done_%=\n\t" + "ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [%x[a]], #0x40\n\t" + "ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [%x[b]], #0x40\n\t" + "eor v0.16b, v0.16b, v4.16b\n\t" + "eor v1.16b, v1.16b, v5.16b\n\t" + "eor v2.16b, v2.16b, v6.16b\n\t" + "eor v3.16b, v3.16b, v7.16b\n\t" + "orr v8.16b, v8.16b, v0.16b\n\t" + "orr v9.16b, v9.16b, v1.16b\n\t" + "orr v10.16b, v10.16b, v2.16b\n\t" + "orr v11.16b, v11.16b, v3.16b\n\t" + "ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [%x[a]], #0x40\n\t" + "ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [%x[b]], #0x40\n\t" + "eor v0.16b, v0.16b, v4.16b\n\t" + "eor v1.16b, v1.16b, v5.16b\n\t" + "eor v2.16b, v2.16b, v6.16b\n\t" + "eor v3.16b, v3.16b, v7.16b\n\t" + "orr v8.16b, v8.16b, v0.16b\n\t" + "orr v9.16b, v9.16b, v1.16b\n\t" + "orr v10.16b, v10.16b, v2.16b\n\t" + "orr v11.16b, v11.16b, v3.16b\n\t" + "ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [%x[a]], #0x40\n\t" + "ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [%x[b]], #0x40\n\t" + "eor v0.16b, v0.16b, v4.16b\n\t" + "eor v1.16b, v1.16b, v5.16b\n\t" + "eor v2.16b, v2.16b, v6.16b\n\t" + "eor v3.16b, v3.16b, v7.16b\n\t" + "orr v8.16b, v8.16b, 
v0.16b\n\t" + "orr v9.16b, v9.16b, v1.16b\n\t" + "orr v10.16b, v10.16b, v2.16b\n\t" + "orr v11.16b, v11.16b, v3.16b\n\t" + "ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [%x[a]], #0x40\n\t" + "ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [%x[b]], #0x40\n\t" + "eor v0.16b, v0.16b, v4.16b\n\t" + "eor v1.16b, v1.16b, v5.16b\n\t" + "eor v2.16b, v2.16b, v6.16b\n\t" + "eor v3.16b, v3.16b, v7.16b\n\t" + "orr v8.16b, v8.16b, v0.16b\n\t" + "orr v9.16b, v9.16b, v1.16b\n\t" + "orr v10.16b, v10.16b, v2.16b\n\t" + "orr v11.16b, v11.16b, v3.16b\n\t" + "ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [%x[a]], #0x40\n\t" + "ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [%x[b]], #0x40\n\t" + "eor v0.16b, v0.16b, v4.16b\n\t" + "eor v1.16b, v1.16b, v5.16b\n\t" + "eor v2.16b, v2.16b, v6.16b\n\t" + "eor v3.16b, v3.16b, v7.16b\n\t" + "orr v8.16b, v8.16b, v0.16b\n\t" + "orr v9.16b, v9.16b, v1.16b\n\t" + "orr v10.16b, v10.16b, v2.16b\n\t" + "orr v11.16b, v11.16b, v3.16b\n\t" + "ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [%x[a]], #0x40\n\t" + "ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [%x[b]], #0x40\n\t" + "eor v0.16b, v0.16b, v4.16b\n\t" + "eor v1.16b, v1.16b, v5.16b\n\t" + "eor v2.16b, v2.16b, v6.16b\n\t" + "eor v3.16b, v3.16b, v7.16b\n\t" + "orr v8.16b, v8.16b, v0.16b\n\t" + "orr v9.16b, v9.16b, v1.16b\n\t" + "orr v10.16b, v10.16b, v2.16b\n\t" + "orr v11.16b, v11.16b, v3.16b\n\t" + "ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [%x[a]], #0x40\n\t" + "ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [%x[b]], #0x40\n\t" + "eor v0.16b, v0.16b, v4.16b\n\t" + "eor v1.16b, v1.16b, v5.16b\n\t" + "eor v2.16b, v2.16b, v6.16b\n\t" + "eor v3.16b, v3.16b, v7.16b\n\t" + "orr v8.16b, v8.16b, v0.16b\n\t" + "orr v9.16b, v9.16b, v1.16b\n\t" + "orr v10.16b, v10.16b, v2.16b\n\t" + "orr v11.16b, v11.16b, v3.16b\n\t" + "ld2 {v0.16b, v1.16b}, [%x[a]]\n\t" + "ld2 {v4.16b, v5.16b}, [%x[b]]\n\t" + "eor v0.16b, v0.16b, v4.16b\n\t" + "eor v1.16b, v1.16b, v5.16b\n\t" + "orr v8.16b, v8.16b, v0.16b\n\t" + "orr v9.16b, v9.16b, v1.16b\n\t" + "\n" + "L_kyber_aarch64_cmp_neon_done_%=: \n\t" + "orr v8.16b, v8.16b, v9.16b\n\t" + "orr v10.16b, v10.16b, v11.16b\n\t" + "orr v8.16b, v8.16b, v10.16b\n\t" + "ins v9.b[0], v8.b[1]\n\t" + "orr v8.16b, v8.16b, v9.16b\n\t" + "mov x0, v8.d[0]\n\t" + "subs x0, x0, xzr\n\t" + "csetm w0, ne\n\t" + : [a] "+r" (a), [b] "+r" (b), [sz] "+r" (sz) + : [L_kyber_aarch64_q] "S" (L_kyber_aarch64_q), [L_kyber_aarch64_consts] "S" (L_kyber_aarch64_consts), [L_kyber_aarch64_zetas] "S" (L_kyber_aarch64_zetas), [L_kyber_aarch64_zetas_qinv] "S" (L_kyber_aarch64_zetas_qinv), [L_kyber_aarch64_zetas_inv] "S" (L_kyber_aarch64_zetas_inv), [L_kyber_aarch64_zetas_inv_qinv] "S" (L_kyber_aarch64_zetas_inv_qinv), [L_kyber_aarch64_zetas_mul] "S" (L_kyber_aarch64_zetas_mul), [L_kyber_aarch64_to_msg_neon_low] "S" (L_kyber_aarch64_to_msg_neon_low), [L_kyber_aarch64_to_msg_neon_high] "S" (L_kyber_aarch64_to_msg_neon_high), [L_kyber_aarch64_to_msg_neon_bits] "S" (L_kyber_aarch64_to_msg_neon_bits), [L_kyber_aarch64_from_msg_neon_q1half] "S" (L_kyber_aarch64_from_msg_neon_q1half), [L_kyber_aarch64_from_msg_neon_bits] "S" (L_kyber_aarch64_from_msg_neon_bits) + : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "cc" + ); + return (uint32_t)(size_t)a; +} + +static const uint16_t L_kyber_aarch64_rej_uniform_neon_mask[] = { + 0xfff, + 0xfff, + 0xfff, + 0xfff, + 0xfff, + 0xfff, + 0xfff, + 0xfff, +}; + +static const uint16_t L_kyber_aarch64_rej_uniform_neon_bits[] = { + 0x1, + 0x2, + 0x4, + 0x8, + 0x10, + 0x20, + 0x40, + 0x80, +}; + +static const uint8_t 
L_kyber_aarch64_rej_uniform_neon_indeces[] = { + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x4, + 0x5, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x4, + 0x5, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0x4, + 0x5, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0x4, + 0x5, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x6, + 0x7, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x6, + 0x7, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0x6, + 0x7, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0x6, + 0x7, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x4, + 0x5, + 0x6, + 0x7, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x4, + 0x5, + 0x6, + 0x7, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0x4, + 0x5, + 0x6, + 0x7, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0x4, + 0x5, + 0x6, + 0x7, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x8, + 0x9, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x8, + 0x9, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0x8, + 0x9, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0x8, + 0x9, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x4, + 0x5, + 0x8, + 0x9, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x4, + 0x5, + 0x8, + 0x9, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0x4, + 0x5, + 0x8, + 0x9, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0x4, + 0x5, + 0x8, + 0x9, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x6, + 0x7, + 0x8, + 0x9, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x6, + 0x7, + 0x8, + 0x9, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0x6, + 0x7, + 0x8, + 0x9, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0x6, + 0x7, + 0x8, + 0x9, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x4, + 0x5, + 0x6, + 0x7, + 0x8, + 0x9, + 0xff, + 
0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x4, + 0x5, + 0x6, + 0x7, + 0x8, + 0x9, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0x4, + 0x5, + 0x6, + 0x7, + 0x8, + 0x9, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0x4, + 0x5, + 0x6, + 0x7, + 0x8, + 0x9, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xa, + 0xb, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0xa, + 0xb, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0xa, + 0xb, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0xa, + 0xb, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x4, + 0x5, + 0xa, + 0xb, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x4, + 0x5, + 0xa, + 0xb, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0x4, + 0x5, + 0xa, + 0xb, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0x4, + 0x5, + 0xa, + 0xb, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x6, + 0x7, + 0xa, + 0xb, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x6, + 0x7, + 0xa, + 0xb, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0x6, + 0x7, + 0xa, + 0xb, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0x6, + 0x7, + 0xa, + 0xb, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x4, + 0x5, + 0x6, + 0x7, + 0xa, + 0xb, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x4, + 0x5, + 0x6, + 0x7, + 0xa, + 0xb, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0x4, + 0x5, + 0x6, + 0x7, + 0xa, + 0xb, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0x4, + 0x5, + 0x6, + 0x7, + 0xa, + 0xb, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x8, + 0x9, + 0xa, + 0xb, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x8, + 0x9, + 0xa, + 0xb, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0x8, + 0x9, + 0xa, + 0xb, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0x8, + 0x9, + 0xa, + 0xb, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x4, + 0x5, + 0x8, + 0x9, + 0xa, + 0xb, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x4, + 0x5, + 0x8, + 0x9, + 0xa, + 0xb, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0x4, + 0x5, + 0x8, + 0x9, + 0xa, + 0xb, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0x4, + 0x5, + 0x8, + 0x9, + 0xa, + 0xb, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x6, + 0x7, + 0x8, + 0x9, + 0xa, + 0xb, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x6, + 0x7, + 0x8, + 0x9, + 0xa, + 0xb, + 0xff, + 0xff, + 0xff, + 
0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0x6, + 0x7, + 0x8, + 0x9, + 0xa, + 0xb, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0x6, + 0x7, + 0x8, + 0x9, + 0xa, + 0xb, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x4, + 0x5, + 0x6, + 0x7, + 0x8, + 0x9, + 0xa, + 0xb, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x4, + 0x5, + 0x6, + 0x7, + 0x8, + 0x9, + 0xa, + 0xb, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0x4, + 0x5, + 0x6, + 0x7, + 0x8, + 0x9, + 0xa, + 0xb, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0x4, + 0x5, + 0x6, + 0x7, + 0x8, + 0x9, + 0xa, + 0xb, + 0xff, + 0xff, + 0xff, + 0xff, + 0xc, + 0xd, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0xc, + 0xd, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0xc, + 0xd, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0xc, + 0xd, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x4, + 0x5, + 0xc, + 0xd, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x4, + 0x5, + 0xc, + 0xd, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0x4, + 0x5, + 0xc, + 0xd, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0x4, + 0x5, + 0xc, + 0xd, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x6, + 0x7, + 0xc, + 0xd, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x6, + 0x7, + 0xc, + 0xd, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0x6, + 0x7, + 0xc, + 0xd, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0x6, + 0x7, + 0xc, + 0xd, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x4, + 0x5, + 0x6, + 0x7, + 0xc, + 0xd, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x4, + 0x5, + 0x6, + 0x7, + 0xc, + 0xd, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0x4, + 0x5, + 0x6, + 0x7, + 0xc, + 0xd, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0x4, + 0x5, + 0x6, + 0x7, + 0xc, + 0xd, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x8, + 0x9, + 0xc, + 0xd, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x8, + 0x9, + 0xc, + 0xd, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0x8, + 0x9, + 0xc, + 0xd, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0x8, + 0x9, + 0xc, + 0xd, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x4, + 0x5, + 0x8, + 0x9, + 0xc, + 0xd, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x4, + 0x5, + 0x8, + 0x9, + 0xc, + 0xd, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0x4, + 0x5, + 0x8, + 0x9, + 0xc, + 0xd, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 
0x0, + 0x1, + 0x2, + 0x3, + 0x4, + 0x5, + 0x8, + 0x9, + 0xc, + 0xd, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x6, + 0x7, + 0x8, + 0x9, + 0xc, + 0xd, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x6, + 0x7, + 0x8, + 0x9, + 0xc, + 0xd, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0x6, + 0x7, + 0x8, + 0x9, + 0xc, + 0xd, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0x6, + 0x7, + 0x8, + 0x9, + 0xc, + 0xd, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x4, + 0x5, + 0x6, + 0x7, + 0x8, + 0x9, + 0xc, + 0xd, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x4, + 0x5, + 0x6, + 0x7, + 0x8, + 0x9, + 0xc, + 0xd, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0x4, + 0x5, + 0x6, + 0x7, + 0x8, + 0x9, + 0xc, + 0xd, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0x4, + 0x5, + 0x6, + 0x7, + 0x8, + 0x9, + 0xc, + 0xd, + 0xff, + 0xff, + 0xff, + 0xff, + 0xa, + 0xb, + 0xc, + 0xd, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0xa, + 0xb, + 0xc, + 0xd, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0xa, + 0xb, + 0xc, + 0xd, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0xa, + 0xb, + 0xc, + 0xd, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x4, + 0x5, + 0xa, + 0xb, + 0xc, + 0xd, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x4, + 0x5, + 0xa, + 0xb, + 0xc, + 0xd, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0x4, + 0x5, + 0xa, + 0xb, + 0xc, + 0xd, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0x4, + 0x5, + 0xa, + 0xb, + 0xc, + 0xd, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x6, + 0x7, + 0xa, + 0xb, + 0xc, + 0xd, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x6, + 0x7, + 0xa, + 0xb, + 0xc, + 0xd, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0x6, + 0x7, + 0xa, + 0xb, + 0xc, + 0xd, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0x6, + 0x7, + 0xa, + 0xb, + 0xc, + 0xd, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x4, + 0x5, + 0x6, + 0x7, + 0xa, + 0xb, + 0xc, + 0xd, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x4, + 0x5, + 0x6, + 0x7, + 0xa, + 0xb, + 0xc, + 0xd, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0x4, + 0x5, + 0x6, + 0x7, + 0xa, + 0xb, + 0xc, + 0xd, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0x4, + 0x5, + 0x6, + 0x7, + 0xa, + 0xb, + 0xc, + 0xd, + 0xff, + 0xff, + 0xff, + 0xff, + 0x8, + 0x9, + 0xa, + 0xb, + 0xc, + 0xd, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x8, + 0x9, + 0xa, + 0xb, + 0xc, + 0xd, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0x8, + 0x9, + 0xa, + 0xb, + 0xc, + 0xd, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0x8, + 0x9, + 0xa, + 0xb, + 0xc, + 0xd, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x4, + 0x5, + 0x8, + 0x9, + 0xa, + 0xb, + 0xc, + 0xd, + 0xff, + 0xff, + 0xff, + 
0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x4, + 0x5, + 0x8, + 0x9, + 0xa, + 0xb, + 0xc, + 0xd, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0x4, + 0x5, + 0x8, + 0x9, + 0xa, + 0xb, + 0xc, + 0xd, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0x4, + 0x5, + 0x8, + 0x9, + 0xa, + 0xb, + 0xc, + 0xd, + 0xff, + 0xff, + 0xff, + 0xff, + 0x6, + 0x7, + 0x8, + 0x9, + 0xa, + 0xb, + 0xc, + 0xd, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x6, + 0x7, + 0x8, + 0x9, + 0xa, + 0xb, + 0xc, + 0xd, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0x6, + 0x7, + 0x8, + 0x9, + 0xa, + 0xb, + 0xc, + 0xd, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0x6, + 0x7, + 0x8, + 0x9, + 0xa, + 0xb, + 0xc, + 0xd, + 0xff, + 0xff, + 0xff, + 0xff, + 0x4, + 0x5, + 0x6, + 0x7, + 0x8, + 0x9, + 0xa, + 0xb, + 0xc, + 0xd, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x4, + 0x5, + 0x6, + 0x7, + 0x8, + 0x9, + 0xa, + 0xb, + 0xc, + 0xd, + 0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0x4, + 0x5, + 0x6, + 0x7, + 0x8, + 0x9, + 0xa, + 0xb, + 0xc, + 0xd, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0x4, + 0x5, + 0x6, + 0x7, + 0x8, + 0x9, + 0xa, + 0xb, + 0xc, + 0xd, + 0xff, + 0xff, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x4, + 0x5, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x4, + 0x5, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0x4, + 0x5, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0x4, + 0x5, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x6, + 0x7, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x6, + 0x7, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0x6, + 0x7, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0x6, + 0x7, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x4, + 0x5, + 0x6, + 0x7, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x4, + 0x5, + 0x6, + 0x7, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0x4, + 0x5, + 0x6, + 0x7, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0x4, + 0x5, + 0x6, + 0x7, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x8, + 0x9, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x8, + 0x9, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0x8, + 0x9, + 
0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0x8, + 0x9, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x4, + 0x5, + 0x8, + 0x9, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x4, + 0x5, + 0x8, + 0x9, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0x4, + 0x5, + 0x8, + 0x9, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0x4, + 0x5, + 0x8, + 0x9, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x6, + 0x7, + 0x8, + 0x9, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x6, + 0x7, + 0x8, + 0x9, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0x6, + 0x7, + 0x8, + 0x9, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0x6, + 0x7, + 0x8, + 0x9, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x4, + 0x5, + 0x6, + 0x7, + 0x8, + 0x9, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x4, + 0x5, + 0x6, + 0x7, + 0x8, + 0x9, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0x4, + 0x5, + 0x6, + 0x7, + 0x8, + 0x9, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0x4, + 0x5, + 0x6, + 0x7, + 0x8, + 0x9, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xa, + 0xb, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0xa, + 0xb, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0xa, + 0xb, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0xa, + 0xb, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x4, + 0x5, + 0xa, + 0xb, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x4, + 0x5, + 0xa, + 0xb, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0x4, + 0x5, + 0xa, + 0xb, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0x4, + 0x5, + 0xa, + 0xb, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x6, + 0x7, + 0xa, + 0xb, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x6, + 0x7, + 0xa, + 0xb, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0x6, + 0x7, + 0xa, + 0xb, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0x6, + 0x7, + 0xa, + 0xb, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x4, + 0x5, + 0x6, + 0x7, + 0xa, + 0xb, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x4, + 0x5, + 0x6, + 0x7, + 0xa, + 0xb, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0x4, + 0x5, + 0x6, + 0x7, + 0xa, + 0xb, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0x4, + 0x5, + 0x6, + 0x7, + 0xa, + 0xb, + 0xe, + 0xf, + 0xff, + 0xff, + 
0xff, + 0xff, + 0x8, + 0x9, + 0xa, + 0xb, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x8, + 0x9, + 0xa, + 0xb, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0x8, + 0x9, + 0xa, + 0xb, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0x8, + 0x9, + 0xa, + 0xb, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x4, + 0x5, + 0x8, + 0x9, + 0xa, + 0xb, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x4, + 0x5, + 0x8, + 0x9, + 0xa, + 0xb, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0x4, + 0x5, + 0x8, + 0x9, + 0xa, + 0xb, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0x4, + 0x5, + 0x8, + 0x9, + 0xa, + 0xb, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0x6, + 0x7, + 0x8, + 0x9, + 0xa, + 0xb, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x6, + 0x7, + 0x8, + 0x9, + 0xa, + 0xb, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0x6, + 0x7, + 0x8, + 0x9, + 0xa, + 0xb, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0x6, + 0x7, + 0x8, + 0x9, + 0xa, + 0xb, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0x4, + 0x5, + 0x6, + 0x7, + 0x8, + 0x9, + 0xa, + 0xb, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x4, + 0x5, + 0x6, + 0x7, + 0x8, + 0x9, + 0xa, + 0xb, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0x4, + 0x5, + 0x6, + 0x7, + 0x8, + 0x9, + 0xa, + 0xb, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0x4, + 0x5, + 0x6, + 0x7, + 0x8, + 0x9, + 0xa, + 0xb, + 0xe, + 0xf, + 0xff, + 0xff, + 0xc, + 0xd, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0xc, + 0xd, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0xc, + 0xd, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0xc, + 0xd, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x4, + 0x5, + 0xc, + 0xd, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x4, + 0x5, + 0xc, + 0xd, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0x4, + 0x5, + 0xc, + 0xd, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0x4, + 0x5, + 0xc, + 0xd, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x6, + 0x7, + 0xc, + 0xd, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x6, + 0x7, + 0xc, + 0xd, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0x6, + 0x7, + 0xc, + 0xd, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0x6, + 0x7, + 0xc, + 0xd, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x4, + 0x5, + 0x6, + 0x7, + 0xc, + 0xd, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x4, + 0x5, + 0x6, + 0x7, + 0xc, + 0xd, + 0xe, + 0xf, + 0xff, + 
0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0x4, + 0x5, + 0x6, + 0x7, + 0xc, + 0xd, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0x4, + 0x5, + 0x6, + 0x7, + 0xc, + 0xd, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0x8, + 0x9, + 0xc, + 0xd, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x8, + 0x9, + 0xc, + 0xd, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0x8, + 0x9, + 0xc, + 0xd, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0x8, + 0x9, + 0xc, + 0xd, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x4, + 0x5, + 0x8, + 0x9, + 0xc, + 0xd, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x4, + 0x5, + 0x8, + 0x9, + 0xc, + 0xd, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0x4, + 0x5, + 0x8, + 0x9, + 0xc, + 0xd, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0x4, + 0x5, + 0x8, + 0x9, + 0xc, + 0xd, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0x6, + 0x7, + 0x8, + 0x9, + 0xc, + 0xd, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x6, + 0x7, + 0x8, + 0x9, + 0xc, + 0xd, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0x6, + 0x7, + 0x8, + 0x9, + 0xc, + 0xd, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0x6, + 0x7, + 0x8, + 0x9, + 0xc, + 0xd, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0x4, + 0x5, + 0x6, + 0x7, + 0x8, + 0x9, + 0xc, + 0xd, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x4, + 0x5, + 0x6, + 0x7, + 0x8, + 0x9, + 0xc, + 0xd, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0x4, + 0x5, + 0x6, + 0x7, + 0x8, + 0x9, + 0xc, + 0xd, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0x4, + 0x5, + 0x6, + 0x7, + 0x8, + 0x9, + 0xc, + 0xd, + 0xe, + 0xf, + 0xff, + 0xff, + 0xa, + 0xb, + 0xc, + 0xd, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0xa, + 0xb, + 0xc, + 0xd, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0xa, + 0xb, + 0xc, + 0xd, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0xa, + 0xb, + 0xc, + 0xd, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x4, + 0x5, + 0xa, + 0xb, + 0xc, + 0xd, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x4, + 0x5, + 0xa, + 0xb, + 0xc, + 0xd, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0x4, + 0x5, + 0xa, + 0xb, + 0xc, + 0xd, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0x4, + 0x5, + 0xa, + 0xb, + 0xc, + 0xd, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0x6, + 0x7, + 0xa, + 0xb, + 0xc, + 0xd, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x6, + 0x7, + 0xa, + 0xb, + 0xc, + 0xd, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0x6, + 0x7, + 0xa, + 0xb, + 0xc, + 0xd, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0x6, + 0x7, + 0xa, + 0xb, + 0xc, + 0xd, + 0xe, + 0xf, + 
0xff, + 0xff, + 0xff, + 0xff, + 0x4, + 0x5, + 0x6, + 0x7, + 0xa, + 0xb, + 0xc, + 0xd, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x4, + 0x5, + 0x6, + 0x7, + 0xa, + 0xb, + 0xc, + 0xd, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0x4, + 0x5, + 0x6, + 0x7, + 0xa, + 0xb, + 0xc, + 0xd, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0x4, + 0x5, + 0x6, + 0x7, + 0xa, + 0xb, + 0xc, + 0xd, + 0xe, + 0xf, + 0xff, + 0xff, + 0x8, + 0x9, + 0xa, + 0xb, + 0xc, + 0xd, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x8, + 0x9, + 0xa, + 0xb, + 0xc, + 0xd, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0x8, + 0x9, + 0xa, + 0xb, + 0xc, + 0xd, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0x8, + 0x9, + 0xa, + 0xb, + 0xc, + 0xd, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0x4, + 0x5, + 0x8, + 0x9, + 0xa, + 0xb, + 0xc, + 0xd, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x4, + 0x5, + 0x8, + 0x9, + 0xa, + 0xb, + 0xc, + 0xd, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0x4, + 0x5, + 0x8, + 0x9, + 0xa, + 0xb, + 0xc, + 0xd, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0x4, + 0x5, + 0x8, + 0x9, + 0xa, + 0xb, + 0xc, + 0xd, + 0xe, + 0xf, + 0xff, + 0xff, + 0x6, + 0x7, + 0x8, + 0x9, + 0xa, + 0xb, + 0xc, + 0xd, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x6, + 0x7, + 0x8, + 0x9, + 0xa, + 0xb, + 0xc, + 0xd, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0x6, + 0x7, + 0x8, + 0x9, + 0xa, + 0xb, + 0xc, + 0xd, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0x6, + 0x7, + 0x8, + 0x9, + 0xa, + 0xb, + 0xc, + 0xd, + 0xe, + 0xf, + 0xff, + 0xff, + 0x4, + 0x5, + 0x6, + 0x7, + 0x8, + 0x9, + 0xa, + 0xb, + 0xc, + 0xd, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x4, + 0x5, + 0x6, + 0x7, + 0x8, + 0x9, + 0xa, + 0xb, + 0xc, + 0xd, + 0xe, + 0xf, + 0xff, + 0xff, + 0x2, + 0x3, + 0x4, + 0x5, + 0x6, + 0x7, + 0x8, + 0x9, + 0xa, + 0xb, + 0xc, + 0xd, + 0xe, + 0xf, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0x4, + 0x5, + 0x6, + 0x7, + 0x8, + 0x9, + 0xa, + 0xb, + 0xc, + 0xd, + 0xe, + 0xf, +}; + +unsigned int kyber_rej_uniform_neon(sword16* p, unsigned int len, const byte* r, unsigned int rLen) +{ + __asm__ __volatile__ ( +#ifndef __APPLE__ + "adrp x4, %[L_kyber_aarch64_rej_uniform_neon_mask]\n\t" + "add x4, x4, :lo12:%[L_kyber_aarch64_rej_uniform_neon_mask]\n\t" +#else + "adrp x4, %[L_kyber_aarch64_rej_uniform_neon_mask]@PAGE\n\t" + "add x4, x4, %[L_kyber_aarch64_rej_uniform_neon_mask]@PAGEOFF\n\t" +#endif /* __APPLE__ */ +#ifndef __APPLE__ + "adrp x5, %[L_kyber_aarch64_q]\n\t" + "add x5, x5, :lo12:%[L_kyber_aarch64_q]\n\t" +#else + "adrp x5, %[L_kyber_aarch64_q]@PAGE\n\t" + "add x5, x5, %[L_kyber_aarch64_q]@PAGEOFF\n\t" +#endif /* __APPLE__ */ +#ifndef __APPLE__ + "adrp x6, %[L_kyber_aarch64_rej_uniform_neon_bits]\n\t" + "add x6, x6, :lo12:%[L_kyber_aarch64_rej_uniform_neon_bits]\n\t" +#else + "adrp x6, %[L_kyber_aarch64_rej_uniform_neon_bits]@PAGE\n\t" + "add x6, x6, %[L_kyber_aarch64_rej_uniform_neon_bits]@PAGEOFF\n\t" +#endif /* __APPLE__ */ +#ifndef __APPLE__ + "adrp x7, %[L_kyber_aarch64_rej_uniform_neon_indeces]\n\t" + "add x7, x7, :lo12:%[L_kyber_aarch64_rej_uniform_neon_indeces]\n\t" +#else + "adrp x7, %[L_kyber_aarch64_rej_uniform_neon_indeces]@PAGE\n\t" + 
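/* The table above holds one 16-byte TBL index vector per possible 8-bit "coefficient < q" bitmap (256 x 16 bytes); 0xff entries select zero bytes. The loop below builds the bitmap from the compare mask via the _bits constants, scales it by 16 to index this table, and uses TBL to pack the accepted 16-bit coefficients to the front before advancing the output pointer by twice the accepted count. */ +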
"add x7, x7, %[L_kyber_aarch64_rej_uniform_neon_indeces]@PAGEOFF\n\t" +#endif /* __APPLE__ */ + "eor v1.16b, v1.16b, v1.16b\n\t" + "eor v12.16b, v12.16b, v12.16b\n\t" + "eor v13.16b, v13.16b, v13.16b\n\t" + "eor x12, x12, x12\n\t" + "eor v10.16b, v10.16b, v10.16b\n\t" + "eor v11.16b, v11.16b, v11.16b\n\t" + "mov x13, #0xd01\n\t" + "ldr q0, [x4]\n\t" + "ldr q3, [x5]\n\t" + "ldr q2, [x6]\n\t" + "subs wzr, %w[len], #0\n\t" + "beq L_kyber_aarch64_rej_uniform_neon_done_%=\n\t" + "subs wzr, %w[len], #16\n\t" + "blt L_kyber_aarch64_rej_uniform_neon_loop_4_%=\n\t" + "\n" + "L_kyber_aarch64_rej_uniform_neon_loop_16_%=: \n\t" + "ld3 {v4.8b, v5.8b, v6.8b}, [%x[r]], #24\n\t" + "zip1 v4.16b, v4.16b, v1.16b\n\t" + "zip1 v5.16b, v5.16b, v1.16b\n\t" + "zip1 v6.16b, v6.16b, v1.16b\n\t" + "shl v7.8h, v5.8h, #8\n\t" + "ushr v8.8h, v5.8h, #4\n\t" + "shl v6.8h, v6.8h, #4\n\t" + "orr v4.16b, v4.16b, v7.16b\n\t" + "orr v5.16b, v8.16b, v6.16b\n\t" + "and v7.16b, v4.16b, v0.16b\n\t" + "and v8.16b, v5.16b, v0.16b\n\t" + "zip1 v4.8h, v7.8h, v8.8h\n\t" + "zip2 v5.8h, v7.8h, v8.8h\n\t" + "cmgt v7.8h, v3.8h, v4.8h\n\t" + "cmgt v8.8h, v3.8h, v5.8h\n\t" + "ushr v12.8h, v7.8h, #15\n\t" + "ushr v13.8h, v8.8h, #15\n\t" + "addv h12, v12.8h\n\t" + "addv h13, v13.8h\n\t" + "mov x10, v12.d[0]\n\t" + "mov x11, v13.d[0]\n\t" + "and v10.16b, v7.16b, v2.16b\n\t" + "and v11.16b, v8.16b, v2.16b\n\t" + "addv h10, v10.8h\n\t" + "addv h11, v11.8h\n\t" + "mov w8, v10.s[0]\n\t" + "mov w9, v11.s[0]\n\t" + "lsl w8, w8, #4\n\t" + "lsl w9, w9, #4\n\t" + "ldr q10, [x7, x8]\n\t" + "ldr q11, [x7, x9]\n\t" + "tbl v7.16b, {v4.16b}, v10.16b\n\t" + "tbl v8.16b, {v5.16b}, v11.16b\n\t" + "str q7, [%x[p]]\n\t" + "add %x[p], %x[p], x10, lsl 1\n\t" + "add x12, x12, x10\n\t" + "str q8, [%x[p]]\n\t" + "add %x[p], %x[p], x11, lsl 1\n\t" + "add x12, x12, x11\n\t" + "subs %w[rLen], %w[rLen], #24\n\t" + "beq L_kyber_aarch64_rej_uniform_neon_done_%=\n\t" + "sub w10, %w[len], w12\n\t" + "subs x10, x10, #16\n\t" + "blt L_kyber_aarch64_rej_uniform_neon_loop_4_%=\n\t" + "b L_kyber_aarch64_rej_uniform_neon_loop_16_%=\n\t" + "\n" + "L_kyber_aarch64_rej_uniform_neon_loop_4_%=: \n\t" + "subs w10, %w[len], w12\n\t" + "beq L_kyber_aarch64_rej_uniform_neon_done_%=\n\t" + "subs x10, x10, #4\n\t" + "blt L_kyber_aarch64_rej_uniform_neon_loop_lt_4_%=\n\t" + "ldr x4, [%x[r]], #6\n\t" + "lsr x5, x4, #12\n\t" + "lsr x6, x4, #24\n\t" + "lsr x7, x4, #36\n\t" + "and x4, x4, #0xfff\n\t" + "and x5, x5, #0xfff\n\t" + "and x6, x6, #0xfff\n\t" + "and x7, x7, #0xfff\n\t" + "strh w4, [%x[p]]\n\t" + "subs xzr, x4, x13\n\t" + "cinc %x[p], %x[p], lt\n\t" + "cinc %x[p], %x[p], lt\n\t" + "cinc x12, x12, lt\n\t" + "strh w5, [%x[p]]\n\t" + "subs xzr, x5, x13\n\t" + "cinc %x[p], %x[p], lt\n\t" + "cinc %x[p], %x[p], lt\n\t" + "cinc x12, x12, lt\n\t" + "strh w6, [%x[p]]\n\t" + "subs xzr, x6, x13\n\t" + "cinc %x[p], %x[p], lt\n\t" + "cinc %x[p], %x[p], lt\n\t" + "cinc x12, x12, lt\n\t" + "strh w7, [%x[p]]\n\t" + "subs xzr, x7, x13\n\t" + "cinc %x[p], %x[p], lt\n\t" + "cinc %x[p], %x[p], lt\n\t" + "cinc x12, x12, lt\n\t" + "subs %w[rLen], %w[rLen], #6\n\t" + "beq L_kyber_aarch64_rej_uniform_neon_done_%=\n\t" + "b L_kyber_aarch64_rej_uniform_neon_loop_4_%=\n\t" + "\n" + "L_kyber_aarch64_rej_uniform_neon_loop_lt_4_%=: \n\t" + "ldr x4, [%x[r]], #6\n\t" + "lsr x5, x4, #12\n\t" + "lsr x6, x4, #24\n\t" + "lsr x7, x4, #36\n\t" + "and x4, x4, #0xfff\n\t" + "and x5, x5, #0xfff\n\t" + "and x6, x6, #0xfff\n\t" + "and x7, x7, #0xfff\n\t" + "strh w4, [%x[p]]\n\t" + "subs xzr, x4, x13\n\t" + "cinc %x[p], %x[p], 
lt\n\t" + "cinc %x[p], %x[p], lt\n\t" + "cinc x12, x12, lt\n\t" + "subs wzr, %w[len], w12\n\t" + "beq L_kyber_aarch64_rej_uniform_neon_done_%=\n\t" + "strh w5, [%x[p]]\n\t" + "subs xzr, x5, x13\n\t" + "cinc %x[p], %x[p], lt\n\t" + "cinc %x[p], %x[p], lt\n\t" + "cinc x12, x12, lt\n\t" + "subs wzr, %w[len], w12\n\t" + "beq L_kyber_aarch64_rej_uniform_neon_done_%=\n\t" + "strh w6, [%x[p]]\n\t" + "subs xzr, x6, x13\n\t" + "cinc %x[p], %x[p], lt\n\t" + "cinc %x[p], %x[p], lt\n\t" + "cinc x12, x12, lt\n\t" + "subs wzr, %w[len], w12\n\t" + "beq L_kyber_aarch64_rej_uniform_neon_done_%=\n\t" + "strh w7, [%x[p]]\n\t" + "subs xzr, x7, x13\n\t" + "cinc %x[p], %x[p], lt\n\t" + "cinc %x[p], %x[p], lt\n\t" + "cinc x12, x12, lt\n\t" + "subs wzr, %w[len], w12\n\t" + "beq L_kyber_aarch64_rej_uniform_neon_done_%=\n\t" + "subs %w[rLen], %w[rLen], #6\n\t" + "beq L_kyber_aarch64_rej_uniform_neon_done_%=\n\t" + "b L_kyber_aarch64_rej_uniform_neon_loop_lt_4_%=\n\t" + "\n" + "L_kyber_aarch64_rej_uniform_neon_done_%=: \n\t" + "mov x0, x12\n\t" + : [p] "+r" (p), [len] "+r" (len), [r] "+r" (r), [rLen] "+r" (rLen) + : [L_kyber_aarch64_q] "S" (L_kyber_aarch64_q), [L_kyber_aarch64_consts] "S" (L_kyber_aarch64_consts), [L_kyber_aarch64_zetas] "S" (L_kyber_aarch64_zetas), [L_kyber_aarch64_zetas_qinv] "S" (L_kyber_aarch64_zetas_qinv), [L_kyber_aarch64_zetas_inv] "S" (L_kyber_aarch64_zetas_inv), [L_kyber_aarch64_zetas_inv_qinv] "S" (L_kyber_aarch64_zetas_inv_qinv), [L_kyber_aarch64_zetas_mul] "S" (L_kyber_aarch64_zetas_mul), [L_kyber_aarch64_to_msg_neon_low] "S" (L_kyber_aarch64_to_msg_neon_low), [L_kyber_aarch64_to_msg_neon_high] "S" (L_kyber_aarch64_to_msg_neon_high), [L_kyber_aarch64_to_msg_neon_bits] "S" (L_kyber_aarch64_to_msg_neon_bits), [L_kyber_aarch64_from_msg_neon_q1half] "S" (L_kyber_aarch64_from_msg_neon_q1half), [L_kyber_aarch64_from_msg_neon_bits] "S" (L_kyber_aarch64_from_msg_neon_bits), [L_kyber_aarch64_rej_uniform_neon_mask] "S" (L_kyber_aarch64_rej_uniform_neon_mask), [L_kyber_aarch64_rej_uniform_neon_bits] "S" (L_kyber_aarch64_rej_uniform_neon_bits), [L_kyber_aarch64_rej_uniform_neon_indeces] "S" (L_kyber_aarch64_rej_uniform_neon_indeces) + : "memory", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "cc" + ); + return (uint32_t)(size_t)p; +} + +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 +static const uint64_t L_SHA3_transform_blocksx3_neon_r[] = { + 0x1UL, + 0x8082UL, + 0x800000000000808aUL, + 0x8000000080008000UL, + 0x808bUL, + 0x80000001UL, + 0x8000000080008081UL, + 0x8000000000008009UL, + 0x8aUL, + 0x88UL, + 0x80008009UL, + 0x8000000aUL, + 0x8000808bUL, + 0x800000000000008bUL, + 0x8000000000008089UL, + 0x8000000000008003UL, + 0x8000000000008002UL, + 0x8000000000000080UL, + 0x800aUL, + 0x800000008000000aUL, + 0x8000000080008081UL, + 0x8000000000008080UL, + 0x80000001UL, + 0x8000000080008008UL, +}; + +void kyber_sha3_blocksx3_neon(word64* state) +{ + __asm__ __volatile__ ( + "stp x29, x30, [sp, #-64]!\n\t" + "add x29, sp, #0\n\t" +#ifndef __APPLE__ + "adrp x27, %[L_SHA3_transform_blocksx3_neon_r]\n\t" + "add x27, x27, :lo12:%[L_SHA3_transform_blocksx3_neon_r]\n\t" +#else + "adrp x27, %[L_SHA3_transform_blocksx3_neon_r]@PAGE\n\t" + "add x27, x27, %[L_SHA3_transform_blocksx3_neon_r]@PAGEOFF\n\t" +#endif /* __APPLE__ */ + "str %x[state], [x29, #40]\n\t" + "ld4 {v0.d, v1.d, v2.d, v3.d}[0], [%x[state]], #32\n\t" + "ld4 {v4.d, v5.d, v6.d, v7.d}[0], [%x[state]], #32\n\t" + "ld4 {v8.d, v9.d, v10.d, v11.d}[0], 
[%x[state]], #32\n\t" + "ld4 {v12.d, v13.d, v14.d, v15.d}[0], [%x[state]], #32\n\t" + "ld4 {v16.d, v17.d, v18.d, v19.d}[0], [%x[state]], #32\n\t" + "ld4 {v20.d, v21.d, v22.d, v23.d}[0], [%x[state]], #32\n\t" + "ld1 {v24.d}[0], [%x[state]]\n\t" + "add %x[state], %x[state], #8\n\t" + "ld4 {v0.d, v1.d, v2.d, v3.d}[1], [%x[state]], #32\n\t" + "ld4 {v4.d, v5.d, v6.d, v7.d}[1], [%x[state]], #32\n\t" + "ld4 {v8.d, v9.d, v10.d, v11.d}[1], [%x[state]], #32\n\t" + "ld4 {v12.d, v13.d, v14.d, v15.d}[1], [%x[state]], #32\n\t" + "ld4 {v16.d, v17.d, v18.d, v19.d}[1], [%x[state]], #32\n\t" + "ld4 {v20.d, v21.d, v22.d, v23.d}[1], [%x[state]], #32\n\t" + "ld1 {v24.d}[1], [%x[state]]\n\t" + "add %x[state], %x[state], #8\n\t" + "ldp x1, x2, [%x[state]]\n\t" + "ldp x3, x4, [%x[state], #16]\n\t" + "ldp x5, x6, [%x[state], #32]\n\t" + "ldp x7, x8, [%x[state], #48]\n\t" + "ldp x9, x10, [%x[state], #64]\n\t" + "ldp x11, x12, [%x[state], #80]\n\t" + "ldp x13, x14, [%x[state], #96]\n\t" + "ldp x15, x16, [%x[state], #112]\n\t" + "ldp x17, x19, [%x[state], #128]\n\t" + "ldp x20, x21, [%x[state], #144]\n\t" + "ldp x22, x23, [%x[state], #160]\n\t" + "ldp x24, x25, [%x[state], #176]\n\t" + "ldr x26, [%x[state], #192]\n\t" + "mov x28, #24\n\t" + /* Start of 24 rounds */ + "\n" + "L_SHA3_transform_blocksx3_neon_begin_%=: \n\t" + "stp x27, x28, [x29, #48]\n\t" + /* Col Mix */ + "eor3 v31.16b, v0.16b, v5.16b, v10.16b\n\t" + "eor %x[state], x5, x10\n\t" + "eor3 v27.16b, v1.16b, v6.16b, v11.16b\n\t" + "eor x30, x1, x6\n\t" + "eor3 v28.16b, v2.16b, v7.16b, v12.16b\n\t" + "eor x28, x3, x8\n\t" + "eor3 v29.16b, v3.16b, v8.16b, v13.16b\n\t" + "eor %x[state], %x[state], x15\n\t" + "eor3 v30.16b, v4.16b, v9.16b, v14.16b\n\t" + "eor x30, x30, x11\n\t" + "eor3 v31.16b, v31.16b, v15.16b, v20.16b\n\t" + "eor x28, x28, x13\n\t" + "eor3 v27.16b, v27.16b, v16.16b, v21.16b\n\t" + "eor %x[state], %x[state], x21\n\t" + "eor3 v28.16b, v28.16b, v17.16b, v22.16b\n\t" + "eor x30, x30, x16\n\t" + "eor3 v29.16b, v29.16b, v18.16b, v23.16b\n\t" + "eor x28, x28, x19\n\t" + "eor3 v30.16b, v30.16b, v19.16b, v24.16b\n\t" + "eor %x[state], %x[state], x26\n\t" + "rax1 v25.2d, v30.2d, v27.2d\n\t" + "eor x30, x30, x22\n\t" + "rax1 v26.2d, v31.2d, v28.2d\n\t" + "eor x28, x28, x24\n\t" + "rax1 v27.2d, v27.2d, v29.2d\n\t" + "str %x[state], [x29, #32]\n\t" + "rax1 v28.2d, v28.2d, v30.2d\n\t" + "str x28, [x29, #24]\n\t" + "rax1 v29.2d, v29.2d, v31.2d\n\t" + "eor x27, x2, x7\n\t" + "eor v0.16b, v0.16b, v25.16b\n\t" + "xar v30.2d, v1.2d, v26.2d, #63\n\t" + "eor x28, x4, x9\n\t" + "xar v1.2d, v6.2d, v26.2d, #20\n\t" + "eor x27, x27, x12\n\t" + "xar v6.2d, v9.2d, v29.2d, #44\n\t" + "eor x28, x28, x14\n\t" + "xar v9.2d, v22.2d, v27.2d, #3\n\t" + "eor x27, x27, x17\n\t" + "xar v22.2d, v14.2d, v29.2d, #25\n\t" + "eor x28, x28, x20\n\t" + "xar v14.2d, v20.2d, v25.2d, #46\n\t" + "eor x27, x27, x23\n\t" + "xar v20.2d, v2.2d, v27.2d, #2\n\t" + "eor x28, x28, x25\n\t" + "xar v2.2d, v12.2d, v27.2d, #21\n\t" + "eor %x[state], %x[state], x27, ror 63\n\t" + "xar v12.2d, v13.2d, v28.2d, #39\n\t" + "eor x27, x27, x28, ror 63\n\t" + "xar v13.2d, v19.2d, v29.2d, #56\n\t" + "eor x1, x1, %x[state]\n\t" + "xar v19.2d, v23.2d, v28.2d, #8\n\t" + "eor x6, x6, %x[state]\n\t" + "xar v23.2d, v15.2d, v25.2d, #23\n\t" + "eor x11, x11, %x[state]\n\t" + "xar v15.2d, v4.2d, v29.2d, #37\n\t" + "eor x16, x16, %x[state]\n\t" + "xar v4.2d, v24.2d, v29.2d, #50\n\t" + "eor x22, x22, %x[state]\n\t" + "xar v24.2d, v21.2d, v26.2d, #62\n\t" + "eor x3, x3, x27\n\t" + "xar v21.2d, v8.2d, v28.2d, #9\n\t" + 
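/* Three Keccak-f[1600] states are processed per call: two in the 64-bit lanes of v0-v24 and a third in x1-x17/x19-x26; the interleaved scalar eor instructions apply theta to that third state. xar is XOR-then-rotate-right, so each immediate is 64 minus the corresponding rho left-rotation amount. */ +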
"eor x8, x8, x27\n\t" + "xar v8.2d, v16.2d, v26.2d, #19\n\t" + "eor x13, x13, x27\n\t" + "xar v16.2d, v5.2d, v25.2d, #28\n\t" + "eor x19, x19, x27\n\t" + "xar v5.2d, v3.2d, v28.2d, #36\n\t" + "eor x24, x24, x27\n\t" + "xar v3.2d, v18.2d, v28.2d, #43\n\t" + "ldr %x[state], [x29, #32]\n\t" + "xar v18.2d, v17.2d, v27.2d, #49\n\t" + "ldr x27, [x29, #24]\n\t" + "xar v17.2d, v11.2d, v26.2d, #54\n\t" + "eor x28, x28, x30, ror 63\n\t" + "xar v11.2d, v7.2d, v27.2d, #58\n\t" + "eor x30, x30, x27, ror 63\n\t" + "xar v7.2d, v10.2d, v25.2d, #61\n\t" + "eor x27, x27, %x[state], ror 63\n\t" + /* Row Mix */ + "mov v25.16b, v0.16b\n\t" + "eor x5, x5, x28\n\t" + "mov v26.16b, v1.16b\n\t" + "eor x10, x10, x28\n\t" + "bcax v0.16b, v25.16b, v2.16b, v26.16b\n\t" + "eor x15, x15, x28\n\t" + "bcax v1.16b, v26.16b, v3.16b, v2.16b\n\t" + "eor x21, x21, x28\n\t" + "bcax v2.16b, v2.16b, v4.16b, v3.16b\n\t" + "eor x26, x26, x28\n\t" + "bcax v3.16b, v3.16b, v25.16b, v4.16b\n\t" + "eor x2, x2, x30\n\t" + "bcax v4.16b, v4.16b, v26.16b, v25.16b\n\t" + "eor x7, x7, x30\n\t" + "mov v25.16b, v5.16b\n\t" + "eor x12, x12, x30\n\t" + "mov v26.16b, v6.16b\n\t" + "eor x17, x17, x30\n\t" + "bcax v5.16b, v25.16b, v7.16b, v26.16b\n\t" + "eor x23, x23, x30\n\t" + "bcax v6.16b, v26.16b, v8.16b, v7.16b\n\t" + "eor x4, x4, x27\n\t" + "bcax v7.16b, v7.16b, v9.16b, v8.16b\n\t" + "eor x9, x9, x27\n\t" + "bcax v8.16b, v8.16b, v25.16b, v9.16b\n\t" + "eor x14, x14, x27\n\t" + "bcax v9.16b, v9.16b, v26.16b, v25.16b\n\t" + "eor x20, x20, x27\n\t" + "mov v26.16b, v11.16b\n\t" + "eor x25, x25, x27\n\t" + /* Swap Rotate Base */ + "bcax v10.16b, v30.16b, v12.16b, v26.16b\n\t" + "ror %x[state], x2, #63\n\t" + "bcax v11.16b, v26.16b, v13.16b, v12.16b\n\t" + "ror x2, x7, #20\n\t" + "bcax v12.16b, v12.16b, v14.16b, v13.16b\n\t" + "ror x7, x10, #44\n\t" + "bcax v13.16b, v13.16b, v30.16b, v14.16b\n\t" + "ror x10, x24, #3\n\t" + "bcax v14.16b, v14.16b, v26.16b, v30.16b\n\t" + "ror x24, x15, #25\n\t" + "mov v25.16b, v15.16b\n\t" + "ror x15, x22, #46\n\t" + "mov v26.16b, v16.16b\n\t" + "ror x22, x3, #2\n\t" + "bcax v15.16b, v25.16b, v17.16b, v26.16b\n\t" + "ror x3, x13, #21\n\t" + "bcax v16.16b, v26.16b, v18.16b, v17.16b\n\t" + "ror x13, x14, #39\n\t" + "bcax v17.16b, v17.16b, v19.16b, v18.16b\n\t" + "ror x14, x21, #56\n\t" + "bcax v18.16b, v18.16b, v25.16b, v19.16b\n\t" + "ror x21, x25, #8\n\t" + "bcax v19.16b, v19.16b, v26.16b, v25.16b\n\t" + "ror x25, x16, #23\n\t" + "mov v25.16b, v20.16b\n\t" + "ror x16, x5, #37\n\t" + "mov v26.16b, v21.16b\n\t" + "ror x5, x26, #50\n\t" + "bcax v20.16b, v25.16b, v22.16b, v26.16b\n\t" + "ror x26, x23, #62\n\t" + "bcax v21.16b, v26.16b, v23.16b, v22.16b\n\t" + "ror x23, x9, #9\n\t" + "bcax v22.16b, v22.16b, v24.16b, v23.16b\n\t" + "ror x9, x17, #19\n\t" + "bcax v23.16b, v23.16b, v25.16b, v24.16b\n\t" + "ror x17, x6, #28\n\t" + "bcax v24.16b, v24.16b, v26.16b, v25.16b\n\t" + "ror x6, x4, #36\n\t" + "ror x4, x20, #43\n\t" + "ror x20, x19, #49\n\t" + "ror x19, x12, #54\n\t" + "ror x12, x8, #58\n\t" + "ror x8, x11, #61\n\t" + /* Row Mix Base */ + "bic x11, x3, x2\n\t" + "bic x27, x4, x3\n\t" + "bic x28, x1, x5\n\t" + "bic x30, x2, x1\n\t" + "eor x1, x1, x11\n\t" + "eor x2, x2, x27\n\t" + "bic x11, x5, x4\n\t" + "eor x4, x4, x28\n\t" + "eor x3, x3, x11\n\t" + "eor x5, x5, x30\n\t" + "bic x11, x8, x7\n\t" + "bic x27, x9, x8\n\t" + "bic x28, x6, x10\n\t" + "bic x30, x7, x6\n\t" + "eor x6, x6, x11\n\t" + "eor x7, x7, x27\n\t" + "bic x11, x10, x9\n\t" + "eor x9, x9, x28\n\t" + "eor x8, x8, x11\n\t" + "eor x10, x10, x30\n\t" + "bic 
x11, x13, x12\n\t" + "bic x27, x14, x13\n\t" + "bic x28, %x[state], x15\n\t" + "bic x30, x12, %x[state]\n\t" + "eor x11, %x[state], x11\n\t" + "eor x12, x12, x27\n\t" + "bic %x[state], x15, x14\n\t" + "eor x14, x14, x28\n\t" + "eor x13, x13, %x[state]\n\t" + "eor x15, x15, x30\n\t" + "bic %x[state], x19, x17\n\t" + "bic x27, x20, x19\n\t" + "bic x28, x16, x21\n\t" + "bic x30, x17, x16\n\t" + "eor x16, x16, %x[state]\n\t" + "eor x17, x17, x27\n\t" + "bic %x[state], x21, x20\n\t" + "eor x20, x20, x28\n\t" + "eor x19, x19, %x[state]\n\t" + "eor x21, x21, x30\n\t" + "bic %x[state], x24, x23\n\t" + "bic x27, x25, x24\n\t" + "bic x28, x22, x26\n\t" + "bic x30, x23, x22\n\t" + "eor x22, x22, %x[state]\n\t" + "eor x23, x23, x27\n\t" + "bic %x[state], x26, x25\n\t" + "eor x25, x25, x28\n\t" + "eor x24, x24, %x[state]\n\t" + "eor x26, x26, x30\n\t" + /* Done tranforming */ + "ldp x27, x28, [x29, #48]\n\t" + "ldr %x[state], [x27], #8\n\t" + "subs x28, x28, #1\n\t" + "mov v30.d[0], %x[state]\n\t" + "mov v30.d[1], %x[state]\n\t" + "eor x1, x1, %x[state]\n\t" + "eor v0.16b, v0.16b, v30.16b\n\t" + "bne L_SHA3_transform_blocksx3_neon_begin_%=\n\t" + "ldr %x[state], [x29, #40]\n\t" + "st4 {v0.d, v1.d, v2.d, v3.d}[0], [%x[state]], #32\n\t" + "st4 {v4.d, v5.d, v6.d, v7.d}[0], [%x[state]], #32\n\t" + "st4 {v8.d, v9.d, v10.d, v11.d}[0], [%x[state]], #32\n\t" + "st4 {v12.d, v13.d, v14.d, v15.d}[0], [%x[state]], #32\n\t" + "st4 {v16.d, v17.d, v18.d, v19.d}[0], [%x[state]], #32\n\t" + "st4 {v20.d, v21.d, v22.d, v23.d}[0], [%x[state]], #32\n\t" + "st1 {v24.d}[0], [%x[state]]\n\t" + "add %x[state], %x[state], #8\n\t" + "st4 {v0.d, v1.d, v2.d, v3.d}[1], [%x[state]], #32\n\t" + "st4 {v4.d, v5.d, v6.d, v7.d}[1], [%x[state]], #32\n\t" + "st4 {v8.d, v9.d, v10.d, v11.d}[1], [%x[state]], #32\n\t" + "st4 {v12.d, v13.d, v14.d, v15.d}[1], [%x[state]], #32\n\t" + "st4 {v16.d, v17.d, v18.d, v19.d}[1], [%x[state]], #32\n\t" + "st4 {v20.d, v21.d, v22.d, v23.d}[1], [%x[state]], #32\n\t" + "st1 {v24.d}[1], [%x[state]]\n\t" + "add %x[state], %x[state], #8\n\t" + "stp x1, x2, [%x[state]]\n\t" + "stp x3, x4, [%x[state], #16]\n\t" + "stp x5, x6, [%x[state], #32]\n\t" + "stp x7, x8, [%x[state], #48]\n\t" + "stp x9, x10, [%x[state], #64]\n\t" + "stp x11, x12, [%x[state], #80]\n\t" + "stp x13, x14, [%x[state], #96]\n\t" + "stp x15, x16, [%x[state], #112]\n\t" + "stp x17, x19, [%x[state], #128]\n\t" + "stp x20, x21, [%x[state], #144]\n\t" + "stp x22, x23, [%x[state], #160]\n\t" + "stp x24, x25, [%x[state], #176]\n\t" + "str x26, [%x[state], #192]\n\t" + "ldp x29, x30, [sp], #0x40\n\t" + : [state] "+r" (state) + : [L_SHA3_transform_blocksx3_neon_r] "S" (L_SHA3_transform_blocksx3_neon_r) + : "memory", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc" + ); +} + +static const uint64_t L_SHA3_shake128_blocksx3_seed_neon_r[] = { + 0x1UL, + 0x8082UL, + 0x800000000000808aUL, + 0x8000000080008000UL, + 0x808bUL, + 0x80000001UL, + 0x8000000080008081UL, + 0x8000000000008009UL, + 0x8aUL, + 0x88UL, + 0x80008009UL, + 0x8000000aUL, + 0x8000808bUL, + 0x800000000000008bUL, + 0x8000000000008089UL, + 0x8000000000008003UL, + 0x8000000000008002UL, + 0x8000000000000080UL, + 0x800aUL, + 0x800000008000000aUL, + 
0x8000000080008081UL, + 0x8000000000008080UL, + 0x80000001UL, + 0x8000000080008008UL, +}; + +void kyber_shake128_blocksx3_seed_neon(word64* state, byte* seed) +{ + __asm__ __volatile__ ( + "stp x29, x30, [sp, #-64]!\n\t" + "add x29, sp, #0\n\t" +#ifndef __APPLE__ + "adrp x28, %[L_SHA3_shake128_blocksx3_seed_neon_r]\n\t" + "add x28, x28, :lo12:%[L_SHA3_shake128_blocksx3_seed_neon_r]\n\t" +#else + "adrp x28, %[L_SHA3_shake128_blocksx3_seed_neon_r]@PAGE\n\t" + "add x28, x28, %[L_SHA3_shake128_blocksx3_seed_neon_r]@PAGEOFF\n\t" +#endif /* __APPLE__ */ + "str %x[state], [x29, #40]\n\t" + "add %x[state], %x[state], #32\n\t" + "ld1 {v4.d}[0], [%x[state]]\n\t" + "ldp x2, x3, [%x[seed]], #16\n\t" + "add %x[state], %x[state], #0xc8\n\t" + "ld1 {v4.d}[1], [%x[state]]\n\t" + "ldp x4, x5, [%x[seed]], #16\n\t" + "ldr x6, [%x[state], #200]\n\t" + "eor v5.16b, v5.16b, v5.16b\n\t" + "eor x7, x7, x7\n\t" + "eor v6.16b, v6.16b, v6.16b\n\t" + "eor x8, x8, x8\n\t" + "eor v7.16b, v7.16b, v7.16b\n\t" + "eor x9, x9, x9\n\t" + "eor v8.16b, v8.16b, v8.16b\n\t" + "eor x10, x10, x10\n\t" + "eor v9.16b, v9.16b, v9.16b\n\t" + "eor x11, x11, x11\n\t" + "eor v10.16b, v10.16b, v10.16b\n\t" + "eor x12, x12, x12\n\t" + "eor v11.16b, v11.16b, v11.16b\n\t" + "eor x13, x13, x13\n\t" + "eor v12.16b, v12.16b, v12.16b\n\t" + "eor x14, x14, x14\n\t" + "eor v13.16b, v13.16b, v13.16b\n\t" + "eor x15, x15, x15\n\t" + "eor v14.16b, v14.16b, v14.16b\n\t" + "eor x16, x16, x16\n\t" + "eor v15.16b, v15.16b, v15.16b\n\t" + "eor x17, x17, x17\n\t" + "eor v16.16b, v16.16b, v16.16b\n\t" + "eor x19, x19, x19\n\t" + "eor v17.16b, v17.16b, v17.16b\n\t" + "eor x20, x20, x20\n\t" + "eor v18.16b, v18.16b, v18.16b\n\t" + "eor x21, x21, x21\n\t" + "eor v19.16b, v19.16b, v19.16b\n\t" + "eor x22, x22, x22\n\t" + "movz x23, #0x8000, lsl 48\n\t" + "eor v21.16b, v21.16b, v21.16b\n\t" + "eor x24, x24, x24\n\t" + "eor v22.16b, v22.16b, v22.16b\n\t" + "eor x25, x25, x25\n\t" + "eor v23.16b, v23.16b, v23.16b\n\t" + "eor x26, x26, x26\n\t" + "eor v24.16b, v24.16b, v24.16b\n\t" + "eor x27, x27, x27\n\t" + "dup v0.2d, x2\n\t" + "dup v1.2d, x3\n\t" + "dup v2.2d, x4\n\t" + "dup v3.2d, x5\n\t" + "dup v20.2d, x23\n\t" + "mov %x[seed], #24\n\t" + /* Start of 24 rounds */ + "\n" + "L_SHA3_shake128_blocksx3_seed_neon_begin_%=: \n\t" + "stp x28, %x[seed], [x29, #48]\n\t" + /* Col Mix */ + "eor3 v31.16b, v0.16b, v5.16b, v10.16b\n\t" + "eor %x[state], x6, x11\n\t" + "eor3 v27.16b, v1.16b, v6.16b, v11.16b\n\t" + "eor x30, x2, x7\n\t" + "eor3 v28.16b, v2.16b, v7.16b, v12.16b\n\t" + "eor x28, x4, x9\n\t" + "eor3 v29.16b, v3.16b, v8.16b, v13.16b\n\t" + "eor %x[state], %x[state], x16\n\t" + "eor3 v30.16b, v4.16b, v9.16b, v14.16b\n\t" + "eor x30, x30, x12\n\t" + "eor3 v31.16b, v31.16b, v15.16b, v20.16b\n\t" + "eor x28, x28, x14\n\t" + "eor3 v27.16b, v27.16b, v16.16b, v21.16b\n\t" + "eor %x[state], %x[state], x22\n\t" + "eor3 v28.16b, v28.16b, v17.16b, v22.16b\n\t" + "eor x30, x30, x17\n\t" + "eor3 v29.16b, v29.16b, v18.16b, v23.16b\n\t" + "eor x28, x28, x20\n\t" + "eor3 v30.16b, v30.16b, v19.16b, v24.16b\n\t" + "eor %x[state], %x[state], x27\n\t" + "rax1 v25.2d, v30.2d, v27.2d\n\t" + "eor x30, x30, x23\n\t" + "rax1 v26.2d, v31.2d, v28.2d\n\t" + "eor x28, x28, x25\n\t" + "rax1 v27.2d, v27.2d, v29.2d\n\t" + "str %x[state], [x29, #32]\n\t" + "rax1 v28.2d, v28.2d, v30.2d\n\t" + "str x28, [x29, #24]\n\t" + "rax1 v29.2d, v29.2d, v31.2d\n\t" + "eor %x[seed], x3, x8\n\t" + "eor v0.16b, v0.16b, v25.16b\n\t" + "xar v30.2d, v1.2d, v26.2d, #63\n\t" + "eor x28, x5, x10\n\t" + "xar v1.2d, 
v6.2d, v26.2d, #20\n\t" + "eor %x[seed], %x[seed], x13\n\t" + "xar v6.2d, v9.2d, v29.2d, #44\n\t" + "eor x28, x28, x15\n\t" + "xar v9.2d, v22.2d, v27.2d, #3\n\t" + "eor %x[seed], %x[seed], x19\n\t" + "xar v22.2d, v14.2d, v29.2d, #25\n\t" + "eor x28, x28, x21\n\t" + "xar v14.2d, v20.2d, v25.2d, #46\n\t" + "eor %x[seed], %x[seed], x24\n\t" + "xar v20.2d, v2.2d, v27.2d, #2\n\t" + "eor x28, x28, x26\n\t" + "xar v2.2d, v12.2d, v27.2d, #21\n\t" + "eor %x[state], %x[state], %x[seed], ror 63\n\t" + "xar v12.2d, v13.2d, v28.2d, #39\n\t" + "eor %x[seed], %x[seed], x28, ror 63\n\t" + "xar v13.2d, v19.2d, v29.2d, #56\n\t" + "eor x2, x2, %x[state]\n\t" + "xar v19.2d, v23.2d, v28.2d, #8\n\t" + "eor x7, x7, %x[state]\n\t" + "xar v23.2d, v15.2d, v25.2d, #23\n\t" + "eor x12, x12, %x[state]\n\t" + "xar v15.2d, v4.2d, v29.2d, #37\n\t" + "eor x17, x17, %x[state]\n\t" + "xar v4.2d, v24.2d, v29.2d, #50\n\t" + "eor x23, x23, %x[state]\n\t" + "xar v24.2d, v21.2d, v26.2d, #62\n\t" + "eor x4, x4, %x[seed]\n\t" + "xar v21.2d, v8.2d, v28.2d, #9\n\t" + "eor x9, x9, %x[seed]\n\t" + "xar v8.2d, v16.2d, v26.2d, #19\n\t" + "eor x14, x14, %x[seed]\n\t" + "xar v16.2d, v5.2d, v25.2d, #28\n\t" + "eor x20, x20, %x[seed]\n\t" + "xar v5.2d, v3.2d, v28.2d, #36\n\t" + "eor x25, x25, %x[seed]\n\t" + "xar v3.2d, v18.2d, v28.2d, #43\n\t" + "ldr %x[state], [x29, #32]\n\t" + "xar v18.2d, v17.2d, v27.2d, #49\n\t" + "ldr %x[seed], [x29, #24]\n\t" + "xar v17.2d, v11.2d, v26.2d, #54\n\t" + "eor x28, x28, x30, ror 63\n\t" + "xar v11.2d, v7.2d, v27.2d, #58\n\t" + "eor x30, x30, %x[seed], ror 63\n\t" + "xar v7.2d, v10.2d, v25.2d, #61\n\t" + "eor %x[seed], %x[seed], %x[state], ror 63\n\t" + /* Row Mix */ + "mov v25.16b, v0.16b\n\t" + "eor x6, x6, x28\n\t" + "mov v26.16b, v1.16b\n\t" + "eor x11, x11, x28\n\t" + "bcax v0.16b, v25.16b, v2.16b, v26.16b\n\t" + "eor x16, x16, x28\n\t" + "bcax v1.16b, v26.16b, v3.16b, v2.16b\n\t" + "eor x22, x22, x28\n\t" + "bcax v2.16b, v2.16b, v4.16b, v3.16b\n\t" + "eor x27, x27, x28\n\t" + "bcax v3.16b, v3.16b, v25.16b, v4.16b\n\t" + "eor x3, x3, x30\n\t" + "bcax v4.16b, v4.16b, v26.16b, v25.16b\n\t" + "eor x8, x8, x30\n\t" + "mov v25.16b, v5.16b\n\t" + "eor x13, x13, x30\n\t" + "mov v26.16b, v6.16b\n\t" + "eor x19, x19, x30\n\t" + "bcax v5.16b, v25.16b, v7.16b, v26.16b\n\t" + "eor x24, x24, x30\n\t" + "bcax v6.16b, v26.16b, v8.16b, v7.16b\n\t" + "eor x5, x5, %x[seed]\n\t" + "bcax v7.16b, v7.16b, v9.16b, v8.16b\n\t" + "eor x10, x10, %x[seed]\n\t" + "bcax v8.16b, v8.16b, v25.16b, v9.16b\n\t" + "eor x15, x15, %x[seed]\n\t" + "bcax v9.16b, v9.16b, v26.16b, v25.16b\n\t" + "eor x21, x21, %x[seed]\n\t" + "mov v26.16b, v11.16b\n\t" + "eor x26, x26, %x[seed]\n\t" + /* Swap Rotate Base */ + "bcax v10.16b, v30.16b, v12.16b, v26.16b\n\t" + "ror %x[state], x3, #63\n\t" + "bcax v11.16b, v26.16b, v13.16b, v12.16b\n\t" + "ror x3, x8, #20\n\t" + "bcax v12.16b, v12.16b, v14.16b, v13.16b\n\t" + "ror x8, x11, #44\n\t" + "bcax v13.16b, v13.16b, v30.16b, v14.16b\n\t" + "ror x11, x25, #3\n\t" + "bcax v14.16b, v14.16b, v26.16b, v30.16b\n\t" + "ror x25, x16, #25\n\t" + "mov v25.16b, v15.16b\n\t" + "ror x16, x23, #46\n\t" + "mov v26.16b, v16.16b\n\t" + "ror x23, x4, #2\n\t" + "bcax v15.16b, v25.16b, v17.16b, v26.16b\n\t" + "ror x4, x14, #21\n\t" + "bcax v16.16b, v26.16b, v18.16b, v17.16b\n\t" + "ror x14, x15, #39\n\t" + "bcax v17.16b, v17.16b, v19.16b, v18.16b\n\t" + "ror x15, x22, #56\n\t" + "bcax v18.16b, v18.16b, v25.16b, v19.16b\n\t" + "ror x22, x26, #8\n\t" + "bcax v19.16b, v19.16b, v26.16b, v25.16b\n\t" + "ror x26, x17, #23\n\t" + 
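/* Same three-way interleaving as kyber_sha3_blocksx3_neon: bcax (bit-clear and XOR) performs the chi step on the two NEON-resident states, while the scalar ror instructions apply the rho rotations to the state held in general-purpose registers. */ +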
"mov v25.16b, v20.16b\n\t" + "ror x17, x6, #37\n\t" + "mov v26.16b, v21.16b\n\t" + "ror x6, x27, #50\n\t" + "bcax v20.16b, v25.16b, v22.16b, v26.16b\n\t" + "ror x27, x24, #62\n\t" + "bcax v21.16b, v26.16b, v23.16b, v22.16b\n\t" + "ror x24, x10, #9\n\t" + "bcax v22.16b, v22.16b, v24.16b, v23.16b\n\t" + "ror x10, x19, #19\n\t" + "bcax v23.16b, v23.16b, v25.16b, v24.16b\n\t" + "ror x19, x7, #28\n\t" + "bcax v24.16b, v24.16b, v26.16b, v25.16b\n\t" + "ror x7, x5, #36\n\t" + "ror x5, x21, #43\n\t" + "ror x21, x20, #49\n\t" + "ror x20, x13, #54\n\t" + "ror x13, x9, #58\n\t" + "ror x9, x12, #61\n\t" + /* Row Mix Base */ + "bic x12, x4, x3\n\t" + "bic %x[seed], x5, x4\n\t" + "bic x28, x2, x6\n\t" + "bic x30, x3, x2\n\t" + "eor x2, x2, x12\n\t" + "eor x3, x3, %x[seed]\n\t" + "bic x12, x6, x5\n\t" + "eor x5, x5, x28\n\t" + "eor x4, x4, x12\n\t" + "eor x6, x6, x30\n\t" + "bic x12, x9, x8\n\t" + "bic %x[seed], x10, x9\n\t" + "bic x28, x7, x11\n\t" + "bic x30, x8, x7\n\t" + "eor x7, x7, x12\n\t" + "eor x8, x8, %x[seed]\n\t" + "bic x12, x11, x10\n\t" + "eor x10, x10, x28\n\t" + "eor x9, x9, x12\n\t" + "eor x11, x11, x30\n\t" + "bic x12, x14, x13\n\t" + "bic %x[seed], x15, x14\n\t" + "bic x28, %x[state], x16\n\t" + "bic x30, x13, %x[state]\n\t" + "eor x12, %x[state], x12\n\t" + "eor x13, x13, %x[seed]\n\t" + "bic %x[state], x16, x15\n\t" + "eor x15, x15, x28\n\t" + "eor x14, x14, %x[state]\n\t" + "eor x16, x16, x30\n\t" + "bic %x[state], x20, x19\n\t" + "bic %x[seed], x21, x20\n\t" + "bic x28, x17, x22\n\t" + "bic x30, x19, x17\n\t" + "eor x17, x17, %x[state]\n\t" + "eor x19, x19, %x[seed]\n\t" + "bic %x[state], x22, x21\n\t" + "eor x21, x21, x28\n\t" + "eor x20, x20, %x[state]\n\t" + "eor x22, x22, x30\n\t" + "bic %x[state], x25, x24\n\t" + "bic %x[seed], x26, x25\n\t" + "bic x28, x23, x27\n\t" + "bic x30, x24, x23\n\t" + "eor x23, x23, %x[state]\n\t" + "eor x24, x24, %x[seed]\n\t" + "bic %x[state], x27, x26\n\t" + "eor x26, x26, x28\n\t" + "eor x25, x25, %x[state]\n\t" + "eor x27, x27, x30\n\t" + /* Done tranforming */ + "ldp x28, %x[seed], [x29, #48]\n\t" + "ldr %x[state], [x28], #8\n\t" + "subs %x[seed], %x[seed], #1\n\t" + "mov v30.d[0], %x[state]\n\t" + "mov v30.d[1], %x[state]\n\t" + "eor x2, x2, %x[state]\n\t" + "eor v0.16b, v0.16b, v30.16b\n\t" + "bne L_SHA3_shake128_blocksx3_seed_neon_begin_%=\n\t" + "ldr %x[state], [x29, #40]\n\t" + "st4 {v0.d, v1.d, v2.d, v3.d}[0], [%x[state]], #32\n\t" + "st4 {v4.d, v5.d, v6.d, v7.d}[0], [%x[state]], #32\n\t" + "st4 {v8.d, v9.d, v10.d, v11.d}[0], [%x[state]], #32\n\t" + "st4 {v12.d, v13.d, v14.d, v15.d}[0], [%x[state]], #32\n\t" + "st4 {v16.d, v17.d, v18.d, v19.d}[0], [%x[state]], #32\n\t" + "st4 {v20.d, v21.d, v22.d, v23.d}[0], [%x[state]], #32\n\t" + "st1 {v24.d}[0], [%x[state]]\n\t" + "add %x[state], %x[state], #8\n\t" + "st4 {v0.d, v1.d, v2.d, v3.d}[1], [%x[state]], #32\n\t" + "st4 {v4.d, v5.d, v6.d, v7.d}[1], [%x[state]], #32\n\t" + "st4 {v8.d, v9.d, v10.d, v11.d}[1], [%x[state]], #32\n\t" + "st4 {v12.d, v13.d, v14.d, v15.d}[1], [%x[state]], #32\n\t" + "st4 {v16.d, v17.d, v18.d, v19.d}[1], [%x[state]], #32\n\t" + "st4 {v20.d, v21.d, v22.d, v23.d}[1], [%x[state]], #32\n\t" + "st1 {v24.d}[1], [%x[state]]\n\t" + "add %x[state], %x[state], #8\n\t" + "stp x2, x3, [%x[state]]\n\t" + "stp x4, x5, [%x[state], #16]\n\t" + "stp x6, x7, [%x[state], #32]\n\t" + "stp x8, x9, [%x[state], #48]\n\t" + "stp x10, x11, [%x[state], #64]\n\t" + "stp x12, x13, [%x[state], #80]\n\t" + "stp x14, x15, [%x[state], #96]\n\t" + "stp x16, x17, [%x[state], #112]\n\t" + "stp x19, x20, 
[%x[state], #128]\n\t" + "stp x21, x22, [%x[state], #144]\n\t" + "stp x23, x24, [%x[state], #160]\n\t" + "stp x25, x26, [%x[state], #176]\n\t" + "str x27, [%x[state], #192]\n\t" + "ldp x29, x30, [sp], #0x40\n\t" + : [state] "+r" (state), [seed] "+r" (seed) + : [L_SHA3_shake128_blocksx3_seed_neon_r] "S" (L_SHA3_shake128_blocksx3_seed_neon_r) + : "memory", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc" + ); +} + +static const uint64_t L_SHA3_shake256_blocksx3_seed_neon_r[] = { + 0x1UL, + 0x8082UL, + 0x800000000000808aUL, + 0x8000000080008000UL, + 0x808bUL, + 0x80000001UL, + 0x8000000080008081UL, + 0x8000000000008009UL, + 0x8aUL, + 0x88UL, + 0x80008009UL, + 0x8000000aUL, + 0x8000808bUL, + 0x800000000000008bUL, + 0x8000000000008089UL, + 0x8000000000008003UL, + 0x8000000000008002UL, + 0x8000000000000080UL, + 0x800aUL, + 0x800000008000000aUL, + 0x8000000080008081UL, + 0x8000000000008080UL, + 0x80000001UL, + 0x8000000080008008UL, +}; + +void kyber_shake256_blocksx3_seed_neon(word64* state, byte* seed) +{ + __asm__ __volatile__ ( + "stp x29, x30, [sp, #-64]!\n\t" + "add x29, sp, #0\n\t" +#ifndef __APPLE__ + "adrp x28, %[L_SHA3_shake256_blocksx3_seed_neon_r]\n\t" + "add x28, x28, :lo12:%[L_SHA3_shake256_blocksx3_seed_neon_r]\n\t" +#else + "adrp x28, %[L_SHA3_shake256_blocksx3_seed_neon_r]@PAGE\n\t" + "add x28, x28, %[L_SHA3_shake256_blocksx3_seed_neon_r]@PAGEOFF\n\t" +#endif /* __APPLE__ */ + "str %x[state], [x29, #40]\n\t" + "add %x[state], %x[state], #32\n\t" + "ld1 {v4.d}[0], [%x[state]]\n\t" + "ldp x2, x3, [%x[seed]], #16\n\t" + "add %x[state], %x[state], #0xc8\n\t" + "ld1 {v4.d}[1], [%x[state]]\n\t" + "ldp x4, x5, [%x[seed]], #16\n\t" + "ldr x6, [%x[state], #200]\n\t" + "eor v5.16b, v5.16b, v5.16b\n\t" + "eor x7, x7, x7\n\t" + "eor v6.16b, v6.16b, v6.16b\n\t" + "eor x8, x8, x8\n\t" + "eor v7.16b, v7.16b, v7.16b\n\t" + "eor x9, x9, x9\n\t" + "eor v8.16b, v8.16b, v8.16b\n\t" + "eor x10, x10, x10\n\t" + "eor v9.16b, v9.16b, v9.16b\n\t" + "eor x11, x11, x11\n\t" + "eor v10.16b, v10.16b, v10.16b\n\t" + "eor x12, x12, x12\n\t" + "eor v11.16b, v11.16b, v11.16b\n\t" + "eor x13, x13, x13\n\t" + "eor v12.16b, v12.16b, v12.16b\n\t" + "eor x14, x14, x14\n\t" + "eor v13.16b, v13.16b, v13.16b\n\t" + "eor x15, x15, x15\n\t" + "eor v14.16b, v14.16b, v14.16b\n\t" + "eor x16, x16, x16\n\t" + "eor v15.16b, v15.16b, v15.16b\n\t" + "eor x17, x17, x17\n\t" + "movz x19, #0x8000, lsl 48\n\t" + "eor v17.16b, v17.16b, v17.16b\n\t" + "eor x20, x20, x20\n\t" + "eor v18.16b, v18.16b, v18.16b\n\t" + "eor x21, x21, x21\n\t" + "eor v19.16b, v19.16b, v19.16b\n\t" + "eor x22, x22, x22\n\t" + "eor v20.16b, v20.16b, v20.16b\n\t" + "eor x23, x23, x23\n\t" + "eor v21.16b, v21.16b, v21.16b\n\t" + "eor x24, x24, x24\n\t" + "eor v22.16b, v22.16b, v22.16b\n\t" + "eor x25, x25, x25\n\t" + "eor v23.16b, v23.16b, v23.16b\n\t" + "eor x26, x26, x26\n\t" + "eor v24.16b, v24.16b, v24.16b\n\t" + "eor x27, x27, x27\n\t" + "dup v0.2d, x2\n\t" + "dup v1.2d, x3\n\t" + "dup v2.2d, x4\n\t" + "dup v3.2d, x5\n\t" + "dup v16.2d, x19\n\t" + "mov %x[seed], #24\n\t" + /* Start of 24 rounds */ + "\n" + "L_SHA3_shake256_blocksx3_seed_neon_begin_%=: \n\t" + "stp x28, %x[seed], [x29, #48]\n\t" + /* Col Mix */ + "eor3 
v31.16b, v0.16b, v5.16b, v10.16b\n\t" + "eor %x[state], x6, x11\n\t" + "eor3 v27.16b, v1.16b, v6.16b, v11.16b\n\t" + "eor x30, x2, x7\n\t" + "eor3 v28.16b, v2.16b, v7.16b, v12.16b\n\t" + "eor x28, x4, x9\n\t" + "eor3 v29.16b, v3.16b, v8.16b, v13.16b\n\t" + "eor %x[state], %x[state], x16\n\t" + "eor3 v30.16b, v4.16b, v9.16b, v14.16b\n\t" + "eor x30, x30, x12\n\t" + "eor3 v31.16b, v31.16b, v15.16b, v20.16b\n\t" + "eor x28, x28, x14\n\t" + "eor3 v27.16b, v27.16b, v16.16b, v21.16b\n\t" + "eor %x[state], %x[state], x22\n\t" + "eor3 v28.16b, v28.16b, v17.16b, v22.16b\n\t" + "eor x30, x30, x17\n\t" + "eor3 v29.16b, v29.16b, v18.16b, v23.16b\n\t" + "eor x28, x28, x20\n\t" + "eor3 v30.16b, v30.16b, v19.16b, v24.16b\n\t" + "eor %x[state], %x[state], x27\n\t" + "rax1 v25.2d, v30.2d, v27.2d\n\t" + "eor x30, x30, x23\n\t" + "rax1 v26.2d, v31.2d, v28.2d\n\t" + "eor x28, x28, x25\n\t" + "rax1 v27.2d, v27.2d, v29.2d\n\t" + "str %x[state], [x29, #32]\n\t" + "rax1 v28.2d, v28.2d, v30.2d\n\t" + "str x28, [x29, #24]\n\t" + "rax1 v29.2d, v29.2d, v31.2d\n\t" + "eor %x[seed], x3, x8\n\t" + "eor v0.16b, v0.16b, v25.16b\n\t" + "xar v30.2d, v1.2d, v26.2d, #63\n\t" + "eor x28, x5, x10\n\t" + "xar v1.2d, v6.2d, v26.2d, #20\n\t" + "eor %x[seed], %x[seed], x13\n\t" + "xar v6.2d, v9.2d, v29.2d, #44\n\t" + "eor x28, x28, x15\n\t" + "xar v9.2d, v22.2d, v27.2d, #3\n\t" + "eor %x[seed], %x[seed], x19\n\t" + "xar v22.2d, v14.2d, v29.2d, #25\n\t" + "eor x28, x28, x21\n\t" + "xar v14.2d, v20.2d, v25.2d, #46\n\t" + "eor %x[seed], %x[seed], x24\n\t" + "xar v20.2d, v2.2d, v27.2d, #2\n\t" + "eor x28, x28, x26\n\t" + "xar v2.2d, v12.2d, v27.2d, #21\n\t" + "eor %x[state], %x[state], %x[seed], ror 63\n\t" + "xar v12.2d, v13.2d, v28.2d, #39\n\t" + "eor %x[seed], %x[seed], x28, ror 63\n\t" + "xar v13.2d, v19.2d, v29.2d, #56\n\t" + "eor x2, x2, %x[state]\n\t" + "xar v19.2d, v23.2d, v28.2d, #8\n\t" + "eor x7, x7, %x[state]\n\t" + "xar v23.2d, v15.2d, v25.2d, #23\n\t" + "eor x12, x12, %x[state]\n\t" + "xar v15.2d, v4.2d, v29.2d, #37\n\t" + "eor x17, x17, %x[state]\n\t" + "xar v4.2d, v24.2d, v29.2d, #50\n\t" + "eor x23, x23, %x[state]\n\t" + "xar v24.2d, v21.2d, v26.2d, #62\n\t" + "eor x4, x4, %x[seed]\n\t" + "xar v21.2d, v8.2d, v28.2d, #9\n\t" + "eor x9, x9, %x[seed]\n\t" + "xar v8.2d, v16.2d, v26.2d, #19\n\t" + "eor x14, x14, %x[seed]\n\t" + "xar v16.2d, v5.2d, v25.2d, #28\n\t" + "eor x20, x20, %x[seed]\n\t" + "xar v5.2d, v3.2d, v28.2d, #36\n\t" + "eor x25, x25, %x[seed]\n\t" + "xar v3.2d, v18.2d, v28.2d, #43\n\t" + "ldr %x[state], [x29, #32]\n\t" + "xar v18.2d, v17.2d, v27.2d, #49\n\t" + "ldr %x[seed], [x29, #24]\n\t" + "xar v17.2d, v11.2d, v26.2d, #54\n\t" + "eor x28, x28, x30, ror 63\n\t" + "xar v11.2d, v7.2d, v27.2d, #58\n\t" + "eor x30, x30, %x[seed], ror 63\n\t" + "xar v7.2d, v10.2d, v25.2d, #61\n\t" + "eor %x[seed], %x[seed], %x[state], ror 63\n\t" + /* Row Mix */ + "mov v25.16b, v0.16b\n\t" + "eor x6, x6, x28\n\t" + "mov v26.16b, v1.16b\n\t" + "eor x11, x11, x28\n\t" + "bcax v0.16b, v25.16b, v2.16b, v26.16b\n\t" + "eor x16, x16, x28\n\t" + "bcax v1.16b, v26.16b, v3.16b, v2.16b\n\t" + "eor x22, x22, x28\n\t" + "bcax v2.16b, v2.16b, v4.16b, v3.16b\n\t" + "eor x27, x27, x28\n\t" + "bcax v3.16b, v3.16b, v25.16b, v4.16b\n\t" + "eor x3, x3, x30\n\t" + "bcax v4.16b, v4.16b, v26.16b, v25.16b\n\t" + "eor x8, x8, x30\n\t" + "mov v25.16b, v5.16b\n\t" + "eor x13, x13, x30\n\t" + "mov v26.16b, v6.16b\n\t" + "eor x19, x19, x30\n\t" + "bcax v5.16b, v25.16b, v7.16b, v26.16b\n\t" + "eor x24, x24, x30\n\t" + "bcax v6.16b, v26.16b, v8.16b, 
v7.16b\n\t" + "eor x5, x5, %x[seed]\n\t" + "bcax v7.16b, v7.16b, v9.16b, v8.16b\n\t" + "eor x10, x10, %x[seed]\n\t" + "bcax v8.16b, v8.16b, v25.16b, v9.16b\n\t" + "eor x15, x15, %x[seed]\n\t" + "bcax v9.16b, v9.16b, v26.16b, v25.16b\n\t" + "eor x21, x21, %x[seed]\n\t" + "mov v26.16b, v11.16b\n\t" + "eor x26, x26, %x[seed]\n\t" + /* Swap Rotate Base */ + "bcax v10.16b, v30.16b, v12.16b, v26.16b\n\t" + "ror %x[state], x3, #63\n\t" + "bcax v11.16b, v26.16b, v13.16b, v12.16b\n\t" + "ror x3, x8, #20\n\t" + "bcax v12.16b, v12.16b, v14.16b, v13.16b\n\t" + "ror x8, x11, #44\n\t" + "bcax v13.16b, v13.16b, v30.16b, v14.16b\n\t" + "ror x11, x25, #3\n\t" + "bcax v14.16b, v14.16b, v26.16b, v30.16b\n\t" + "ror x25, x16, #25\n\t" + "mov v25.16b, v15.16b\n\t" + "ror x16, x23, #46\n\t" + "mov v26.16b, v16.16b\n\t" + "ror x23, x4, #2\n\t" + "bcax v15.16b, v25.16b, v17.16b, v26.16b\n\t" + "ror x4, x14, #21\n\t" + "bcax v16.16b, v26.16b, v18.16b, v17.16b\n\t" + "ror x14, x15, #39\n\t" + "bcax v17.16b, v17.16b, v19.16b, v18.16b\n\t" + "ror x15, x22, #56\n\t" + "bcax v18.16b, v18.16b, v25.16b, v19.16b\n\t" + "ror x22, x26, #8\n\t" + "bcax v19.16b, v19.16b, v26.16b, v25.16b\n\t" + "ror x26, x17, #23\n\t" + "mov v25.16b, v20.16b\n\t" + "ror x17, x6, #37\n\t" + "mov v26.16b, v21.16b\n\t" + "ror x6, x27, #50\n\t" + "bcax v20.16b, v25.16b, v22.16b, v26.16b\n\t" + "ror x27, x24, #62\n\t" + "bcax v21.16b, v26.16b, v23.16b, v22.16b\n\t" + "ror x24, x10, #9\n\t" + "bcax v22.16b, v22.16b, v24.16b, v23.16b\n\t" + "ror x10, x19, #19\n\t" + "bcax v23.16b, v23.16b, v25.16b, v24.16b\n\t" + "ror x19, x7, #28\n\t" + "bcax v24.16b, v24.16b, v26.16b, v25.16b\n\t" + "ror x7, x5, #36\n\t" + "ror x5, x21, #43\n\t" + "ror x21, x20, #49\n\t" + "ror x20, x13, #54\n\t" + "ror x13, x9, #58\n\t" + "ror x9, x12, #61\n\t" + /* Row Mix Base */ + "bic x12, x4, x3\n\t" + "bic %x[seed], x5, x4\n\t" + "bic x28, x2, x6\n\t" + "bic x30, x3, x2\n\t" + "eor x2, x2, x12\n\t" + "eor x3, x3, %x[seed]\n\t" + "bic x12, x6, x5\n\t" + "eor x5, x5, x28\n\t" + "eor x4, x4, x12\n\t" + "eor x6, x6, x30\n\t" + "bic x12, x9, x8\n\t" + "bic %x[seed], x10, x9\n\t" + "bic x28, x7, x11\n\t" + "bic x30, x8, x7\n\t" + "eor x7, x7, x12\n\t" + "eor x8, x8, %x[seed]\n\t" + "bic x12, x11, x10\n\t" + "eor x10, x10, x28\n\t" + "eor x9, x9, x12\n\t" + "eor x11, x11, x30\n\t" + "bic x12, x14, x13\n\t" + "bic %x[seed], x15, x14\n\t" + "bic x28, %x[state], x16\n\t" + "bic x30, x13, %x[state]\n\t" + "eor x12, %x[state], x12\n\t" + "eor x13, x13, %x[seed]\n\t" + "bic %x[state], x16, x15\n\t" + "eor x15, x15, x28\n\t" + "eor x14, x14, %x[state]\n\t" + "eor x16, x16, x30\n\t" + "bic %x[state], x20, x19\n\t" + "bic %x[seed], x21, x20\n\t" + "bic x28, x17, x22\n\t" + "bic x30, x19, x17\n\t" + "eor x17, x17, %x[state]\n\t" + "eor x19, x19, %x[seed]\n\t" + "bic %x[state], x22, x21\n\t" + "eor x21, x21, x28\n\t" + "eor x20, x20, %x[state]\n\t" + "eor x22, x22, x30\n\t" + "bic %x[state], x25, x24\n\t" + "bic %x[seed], x26, x25\n\t" + "bic x28, x23, x27\n\t" + "bic x30, x24, x23\n\t" + "eor x23, x23, %x[state]\n\t" + "eor x24, x24, %x[seed]\n\t" + "bic %x[state], x27, x26\n\t" + "eor x26, x26, x28\n\t" + "eor x25, x25, %x[state]\n\t" + "eor x27, x27, x30\n\t" + /* Done tranforming */ + "ldp x28, %x[seed], [x29, #48]\n\t" + "ldr %x[state], [x28], #8\n\t" + "subs %x[seed], %x[seed], #1\n\t" + "mov v30.d[0], %x[state]\n\t" + "mov v30.d[1], %x[state]\n\t" + "eor x2, x2, %x[state]\n\t" + "eor v0.16b, v0.16b, v30.16b\n\t" + "bne L_SHA3_shake256_blocksx3_seed_neon_begin_%=\n\t" + "ldr %x[state], 
[x29, #40]\n\t" + "st4 {v0.d, v1.d, v2.d, v3.d}[0], [%x[state]], #32\n\t" + "st4 {v4.d, v5.d, v6.d, v7.d}[0], [%x[state]], #32\n\t" + "st4 {v8.d, v9.d, v10.d, v11.d}[0], [%x[state]], #32\n\t" + "st4 {v12.d, v13.d, v14.d, v15.d}[0], [%x[state]], #32\n\t" + "st4 {v16.d, v17.d, v18.d, v19.d}[0], [%x[state]], #32\n\t" + "st4 {v20.d, v21.d, v22.d, v23.d}[0], [%x[state]], #32\n\t" + "st1 {v24.d}[0], [%x[state]]\n\t" + "add %x[state], %x[state], #8\n\t" + "st4 {v0.d, v1.d, v2.d, v3.d}[1], [%x[state]], #32\n\t" + "st4 {v4.d, v5.d, v6.d, v7.d}[1], [%x[state]], #32\n\t" + "st4 {v8.d, v9.d, v10.d, v11.d}[1], [%x[state]], #32\n\t" + "st4 {v12.d, v13.d, v14.d, v15.d}[1], [%x[state]], #32\n\t" + "st4 {v16.d, v17.d, v18.d, v19.d}[1], [%x[state]], #32\n\t" + "st4 {v20.d, v21.d, v22.d, v23.d}[1], [%x[state]], #32\n\t" + "st1 {v24.d}[1], [%x[state]]\n\t" + "add %x[state], %x[state], #8\n\t" + "stp x2, x3, [%x[state]]\n\t" + "stp x4, x5, [%x[state], #16]\n\t" + "stp x6, x7, [%x[state], #32]\n\t" + "stp x8, x9, [%x[state], #48]\n\t" + "stp x10, x11, [%x[state], #64]\n\t" + "stp x12, x13, [%x[state], #80]\n\t" + "stp x14, x15, [%x[state], #96]\n\t" + "stp x16, x17, [%x[state], #112]\n\t" + "stp x19, x20, [%x[state], #128]\n\t" + "stp x21, x22, [%x[state], #144]\n\t" + "stp x23, x24, [%x[state], #160]\n\t" + "stp x25, x26, [%x[state], #176]\n\t" + "str x27, [%x[state], #192]\n\t" + "ldp x29, x30, [sp], #0x40\n\t" + : [state] "+r" (state), [seed] "+r" (seed) + : [L_SHA3_shake256_blocksx3_seed_neon_r] "S" (L_SHA3_shake256_blocksx3_seed_neon_r) + : "memory", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc" + ); +} + +#else +static const uint64_t L_SHA3_transform_blocksx3_neon_r[] = { + 0x1UL, + 0x8082UL, + 0x800000000000808aUL, + 0x8000000080008000UL, + 0x808bUL, + 0x80000001UL, + 0x8000000080008081UL, + 0x8000000000008009UL, + 0x8aUL, + 0x88UL, + 0x80008009UL, + 0x8000000aUL, + 0x8000808bUL, + 0x800000000000008bUL, + 0x8000000000008089UL, + 0x8000000000008003UL, + 0x8000000000008002UL, + 0x8000000000000080UL, + 0x800aUL, + 0x800000008000000aUL, + 0x8000000080008081UL, + 0x8000000000008080UL, + 0x80000001UL, + 0x8000000080008008UL, +}; + +void kyber_sha3_blocksx3_neon(word64* state) +{ + __asm__ __volatile__ ( + "stp x29, x30, [sp, #-64]!\n\t" + "add x29, sp, #0\n\t" +#ifndef __APPLE__ + "adrp x27, %[L_SHA3_transform_blocksx3_neon_r]\n\t" + "add x27, x27, :lo12:%[L_SHA3_transform_blocksx3_neon_r]\n\t" +#else + "adrp x27, %[L_SHA3_transform_blocksx3_neon_r]@PAGE\n\t" + "add x27, x27, %[L_SHA3_transform_blocksx3_neon_r]@PAGEOFF\n\t" +#endif /* __APPLE__ */ + "str %x[state], [x29, #40]\n\t" + "ld4 {v0.d, v1.d, v2.d, v3.d}[0], [%x[state]], #32\n\t" + "ld4 {v4.d, v5.d, v6.d, v7.d}[0], [%x[state]], #32\n\t" + "ld4 {v8.d, v9.d, v10.d, v11.d}[0], [%x[state]], #32\n\t" + "ld4 {v12.d, v13.d, v14.d, v15.d}[0], [%x[state]], #32\n\t" + "ld4 {v16.d, v17.d, v18.d, v19.d}[0], [%x[state]], #32\n\t" + "ld4 {v20.d, v21.d, v22.d, v23.d}[0], [%x[state]], #32\n\t" + "ld1 {v24.d}[0], [%x[state]]\n\t" + "add %x[state], %x[state], #8\n\t" + "ld4 {v0.d, v1.d, v2.d, v3.d}[1], [%x[state]], #32\n\t" + "ld4 {v4.d, v5.d, v6.d, v7.d}[1], [%x[state]], #32\n\t" + "ld4 {v8.d, v9.d, v10.d, v11.d}[1], 
[%x[state]], #32\n\t" + "ld4 {v12.d, v13.d, v14.d, v15.d}[1], [%x[state]], #32\n\t" + "ld4 {v16.d, v17.d, v18.d, v19.d}[1], [%x[state]], #32\n\t" + "ld4 {v20.d, v21.d, v22.d, v23.d}[1], [%x[state]], #32\n\t" + "ld1 {v24.d}[1], [%x[state]]\n\t" + "add %x[state], %x[state], #8\n\t" + "ldp x1, x2, [%x[state]]\n\t" + "ldp x3, x4, [%x[state], #16]\n\t" + "ldp x5, x6, [%x[state], #32]\n\t" + "ldp x7, x8, [%x[state], #48]\n\t" + "ldp x9, x10, [%x[state], #64]\n\t" + "ldp x11, x12, [%x[state], #80]\n\t" + "ldp x13, x14, [%x[state], #96]\n\t" + "ldp x15, x16, [%x[state], #112]\n\t" + "ldp x17, x19, [%x[state], #128]\n\t" + "ldp x20, x21, [%x[state], #144]\n\t" + "ldp x22, x23, [%x[state], #160]\n\t" + "ldp x24, x25, [%x[state], #176]\n\t" + "ldr x26, [%x[state], #192]\n\t" + "mov x28, #24\n\t" + /* Start of 24 rounds */ + "\n" + "L_SHA3_transform_blocksx3_neon_begin_%=: \n\t" + "stp x27, x28, [x29, #48]\n\t" + /* Col Mix NEON */ + "eor v30.16b, v4.16b, v9.16b\n\t" + "eor %x[state], x5, x10\n\t" + "eor v27.16b, v1.16b, v6.16b\n\t" + "eor x30, x1, x6\n\t" + "eor v30.16b, v30.16b, v14.16b\n\t" + "eor x28, x3, x8\n\t" + "eor v27.16b, v27.16b, v11.16b\n\t" + "eor %x[state], %x[state], x15\n\t" + "eor v30.16b, v30.16b, v19.16b\n\t" + "eor x30, x30, x11\n\t" + "eor v27.16b, v27.16b, v16.16b\n\t" + "eor x28, x28, x13\n\t" + "eor v30.16b, v30.16b, v24.16b\n\t" + "eor %x[state], %x[state], x21\n\t" + "eor v27.16b, v27.16b, v21.16b\n\t" + "eor x30, x30, x16\n\t" + "ushr v25.2d, v27.2d, #63\n\t" + "eor x28, x28, x19\n\t" + "sli v25.2d, v27.2d, #1\n\t" + "eor %x[state], %x[state], x26\n\t" + "eor v25.16b, v25.16b, v30.16b\n\t" + "eor x30, x30, x22\n\t" + "eor v31.16b, v0.16b, v5.16b\n\t" + "eor x28, x28, x24\n\t" + "eor v28.16b, v2.16b, v7.16b\n\t" + "str %x[state], [x29, #32]\n\t" + "eor v31.16b, v31.16b, v10.16b\n\t" + "str x28, [x29, #24]\n\t" + "eor v28.16b, v28.16b, v12.16b\n\t" + "eor x27, x2, x7\n\t" + "eor v31.16b, v31.16b, v15.16b\n\t" + "eor x28, x4, x9\n\t" + "eor v28.16b, v28.16b, v17.16b\n\t" + "eor x27, x27, x12\n\t" + "eor v31.16b, v31.16b, v20.16b\n\t" + "eor x28, x28, x14\n\t" + "eor v28.16b, v28.16b, v22.16b\n\t" + "eor x27, x27, x17\n\t" + "ushr v29.2d, v30.2d, #63\n\t" + "eor x28, x28, x20\n\t" + "ushr v26.2d, v28.2d, #63\n\t" + "eor x27, x27, x23\n\t" + "sli v29.2d, v30.2d, #1\n\t" + "eor x28, x28, x25\n\t" + "sli v26.2d, v28.2d, #1\n\t" + "eor %x[state], %x[state], x27, ror 63\n\t" + "eor v28.16b, v28.16b, v29.16b\n\t" + "eor x27, x27, x28, ror 63\n\t" + "eor v29.16b, v3.16b, v8.16b\n\t" + "eor x1, x1, %x[state]\n\t" + "eor v26.16b, v26.16b, v31.16b\n\t" + "eor x6, x6, %x[state]\n\t" + "eor v29.16b, v29.16b, v13.16b\n\t" + "eor x11, x11, %x[state]\n\t" + "eor v29.16b, v29.16b, v18.16b\n\t" + "eor x16, x16, %x[state]\n\t" + "eor v29.16b, v29.16b, v23.16b\n\t" + "eor x22, x22, %x[state]\n\t" + "ushr v30.2d, v29.2d, #63\n\t" + "eor x3, x3, x27\n\t" + "sli v30.2d, v29.2d, #1\n\t" + "eor x8, x8, x27\n\t" + "eor v27.16b, v27.16b, v30.16b\n\t" + "eor x13, x13, x27\n\t" + "ushr v30.2d, v31.2d, #63\n\t" + "eor x19, x19, x27\n\t" + "sli v30.2d, v31.2d, #1\n\t" + "eor x24, x24, x27\n\t" + "eor v29.16b, v29.16b, v30.16b\n\t" + "ldr %x[state], [x29, #32]\n\t" + /* Swap Rotate NEON */ + "eor v0.16b, v0.16b, v25.16b\n\t" + "eor v31.16b, v1.16b, v26.16b\n\t" + "ldr x27, [x29, #24]\n\t" + "eor v6.16b, v6.16b, v26.16b\n\t" + "eor x28, x28, x30, ror 63\n\t" + "ushr v30.2d, v31.2d, #63\n\t" + "eor x30, x30, x27, ror 63\n\t" + "ushr v1.2d, v6.2d, #20\n\t" + "eor x27, x27, %x[state], ror 63\n\t" + "sli v30.2d, 
v31.2d, #1\n\t" + "eor x5, x5, x28\n\t" + "sli v1.2d, v6.2d, #44\n\t" + "eor x10, x10, x28\n\t" + "eor v31.16b, v9.16b, v29.16b\n\t" + "eor x15, x15, x28\n\t" + "eor v22.16b, v22.16b, v27.16b\n\t" + "eor x21, x21, x28\n\t" + "ushr v6.2d, v31.2d, #44\n\t" + "eor x26, x26, x28\n\t" + "ushr v9.2d, v22.2d, #3\n\t" + "eor x2, x2, x30\n\t" + "sli v6.2d, v31.2d, #20\n\t" + "eor x7, x7, x30\n\t" + "sli v9.2d, v22.2d, #61\n\t" + "eor x12, x12, x30\n\t" + "eor v31.16b, v14.16b, v29.16b\n\t" + "eor x17, x17, x30\n\t" + "eor v20.16b, v20.16b, v25.16b\n\t" + "eor x23, x23, x30\n\t" + "ushr v22.2d, v31.2d, #25\n\t" + "eor x4, x4, x27\n\t" + "ushr v14.2d, v20.2d, #46\n\t" + "eor x9, x9, x27\n\t" + "sli v22.2d, v31.2d, #39\n\t" + "eor x14, x14, x27\n\t" + "sli v14.2d, v20.2d, #18\n\t" + "eor x20, x20, x27\n\t" + "eor v31.16b, v2.16b, v27.16b\n\t" + "eor x25, x25, x27\n\t" + /* Swap Rotate Base */ + "eor v12.16b, v12.16b, v27.16b\n\t" + "ror %x[state], x2, #63\n\t" + "ushr v20.2d, v31.2d, #2\n\t" + "ror x2, x7, #20\n\t" + "ushr v2.2d, v12.2d, #21\n\t" + "ror x7, x10, #44\n\t" + "sli v20.2d, v31.2d, #62\n\t" + "ror x10, x24, #3\n\t" + "sli v2.2d, v12.2d, #43\n\t" + "ror x24, x15, #25\n\t" + "eor v31.16b, v13.16b, v28.16b\n\t" + "ror x15, x22, #46\n\t" + "eor v19.16b, v19.16b, v29.16b\n\t" + "ror x22, x3, #2\n\t" + "ushr v12.2d, v31.2d, #39\n\t" + "ror x3, x13, #21\n\t" + "ushr v13.2d, v19.2d, #56\n\t" + "ror x13, x14, #39\n\t" + "sli v12.2d, v31.2d, #25\n\t" + "ror x14, x21, #56\n\t" + "sli v13.2d, v19.2d, #8\n\t" + "ror x21, x25, #8\n\t" + "eor v31.16b, v23.16b, v28.16b\n\t" + "ror x25, x16, #23\n\t" + "eor v15.16b, v15.16b, v25.16b\n\t" + "ror x16, x5, #37\n\t" + "ushr v19.2d, v31.2d, #8\n\t" + "ror x5, x26, #50\n\t" + "ushr v23.2d, v15.2d, #23\n\t" + "ror x26, x23, #62\n\t" + "sli v19.2d, v31.2d, #56\n\t" + "ror x23, x9, #9\n\t" + "sli v23.2d, v15.2d, #41\n\t" + "ror x9, x17, #19\n\t" + "eor v31.16b, v4.16b, v29.16b\n\t" + "ror x17, x6, #28\n\t" + "eor v24.16b, v24.16b, v29.16b\n\t" + "ror x6, x4, #36\n\t" + "ushr v15.2d, v31.2d, #37\n\t" + "ror x4, x20, #43\n\t" + "ushr v4.2d, v24.2d, #50\n\t" + "ror x20, x19, #49\n\t" + "sli v15.2d, v31.2d, #27\n\t" + "ror x19, x12, #54\n\t" + "sli v4.2d, v24.2d, #14\n\t" + "ror x12, x8, #58\n\t" + "eor v31.16b, v21.16b, v26.16b\n\t" + "ror x8, x11, #61\n\t" + /* Row Mix Base */ + "eor v8.16b, v8.16b, v28.16b\n\t" + "bic x11, x3, x2\n\t" + "ushr v24.2d, v31.2d, #62\n\t" + "bic x27, x4, x3\n\t" + "ushr v21.2d, v8.2d, #9\n\t" + "bic x28, x1, x5\n\t" + "sli v24.2d, v31.2d, #2\n\t" + "bic x30, x2, x1\n\t" + "sli v21.2d, v8.2d, #55\n\t" + "eor x1, x1, x11\n\t" + "eor v31.16b, v16.16b, v26.16b\n\t" + "eor x2, x2, x27\n\t" + "eor v5.16b, v5.16b, v25.16b\n\t" + "bic x11, x5, x4\n\t" + "ushr v8.2d, v31.2d, #19\n\t" + "eor x4, x4, x28\n\t" + "ushr v16.2d, v5.2d, #28\n\t" + "eor x3, x3, x11\n\t" + "sli v8.2d, v31.2d, #45\n\t" + "eor x5, x5, x30\n\t" + "sli v16.2d, v5.2d, #36\n\t" + "bic x11, x8, x7\n\t" + "eor v31.16b, v3.16b, v28.16b\n\t" + "bic x27, x9, x8\n\t" + "eor v18.16b, v18.16b, v28.16b\n\t" + "bic x28, x6, x10\n\t" + "ushr v5.2d, v31.2d, #36\n\t" + "bic x30, x7, x6\n\t" + "ushr v3.2d, v18.2d, #43\n\t" + "eor x6, x6, x11\n\t" + "sli v5.2d, v31.2d, #28\n\t" + "eor x7, x7, x27\n\t" + "sli v3.2d, v18.2d, #21\n\t" + "bic x11, x10, x9\n\t" + "eor v31.16b, v17.16b, v27.16b\n\t" + "eor x9, x9, x28\n\t" + "eor v11.16b, v11.16b, v26.16b\n\t" + "eor x8, x8, x11\n\t" + "ushr v18.2d, v31.2d, #49\n\t" + "eor x10, x10, x30\n\t" + "ushr v17.2d, v11.2d, #54\n\t" + "bic x11, x13, 
x12\n\t" + "sli v18.2d, v31.2d, #15\n\t" + "bic x27, x14, x13\n\t" + "sli v17.2d, v11.2d, #10\n\t" + "bic x28, %x[state], x15\n\t" + "eor v31.16b, v7.16b, v27.16b\n\t" + "bic x30, x12, %x[state]\n\t" + "eor v10.16b, v10.16b, v25.16b\n\t" + "eor x11, %x[state], x11\n\t" + "ushr v11.2d, v31.2d, #58\n\t" + "eor x12, x12, x27\n\t" + "ushr v7.2d, v10.2d, #61\n\t" + "bic %x[state], x15, x14\n\t" + "sli v11.2d, v31.2d, #6\n\t" + "eor x14, x14, x28\n\t" + "sli v7.2d, v10.2d, #3\n\t" + "eor x13, x13, %x[state]\n\t" + /* Row Mix NEON */ + "bic v25.16b, v2.16b, v1.16b\n\t" + "eor x15, x15, x30\n\t" + "bic v26.16b, v3.16b, v2.16b\n\t" + "bic %x[state], x19, x17\n\t" + "bic v27.16b, v4.16b, v3.16b\n\t" + "bic x27, x20, x19\n\t" + "bic v28.16b, v0.16b, v4.16b\n\t" + "bic x28, x16, x21\n\t" + "bic v29.16b, v1.16b, v0.16b\n\t" + "bic x30, x17, x16\n\t" + "eor v0.16b, v0.16b, v25.16b\n\t" + "eor x16, x16, %x[state]\n\t" + "eor v1.16b, v1.16b, v26.16b\n\t" + "eor x17, x17, x27\n\t" + "eor v2.16b, v2.16b, v27.16b\n\t" + "bic %x[state], x21, x20\n\t" + "eor v3.16b, v3.16b, v28.16b\n\t" + "eor x20, x20, x28\n\t" + "eor v4.16b, v4.16b, v29.16b\n\t" + "eor x19, x19, %x[state]\n\t" + "bic v25.16b, v7.16b, v6.16b\n\t" + "eor x21, x21, x30\n\t" + "bic v26.16b, v8.16b, v7.16b\n\t" + "bic %x[state], x24, x23\n\t" + "bic v27.16b, v9.16b, v8.16b\n\t" + "bic x27, x25, x24\n\t" + "bic v28.16b, v5.16b, v9.16b\n\t" + "bic x28, x22, x26\n\t" + "bic v29.16b, v6.16b, v5.16b\n\t" + "bic x30, x23, x22\n\t" + "eor v5.16b, v5.16b, v25.16b\n\t" + "eor x22, x22, %x[state]\n\t" + "eor v6.16b, v6.16b, v26.16b\n\t" + "eor x23, x23, x27\n\t" + "eor v7.16b, v7.16b, v27.16b\n\t" + "bic %x[state], x26, x25\n\t" + "eor v8.16b, v8.16b, v28.16b\n\t" + "eor x25, x25, x28\n\t" + "eor v9.16b, v9.16b, v29.16b\n\t" + "eor x24, x24, %x[state]\n\t" + "bic v25.16b, v12.16b, v11.16b\n\t" + "eor x26, x26, x30\n\t" + "bic v26.16b, v13.16b, v12.16b\n\t" + "bic v27.16b, v14.16b, v13.16b\n\t" + "bic v28.16b, v30.16b, v14.16b\n\t" + "bic v29.16b, v11.16b, v30.16b\n\t" + "eor v10.16b, v30.16b, v25.16b\n\t" + "eor v11.16b, v11.16b, v26.16b\n\t" + "eor v12.16b, v12.16b, v27.16b\n\t" + "eor v13.16b, v13.16b, v28.16b\n\t" + "eor v14.16b, v14.16b, v29.16b\n\t" + "bic v25.16b, v17.16b, v16.16b\n\t" + "bic v26.16b, v18.16b, v17.16b\n\t" + "bic v27.16b, v19.16b, v18.16b\n\t" + "bic v28.16b, v15.16b, v19.16b\n\t" + "bic v29.16b, v16.16b, v15.16b\n\t" + "eor v15.16b, v15.16b, v25.16b\n\t" + "eor v16.16b, v16.16b, v26.16b\n\t" + "eor v17.16b, v17.16b, v27.16b\n\t" + "eor v18.16b, v18.16b, v28.16b\n\t" + "eor v19.16b, v19.16b, v29.16b\n\t" + "bic v25.16b, v22.16b, v21.16b\n\t" + "bic v26.16b, v23.16b, v22.16b\n\t" + "bic v27.16b, v24.16b, v23.16b\n\t" + "bic v28.16b, v20.16b, v24.16b\n\t" + "bic v29.16b, v21.16b, v20.16b\n\t" + "eor v20.16b, v20.16b, v25.16b\n\t" + "eor v21.16b, v21.16b, v26.16b\n\t" + "eor v22.16b, v22.16b, v27.16b\n\t" + "eor v23.16b, v23.16b, v28.16b\n\t" + "eor v24.16b, v24.16b, v29.16b\n\t" + /* Done tranforming */ + "ldp x27, x28, [x29, #48]\n\t" + "ldr %x[state], [x27], #8\n\t" + "subs x28, x28, #1\n\t" + "mov v30.d[0], %x[state]\n\t" + "mov v30.d[1], %x[state]\n\t" + "eor x1, x1, %x[state]\n\t" + "eor v0.16b, v0.16b, v30.16b\n\t" + "bne L_SHA3_transform_blocksx3_neon_begin_%=\n\t" + "ldr %x[state], [x29, #40]\n\t" + "st4 {v0.d, v1.d, v2.d, v3.d}[0], [%x[state]], #32\n\t" + "st4 {v4.d, v5.d, v6.d, v7.d}[0], [%x[state]], #32\n\t" + "st4 {v8.d, v9.d, v10.d, v11.d}[0], [%x[state]], #32\n\t" + "st4 {v12.d, v13.d, v14.d, v15.d}[0], [%x[state]], 
#32\n\t" + "st4 {v16.d, v17.d, v18.d, v19.d}[0], [%x[state]], #32\n\t" + "st4 {v20.d, v21.d, v22.d, v23.d}[0], [%x[state]], #32\n\t" + "st1 {v24.d}[0], [%x[state]]\n\t" + "add %x[state], %x[state], #8\n\t" + "st4 {v0.d, v1.d, v2.d, v3.d}[1], [%x[state]], #32\n\t" + "st4 {v4.d, v5.d, v6.d, v7.d}[1], [%x[state]], #32\n\t" + "st4 {v8.d, v9.d, v10.d, v11.d}[1], [%x[state]], #32\n\t" + "st4 {v12.d, v13.d, v14.d, v15.d}[1], [%x[state]], #32\n\t" + "st4 {v16.d, v17.d, v18.d, v19.d}[1], [%x[state]], #32\n\t" + "st4 {v20.d, v21.d, v22.d, v23.d}[1], [%x[state]], #32\n\t" + "st1 {v24.d}[1], [%x[state]]\n\t" + "add %x[state], %x[state], #8\n\t" + "stp x1, x2, [%x[state]]\n\t" + "stp x3, x4, [%x[state], #16]\n\t" + "stp x5, x6, [%x[state], #32]\n\t" + "stp x7, x8, [%x[state], #48]\n\t" + "stp x9, x10, [%x[state], #64]\n\t" + "stp x11, x12, [%x[state], #80]\n\t" + "stp x13, x14, [%x[state], #96]\n\t" + "stp x15, x16, [%x[state], #112]\n\t" + "stp x17, x19, [%x[state], #128]\n\t" + "stp x20, x21, [%x[state], #144]\n\t" + "stp x22, x23, [%x[state], #160]\n\t" + "stp x24, x25, [%x[state], #176]\n\t" + "str x26, [%x[state], #192]\n\t" + "ldp x29, x30, [sp], #0x40\n\t" + : [state] "+r" (state) + : [L_SHA3_transform_blocksx3_neon_r] "S" (L_SHA3_transform_blocksx3_neon_r) + : "memory", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc" + ); +} + +static const uint64_t L_SHA3_shake128_blocksx3_seed_neon_r[] = { + 0x1UL, + 0x8082UL, + 0x800000000000808aUL, + 0x8000000080008000UL, + 0x808bUL, + 0x80000001UL, + 0x8000000080008081UL, + 0x8000000000008009UL, + 0x8aUL, + 0x88UL, + 0x80008009UL, + 0x8000000aUL, + 0x8000808bUL, + 0x800000000000008bUL, + 0x8000000000008089UL, + 0x8000000000008003UL, + 0x8000000000008002UL, + 0x8000000000000080UL, + 0x800aUL, + 0x800000008000000aUL, + 0x8000000080008081UL, + 0x8000000000008080UL, + 0x80000001UL, + 0x8000000080008008UL, +}; + +void kyber_shake128_blocksx3_seed_neon(word64* state, byte* seed) +{ + __asm__ __volatile__ ( + "stp x29, x30, [sp, #-64]!\n\t" + "add x29, sp, #0\n\t" +#ifndef __APPLE__ + "adrp x28, %[L_SHA3_shake128_blocksx3_seed_neon_r]\n\t" + "add x28, x28, :lo12:%[L_SHA3_shake128_blocksx3_seed_neon_r]\n\t" +#else + "adrp x28, %[L_SHA3_shake128_blocksx3_seed_neon_r]@PAGE\n\t" + "add x28, x28, %[L_SHA3_shake128_blocksx3_seed_neon_r]@PAGEOFF\n\t" +#endif /* __APPLE__ */ + "str %x[state], [x29, #40]\n\t" + "add %x[state], %x[state], #32\n\t" + "ld1 {v4.d}[0], [%x[state]]\n\t" + "ldp x2, x3, [%x[seed]], #16\n\t" + "add %x[state], %x[state], #0xc8\n\t" + "ld1 {v4.d}[1], [%x[state]]\n\t" + "ldp x4, x5, [%x[seed]], #16\n\t" + "ldr x6, [%x[state], #200]\n\t" + "eor v5.16b, v5.16b, v5.16b\n\t" + "eor x7, x7, x7\n\t" + "eor v6.16b, v6.16b, v6.16b\n\t" + "eor x8, x8, x8\n\t" + "eor v7.16b, v7.16b, v7.16b\n\t" + "eor x9, x9, x9\n\t" + "eor v8.16b, v8.16b, v8.16b\n\t" + "eor x10, x10, x10\n\t" + "eor v9.16b, v9.16b, v9.16b\n\t" + "eor x11, x11, x11\n\t" + "eor v10.16b, v10.16b, v10.16b\n\t" + "eor x12, x12, x12\n\t" + "eor v11.16b, v11.16b, v11.16b\n\t" + "eor x13, x13, x13\n\t" + "eor v12.16b, v12.16b, v12.16b\n\t" + "eor x14, x14, x14\n\t" + "eor v13.16b, v13.16b, v13.16b\n\t" + "eor x15, x15, x15\n\t" + "eor v14.16b, v14.16b, 
v14.16b\n\t" + "eor x16, x16, x16\n\t" + "eor v15.16b, v15.16b, v15.16b\n\t" + "eor x17, x17, x17\n\t" + "eor v16.16b, v16.16b, v16.16b\n\t" + "eor x19, x19, x19\n\t" + "eor v17.16b, v17.16b, v17.16b\n\t" + "eor x20, x20, x20\n\t" + "eor v18.16b, v18.16b, v18.16b\n\t" + "eor x21, x21, x21\n\t" + "eor v19.16b, v19.16b, v19.16b\n\t" + "eor x22, x22, x22\n\t" + "movz x23, #0x8000, lsl 48\n\t" + "eor v21.16b, v21.16b, v21.16b\n\t" + "eor x24, x24, x24\n\t" + "eor v22.16b, v22.16b, v22.16b\n\t" + "eor x25, x25, x25\n\t" + "eor v23.16b, v23.16b, v23.16b\n\t" + "eor x26, x26, x26\n\t" + "eor v24.16b, v24.16b, v24.16b\n\t" + "eor x27, x27, x27\n\t" + "dup v0.2d, x2\n\t" + "dup v1.2d, x3\n\t" + "dup v2.2d, x4\n\t" + "dup v3.2d, x5\n\t" + "dup v20.2d, x23\n\t" + "mov %x[seed], #24\n\t" + /* Start of 24 rounds */ + "\n" + "L_SHA3_shake128_blocksx3_seed_neon_begin_%=: \n\t" + "stp x28, %x[seed], [x29, #48]\n\t" + /* Col Mix NEON */ + "eor v30.16b, v4.16b, v9.16b\n\t" + "eor %x[state], x6, x11\n\t" + "eor v27.16b, v1.16b, v6.16b\n\t" + "eor x30, x2, x7\n\t" + "eor v30.16b, v30.16b, v14.16b\n\t" + "eor x28, x4, x9\n\t" + "eor v27.16b, v27.16b, v11.16b\n\t" + "eor %x[state], %x[state], x16\n\t" + "eor v30.16b, v30.16b, v19.16b\n\t" + "eor x30, x30, x12\n\t" + "eor v27.16b, v27.16b, v16.16b\n\t" + "eor x28, x28, x14\n\t" + "eor v30.16b, v30.16b, v24.16b\n\t" + "eor %x[state], %x[state], x22\n\t" + "eor v27.16b, v27.16b, v21.16b\n\t" + "eor x30, x30, x17\n\t" + "ushr v25.2d, v27.2d, #63\n\t" + "eor x28, x28, x20\n\t" + "sli v25.2d, v27.2d, #1\n\t" + "eor %x[state], %x[state], x27\n\t" + "eor v25.16b, v25.16b, v30.16b\n\t" + "eor x30, x30, x23\n\t" + "eor v31.16b, v0.16b, v5.16b\n\t" + "eor x28, x28, x25\n\t" + "eor v28.16b, v2.16b, v7.16b\n\t" + "str %x[state], [x29, #32]\n\t" + "eor v31.16b, v31.16b, v10.16b\n\t" + "str x28, [x29, #24]\n\t" + "eor v28.16b, v28.16b, v12.16b\n\t" + "eor %x[seed], x3, x8\n\t" + "eor v31.16b, v31.16b, v15.16b\n\t" + "eor x28, x5, x10\n\t" + "eor v28.16b, v28.16b, v17.16b\n\t" + "eor %x[seed], %x[seed], x13\n\t" + "eor v31.16b, v31.16b, v20.16b\n\t" + "eor x28, x28, x15\n\t" + "eor v28.16b, v28.16b, v22.16b\n\t" + "eor %x[seed], %x[seed], x19\n\t" + "ushr v29.2d, v30.2d, #63\n\t" + "eor x28, x28, x21\n\t" + "ushr v26.2d, v28.2d, #63\n\t" + "eor %x[seed], %x[seed], x24\n\t" + "sli v29.2d, v30.2d, #1\n\t" + "eor x28, x28, x26\n\t" + "sli v26.2d, v28.2d, #1\n\t" + "eor %x[state], %x[state], %x[seed], ror 63\n\t" + "eor v28.16b, v28.16b, v29.16b\n\t" + "eor %x[seed], %x[seed], x28, ror 63\n\t" + "eor v29.16b, v3.16b, v8.16b\n\t" + "eor x2, x2, %x[state]\n\t" + "eor v26.16b, v26.16b, v31.16b\n\t" + "eor x7, x7, %x[state]\n\t" + "eor v29.16b, v29.16b, v13.16b\n\t" + "eor x12, x12, %x[state]\n\t" + "eor v29.16b, v29.16b, v18.16b\n\t" + "eor x17, x17, %x[state]\n\t" + "eor v29.16b, v29.16b, v23.16b\n\t" + "eor x23, x23, %x[state]\n\t" + "ushr v30.2d, v29.2d, #63\n\t" + "eor x4, x4, %x[seed]\n\t" + "sli v30.2d, v29.2d, #1\n\t" + "eor x9, x9, %x[seed]\n\t" + "eor v27.16b, v27.16b, v30.16b\n\t" + "eor x14, x14, %x[seed]\n\t" + "ushr v30.2d, v31.2d, #63\n\t" + "eor x20, x20, %x[seed]\n\t" + "sli v30.2d, v31.2d, #1\n\t" + "eor x25, x25, %x[seed]\n\t" + "eor v29.16b, v29.16b, v30.16b\n\t" + "ldr %x[state], [x29, #32]\n\t" + /* Swap Rotate NEON */ + "eor v0.16b, v0.16b, v25.16b\n\t" + "eor v31.16b, v1.16b, v26.16b\n\t" + "ldr %x[seed], [x29, #24]\n\t" + "eor v6.16b, v6.16b, v26.16b\n\t" + "eor x28, x28, x30, ror 63\n\t" + "ushr v30.2d, v31.2d, #63\n\t" + "eor x30, x30, %x[seed], ror 63\n\t" 
+ "ushr v1.2d, v6.2d, #20\n\t" + "eor %x[seed], %x[seed], %x[state], ror 63\n\t" + "sli v30.2d, v31.2d, #1\n\t" + "eor x6, x6, x28\n\t" + "sli v1.2d, v6.2d, #44\n\t" + "eor x11, x11, x28\n\t" + "eor v31.16b, v9.16b, v29.16b\n\t" + "eor x16, x16, x28\n\t" + "eor v22.16b, v22.16b, v27.16b\n\t" + "eor x22, x22, x28\n\t" + "ushr v6.2d, v31.2d, #44\n\t" + "eor x27, x27, x28\n\t" + "ushr v9.2d, v22.2d, #3\n\t" + "eor x3, x3, x30\n\t" + "sli v6.2d, v31.2d, #20\n\t" + "eor x8, x8, x30\n\t" + "sli v9.2d, v22.2d, #61\n\t" + "eor x13, x13, x30\n\t" + "eor v31.16b, v14.16b, v29.16b\n\t" + "eor x19, x19, x30\n\t" + "eor v20.16b, v20.16b, v25.16b\n\t" + "eor x24, x24, x30\n\t" + "ushr v22.2d, v31.2d, #25\n\t" + "eor x5, x5, %x[seed]\n\t" + "ushr v14.2d, v20.2d, #46\n\t" + "eor x10, x10, %x[seed]\n\t" + "sli v22.2d, v31.2d, #39\n\t" + "eor x15, x15, %x[seed]\n\t" + "sli v14.2d, v20.2d, #18\n\t" + "eor x21, x21, %x[seed]\n\t" + "eor v31.16b, v2.16b, v27.16b\n\t" + "eor x26, x26, %x[seed]\n\t" + /* Swap Rotate Base */ + "eor v12.16b, v12.16b, v27.16b\n\t" + "ror %x[state], x3, #63\n\t" + "ushr v20.2d, v31.2d, #2\n\t" + "ror x3, x8, #20\n\t" + "ushr v2.2d, v12.2d, #21\n\t" + "ror x8, x11, #44\n\t" + "sli v20.2d, v31.2d, #62\n\t" + "ror x11, x25, #3\n\t" + "sli v2.2d, v12.2d, #43\n\t" + "ror x25, x16, #25\n\t" + "eor v31.16b, v13.16b, v28.16b\n\t" + "ror x16, x23, #46\n\t" + "eor v19.16b, v19.16b, v29.16b\n\t" + "ror x23, x4, #2\n\t" + "ushr v12.2d, v31.2d, #39\n\t" + "ror x4, x14, #21\n\t" + "ushr v13.2d, v19.2d, #56\n\t" + "ror x14, x15, #39\n\t" + "sli v12.2d, v31.2d, #25\n\t" + "ror x15, x22, #56\n\t" + "sli v13.2d, v19.2d, #8\n\t" + "ror x22, x26, #8\n\t" + "eor v31.16b, v23.16b, v28.16b\n\t" + "ror x26, x17, #23\n\t" + "eor v15.16b, v15.16b, v25.16b\n\t" + "ror x17, x6, #37\n\t" + "ushr v19.2d, v31.2d, #8\n\t" + "ror x6, x27, #50\n\t" + "ushr v23.2d, v15.2d, #23\n\t" + "ror x27, x24, #62\n\t" + "sli v19.2d, v31.2d, #56\n\t" + "ror x24, x10, #9\n\t" + "sli v23.2d, v15.2d, #41\n\t" + "ror x10, x19, #19\n\t" + "eor v31.16b, v4.16b, v29.16b\n\t" + "ror x19, x7, #28\n\t" + "eor v24.16b, v24.16b, v29.16b\n\t" + "ror x7, x5, #36\n\t" + "ushr v15.2d, v31.2d, #37\n\t" + "ror x5, x21, #43\n\t" + "ushr v4.2d, v24.2d, #50\n\t" + "ror x21, x20, #49\n\t" + "sli v15.2d, v31.2d, #27\n\t" + "ror x20, x13, #54\n\t" + "sli v4.2d, v24.2d, #14\n\t" + "ror x13, x9, #58\n\t" + "eor v31.16b, v21.16b, v26.16b\n\t" + "ror x9, x12, #61\n\t" + /* Row Mix Base */ + "eor v8.16b, v8.16b, v28.16b\n\t" + "bic x12, x4, x3\n\t" + "ushr v24.2d, v31.2d, #62\n\t" + "bic %x[seed], x5, x4\n\t" + "ushr v21.2d, v8.2d, #9\n\t" + "bic x28, x2, x6\n\t" + "sli v24.2d, v31.2d, #2\n\t" + "bic x30, x3, x2\n\t" + "sli v21.2d, v8.2d, #55\n\t" + "eor x2, x2, x12\n\t" + "eor v31.16b, v16.16b, v26.16b\n\t" + "eor x3, x3, %x[seed]\n\t" + "eor v5.16b, v5.16b, v25.16b\n\t" + "bic x12, x6, x5\n\t" + "ushr v8.2d, v31.2d, #19\n\t" + "eor x5, x5, x28\n\t" + "ushr v16.2d, v5.2d, #28\n\t" + "eor x4, x4, x12\n\t" + "sli v8.2d, v31.2d, #45\n\t" + "eor x6, x6, x30\n\t" + "sli v16.2d, v5.2d, #36\n\t" + "bic x12, x9, x8\n\t" + "eor v31.16b, v3.16b, v28.16b\n\t" + "bic %x[seed], x10, x9\n\t" + "eor v18.16b, v18.16b, v28.16b\n\t" + "bic x28, x7, x11\n\t" + "ushr v5.2d, v31.2d, #36\n\t" + "bic x30, x8, x7\n\t" + "ushr v3.2d, v18.2d, #43\n\t" + "eor x7, x7, x12\n\t" + "sli v5.2d, v31.2d, #28\n\t" + "eor x8, x8, %x[seed]\n\t" + "sli v3.2d, v18.2d, #21\n\t" + "bic x12, x11, x10\n\t" + "eor v31.16b, v17.16b, v27.16b\n\t" + "eor x10, x10, x28\n\t" + "eor v11.16b, v11.16b, 
v26.16b\n\t" + "eor x9, x9, x12\n\t" + "ushr v18.2d, v31.2d, #49\n\t" + "eor x11, x11, x30\n\t" + "ushr v17.2d, v11.2d, #54\n\t" + "bic x12, x14, x13\n\t" + "sli v18.2d, v31.2d, #15\n\t" + "bic %x[seed], x15, x14\n\t" + "sli v17.2d, v11.2d, #10\n\t" + "bic x28, %x[state], x16\n\t" + "eor v31.16b, v7.16b, v27.16b\n\t" + "bic x30, x13, %x[state]\n\t" + "eor v10.16b, v10.16b, v25.16b\n\t" + "eor x12, %x[state], x12\n\t" + "ushr v11.2d, v31.2d, #58\n\t" + "eor x13, x13, %x[seed]\n\t" + "ushr v7.2d, v10.2d, #61\n\t" + "bic %x[state], x16, x15\n\t" + "sli v11.2d, v31.2d, #6\n\t" + "eor x15, x15, x28\n\t" + "sli v7.2d, v10.2d, #3\n\t" + "eor x14, x14, %x[state]\n\t" + /* Row Mix NEON */ + "bic v25.16b, v2.16b, v1.16b\n\t" + "eor x16, x16, x30\n\t" + "bic v26.16b, v3.16b, v2.16b\n\t" + "bic %x[state], x20, x19\n\t" + "bic v27.16b, v4.16b, v3.16b\n\t" + "bic %x[seed], x21, x20\n\t" + "bic v28.16b, v0.16b, v4.16b\n\t" + "bic x28, x17, x22\n\t" + "bic v29.16b, v1.16b, v0.16b\n\t" + "bic x30, x19, x17\n\t" + "eor v0.16b, v0.16b, v25.16b\n\t" + "eor x17, x17, %x[state]\n\t" + "eor v1.16b, v1.16b, v26.16b\n\t" + "eor x19, x19, %x[seed]\n\t" + "eor v2.16b, v2.16b, v27.16b\n\t" + "bic %x[state], x22, x21\n\t" + "eor v3.16b, v3.16b, v28.16b\n\t" + "eor x21, x21, x28\n\t" + "eor v4.16b, v4.16b, v29.16b\n\t" + "eor x20, x20, %x[state]\n\t" + "bic v25.16b, v7.16b, v6.16b\n\t" + "eor x22, x22, x30\n\t" + "bic v26.16b, v8.16b, v7.16b\n\t" + "bic %x[state], x25, x24\n\t" + "bic v27.16b, v9.16b, v8.16b\n\t" + "bic %x[seed], x26, x25\n\t" + "bic v28.16b, v5.16b, v9.16b\n\t" + "bic x28, x23, x27\n\t" + "bic v29.16b, v6.16b, v5.16b\n\t" + "bic x30, x24, x23\n\t" + "eor v5.16b, v5.16b, v25.16b\n\t" + "eor x23, x23, %x[state]\n\t" + "eor v6.16b, v6.16b, v26.16b\n\t" + "eor x24, x24, %x[seed]\n\t" + "eor v7.16b, v7.16b, v27.16b\n\t" + "bic %x[state], x27, x26\n\t" + "eor v8.16b, v8.16b, v28.16b\n\t" + "eor x26, x26, x28\n\t" + "eor v9.16b, v9.16b, v29.16b\n\t" + "eor x25, x25, %x[state]\n\t" + "bic v25.16b, v12.16b, v11.16b\n\t" + "eor x27, x27, x30\n\t" + "bic v26.16b, v13.16b, v12.16b\n\t" + "bic v27.16b, v14.16b, v13.16b\n\t" + "bic v28.16b, v30.16b, v14.16b\n\t" + "bic v29.16b, v11.16b, v30.16b\n\t" + "eor v10.16b, v30.16b, v25.16b\n\t" + "eor v11.16b, v11.16b, v26.16b\n\t" + "eor v12.16b, v12.16b, v27.16b\n\t" + "eor v13.16b, v13.16b, v28.16b\n\t" + "eor v14.16b, v14.16b, v29.16b\n\t" + "bic v25.16b, v17.16b, v16.16b\n\t" + "bic v26.16b, v18.16b, v17.16b\n\t" + "bic v27.16b, v19.16b, v18.16b\n\t" + "bic v28.16b, v15.16b, v19.16b\n\t" + "bic v29.16b, v16.16b, v15.16b\n\t" + "eor v15.16b, v15.16b, v25.16b\n\t" + "eor v16.16b, v16.16b, v26.16b\n\t" + "eor v17.16b, v17.16b, v27.16b\n\t" + "eor v18.16b, v18.16b, v28.16b\n\t" + "eor v19.16b, v19.16b, v29.16b\n\t" + "bic v25.16b, v22.16b, v21.16b\n\t" + "bic v26.16b, v23.16b, v22.16b\n\t" + "bic v27.16b, v24.16b, v23.16b\n\t" + "bic v28.16b, v20.16b, v24.16b\n\t" + "bic v29.16b, v21.16b, v20.16b\n\t" + "eor v20.16b, v20.16b, v25.16b\n\t" + "eor v21.16b, v21.16b, v26.16b\n\t" + "eor v22.16b, v22.16b, v27.16b\n\t" + "eor v23.16b, v23.16b, v28.16b\n\t" + "eor v24.16b, v24.16b, v29.16b\n\t" + /* Done tranforming */ + "ldp x28, %x[seed], [x29, #48]\n\t" + "ldr %x[state], [x28], #8\n\t" + "subs %x[seed], %x[seed], #1\n\t" + "mov v30.d[0], %x[state]\n\t" + "mov v30.d[1], %x[state]\n\t" + "eor x2, x2, %x[state]\n\t" + "eor v0.16b, v0.16b, v30.16b\n\t" + "bne L_SHA3_shake128_blocksx3_seed_neon_begin_%=\n\t" + "ldr %x[state], [x29, #40]\n\t" + "st4 {v0.d, v1.d, v2.d, v3.d}[0], 
[%x[state]], #32\n\t" + "st4 {v4.d, v5.d, v6.d, v7.d}[0], [%x[state]], #32\n\t" + "st4 {v8.d, v9.d, v10.d, v11.d}[0], [%x[state]], #32\n\t" + "st4 {v12.d, v13.d, v14.d, v15.d}[0], [%x[state]], #32\n\t" + "st4 {v16.d, v17.d, v18.d, v19.d}[0], [%x[state]], #32\n\t" + "st4 {v20.d, v21.d, v22.d, v23.d}[0], [%x[state]], #32\n\t" + "st1 {v24.d}[0], [%x[state]]\n\t" + "add %x[state], %x[state], #8\n\t" + "st4 {v0.d, v1.d, v2.d, v3.d}[1], [%x[state]], #32\n\t" + "st4 {v4.d, v5.d, v6.d, v7.d}[1], [%x[state]], #32\n\t" + "st4 {v8.d, v9.d, v10.d, v11.d}[1], [%x[state]], #32\n\t" + "st4 {v12.d, v13.d, v14.d, v15.d}[1], [%x[state]], #32\n\t" + "st4 {v16.d, v17.d, v18.d, v19.d}[1], [%x[state]], #32\n\t" + "st4 {v20.d, v21.d, v22.d, v23.d}[1], [%x[state]], #32\n\t" + "st1 {v24.d}[1], [%x[state]]\n\t" + "add %x[state], %x[state], #8\n\t" + "stp x2, x3, [%x[state]]\n\t" + "stp x4, x5, [%x[state], #16]\n\t" + "stp x6, x7, [%x[state], #32]\n\t" + "stp x8, x9, [%x[state], #48]\n\t" + "stp x10, x11, [%x[state], #64]\n\t" + "stp x12, x13, [%x[state], #80]\n\t" + "stp x14, x15, [%x[state], #96]\n\t" + "stp x16, x17, [%x[state], #112]\n\t" + "stp x19, x20, [%x[state], #128]\n\t" + "stp x21, x22, [%x[state], #144]\n\t" + "stp x23, x24, [%x[state], #160]\n\t" + "stp x25, x26, [%x[state], #176]\n\t" + "str x27, [%x[state], #192]\n\t" + "ldp x29, x30, [sp], #0x40\n\t" + : [state] "+r" (state), [seed] "+r" (seed) + : [L_SHA3_shake128_blocksx3_seed_neon_r] "S" (L_SHA3_shake128_blocksx3_seed_neon_r) + : "memory", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc" + ); +} + +static const uint64_t L_SHA3_shake256_blocksx3_seed_neon_r[] = { + 0x1UL, + 0x8082UL, + 0x800000000000808aUL, + 0x8000000080008000UL, + 0x808bUL, + 0x80000001UL, + 0x8000000080008081UL, + 0x8000000000008009UL, + 0x8aUL, + 0x88UL, + 0x80008009UL, + 0x8000000aUL, + 0x8000808bUL, + 0x800000000000008bUL, + 0x8000000000008089UL, + 0x8000000000008003UL, + 0x8000000000008002UL, + 0x8000000000000080UL, + 0x800aUL, + 0x800000008000000aUL, + 0x8000000080008081UL, + 0x8000000000008080UL, + 0x80000001UL, + 0x8000000080008008UL, +}; + +void kyber_shake256_blocksx3_seed_neon(word64* state, byte* seed) +{ + __asm__ __volatile__ ( + "stp x29, x30, [sp, #-64]!\n\t" + "add x29, sp, #0\n\t" +#ifndef __APPLE__ + "adrp x28, %[L_SHA3_shake256_blocksx3_seed_neon_r]\n\t" + "add x28, x28, :lo12:%[L_SHA3_shake256_blocksx3_seed_neon_r]\n\t" +#else + "adrp x28, %[L_SHA3_shake256_blocksx3_seed_neon_r]@PAGE\n\t" + "add x28, x28, %[L_SHA3_shake256_blocksx3_seed_neon_r]@PAGEOFF\n\t" +#endif /* __APPLE__ */ + "str %x[state], [x29, #40]\n\t" + "add %x[state], %x[state], #32\n\t" + "ld1 {v4.d}[0], [%x[state]]\n\t" + "ldp x2, x3, [%x[seed]], #16\n\t" + "add %x[state], %x[state], #0xc8\n\t" + "ld1 {v4.d}[1], [%x[state]]\n\t" + "ldp x4, x5, [%x[seed]], #16\n\t" + "ldr x6, [%x[state], #200]\n\t" + "eor v5.16b, v5.16b, v5.16b\n\t" + "eor x7, x7, x7\n\t" + "eor v6.16b, v6.16b, v6.16b\n\t" + "eor x8, x8, x8\n\t" + "eor v7.16b, v7.16b, v7.16b\n\t" + "eor x9, x9, x9\n\t" + "eor v8.16b, v8.16b, v8.16b\n\t" + "eor x10, x10, x10\n\t" + "eor v9.16b, v9.16b, v9.16b\n\t" + "eor x11, x11, x11\n\t" + "eor v10.16b, v10.16b, v10.16b\n\t" + "eor x12, x12, x12\n\t" + 
"eor v11.16b, v11.16b, v11.16b\n\t" + "eor x13, x13, x13\n\t" + "eor v12.16b, v12.16b, v12.16b\n\t" + "eor x14, x14, x14\n\t" + "eor v13.16b, v13.16b, v13.16b\n\t" + "eor x15, x15, x15\n\t" + "eor v14.16b, v14.16b, v14.16b\n\t" + "eor x16, x16, x16\n\t" + "eor v15.16b, v15.16b, v15.16b\n\t" + "eor x17, x17, x17\n\t" + "movz x19, #0x8000, lsl 48\n\t" + "eor v17.16b, v17.16b, v17.16b\n\t" + "eor x20, x20, x20\n\t" + "eor v18.16b, v18.16b, v18.16b\n\t" + "eor x21, x21, x21\n\t" + "eor v19.16b, v19.16b, v19.16b\n\t" + "eor x22, x22, x22\n\t" + "eor v20.16b, v20.16b, v20.16b\n\t" + "eor x23, x23, x23\n\t" + "eor v21.16b, v21.16b, v21.16b\n\t" + "eor x24, x24, x24\n\t" + "eor v22.16b, v22.16b, v22.16b\n\t" + "eor x25, x25, x25\n\t" + "eor v23.16b, v23.16b, v23.16b\n\t" + "eor x26, x26, x26\n\t" + "eor v24.16b, v24.16b, v24.16b\n\t" + "eor x27, x27, x27\n\t" + "dup v0.2d, x2\n\t" + "dup v1.2d, x3\n\t" + "dup v2.2d, x4\n\t" + "dup v3.2d, x5\n\t" + "dup v16.2d, x19\n\t" + "mov %x[seed], #24\n\t" + /* Start of 24 rounds */ + "\n" + "L_SHA3_shake256_blocksx3_seed_neon_begin_%=: \n\t" + "stp x28, %x[seed], [x29, #48]\n\t" + /* Col Mix NEON */ + "eor v30.16b, v4.16b, v9.16b\n\t" + "eor %x[state], x6, x11\n\t" + "eor v27.16b, v1.16b, v6.16b\n\t" + "eor x30, x2, x7\n\t" + "eor v30.16b, v30.16b, v14.16b\n\t" + "eor x28, x4, x9\n\t" + "eor v27.16b, v27.16b, v11.16b\n\t" + "eor %x[state], %x[state], x16\n\t" + "eor v30.16b, v30.16b, v19.16b\n\t" + "eor x30, x30, x12\n\t" + "eor v27.16b, v27.16b, v16.16b\n\t" + "eor x28, x28, x14\n\t" + "eor v30.16b, v30.16b, v24.16b\n\t" + "eor %x[state], %x[state], x22\n\t" + "eor v27.16b, v27.16b, v21.16b\n\t" + "eor x30, x30, x17\n\t" + "ushr v25.2d, v27.2d, #63\n\t" + "eor x28, x28, x20\n\t" + "sli v25.2d, v27.2d, #1\n\t" + "eor %x[state], %x[state], x27\n\t" + "eor v25.16b, v25.16b, v30.16b\n\t" + "eor x30, x30, x23\n\t" + "eor v31.16b, v0.16b, v5.16b\n\t" + "eor x28, x28, x25\n\t" + "eor v28.16b, v2.16b, v7.16b\n\t" + "str %x[state], [x29, #32]\n\t" + "eor v31.16b, v31.16b, v10.16b\n\t" + "str x28, [x29, #24]\n\t" + "eor v28.16b, v28.16b, v12.16b\n\t" + "eor %x[seed], x3, x8\n\t" + "eor v31.16b, v31.16b, v15.16b\n\t" + "eor x28, x5, x10\n\t" + "eor v28.16b, v28.16b, v17.16b\n\t" + "eor %x[seed], %x[seed], x13\n\t" + "eor v31.16b, v31.16b, v20.16b\n\t" + "eor x28, x28, x15\n\t" + "eor v28.16b, v28.16b, v22.16b\n\t" + "eor %x[seed], %x[seed], x19\n\t" + "ushr v29.2d, v30.2d, #63\n\t" + "eor x28, x28, x21\n\t" + "ushr v26.2d, v28.2d, #63\n\t" + "eor %x[seed], %x[seed], x24\n\t" + "sli v29.2d, v30.2d, #1\n\t" + "eor x28, x28, x26\n\t" + "sli v26.2d, v28.2d, #1\n\t" + "eor %x[state], %x[state], %x[seed], ror 63\n\t" + "eor v28.16b, v28.16b, v29.16b\n\t" + "eor %x[seed], %x[seed], x28, ror 63\n\t" + "eor v29.16b, v3.16b, v8.16b\n\t" + "eor x2, x2, %x[state]\n\t" + "eor v26.16b, v26.16b, v31.16b\n\t" + "eor x7, x7, %x[state]\n\t" + "eor v29.16b, v29.16b, v13.16b\n\t" + "eor x12, x12, %x[state]\n\t" + "eor v29.16b, v29.16b, v18.16b\n\t" + "eor x17, x17, %x[state]\n\t" + "eor v29.16b, v29.16b, v23.16b\n\t" + "eor x23, x23, %x[state]\n\t" + "ushr v30.2d, v29.2d, #63\n\t" + "eor x4, x4, %x[seed]\n\t" + "sli v30.2d, v29.2d, #1\n\t" + "eor x9, x9, %x[seed]\n\t" + "eor v27.16b, v27.16b, v30.16b\n\t" + "eor x14, x14, %x[seed]\n\t" + "ushr v30.2d, v31.2d, #63\n\t" + "eor x20, x20, %x[seed]\n\t" + "sli v30.2d, v31.2d, #1\n\t" + "eor x25, x25, %x[seed]\n\t" + "eor v29.16b, v29.16b, v30.16b\n\t" + "ldr %x[state], [x29, #32]\n\t" + /* Swap Rotate NEON */ + "eor v0.16b, v0.16b, 
v25.16b\n\t" + "eor v31.16b, v1.16b, v26.16b\n\t" + "ldr %x[seed], [x29, #24]\n\t" + "eor v6.16b, v6.16b, v26.16b\n\t" + "eor x28, x28, x30, ror 63\n\t" + "ushr v30.2d, v31.2d, #63\n\t" + "eor x30, x30, %x[seed], ror 63\n\t" + "ushr v1.2d, v6.2d, #20\n\t" + "eor %x[seed], %x[seed], %x[state], ror 63\n\t" + "sli v30.2d, v31.2d, #1\n\t" + "eor x6, x6, x28\n\t" + "sli v1.2d, v6.2d, #44\n\t" + "eor x11, x11, x28\n\t" + "eor v31.16b, v9.16b, v29.16b\n\t" + "eor x16, x16, x28\n\t" + "eor v22.16b, v22.16b, v27.16b\n\t" + "eor x22, x22, x28\n\t" + "ushr v6.2d, v31.2d, #44\n\t" + "eor x27, x27, x28\n\t" + "ushr v9.2d, v22.2d, #3\n\t" + "eor x3, x3, x30\n\t" + "sli v6.2d, v31.2d, #20\n\t" + "eor x8, x8, x30\n\t" + "sli v9.2d, v22.2d, #61\n\t" + "eor x13, x13, x30\n\t" + "eor v31.16b, v14.16b, v29.16b\n\t" + "eor x19, x19, x30\n\t" + "eor v20.16b, v20.16b, v25.16b\n\t" + "eor x24, x24, x30\n\t" + "ushr v22.2d, v31.2d, #25\n\t" + "eor x5, x5, %x[seed]\n\t" + "ushr v14.2d, v20.2d, #46\n\t" + "eor x10, x10, %x[seed]\n\t" + "sli v22.2d, v31.2d, #39\n\t" + "eor x15, x15, %x[seed]\n\t" + "sli v14.2d, v20.2d, #18\n\t" + "eor x21, x21, %x[seed]\n\t" + "eor v31.16b, v2.16b, v27.16b\n\t" + "eor x26, x26, %x[seed]\n\t" + /* Swap Rotate Base */ + "eor v12.16b, v12.16b, v27.16b\n\t" + "ror %x[state], x3, #63\n\t" + "ushr v20.2d, v31.2d, #2\n\t" + "ror x3, x8, #20\n\t" + "ushr v2.2d, v12.2d, #21\n\t" + "ror x8, x11, #44\n\t" + "sli v20.2d, v31.2d, #62\n\t" + "ror x11, x25, #3\n\t" + "sli v2.2d, v12.2d, #43\n\t" + "ror x25, x16, #25\n\t" + "eor v31.16b, v13.16b, v28.16b\n\t" + "ror x16, x23, #46\n\t" + "eor v19.16b, v19.16b, v29.16b\n\t" + "ror x23, x4, #2\n\t" + "ushr v12.2d, v31.2d, #39\n\t" + "ror x4, x14, #21\n\t" + "ushr v13.2d, v19.2d, #56\n\t" + "ror x14, x15, #39\n\t" + "sli v12.2d, v31.2d, #25\n\t" + "ror x15, x22, #56\n\t" + "sli v13.2d, v19.2d, #8\n\t" + "ror x22, x26, #8\n\t" + "eor v31.16b, v23.16b, v28.16b\n\t" + "ror x26, x17, #23\n\t" + "eor v15.16b, v15.16b, v25.16b\n\t" + "ror x17, x6, #37\n\t" + "ushr v19.2d, v31.2d, #8\n\t" + "ror x6, x27, #50\n\t" + "ushr v23.2d, v15.2d, #23\n\t" + "ror x27, x24, #62\n\t" + "sli v19.2d, v31.2d, #56\n\t" + "ror x24, x10, #9\n\t" + "sli v23.2d, v15.2d, #41\n\t" + "ror x10, x19, #19\n\t" + "eor v31.16b, v4.16b, v29.16b\n\t" + "ror x19, x7, #28\n\t" + "eor v24.16b, v24.16b, v29.16b\n\t" + "ror x7, x5, #36\n\t" + "ushr v15.2d, v31.2d, #37\n\t" + "ror x5, x21, #43\n\t" + "ushr v4.2d, v24.2d, #50\n\t" + "ror x21, x20, #49\n\t" + "sli v15.2d, v31.2d, #27\n\t" + "ror x20, x13, #54\n\t" + "sli v4.2d, v24.2d, #14\n\t" + "ror x13, x9, #58\n\t" + "eor v31.16b, v21.16b, v26.16b\n\t" + "ror x9, x12, #61\n\t" + /* Row Mix Base */ + "eor v8.16b, v8.16b, v28.16b\n\t" + "bic x12, x4, x3\n\t" + "ushr v24.2d, v31.2d, #62\n\t" + "bic %x[seed], x5, x4\n\t" + "ushr v21.2d, v8.2d, #9\n\t" + "bic x28, x2, x6\n\t" + "sli v24.2d, v31.2d, #2\n\t" + "bic x30, x3, x2\n\t" + "sli v21.2d, v8.2d, #55\n\t" + "eor x2, x2, x12\n\t" + "eor v31.16b, v16.16b, v26.16b\n\t" + "eor x3, x3, %x[seed]\n\t" + "eor v5.16b, v5.16b, v25.16b\n\t" + "bic x12, x6, x5\n\t" + "ushr v8.2d, v31.2d, #19\n\t" + "eor x5, x5, x28\n\t" + "ushr v16.2d, v5.2d, #28\n\t" + "eor x4, x4, x12\n\t" + "sli v8.2d, v31.2d, #45\n\t" + "eor x6, x6, x30\n\t" + "sli v16.2d, v5.2d, #36\n\t" + "bic x12, x9, x8\n\t" + "eor v31.16b, v3.16b, v28.16b\n\t" + "bic %x[seed], x10, x9\n\t" + "eor v18.16b, v18.16b, v28.16b\n\t" + "bic x28, x7, x11\n\t" + "ushr v5.2d, v31.2d, #36\n\t" + "bic x30, x8, x7\n\t" + "ushr v3.2d, v18.2d, #43\n\t" + "eor 
x7, x7, x12\n\t" + "sli v5.2d, v31.2d, #28\n\t" + "eor x8, x8, %x[seed]\n\t" + "sli v3.2d, v18.2d, #21\n\t" + "bic x12, x11, x10\n\t" + "eor v31.16b, v17.16b, v27.16b\n\t" + "eor x10, x10, x28\n\t" + "eor v11.16b, v11.16b, v26.16b\n\t" + "eor x9, x9, x12\n\t" + "ushr v18.2d, v31.2d, #49\n\t" + "eor x11, x11, x30\n\t" + "ushr v17.2d, v11.2d, #54\n\t" + "bic x12, x14, x13\n\t" + "sli v18.2d, v31.2d, #15\n\t" + "bic %x[seed], x15, x14\n\t" + "sli v17.2d, v11.2d, #10\n\t" + "bic x28, %x[state], x16\n\t" + "eor v31.16b, v7.16b, v27.16b\n\t" + "bic x30, x13, %x[state]\n\t" + "eor v10.16b, v10.16b, v25.16b\n\t" + "eor x12, %x[state], x12\n\t" + "ushr v11.2d, v31.2d, #58\n\t" + "eor x13, x13, %x[seed]\n\t" + "ushr v7.2d, v10.2d, #61\n\t" + "bic %x[state], x16, x15\n\t" + "sli v11.2d, v31.2d, #6\n\t" + "eor x15, x15, x28\n\t" + "sli v7.2d, v10.2d, #3\n\t" + "eor x14, x14, %x[state]\n\t" + /* Row Mix NEON */ + "bic v25.16b, v2.16b, v1.16b\n\t" + "eor x16, x16, x30\n\t" + "bic v26.16b, v3.16b, v2.16b\n\t" + "bic %x[state], x20, x19\n\t" + "bic v27.16b, v4.16b, v3.16b\n\t" + "bic %x[seed], x21, x20\n\t" + "bic v28.16b, v0.16b, v4.16b\n\t" + "bic x28, x17, x22\n\t" + "bic v29.16b, v1.16b, v0.16b\n\t" + "bic x30, x19, x17\n\t" + "eor v0.16b, v0.16b, v25.16b\n\t" + "eor x17, x17, %x[state]\n\t" + "eor v1.16b, v1.16b, v26.16b\n\t" + "eor x19, x19, %x[seed]\n\t" + "eor v2.16b, v2.16b, v27.16b\n\t" + "bic %x[state], x22, x21\n\t" + "eor v3.16b, v3.16b, v28.16b\n\t" + "eor x21, x21, x28\n\t" + "eor v4.16b, v4.16b, v29.16b\n\t" + "eor x20, x20, %x[state]\n\t" + "bic v25.16b, v7.16b, v6.16b\n\t" + "eor x22, x22, x30\n\t" + "bic v26.16b, v8.16b, v7.16b\n\t" + "bic %x[state], x25, x24\n\t" + "bic v27.16b, v9.16b, v8.16b\n\t" + "bic %x[seed], x26, x25\n\t" + "bic v28.16b, v5.16b, v9.16b\n\t" + "bic x28, x23, x27\n\t" + "bic v29.16b, v6.16b, v5.16b\n\t" + "bic x30, x24, x23\n\t" + "eor v5.16b, v5.16b, v25.16b\n\t" + "eor x23, x23, %x[state]\n\t" + "eor v6.16b, v6.16b, v26.16b\n\t" + "eor x24, x24, %x[seed]\n\t" + "eor v7.16b, v7.16b, v27.16b\n\t" + "bic %x[state], x27, x26\n\t" + "eor v8.16b, v8.16b, v28.16b\n\t" + "eor x26, x26, x28\n\t" + "eor v9.16b, v9.16b, v29.16b\n\t" + "eor x25, x25, %x[state]\n\t" + "bic v25.16b, v12.16b, v11.16b\n\t" + "eor x27, x27, x30\n\t" + "bic v26.16b, v13.16b, v12.16b\n\t" + "bic v27.16b, v14.16b, v13.16b\n\t" + "bic v28.16b, v30.16b, v14.16b\n\t" + "bic v29.16b, v11.16b, v30.16b\n\t" + "eor v10.16b, v30.16b, v25.16b\n\t" + "eor v11.16b, v11.16b, v26.16b\n\t" + "eor v12.16b, v12.16b, v27.16b\n\t" + "eor v13.16b, v13.16b, v28.16b\n\t" + "eor v14.16b, v14.16b, v29.16b\n\t" + "bic v25.16b, v17.16b, v16.16b\n\t" + "bic v26.16b, v18.16b, v17.16b\n\t" + "bic v27.16b, v19.16b, v18.16b\n\t" + "bic v28.16b, v15.16b, v19.16b\n\t" + "bic v29.16b, v16.16b, v15.16b\n\t" + "eor v15.16b, v15.16b, v25.16b\n\t" + "eor v16.16b, v16.16b, v26.16b\n\t" + "eor v17.16b, v17.16b, v27.16b\n\t" + "eor v18.16b, v18.16b, v28.16b\n\t" + "eor v19.16b, v19.16b, v29.16b\n\t" + "bic v25.16b, v22.16b, v21.16b\n\t" + "bic v26.16b, v23.16b, v22.16b\n\t" + "bic v27.16b, v24.16b, v23.16b\n\t" + "bic v28.16b, v20.16b, v24.16b\n\t" + "bic v29.16b, v21.16b, v20.16b\n\t" + "eor v20.16b, v20.16b, v25.16b\n\t" + "eor v21.16b, v21.16b, v26.16b\n\t" + "eor v22.16b, v22.16b, v27.16b\n\t" + "eor v23.16b, v23.16b, v28.16b\n\t" + "eor v24.16b, v24.16b, v29.16b\n\t" + /* Done tranforming */ + "ldp x28, %x[seed], [x29, #48]\n\t" + "ldr %x[state], [x28], #8\n\t" + "subs %x[seed], %x[seed], #1\n\t" + "mov v30.d[0], %x[state]\n\t" + 
"mov v30.d[1], %x[state]\n\t" + "eor x2, x2, %x[state]\n\t" + "eor v0.16b, v0.16b, v30.16b\n\t" + "bne L_SHA3_shake256_blocksx3_seed_neon_begin_%=\n\t" + "ldr %x[state], [x29, #40]\n\t" + "st4 {v0.d, v1.d, v2.d, v3.d}[0], [%x[state]], #32\n\t" + "st4 {v4.d, v5.d, v6.d, v7.d}[0], [%x[state]], #32\n\t" + "st4 {v8.d, v9.d, v10.d, v11.d}[0], [%x[state]], #32\n\t" + "st4 {v12.d, v13.d, v14.d, v15.d}[0], [%x[state]], #32\n\t" + "st4 {v16.d, v17.d, v18.d, v19.d}[0], [%x[state]], #32\n\t" + "st4 {v20.d, v21.d, v22.d, v23.d}[0], [%x[state]], #32\n\t" + "st1 {v24.d}[0], [%x[state]]\n\t" + "add %x[state], %x[state], #8\n\t" + "st4 {v0.d, v1.d, v2.d, v3.d}[1], [%x[state]], #32\n\t" + "st4 {v4.d, v5.d, v6.d, v7.d}[1], [%x[state]], #32\n\t" + "st4 {v8.d, v9.d, v10.d, v11.d}[1], [%x[state]], #32\n\t" + "st4 {v12.d, v13.d, v14.d, v15.d}[1], [%x[state]], #32\n\t" + "st4 {v16.d, v17.d, v18.d, v19.d}[1], [%x[state]], #32\n\t" + "st4 {v20.d, v21.d, v22.d, v23.d}[1], [%x[state]], #32\n\t" + "st1 {v24.d}[1], [%x[state]]\n\t" + "add %x[state], %x[state], #8\n\t" + "stp x2, x3, [%x[state]]\n\t" + "stp x4, x5, [%x[state], #16]\n\t" + "stp x6, x7, [%x[state], #32]\n\t" + "stp x8, x9, [%x[state], #48]\n\t" + "stp x10, x11, [%x[state], #64]\n\t" + "stp x12, x13, [%x[state], #80]\n\t" + "stp x14, x15, [%x[state], #96]\n\t" + "stp x16, x17, [%x[state], #112]\n\t" + "stp x19, x20, [%x[state], #128]\n\t" + "stp x21, x22, [%x[state], #144]\n\t" + "stp x23, x24, [%x[state], #160]\n\t" + "stp x25, x26, [%x[state], #176]\n\t" + "str x27, [%x[state], #192]\n\t" + "ldp x29, x30, [sp], #0x40\n\t" + : [state] "+r" (state), [seed] "+r" (seed) + : [L_SHA3_shake256_blocksx3_seed_neon_r] "S" (L_SHA3_shake256_blocksx3_seed_neon_r) + : "memory", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc" + ); +} + +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ +#endif /* WOLFSSL_WC_KYBER */ +#endif /* __aarch64__ */ +#endif /* WOLFSSL_ARMASM */ +#endif /* WOLFSSL_ARMASM_INLINE */ diff --git a/wolfcrypt/src/port/arm/armv8-sha3-asm.S b/wolfcrypt/src/port/arm/armv8-sha3-asm.S index 1652f41b4c..8dc8a20803 100644 --- a/wolfcrypt/src/port/arm/armv8-sha3-asm.S +++ b/wolfcrypt/src/port/arm/armv8-sha3-asm.S @@ -206,6 +206,251 @@ L_sha3_crypto_begin: #ifndef __APPLE__ .size BlockSha3,.-BlockSha3 #endif /* __APPLE__ */ +#else +#ifndef __APPLE__ + .text + .type L_SHA3_transform_base_r, %object + .section .rodata + .size L_SHA3_transform_base_r, 192 +#else + .section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ + .align 3 +#else + .p2align 3 +#endif /* __APPLE__ */ +L_SHA3_transform_base_r: + .xword 0x1 + .xword 0x8082 + .xword 0x800000000000808a + .xword 0x8000000080008000 + .xword 0x808b + .xword 0x80000001 + .xword 0x8000000080008081 + .xword 0x8000000000008009 + .xword 0x8a + .xword 0x88 + .xword 0x80008009 + .xword 0x8000000a + .xword 0x8000808b + .xword 0x800000000000008b + .xword 0x8000000000008089 + .xword 0x8000000000008003 + .xword 0x8000000000008002 + .xword 0x8000000000000080 + .xword 0x800a + .xword 0x800000008000000a + .xword 0x8000000080008081 + .xword 0x8000000000008080 + .xword 0x80000001 + .xword 0x8000000080008008 +#ifndef __APPLE__ +.text +.globl BlockSha3 +.type BlockSha3,@function +.align 2 
+BlockSha3: +#else +.section __TEXT,__text +.globl _BlockSha3 +.p2align 2 +_BlockSha3: +#endif /* __APPLE__ */ + stp x29, x30, [sp, #-160]! + add x29, sp, #0 + stp x17, x19, [x29, #72] + stp x20, x21, [x29, #88] + stp x22, x23, [x29, #104] + stp x24, x25, [x29, #120] + stp x26, x27, [x29, #136] + str x28, [x29, #152] +#ifndef __APPLE__ + adrp x27, L_SHA3_transform_base_r + add x27, x27, :lo12:L_SHA3_transform_base_r +#else + adrp x27, L_SHA3_transform_base_r@PAGE + add x27, x27, :lo12:L_SHA3_transform_base_r@PAGEOFF +#endif /* __APPLE__ */ + ldp x1, x2, [x0] + ldp x3, x4, [x0, #16] + ldp x5, x6, [x0, #32] + ldp x7, x8, [x0, #48] + ldp x9, x10, [x0, #64] + ldp x11, x12, [x0, #80] + ldp x13, x14, [x0, #96] + ldp x15, x16, [x0, #112] + ldp x17, x19, [x0, #128] + ldp x20, x21, [x0, #144] + ldp x22, x23, [x0, #160] + ldp x24, x25, [x0, #176] + ldr x26, [x0, #192] + str x0, [x29, #40] + mov x28, #24 + # Start of 24 rounds +L_SHA3_transform_base_begin: + stp x27, x28, [x29, #48] + eor x0, x5, x10 + eor x30, x1, x6 + eor x28, x3, x8 + eor x0, x0, x15 + eor x30, x30, x11 + eor x28, x28, x13 + eor x0, x0, x21 + eor x30, x30, x16 + eor x28, x28, x19 + eor x0, x0, x26 + eor x30, x30, x22 + eor x28, x28, x24 + str x0, [x29, #32] + str x28, [x29, #24] + eor x27, x2, x7 + eor x28, x4, x9 + eor x27, x27, x12 + eor x28, x28, x14 + eor x27, x27, x17 + eor x28, x28, x20 + eor x27, x27, x23 + eor x28, x28, x25 + eor x0, x0, x27, ror 63 + eor x27, x27, x28, ror 63 + eor x1, x1, x0 + eor x6, x6, x0 + eor x11, x11, x0 + eor x16, x16, x0 + eor x22, x22, x0 + eor x3, x3, x27 + eor x8, x8, x27 + eor x13, x13, x27 + eor x19, x19, x27 + eor x24, x24, x27 + ldr x0, [x29, #32] + ldr x27, [x29, #24] + eor x28, x28, x30, ror 63 + eor x30, x30, x27, ror 63 + eor x27, x27, x0, ror 63 + eor x5, x5, x28 + eor x10, x10, x28 + eor x15, x15, x28 + eor x21, x21, x28 + eor x26, x26, x28 + eor x2, x2, x30 + eor x7, x7, x30 + eor x12, x12, x30 + eor x17, x17, x30 + eor x23, x23, x30 + eor x4, x4, x27 + eor x9, x9, x27 + eor x14, x14, x27 + eor x20, x20, x27 + eor x25, x25, x27 + # Swap Rotate + ror x0, x2, #63 + ror x2, x7, #20 + ror x7, x10, #44 + ror x10, x24, #3 + ror x24, x15, #25 + ror x15, x22, #46 + ror x22, x3, #2 + ror x3, x13, #21 + ror x13, x14, #39 + ror x14, x21, #56 + ror x21, x25, #8 + ror x25, x16, #23 + ror x16, x5, #37 + ror x5, x26, #50 + ror x26, x23, #62 + ror x23, x9, #9 + ror x9, x17, #19 + ror x17, x6, #28 + ror x6, x4, #36 + ror x4, x20, #43 + ror x20, x19, #49 + ror x19, x12, #54 + ror x12, x8, #58 + ror x8, x11, #61 + # Row Mix + bic x11, x3, x2 + bic x27, x4, x3 + bic x28, x1, x5 + bic x30, x2, x1 + eor x1, x1, x11 + eor x2, x2, x27 + bic x11, x5, x4 + eor x4, x4, x28 + eor x3, x3, x11 + eor x5, x5, x30 + bic x11, x8, x7 + bic x27, x9, x8 + bic x28, x6, x10 + bic x30, x7, x6 + eor x6, x6, x11 + eor x7, x7, x27 + bic x11, x10, x9 + eor x9, x9, x28 + eor x8, x8, x11 + eor x10, x10, x30 + bic x11, x13, x12 + bic x27, x14, x13 + bic x28, x0, x15 + bic x30, x12, x0 + eor x11, x0, x11 + eor x12, x12, x27 + bic x0, x15, x14 + eor x14, x14, x28 + eor x13, x13, x0 + eor x15, x15, x30 + bic x0, x19, x17 + bic x27, x20, x19 + bic x28, x16, x21 + bic x30, x17, x16 + eor x16, x16, x0 + eor x17, x17, x27 + bic x0, x21, x20 + eor x20, x20, x28 + eor x19, x19, x0 + eor x21, x21, x30 + bic x0, x24, x23 + bic x27, x25, x24 + bic x28, x22, x26 + bic x30, x23, x22 + eor x22, x22, x0 + eor x23, x23, x27 + bic x0, x26, x25 + eor x25, x25, x28 + eor x24, x24, x0 + eor x26, x26, x30 + # Done tranforming + ldp x27, x28, [x29, 
#48] + ldr x0, [x27], #8 + subs x28, x28, #1 + eor x1, x1, x0 + bne L_SHA3_transform_base_begin + ldr x0, [x29, #40] + stp x1, x2, [x0] + stp x3, x4, [x0, #16] + stp x5, x6, [x0, #32] + stp x7, x8, [x0, #48] + stp x9, x10, [x0, #64] + stp x11, x12, [x0, #80] + stp x13, x14, [x0, #96] + stp x15, x16, [x0, #112] + stp x17, x19, [x0, #128] + stp x20, x21, [x0, #144] + stp x22, x23, [x0, #160] + stp x24, x25, [x0, #176] + str x26, [x0, #192] + ldp x17, x19, [x29, #72] + ldp x20, x21, [x29, #88] + ldp x22, x23, [x29, #104] + ldp x24, x25, [x29, #120] + ldp x26, x27, [x29, #136] + ldr x28, [x29, #152] + ldp x29, x30, [sp], #0xa0 + ret +#ifndef __APPLE__ + .size BlockSha3,.-BlockSha3 +#endif /* __APPLE__ */ #endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ #endif /* WOLFSSL_SHA3 */ #endif /* __aarch64__ */ diff --git a/wolfcrypt/src/port/arm/armv8-sha3-asm_c.c b/wolfcrypt/src/port/arm/armv8-sha3-asm_c.c index bb4114d42b..e52d02de1b 100644 --- a/wolfcrypt/src/port/arm/armv8-sha3-asm_c.c +++ b/wolfcrypt/src/port/arm/armv8-sha3-asm_c.c @@ -181,6 +181,222 @@ void BlockSha3(unsigned long* state) ); } +#else +static const uint64_t L_SHA3_transform_base_r[] = { + 0x1UL, + 0x8082UL, + 0x800000000000808aUL, + 0x8000000080008000UL, + 0x808bUL, + 0x80000001UL, + 0x8000000080008081UL, + 0x8000000000008009UL, + 0x8aUL, + 0x88UL, + 0x80008009UL, + 0x8000000aUL, + 0x8000808bUL, + 0x800000000000008bUL, + 0x8000000000008089UL, + 0x8000000000008003UL, + 0x8000000000008002UL, + 0x8000000000000080UL, + 0x800aUL, + 0x800000008000000aUL, + 0x8000000080008081UL, + 0x8000000000008080UL, + 0x80000001UL, + 0x8000000080008008UL, +}; + +void BlockSha3(unsigned long* state) +{ + __asm__ __volatile__ ( + "stp x29, x30, [sp, #-64]!\n\t" + "add x29, sp, #0\n\t" +#ifndef __APPLE__ + "adrp x27, %[L_SHA3_transform_base_r]\n\t" + "add x27, x27, :lo12:%[L_SHA3_transform_base_r]\n\t" +#else + "adrp x27, %[L_SHA3_transform_base_r]@PAGE\n\t" + "add x27, x27, %[L_SHA3_transform_base_r]@PAGEOFF\n\t" +#endif /* __APPLE__ */ + "ldp x1, x2, [%x[state]]\n\t" + "ldp x3, x4, [%x[state], #16]\n\t" + "ldp x5, x6, [%x[state], #32]\n\t" + "ldp x7, x8, [%x[state], #48]\n\t" + "ldp x9, x10, [%x[state], #64]\n\t" + "ldp x11, x12, [%x[state], #80]\n\t" + "ldp x13, x14, [%x[state], #96]\n\t" + "ldp x15, x16, [%x[state], #112]\n\t" + "ldp x17, x19, [%x[state], #128]\n\t" + "ldp x20, x21, [%x[state], #144]\n\t" + "ldp x22, x23, [%x[state], #160]\n\t" + "ldp x24, x25, [%x[state], #176]\n\t" + "ldr x26, [%x[state], #192]\n\t" + "str %x[state], [x29, #40]\n\t" + "mov x28, #24\n\t" + /* Start of 24 rounds */ + "\n" + "L_SHA3_transform_base_begin_%=: \n\t" + "stp x27, x28, [x29, #48]\n\t" + "eor %x[state], x5, x10\n\t" + "eor x30, x1, x6\n\t" + "eor x28, x3, x8\n\t" + "eor %x[state], %x[state], x15\n\t" + "eor x30, x30, x11\n\t" + "eor x28, x28, x13\n\t" + "eor %x[state], %x[state], x21\n\t" + "eor x30, x30, x16\n\t" + "eor x28, x28, x19\n\t" + "eor %x[state], %x[state], x26\n\t" + "eor x30, x30, x22\n\t" + "eor x28, x28, x24\n\t" + "str %x[state], [x29, #32]\n\t" + "str x28, [x29, #24]\n\t" + "eor x27, x2, x7\n\t" + "eor x28, x4, x9\n\t" + "eor x27, x27, x12\n\t" + "eor x28, x28, x14\n\t" + "eor x27, x27, x17\n\t" + "eor x28, x28, x20\n\t" + "eor x27, x27, x23\n\t" + "eor x28, x28, x25\n\t" + "eor %x[state], %x[state], x27, ror 63\n\t" + "eor x27, x27, x28, ror 63\n\t" + "eor x1, x1, %x[state]\n\t" + "eor x6, x6, %x[state]\n\t" + "eor x11, x11, %x[state]\n\t" + "eor x16, x16, %x[state]\n\t" + "eor x22, x22, %x[state]\n\t" + "eor x3, x3, x27\n\t" + "eor x8, x8, 
x27\n\t" + "eor x13, x13, x27\n\t" + "eor x19, x19, x27\n\t" + "eor x24, x24, x27\n\t" + "ldr %x[state], [x29, #32]\n\t" + "ldr x27, [x29, #24]\n\t" + "eor x28, x28, x30, ror 63\n\t" + "eor x30, x30, x27, ror 63\n\t" + "eor x27, x27, %x[state], ror 63\n\t" + "eor x5, x5, x28\n\t" + "eor x10, x10, x28\n\t" + "eor x15, x15, x28\n\t" + "eor x21, x21, x28\n\t" + "eor x26, x26, x28\n\t" + "eor x2, x2, x30\n\t" + "eor x7, x7, x30\n\t" + "eor x12, x12, x30\n\t" + "eor x17, x17, x30\n\t" + "eor x23, x23, x30\n\t" + "eor x4, x4, x27\n\t" + "eor x9, x9, x27\n\t" + "eor x14, x14, x27\n\t" + "eor x20, x20, x27\n\t" + "eor x25, x25, x27\n\t" + /* Swap Rotate */ + "ror %x[state], x2, #63\n\t" + "ror x2, x7, #20\n\t" + "ror x7, x10, #44\n\t" + "ror x10, x24, #3\n\t" + "ror x24, x15, #25\n\t" + "ror x15, x22, #46\n\t" + "ror x22, x3, #2\n\t" + "ror x3, x13, #21\n\t" + "ror x13, x14, #39\n\t" + "ror x14, x21, #56\n\t" + "ror x21, x25, #8\n\t" + "ror x25, x16, #23\n\t" + "ror x16, x5, #37\n\t" + "ror x5, x26, #50\n\t" + "ror x26, x23, #62\n\t" + "ror x23, x9, #9\n\t" + "ror x9, x17, #19\n\t" + "ror x17, x6, #28\n\t" + "ror x6, x4, #36\n\t" + "ror x4, x20, #43\n\t" + "ror x20, x19, #49\n\t" + "ror x19, x12, #54\n\t" + "ror x12, x8, #58\n\t" + "ror x8, x11, #61\n\t" + /* Row Mix */ + "bic x11, x3, x2\n\t" + "bic x27, x4, x3\n\t" + "bic x28, x1, x5\n\t" + "bic x30, x2, x1\n\t" + "eor x1, x1, x11\n\t" + "eor x2, x2, x27\n\t" + "bic x11, x5, x4\n\t" + "eor x4, x4, x28\n\t" + "eor x3, x3, x11\n\t" + "eor x5, x5, x30\n\t" + "bic x11, x8, x7\n\t" + "bic x27, x9, x8\n\t" + "bic x28, x6, x10\n\t" + "bic x30, x7, x6\n\t" + "eor x6, x6, x11\n\t" + "eor x7, x7, x27\n\t" + "bic x11, x10, x9\n\t" + "eor x9, x9, x28\n\t" + "eor x8, x8, x11\n\t" + "eor x10, x10, x30\n\t" + "bic x11, x13, x12\n\t" + "bic x27, x14, x13\n\t" + "bic x28, %x[state], x15\n\t" + "bic x30, x12, %x[state]\n\t" + "eor x11, %x[state], x11\n\t" + "eor x12, x12, x27\n\t" + "bic %x[state], x15, x14\n\t" + "eor x14, x14, x28\n\t" + "eor x13, x13, %x[state]\n\t" + "eor x15, x15, x30\n\t" + "bic %x[state], x19, x17\n\t" + "bic x27, x20, x19\n\t" + "bic x28, x16, x21\n\t" + "bic x30, x17, x16\n\t" + "eor x16, x16, %x[state]\n\t" + "eor x17, x17, x27\n\t" + "bic %x[state], x21, x20\n\t" + "eor x20, x20, x28\n\t" + "eor x19, x19, %x[state]\n\t" + "eor x21, x21, x30\n\t" + "bic %x[state], x24, x23\n\t" + "bic x27, x25, x24\n\t" + "bic x28, x22, x26\n\t" + "bic x30, x23, x22\n\t" + "eor x22, x22, %x[state]\n\t" + "eor x23, x23, x27\n\t" + "bic %x[state], x26, x25\n\t" + "eor x25, x25, x28\n\t" + "eor x24, x24, %x[state]\n\t" + "eor x26, x26, x30\n\t" + /* Done tranforming */ + "ldp x27, x28, [x29, #48]\n\t" + "ldr %x[state], [x27], #8\n\t" + "subs x28, x28, #1\n\t" + "eor x1, x1, %x[state]\n\t" + "bne L_SHA3_transform_base_begin_%=\n\t" + "ldr %x[state], [x29, #40]\n\t" + "stp x1, x2, [%x[state]]\n\t" + "stp x3, x4, [%x[state], #16]\n\t" + "stp x5, x6, [%x[state], #32]\n\t" + "stp x7, x8, [%x[state], #48]\n\t" + "stp x9, x10, [%x[state], #64]\n\t" + "stp x11, x12, [%x[state], #80]\n\t" + "stp x13, x14, [%x[state], #96]\n\t" + "stp x15, x16, [%x[state], #112]\n\t" + "stp x17, x19, [%x[state], #128]\n\t" + "stp x20, x21, [%x[state], #144]\n\t" + "stp x22, x23, [%x[state], #160]\n\t" + "stp x24, x25, [%x[state], #176]\n\t" + "str x26, [%x[state], #192]\n\t" + "ldp x29, x30, [sp], #0x40\n\t" + : [state] "+r" (state) + : [L_SHA3_transform_base_r] "S" (L_SHA3_transform_base_r) + : "memory", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", 
"x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "cc" + ); +} + #endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ #endif /* WOLFSSL_SHA3 */ #endif /* __aarch64__ */ diff --git a/wolfcrypt/src/port/arm/armv8-sha512-asm.S b/wolfcrypt/src/port/arm/armv8-sha512-asm.S index 5ff72c37b7..af3efbd1c2 100644 --- a/wolfcrypt/src/port/arm/armv8-sha512-asm.S +++ b/wolfcrypt/src/port/arm/armv8-sha512-asm.S @@ -156,8 +156,7 @@ _Transform_Sha512_Len_neon: #endif /* __APPLE__ */ stp x29, x30, [sp, #-128]! add x29, sp, #0 - str x17, [x29, #16] - str x19, [x29, #24] + stp x17, x19, [x29, #16] stp x20, x21, [x29, #32] stp x22, x23, [x29, #48] stp x24, x25, [x29, #64] @@ -1082,8 +1081,7 @@ L_sha512_len_neon_start: stp x6, x7, [x0, #16] stp x8, x9, [x0, #32] stp x10, x11, [x0, #48] - ldr x17, [x29, #16] - ldr x19, [x29, #24] + ldp x17, x19, [x29, #16] ldp x20, x21, [x29, #32] ldp x22, x23, [x29, #48] ldp x24, x25, [x29, #64] diff --git a/wolfcrypt/src/sha3.c b/wolfcrypt/src/sha3.c index 2bba29bcef..1a3596a61a 100644 --- a/wolfcrypt/src/sha3.c +++ b/wolfcrypt/src/sha3.c @@ -62,8 +62,7 @@ } #endif -#if (!defined(WOLFSSL_ARMASM) || (!defined(__arm__) && \ - !defined(WOLFSSL_ARMASM_CRYPTO_SHA3))) && !defined(WOLFSSL_RISCV_ASM) +#if !defined(WOLFSSL_ARMASM) && !defined(WOLFSSL_RISCV_ASM) #ifdef USE_INTEL_SPEEDUP #include diff --git a/wolfcrypt/src/wc_kyber.c b/wolfcrypt/src/wc_kyber.c index 8e56bcc0e0..aa03a42b67 100644 --- a/wolfcrypt/src/wc_kyber.c +++ b/wolfcrypt/src/wc_kyber.c @@ -51,10 +51,11 @@ /* Use SHA3-512 to generate 64-bytes of hash. */ #define KYBER_HASH_G kyber_hash512 /* Use SHAKE-256 as a key derivation function (KDF). */ -#ifdef USE_INTEL_SPEEDUP -#define KYBER_KDF kyber_kdf +#if defined(USE_INTEL_SPEEDUP) || \ + (defined(WOLFSSL_ARMASM) && defined(__aarch64__)) + #define KYBER_KDF kyber_kdf #else -#define KYBER_KDF wc_Shake256Hash + #define KYBER_KDF wc_Shake256Hash #endif /******************************************************************************/ diff --git a/wolfcrypt/src/wc_kyber_poly.c b/wolfcrypt/src/wc_kyber_poly.c index cf8a5b03e5..c614000cb6 100644 --- a/wolfcrypt/src/wc_kyber_poly.c +++ b/wolfcrypt/src/wc_kyber_poly.c @@ -167,6 +167,7 @@ const sword16 zetas_inv[KYBER_N / 2] = { }; +#if !(defined(__aarch64__) && defined(WOLFSSL_ARMASM)) /* Number-Theoretic Transform. * * @param [in, out] r Polynomial to transform. @@ -1045,6 +1046,7 @@ static void kyber_basemul_mont_add(sword16* r, const sword16* a, } #endif } +#endif /* Pointwise multiply elements of a and b, into r, and multiply by 2^-16. * @@ -1078,6 +1080,110 @@ void kyber_init(void) /******************************************************************************/ +#if defined(__aarch64__) && defined(WOLFSSL_ARMASM) + +/* Generate a public-private key pair from randomly generated data. + * + * @param [in, out] priv Private key vector of polynomials. + * @param [out] pub Public key vector of polynomials. + * @param [in] e Error values as a vector of polynomials. Modified. + * @param [in] a Random values in an array of vectors of polynomials. + * @param [in] kp Number of polynomials in vector. + */ +void kyber_keygen(sword16* priv, sword16* pub, sword16* e, const sword16* a, + int kp) +{ + int i; + + /* Transform private key. All of result used in public key calculation */ + for (i = 0; i < kp; ++i) { + kyber_ntt(priv + i * KYBER_N); + } + + /* For each polynomial in the vectors. */ + for (i = 0; i < kp; ++i) { + /* Multiply a by private into public polynomial. 
*/ + kyber_pointwise_acc_mont(pub + i * KYBER_N, a + i * kp * KYBER_N, priv, + kp); + /* Convert public polynomial to Montgomery form. */ + kyber_to_mont(pub + i * KYBER_N); + /* Transform error values polynomial. */ + kyber_ntt(e + i * KYBER_N); + /* Add errors to public key and reduce. */ + kyber_add_reduce(pub + i * KYBER_N, e + i * KYBER_N); + } +} + +/* Encapsuluate message. + * + * @param [in] pub Public key vector of polynomials. + * @param [out] bp Vector of polynomials. + * @param [out] v Polynomial. + * @param [in] at Array of vector of polynomials. + * @param [in] sp Vector of polynomials. + * @param [in] ep Error Vector of polynomials. + * @param [in] epp Error polynomial. + * @param [in] m Message polynomial. + * @param [in] kp Number of polynomials in vector. + */ +void kyber_encapsulate(const sword16* pub, sword16* bp, sword16* v, + const sword16* at, sword16* sp, const sword16* ep, const sword16* epp, + const sword16* m, int kp) +{ + int i; + + /* Transform sp. All of result used in calculation of bp and v. */ + for (i = 0; i < kp; ++i) { + kyber_ntt(sp + i * KYBER_N); + } + + /* For each polynomial in the vectors. */ + for (i = 0; i < kp; ++i) { + /* Multiply at by sp into bp polynomial. */ + kyber_pointwise_acc_mont(bp + i * KYBER_N, at + i * kp * KYBER_N, sp, + kp); + /* Inverse transform bp polynomial. */ + kyber_invntt(bp + i * KYBER_N); + /* Add errors to bp and reduce. */ + kyber_add_reduce(bp + i * KYBER_N, ep + i * KYBER_N); + } + + /* Multiply public key by sp into v polynomial. */ + kyber_pointwise_acc_mont(v, pub, sp, kp); + /* Inverse transform v. */ + kyber_invntt(v); + /* Add errors and message to v and reduce. */ + kyber_add3_reduce(v, epp, m); +} + +/* Decapsulate message. + * + * @param [in] priv Private key vector of polynomials. + * @param [out] mp Message polynomial. + * @param [in] bp Vector of polynomials containing error. + * @param [in] v Encapsulated message polynomial. + * @param [in] kp Number of polynomials in vector. + */ +void kyber_decapsulate(const sword16* priv, sword16* mp, sword16* bp, + const sword16* v, int kp) +{ + int i; + + /* Transform bp. All of result used in calculation of mp. */ + for (i = 0; i < kp; ++i) { + kyber_ntt(bp + i * KYBER_N); + } + + /* Multiply private key by bp into mp polynomial. */ + kyber_pointwise_acc_mont(mp, priv, bp, kp); + /* Inverse transform mp. */ + kyber_invntt(mp); + /* Subtract errors (mp) out of v and reduce into mp. */ + kyber_rsub_reduce(mp, v); +} + +#else + /* Generate a public-private key pair from randomly generated data. * * @param [in, out] priv Private key vector of polynomials. @@ -1269,6 +1375,8 @@ void kyber_decapsulate(const sword16* priv, sword16* mp, sword16* bp, } } +#endif + /******************************************************************************/ #ifdef USE_INTEL_SPEEDUP @@ -1578,8 +1686,237 @@ static int kyber_gen_matrix_k4_avx2(sword16* a, byte* seed, int transposed) return 0; } #endif /* KYBER1024 */ +#elif defined(WOLFSSL_ARMASM) && defined(__aarch64__) +#ifdef WOLFSSL_KYBER512 +/* Deterministically generate a matrix (or transpose) of uniform integers mod q. + * + * Seed used with XOF to generate random bytes. + * + * @param [out] a Matrix of uniform integers. + * @param [in] seed Bytes to seed XOF generation. + * @param [in] transposed Whether A or A^T is generated. + * @return 0 on success. + * @return MEMORY_E when dynamic memory allocation fails. Only possible when + * WOLFSSL_SMALL_STACK is defined. 
+ */ +static int kyber_gen_matrix_k2_aarch64(sword16* a, byte* seed, int transposed) +{ + word64 state[3 * 25]; + word64* st = (word64*)state; + unsigned int ctr0; + unsigned int ctr1; + unsigned int ctr2; + byte* p; + + if (!transposed) { + state[0*25 + 4] = 0x1f0000 + (0 << 8) + 0; + state[1*25 + 4] = 0x1f0000 + (0 << 8) + 1; + state[2*25 + 4] = 0x1f0000 + (1 << 8) + 0; + } + else { + state[0*25 + 4] = 0x1f0000 + (0 << 8) + 0; + state[1*25 + 4] = 0x1f0000 + (1 << 8) + 0; + state[2*25 + 4] = 0x1f0000 + (0 << 8) + 1; + } + + kyber_shake128_blocksx3_seed_neon(state, seed); + /* Sample random bytes to create a polynomial. */ + p = (byte*)st; + ctr0 = kyber_rej_uniform_neon(a + 0 * KYBER_N, KYBER_N, p, XOF_BLOCK_SIZE); + p += 25 * 8; + ctr1 = kyber_rej_uniform_neon(a + 1 * KYBER_N, KYBER_N, p, XOF_BLOCK_SIZE); + p += 25 * 8; + ctr2 = kyber_rej_uniform_neon(a + 2 * KYBER_N, KYBER_N, p, XOF_BLOCK_SIZE); + while ((ctr0 < KYBER_N) || (ctr1 < KYBER_N) || (ctr2 < KYBER_N)) { + kyber_sha3_blocksx3_neon(st); + + p = (byte*)st; + ctr0 += kyber_rej_uniform_neon(a + 0 * KYBER_N + ctr0, KYBER_N - ctr0, + p, XOF_BLOCK_SIZE); + p += 25 * 8; + ctr1 += kyber_rej_uniform_neon(a + 1 * KYBER_N + ctr1, KYBER_N - ctr1, + p, XOF_BLOCK_SIZE); + p += 25 * 8; + ctr2 += kyber_rej_uniform_neon(a + 2 * KYBER_N + ctr2, KYBER_N - ctr2, + p, XOF_BLOCK_SIZE); + } + + a += 3 * KYBER_N; + + state[0] = ((word64*)seed)[0]; + state[1] = ((word64*)seed)[1]; + state[2] = ((word64*)seed)[2]; + state[3] = ((word64*)seed)[3]; + /* Transposed value same as not. */ + state[4] = 0x1f0000 + (1 << 8) + 1; + XMEMSET(state + 5, 0, sizeof(*state) * (25 - 5)); + state[20] = 0x8000000000000000UL; + BlockSha3(state); + p = (byte*)state; + ctr0 = kyber_rej_uniform_neon(a, KYBER_N, p, XOF_BLOCK_SIZE); + while (ctr0 < KYBER_N) { + BlockSha3(state); + ctr0 += kyber_rej_uniform_neon(a + ctr0, KYBER_N - ctr0, p, + XOF_BLOCK_SIZE); + } + + return 0; +} +#endif + +#ifdef WOLFSSL_KYBER768 +/* Deterministically generate a matrix (or transpose) of uniform integers mod q. + * + * Seed used with XOF to generate random bytes. + * + * @param [out] a Matrix of uniform integers. + * @param [in] seed Bytes to seed XOF generation. + * @param [in] transposed Whether A or A^T is generated. + * @return 0 on success. + * @return MEMORY_E when dynamic memory allocation fails. Only possible when + * WOLFSSL_SMALL_STACK is defined. + */ +static int kyber_gen_matrix_k3_aarch64(sword16* a, byte* seed, int transposed) +{ + int i; + int k; + word64 state[3 * 25]; + word64* st = (word64*)state; + unsigned int ctr0; + unsigned int ctr1; + unsigned int ctr2; + byte* p; + + for (k = 0; k < 3; k++) { + for (i = 0; i < 3; i++) { + if (!transposed) { + state[i*25 + 4] = 0x1f0000 + ((k << 8) + i); + } + else { + state[i*25 + 4] = 0x1f0000 + ((i << 8) + k); + } + } + + kyber_shake128_blocksx3_seed_neon(state, seed); + /* Sample random bytes to create a polynomial. */ + p = (byte*)st; + ctr0 = kyber_rej_uniform_neon(a + 0 * KYBER_N, KYBER_N, p, + XOF_BLOCK_SIZE); + p += 25 * 8; + ctr1 = kyber_rej_uniform_neon(a + 1 * KYBER_N, KYBER_N, p, + XOF_BLOCK_SIZE); + p +=25 * 8; + ctr2 = kyber_rej_uniform_neon(a + 2 * KYBER_N, KYBER_N, p, + XOF_BLOCK_SIZE); + /* Create more blocks if too many rejected. 
*/ + while ((ctr0 < KYBER_N) || (ctr1 < KYBER_N) || (ctr2 < KYBER_N)) { + kyber_sha3_blocksx3_neon(st); + + p = (byte*)st; + ctr0 += kyber_rej_uniform_neon(a + 0 * KYBER_N + ctr0, + KYBER_N - ctr0, p, XOF_BLOCK_SIZE); + p += 25 * 8; + ctr1 += kyber_rej_uniform_neon(a + 1 * KYBER_N + ctr1, + KYBER_N - ctr1, p, XOF_BLOCK_SIZE); + p += 25 * 8; + ctr2 += kyber_rej_uniform_neon(a + 2 * KYBER_N + ctr2, + KYBER_N - ctr2, p, XOF_BLOCK_SIZE); + } + + a += 3 * KYBER_N; + } + + return 0; +} +#endif + +#ifdef WOLFSSL_KYBER1024 +/* Deterministically generate a matrix (or transpose) of uniform integers mod q. + * + * Seed used with XOF to generate random bytes. + * + * @param [out] a Matrix of uniform integers. + * @param [in] seed Bytes to seed XOF generation. + * @param [in] transposed Whether A or A^T is generated. + * @return 0 on success. + * @return MEMORY_E when dynamic memory allocation fails. Only possible when + * WOLFSSL_SMALL_STACK is defined. + */ +static int kyber_gen_matrix_k4_aarch64(sword16* a, byte* seed, int transposed) +{ + int i; + int k; + word64 state[3 * 25]; + word64* st = (word64*)state; + unsigned int ctr0; + unsigned int ctr1; + unsigned int ctr2; + byte* p; + + for (k = 0; k < 5; k++) { + for (i = 0; i < 3; i++) { + byte bi = ((k * 3) + i) / 4; + byte bj = ((k * 3) + i) % 4; + if (!transposed) { + state[i*25 + 4] = 0x1f0000 + (bi << 8) + bj; + } + else { + state[i*25 + 4] = 0x1f0000 + (bj << 8) + bi; + } + } + + kyber_shake128_blocksx3_seed_neon(state, seed); + /* Sample random bytes to create a polynomial. */ + p = (byte*)st; + ctr0 = kyber_rej_uniform_neon(a + 0 * KYBER_N, KYBER_N, p, + XOF_BLOCK_SIZE); + p += 25 * 8; + ctr1 = kyber_rej_uniform_neon(a + 1 * KYBER_N, KYBER_N, p, + XOF_BLOCK_SIZE); + p += 25 * 8; + ctr2 = kyber_rej_uniform_neon(a + 2 * KYBER_N, KYBER_N, p, + XOF_BLOCK_SIZE); + /* Create more blocks if too many rejected. */ + while ((ctr0 < KYBER_N) || (ctr1 < KYBER_N) || (ctr2 < KYBER_N)) { + kyber_sha3_blocksx3_neon(st); + + p = (byte*)st; + ctr0 += kyber_rej_uniform_neon(a + 0 * KYBER_N + ctr0, + KYBER_N - ctr0, p, XOF_BLOCK_SIZE); + p += 25 * 8; + ctr1 += kyber_rej_uniform_neon(a + 1 * KYBER_N + ctr1, + KYBER_N - ctr1, p, XOF_BLOCK_SIZE); + p += 25 * 8; + ctr2 += kyber_rej_uniform_neon(a + 2 * KYBER_N + ctr2, + KYBER_N - ctr2, p, XOF_BLOCK_SIZE); + } + + a += 3 * KYBER_N; + } + + state[0] = ((word64*)seed)[0]; + state[1] = ((word64*)seed)[1]; + state[2] = ((word64*)seed)[2]; + state[3] = ((word64*)seed)[3]; + /* Transposed value same as not. */ + state[4] = 0x1f0000 + (3 << 8) + 3; + XMEMSET(state + 5, 0, sizeof(*state) * (25 - 5)); + state[20] = 0x8000000000000000UL; + BlockSha3(state); + p = (byte*)state; + ctr0 = kyber_rej_uniform_neon(a, KYBER_N, p, XOF_BLOCK_SIZE); + while (ctr0 < KYBER_N) { + BlockSha3(state); + ctr0 += kyber_rej_uniform_neon(a + ctr0, KYBER_N - ctr0, p, + XOF_BLOCK_SIZE); + } + + return 0; +} +#endif #endif /* USE_INTEL_SPEEDUP */ +#if !(defined(WOLFSSL_ARMASM) && defined(__aarch64__)) /* Absorb the seed data for squeezing out pseudo-random data. * * @param [in, out] shake128 SHAKE-128 object. @@ -1610,6 +1947,7 @@ static int kyber_xof_squeezeblocks(wc_Shake* shake128, byte* out, int blocks) { return wc_Shake128_SqueezeBlocks(shake128, out, blocks); } +#endif /* New/Initialize SHA-3 object. * @@ -1690,6 +2028,7 @@ void kyber_prf_free(wc_Shake* prf) wc_Shake256_Free(prf); } +#if !(defined(WOLFSSL_ARMASM) && defined(__aarch64__)) /* Create pseudo-random data from the key using SHAKE-256. 
* * @param [in, out] shake256 SHAKE-256 object. @@ -1739,6 +2078,7 @@ static int kyber_prf(wc_Shake* shake256, byte* out, unsigned int outLen, return ret; #endif } +#endif #ifdef USE_INTEL_SPEEDUP /* Create pseudo-random key from the seed using SHAKE-256. @@ -1777,6 +2117,36 @@ int kyber_kdf(byte* seed, int seedLen, byte* out, int outLen) } #endif +#if defined(WOLFSSL_ARMASM) && defined(__aarch64__) +/* Create pseudo-random key from the seed using SHAKE-256. + * + * @param [in] seed Data to derive from. + * @param [in] seedLen Length of data to derive from in bytes. + * @param [out] out Buffer to write to. + * @param [in] outLen Number of bytes to derive. + * @return 0 on success always. + */ +int kyber_kdf(byte* seed, int seedLen, byte* out, int outLen) +{ + word64 state[25]; + int i; + int len64 = seedLen / 8; + + for (i = 0; i < len64; i++) { + state[i] = ((word64*)seed)[i]; + } + state[len64] = 0x1f; + XMEMSET(state + len64 + 1, 0, (25 - len64 - 1) * sizeof(word64)); + state[WC_SHA3_256_COUNT - 1] = 0x8000000000000000UL; + + BlockSha3(state); + XMEMCPY(out, state, outLen); + + return 0; +} +#endif + +#if !(defined(WOLFSSL_ARMASM) && defined(__aarch64__)) /* Rejection sampling on uniform random bytes to generate uniform random * integers mod q. * @@ -1792,6 +2162,7 @@ static unsigned int kyber_rej_uniform_c(sword16* p, unsigned int len, unsigned int i; unsigned int j; +#if defined(WOLFSSL_KYBER_SMALL) || !defined(WC_64BIT_CPU) /* Keep sampling until maximum number of integers reached or buffer used up. */ for (i = 0, j = 0; (i < len) && (j <= rLen - 3); j += 3) { @@ -1812,10 +2183,90 @@ static unsigned int kyber_rej_uniform_c(sword16* p, unsigned int len, /* Move over used bytes. */ r += 3; } +#else + unsigned int minJ; + + minJ = len / 4 * 6; + if (minJ > rLen) + minJ = rLen; + i = 0; + for (j = 0; j < minJ; j += 6) { + /* Use 48 bits (6 bytes) as four 12-bit integers. */ + sword16 v0 = (*(word64*)r) & 0xfff; + sword16 v1 = ((*(word64*)r) >> 12) & 0xfff; + sword16 v2 = ((*(word64*)r) >> 24) & 0xfff; + sword16 v3 = ((*(word64*)r) >> 36) & 0xfff; + + p[i] = v0 & (0 - (v0 < KYBER_Q)); + i += v0 < KYBER_Q; + p[i] = v1 & (0 - (v1 < KYBER_Q)); + i += v1 < KYBER_Q; + p[i] = v2 & (0 - (v2 < KYBER_Q)); + i += v2 < KYBER_Q; + p[i] = v3 & (0 - (v3 < KYBER_Q)); + i += v3 < KYBER_Q; + + /* Move over used bytes. */ + r += 6; + } + if (j < rLen) { + for (; (i + 4 < len) && (j < rLen); j += 6) { + /* Use 48 bits (6 bytes) as four 12-bit integers. */ + sword16 v0 = (*(word64*)r) & 0xfff; + sword16 v1 = ((*(word64*)r) >> 12) & 0xfff; + sword16 v2 = ((*(word64*)r) >> 24) & 0xfff; + sword16 v3 = ((*(word64*)r) >> 36) & 0xfff; + + p[i] = v0; + i += v0 < KYBER_Q; + p[i] = v1; + i += v1 < KYBER_Q; + p[i] = v2; + i += v2 < KYBER_Q; + p[i] = v3; + i += v3 < KYBER_Q; + + /* Move over used bytes. */ + r += 6; + } + for (; (i < len) && (j < rLen); j += 6) { + /* Use 48 bits (6 bytes) as four 12-bit integers. */ + sword16 v0 = (*(word64*)r) & 0xfff; + sword16 v1 = ((*(word64*)r) >> 12) & 0xfff; + sword16 v2 = ((*(word64*)r) >> 24) & 0xfff; + sword16 v3 = ((*(word64*)r) >> 36) & 0xfff; + + /* Reject first 12-bit integer if greater than or equal to q. */ + if (v0 < KYBER_Q) { + p[i++] = v0; + } + /* Check second if we don't have enough integers yet. + * Reject second 12-bit integer if greater than or equal to q. */ + if ((i < len) && (v1 < KYBER_Q)) { + p[i++] = v1; + } + /* Check second if we don't have enough integers yet. + * Reject third 12-bit integer if greater than or equal to q. 
*/ + if ((i < len) && (v2 < KYBER_Q)) { + p[i++] = v2; + } + /* Check second if we don't have enough integers yet. + * Reject fourth 12-bit integer if greater than or equal to q. */ + if ((i < len) && (v3 < KYBER_Q)) { + p[i++] = v3; + } + + /* Move over used bytes. */ + r += 6; + } + } +#endif return i; } +#endif +#if !(defined(WOLFSSL_ARMASM) && defined(__aarch64__)) /* Deterministically generate a matrix (or transpose) of uniform integers mod q. * * Seed used with XOF to generate random bytes. @@ -1871,35 +2322,17 @@ static int kyber_gen_matrix_c(KYBER_PRF_T* prf, sword16* a, int kp, byte* seed, ret = kyber_xof_squeezeblocks(prf, rand, GEN_MATRIX_NBLOCKS); } if (ret == 0) { - #if (GEN_MATRIX_SIZE % 3) != 0 - unsigned int randLen; - #endif unsigned int ctr; /* Sample random bytes to create a polynomial. */ ctr = kyber_rej_uniform_c(a + j * KYBER_N, KYBER_N, rand, GEN_MATRIX_SIZE); /* Create more blocks if too many rejected. */ - #if (GEN_MATRIX_SIZE % 3) != 0 - randLen = GEN_MATRIX_SIZE; - while (ctr < KYBER_N) { - int off = randLen % 3; - int k; - for (k = 0; k < off; k++) { - rand[k] = rand[randLen - off + k]; - } - kyber_xof_squeezeblocks(prf, rand + off, 1); - randLen = off + XOF_BLOCK_SIZE; - ctr += kyber_rej_uniform_c(a + j * KYBER_N + ctr, - KYBER_N - ctr, rand, randLen); - } - #else while (ctr < KYBER_N) { kyber_xof_squeezeblocks(prf, rand, 1); ctr += kyber_rej_uniform_c(a + j * KYBER_N + ctr, KYBER_N - ctr, rand, XOF_BLOCK_SIZE); } - #endif } } } @@ -1911,6 +2344,7 @@ static int kyber_gen_matrix_c(KYBER_PRF_T* prf, sword16* a, int kp, byte* seed, return ret; } +#endif /* Deterministically generate a matrix (or transpose) of uniform integers mod q. * @@ -1932,6 +2366,9 @@ int kyber_gen_matrix(KYBER_PRF_T* prf, sword16* a, int kp, byte* seed, #ifdef WOLFSSL_KYBER512 if (kp == KYBER512_K) { +#if defined(WOLFSSL_ARMASM) && defined(__aarch64__) + ret = kyber_gen_matrix_k2_aarch64(a, seed, transposed); +#else #ifdef USE_INTEL_SPEEDUP if (IS_INTEL_AVX2(cpuid_flags)) { ret = kyber_gen_matrix_k2_avx2(a, seed, transposed); @@ -1941,11 +2378,15 @@ int kyber_gen_matrix(KYBER_PRF_T* prf, sword16* a, int kp, byte* seed, { ret = kyber_gen_matrix_c(prf, a, KYBER512_K, seed, transposed); } +#endif } else #endif #ifdef WOLFSSL_KYBER768 if (kp == KYBER768_K) { +#if defined(WOLFSSL_ARMASM) && defined(__aarch64__) + ret = kyber_gen_matrix_k3_aarch64(a, seed, transposed); +#else #ifdef USE_INTEL_SPEEDUP if (IS_INTEL_AVX2(cpuid_flags)) { ret = kyber_gen_matrix_k3_avx2(a, seed, transposed); @@ -1955,11 +2396,15 @@ int kyber_gen_matrix(KYBER_PRF_T* prf, sword16* a, int kp, byte* seed, { ret = kyber_gen_matrix_c(prf, a, KYBER768_K, seed, transposed); } +#endif } else #endif #ifdef WOLFSSL_KYBER1024 if (kp == KYBER1024_K) { +#if defined(WOLFSSL_ARMASM) && defined(__aarch64__) + ret = kyber_gen_matrix_k4_aarch64(a, seed, transposed); +#else #ifdef USE_INTEL_SPEEDUP if (IS_INTEL_AVX2(cpuid_flags)) { ret = kyber_gen_matrix_k4_avx2(a, seed, transposed); @@ -1969,6 +2414,7 @@ int kyber_gen_matrix(KYBER_PRF_T* prf, sword16* a, int kp, byte* seed, { ret = kyber_gen_matrix_c(prf, a, KYBER1024_K, seed, transposed); } +#endif } else #endif @@ -1976,6 +2422,8 @@ int kyber_gen_matrix(KYBER_PRF_T* prf, sword16* a, int kp, byte* seed, ret = BAD_STATE_E; } + (void)prf; + return ret; } @@ -2240,6 +2688,8 @@ static void kyber_cbd_eta3(sword16* p, const byte* r) } #endif +#if !(defined(__aarch64__) && defined(WOLFSSL_ARMASM)) + /* Get noise/error by calculating random bytes and sampling to a binomial * distribution. 
* @@ -2306,6 +2756,8 @@ static int kyber_get_noise_eta2_c(KYBER_PRF_T* prf, sword16* p, return ret; } +#endif + #ifdef USE_INTEL_SPEEDUP #define PRF_RAND_SZ (2 * SHA3_256_BYTES) @@ -2488,6 +2940,206 @@ static int kyber_get_noise_k4_avx2(KYBER_PRF_T* prf, sword16* vec1, #endif #endif /* USE_INTEL_SPEEDUP */ +#if defined(__aarch64__) && defined(WOLFSSL_ARMASM) + +#define PRF_RAND_SZ (2 * SHA3_256_BYTES) + +/* Get the noise/error by calculating random bytes. + * + * @param [out] rand Random number byte array. + * @param [in] seed Seed to generate random from. + * @param [in] o Offset of seed count. + */ +static void kyber_get_noise_x3_eta2_aarch64(byte* rand, byte* seed, byte o) +{ + word64* state = (word64*)rand; + + state[0*25 + 4] = 0x1f00 + 0 + o; + state[1*25 + 4] = 0x1f00 + 1 + o; + state[2*25 + 4] = 0x1f00 + 2 + o; + + kyber_shake256_blocksx3_seed_neon(state, seed); +} + +#ifdef WOLFSSL_KYBER512 +/* Get the noise/error by calculating random bytes. + * + * @param [out] rand Random number byte array. + * @param [in] seed Seed to generate random from. + * @param [in] o Offset of seed count. + */ +static void kyber_get_noise_x3_eta3_aarch64(byte* rand, byte* seed, byte o) +{ + word64 state[3 * 25]; + + state[0*25 + 4] = 0x1f00 + 0 + o; + state[1*25 + 4] = 0x1f00 + 1 + o; + state[2*25 + 4] = 0x1f00 + 2 + o; + + kyber_shake256_blocksx3_seed_neon(state, seed); + XMEMCPY(rand + 0 * ETA3_RAND_SIZE, state + 0*25, SHA3_256_BYTES); + XMEMCPY(rand + 1 * ETA3_RAND_SIZE, state + 1*25, SHA3_256_BYTES); + XMEMCPY(rand + 2 * ETA3_RAND_SIZE, state + 2*25, SHA3_256_BYTES); + kyber_sha3_blocksx3_neon(state); + rand += SHA3_256_BYTES; + XMEMCPY(rand + 0 * ETA3_RAND_SIZE, state + 0*25, + ETA3_RAND_SIZE - SHA3_256_BYTES); + XMEMCPY(rand + 1 * ETA3_RAND_SIZE, state + 1*25, + ETA3_RAND_SIZE - SHA3_256_BYTES); + XMEMCPY(rand + 2 * ETA3_RAND_SIZE, state + 2*25, + ETA3_RAND_SIZE - SHA3_256_BYTES); +} + +/* Get the noise/error by calculating random bytes. + * + * @param [out] rand Random number byte array. + * @param [in] seed Seed to generate random from. + * @param [in] o Offset of seed count. + * @return 0 on success. + */ +static void kyber_get_noise_eta3_aarch64(byte* rand, byte* seed, byte o) +{ + word64 state[25]; + + state[0] = ((word64*)seed)[0]; + state[1] = ((word64*)seed)[1]; + state[2] = ((word64*)seed)[2]; + state[3] = ((word64*)seed)[3]; + state[4] = 0x1f00 + o; + XMEMSET(state + 5, 0, sizeof(*state) * (25 - 5)); + state[16] = 0x8000000000000000UL; + BlockSha3(state); + XMEMCPY(rand , state, SHA3_256_BYTES); + BlockSha3(state); + XMEMCPY(rand + SHA3_256_BYTES, state, ETA3_RAND_SIZE - SHA3_256_BYTES); +} + +/* Get the noise/error by calculating random bytes and sampling to a binomial + * distribution. + * + * @param [out] vec1 First Vector of polynomials. + * @param [out] vec2 Second Vector of polynomials. + * @param [out] poly Polynomial. + * @param [in] seed Seed to use when calculating random. + * @return 0 on success. 
+ */ +static int kyber_get_noise_k2_aarch64(sword16* vec1, sword16* vec2, + sword16* poly, byte* seed) +{ + int ret = 0; + byte rand[3 * 25 * 8]; + + kyber_get_noise_x3_eta3_aarch64(rand, seed, 0); + kyber_cbd_eta3(vec1 , rand + 0 * ETA3_RAND_SIZE); + kyber_cbd_eta3(vec1 + KYBER_N, rand + 1 * ETA3_RAND_SIZE); + if (poly == NULL) { + kyber_cbd_eta3(vec2 , rand + 2 * ETA3_RAND_SIZE); + kyber_get_noise_eta3_aarch64(rand, seed, 3); + kyber_cbd_eta3(vec2 + KYBER_N, rand ); + } + else { + kyber_get_noise_x3_eta2_aarch64(rand, seed, 2); + kyber_cbd_eta2(vec2 , rand + 0 * 25 * 8); + kyber_cbd_eta2(vec2 + KYBER_N, rand + 1 * 25 * 8); + kyber_cbd_eta2(poly , rand + 2 * 25 * 8); + } + + return ret; +} +#endif + +#ifdef WOLFSSL_KYBER768 +/* Get the noise/error by calculating random bytes. + * + * @param [out] rand Random number byte array. + * @param [in] seed Seed to generate random from. + * @param [in] o Offset of seed count. + * @return 0 on success. + */ +static void kyber_get_noise_eta2_aarch64(byte* rand, byte* seed, byte o) +{ + word64* state = (word64*)rand; + + state[0] = ((word64*)seed)[0]; + state[1] = ((word64*)seed)[1]; + state[2] = ((word64*)seed)[2]; + state[3] = ((word64*)seed)[3]; + /* Transposed value same as not. */ + state[4] = 0x1f00 + o; + XMEMSET(state + 5, 0, sizeof(*state) * (25 - 5)); + state[16] = 0x8000000000000000UL; + BlockSha3(state); +} + +/* Get the noise/error by calculating random bytes and sampling to a binomial + * distribution. + * + * @param [out] vec1 First Vector of polynomials. + * @param [out] vec2 Second Vector of polynomials. + * @param [out] poly Polynomial. + * @param [in] seed Seed to use when calculating random. + * @return 0 on success. + */ +static int kyber_get_noise_k3_aarch64(sword16* vec1, sword16* vec2, + sword16* poly, byte* seed) +{ + byte rand[3 * 25 * 8]; + + kyber_get_noise_x3_eta2_aarch64(rand, seed, 0); + kyber_cbd_eta2(vec1 , rand + 0 * 25 * 8); + kyber_cbd_eta2(vec1 + 1 * KYBER_N, rand + 1 * 25 * 8); + kyber_cbd_eta2(vec1 + 2 * KYBER_N, rand + 2 * 25 * 8); + kyber_get_noise_x3_eta2_aarch64(rand, seed, 3); + kyber_cbd_eta2(vec2 , rand + 0 * 25 * 8); + kyber_cbd_eta2(vec2 + 1 * KYBER_N, rand + 1 * 25 * 8); + kyber_cbd_eta2(vec2 + 2 * KYBER_N, rand + 2 * 25 * 8); + if (poly != NULL) { + kyber_get_noise_eta2_aarch64(rand, seed, 6); + kyber_cbd_eta2(poly , rand + 0 * 25 * 8); + } + + return 0; +} +#endif + +#ifdef WOLFSSL_KYBER1024 +/* Get the noise/error by calculating random bytes and sampling to a binomial + * distribution. + * + * @param [out] vec1 First Vector of polynomials. + * @param [out] vec2 Second Vector of polynomials. + * @param [out] poly Polynomial. + * @param [in] seed Seed to use when calculating random. + * @return 0 on success. 
+ */ +static int kyber_get_noise_k4_aarch64(sword16* vec1, sword16* vec2, + sword16* poly, byte* seed) +{ + int ret = 0; + byte rand[3 * 25 * 8]; + + kyber_get_noise_x3_eta2_aarch64(rand, seed, 0); + kyber_cbd_eta2(vec1 , rand + 0 * 25 * 8); + kyber_cbd_eta2(vec1 + 1 * KYBER_N, rand + 1 * 25 * 8); + kyber_cbd_eta2(vec1 + 2 * KYBER_N, rand + 2 * 25 * 8); + kyber_get_noise_x3_eta2_aarch64(rand, seed, 3); + kyber_cbd_eta2(vec1 + 3 * KYBER_N, rand + 0 * 25 * 8); + kyber_cbd_eta2(vec2 , rand + 1 * 25 * 8); + kyber_cbd_eta2(vec2 + 1 * KYBER_N, rand + 2 * 25 * 8); + kyber_get_noise_x3_eta2_aarch64(rand, seed, 6); + kyber_cbd_eta2(vec2 + 2 * KYBER_N, rand + 0 * 25 * 8); + kyber_cbd_eta2(vec2 + 3 * KYBER_N, rand + 1 * 25 * 8); + if (poly != NULL) { + kyber_cbd_eta2(poly, rand + 2 * 25 * 8); + } + + return ret; +} +#endif +#endif /* __aarch64__ && WOLFSSL_ARMASM */ + +#if !(defined(__aarch64__) && defined(WOLFSSL_ARMASM)) + /* Get the noise/error by calculating random bytes and sampling to a binomial * distribution. * @@ -2531,6 +3183,8 @@ static int kyber_get_noise_c(KYBER_PRF_T* prf, int kp, sword16* vec1, int eta1, return ret; } +#endif /* __aarch64__ && WOLFSSL_ARMASM */ + /* Get the noise/error by calculating random bytes and sampling to a binomial * distribution. * @@ -2549,6 +3203,9 @@ int kyber_get_noise(KYBER_PRF_T* prf, int kp, sword16* vec1, #ifdef WOLFSSL_KYBER512 if (kp == KYBER512_K) { +#if defined(WOLFSSL_ARMASM) && defined(__aarch64__) + ret = kyber_get_noise_k2_aarch64(vec1, vec2, poly, seed); +#else #ifdef USE_INTEL_SPEEDUP if (IS_INTEL_AVX2(cpuid_flags)) { ret = kyber_get_noise_k2_avx2(prf, vec1, vec2, poly, seed); @@ -2563,11 +3220,15 @@ int kyber_get_noise(KYBER_PRF_T* prf, int kp, sword16* vec1, ret = kyber_get_noise_c(prf, kp, vec1, KYBER_CBD_ETA3, vec2, KYBER_CBD_ETA2, poly, seed); } +#endif } else #endif #ifdef WOLFSSL_KYBER768 if (kp == KYBER768_K) { +#if defined(WOLFSSL_ARMASM) && defined(__aarch64__) + ret = kyber_get_noise_k3_aarch64(vec1, vec2, poly, seed); +#else #ifdef USE_INTEL_SPEEDUP if (IS_INTEL_AVX2(cpuid_flags)) { ret = kyber_get_noise_k3_avx2(vec1, vec2, poly, seed); @@ -2578,11 +3239,15 @@ int kyber_get_noise(KYBER_PRF_T* prf, int kp, sword16* vec1, ret = kyber_get_noise_c(prf, kp, vec1, KYBER_CBD_ETA2, vec2, KYBER_CBD_ETA2, poly, seed); } +#endif } else #endif #ifdef WOLFSSL_KYBER1024 if (kp == KYBER1024_K) { +#if defined(WOLFSSL_ARMASM) && defined(__aarch64__) + ret = kyber_get_noise_k4_aarch64(vec1, vec2, poly, seed); +#else #ifdef USE_INTEL_SPEEDUP if (IS_INTEL_AVX2(cpuid_flags)) { ret = kyber_get_noise_k4_avx2(prf, vec1, vec2, poly, seed); @@ -2593,6 +3258,7 @@ int kyber_get_noise(KYBER_PRF_T* prf, int kp, sword16* vec1, ret = kyber_get_noise_c(prf, kp, vec1, KYBER_CBD_ETA2, vec2, KYBER_CBD_ETA2, poly, seed); } +#endif } else #endif @@ -2600,11 +3266,14 @@ int kyber_get_noise(KYBER_PRF_T* prf, int kp, sword16* vec1, ret = BAD_STATE_E; } + (void)prf; + return ret; } /******************************************************************************/ +#if !(defined(__aarch64__) && defined(WOLFSSL_ARMASM)) /* Compare two byte arrays of equal size. * * @param [in] a First array to compare. @@ -2624,6 +3293,7 @@ static int kyber_cmp_c(const byte* a, const byte* b, int sz) } return 0 - ((-(word32)r) >> 31); } +#endif /* Compare two byte arrays of equal size. 
* @@ -2635,6 +3305,9 @@ static int kyber_cmp_c(const byte* a, const byte* b, int sz) */ int kyber_cmp(const byte* a, const byte* b, int sz) { +#if defined(__aarch64__) && defined(WOLFSSL_ARMASM) + return kyber_cmp_neon(a, b, sz); +#else int fail; #ifdef USE_INTEL_SPEEDUP @@ -2648,10 +3321,13 @@ int kyber_cmp(const byte* a, const byte* b, int sz) } return fail; +#endif } /******************************************************************************/ +#if !(defined(__aarch64__) && defined(WOLFSSL_ARMASM)) + /* Conditional subtraction of q to each coefficient of a polynomial. * * @param [in, out] p Polynomial. @@ -2667,6 +3343,12 @@ static KYBER_NOINLINE void kyber_csubq_c(sword16* p) } } +#else + +#define kyber_csubq_c kyber_csubq_neon + +#endif + /******************************************************************************/ #if defined(CONV_WITH_DIV) || !defined(WORD64_AVAILABLE) @@ -3511,6 +4193,7 @@ void kyber_decompress_5(sword16* p, const unsigned char* b) /******************************************************************************/ +#if !(defined(__aarch64__) && defined(WOLFSSL_ARMASM)) /* Convert bit from byte to 0 or (KYBER_Q + 1) / 2. * * Constant time implementation. @@ -3622,7 +4305,7 @@ static void kyber_to_msg_c(byte* msg, sword16* p) /* Reduce each coefficient to mod q. */ kyber_csubq_c(p); - /* All values are now positive. */ + /* All values are now in range. */ for (i = 0; i < KYBER_N / 8; i++) { #ifdef WOLFSSL_KYBER_SMALL @@ -3663,6 +4346,27 @@ void kyber_to_msg(byte* msg, sword16* p) kyber_to_msg_c(msg, p); } } +#else +/* Convert message to polynomial. + * + * @param [out] p Polynomial. + * @param [in] msg Message as a byte array. + */ +void kyber_from_msg(sword16* p, const byte* msg) +{ + kyber_from_msg_neon(p, msg); +} + +/* Convert polynomial to message. + * + * @param [out] msg Message as a byte array. + * @param [in] p Polynomial. 
+ */ +void kyber_to_msg(byte* msg, sword16* p) +{ + kyber_to_msg_neon(msg, p); +} +#endif /******************************************************************************/ diff --git a/wolfssl/wolfcrypt/sha3.h b/wolfssl/wolfcrypt/sha3.h index 0120051508..f65c41d322 100644 --- a/wolfssl/wolfcrypt/sha3.h +++ b/wolfssl/wolfcrypt/sha3.h @@ -220,8 +220,7 @@ WOLFSSL_LOCAL void sha3_block_bmi2(word64* s); WOLFSSL_LOCAL void sha3_block_avx2(word64* s); WOLFSSL_LOCAL void BlockSha3(word64 *s); #endif -#if (defined(WOLFSSL_ARMASM) && (defined(__arm__) || \ - defined(WOLFSSL_ARMASM_CRYPTO_SHA3))) || defined(WOLFSSL_RISCV_ASM) +#if defined(WOLFSSL_ARMASM) || defined(WOLFSSL_RISCV_ASM) WOLFSSL_LOCAL void BlockSha3(word64 *s); #endif diff --git a/wolfssl/wolfcrypt/wc_kyber.h b/wolfssl/wolfcrypt/wc_kyber.h index 34b3d64ed9..2b8ac8da22 100644 --- a/wolfssl/wolfcrypt/wc_kyber.h +++ b/wolfssl/wolfcrypt/wc_kyber.h @@ -163,7 +163,8 @@ WOLFSSL_LOCAL int kyber_get_noise(KYBER_PRF_T* prf, int kp, sword16* vec1, sword16* vec2, sword16* poly, byte* seed); -#ifdef USE_INTEL_SPEEDUP +#if defined(USE_INTEL_SPEEDUP) || \ + (defined(WOLFSSL_ARMASM) && defined(__aarch64__)) WOLFSSL_LOCAL int kyber_kdf(byte* seed, int seedLen, byte* out, int outLen); #endif @@ -288,6 +289,27 @@ void kyber_decompress_5_avx2(sword16* p, const byte* r); WOLFSSL_LOCAL int kyber_cmp_avx2(const byte* a, const byte* b, int sz); +#elif defined(__aarch64__) && defined(WOLFSSL_ARMASM) +WOLFSSL_LOCAL void kyber_ntt(sword16* r); +WOLFSSL_LOCAL void kyber_invntt(sword16* r); +WOLFSSL_LOCAL void kyber_basemul_mont(sword16* r, const sword16* a, + const sword16* b); +WOLFSSL_LOCAL void kyber_basemul_mont_add(sword16* r, const sword16* a, + const sword16* b); +WOLFSSL_LOCAL void kyber_add_reduce(sword16* r, const sword16* a); +WOLFSSL_LOCAL void kyber_add3_reduce(sword16* r, const sword16* a, + const sword16* b); +WOLFSSL_LOCAL void kyber_rsub_reduce(sword16* r, const sword16* a); +WOLFSSL_LOCAL void kyber_to_mont(sword16* p); +WOLFSSL_LOCAL void kyber_sha3_blocksx3_neon(word64* state); +WOLFSSL_LOCAL void kyber_shake128_blocksx3_seed_neon(word64* state, byte* seed); +WOLFSSL_LOCAL void kyber_shake256_blocksx3_seed_neon(word64* state, byte* seed); +WOLFSSL_LOCAL unsigned int kyber_rej_uniform_neon(sword16* p, unsigned int len, + const byte* r, unsigned int rLen); +WOLFSSL_LOCAL int kyber_cmp_neon(const byte* a, const byte* b, int sz); +WOLFSSL_LOCAL void kyber_csubq_neon(sword16* p); +WOLFSSL_LOCAL void kyber_from_msg_neon(sword16* p, const byte* msg); +WOLFSSL_LOCAL void kyber_to_msg_neon(byte* msg, sword16* p); #endif #ifdef __cplusplus
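Editor's note (not part of the patch): the aarch64 matrix generators above repeatedly call kyber_rej_uniform_neon on squeezed SHAKE-128 blocks until KYBER_N coefficients have been accepted. As a cross-check aid, the following is a minimal, portable C sketch of the same rejection-sampling rule in its byte-oriented form, mirroring the generic kyber_rej_uniform_c path that the NEON routine is expected to agree with. The ref_* names and the REF_KYBER_Q macro are hypothetical illustration only; KYBER_Q is 3329 as in the Kyber specification.

#include <stdint.h>

#define REF_KYBER_Q 3329

/* Consume the random stream 3 bytes at a time as two 12-bit candidates and
 * keep only candidates smaller than q. Returns how many coefficients were
 * written to p (at most len). */
static unsigned int ref_rej_uniform(int16_t* p, unsigned int len,
                                    const uint8_t* r, unsigned int rLen)
{
    unsigned int i = 0;
    unsigned int j;

    for (j = 0; (i < len) && (j + 3 <= rLen); j += 3) {
        /* d1 = low 12 bits, d2 = high 12 bits of the 24-bit little-endian
         * value formed by r[j], r[j+1] and r[j+2]. */
        uint16_t v0 = (uint16_t)(((uint16_t)r[j + 0] |
                                  ((uint16_t)r[j + 1] << 8)) & 0xfff);
        uint16_t v1 = (uint16_t)(((uint16_t)(r[j + 1] >> 4) |
                                  ((uint16_t)r[j + 2] << 4)) & 0xfff);

        /* Reject candidates that would not be uniform mod q. */
        if (v0 < REF_KYBER_Q) {
            p[i++] = (int16_t)v0;
        }
        if ((i < len) && (v1 < REF_KYBER_Q)) {
            p[i++] = (int16_t)v1;
        }
    }
    return i;
}

Each 12-bit candidate is accepted with probability 3329/4096 (roughly 81%), which is why the kyber_gen_matrix_k*_aarch64 functions keep squeezing additional XOF blocks in a loop until all KYBER_N (256) coefficients of each polynomial have been filled.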