From 2323a5cf59d67c19895ba04cf959c197e133694e Mon Sep 17 00:00:00 2001 From: Sean Parkinson Date: Thu, 26 Sep 2024 18:43:34 +1000 Subject: [PATCH] ARM32 ChaCha20, Poly1305: assembly code Add assembly code for ChaCha20 and Poly1305 on ARM32 when no NEON available. --- src/include.am | 12 +- wolfcrypt/src/chacha.c | 3 +- wolfcrypt/src/poly1305.c | 11 +- wolfcrypt/src/port/arm/armv8-32-aes-asm_c.c | 2 +- wolfcrypt/src/port/arm/armv8-32-chacha-asm.S | 522 ++++++++++++++++ .../src/port/arm/armv8-32-chacha-asm_c.c | 569 ++++++++++++++++++ .../src/port/arm/armv8-32-poly1305-asm.S | 356 +++++++++++ .../src/port/arm/armv8-32-poly1305-asm_c.c | 388 ++++++++++++ wolfcrypt/src/port/arm/armv8-32-sha3-asm.S | 110 ++-- wolfcrypt/src/port/arm/armv8-32-sha3-asm_c.c | 41 +- wolfcrypt/src/port/arm/armv8-chacha.c | 117 +++- wolfcrypt/src/port/arm/armv8-poly1305.c | 126 +++- wolfssl/wolfcrypt/chacha.h | 8 +- wolfssl/wolfcrypt/poly1305.h | 21 +- 14 files changed, 2177 insertions(+), 109 deletions(-) create mode 100644 wolfcrypt/src/port/arm/armv8-32-chacha-asm.S create mode 100644 wolfcrypt/src/port/arm/armv8-32-chacha-asm_c.c create mode 100644 wolfcrypt/src/port/arm/armv8-32-poly1305-asm.S create mode 100644 wolfcrypt/src/port/arm/armv8-32-poly1305-asm_c.c diff --git a/src/include.am b/src/include.am index c3d8376a1d..dbda409a2f 100644 --- a/src/include.am +++ b/src/include.am @@ -924,8 +924,10 @@ if BUILD_ARMASM src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/armv8-poly1305.c src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/thumb2-poly1305.c if BUILD_ARMASM_INLINE +src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/armv8-32-poly1305-asm_c.c src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/thumb2-poly1305-asm_c.c else +src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/armv8-32-poly1305-asm.S src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/thumb2-poly1305-asm.S endif !BUILD_ARMASM_INLINE endif @@ -999,17 +1001,17 @@ endif if BUILD_CHACHA src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/chacha.c -if BUILD_ARMASM_NEON -src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/armv8-chacha.c -else if BUILD_ARMASM +src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/armv8-chacha.c src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/thumb2-chacha.c if BUILD_ARMASM_INLINE +src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/armv8-32-chacha-asm_c.c src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/thumb2-chacha-asm_c.c else +src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/armv8-32-chacha-asm.S src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/thumb2-chacha-asm.S endif !BUILD_ARMASM_INLINE -endif BUILD_ARMASM +else if BUILD_RISCV_ASM src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/riscv/riscv-64-chacha.c endif BUILD_RISCV_ASM @@ -1018,7 +1020,7 @@ if BUILD_INTELASM src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/chacha_asm.S endif BUILD_INTELASM endif !BUILD_X86_ASM -endif !BUILD_ARMASM_NEON +endif !BUILD_ARMASM if BUILD_POLY1305 src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/chacha20_poly1305.c endif BUILD_POLY1305 diff --git a/wolfcrypt/src/chacha.c b/wolfcrypt/src/chacha.c index f7ee6bba38..84b26eb564 100644 --- a/wolfcrypt/src/chacha.c +++ b/wolfcrypt/src/chacha.c @@ -72,8 +72,7 @@ Public domain. 
#endif /* HAVE_CHACHA */ -#if defined(WOLFSSL_ARMASM) && (!defined(WOLFSSL_ARMASM_NO_NEON) || \ - defined(__thumb__)) +#if defined(WOLFSSL_ARMASM) /* implementation is located in wolfcrypt/src/port/arm/armv8-chacha.c */ #elif defined(WOLFSSL_RISCV_ASM) diff --git a/wolfcrypt/src/poly1305.c b/wolfcrypt/src/poly1305.c index 48529d78c1..718289c4fd 100644 --- a/wolfcrypt/src/poly1305.c +++ b/wolfcrypt/src/poly1305.c @@ -232,7 +232,7 @@ extern void poly1305_final_avx2(Poly1305* ctx, byte* mac); } #endif/* !WOLFSSL_ARMASM && !WOLFSSL_RISCV_ASM */ /* if not 64 bit then use 32 bit */ -#elif !defined(WOLFSSL_ARMASM) || !defined(__thumb__) +#elif !defined(WOLFSSL_ARMASM) static word32 U8TO32(const byte *p) { @@ -269,8 +269,7 @@ static WC_INLINE void u32tole64(const word32 inLe32, byte outLe64[8]) } -#if (!defined(WOLFSSL_ARMASM) || (!defined(__aarch64__) && \ - !defined(__thumb__))) && !defined(WOLFSSL_RISCV_ASM) +#if !defined(WOLFSSL_ARMASM) && !defined(WOLFSSL_RISCV_ASM) /* This local function operates on a message with a given number of bytes with a given ctx pointer to a Poly1305 structure. @@ -789,8 +788,7 @@ int wc_Poly1305Final(Poly1305* ctx, byte* mac) return 0; } -#endif /* (!WOLFSSL_ARMASM || (!__aarch64__ && !__thumb__)) && - * !WOLFSSL_RISCV_ASM */ +#endif /* !WOLFSSL_ARMASM && !WOLFSSL_RISCV_ASM */ int wc_Poly1305Update(Poly1305* ctx, const byte* m, word32 bytes) @@ -885,8 +883,7 @@ int wc_Poly1305Update(Poly1305* ctx, const byte* m, word32 bytes) /* process full blocks */ if (bytes >= POLY1305_BLOCK_SIZE) { size_t want = ((size_t)bytes & ~((size_t)POLY1305_BLOCK_SIZE - 1)); -#if (!defined(WOLFSSL_ARMASM) || (!defined(__aarch64__) && \ - !defined(__thumb__))) && !defined(WOLFSSL_RISCV_ASM) +#if !defined(WOLFSSL_ARMASM) && !defined(WOLFSSL_RISCV_ASM) int ret; ret = poly1305_blocks(ctx, m, want); if (ret != 0) diff --git a/wolfcrypt/src/port/arm/armv8-32-aes-asm_c.c b/wolfcrypt/src/port/arm/armv8-32-aes-asm_c.c index 97edaf4a9b..f8ba89ac09 100644 --- a/wolfcrypt/src/port/arm/armv8-32-aes-asm_c.c +++ b/wolfcrypt/src/port/arm/armv8-32-aes-asm_c.c @@ -411,7 +411,7 @@ void AES_invert_key(unsigned char* ks_p, word32 rounds_p) static const uint32_t L_AES_ARM32_rcon[] = { 0x01000000, 0x02000000, 0x04000000, 0x08000000, 0x10000000, 0x20000000, 0x40000000, 0x80000000, - 0x1b000000, 0x36000000, + 0x1b000000, 0x36000000, }; void AES_set_encrypt_key(const unsigned char* key, word32 len, unsigned char* ks); diff --git a/wolfcrypt/src/port/arm/armv8-32-chacha-asm.S b/wolfcrypt/src/port/arm/armv8-32-chacha-asm.S new file mode 100644 index 0000000000..77ec219081 --- /dev/null +++ b/wolfcrypt/src/port/arm/armv8-32-chacha-asm.S @@ -0,0 +1,522 @@ +/* armv8-32-chacha-asm + * + * Copyright (C) 2006-2024 wolfSSL Inc. + * + * This file is part of wolfSSL. + * + * wolfSSL is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * wolfSSL is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA + */ + +/* Generated using (from wolfssl): + * cd ../scripts + * ruby ./chacha/chacha.rb arm32 ../wolfssl/wolfcrypt/src/port/arm/armv8-32-chacha-asm.S + */ + +#ifdef HAVE_CONFIG_H + #include +#endif /* HAVE_CONFIG_H */ +#include + +#ifdef WOLFSSL_ARMASM +#if !defined(__aarch64__) && defined(__arm__) && !defined(__thumb__) +#ifndef WOLFSSL_ARMASM_INLINE +#ifdef HAVE_CHACHA + .text + .align 4 + .globl wc_chacha_setiv + .type wc_chacha_setiv, %function +wc_chacha_setiv: + push {r4, lr} + add r3, r0, #52 + ldr r4, [r1] + ldr r12, [r1, #4] + ldr lr, [r1, #8] + str r2, [r0, #48] +#ifdef BIG_ENDIAN_ORDER + rev r4, r4 + rev r12, r12 + rev lr, lr +#endif /* BIG_ENDIAN_ORDER */ + stm r3, {r4, r12, lr} + pop {r4, pc} + .size wc_chacha_setiv,.-wc_chacha_setiv + .text + .type L_chacha_arm32_constants, %object + .size L_chacha_arm32_constants, 32 + .align 4 +L_chacha_arm32_constants: + .word 0x61707865 + .word 0x3120646e + .word 0x79622d36 + .word 0x6b206574 + .word 0x61707865 + .word 0x3320646e + .word 0x79622d32 + .word 0x6b206574 + .text + .align 4 + .globl wc_chacha_setkey + .type wc_chacha_setkey, %function +wc_chacha_setkey: + push {r4, r5, lr} + adr r3, L_chacha_arm32_constants + subs r2, r2, #16 + add r3, r3, r2 + # Start state with constants + ldm r3, {r4, r5, r12, lr} + stm r0!, {r4, r5, r12, lr} + # Next is first 16 bytes of key. + ldr r4, [r1] + ldr r5, [r1, #4] + ldr r12, [r1, #8] + ldr lr, [r1, #12] +#ifdef BIG_ENDIAN_ORDER + rev r4, r4 + rev r5, r5 + rev r12, r12 + rev lr, lr +#endif /* BIG_ENDIAN_ORDER */ + stm r0!, {r4, r5, r12, lr} + # Next 16 bytes of key. + beq L_chacha_arm32_setkey_same_keyb_ytes + # Update key pointer for next 16 bytes. + add r1, r1, r2 + ldr r4, [r1] + ldr r5, [r1, #4] + ldr r12, [r1, #8] + ldr lr, [r1, #12] +L_chacha_arm32_setkey_same_keyb_ytes: + stm r0, {r4, r5, r12, lr} + pop {r4, r5, pc} + .size wc_chacha_setkey,.-wc_chacha_setkey +#ifdef WOLFSSL_ARMASM_NO_NEON + .text + .align 4 + .globl wc_chacha_crypt_bytes + .type wc_chacha_crypt_bytes, %function +wc_chacha_crypt_bytes: + push {r4, r5, r6, r7, r8, r9, r10, r11, lr} + sub sp, sp, #52 + mov lr, r0 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + str r0, [sp, #32] + str r1, [sp, #36] +#else + strd r0, r1, [sp, #32] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + str r2, [sp, #40] + str r3, [sp, #44] +#else + strd r2, r3, [sp, #40] +#endif +L_chacha_arm32_crypt_block: + # Put x[12]..x[15] onto stack. +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r4, [lr, #48] + ldr r5, [lr, #52] +#else + ldrd r4, r5, [lr, #48] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r6, [lr, #56] + ldr r7, [lr, #60] +#else + ldrd r6, r7, [lr, #56] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + str r4, [sp, #16] + str r5, [sp, #20] +#else + strd r4, r5, [sp, #16] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + str r6, [sp, #24] + str r7, [sp, #28] +#else + strd r6, r7, [sp, #24] +#endif + # Load x[0]..x[12] into registers. + ldm lr, {r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12} + # 10x 2 full rounds to perform. 
+ mov lr, #10 + str lr, [sp, #48] +L_chacha_arm32_crypt_loop: + # 0, 4, 8, 12 + # 1, 5, 9, 13 + ldr lr, [sp, #20] + add r0, r0, r4 + add r1, r1, r5 + eor r12, r12, r0 + eor lr, lr, r1 + ror r12, r12, #16 + ror lr, lr, #16 + add r8, r8, r12 + add r9, r9, lr + eor r4, r4, r8 + eor r5, r5, r9 + ror r4, r4, #20 + ror r5, r5, #20 + add r0, r0, r4 + add r1, r1, r5 + eor r12, r12, r0 + eor lr, lr, r1 + ror r12, r12, #24 + ror lr, lr, #24 + add r8, r8, r12 + add r9, r9, lr + eor r4, r4, r8 + eor r5, r5, r9 + ror r4, r4, #25 + ror r5, r5, #25 + str r12, [sp, #16] + str lr, [sp, #20] + # 2, 6, 10, 14 + # 3, 7, 11, 15 + ldr r12, [sp, #24] + ldr lr, [sp, #28] + add r2, r2, r6 + add r3, r3, r7 + eor r12, r12, r2 + eor lr, lr, r3 + ror r12, r12, #16 + ror lr, lr, #16 + add r10, r10, r12 + add r11, r11, lr + eor r6, r6, r10 + eor r7, r7, r11 + ror r6, r6, #20 + ror r7, r7, #20 + add r2, r2, r6 + add r3, r3, r7 + eor r12, r12, r2 + eor lr, lr, r3 + ror r12, r12, #24 + ror lr, lr, #24 + add r10, r10, r12 + add r11, r11, lr + eor r6, r6, r10 + eor r7, r7, r11 + ror r6, r6, #25 + ror r7, r7, #25 + # 3, 4, 9, 14 + # 0, 5, 10, 15 + add r3, r3, r4 + add r0, r0, r5 + eor r12, r12, r3 + eor lr, lr, r0 + ror r12, r12, #16 + ror lr, lr, #16 + add r9, r9, r12 + add r10, r10, lr + eor r4, r4, r9 + eor r5, r5, r10 + ror r4, r4, #20 + ror r5, r5, #20 + add r3, r3, r4 + add r0, r0, r5 + eor r12, r12, r3 + eor lr, lr, r0 + ror r12, r12, #24 + ror lr, lr, #24 + add r9, r9, r12 + add r10, r10, lr + eor r4, r4, r9 + eor r5, r5, r10 + ror r4, r4, #25 + ror r5, r5, #25 + str r12, [sp, #24] + str lr, [sp, #28] + ldr r12, [sp, #16] + ldr lr, [sp, #20] + # 1, 6, 11, 12 + # 2, 7, 8, 13 + add r1, r1, r6 + add r2, r2, r7 + eor r12, r12, r1 + eor lr, lr, r2 + ror r12, r12, #16 + ror lr, lr, #16 + add r11, r11, r12 + add r8, r8, lr + eor r6, r6, r11 + eor r7, r7, r8 + ror r6, r6, #20 + ror r7, r7, #20 + add r1, r1, r6 + add r2, r2, r7 + eor r12, r12, r1 + eor lr, lr, r2 + ror r12, r12, #24 + ror lr, lr, #24 + add r11, r11, r12 + add r8, r8, lr + eor r6, r6, r11 + eor r7, r7, r8 + ror r6, r6, #25 + ror r7, r7, #25 + str lr, [sp, #20] + # Check if we have done enough rounds. + ldr lr, [sp, #48] + subs lr, lr, #1 + str lr, [sp, #48] + bgt L_chacha_arm32_crypt_loop + stm sp, {r8, r9, r10, r11, r12} + ldr lr, [sp, #32] + mov r12, sp + # Add in original state + ldm lr!, {r8, r9, r10, r11} + add r0, r0, r8 + add r1, r1, r9 + add r2, r2, r10 + add r3, r3, r11 + ldm lr!, {r8, r9, r10, r11} + add r4, r4, r8 + add r5, r5, r9 + add r6, r6, r10 + add r7, r7, r11 + ldm r12, {r8, r9} + ldm lr!, {r10, r11} + add r8, r8, r10 + add r9, r9, r11 + stm r12!, {r8, r9} + ldm r12, {r8, r9} + ldm lr!, {r10, r11} + add r8, r8, r10 + add r9, r9, r11 + stm r12!, {r8, r9} + ldm r12, {r8, r9} + ldm lr!, {r10, r11} + add r8, r8, r10 + add r9, r9, r11 + add r10, r10, #1 + stm r12!, {r8, r9} + str r10, [lr, #-8] + ldm r12, {r8, r9} + ldm lr, {r10, r11} + add r8, r8, r10 + add r9, r9, r11 + stm r12, {r8, r9} + ldr r12, [sp, #44] + cmp r12, #0x40 + blt L_chacha_arm32_crypt_lt_block + ldr r12, [sp, #40] + ldr lr, [sp, #36] + # XOR state into 64 bytes. 
+ ldr r8, [r12] + ldr r9, [r12, #4] + ldr r10, [r12, #8] + ldr r11, [r12, #12] + eor r0, r0, r8 + eor r1, r1, r9 + eor r2, r2, r10 + eor r3, r3, r11 + str r0, [lr] + str r1, [lr, #4] + str r2, [lr, #8] + str r3, [lr, #12] + ldr r8, [r12, #16] + ldr r9, [r12, #20] + ldr r10, [r12, #24] + ldr r11, [r12, #28] + eor r4, r4, r8 + eor r5, r5, r9 + eor r6, r6, r10 + eor r7, r7, r11 + str r4, [lr, #16] + str r5, [lr, #20] + str r6, [lr, #24] + str r7, [lr, #28] + ldr r4, [sp] + ldr r5, [sp, #4] + ldr r6, [sp, #8] + ldr r7, [sp, #12] + ldr r8, [r12, #32] + ldr r9, [r12, #36] + ldr r10, [r12, #40] + ldr r11, [r12, #44] + eor r4, r4, r8 + eor r5, r5, r9 + eor r6, r6, r10 + eor r7, r7, r11 + str r4, [lr, #32] + str r5, [lr, #36] + str r6, [lr, #40] + str r7, [lr, #44] + ldr r4, [sp, #16] + ldr r5, [sp, #20] + ldr r6, [sp, #24] + ldr r7, [sp, #28] + ldr r8, [r12, #48] + ldr r9, [r12, #52] + ldr r10, [r12, #56] + ldr r11, [r12, #60] + eor r4, r4, r8 + eor r5, r5, r9 + eor r6, r6, r10 + eor r7, r7, r11 + str r4, [lr, #48] + str r5, [lr, #52] + str r6, [lr, #56] + str r7, [lr, #60] + ldr r3, [sp, #44] + add r12, r12, #0x40 + add lr, lr, #0x40 + str r12, [sp, #40] + str lr, [sp, #36] + subs r3, r3, #0x40 + ldr lr, [sp, #32] + str r3, [sp, #44] + bne L_chacha_arm32_crypt_block + b L_chacha_arm32_crypt_done +L_chacha_arm32_crypt_lt_block: + # Store in over field of ChaCha. + ldr lr, [sp, #32] + add r12, lr, #0x44 + stm r12!, {r0, r1, r2, r3, r4, r5, r6, r7} + ldm sp, {r0, r1, r2, r3, r4, r5, r6, r7} + stm r12, {r0, r1, r2, r3, r4, r5, r6, r7} +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r2, [sp, #40] + ldr r3, [sp, #44] +#else + ldrd r2, r3, [sp, #40] +#endif + ldr r1, [sp, #36] + rsb r12, r3, #0x40 + str r12, [lr, #64] + add lr, lr, #0x44 +L_chacha_arm32_crypt_16byte_loop: + cmp r3, #16 + blt L_chacha_arm32_crypt_word_loop + # 16 bytes of state XORed into message. + ldm lr!, {r4, r5, r6, r7} + ldr r8, [r2] + ldr r9, [r2, #4] + ldr r10, [r2, #8] + ldr r11, [r2, #12] + eor r8, r8, r4 + eor r9, r9, r5 + eor r10, r10, r6 + eor r11, r11, r7 + subs r3, r3, #16 + str r8, [r1] + str r9, [r1, #4] + str r10, [r1, #8] + str r11, [r1, #12] + beq L_chacha_arm32_crypt_done + add r2, r2, #16 + add r1, r1, #16 + b L_chacha_arm32_crypt_16byte_loop +L_chacha_arm32_crypt_word_loop: + cmp r3, #4 + blt L_chacha_arm32_crypt_byte_start + # 4 bytes of state XORed into message. + ldr r4, [lr] + ldr r8, [r2] + eor r8, r8, r4 + subs r3, r3, #4 + str r8, [r1] + beq L_chacha_arm32_crypt_done + add lr, lr, #4 + add r2, r2, #4 + add r1, r1, #4 + b L_chacha_arm32_crypt_word_loop +L_chacha_arm32_crypt_byte_start: + ldr r4, [lr] +L_chacha_arm32_crypt_byte_loop: + ldrb r8, [r2] + eor r8, r8, r4 + subs r3, r3, #1 + strb r8, [r1] + beq L_chacha_arm32_crypt_done + lsr r4, r4, #8 + add r2, r2, #1 + add r1, r1, #1 + b L_chacha_arm32_crypt_byte_loop +L_chacha_arm32_crypt_done: + add sp, sp, #52 + pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} + .size wc_chacha_crypt_bytes,.-wc_chacha_crypt_bytes + .text + .align 4 + .globl wc_chacha_use_over + .type wc_chacha_use_over, %function +wc_chacha_use_over: + push {r4, r5, r6, r7, r8, r9, lr} +L_chacha_arm32_over_16byte_loop: + cmp r3, #16 + blt L_chacha_arm32_over_word_loop + # 16 bytes of state XORed into message. 
+ ldr r12, [r0] + ldr lr, [r0, #4] + ldr r4, [r0, #8] + ldr r5, [r0, #12] + ldr r6, [r2] + ldr r7, [r2, #4] + ldr r8, [r2, #8] + ldr r9, [r2, #12] + eor r12, r12, r6 + eor lr, lr, r7 + eor r4, r4, r8 + eor r5, r5, r9 + subs r3, r3, #16 + str r12, [r1] + str lr, [r1, #4] + str r4, [r1, #8] + str r5, [r1, #12] + beq L_chacha_arm32_over_done + add r0, r0, #16 + add r2, r2, #16 + add r1, r1, #16 + b L_chacha_arm32_over_16byte_loop +L_chacha_arm32_over_word_loop: + cmp r3, #4 + blt L_chacha_arm32_over_byte_loop + # 4 bytes of state XORed into message. + ldr r12, [r0] + ldr r6, [r2] + eor r12, r12, r6 + subs r3, r3, #4 + str r12, [r1] + beq L_chacha_arm32_over_done + add r0, r0, #4 + add r2, r2, #4 + add r1, r1, #4 + b L_chacha_arm32_over_word_loop +L_chacha_arm32_over_byte_loop: + # 4 bytes of state XORed into message. + ldrb r12, [r0] + ldrb r6, [r2] + eor r12, r12, r6 + subs r3, r3, #1 + strb r12, [r1] + beq L_chacha_arm32_over_done + add r0, r0, #1 + add r2, r2, #1 + add r1, r1, #1 + b L_chacha_arm32_over_byte_loop +L_chacha_arm32_over_done: + pop {r4, r5, r6, r7, r8, r9, pc} + .size wc_chacha_use_over,.-wc_chacha_use_over +#endif /* WOLFSSL_ARMASM_NO_NEON */ +#endif /* HAVE_CHACHA */ +#endif /* !__aarch64__ && __arm__ && !__thumb__ */ +#endif /* WOLFSSL_ARMASM */ + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif +#endif /* !WOLFSSL_ARMASM_INLINE */ diff --git a/wolfcrypt/src/port/arm/armv8-32-chacha-asm_c.c b/wolfcrypt/src/port/arm/armv8-32-chacha-asm_c.c new file mode 100644 index 0000000000..8c80fc4ad9 --- /dev/null +++ b/wolfcrypt/src/port/arm/armv8-32-chacha-asm_c.c @@ -0,0 +1,569 @@ +/* armv8-32-chacha-asm + * + * Copyright (C) 2006-2024 wolfSSL Inc. + * + * This file is part of wolfSSL. + * + * wolfSSL is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * wolfSSL is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA + */ + +/* Generated using (from wolfssl): + * cd ../scripts + * ruby ./chacha/chacha.rb arm32 ../wolfssl/wolfcrypt/src/port/arm/armv8-32-chacha-asm.c + */ + +#ifdef HAVE_CONFIG_H + #include +#endif /* HAVE_CONFIG_H */ +#include +#include + +#ifdef WOLFSSL_ARMASM +#if !defined(__aarch64__) && defined(__arm__) && !defined(__thumb__) +#include +#ifdef HAVE_CONFIG_H + #include +#endif /* HAVE_CONFIG_H */ +#include +#include +#ifdef WOLFSSL_ARMASM_INLINE + +#ifdef WOLFSSL_ARMASM +#if !defined(__aarch64__) && defined(__arm__) && !defined(__thumb__) + +#ifdef __IAR_SYSTEMS_ICC__ +#define __asm__ asm +#define __volatile__ volatile +#endif /* __IAR_SYSTEMS_ICC__ */ +#ifdef __KEIL__ +#define __asm__ __asm +#define __volatile__ volatile +#endif /* __KEIL__ */ +#ifdef HAVE_CHACHA +#include + +void wc_chacha_setiv(word32* x_p, const byte* iv_p, word32 counter_p) +{ + register word32* x asm ("r0") = (word32*)x_p; + register const byte* iv asm ("r1") = (const byte*)iv_p; + register word32 counter asm ("r2") = (word32)counter_p; + + __asm__ __volatile__ ( + "add r3, %[x], #52\n\t" + "ldr r4, [%[iv]]\n\t" + "ldr r12, [%[iv], #4]\n\t" + "ldr lr, [%[iv], #8]\n\t" + "str %[counter], [%[x], #48]\n\t" +#ifdef BIG_ENDIAN_ORDER + "rev r4, r4\n\t" + "rev r12, r12\n\t" + "rev lr, lr\n\t" +#endif /* BIG_ENDIAN_ORDER */ + "stm r3, {r4, r12, lr}\n\t" + : [x] "+r" (x), [iv] "+r" (iv), [counter] "+r" (counter) + : + : "memory", "r3", "r12", "lr", "r4", "cc" + ); +} + +static const uint32_t L_chacha_arm32_constants[] = { + 0x61707865, 0x3120646e, 0x79622d36, 0x6b206574, + 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574, +}; + +void wc_chacha_setkey(word32* x_p, const byte* key_p, word32 keySz_p) +{ + register word32* x asm ("r0") = (word32*)x_p; + register const byte* key asm ("r1") = (const byte*)key_p; + register word32 keySz asm ("r2") = (word32)keySz_p; + register uint32_t* L_chacha_arm32_constants_c asm ("r3") = (uint32_t*)&L_chacha_arm32_constants; + + __asm__ __volatile__ ( + "subs %[keySz], %[keySz], #16\n\t" + "add r3, r3, %[keySz]\n\t" + /* Start state with constants */ + "ldm r3, {r4, r5, r12, lr}\n\t" + "stm %[x]!, {r4, r5, r12, lr}\n\t" + /* Next is first 16 bytes of key. */ + "ldr r4, [%[key]]\n\t" + "ldr r5, [%[key], #4]\n\t" + "ldr r12, [%[key], #8]\n\t" + "ldr lr, [%[key], #12]\n\t" +#ifdef BIG_ENDIAN_ORDER + "rev r4, r4\n\t" + "rev r5, r5\n\t" + "rev r12, r12\n\t" + "rev lr, lr\n\t" +#endif /* BIG_ENDIAN_ORDER */ + "stm %[x]!, {r4, r5, r12, lr}\n\t" + /* Next 16 bytes of key. */ + "beq L_chacha_arm32_setkey_same_keyb_ytes_%=\n\t" + /* Update key pointer for next 16 bytes. 
*/ + "add %[key], %[key], %[keySz]\n\t" + "ldr r4, [%[key]]\n\t" + "ldr r5, [%[key], #4]\n\t" + "ldr r12, [%[key], #8]\n\t" + "ldr lr, [%[key], #12]\n\t" + "\n" + "L_chacha_arm32_setkey_same_keyb_ytes_%=: \n\t" + "stm %[x], {r4, r5, r12, lr}\n\t" + : [x] "+r" (x), [key] "+r" (key), [keySz] "+r" (keySz), [L_chacha_arm32_constants] "+r" (L_chacha_arm32_constants_c) + : + : "memory", "r12", "lr", "r4", "r5", "cc" + ); +} + +#ifdef WOLFSSL_ARMASM_NO_NEON +void wc_chacha_crypt_bytes(ChaCha* ctx_p, byte* c_p, const byte* m_p, word32 len_p) +{ + register ChaCha* ctx asm ("r0") = (ChaCha*)ctx_p; + register byte* c asm ("r1") = (byte*)c_p; + register const byte* m asm ("r2") = (const byte*)m_p; + register word32 len asm ("r3") = (word32)len_p; + + __asm__ __volatile__ ( + "sub sp, sp, #52\n\t" + "mov lr, %[ctx]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "str %[ctx], [sp, #32]\n\t" + "str %[c], [sp, #36]\n\t" +#else + "strd %[ctx], %[c], [sp, #32]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "str %[m], [sp, #40]\n\t" + "str %[len], [sp, #44]\n\t" +#else + "strd %[m], %[len], [sp, #40]\n\t" +#endif + "\n" + "L_chacha_arm32_crypt_block_%=: \n\t" + /* Put x[12]..x[15] onto stack. */ +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r4, [lr, #48]\n\t" + "ldr r5, [lr, #52]\n\t" +#else + "ldrd r4, r5, [lr, #48]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r6, [lr, #56]\n\t" + "ldr r7, [lr, #60]\n\t" +#else + "ldrd r6, r7, [lr, #56]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "str r4, [sp, #16]\n\t" + "str r5, [sp, #20]\n\t" +#else + "strd r4, r5, [sp, #16]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "str r6, [sp, #24]\n\t" + "str r7, [sp, #28]\n\t" +#else + "strd r6, r7, [sp, #24]\n\t" +#endif + /* Load x[0]..x[12] into registers. */ + "ldm lr, {%[ctx], %[c], %[m], %[len], r4, r5, r6, r7, r8, r9, r10, r11, r12}\n\t" + /* 10x 2 full rounds to perform. 
*/ + "mov lr, #10\n\t" + "str lr, [sp, #48]\n\t" + "\n" + "L_chacha_arm32_crypt_loop_%=: \n\t" + /* 0, 4, 8, 12 */ + /* 1, 5, 9, 13 */ + "ldr lr, [sp, #20]\n\t" + "add %[ctx], %[ctx], r4\n\t" + "add %[c], %[c], r5\n\t" + "eor r12, r12, %[ctx]\n\t" + "eor lr, lr, %[c]\n\t" + "ror r12, r12, #16\n\t" + "ror lr, lr, #16\n\t" + "add r8, r8, r12\n\t" + "add r9, r9, lr\n\t" + "eor r4, r4, r8\n\t" + "eor r5, r5, r9\n\t" + "ror r4, r4, #20\n\t" + "ror r5, r5, #20\n\t" + "add %[ctx], %[ctx], r4\n\t" + "add %[c], %[c], r5\n\t" + "eor r12, r12, %[ctx]\n\t" + "eor lr, lr, %[c]\n\t" + "ror r12, r12, #24\n\t" + "ror lr, lr, #24\n\t" + "add r8, r8, r12\n\t" + "add r9, r9, lr\n\t" + "eor r4, r4, r8\n\t" + "eor r5, r5, r9\n\t" + "ror r4, r4, #25\n\t" + "ror r5, r5, #25\n\t" + "str r12, [sp, #16]\n\t" + "str lr, [sp, #20]\n\t" + /* 2, 6, 10, 14 */ + /* 3, 7, 11, 15 */ + "ldr r12, [sp, #24]\n\t" + "ldr lr, [sp, #28]\n\t" + "add %[m], %[m], r6\n\t" + "add %[len], %[len], r7\n\t" + "eor r12, r12, %[m]\n\t" + "eor lr, lr, %[len]\n\t" + "ror r12, r12, #16\n\t" + "ror lr, lr, #16\n\t" + "add r10, r10, r12\n\t" + "add r11, r11, lr\n\t" + "eor r6, r6, r10\n\t" + "eor r7, r7, r11\n\t" + "ror r6, r6, #20\n\t" + "ror r7, r7, #20\n\t" + "add %[m], %[m], r6\n\t" + "add %[len], %[len], r7\n\t" + "eor r12, r12, %[m]\n\t" + "eor lr, lr, %[len]\n\t" + "ror r12, r12, #24\n\t" + "ror lr, lr, #24\n\t" + "add r10, r10, r12\n\t" + "add r11, r11, lr\n\t" + "eor r6, r6, r10\n\t" + "eor r7, r7, r11\n\t" + "ror r6, r6, #25\n\t" + "ror r7, r7, #25\n\t" + /* 3, 4, 9, 14 */ + /* 0, 5, 10, 15 */ + "add %[len], %[len], r4\n\t" + "add %[ctx], %[ctx], r5\n\t" + "eor r12, r12, %[len]\n\t" + "eor lr, lr, %[ctx]\n\t" + "ror r12, r12, #16\n\t" + "ror lr, lr, #16\n\t" + "add r9, r9, r12\n\t" + "add r10, r10, lr\n\t" + "eor r4, r4, r9\n\t" + "eor r5, r5, r10\n\t" + "ror r4, r4, #20\n\t" + "ror r5, r5, #20\n\t" + "add %[len], %[len], r4\n\t" + "add %[ctx], %[ctx], r5\n\t" + "eor r12, r12, %[len]\n\t" + "eor lr, lr, %[ctx]\n\t" + "ror r12, r12, #24\n\t" + "ror lr, lr, #24\n\t" + "add r9, r9, r12\n\t" + "add r10, r10, lr\n\t" + "eor r4, r4, r9\n\t" + "eor r5, r5, r10\n\t" + "ror r4, r4, #25\n\t" + "ror r5, r5, #25\n\t" + "str r12, [sp, #24]\n\t" + "str lr, [sp, #28]\n\t" + "ldr r12, [sp, #16]\n\t" + "ldr lr, [sp, #20]\n\t" + /* 1, 6, 11, 12 */ + /* 2, 7, 8, 13 */ + "add %[c], %[c], r6\n\t" + "add %[m], %[m], r7\n\t" + "eor r12, r12, %[c]\n\t" + "eor lr, lr, %[m]\n\t" + "ror r12, r12, #16\n\t" + "ror lr, lr, #16\n\t" + "add r11, r11, r12\n\t" + "add r8, r8, lr\n\t" + "eor r6, r6, r11\n\t" + "eor r7, r7, r8\n\t" + "ror r6, r6, #20\n\t" + "ror r7, r7, #20\n\t" + "add %[c], %[c], r6\n\t" + "add %[m], %[m], r7\n\t" + "eor r12, r12, %[c]\n\t" + "eor lr, lr, %[m]\n\t" + "ror r12, r12, #24\n\t" + "ror lr, lr, #24\n\t" + "add r11, r11, r12\n\t" + "add r8, r8, lr\n\t" + "eor r6, r6, r11\n\t" + "eor r7, r7, r8\n\t" + "ror r6, r6, #25\n\t" + "ror r7, r7, #25\n\t" + "str lr, [sp, #20]\n\t" + /* Check if we have done enough rounds. 
*/ + "ldr lr, [sp, #48]\n\t" + "subs lr, lr, #1\n\t" + "str lr, [sp, #48]\n\t" + "bgt L_chacha_arm32_crypt_loop_%=\n\t" + "stm sp, {r8, r9, r10, r11, r12}\n\t" + "ldr lr, [sp, #32]\n\t" + "mov r12, sp\n\t" + /* Add in original state */ + "ldm lr!, {r8, r9, r10, r11}\n\t" + "add %[ctx], %[ctx], r8\n\t" + "add %[c], %[c], r9\n\t" + "add %[m], %[m], r10\n\t" + "add %[len], %[len], r11\n\t" + "ldm lr!, {r8, r9, r10, r11}\n\t" + "add r4, r4, r8\n\t" + "add r5, r5, r9\n\t" + "add r6, r6, r10\n\t" + "add r7, r7, r11\n\t" + "ldm r12, {r8, r9}\n\t" + "ldm lr!, {r10, r11}\n\t" + "add r8, r8, r10\n\t" + "add r9, r9, r11\n\t" + "stm r12!, {r8, r9}\n\t" + "ldm r12, {r8, r9}\n\t" + "ldm lr!, {r10, r11}\n\t" + "add r8, r8, r10\n\t" + "add r9, r9, r11\n\t" + "stm r12!, {r8, r9}\n\t" + "ldm r12, {r8, r9}\n\t" + "ldm lr!, {r10, r11}\n\t" + "add r8, r8, r10\n\t" + "add r9, r9, r11\n\t" + "add r10, r10, #1\n\t" + "stm r12!, {r8, r9}\n\t" + "str r10, [lr, #-8]\n\t" + "ldm r12, {r8, r9}\n\t" + "ldm lr, {r10, r11}\n\t" + "add r8, r8, r10\n\t" + "add r9, r9, r11\n\t" + "stm r12, {r8, r9}\n\t" + "ldr r12, [sp, #44]\n\t" + "cmp r12, #0x40\n\t" + "blt L_chacha_arm32_crypt_lt_block_%=\n\t" + "ldr r12, [sp, #40]\n\t" + "ldr lr, [sp, #36]\n\t" + /* XOR state into 64 bytes. */ + "ldr r8, [r12]\n\t" + "ldr r9, [r12, #4]\n\t" + "ldr r10, [r12, #8]\n\t" + "ldr r11, [r12, #12]\n\t" + "eor %[ctx], %[ctx], r8\n\t" + "eor %[c], %[c], r9\n\t" + "eor %[m], %[m], r10\n\t" + "eor %[len], %[len], r11\n\t" + "str %[ctx], [lr]\n\t" + "str %[c], [lr, #4]\n\t" + "str %[m], [lr, #8]\n\t" + "str %[len], [lr, #12]\n\t" + "ldr r8, [r12, #16]\n\t" + "ldr r9, [r12, #20]\n\t" + "ldr r10, [r12, #24]\n\t" + "ldr r11, [r12, #28]\n\t" + "eor r4, r4, r8\n\t" + "eor r5, r5, r9\n\t" + "eor r6, r6, r10\n\t" + "eor r7, r7, r11\n\t" + "str r4, [lr, #16]\n\t" + "str r5, [lr, #20]\n\t" + "str r6, [lr, #24]\n\t" + "str r7, [lr, #28]\n\t" + "ldr r4, [sp]\n\t" + "ldr r5, [sp, #4]\n\t" + "ldr r6, [sp, #8]\n\t" + "ldr r7, [sp, #12]\n\t" + "ldr r8, [r12, #32]\n\t" + "ldr r9, [r12, #36]\n\t" + "ldr r10, [r12, #40]\n\t" + "ldr r11, [r12, #44]\n\t" + "eor r4, r4, r8\n\t" + "eor r5, r5, r9\n\t" + "eor r6, r6, r10\n\t" + "eor r7, r7, r11\n\t" + "str r4, [lr, #32]\n\t" + "str r5, [lr, #36]\n\t" + "str r6, [lr, #40]\n\t" + "str r7, [lr, #44]\n\t" + "ldr r4, [sp, #16]\n\t" + "ldr r5, [sp, #20]\n\t" + "ldr r6, [sp, #24]\n\t" + "ldr r7, [sp, #28]\n\t" + "ldr r8, [r12, #48]\n\t" + "ldr r9, [r12, #52]\n\t" + "ldr r10, [r12, #56]\n\t" + "ldr r11, [r12, #60]\n\t" + "eor r4, r4, r8\n\t" + "eor r5, r5, r9\n\t" + "eor r6, r6, r10\n\t" + "eor r7, r7, r11\n\t" + "str r4, [lr, #48]\n\t" + "str r5, [lr, #52]\n\t" + "str r6, [lr, #56]\n\t" + "str r7, [lr, #60]\n\t" + "ldr %[len], [sp, #44]\n\t" + "add r12, r12, #0x40\n\t" + "add lr, lr, #0x40\n\t" + "str r12, [sp, #40]\n\t" + "str lr, [sp, #36]\n\t" + "subs %[len], %[len], #0x40\n\t" + "ldr lr, [sp, #32]\n\t" + "str %[len], [sp, #44]\n\t" + "bne L_chacha_arm32_crypt_block_%=\n\t" + "b L_chacha_arm32_crypt_done_%=\n\t" + "\n" + "L_chacha_arm32_crypt_lt_block_%=: \n\t" + /* Store in over field of ChaCha. 
*/ + "ldr lr, [sp, #32]\n\t" + "add r12, lr, #0x44\n\t" + "stm r12!, {%[ctx], %[c], %[m], %[len], r4, r5, r6, r7}\n\t" + "ldm sp, {%[ctx], %[c], %[m], %[len], r4, r5, r6, r7}\n\t" + "stm r12, {%[ctx], %[c], %[m], %[len], r4, r5, r6, r7}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr %[m], [sp, #40]\n\t" + "ldr %[len], [sp, #44]\n\t" +#else + "ldrd %[m], %[len], [sp, #40]\n\t" +#endif + "ldr %[c], [sp, #36]\n\t" + "rsb r12, %[len], #0x40\n\t" + "str r12, [lr, #64]\n\t" + "add lr, lr, #0x44\n\t" + "\n" + "L_chacha_arm32_crypt_16byte_loop_%=: \n\t" + "cmp %[len], #16\n\t" + "blt L_chacha_arm32_crypt_word_loop_%=\n\t" + /* 16 bytes of state XORed into message. */ + "ldm lr!, {r4, r5, r6, r7}\n\t" + "ldr r8, [%[m]]\n\t" + "ldr r9, [%[m], #4]\n\t" + "ldr r10, [%[m], #8]\n\t" + "ldr r11, [%[m], #12]\n\t" + "eor r8, r8, r4\n\t" + "eor r9, r9, r5\n\t" + "eor r10, r10, r6\n\t" + "eor r11, r11, r7\n\t" + "subs %[len], %[len], #16\n\t" + "str r8, [%[c]]\n\t" + "str r9, [%[c], #4]\n\t" + "str r10, [%[c], #8]\n\t" + "str r11, [%[c], #12]\n\t" + "beq L_chacha_arm32_crypt_done_%=\n\t" + "add %[m], %[m], #16\n\t" + "add %[c], %[c], #16\n\t" + "b L_chacha_arm32_crypt_16byte_loop_%=\n\t" + "\n" + "L_chacha_arm32_crypt_word_loop_%=: \n\t" + "cmp %[len], #4\n\t" + "blt L_chacha_arm32_crypt_byte_start_%=\n\t" + /* 4 bytes of state XORed into message. */ + "ldr r4, [lr]\n\t" + "ldr r8, [%[m]]\n\t" + "eor r8, r8, r4\n\t" + "subs %[len], %[len], #4\n\t" + "str r8, [%[c]]\n\t" + "beq L_chacha_arm32_crypt_done_%=\n\t" + "add lr, lr, #4\n\t" + "add %[m], %[m], #4\n\t" + "add %[c], %[c], #4\n\t" + "b L_chacha_arm32_crypt_word_loop_%=\n\t" + "\n" + "L_chacha_arm32_crypt_byte_start_%=: \n\t" + "ldr r4, [lr]\n\t" + "\n" + "L_chacha_arm32_crypt_byte_loop_%=: \n\t" + "ldrb r8, [%[m]]\n\t" + "eor r8, r8, r4\n\t" + "subs %[len], %[len], #1\n\t" + "strb r8, [%[c]]\n\t" + "beq L_chacha_arm32_crypt_done_%=\n\t" + "lsr r4, r4, #8\n\t" + "add %[m], %[m], #1\n\t" + "add %[c], %[c], #1\n\t" + "b L_chacha_arm32_crypt_byte_loop_%=\n\t" + "\n" + "L_chacha_arm32_crypt_done_%=: \n\t" + "add sp, sp, #52\n\t" + : [ctx] "+r" (ctx), [c] "+r" (c), [m] "+r" (m), [len] "+r" (len) + : + : "memory", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "cc" + ); +} + +void wc_chacha_use_over(byte* over_p, byte* output_p, const byte* input_p, word32 len_p) +{ + register byte* over asm ("r0") = (byte*)over_p; + register byte* output asm ("r1") = (byte*)output_p; + register const byte* input asm ("r2") = (const byte*)input_p; + register word32 len asm ("r3") = (word32)len_p; + + __asm__ __volatile__ ( + "\n" + "L_chacha_arm32_over_16byte_loop_%=: \n\t" + "cmp %[len], #16\n\t" + "blt L_chacha_arm32_over_word_loop_%=\n\t" + /* 16 bytes of state XORed into message. 
*/ + "ldr r12, [%[over]]\n\t" + "ldr lr, [%[over], #4]\n\t" + "ldr r4, [%[over], #8]\n\t" + "ldr r5, [%[over], #12]\n\t" + "ldr r6, [%[input]]\n\t" + "ldr r7, [%[input], #4]\n\t" + "ldr r8, [%[input], #8]\n\t" + "ldr r9, [%[input], #12]\n\t" + "eor r12, r12, r6\n\t" + "eor lr, lr, r7\n\t" + "eor r4, r4, r8\n\t" + "eor r5, r5, r9\n\t" + "subs %[len], %[len], #16\n\t" + "str r12, [%[output]]\n\t" + "str lr, [%[output], #4]\n\t" + "str r4, [%[output], #8]\n\t" + "str r5, [%[output], #12]\n\t" + "beq L_chacha_arm32_over_done_%=\n\t" + "add %[over], %[over], #16\n\t" + "add %[input], %[input], #16\n\t" + "add %[output], %[output], #16\n\t" + "b L_chacha_arm32_over_16byte_loop_%=\n\t" + "\n" + "L_chacha_arm32_over_word_loop_%=: \n\t" + "cmp %[len], #4\n\t" + "blt L_chacha_arm32_over_byte_loop_%=\n\t" + /* 4 bytes of state XORed into message. */ + "ldr r12, [%[over]]\n\t" + "ldr r6, [%[input]]\n\t" + "eor r12, r12, r6\n\t" + "subs %[len], %[len], #4\n\t" + "str r12, [%[output]]\n\t" + "beq L_chacha_arm32_over_done_%=\n\t" + "add %[over], %[over], #4\n\t" + "add %[input], %[input], #4\n\t" + "add %[output], %[output], #4\n\t" + "b L_chacha_arm32_over_word_loop_%=\n\t" + "\n" + "L_chacha_arm32_over_byte_loop_%=: \n\t" + /* 4 bytes of state XORed into message. */ + "ldrb r12, [%[over]]\n\t" + "ldrb r6, [%[input]]\n\t" + "eor r12, r12, r6\n\t" + "subs %[len], %[len], #1\n\t" + "strb r12, [%[output]]\n\t" + "beq L_chacha_arm32_over_done_%=\n\t" + "add %[over], %[over], #1\n\t" + "add %[input], %[input], #1\n\t" + "add %[output], %[output], #1\n\t" + "b L_chacha_arm32_over_byte_loop_%=\n\t" + "\n" + "L_chacha_arm32_over_done_%=: \n\t" + : [over] "+r" (over), [output] "+r" (output), [input] "+r" (input), [len] "+r" (len) + : + : "memory", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "cc" + ); +} + +#endif /* WOLFSSL_ARMASM_NO_NEON */ +#endif /* HAVE_CHACHA */ +#endif /* !__aarch64__ && __arm__ && !__thumb__ */ +#endif /* WOLFSSL_ARMASM */ +#endif /* !defined(__aarch64__) && defined(__arm__) && !defined(__thumb__) */ +#endif /* WOLFSSL_ARMASM */ + +#endif /* WOLFSSL_ARMASM_INLINE */ diff --git a/wolfcrypt/src/port/arm/armv8-32-poly1305-asm.S b/wolfcrypt/src/port/arm/armv8-32-poly1305-asm.S new file mode 100644 index 0000000000..ffbd7b2705 --- /dev/null +++ b/wolfcrypt/src/port/arm/armv8-32-poly1305-asm.S @@ -0,0 +1,356 @@ +/* armv8-32-poly1305-asm + * + * Copyright (C) 2006-2024 wolfSSL Inc. + * + * This file is part of wolfSSL. + * + * wolfSSL is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * wolfSSL is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA + */ + +/* Generated using (from wolfssl): + * cd ../scripts + * ruby ./poly1305/poly1305.rb arm32 ../wolfssl/wolfcrypt/src/port/arm/armv8-32-poly1305-asm.S + */ + +#ifdef HAVE_CONFIG_H + #include +#endif /* HAVE_CONFIG_H */ +#include + +#ifdef WOLFSSL_ARMASM +#if !defined(__aarch64__) && defined(__arm__) && !defined(__thumb__) +#ifndef WOLFSSL_ARMASM_INLINE +#ifdef HAVE_POLY1305 + .text + .align 4 + .globl poly1305_blocks_arm32_16 + .type poly1305_blocks_arm32_16, %function +poly1305_blocks_arm32_16: + push {r4, r5, r6, r7, r8, r9, r10, r11, lr} + sub sp, sp, #28 + cmp r2, #0 + beq L_poly1305_arm32_16_done + add lr, sp, #12 + stm lr, {r0, r1, r2, r3} + # Get h pointer + add lr, r0, #16 + ldm lr, {r4, r5, r6, r7, r8} +L_poly1305_arm32_16_loop: + # Add m to h + ldr r1, [sp, #16] + ldr r2, [r1] + ldr r3, [r1, #4] + ldr r9, [r1, #8] + ldr r10, [r1, #12] + ldr r11, [sp, #24] + adds r4, r4, r2 + adcs r5, r5, r3 + adcs r6, r6, r9 + adcs r7, r7, r10 + add r1, r1, #16 + adc r8, r8, r11 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + stm lr, {r4, r5, r6, r7, r8} +#else + # h[0]-h[2] in r4-r6 for multiplication. + str r7, [lr, #12] + str r8, [lr, #16] +#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ + str r1, [sp, #16] + ldr r1, [sp, #12] + # Multiply h by r +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + # r0 = #0, r1 = r, lr = h, r2 = h[j], r3 = r[i] + ldr r3, [r1] + eor r0, r0, r0 + # r[0] * h[0] + # h[0] in r4 + umull r4, r5, r3, r4 + # r[0] * h[2] + # h[2] in r6 + umull r6, r7, r3, r6 + # r[0] * h[4] + # h[4] in r8 + mul r8, r3, r8 + # r[0] * h[1] + ldr r2, [lr, #4] + mov r12, r0 + umlal r5, r12, r3, r2 + # r[0] * h[3] + ldr r2, [lr, #12] + adds r6, r6, r12 + adc r7, r7, r0 + umlal r7, r8, r3, r2 + # r[1] * h[0] + ldr r3, [r1, #4] + ldr r2, [lr] + mov r12, r0 + umlal r5, r12, r3, r2 + # r[1] * h[1] + ldr r2, [lr, #4] + adds r6, r6, r12 + adc r12, r0, r0 + umlal r6, r12, r3, r2 + # r[1] * h[2] + ldr r2, [lr, #8] + adds r7, r7, r12 + adc r12, r0, r0 + umlal r7, r12, r3, r2 + # r[1] * h[3] + ldr r2, [lr, #12] + adds r8, r8, r12 + adc r9, r0, r0 + umlal r8, r9, r3, r2 + # r[1] * h[4] + ldr r2, [lr, #16] + mla r9, r3, r2, r9 + # r[2] * h[0] + ldr r3, [r1, #8] + ldr r2, [lr] + mov r12, r0 + umlal r6, r12, r3, r2 + # r[2] * h[1] + ldr r2, [lr, #4] + adds r7, r7, r12 + adc r12, r0, r0 + umlal r7, r12, r3, r2 + # r[2] * h[2] + ldr r2, [lr, #8] + adds r8, r8, r12 + adc r12, r0, r0 + umlal r8, r12, r3, r2 + # r[2] * h[3] + ldr r2, [lr, #12] + adds r9, r9, r12 + adc r10, r0, r0 + umlal r9, r10, r3, r2 + # r[2] * h[4] + ldr r2, [lr, #16] + mla r10, r3, r2, r10 + # r[3] * h[0] + ldr r3, [r1, #12] + ldr r2, [lr] + mov r12, r0 + umlal r7, r12, r3, r2 + # r[3] * h[1] + ldr r2, [lr, #4] + adds r8, r8, r12 + adc r12, r0, r0 + umlal r8, r12, r3, r2 + # r[3] * h[2] + ldr r2, [lr, #8] + adds r9, r9, r12 + adc r10, r10, r0 + umlal r9, r10, r3, r2 + # r[3] * h[3] + ldr r2, [lr, #12] + mov r11, r0 + umlal r10, r11, r3, r2 + # r[3] * h[4] + ldr r2, [lr, #16] + mov r12, r0 + mla r11, r3, r2, r11 +#else + ldm r1, {r0, r1, r2, r3} + # r[0] * h[0] + umull r10, r11, r0, r4 + # r[1] * h[0] + umull r12, r7, r1, r4 + # r[0] * h[1] + umaal r11, r12, r0, r5 + # r[2] * h[0] + umull r8, r9, r2, r4 + # r[1] * h[1] + umaal r12, r8, r1, r5 + # r[0] * h[2] + umaal r12, r7, r0, r6 + # r[3] * 
h[0] + umaal r8, r9, r3, r4 + stm sp, {r10, r11, r12} + # r[2] * h[1] + umaal r7, r8, r2, r5 + # Replace h[0] with h[3] + ldr r4, [lr, #12] + # r[1] * h[2] + umull r10, r11, r1, r6 + # r[2] * h[2] + umaal r8, r9, r2, r6 + # r[0] * h[3] + umaal r7, r10, r0, r4 + # r[3] * h[1] + umaal r8, r11, r3, r5 + # r[1] * h[3] + umaal r8, r10, r1, r4 + # r[3] * h[2] + umaal r9, r11, r3, r6 + # r[2] * h[3] + umaal r9, r10, r2, r4 + # Replace h[1] with h[4] + ldr r5, [lr, #16] + # r[3] * h[3] + umaal r10, r11, r3, r4 + mov r12, #0 + # r[0] * h[4] + umaal r8, r12, r0, r5 + # r[1] * h[4] + umaal r9, r12, r1, r5 + # r[2] * h[4] + umaal r10, r12, r2, r5 + # r[3] * h[4] + umaal r11, r12, r3, r5 + # DONE + ldm sp, {r4, r5, r6} +#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ + # r12 will be zero because r is masked. + # Load length + ldr r2, [sp, #20] + # Reduce mod 2^130 - 5 + bic r3, r8, #3 + and r8, r8, #3 + adds r4, r4, r3 + lsr r3, r3, #2 + adcs r5, r5, r9 + orr r3, r3, r9, LSL #30 + adcs r6, r6, r10 + lsr r9, r9, #2 + adcs r7, r7, r11 + orr r9, r9, r10, LSL #30 + adc r8, r8, r12 + lsr r10, r10, #2 + adds r4, r4, r3 + orr r10, r10, r11, LSL #30 + adcs r5, r5, r9 + lsr r11, r11, #2 + adcs r6, r6, r10 + adcs r7, r7, r11 + adc r8, r8, r12 + # Sub 16 from length. + subs r2, r2, #16 + # Store length. + str r2, [sp, #20] + # Loop again if more message to do. + bgt L_poly1305_arm32_16_loop + stm lr, {r4, r5, r6, r7, r8} +L_poly1305_arm32_16_done: + add sp, sp, #28 + pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} + .size poly1305_blocks_arm32_16,.-poly1305_blocks_arm32_16 + .text + .type L_poly1305_arm32_clamp, %object + .size L_poly1305_arm32_clamp, 16 + .align 4 +L_poly1305_arm32_clamp: + .word 0xfffffff + .word 0xffffffc + .word 0xffffffc + .word 0xffffffc + .text + .align 4 + .globl poly1305_set_key + .type poly1305_set_key, %function +poly1305_set_key: + push {r4, r5, r6, r7, r8, lr} + # Load mask. + adr lr, L_poly1305_arm32_clamp + ldm lr, {r6, r7, r8, r12} + # Load and cache padding. + ldr r2, [r1, #16] + ldr r3, [r1, #20] + ldr r4, [r1, #24] + ldr r5, [r1, #28] + add lr, r0, #36 + stm lr, {r2, r3, r4, r5} + # Load, mask and store r. + ldr r2, [r1] + ldr r3, [r1, #4] + ldr r4, [r1, #8] + ldr r5, [r1, #12] + and r2, r2, r6 + and r3, r3, r7 + and r4, r4, r8 + and r5, r5, r12 + add lr, r0, #0 + stm lr, {r2, r3, r4, r5} + # h (accumulator) = 0 + eor r6, r6, r6 + eor r7, r7, r7 + eor r8, r8, r8 + eor r12, r12, r12 + add lr, r0, #16 + eor r5, r5, r5 + stm lr, {r5, r6, r7, r8, r12} + # Zero leftover + str r5, [r0, #52] + pop {r4, r5, r6, r7, r8, pc} + .size poly1305_set_key,.-poly1305_set_key + .text + .align 4 + .globl poly1305_final + .type poly1305_final, %function +poly1305_final: + push {r4, r5, r6, r7, r8, r9, lr} + add r9, r0, #16 + ldm r9, {r4, r5, r6, r7, r8} + # Add 5 and check for h larger than p. + adds r2, r4, #5 + adcs r2, r5, #0 + adcs r2, r6, #0 + adcs r2, r7, #0 + adc r2, r8, #0 + sub r2, r2, #4 + lsr r2, r2, #31 + sub r2, r2, #1 + and r2, r2, #5 + # Add 0/5 to h. + adds r4, r4, r2 + adcs r5, r5, #0 + adcs r6, r6, #0 + adc r7, r7, #0 + # Add padding + add r9, r0, #36 + ldm r9, {r2, r3, r12, lr} + adds r4, r4, r2 + adcs r5, r5, r3 + adcs r6, r6, r12 + adc r7, r7, lr + # Store MAC + str r4, [r1] + str r5, [r1, #4] + str r6, [r1, #8] + str r7, [r1, #12] + # Zero out h. + eor r4, r4, r4 + eor r5, r5, r5 + eor r6, r6, r6 + eor r7, r7, r7 + eor r8, r8, r8 + add r9, r0, #16 + stm r9, {r4, r5, r6, r7, r8} + # Zero out r. + add r9, r0, #0 + stm r9, {r4, r5, r6, r7} + # Zero out padding. 
+ add r9, r0, #36 + stm r9, {r4, r5, r6, r7} + pop {r4, r5, r6, r7, r8, r9, pc} + .size poly1305_final,.-poly1305_final +#endif /* HAVE_POLY1305 */ +#endif /* !__aarch64__ && __arm__ && !__thumb__ */ +#endif /* WOLFSSL_ARMASM */ + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif +#endif /* !WOLFSSL_ARMASM_INLINE */ diff --git a/wolfcrypt/src/port/arm/armv8-32-poly1305-asm_c.c b/wolfcrypt/src/port/arm/armv8-32-poly1305-asm_c.c new file mode 100644 index 0000000000..2871293570 --- /dev/null +++ b/wolfcrypt/src/port/arm/armv8-32-poly1305-asm_c.c @@ -0,0 +1,388 @@ +/* armv8-32-poly1305-asm + * + * Copyright (C) 2006-2024 wolfSSL Inc. + * + * This file is part of wolfSSL. + * + * wolfSSL is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * wolfSSL is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA + */ + +/* Generated using (from wolfssl): + * cd ../scripts + * ruby ./poly1305/poly1305.rb arm32 ../wolfssl/wolfcrypt/src/port/arm/armv8-32-poly1305-asm.c + */ + +#ifdef HAVE_CONFIG_H + #include +#endif /* HAVE_CONFIG_H */ +#include +#include + +#ifdef WOLFSSL_ARMASM +#if !defined(__aarch64__) && defined(__arm__) && !defined(__thumb__) +#include +#ifdef HAVE_CONFIG_H + #include +#endif /* HAVE_CONFIG_H */ +#include +#include +#ifdef WOLFSSL_ARMASM_INLINE + +#ifdef WOLFSSL_ARMASM +#if !defined(__aarch64__) && defined(__arm__) && !defined(__thumb__) + +#ifdef __IAR_SYSTEMS_ICC__ +#define __asm__ asm +#define __volatile__ volatile +#endif /* __IAR_SYSTEMS_ICC__ */ +#ifdef __KEIL__ +#define __asm__ __asm +#define __volatile__ volatile +#endif /* __KEIL__ */ +#ifdef HAVE_POLY1305 +#include + +void poly1305_blocks_arm32_16(Poly1305* ctx_p, const byte* m_p, word32 len_p, int notLast_p) +{ + register Poly1305* ctx asm ("r0") = (Poly1305*)ctx_p; + register const byte* m asm ("r1") = (const byte*)m_p; + register word32 len asm ("r2") = (word32)len_p; + register int notLast asm ("r3") = (int)notLast_p; + + __asm__ __volatile__ ( + "sub sp, sp, #28\n\t" + "cmp %[len], #0\n\t" + "beq L_poly1305_arm32_16_done_%=\n\t" + "add lr, sp, #12\n\t" + "stm lr, {%[ctx], %[m], %[len], %[notLast]}\n\t" + /* Get h pointer */ + "add lr, %[ctx], #16\n\t" + "ldm lr, {r4, r5, r6, r7, r8}\n\t" + "\n" + "L_poly1305_arm32_16_loop_%=: \n\t" + /* Add m to h */ + "ldr %[m], [sp, #16]\n\t" + "ldr %[len], [%[m]]\n\t" + "ldr %[notLast], [%[m], #4]\n\t" + "ldr r9, [%[m], #8]\n\t" + "ldr r10, [%[m], #12]\n\t" + "ldr r11, [sp, #24]\n\t" + "adds r4, r4, %[len]\n\t" + "adcs r5, r5, %[notLast]\n\t" + "adcs r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "add %[m], %[m], #16\n\t" + "adc r8, r8, r11\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "stm lr, {r4, r5, r6, r7, r8}\n\t" +#else + /* h[0]-h[2] in r4-r6 for multiplication. 
*/ + "str r7, [lr, #12]\n\t" + "str r8, [lr, #16]\n\t" +#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ + "str %[m], [sp, #16]\n\t" + "ldr %[m], [sp, #12]\n\t" + /* Multiply h by r */ +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + /* r0 = #0, r1 = r, lr = h, r2 = h[j], r3 = r[i] */ + "ldr %[notLast], [%[m]]\n\t" + "eor %[ctx], %[ctx], %[ctx]\n\t" + /* r[0] * h[0] */ + /* h[0] in r4 */ + "umull r4, r5, %[notLast], r4\n\t" + /* r[0] * h[2] */ + /* h[2] in r6 */ + "umull r6, r7, %[notLast], r6\n\t" + /* r[0] * h[4] */ + /* h[4] in r8 */ + "mul r8, %[notLast], r8\n\t" + /* r[0] * h[1] */ + "ldr %[len], [lr, #4]\n\t" + "mov r12, %[ctx]\n\t" + "umlal r5, r12, %[notLast], %[len]\n\t" + /* r[0] * h[3] */ + "ldr %[len], [lr, #12]\n\t" + "adds r6, r6, r12\n\t" + "adc r7, r7, %[ctx]\n\t" + "umlal r7, r8, %[notLast], %[len]\n\t" + /* r[1] * h[0] */ + "ldr %[notLast], [%[m], #4]\n\t" + "ldr %[len], [lr]\n\t" + "mov r12, %[ctx]\n\t" + "umlal r5, r12, %[notLast], %[len]\n\t" + /* r[1] * h[1] */ + "ldr %[len], [lr, #4]\n\t" + "adds r6, r6, r12\n\t" + "adc r12, %[ctx], %[ctx]\n\t" + "umlal r6, r12, %[notLast], %[len]\n\t" + /* r[1] * h[2] */ + "ldr %[len], [lr, #8]\n\t" + "adds r7, r7, r12\n\t" + "adc r12, %[ctx], %[ctx]\n\t" + "umlal r7, r12, %[notLast], %[len]\n\t" + /* r[1] * h[3] */ + "ldr %[len], [lr, #12]\n\t" + "adds r8, r8, r12\n\t" + "adc r9, %[ctx], %[ctx]\n\t" + "umlal r8, r9, %[notLast], %[len]\n\t" + /* r[1] * h[4] */ + "ldr %[len], [lr, #16]\n\t" + "mla r9, %[notLast], %[len], r9\n\t" + /* r[2] * h[0] */ + "ldr %[notLast], [%[m], #8]\n\t" + "ldr %[len], [lr]\n\t" + "mov r12, %[ctx]\n\t" + "umlal r6, r12, %[notLast], %[len]\n\t" + /* r[2] * h[1] */ + "ldr %[len], [lr, #4]\n\t" + "adds r7, r7, r12\n\t" + "adc r12, %[ctx], %[ctx]\n\t" + "umlal r7, r12, %[notLast], %[len]\n\t" + /* r[2] * h[2] */ + "ldr %[len], [lr, #8]\n\t" + "adds r8, r8, r12\n\t" + "adc r12, %[ctx], %[ctx]\n\t" + "umlal r8, r12, %[notLast], %[len]\n\t" + /* r[2] * h[3] */ + "ldr %[len], [lr, #12]\n\t" + "adds r9, r9, r12\n\t" + "adc r10, %[ctx], %[ctx]\n\t" + "umlal r9, r10, %[notLast], %[len]\n\t" + /* r[2] * h[4] */ + "ldr %[len], [lr, #16]\n\t" + "mla r10, %[notLast], %[len], r10\n\t" + /* r[3] * h[0] */ + "ldr %[notLast], [%[m], #12]\n\t" + "ldr %[len], [lr]\n\t" + "mov r12, %[ctx]\n\t" + "umlal r7, r12, %[notLast], %[len]\n\t" + /* r[3] * h[1] */ + "ldr %[len], [lr, #4]\n\t" + "adds r8, r8, r12\n\t" + "adc r12, %[ctx], %[ctx]\n\t" + "umlal r8, r12, %[notLast], %[len]\n\t" + /* r[3] * h[2] */ + "ldr %[len], [lr, #8]\n\t" + "adds r9, r9, r12\n\t" + "adc r10, r10, %[ctx]\n\t" + "umlal r9, r10, %[notLast], %[len]\n\t" + /* r[3] * h[3] */ + "ldr %[len], [lr, #12]\n\t" + "mov r11, %[ctx]\n\t" + "umlal r10, r11, %[notLast], %[len]\n\t" + /* r[3] * h[4] */ + "ldr %[len], [lr, #16]\n\t" + "mov r12, %[ctx]\n\t" + "mla r11, %[notLast], %[len], r11\n\t" +#else + "ldm %[m], {%[ctx], %[m], %[len], %[notLast]}\n\t" + /* r[0] * h[0] */ + "umull r10, r11, %[ctx], r4\n\t" + /* r[1] * h[0] */ + "umull r12, r7, %[m], r4\n\t" + /* r[0] * h[1] */ + "umaal r11, r12, %[ctx], r5\n\t" + /* r[2] * h[0] */ + "umull r8, r9, %[len], r4\n\t" + /* r[1] * h[1] */ + "umaal r12, r8, %[m], r5\n\t" + /* r[0] * h[2] */ + "umaal r12, r7, %[ctx], r6\n\t" + /* r[3] * h[0] */ + "umaal r8, r9, %[notLast], r4\n\t" + "stm sp, {r10, r11, r12}\n\t" + /* r[2] * h[1] */ + "umaal r7, r8, %[len], r5\n\t" + /* Replace h[0] with h[3] */ + "ldr r4, [lr, #12]\n\t" + /* r[1] * h[2] */ + "umull r10, r11, %[m], r6\n\t" + /* r[2] * h[2] */ + "umaal r8, r9, 
%[len], r6\n\t" + /* r[0] * h[3] */ + "umaal r7, r10, %[ctx], r4\n\t" + /* r[3] * h[1] */ + "umaal r8, r11, %[notLast], r5\n\t" + /* r[1] * h[3] */ + "umaal r8, r10, %[m], r4\n\t" + /* r[3] * h[2] */ + "umaal r9, r11, %[notLast], r6\n\t" + /* r[2] * h[3] */ + "umaal r9, r10, %[len], r4\n\t" + /* Replace h[1] with h[4] */ + "ldr r5, [lr, #16]\n\t" + /* r[3] * h[3] */ + "umaal r10, r11, %[notLast], r4\n\t" + "mov r12, #0\n\t" + /* r[0] * h[4] */ + "umaal r8, r12, %[ctx], r5\n\t" + /* r[1] * h[4] */ + "umaal r9, r12, %[m], r5\n\t" + /* r[2] * h[4] */ + "umaal r10, r12, %[len], r5\n\t" + /* r[3] * h[4] */ + "umaal r11, r12, %[notLast], r5\n\t" + /* DONE */ + "ldm sp, {r4, r5, r6}\n\t" +#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ + /* r12 will be zero because r is masked. */ + /* Load length */ + "ldr %[len], [sp, #20]\n\t" + /* Reduce mod 2^130 - 5 */ + "bic %[notLast], r8, #3\n\t" + "and r8, r8, #3\n\t" + "adds r4, r4, %[notLast]\n\t" + "lsr %[notLast], %[notLast], #2\n\t" + "adcs r5, r5, r9\n\t" + "orr %[notLast], %[notLast], r9, LSL #30\n\t" + "adcs r6, r6, r10\n\t" + "lsr r9, r9, #2\n\t" + "adcs r7, r7, r11\n\t" + "orr r9, r9, r10, LSL #30\n\t" + "adc r8, r8, r12\n\t" + "lsr r10, r10, #2\n\t" + "adds r4, r4, %[notLast]\n\t" + "orr r10, r10, r11, LSL #30\n\t" + "adcs r5, r5, r9\n\t" + "lsr r11, r11, #2\n\t" + "adcs r6, r6, r10\n\t" + "adcs r7, r7, r11\n\t" + "adc r8, r8, r12\n\t" + /* Sub 16 from length. */ + "subs %[len], %[len], #16\n\t" + /* Store length. */ + "str %[len], [sp, #20]\n\t" + /* Loop again if more message to do. */ + "bgt L_poly1305_arm32_16_loop_%=\n\t" + "stm lr, {r4, r5, r6, r7, r8}\n\t" + "\n" + "L_poly1305_arm32_16_done_%=: \n\t" + "add sp, sp, #28\n\t" + : [ctx] "+r" (ctx), [m] "+r" (m), [len] "+r" (len), [notLast] "+r" (notLast) + : + : "memory", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "cc" + ); +} + +static const uint32_t L_poly1305_arm32_clamp[] = { + 0x0fffffff, 0x0ffffffc, 0x0ffffffc, 0x0ffffffc, +}; + +void poly1305_set_key(Poly1305* ctx_p, const byte* key_p) +{ + register Poly1305* ctx asm ("r0") = (Poly1305*)ctx_p; + register const byte* key asm ("r1") = (const byte*)key_p; + register uint32_t* L_poly1305_arm32_clamp_c asm ("r2") = (uint32_t*)&L_poly1305_arm32_clamp; + + __asm__ __volatile__ ( + /* Load mask. */ + "mov lr, %[L_poly1305_arm32_clamp]\n\t" + "ldm lr, {r6, r7, r8, r12}\n\t" + /* Load and cache padding. */ + "ldr r2, [%[key], #16]\n\t" + "ldr r3, [%[key], #20]\n\t" + "ldr r4, [%[key], #24]\n\t" + "ldr r5, [%[key], #28]\n\t" + "add lr, %[ctx], #36\n\t" + "stm lr, {r2, r3, r4, r5}\n\t" + /* Load, mask and store r. 
*/ + "ldr r2, [%[key]]\n\t" + "ldr r3, [%[key], #4]\n\t" + "ldr r4, [%[key], #8]\n\t" + "ldr r5, [%[key], #12]\n\t" + "and r2, r2, r6\n\t" + "and r3, r3, r7\n\t" + "and r4, r4, r8\n\t" + "and r5, r5, r12\n\t" + "add lr, %[ctx], #0\n\t" + "stm lr, {r2, r3, r4, r5}\n\t" + /* h (accumulator) = 0 */ + "eor r6, r6, r6\n\t" + "eor r7, r7, r7\n\t" + "eor r8, r8, r8\n\t" + "eor r12, r12, r12\n\t" + "add lr, %[ctx], #16\n\t" + "eor r5, r5, r5\n\t" + "stm lr, {r5, r6, r7, r8, r12}\n\t" + /* Zero leftover */ + "str r5, [%[ctx], #52]\n\t" + : [ctx] "+r" (ctx), [key] "+r" (key), [L_poly1305_arm32_clamp] "+r" (L_poly1305_arm32_clamp_c) + : + : "memory", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "cc" + ); +} + +void poly1305_final(Poly1305* ctx_p, byte* mac_p) +{ + register Poly1305* ctx asm ("r0") = (Poly1305*)ctx_p; + register byte* mac asm ("r1") = (byte*)mac_p; + + __asm__ __volatile__ ( + "add r9, %[ctx], #16\n\t" + "ldm r9, {r4, r5, r6, r7, r8}\n\t" + /* Add 5 and check for h larger than p. */ + "adds r2, r4, #5\n\t" + "adcs r2, r5, #0\n\t" + "adcs r2, r6, #0\n\t" + "adcs r2, r7, #0\n\t" + "adc r2, r8, #0\n\t" + "sub r2, r2, #4\n\t" + "lsr r2, r2, #31\n\t" + "sub r2, r2, #1\n\t" + "and r2, r2, #5\n\t" + /* Add 0/5 to h. */ + "adds r4, r4, r2\n\t" + "adcs r5, r5, #0\n\t" + "adcs r6, r6, #0\n\t" + "adc r7, r7, #0\n\t" + /* Add padding */ + "add r9, %[ctx], #36\n\t" + "ldm r9, {r2, r3, r12, lr}\n\t" + "adds r4, r4, r2\n\t" + "adcs r5, r5, r3\n\t" + "adcs r6, r6, r12\n\t" + "adc r7, r7, lr\n\t" + /* Store MAC */ + "str r4, [%[mac]]\n\t" + "str r5, [%[mac], #4]\n\t" + "str r6, [%[mac], #8]\n\t" + "str r7, [%[mac], #12]\n\t" + /* Zero out h. */ + "eor r4, r4, r4\n\t" + "eor r5, r5, r5\n\t" + "eor r6, r6, r6\n\t" + "eor r7, r7, r7\n\t" + "eor r8, r8, r8\n\t" + "add r9, %[ctx], #16\n\t" + "stm r9, {r4, r5, r6, r7, r8}\n\t" + /* Zero out r. */ + "add r9, %[ctx], #0\n\t" + "stm r9, {r4, r5, r6, r7}\n\t" + /* Zero out padding. 
*/ + "add r9, %[ctx], #36\n\t" + "stm r9, {r4, r5, r6, r7}\n\t" + : [ctx] "+r" (ctx), [mac] "+r" (mac) + : + : "memory", "r2", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "cc" + ); +} + +#endif /* HAVE_POLY1305 */ +#endif /* !__aarch64__ && __arm__ && !__thumb__ */ +#endif /* WOLFSSL_ARMASM */ +#endif /* !defined(__aarch64__) && defined(__arm__) && !defined(__thumb__) */ +#endif /* WOLFSSL_ARMASM */ + +#endif /* WOLFSSL_ARMASM_INLINE */ diff --git a/wolfcrypt/src/port/arm/armv8-32-sha3-asm.S b/wolfcrypt/src/port/arm/armv8-32-sha3-asm.S index 76629726f7..6077a88b3e 100644 --- a/wolfcrypt/src/port/arm/armv8-32-sha3-asm.S +++ b/wolfcrypt/src/port/arm/armv8-32-sha3-asm.S @@ -32,6 +32,8 @@ #ifdef WOLFSSL_ARMASM #if !defined(__aarch64__) && defined(__arm__) && !defined(__thumb__) #ifndef WOLFSSL_ARMASM_INLINE +#ifdef WOLFSSL_SHA3 +#ifndef WOLFSSL_ARMASM_NO_NEON .text .type L_sha3_arm2_neon_rt, %object .size L_sha3_arm2_neon_rt, 192 @@ -85,60 +87,6 @@ L_sha3_arm2_neon_rt: .word 0x0 .word 0x80008008 .word 0x80000000 - .text - .type L_sha3_arm2_rt, %object - .size L_sha3_arm2_rt, 192 - .align 4 -L_sha3_arm2_rt: - .word 0x1 - .word 0x0 - .word 0x8082 - .word 0x0 - .word 0x808a - .word 0x80000000 - .word 0x80008000 - .word 0x80000000 - .word 0x808b - .word 0x0 - .word 0x80000001 - .word 0x0 - .word 0x80008081 - .word 0x80000000 - .word 0x8009 - .word 0x80000000 - .word 0x8a - .word 0x0 - .word 0x88 - .word 0x0 - .word 0x80008009 - .word 0x0 - .word 0x8000000a - .word 0x0 - .word 0x8000808b - .word 0x0 - .word 0x8b - .word 0x80000000 - .word 0x8089 - .word 0x80000000 - .word 0x8003 - .word 0x80000000 - .word 0x8002 - .word 0x80000000 - .word 0x80 - .word 0x80000000 - .word 0x800a - .word 0x0 - .word 0x8000000a - .word 0x80000000 - .word 0x80008081 - .word 0x80000000 - .word 0x8080 - .word 0x80000000 - .word 0x80000001 - .word 0x0 - .word 0x80008008 - .word 0x80000000 -#ifndef WOLFSSL_ARMASM_NO_NEON .text .align 4 .globl BlockSha3 @@ -407,6 +355,59 @@ L_sha3_arm32_neon_begin: .size BlockSha3,.-BlockSha3 #endif /* WOLFSSL_ARMASM_NO_NEON */ #ifdef WOLFSSL_ARMASM_NO_NEON + .text + .type L_sha3_arm2_rt, %object + .size L_sha3_arm2_rt, 192 + .align 4 +L_sha3_arm2_rt: + .word 0x1 + .word 0x0 + .word 0x8082 + .word 0x0 + .word 0x808a + .word 0x80000000 + .word 0x80008000 + .word 0x80000000 + .word 0x808b + .word 0x0 + .word 0x80000001 + .word 0x0 + .word 0x80008081 + .word 0x80000000 + .word 0x8009 + .word 0x80000000 + .word 0x8a + .word 0x0 + .word 0x88 + .word 0x0 + .word 0x80008009 + .word 0x0 + .word 0x8000000a + .word 0x0 + .word 0x8000808b + .word 0x0 + .word 0x8b + .word 0x80000000 + .word 0x8089 + .word 0x80000000 + .word 0x8003 + .word 0x80000000 + .word 0x8002 + .word 0x80000000 + .word 0x80 + .word 0x80000000 + .word 0x800a + .word 0x0 + .word 0x8000000a + .word 0x80000000 + .word 0x80008081 + .word 0x80000000 + .word 0x8080 + .word 0x80000000 + .word 0x80000001 + .word 0x0 + .word 0x80008008 + .word 0x80000000 .text .align 4 .globl BlockSha3 @@ -2391,6 +2392,7 @@ L_sha3_arm32_begin: pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} .size BlockSha3,.-BlockSha3 #endif /* WOLFSSL_ARMASM_NO_NEON */ +#endif /* WOLFSSL_SHA3 */ #endif /* !__aarch64__ && __arm__ && !__thumb__ */ #endif /* WOLFSSL_ARMASM */ diff --git a/wolfcrypt/src/port/arm/armv8-32-sha3-asm_c.c b/wolfcrypt/src/port/arm/armv8-32-sha3-asm_c.c index 6d2efa1b0b..1a54d8af3a 100644 --- a/wolfcrypt/src/port/arm/armv8-32-sha3-asm_c.c +++ b/wolfcrypt/src/port/arm/armv8-32-sha3-asm_c.c @@ -51,6 +51,8 @@ #define __asm__ __asm #define 
__volatile__ volatile #endif /* __KEIL__ */ +#ifdef WOLFSSL_SHA3 +#ifndef WOLFSSL_ARMASM_NO_NEON static const uint64_t L_sha3_arm2_neon_rt[] = { 0x0000000000000001UL, 0x0000000000008082UL, 0x800000000000808aUL, 0x8000000080008000UL, @@ -66,29 +68,12 @@ static const uint64_t L_sha3_arm2_neon_rt[] = { 0x0000000080000001UL, 0x8000000080008008UL, }; -static const uint64_t L_sha3_arm2_rt[] = { - 0x0000000000000001UL, 0x0000000000008082UL, - 0x800000000000808aUL, 0x8000000080008000UL, - 0x000000000000808bUL, 0x0000000080000001UL, - 0x8000000080008081UL, 0x8000000000008009UL, - 0x000000000000008aUL, 0x0000000000000088UL, - 0x0000000080008009UL, 0x000000008000000aUL, - 0x000000008000808bUL, 0x800000000000008bUL, - 0x8000000000008089UL, 0x8000000000008003UL, - 0x8000000000008002UL, 0x8000000000000080UL, - 0x000000000000800aUL, 0x800000008000000aUL, - 0x8000000080008081UL, 0x8000000000008080UL, - 0x0000000080000001UL, 0x8000000080008008UL, -}; - #include -#ifndef WOLFSSL_ARMASM_NO_NEON void BlockSha3(word64* state_p) { register word64* state asm ("r0") = (word64*)state_p; register uint64_t* L_sha3_arm2_neon_rt_c asm ("r1") = (uint64_t*)&L_sha3_arm2_neon_rt; - register uint64_t* L_sha3_arm2_rt_c asm ("r2") = (uint64_t*)&L_sha3_arm2_rt; __asm__ __volatile__ ( "sub sp, sp, #16\n\t" @@ -348,16 +333,31 @@ void BlockSha3(word64* state_p) "vst1.8 {d20-d23}, [%[state]]!\n\t" "vst1.8 {d24}, [%[state]]\n\t" "add sp, sp, #16\n\t" - : [state] "+r" (state), [L_sha3_arm2_neon_rt] "+r" (L_sha3_arm2_neon_rt_c), [L_sha3_arm2_rt] "+r" (L_sha3_arm2_rt_c) + : [state] "+r" (state), [L_sha3_arm2_neon_rt] "+r" (L_sha3_arm2_neon_rt_c) : - : "memory", "r3", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31", "cc" + : "memory", "r2", "r3", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31", "cc" ); } #endif /* WOLFSSL_ARMASM_NO_NEON */ +#ifdef WOLFSSL_ARMASM_NO_NEON +static const uint64_t L_sha3_arm2_rt[] = { + 0x0000000000000001UL, 0x0000000000008082UL, + 0x800000000000808aUL, 0x8000000080008000UL, + 0x000000000000808bUL, 0x0000000080000001UL, + 0x8000000080008081UL, 0x8000000000008009UL, + 0x000000000000008aUL, 0x0000000000000088UL, + 0x0000000080008009UL, 0x000000008000000aUL, + 0x000000008000808bUL, 0x800000000000008bUL, + 0x8000000000008089UL, 0x8000000000008003UL, + 0x8000000000008002UL, 0x8000000000000080UL, + 0x000000000000800aUL, 0x800000008000000aUL, + 0x8000000080008081UL, 0x8000000000008080UL, + 0x0000000080000001UL, 0x8000000080008008UL, +}; + #include -#ifdef WOLFSSL_ARMASM_NO_NEON void BlockSha3(word64* state_p) { register word64* state asm ("r0") = (word64*)state_p; @@ -2348,6 +2348,7 @@ void BlockSha3(word64* state_p) } #endif /* WOLFSSL_ARMASM_NO_NEON */ +#endif /* WOLFSSL_SHA3 */ #endif /* !__aarch64__ && __arm__ && !__thumb__ */ #endif /* WOLFSSL_ARMASM */ #endif /* !defined(__aarch64__) && defined(__arm__) && !defined(__thumb__) */ diff --git a/wolfcrypt/src/port/arm/armv8-chacha.c b/wolfcrypt/src/port/arm/armv8-chacha.c index c7de0a265b..b5b516705a 100644 --- a/wolfcrypt/src/port/arm/armv8-chacha.c +++ b/wolfcrypt/src/port/arm/armv8-chacha.c @@ -29,7 +29,7 @@ #include -#if defined(WOLFSSL_ARMASM) && !defined(WOLFSSL_ARMASM_NO_NEON) +#if defined(WOLFSSL_ARMASM) #ifdef HAVE_CHACHA 
#include @@ -73,15 +73,43 @@ * Set up iv(nonce). Earlier versions used 64 bits instead of 96, this version * uses the typical AEAD 96 bit nonce and can do record sizes of 256 GB. */ -int wc_Chacha_SetIV(ChaCha* ctx, const byte* inIv, word32 counter) +int wc_Chacha_SetIV(ChaCha* ctx, const byte* iv, word32 counter) { +#ifndef __aarch64__ + int ret = 0; +#ifdef CHACHA_AEAD_TEST + word32 i; + + printf("NONCE : "); + if (iv != NULL) { + for (i = 0; i < CHACHA_IV_BYTES; i++) { + printf("%02x", iv[i]); + } + } + printf("\n\n"); +#endif + + /* Validate parameters. */ + if ((ctx == NULL) || (iv == NULL)) { + ret = BAD_FUNC_ARG; + } + if (ret == 0) { + /* No unused bytes to XOR into input. */ + ctx->left = 0; + + /* Set counter and IV into state. */ + wc_chacha_setiv(ctx->X, iv, counter); + } + + return ret; +#else word32 temp[CHACHA_IV_WORDS];/* used for alignment of memory */ #ifdef CHACHA_AEAD_TEST word32 i; printf("NONCE : "); for (i = 0; i < CHACHA_IV_BYTES; i++) { - printf("%02x", inIv[i]); + printf("%02x", iv[i]); } printf("\n\n"); #endif @@ -89,7 +117,7 @@ int wc_Chacha_SetIV(ChaCha* ctx, const byte* inIv, word32 counter) if (ctx == NULL) return BAD_FUNC_ARG; - XMEMCPY(temp, inIv, CHACHA_IV_BYTES); + XMEMCPY(temp, iv, CHACHA_IV_BYTES); ctx->left = 0; ctx->X[CHACHA_IV_BYTES+0] = counter; /* block counter */ @@ -98,18 +126,54 @@ int wc_Chacha_SetIV(ChaCha* ctx, const byte* inIv, word32 counter) ctx->X[CHACHA_IV_BYTES+3] = LITTLE32(temp[2]); /* counter from nonce */ return 0; +#endif } +#ifdef __aarch64__ /* "expand 32-byte k" as unsigned 32 byte */ static const word32 sigma[4] = {0x61707865, 0x3320646e, 0x79622d32, 0x6b206574}; /* "expand 16-byte k" as unsigned 16 byte */ static const word32 tau[4] = {0x61707865, 0x3120646e, 0x79622d36, 0x6b206574}; +#endif /** * Key setup. 8 word iv (nonce) */ int wc_Chacha_SetKey(ChaCha* ctx, const byte* key, word32 keySz) { +#ifndef __aarch64__ + int ret = 0; + +#ifdef CHACHA_AEAD_TEST + printf("ChaCha key used :\n"); + if (key != NULL) { + word32 i; + for (i = 0; i < keySz; i++) { + printf("%02x", key[i]); + if ((i % 8) == 7) + printf("\n"); + } + } + printf("\n\n"); +#endif + + /* Validate parameters. */ + if ((ctx == NULL) || (key == NULL)) { + ret = BAD_FUNC_ARG; + } + else if ((keySz != (CHACHA_MAX_KEY_SZ / 2)) && + (keySz != CHACHA_MAX_KEY_SZ )) { + ret = BAD_FUNC_ARG; + } + + if (ret == 0) { + ctx->left = 0; + + wc_chacha_setkey(ctx->X, key, keySz); + } + + return ret; +#else const word32* constants; const byte* k; @@ -169,8 +233,10 @@ int wc_Chacha_SetKey(ChaCha* ctx, const byte* key, word32 keySz) ctx->left = 0; return 0; +#endif } +#ifndef WOLFSSL_ARMASM_NO_NEON static const word32 L_chacha20_neon_inc_first_word[] = { 0x1, 0x0, @@ -2815,7 +2881,6 @@ static WC_INLINE void wc_Chacha_encrypt_64(const word32* input, const byte* m, } - /** * Encrypt a stream of bytes */ @@ -2862,40 +2927,68 @@ static void wc_Chacha_encrypt_bytes(ChaCha* ctx, const byte* m, byte* c, ctx->X[CHACHA_IV_BYTES] = PLUSONE(ctx->X[CHACHA_IV_BYTES]); } } +#endif /** * API to encrypt/decrypt a message of any size. */ int wc_Chacha_Process(ChaCha* ctx, byte* output, const byte* input, - word32 msglen) + word32 len) { +#ifdef WOLFSSL_ARMASM_NO_NEON + int ret = 0; + + if ((ctx == NULL) || (output == NULL) || (input == NULL)) { + ret = BAD_FUNC_ARG; + } + + /* Handle left over bytes from last block. 
*/ + if ((ret == 0) && (len > 0) && (ctx->left > 0)) { + byte* over = ((byte*)ctx->over) + CHACHA_CHUNK_BYTES - ctx->left; + word32 l = min(len, ctx->left); + + wc_chacha_use_over(over, output, input, l); + + ctx->left -= l; + input += l; + output += l; + len -= l; + } + + if ((ret == 0) && (len != 0)) { + wc_chacha_crypt_bytes(ctx, output, input, len); + } + + return ret; +#else if (ctx == NULL || output == NULL || input == NULL) return BAD_FUNC_ARG; /* handle left overs */ - if (msglen > 0 && ctx->left > 0) { + if (len > 0 && ctx->left > 0) { byte* out; word32 i; out = (byte*)ctx->over + CHACHA_CHUNK_BYTES - ctx->left; - for (i = 0; i < msglen && i < ctx->left; i++) { + for (i = 0; i < len && i < ctx->left; i++) { output[i] = (byte)(input[i] ^ out[i]); } ctx->left -= i; - msglen -= i; + len -= i; output += i; input += i; } - if (msglen == 0) { + if (len == 0) { return 0; } - wc_Chacha_encrypt_bytes(ctx, input, output, msglen); + wc_Chacha_encrypt_bytes(ctx, input, output, len); return 0; +#endif } #endif /* HAVE_CHACHA */ -#endif /* WOLFSSL_ARMASM && !WOLFSSL_ARMASM_NO_NEON */ +#endif /* WOLFSSL_ARMASM */ diff --git a/wolfcrypt/src/port/arm/armv8-poly1305.c b/wolfcrypt/src/port/arm/armv8-poly1305.c index 4d838c7036..9527bbd9d1 100644 --- a/wolfcrypt/src/port/arm/armv8-poly1305.c +++ b/wolfcrypt/src/port/arm/armv8-poly1305.c @@ -32,7 +32,6 @@ #include #ifdef WOLFSSL_ARMASM -#ifdef __aarch64__ #ifdef HAVE_POLY1305 #include @@ -49,6 +48,8 @@ #include #endif +#ifdef __aarch64__ + static WC_INLINE void poly1305_blocks_aarch64_16(Poly1305* ctx, const unsigned char *m, size_t bytes) { @@ -1118,6 +1119,127 @@ int wc_Poly1305Final(Poly1305* ctx, byte* mac) return 0; } -#endif /* HAVE_POLY1305 */ +#else +#ifdef __thumb__ +/* Process 16 bytes of message at a time. + * + * @param [in] ctx Poly1305 context. + * @param [in] m Message to process. + * @param [in] bytes Length of message in bytes. + */ +void poly1305_blocks_thumb2(Poly1305* ctx, const unsigned char* m, + size_t bytes) +{ + poly1305_blocks_thumb2_16(ctx, m, bytes, 1); +} + +/* Process 16 bytes of message. + * + * @param [in] ctx Poly1305 context. + * @param [in] m Message to process. + */ +void poly1305_block_thumb2(Poly1305* ctx, const unsigned char* m) +{ + poly1305_blocks_thumb2_16(ctx, m, POLY1305_BLOCK_SIZE, 1); +} +#else +/* Process 16 bytes of message at a time. + * + * @param [in] ctx Poly1305 context. + * @param [in] m Message to process. + * @param [in] bytes Length of message in bytes. + */ +void poly1305_blocks_arm32(Poly1305* ctx, const unsigned char* m, size_t bytes) +{ + poly1305_blocks_arm32_16(ctx, m, bytes, 1); +} + +/* Process 16 bytes of message. + * + * @param [in] ctx Poly1305 context. + * @param [in] m Message to process. + */ +void poly1305_block_arm32(Poly1305* ctx, const unsigned char* m) +{ + poly1305_blocks_arm32_16(ctx, m, POLY1305_BLOCK_SIZE, 1); +} +#endif + +/* Set the key for the Poly1305 operation. + * + * @param [in] ctx Poly1305 context. + * @param [in] key Key data to use. + * @param [in] keySz Size of key in bytes. Must be 32. + * @return 0 on success. + * @return BAD_FUNC_ARG when ctx or key is NULL or keySz is not 32. + */ +int wc_Poly1305SetKey(Poly1305* ctx, const byte* key, word32 keySz) +{ + int ret = 0; + +#ifdef CHACHA_AEAD_TEST + word32 k; + printf("Poly key used:\n"); + if (key != NULL) { + for (k = 0; k < keySz; k++) { + printf("%02x", key[k]); + if ((k+1) % 8 == 0) + printf("\n"); + } + } + printf("\n"); +#endif + + /* Validate parameters. 
*/
+    if ((ctx == NULL) || (key == NULL) || (keySz != 32)) {
+        ret = BAD_FUNC_ARG;
+    }
+
+    if (ret == 0) {
+        poly1305_set_key(ctx, key);
+    }
+
+    return ret;
+}
+
+/* Finalize the Poly1305 operation calculating the MAC.
+ *
+ * @param [in] ctx Poly1305 context.
+ * @param [in] mac Buffer to hold the MAC. Must be at least 16 bytes long.
+ * @return 0 on success.
+ * @return BAD_FUNC_ARG when ctx or mac is NULL.
+ */
+int wc_Poly1305Final(Poly1305* ctx, byte* mac)
+{
+    int ret = 0;
+
+    /* Validate parameters. */
+    if ((ctx == NULL) || (mac == NULL)) {
+        ret = BAD_FUNC_ARG;
+    }
+
+    /* Process the remaining partial block - last block. */
+    if (ret == 0) {
+        if (ctx->leftover) {
+            size_t i = ctx->leftover;
+            ctx->buffer[i++] = 1;
+            for (; i < POLY1305_BLOCK_SIZE; i++) {
+                ctx->buffer[i] = 0;
+            }
+    #ifdef __thumb__
+            poly1305_blocks_thumb2_16(ctx, ctx->buffer, POLY1305_BLOCK_SIZE,
+                0);
+    #else
+            poly1305_blocks_arm32_16(ctx, ctx->buffer, POLY1305_BLOCK_SIZE, 0);
+    #endif
+        }
+
+        poly1305_final(ctx, mac);
+    }
+
+    return ret;
+}
+
 #endif /* __aarch64__ */
+#endif /* HAVE_POLY1305 */
 #endif /* WOLFSSL_ARMASM */
diff --git a/wolfssl/wolfcrypt/chacha.h b/wolfssl/wolfcrypt/chacha.h
index 42e71aee57..db4e5dd664 100644
--- a/wolfssl/wolfcrypt/chacha.h
+++ b/wolfssl/wolfcrypt/chacha.h
@@ -107,12 +107,18 @@ WOLFSSL_API int wc_XChacha_SetKey(ChaCha *ctx, const byte *key, word32 keySz,
                                   word32 counter);
 #endif
 
-#if defined(WOLFSSL_ARMASM) && defined(__thumb__)
+#if defined(WOLFSSL_ARMASM)
+
+#ifndef __aarch64__
 void wc_chacha_setiv(word32* x, const byte* iv, word32 counter);
 void wc_chacha_setkey(word32* x, const byte* key, word32 keySz);
+#endif
+
+#if defined(WOLFSSL_ARMASM_NO_NEON) || defined(__thumb__)
 void wc_chacha_use_over(byte* over, byte* output, const byte* input,
     word32 len);
 void wc_chacha_crypt_bytes(ChaCha* ctx, byte* c, const byte* m, word32 len);
+#endif
 #endif
 
 
diff --git a/wolfssl/wolfcrypt/poly1305.h b/wolfssl/wolfcrypt/poly1305.h
index bcc48a6298..70ed1efa83 100644
--- a/wolfssl/wolfcrypt/poly1305.h
+++ b/wolfssl/wolfcrypt/poly1305.h
@@ -98,7 +98,7 @@ typedef struct Poly1305 {
     word64 leftover;
     unsigned char buffer[POLY1305_BLOCK_SIZE];
     unsigned char finished;
-#elif defined(WOLFSSL_ARMASM) && defined(__thumb__)
+#elif defined(WOLFSSL_ARMASM)
     word32 r[4];
     word32 h[5];
     word32 pad[4];
@@ -147,16 +147,16 @@ WOLFSSL_API int wc_Poly1305_EncodeSizes64(Poly1305* ctx, word64 aadSz,
 WOLFSSL_API int wc_Poly1305_MAC(Poly1305* ctx, const byte* additional,
                     word32 addSz, const byte* input, word32 sz, byte* tag,
                     word32 tagSz);
 
-#if defined(__aarch64__ ) && defined(WOLFSSL_ARMASM)
+#if defined(WOLFSSL_ARMASM)
+#if defined(__aarch64__ )
 #define poly1305_blocks poly1305_blocks_aarch64
 #define poly1305_block poly1305_block_aarch64
 
 void poly1305_blocks_aarch64(Poly1305* ctx, const unsigned char *m,
                              size_t bytes);
 void poly1305_block_aarch64(Poly1305* ctx, const unsigned char *m);
-#endif
-
-#if defined(__thumb__ ) && defined(WOLFSSL_ARMASM)
+#else
+#if defined(__thumb__)
 #define poly1305_blocks poly1305_blocks_thumb2
 #define poly1305_block poly1305_block_thumb2
 
@@ -166,9 +166,20 @@ void poly1305_block_thumb2(Poly1305* ctx, const unsigned char *m);
 void poly1305_blocks_thumb2_16(Poly1305* ctx, const unsigned char* m,
     word32 len, int notLast);
 
+#else
+#define poly1305_blocks poly1305_blocks_arm32
+#define poly1305_block poly1305_block_arm32
+
+void poly1305_blocks_arm32(Poly1305* ctx, const unsigned char *m, size_t bytes);
+void poly1305_block_arm32(Poly1305* ctx, const unsigned char *m);
+
+void 
poly1305_blocks_arm32_16(Poly1305* ctx, const unsigned char* m, word32 len, + int notLast); +#endif void poly1305_set_key(Poly1305* ctx, const byte* key); void poly1305_final(Poly1305* ctx, byte* mac); #endif +#endif /* WOLFSSL_ARMASM */ #if defined(WOLFSSL_RISCV_ASM) #define poly1305_blocks poly1305_blocks_riscv64
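The declarations and dispatch macros added above wire the new ARM32 assembly into the existing one-shot Poly1305 API, so callers are unchanged. For orientation only, here is an illustrative C sketch (not part of the patch; the key and message values are hypothetical placeholders) of how an ARM32 build without NEON would reach poly1305_blocks_arm32() and poly1305_final() through the public functions touched here:

    #include <wolfssl/wolfcrypt/poly1305.h>

    static int poly1305_mac_sketch(void)
    {
        Poly1305 ctx;
        byte key[32] = {0};   /* hypothetical one-time key; keySz must be 32 */
        byte msg[34] = {0};   /* hypothetical message, not a multiple of 16 */
        byte mac[16];         /* Poly1305 tag is 16 bytes */
        int ret;

        ret = wc_Poly1305SetKey(&ctx, key, (word32)sizeof(key));
        if (ret == 0) {
            /* Full 16-byte blocks go through poly1305_blocks, which this
             * patch maps to poly1305_blocks_arm32 on this build; the 2-byte
             * tail is buffered in ctx for the final call. */
            ret = wc_Poly1305Update(&ctx, msg, (word32)sizeof(msg));
        }
        if (ret == 0) {
            /* Pads the buffered tail, processes it with notLast = 0 and
             * writes the 16-byte MAC. */
            ret = wc_Poly1305Final(&ctx, mac);
        }
        return ret;
    }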