diff --git a/admin/parse-asm/driver.p256.py b/admin/parse-asm/driver.p256.py index 4e3f72e2..4817d12e 100644 --- a/admin/parse-asm/driver.p256.py +++ b/admin/parse-asm/driver.p256.py @@ -511,6 +511,74 @@ ) parse_file(input, d) + with open( + "../../thirdparty/s2n-bignum/arm/generic/bignum_copy_row_from_table_8n_neon.S" + ) as input, open( + "../../graviola/src/low/aarch64/bignum_copy_row_from_table_8n_neon.rs", "w" + ) as output: + d = RustDriver(output, Architecture_aarch64) + d.emit_rust_function( + "bignum_copy_row_from_table_8n_neon", + parameter_map=[ + ("inout", "x0", "z.as_mut_ptr() => _"), + ("inout", "x1", "table.as_ptr() => _"), + ("inout", "x2", "height => _"), + ("inout", "x3", "width => _"), + ("inout", "x4", "index => _"), + ], + rust_decl="pub fn bignum_copy_row_from_table_8n_neon(z: &mut [u64], table: &[u64], height: u64, width: u64, index: u64)", + assertions=[ + "z.len() as u64 == width", + "width % 8 == 0", + "index < height", + ], + ) + parse_file(input, d) + + with open( + "../../thirdparty/s2n-bignum/arm/generic/bignum_copy_row_from_table_16_neon.S" + ) as input, open( + "../../graviola/src/low/aarch64/bignum_copy_row_from_table_16_neon.rs", "w" + ) as output: + d = RustDriver(output, Architecture_aarch64) + d.emit_rust_function( + "bignum_copy_row_from_table_16_neon", + parameter_map=[ + ("inout", "x0", "z.as_mut_ptr() => _"), + ("inout", "x1", "table.as_ptr() => _"), + ("inout", "x2", "height => _"), + ("inout", "x3", "index => _"), + ], + rust_decl="pub fn bignum_copy_row_from_table_16_neon(z: &mut [u64], table: &[u64], height: u64, index: u64)", + assertions=[ + "z.len() == 16", + "index < height", + ], + ) + parse_file(input, d) + + with open( + "../../thirdparty/s2n-bignum/arm/generic/bignum_copy_row_from_table_32_neon.S" + ) as input, open( + "../../graviola/src/low/aarch64/bignum_copy_row_from_table_32_neon.rs", "w" + ) as output: + d = RustDriver(output, Architecture_aarch64) + d.emit_rust_function( + 
"bignum_copy_row_from_table_32_neon", + parameter_map=[ + ("inout", "x0", "z.as_mut_ptr() => _"), + ("inout", "x1", "table.as_ptr() => _"), + ("inout", "x2", "height => _"), + ("inout", "x3", "index => _"), + ], + rust_decl="pub fn bignum_copy_row_from_table_32_neon(z: &mut [u64], table: &[u64], height: u64, index: u64)", + assertions=[ + "z.len() == 32", + "index < height", + ], + ) + parse_file(input, d) + with open("../../thirdparty/s2n-bignum/arm/p256/p256_montjadd.S") as input, open( "../../graviola/src/low/aarch64/p256_montjadd.rs", "w" ) as output: diff --git a/admin/parse-asm/driver.py b/admin/parse-asm/driver.py index 52be8c62..9083ab0b 100644 --- a/admin/parse-asm/driver.py +++ b/admin/parse-asm/driver.py @@ -4,7 +4,7 @@ from io import StringIO import copy -from parse import Type, tokenise, is_comment +from parse import Type, register_from_token, tokenise, is_comment class Architecture: @@ -553,6 +553,8 @@ def contains_constant_ref(self, *values): def expand_rust_macros(self, *values, params={}): for v in values: for t in tokenise(v): + r = register_from_token(t) + if t in params: yield unquote("$" + t) elif t in self.rust_macros: @@ -564,6 +566,13 @@ def expand_rust_macros(self, *values, params={}): # must not have any arguments assert macro_args == None yield unquote("%s!()" % t) + elif r is not None and r.reg in self.rust_macros: + macro_value, macro_args = self.rust_macros[r.reg] + for vv in macro_value: + self.visit_operands(vv) + assert macro_args == None + yield unquote("%s!()" % r.reg) + yield r.suffix elif is_comment(t): yield unquote(t) elif t in self.constant_syms: diff --git a/admin/parse-asm/parse.py b/admin/parse-asm/parse.py index cb9a2d95..ab07f2a0 100644 --- a/admin/parse-asm/parse.py +++ b/admin/parse-asm/parse.py @@ -3,6 +3,7 @@ import glob import string import subprocess +from collections import namedtuple from os import path import io @@ -163,6 +164,20 @@ def is_comment(s): return s.startswith("/*") and s.endswith("*/") +register = 
namedtuple("Register", "reg suffix") + + +def register_from_token(t): + # fix up registers that are tokenised as one, but + # need to be treated as two for macro expansion, + # eg `ventry4.16b` needs to be split into `ventry4` + # (`ventry4` being a macro name) and `.16b` (neon + # width spec) + if t.count(".") == 1: + idx = t.find(".") + return register(t[:idx], t[idx:]) + + def tokenise(s): def tokenise_gen(s): symbol = re.compile(r"^[a-zA-Z_][a-zA-Z0-9_]*") diff --git a/graviola/src/low/aarch64/bignum_copy_row_from_table_16_neon.rs b/graviola/src/low/aarch64/bignum_copy_row_from_table_16_neon.rs new file mode 100644 index 00000000..dc606907 --- /dev/null +++ b/graviola/src/low/aarch64/bignum_copy_row_from_table_16_neon.rs @@ -0,0 +1,246 @@ +#![allow(non_upper_case_globals, unused_macros, unused_imports)] +use crate::low::macros::*; + +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Given table: uint64_t[height*16], copy table[idx*16...(idx+1)*16-1] +// into z[0..15]. +// This function is constant-time with respect to the value of `idx`. This is +// achieved by reading the whole table and using the bit-masking to get the +// `idx`-th row. +// +// extern void bignum_copy_row_from_table_16_neon +// (uint64_t *z, uint64_t *table, uint64_t height, uint64_t idx); +// +// Initial version written by Hanno Becker +// Standard ARM ABI: X0 = z, X1 = table, X2 = height, X3 = idx +// ---------------------------------------------------------------------------- + +// ***************************************************** +// Main code +// ***************************************************** + +macro_rules! z { + () => { + Q!("x0") + }; +} +macro_rules! tbl { + () => { + Q!("x1") + }; +} +macro_rules! height { + () => { + Q!("x2") + }; +} +macro_rules! idx { + () => { + Q!("x3") + }; +} + +macro_rules!
mask { + () => { + Q!("x5") + }; +} +macro_rules! cnt { + () => { + Q!("x6") + }; +} + +macro_rules! ventry0 { + () => { + Q!("v20") + }; +} +macro_rules! qentry0 { + () => { + Q!("q20") + }; +} +macro_rules! ventry1 { + () => { + Q!("v21") + }; +} +macro_rules! qentry1 { + () => { + Q!("q21") + }; +} +macro_rules! ventry2 { + () => { + Q!("v22") + }; +} +macro_rules! qentry2 { + () => { + Q!("q22") + }; +} +macro_rules! ventry3 { + () => { + Q!("v23") + }; +} +macro_rules! qentry3 { + () => { + Q!("q23") + }; +} +macro_rules! ventry4 { + () => { + Q!("v24") + }; +} +macro_rules! qentry4 { + () => { + Q!("q24") + }; +} +macro_rules! ventry5 { + () => { + Q!("v25") + }; +} +macro_rules! qentry5 { + () => { + Q!("q25") + }; +} +macro_rules! ventry6 { + () => { + Q!("v26") + }; +} +macro_rules! qentry6 { + () => { + Q!("q26") + }; +} +macro_rules! ventry7 { + () => { + Q!("v27") + }; +} +macro_rules! qentry7 { + () => { + Q!("q27") + }; +} +macro_rules! ventry8 { + () => { + Q!("v28") + }; +} + +macro_rules! vtmp { + () => { + Q!("v16") + }; +} +macro_rules! qtmp { + () => { + Q!("q16") + }; +} + +macro_rules! vmask { + () => { + Q!("v17") + }; +} + +pub fn bignum_copy_row_from_table_16_neon(z: &mut [u64], table: &[u64], height: u64, index: u64) { + debug_assert!(z.len() == 16); + debug_assert!(index < height); + unsafe { + core::arch::asm!( + + + // Clear accumulator + // Zeroing can be done via xor, but xor isn't formalized yet. 
+ Q!(" dup " ventry0!() ".2d, xzr"), + Q!(" mov " ventry1!() ".16b, " ventry0!() ".16b"), + Q!(" mov " ventry2!() ".16b, " ventry0!() ".16b"), + Q!(" mov " ventry3!() ".16b, " ventry0!() ".16b"), + Q!(" mov " ventry4!() ".16b, " ventry0!() ".16b"), + Q!(" mov " ventry5!() ".16b, " ventry0!() ".16b"), + Q!(" mov " ventry6!() ".16b, " ventry0!() ".16b"), + Q!(" mov " ventry7!() ".16b, " ventry0!() ".16b"), + + Q!(" mov " cnt!() ", #0"), + Q!(Label!("bignum_copy_row_from_table_16_neon_loop", 2) ":"), + + // Compute mask: Check if current index matches target index + Q!(" subs " "xzr, " cnt!() ", " idx!()), + Q!(" cinv " mask!() ", xzr, eq"), + Q!(" dup " vmask!() ".2d, " mask!()), + + Q!(" ldr " qtmp!() ", [" tbl!() ", #16 * 0]"), + Q!(" bit " ventry0!() ".16b, " vtmp!() ".16b, " vmask!() ".16b"), + + Q!(" ldr " qtmp!() ", [" tbl!() ", #16 * 1]"), + Q!(" bit " ventry1!() ".16b, " vtmp!() ".16b, " vmask!() ".16b"), + + Q!(" ldr " qtmp!() ", [" tbl!() ", #16 * 2]"), + Q!(" bit " ventry2!() ".16b, " vtmp!() ".16b, " vmask!() ".16b"), + + Q!(" ldr " qtmp!() ", [" tbl!() ", #16 * 3]"), + Q!(" bit " ventry3!() ".16b, " vtmp!() ".16b, " vmask!() ".16b"), + + Q!(" ldr " qtmp!() ", [" tbl!() ", #16 * 4]"), + Q!(" bit " ventry4!() ".16b, " vtmp!() ".16b, " vmask!() ".16b"), + + Q!(" ldr " qtmp!() ", [" tbl!() ", #16 * 5]"), + Q!(" bit " ventry5!() ".16b, " vtmp!() ".16b, " vmask!() ".16b"), + + Q!(" ldr " qtmp!() ", [" tbl!() ", #16 * 6]"), + Q!(" bit " ventry6!() ".16b, " vtmp!() ".16b, " vmask!() ".16b"), + + Q!(" ldr " qtmp!() ", [" tbl!() ", #16 * 7]"), + Q!(" bit " ventry7!() ".16b, " vtmp!() ".16b, " vmask!() ".16b"), + + Q!(" add " tbl!() ", " tbl!() ", #16 * 8"), + + Q!(" add " cnt!() ", " cnt!() ", #1"), + Q!(" subs " "xzr, " height!() ", " cnt!()), + Q!(" b.ne " Label!("bignum_copy_row_from_table_16_neon_loop", 2, Before)), + + Q!(Label!("bignum_copy_row_from_table_16_neon_end", 3) ":"), + + Q!(" str " qentry0!() ", [" z!() ", #16 * 0]"), + Q!(" str " qentry1!() ", [" 
z!() ", #16 * 1]"), + Q!(" str " qentry2!() ", [" z!() ", #16 * 2]"), + Q!(" str " qentry3!() ", [" z!() ", #16 * 3]"), + Q!(" str " qentry4!() ", [" z!() ", #16 * 4]"), + Q!(" str " qentry5!() ", [" z!() ", #16 * 5]"), + Q!(" str " qentry6!() ", [" z!() ", #16 * 6]"), + Q!(" str " qentry7!() ", [" z!() ", #16 * 7]"), + + inout("x0") z.as_mut_ptr() => _, + inout("x1") table.as_ptr() => _, + inout("x2") height => _, + inout("x3") index => _, + // clobbers + out("v16") _, + out("v17") _, + out("v20") _, + out("v21") _, + out("v22") _, + out("v23") _, + out("v24") _, + out("v25") _, + out("v26") _, + out("v27") _, + out("x5") _, + out("x6") _, + ) + }; +} diff --git a/graviola/src/low/aarch64/bignum_copy_row_from_table_32_neon.rs b/graviola/src/low/aarch64/bignum_copy_row_from_table_32_neon.rs new file mode 100644 index 00000000..c875ca9e --- /dev/null +++ b/graviola/src/low/aarch64/bignum_copy_row_from_table_32_neon.rs @@ -0,0 +1,369 @@ +#![allow(non_upper_case_globals, unused_macros, unused_imports)] +use crate::low::macros::*; + +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Given table: uint64_t[height*32], copy table[idx*32...(idx+1)*32-1] +// into z[0..31]. +// This function is constant-time with respect to the value of `idx`. This is +// achieved by reading the whole table and using the bit-masking to get the +// `idx`-th row. +// +// extern void bignum_copy_row_from_table_32_neon +// (uint64_t *z, uint64_t *table, uint64_t height, uint64_t idx); +// +// Initial version written by Hanno Becker +// Standard ARM ABI: X0 = z, X1 = table, X2 = height, X3 = idx +// ---------------------------------------------------------------------------- + +// ***************************************************** +// Main code +// ***************************************************** + +macro_rules!
z { + () => { + Q!("x0") + }; +} +macro_rules! tbl { + () => { + Q!("x1") + }; +} +macro_rules! height { + () => { + Q!("x2") + }; +} +macro_rules! idx { + () => { + Q!("x3") + }; +} + +macro_rules! mask { + () => { + Q!("x5") + }; +} +macro_rules! cnt { + () => { + Q!("x6") + }; +} + +macro_rules! ventry0 { + () => { + Q!("v20") + }; +} +macro_rules! qentry0 { + () => { + Q!("q20") + }; +} +macro_rules! ventry1 { + () => { + Q!("v21") + }; +} +macro_rules! qentry1 { + () => { + Q!("q21") + }; +} +macro_rules! ventry2 { + () => { + Q!("v22") + }; +} +macro_rules! qentry2 { + () => { + Q!("q22") + }; +} +macro_rules! ventry3 { + () => { + Q!("v23") + }; +} +macro_rules! qentry3 { + () => { + Q!("q23") + }; +} +macro_rules! ventry4 { + () => { + Q!("v24") + }; +} +macro_rules! qentry4 { + () => { + Q!("q24") + }; +} +macro_rules! ventry5 { + () => { + Q!("v25") + }; +} +macro_rules! qentry5 { + () => { + Q!("q25") + }; +} +macro_rules! ventry6 { + () => { + Q!("v26") + }; +} +macro_rules! qentry6 { + () => { + Q!("q26") + }; +} +macro_rules! ventry7 { + () => { + Q!("v27") + }; +} +macro_rules! qentry7 { + () => { + Q!("q27") + }; +} +macro_rules! ventry8 { + () => { + Q!("v28") + }; +} +macro_rules! qentry8 { + () => { + Q!("q28") + }; +} +macro_rules! ventry9 { + () => { + Q!("v29") + }; +} +macro_rules! qentry9 { + () => { + Q!("q29") + }; +} +macro_rules! ventry10 { + () => { + Q!("v30") + }; +} +macro_rules! qentry10 { + () => { + Q!("q30") + }; +} +macro_rules! ventry11 { + () => { + Q!("v31") + }; +} +macro_rules! qentry11 { + () => { + Q!("q31") + }; +} +macro_rules! ventry12 { + () => { + Q!("v0") + }; +} +macro_rules! qentry12 { + () => { + Q!("q0") + }; +} +macro_rules! ventry13 { + () => { + Q!("v1") + }; +} +macro_rules! qentry13 { + () => { + Q!("q1") + }; +} +macro_rules! ventry14 { + () => { + Q!("v2") + }; +} +macro_rules! qentry14 { + () => { + Q!("q2") + }; +} +macro_rules! ventry15 { + () => { + Q!("v3") + }; +} +macro_rules! 
qentry15 { + () => { + Q!("q3") + }; +} + +macro_rules! vtmp { + () => { + Q!("v16") + }; +} +macro_rules! qtmp { + () => { + Q!("q16") + }; +} + +macro_rules! vmask { + () => { + Q!("v17") + }; +} + +pub fn bignum_copy_row_from_table_32_neon(z: &mut [u64], table: &[u64], height: u64, index: u64) { + debug_assert!(z.len() == 32); + debug_assert!(index < height); + unsafe { + core::arch::asm!( + + + // Clear accumulator + // Zeroing can be done via xor, but xor isn't formalized yet. + Q!(" dup " ventry0!() ".2d, xzr"), + Q!(" mov " ventry1!() ".16b, " ventry0!() ".16b"), + Q!(" mov " ventry2!() ".16b, " ventry0!() ".16b"), + Q!(" mov " ventry3!() ".16b, " ventry0!() ".16b"), + Q!(" mov " ventry4!() ".16b, " ventry0!() ".16b"), + Q!(" mov " ventry5!() ".16b, " ventry0!() ".16b"), + Q!(" mov " ventry6!() ".16b, " ventry0!() ".16b"), + Q!(" mov " ventry7!() ".16b, " ventry0!() ".16b"), + Q!(" mov " ventry8!() ".16b, " ventry0!() ".16b"), + Q!(" mov " ventry9!() ".16b, " ventry0!() ".16b"), + Q!(" mov " ventry10!() ".16b, " ventry0!() ".16b"), + Q!(" mov " ventry11!() ".16b, " ventry0!() ".16b"), + Q!(" mov " ventry12!() ".16b, " ventry0!() ".16b"), + Q!(" mov " ventry13!() ".16b, " ventry0!() ".16b"), + Q!(" mov " ventry14!() ".16b, " ventry0!() ".16b"), + Q!(" mov " ventry15!() ".16b, " ventry0!() ".16b"), + + Q!(" mov " cnt!() ", #0"), + Q!(Label!("bignum_copy_row_from_table_32_neon_loop", 2) ":"), + + // Compute mask: Check if current index matches target index + Q!(" subs " "xzr, " cnt!() ", " idx!()), + Q!(" cinv " mask!() ", xzr, eq"), + Q!(" dup " vmask!() ".2d, " mask!()), + + Q!(" ldr " qtmp!() ", [" tbl!() ", #16 * 0]"), + Q!(" bit " ventry0!() ".16b, " vtmp!() ".16b, " vmask!() ".16b"), + + Q!(" ldr " qtmp!() ", [" tbl!() ", #16 * 1]"), + Q!(" bit " ventry1!() ".16b, " vtmp!() ".16b, " vmask!() ".16b"), + + Q!(" ldr " qtmp!() ", [" tbl!() ", #16 * 2]"), + Q!(" bit " ventry2!() ".16b, " vtmp!() ".16b, " vmask!() ".16b"), + + Q!(" ldr " qtmp!() ", [" tbl!() ", 
#16 * 3]"), + Q!(" bit " ventry3!() ".16b, " vtmp!() ".16b, " vmask!() ".16b"), + + Q!(" ldr " qtmp!() ", [" tbl!() ", #16 * 4]"), + Q!(" bit " ventry4!() ".16b, " vtmp!() ".16b, " vmask!() ".16b"), + + Q!(" ldr " qtmp!() ", [" tbl!() ", #16 * 5]"), + Q!(" bit " ventry5!() ".16b, " vtmp!() ".16b, " vmask!() ".16b"), + + Q!(" ldr " qtmp!() ", [" tbl!() ", #16 * 6]"), + Q!(" bit " ventry6!() ".16b, " vtmp!() ".16b, " vmask!() ".16b"), + + Q!(" ldr " qtmp!() ", [" tbl!() ", #16 * 7]"), + Q!(" bit " ventry7!() ".16b, " vtmp!() ".16b, " vmask!() ".16b"), + + Q!(" ldr " qtmp!() ", [" tbl!() ", #16 * 8]"), + Q!(" bit " ventry8!() ".16b, " vtmp!() ".16b, " vmask!() ".16b"), + + Q!(" ldr " qtmp!() ", [" tbl!() ", #16 * 9]"), + Q!(" bit " ventry9!() ".16b, " vtmp!() ".16b, " vmask!() ".16b"), + + Q!(" ldr " qtmp!() ", [" tbl!() ", #16 * 10]"), + Q!(" bit " ventry10!() ".16b, " vtmp!() ".16b, " vmask!() ".16b"), + + Q!(" ldr " qtmp!() ", [" tbl!() ", #16 * 11]"), + Q!(" bit " ventry11!() ".16b, " vtmp!() ".16b, " vmask!() ".16b"), + + Q!(" ldr " qtmp!() ", [" tbl!() ", #16 * 12]"), + Q!(" bit " ventry12!() ".16b, " vtmp!() ".16b, " vmask!() ".16b"), + + Q!(" ldr " qtmp!() ", [" tbl!() ", #16 * 13]"), + Q!(" bit " ventry13!() ".16b, " vtmp!() ".16b, " vmask!() ".16b"), + + Q!(" ldr " qtmp!() ", [" tbl!() ", #16 * 14]"), + Q!(" bit " ventry14!() ".16b, " vtmp!() ".16b, " vmask!() ".16b"), + + Q!(" ldr " qtmp!() ", [" tbl!() ", #16 * 15]"), + Q!(" bit " ventry15!() ".16b, " vtmp!() ".16b, " vmask!() ".16b"), + + Q!(" add " tbl!() ", " tbl!() ", #32 * 8"), + + Q!(" add " cnt!() ", " cnt!() ", #1"), + Q!(" subs " "xzr, " height!() ", " cnt!()), + Q!(" b.ne " Label!("bignum_copy_row_from_table_32_neon_loop", 2, Before)), + + Q!(Label!("bignum_copy_row_from_table_32_neon_end", 3) ":"), + + Q!(" str " qentry0!() ", [" z!() ", #16 * 0]"), + Q!(" str " qentry1!() ", [" z!() ", #16 * 1]"), + Q!(" str " qentry2!() ", [" z!() ", #16 * 2]"), + Q!(" str " qentry3!() ", [" z!() ", #16 * 
3]"), + Q!(" str " qentry4!() ", [" z!() ", #16 * 4]"), + Q!(" str " qentry5!() ", [" z!() ", #16 * 5]"), + Q!(" str " qentry6!() ", [" z!() ", #16 * 6]"), + Q!(" str " qentry7!() ", [" z!() ", #16 * 7]"), + Q!(" str " qentry8!() ", [" z!() ", #16 * 8]"), + Q!(" str " qentry9!() ", [" z!() ", #16 * 9]"), + Q!(" str " qentry10!() ", [" z!() ", #16 * 10]"), + Q!(" str " qentry11!() ", [" z!() ", #16 * 11]"), + Q!(" str " qentry12!() ", [" z!() ", #16 * 12]"), + Q!(" str " qentry13!() ", [" z!() ", #16 * 13]"), + Q!(" str " qentry14!() ", [" z!() ", #16 * 14]"), + Q!(" str " qentry15!() ", [" z!() ", #16 * 15]"), + + inout("x0") z.as_mut_ptr() => _, + inout("x1") table.as_ptr() => _, + inout("x2") height => _, + inout("x3") index => _, + // clobbers + out("v0") _, + out("v1") _, + out("v16") _, + out("v17") _, + out("v2") _, + out("v20") _, + out("v21") _, + out("v22") _, + out("v23") _, + out("v24") _, + out("v25") _, + out("v26") _, + out("v27") _, + out("v28") _, + out("v29") _, + out("v3") _, + out("v30") _, + out("v31") _, + out("x5") _, + out("x6") _, + ) + }; +} diff --git a/graviola/src/low/aarch64/bignum_copy_row_from_table_8n_neon.rs b/graviola/src/low/aarch64/bignum_copy_row_from_table_8n_neon.rs new file mode 100644 index 00000000..91b559c5 --- /dev/null +++ b/graviola/src/low/aarch64/bignum_copy_row_from_table_8n_neon.rs @@ -0,0 +1,158 @@ +#![allow(non_upper_case_globals, unused_macros, unused_imports)] +use crate::low::macros::*; + +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Given table: uint64_t[height*width], copy table[idx*width...(idx+1)*width-1] +// into z[0..width-1]. width must be a multiple of 8. +// This function is constant-time with respect to the value of `idx`. This is +// achieved by reading the whole table and using the bit-masking to get the +// `idx`-th row. 
+// +// extern void bignum_copy_row_from_table_8n_neon +// (uint64_t *z, uint64_t *table, uint64_t height, uint64_t width, uint64_t idx); +// +// Standard ARM ABI: X0 = z, X1 = table, X2 = height, X3 = width, X4 = idx +// ---------------------------------------------------------------------------- + +macro_rules! z { + () => { + Q!("x0") + }; +} +macro_rules! table { + () => { + Q!("x1") + }; +} +macro_rules! height { + () => { + Q!("x2") + }; +} +macro_rules! width { + () => { + Q!("x3") + }; +} +macro_rules! idx { + () => { + Q!("x4") + }; +} + +macro_rules! i { + () => { + Q!("x5") + }; +} +macro_rules! mask { + () => { + Q!("x6") + }; +} +macro_rules! j { + () => { + Q!("x7") + }; +} + +macro_rules! vmask { + () => { + Q!("v16") + }; +} + +pub fn bignum_copy_row_from_table_8n_neon( + z: &mut [u64], + table: &[u64], + height: u64, + width: u64, + index: u64, +) { + debug_assert!(z.len() as u64 == width); + debug_assert!(width % 8 == 0); + debug_assert!(index < height); + unsafe { + core::arch::asm!( + + + Q!(" cbz " height!() ", " Label!("bignum_copy_row_from_table_8n_neon_end", 2, After)), + Q!(" cbz " width!() ", " Label!("bignum_copy_row_from_table_8n_neon_end", 2, After)), + Q!(" mov " i!() ", " width!()), + Q!(" mov " "x6, " z!()), + Q!(" dup " "v16.2d, xzr"), + + Q!(Label!("bignum_copy_row_from_table_8n_neon_initzero", 3) ":"), + Q!(" str " "q16, [x6]"), + Q!(" str " "q16, [x6, #16]"), + Q!(" str " "q16, [x6, #32]"), + Q!(" str " "q16, [x6, #48]"), + Q!(" add " "x6, x6, #64"), + Q!(" subs " i!() ", " i!() ", #8"), + Q!(" bne " Label!("bignum_copy_row_from_table_8n_neon_initzero", 3, Before)), + + Q!(" mov " i!() ", xzr"), + Q!(" mov " "x8, " table!()), + + Q!(Label!("bignum_copy_row_from_table_8n_neon_outerloop", 4) ":"), + + Q!(" cmp " i!() ", " idx!()), + Q!(" csetm " mask!() ", eq"), + Q!(" dup " vmask!() ".2d, " mask!()), + + Q!(" mov " j!() ", " width!()), + Q!(" mov " "x9, " z!()), + + Q!(Label!("bignum_copy_row_from_table_8n_neon_innerloop", 5) ":"), +
Q!(" ldr " "q17, [x8]"), + Q!(" ldr " "q18, [x9]"), + Q!(" bit " "v18.16b, v17.16b, " vmask!() ".16b"), + Q!(" str " "q18, [x9]"), + + Q!(" ldr " "q17, [x8, #16]"), + Q!(" ldr " "q18, [x9, #16]"), + Q!(" bit " "v18.16b, v17.16b, " vmask!() ".16b"), + Q!(" str " "q18, [x9, #16]"), + + Q!(" ldr " "q17, [x8, #32]"), + Q!(" ldr " "q18, [x9, #32]"), + Q!(" bit " "v18.16b, v17.16b, " vmask!() ".16b"), + Q!(" str " "q18, [x9, #32]"), + + Q!(" ldr " "q17, [x8, #48]"), + Q!(" ldr " "q18, [x9, #48]"), + Q!(" bit " "v18.16b, v17.16b, " vmask!() ".16b"), + Q!(" str " "q18, [x9, #48]"), + + Q!(" add " "x8, x8, #64"), + Q!(" add " "x9, x9, #64"), + Q!(" subs " j!() ", " j!() ", #8"), + Q!(" bne " Label!("bignum_copy_row_from_table_8n_neon_innerloop", 5, Before)), + + Q!(Label!("bignum_copy_row_from_table_8n_neon_innerloop_done", 6) ":"), + Q!(" add " i!() ", " i!() ", #1"), + Q!(" cmp " i!() ", " height!()), + Q!(" bne " Label!("bignum_copy_row_from_table_8n_neon_outerloop", 4, Before)), + + Q!(Label!("bignum_copy_row_from_table_8n_neon_end", 2) ":"), + inout("x0") z.as_mut_ptr() => _, + inout("x1") table.as_ptr() => _, + inout("x2") height => _, + inout("x3") width => _, + inout("x4") index => _, + // clobbers + out("v16") _, + out("v17") _, + out("v18") _, + out("x5") _, + out("x6") _, + out("x7") _, + out("x8") _, + out("x9") _, + ) + }; +} diff --git a/graviola/src/low/aarch64/bignum_copy_row_from_table_mux.rs b/graviola/src/low/aarch64/bignum_copy_row_from_table_mux.rs new file mode 100644 index 00000000..f14f0471 --- /dev/null +++ b/graviola/src/low/aarch64/bignum_copy_row_from_table_mux.rs @@ -0,0 +1,29 @@ +// Written for Graviola by Joe Birr-Pixton, 2024. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +/// Multiplex between NEON specialisations of `bignum_copy_row_from_table` +#[inline] +pub fn bignum_copy_row_from_table( + z: &mut [u64], + table: &[u64], + height: u64, + width: u64, + index: u64, +) { + match width { + 32 => super::bignum_copy_row_from_table_32_neon::bignum_copy_row_from_table_32_neon( + z, table, height, index, + ), + 16 => super::bignum_copy_row_from_table_16_neon::bignum_copy_row_from_table_16_neon( + z, table, height, index, + ), + width if width % 8 == 0 => { + super::bignum_copy_row_from_table_8n_neon::bignum_copy_row_from_table_8n_neon( + z, table, height, width, index, + ) + } + width => super::bignum_copy_row_from_table::bignum_copy_row_from_table( + z, table, height, width, index, + ), + } +} diff --git a/graviola/src/low/aarch64/mod.rs b/graviola/src/low/aarch64/mod.rs index ace2d1f6..7873a622 100644 --- a/graviola/src/low/aarch64/mod.rs +++ b/graviola/src/low/aarch64/mod.rs @@ -8,6 +8,10 @@ pub(crate) mod bignum_add_p384; pub(crate) mod bignum_bitsize; pub(crate) mod bignum_cmp_lt; pub(crate) mod bignum_copy_row_from_table; +pub(crate) mod bignum_copy_row_from_table_16_neon; +pub(crate) mod bignum_copy_row_from_table_32_neon; +pub(crate) mod bignum_copy_row_from_table_8n_neon; +pub(crate) mod bignum_copy_row_from_table_mux; pub(crate) mod bignum_demont; pub(crate) mod bignum_demont_p256; pub(crate) mod bignum_demont_p384; diff --git a/graviola/src/low/mod.rs b/graviola/src/low/mod.rs index e0d47d46..7d5016fa 100644 --- a/graviola/src/low/mod.rs +++ b/graviola/src/low/mod.rs @@ -98,7 +98,7 @@ cfg_if::cfg_if! 
{ pub(crate) use aarch64::bignum_add_p384::bignum_add_p384; pub(crate) use aarch64::bignum_bitsize::bignum_bitsize; pub(crate) use aarch64::bignum_cmp_lt::bignum_cmp_lt; - pub(crate) use aarch64::bignum_copy_row_from_table::bignum_copy_row_from_table; + pub(crate) use aarch64::bignum_copy_row_from_table_mux::bignum_copy_row_from_table; pub(crate) use aarch64::bignum_point_select_p256::{bignum_aff_point_select_p256, bignum_jac_point_select_p256}; pub(crate) use aarch64::bignum_point_select_p384::bignum_jac_point_select_p384; pub(crate) use aarch64::bignum_demont::bignum_demont;