Skip to content

Commit

Permalink
Use neon bignum_copy_row_from_table specialisations
Browse files Browse the repository at this point in the history
  • Loading branch information
ctz committed Sep 28, 2024
1 parent f7a3d54 commit ccf5e9f
Show file tree
Hide file tree
Showing 9 changed files with 900 additions and 2 deletions.
68 changes: 68 additions & 0 deletions admin/parse-asm/driver.p256.py
Original file line number Diff line number Diff line change
Expand Up @@ -511,6 +511,74 @@
)
parse_file(input, d)

# Translate the s2n-bignum NEON table-row copy (any width that is a multiple
# of 8 words) into a Rust inline-asm wrapper under graviola/src/low/aarch64.
with open(
    "../../thirdparty/s2n-bignum/arm/generic/bignum_copy_row_from_table_8n_neon.S"
) as input, open(
    "../../graviola/src/low/aarch64/bignum_copy_row_from_table_8n_neon.rs", "w"
) as output:
    d = RustDriver(output, Architecture_aarch64)
    d.emit_rust_function(
        "bignum_copy_row_from_table_8n_neon",
        # Each entry is (direction, register, Rust operand expression); the
        # `=> _` form discards the register's final value.
        parameter_map=[
            ("inout", "x0", "z.as_mut_ptr() => _"),
            ("inout", "x1", "table.as_ptr() => _"),
            ("inout", "x2", "height => _"),
            ("inout", "x3", "width => _"),
            ("inout", "x4", "index => _"),
        ],
        rust_decl="pub fn bignum_copy_row_from_table_8n_neon(z: &mut [u64], table: &[u64], height: u64, width: u64, index: u64)",
        assertions=[
            "z.len() as u64 == width",
            "width % 8 == 0",
            "index < height",
            # The generated function is safe Rust but hands `table` to the
            # assembly as a raw pointer; the routine is expected to read
            # `height` rows of `width` words each, so the slice must be at
            # least that long.
            "height * width <= table.len() as u64",
        ],
    )
    parse_file(input, d)

# Translate the s2n-bignum NEON table-row copy specialised for 16-word rows
# into a Rust inline-asm wrapper under graviola/src/low/aarch64.
with open(
    "../../thirdparty/s2n-bignum/arm/generic/bignum_copy_row_from_table_16_neon.S"
) as input, open(
    "../../graviola/src/low/aarch64/bignum_copy_row_from_table_16_neon.rs", "w"
) as output:
    d = RustDriver(output, Architecture_aarch64)
    d.emit_rust_function(
        "bignum_copy_row_from_table_16_neon",
        # Each entry is (direction, register, Rust operand expression); the
        # `=> _` form discards the register's final value.  The row width is
        # fixed at 16, so there is no `width` argument.
        parameter_map=[
            ("inout", "x0", "z.as_mut_ptr() => _"),
            ("inout", "x1", "table.as_ptr() => _"),
            ("inout", "x2", "height => _"),
            ("inout", "x3", "index => _"),
        ],
        rust_decl="pub fn bignum_copy_row_from_table_16_neon(z: &mut [u64], table: &[u64], height: u64, index: u64)",
        assertions=[
            "z.len() == 16",
            "index < height",
            # The generated function is safe Rust but hands `table` to the
            # assembly as a raw pointer; the routine walks `height` rows of
            # 16 words each, so the slice must be at least that long.
            "height * 16 <= table.len() as u64",
        ],
    )
    parse_file(input, d)

# Translate the s2n-bignum NEON table-row copy specialised for 32-word rows
# into a Rust inline-asm wrapper under graviola/src/low/aarch64.
with open(
    "../../thirdparty/s2n-bignum/arm/generic/bignum_copy_row_from_table_32_neon.S"
) as input, open(
    "../../graviola/src/low/aarch64/bignum_copy_row_from_table_32_neon.rs", "w"
) as output:
    d = RustDriver(output, Architecture_aarch64)
    d.emit_rust_function(
        "bignum_copy_row_from_table_32_neon",
        # Each entry is (direction, register, Rust operand expression); the
        # `=> _` form discards the register's final value.  The row width is
        # fixed at 32, so there is no `width` argument.
        parameter_map=[
            ("inout", "x0", "z.as_mut_ptr() => _"),
            ("inout", "x1", "table.as_ptr() => _"),
            ("inout", "x2", "height => _"),
            ("inout", "x3", "index => _"),
        ],
        rust_decl="pub fn bignum_copy_row_from_table_32_neon(z: &mut [u64], table: &[u64], height: u64, index: u64)",
        assertions=[
            "z.len() == 32",
            "index < height",
            # The generated function is safe Rust but hands `table` to the
            # assembly as a raw pointer; the routine is expected to read
            # `height` rows of 32 words each, so the slice must be at least
            # that long.
            "height * 32 <= table.len() as u64",
        ],
    )
    parse_file(input, d)

with open("../../thirdparty/s2n-bignum/arm/p256/p256_montjadd.S") as input, open(
"../../graviola/src/low/aarch64/p256_montjadd.rs", "w"
) as output:
Expand Down
11 changes: 10 additions & 1 deletion admin/parse-asm/driver.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from io import StringIO
import copy

from parse import Type, tokenise, is_comment
from parse import Type, register_from_token, tokenise, is_comment


class Architecture:
Expand Down Expand Up @@ -553,6 +553,8 @@ def contains_constant_ref(self, *values):
def expand_rust_macros(self, *values, params={}):
for v in values:
for t in tokenise(v):
r = register_from_token(t)

if t in params:
yield unquote("$" + t)
elif t in self.rust_macros:
Expand All @@ -564,6 +566,13 @@ def expand_rust_macros(self, *values, params={}):
# must not have any arguments
assert macro_args == None
yield unquote("%s!()" % t)
elif r is not None and r.reg in self.rust_macros:
macro_value, macro_args = self.rust_macros[r.reg]
for vv in macro_value:
self.visit_operands(vv)
assert macro_args == None
yield unquote("%s!()" % r.reg)
yield r.suffix
elif is_comment(t):
yield unquote(t)
elif t in self.constant_syms:
Expand Down
15 changes: 15 additions & 0 deletions admin/parse-asm/parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import glob
import string
import subprocess
from collections import namedtuple
from os import path
import io

Expand Down Expand Up @@ -163,6 +164,20 @@ def is_comment(s):
return s.startswith("/*") and s.endswith("*/")


# Result of splitting a dotted token: `reg` is the text before the dot and
# `suffix` is the dot together with everything after it.
register = namedtuple("Register", "reg suffix")


def register_from_token(t):
    """Split a token containing exactly one dot into a ``register`` pair.

    Some operands are tokenised as a single unit but must be treated as two
    pieces for macro expansion: ``ventry4.16b`` is the macro name ``ventry4``
    followed by the neon width spec ``.16b``.

    Returns ``register(reg, suffix)`` when ``t`` contains exactly one ``.``,
    where ``suffix`` keeps its leading dot.  Returns ``None`` for any other
    token (no dot, or more than one).
    """
    head, dot, tail = t.partition(".")
    if not dot or "." in tail:
        # zero dots, or at least two: not a `name.suffix` operand
        return None
    return register(head, dot + tail)


def tokenise(s):
def tokenise_gen(s):
symbol = re.compile(r"^[a-zA-Z_][a-zA-Z0-9_]*")
Expand Down
246 changes: 246 additions & 0 deletions graviola/src/low/aarch64/bignum_copy_row_from_table_16_neon.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,246 @@
#![allow(non_upper_case_globals, unused_macros, unused_imports)]
use crate::low::macros::*;

// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0

// ----------------------------------------------------------------------------
// Given table: uint64_t[height*16], copy table[idx*16...(idx+1)*16-1]
// into z[0..15].
// This function is constant-time with respect to the value of `idx`. This is
// achieved by reading the whole table and using the bit-masking to get the
// `idx`-th row.
//
// extern void bignum_copy_row_from_table_16_neon
// (uint64_t *z, uint64_t *table, uint64_t height, uint64_t idx);
//
// Initial version written by Hanno Becker
// Standard ARM ABI: X0 = z, X1 = table, X2 = height, X3 = idx
// ----------------------------------------------------------------------------

// *****************************************************
// Main code
// *****************************************************

// Symbolic names for the argument registers. Each expands to the register
// that the asm block binds with a matching `inout(...)` operand.

// x0: destination pointer (`z.as_mut_ptr()`).
macro_rules! z {
() => {
Q!("x0")
};
}
// x1: table pointer (`table.as_ptr()`); the loop advances it by one row
// (`#16 * 8` bytes) per iteration.
macro_rules! tbl {
() => {
Q!("x1")
};
}
// x2: number of rows in the table.
macro_rules! height {
() => {
Q!("x2")
};
}
// x3: index of the row to copy out.
macro_rules! idx {
() => {
Q!("x3")
};
}

// Scalar scratch registers (both listed as clobbers of the asm block).

// x5: per-row select mask, produced by `cinv` from the `cnt` vs `idx`
// comparison and then broadcast into `vmask`.
macro_rules! mask {
() => {
Q!("x5")
};
}
// x6: loop counter, starts at 0 and is incremented once per row until it
// equals `height`.
macro_rules! cnt {
() => {
Q!("x6")
};
}

// Accumulators for the selected row: eight SIMD registers (v20..v27), each
// written back with a 16-byte `str`. The `ventryN` name is used where an
// arrangement suffix (`.16b`, `.2d`) is appended; the `qentryN` name is used
// for the `str` to `z`. Both names of a pair refer to the same register
// number.

macro_rules! ventry0 {
() => {
Q!("v20")
};
}
macro_rules! qentry0 {
() => {
Q!("q20")
};
}
macro_rules! ventry1 {
() => {
Q!("v21")
};
}
macro_rules! qentry1 {
() => {
Q!("q21")
};
}
macro_rules! ventry2 {
() => {
Q!("v22")
};
}
macro_rules! qentry2 {
() => {
Q!("q22")
};
}
macro_rules! ventry3 {
() => {
Q!("v23")
};
}
macro_rules! qentry3 {
() => {
Q!("q23")
};
}
macro_rules! ventry4 {
() => {
Q!("v24")
};
}
macro_rules! qentry4 {
() => {
Q!("q24")
};
}
macro_rules! ventry5 {
() => {
Q!("v25")
};
}
macro_rules! qentry5 {
() => {
Q!("q25")
};
}
macro_rules! ventry6 {
() => {
Q!("v26")
};
}
macro_rules! qentry6 {
() => {
Q!("q26")
};
}
macro_rules! ventry7 {
() => {
Q!("v27")
};
}
macro_rules! qentry7 {
() => {
Q!("q27")
};
}
// NOTE: `ventry8` is defined but not referenced by the asm block in this
// file (and v28 is not in its clobber list); it is covered by the file-level
// `unused_macros` allow.
macro_rules! ventry8 {
() => {
Q!("v28")
};
}

// SIMD scratch registers (v16 and v17 are listed as clobbers of the asm
// block).

// v16 / q16: holds the 16 bytes most recently loaded from the table.
// `qtmp` is the name used by `ldr`; `vtmp` is the name used by `bit`.
macro_rules! vtmp {
() => {
Q!("v16")
};
}
macro_rules! qtmp {
() => {
Q!("q16")
};
}

// v17: `mask` broadcast to both 64-bit lanes; used as the select operand of
// every `bit` instruction.
macro_rules! vmask {
() => {
Q!("v17")
};
}

/// Copies row `index` of `table` into `z`, where `table` is viewed as
/// `height` consecutive rows of 16 `u64` words.
///
/// The loop visits every row: each row is loaded in full and merged into the
/// accumulators under a mask that is all-ones only for the row whose number
/// equals `index`, so which addresses are read does not depend on `index`.
///
/// Preconditions (checked with `debug_assert!` only):
/// - `z.len() == 16`
/// - `index < height`
/// - `table` holds at least `height * 16` words
pub fn bignum_copy_row_from_table_16_neon(z: &mut [u64], table: &[u64], height: u64, index: u64) {
    debug_assert!(z.len() == 16);
    debug_assert!(index < height);
    // The asm below reads `height` rows of 16 words through a raw pointer,
    // so a shorter slice would be read out of bounds.
    debug_assert!(height * 16 <= table.len() as u64);
    unsafe {
        core::arch::asm!(


            // Clear accumulator
            // Zeroing can be done via xor, but xor isn't formalized yet.
            Q!("    dup             " ventry0!() ".2d, xzr"),
            Q!("    mov             " ventry1!() ".16b, " ventry0!() ".16b"),
            Q!("    mov             " ventry2!() ".16b, " ventry0!() ".16b"),
            Q!("    mov             " ventry3!() ".16b, " ventry0!() ".16b"),
            Q!("    mov             " ventry4!() ".16b, " ventry0!() ".16b"),
            Q!("    mov             " ventry5!() ".16b, " ventry0!() ".16b"),
            Q!("    mov             " ventry6!() ".16b, " ventry0!() ".16b"),
            Q!("    mov             " ventry7!() ".16b, " ventry0!() ".16b"),

            Q!("    mov             " cnt!() ", #0"),
            Q!(Label!("bignum_copy_row_from_table_16_neon_loop", 2) ":"),

            // Compute mask: Check if current index matches target index
            Q!("    subs            " "xzr, " cnt!() ", " idx!()),
            Q!("    cinv            " mask!() ", xzr, eq"),
            Q!("    dup             " vmask!() ".2d, " mask!()),

            // Load each 16-byte chunk of the current row and insert it into
            // the matching accumulator where the mask bits are set.
            Q!("    ldr             " qtmp!() ", [" tbl!() ", #16 * 0]"),
            Q!("    bit             " ventry0!() ".16b, " vtmp!() ".16b, " vmask!() ".16b"),

            Q!("    ldr             " qtmp!() ", [" tbl!() ", #16 * 1]"),
            Q!("    bit             " ventry1!() ".16b, " vtmp!() ".16b, " vmask!() ".16b"),

            Q!("    ldr             " qtmp!() ", [" tbl!() ", #16 * 2]"),
            Q!("    bit             " ventry2!() ".16b, " vtmp!() ".16b, " vmask!() ".16b"),

            Q!("    ldr             " qtmp!() ", [" tbl!() ", #16 * 3]"),
            Q!("    bit             " ventry3!() ".16b, " vtmp!() ".16b, " vmask!() ".16b"),

            Q!("    ldr             " qtmp!() ", [" tbl!() ", #16 * 4]"),
            Q!("    bit             " ventry4!() ".16b, " vtmp!() ".16b, " vmask!() ".16b"),

            Q!("    ldr             " qtmp!() ", [" tbl!() ", #16 * 5]"),
            Q!("    bit             " ventry5!() ".16b, " vtmp!() ".16b, " vmask!() ".16b"),

            Q!("    ldr             " qtmp!() ", [" tbl!() ", #16 * 6]"),
            Q!("    bit             " ventry6!() ".16b, " vtmp!() ".16b, " vmask!() ".16b"),

            Q!("    ldr             " qtmp!() ", [" tbl!() ", #16 * 7]"),
            Q!("    bit             " ventry7!() ".16b, " vtmp!() ".16b, " vmask!() ".16b"),

            // Advance to the next row (8 chunks of 16 bytes).
            Q!("    add             " tbl!() ", " tbl!() ", #16 * 8"),

            Q!("    add             " cnt!() ", " cnt!() ", #1"),
            Q!("    subs            " "xzr, " height!() ", " cnt!()),
            Q!("    b.ne            " Label!("bignum_copy_row_from_table_16_neon_loop", 2, Before)),

            Q!(Label!("bignum_copy_row_from_table_16_neon_end", 3) ":"),

            // Write the accumulated row out to z.
            Q!("    str             " qentry0!() ", [" z!() ", #16 * 0]"),
            Q!("    str             " qentry1!() ", [" z!() ", #16 * 1]"),
            Q!("    str             " qentry2!() ", [" z!() ", #16 * 2]"),
            Q!("    str             " qentry3!() ", [" z!() ", #16 * 3]"),
            Q!("    str             " qentry4!() ", [" z!() ", #16 * 4]"),
            Q!("    str             " qentry5!() ", [" z!() ", #16 * 5]"),
            Q!("    str             " qentry6!() ", [" z!() ", #16 * 6]"),
            Q!("    str             " qentry7!() ", [" z!() ", #16 * 7]"),

            inout("x0") z.as_mut_ptr() => _,
            inout("x1") table.as_ptr() => _,
            inout("x2") height => _,
            inout("x3") index => _,
            // clobbers
            out("v16") _,
            out("v17") _,
            out("v20") _,
            out("v21") _,
            out("v22") _,
            out("v23") _,
            out("v24") _,
            out("v25") _,
            out("v26") _,
            out("v27") _,
            out("x5") _,
            out("x6") _,
        )
    };
}
Loading

0 comments on commit ccf5e9f

Please sign in to comment.