diff --git a/admin/parse-asm/driver.p256.py b/admin/parse-asm/driver.p256.py
index 4e3f72e2..4817d12e 100644
--- a/admin/parse-asm/driver.p256.py
+++ b/admin/parse-asm/driver.p256.py
@@ -511,6 +511,74 @@
         )
         parse_file(input, d)
 
+    with open(
+        "../../thirdparty/s2n-bignum/arm/generic/bignum_copy_row_from_table_8n_neon.S"
+    ) as input, open(
+        "../../graviola/src/low/aarch64/bignum_copy_row_from_table_8n_neon.rs", "w"
+    ) as output:
+        d = RustDriver(output, Architecture_aarch64)
+        d.emit_rust_function(
+            "bignum_copy_row_from_table_8n_neon",
+            parameter_map=[
+                ("inout", "x0", "z.as_mut_ptr() => _"),
+                ("inout", "x1", "table.as_ptr() => _"),
+                ("inout", "x2", "height => _"),
+                ("inout", "x3", "width => _"),
+                ("inout", "x4", "index => _"),
+            ],
+            rust_decl="pub fn bignum_copy_row_from_table_8n_neon(z: &mut [u64], table: &[u64], height: u64, width: u64, index: u64)",
+            assertions=[
+                "z.len() as u64 == width",
+                "width % 8 == 0",
+                "index < height",
+            ],
+        )
+        parse_file(input, d)
+
+    with open(
+        "../../thirdparty/s2n-bignum/arm/generic/bignum_copy_row_from_table_16_neon.S"
+    ) as input, open(
+        "../../graviola/src/low/aarch64/bignum_copy_row_from_table_16_neon.rs", "w"
+    ) as output:
+        d = RustDriver(output, Architecture_aarch64)
+        d.emit_rust_function(
+            "bignum_copy_row_from_table_16_neon",
+            parameter_map=[
+                ("inout", "x0", "z.as_mut_ptr() => _"),
+                ("inout", "x1", "table.as_ptr() => _"),
+                ("inout", "x2", "height => _"),
+                ("inout", "x3", "index => _"),
+            ],
+            rust_decl="pub fn bignum_copy_row_from_table_16_neon(z: &mut [u64], table: &[u64], height: u64, index: u64)",
+            assertions=[
+                "z.len() == 16",
+                "index < height",
+            ],
+        )
+        parse_file(input, d)
+
+    with open(
+        "../../thirdparty/s2n-bignum/arm/generic/bignum_copy_row_from_table_32_neon.S"
+    ) as input, open(
+        "../../graviola/src/low/aarch64/bignum_copy_row_from_table_32_neon.rs", "w"
+    ) as output:
+        d = RustDriver(output, Architecture_aarch64)
+        d.emit_rust_function(
+            "bignum_copy_row_from_table_32_neon",
+            parameter_map=[
+                ("inout", "x0", "z.as_mut_ptr() => _"),
+                ("inout", "x1", "table.as_ptr() => _"),
+                ("inout", "x2", "height => _"),
+                ("inout", "x3", "index => _"),
+            ],
+            rust_decl="pub fn bignum_copy_row_from_table_32_neon(z: &mut [u64], table: &[u64], height: u64, index: u64)",
+            assertions=[
+                "z.len() == 32",
+                "index < height",
+            ],
+        )
+        parse_file(input, d)
+
     with open("../../thirdparty/s2n-bignum/arm/p256/p256_montjadd.S") as input, open(
         "../../graviola/src/low/aarch64/p256_montjadd.rs", "w"
     ) as output:
diff --git a/admin/parse-asm/driver.py b/admin/parse-asm/driver.py
index 52be8c62..9083ab0b 100644
--- a/admin/parse-asm/driver.py
+++ b/admin/parse-asm/driver.py
@@ -4,7 +4,7 @@
 from io import StringIO
 import copy
 
-from parse import Type, tokenise, is_comment
+from parse import Type, register_from_token, tokenise, is_comment
 
 
 class Architecture:
@@ -553,6 +553,8 @@ def contains_constant_ref(self, *values):
     def expand_rust_macros(self, *values, params={}):
         for v in values:
             for t in tokenise(v):
+                r = register_from_token(t)
+
                 if t in params:
                     yield unquote("$" + t)
                 elif t in self.rust_macros:
@@ -564,6 +566,13 @@ def expand_rust_macros(self, *values, params={}):
                     # must not have any arguments
                     assert macro_args == None
                     yield unquote("%s!()" % t)
+                elif r is not None and r.reg in self.rust_macros:
+                    macro_value, macro_args = self.rust_macros[r.reg]
+                    for vv in macro_value:
+                        self.visit_operands(vv)
+                    assert macro_args == None
+                    yield unquote("%s!()" % r.reg)
+                    yield r.suffix
                 elif is_comment(t):
                     yield unquote(t)
                 elif t in self.constant_syms:
diff --git a/admin/parse-asm/parse.py b/admin/parse-asm/parse.py
index cb9a2d95..ab07f2a0 100644
--- a/admin/parse-asm/parse.py
+++ b/admin/parse-asm/parse.py
@@ -3,6 +3,7 @@
 import glob
 import string
 import subprocess
+from collections import namedtuple
 from os import path
 import io
 
@@ -163,6 +164,20 @@ def is_comment(s):
     return s.startswith("/*") and s.endswith("*/")
 
 
+register = namedtuple("Register", "reg suffix")
+
+
+def register_from_token(t):
+    # Split a token holding exactly one "." into (reg, suffix) so that the
+    # part before the dot can be macro-expanded separately from the rest,
+    # eg `ventry4.16b` becomes reg `ventry4` (a macro name) and suffix
+    # `.16b` (neon width spec; the suffix keeps its leading dot). Any other
+    # token falls off the end, so the caller gets None.
+    if t.count(".") == 1:
+        idx = t.find(".")
+        return register(t[:idx], t[idx:])
+
+
 def tokenise(s):
     def tokenise_gen(s):
         symbol = re.compile(r"^[a-zA-Z_][a-zA-Z0-9_]*")
diff --git a/graviola/src/low/aarch64/bignum_copy_row_from_table_16_neon.rs b/graviola/src/low/aarch64/bignum_copy_row_from_table_16_neon.rs
new file mode 100644
index 00000000..dc606907
--- /dev/null
+++ b/graviola/src/low/aarch64/bignum_copy_row_from_table_16_neon.rs
@@ -0,0 +1,246 @@
+#![allow(non_upper_case_globals, unused_macros, unused_imports)]
+use crate::low::macros::*;
+
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0
+
+// ----------------------------------------------------------------------------
+// Given table: uint64_t[height*16], copy table[idx*16...(idx+1)*16-1]
+// into z[0..15].
+// This function is constant-time with respect to the value of `idx`. This is
+// achieved by reading the whole table and using bit-masking to select the
+// `idx`-th row.
+//
+//    extern void bignum_copy_row_from_table_16_neon
+//     (uint64_t *z, uint64_t *table, uint64_t height, uint64_t idx);
+//
+// Initial version written by Hanno Becker
+// Standard ARM ABI: X0 = z, X1 = table, X2 = height, X3 = idx
+// ----------------------------------------------------------------------------
+
+// *****************************************************
+// Main code
+// *****************************************************
+
+macro_rules! z {
+    () => {
+        Q!("x0")
+    };
+}
+macro_rules! tbl {
+    () => {
+        Q!("x1")
+    };
+}
+macro_rules! height {
+    () => {
+        Q!("x2")
+    };
+}
+macro_rules! idx {
+    () => {
+        Q!("x3")
+    };
+}
+
+macro_rules! mask {
+    () => {
+        Q!("x5")
+    };
+}
+macro_rules! cnt {
+    () => {
+        Q!("x6")
+    };
+}
+
+macro_rules! ventry0 {
+    () => {
+        Q!("v20")
+    };
+}
+macro_rules! qentry0 {
+    () => {
+        Q!("q20")
+    };
+}
+macro_rules! ventry1 {
+    () => {
+        Q!("v21")
+    };
+}
+macro_rules! qentry1 {
+    () => {
+        Q!("q21")
+    };
+}
+macro_rules! ventry2 {
+    () => {
+        Q!("v22")
+    };
+}
+macro_rules! qentry2 {
+    () => {
+        Q!("q22")
+    };
+}
+macro_rules! ventry3 {
+    () => {
+        Q!("v23")
+    };
+}
+macro_rules! qentry3 {
+    () => {
+        Q!("q23")
+    };
+}
+macro_rules! ventry4 {
+    () => {
+        Q!("v24")
+    };
+}
+macro_rules! qentry4 {
+    () => {
+        Q!("q24")
+    };
+}
+macro_rules! ventry5 {
+    () => {
+        Q!("v25")
+    };
+}
+macro_rules! qentry5 {
+    () => {
+        Q!("q25")
+    };
+}
+macro_rules! ventry6 {
+    () => {
+        Q!("v26")
+    };
+}
+macro_rules! qentry6 {
+    () => {
+        Q!("q26")
+    };
+}
+macro_rules! ventry7 {
+    () => {
+        Q!("v27")
+    };
+}
+macro_rules! qentry7 {
+    () => {
+        Q!("q27")
+    };
+}
+macro_rules! ventry8 {
+    () => {
+        Q!("v28")
+    };
+}
+
+macro_rules! vtmp {
+    () => {
+        Q!("v16")
+    };
+}
+macro_rules! qtmp {
+    () => {
+        Q!("q16")
+    };
+}
+
+macro_rules! vmask {
+    () => {
+        Q!("v17")
+    };
+}
+
+pub fn bignum_copy_row_from_table_16_neon(z: &mut [u64], table: &[u64], height: u64, index: u64) {
+    debug_assert!(z.len() == 16);
+    debug_assert!(index < height);
+    unsafe {
+        core::arch::asm!(
+
+
+        // Clear accumulator
+        // Zeroing can be done via xor, but xor isn't formalized yet.
+        Q!("    dup             " ventry0!() ".2d, xzr"),
+        Q!("    mov             " ventry1!() ".16b, " ventry0!() ".16b"),
+        Q!("    mov             " ventry2!() ".16b, " ventry0!() ".16b"),
+        Q!("    mov             " ventry3!() ".16b, " ventry0!() ".16b"),
+        Q!("    mov             " ventry4!() ".16b, " ventry0!() ".16b"),
+        Q!("    mov             " ventry5!() ".16b, " ventry0!() ".16b"),
+        Q!("    mov             " ventry6!() ".16b, " ventry0!() ".16b"),
+        Q!("    mov             " ventry7!() ".16b, " ventry0!() ".16b"),
+
+        Q!("    mov             " cnt!() ", #0"),
+        Q!(Label!("bignum_copy_row_from_table_16_neon_loop", 2) ":"),
+
+        // Compute mask: Check if current index matches target index
+        Q!("    subs            " "xzr, " cnt!() ", " idx!()),
+        Q!("    cinv            " mask!() ", xzr, eq"),
+        Q!("    dup             " vmask!() ".2d, " mask!()),
+
+        Q!("    ldr             " qtmp!() ", [" tbl!() ", #16 * 0]"),
+        Q!("    bit             " ventry0!() ".16b, " vtmp!() ".16b, " vmask!() ".16b"),
+
+        Q!("    ldr             " qtmp!() ", [" tbl!() ", #16 * 1]"),
+        Q!("    bit             " ventry1!() ".16b, " vtmp!() ".16b, " vmask!() ".16b"),
+
+        Q!("    ldr             " qtmp!() ", [" tbl!() ", #16 * 2]"),
+        Q!("    bit             " ventry2!() ".16b, " vtmp!() ".16b, " vmask!() ".16b"),
+
+        Q!("    ldr             " qtmp!() ", [" tbl!() ", #16 * 3]"),
+        Q!("    bit             " ventry3!() ".16b, " vtmp!() ".16b, " vmask!() ".16b"),
+
+        Q!("    ldr             " qtmp!() ", [" tbl!() ", #16 * 4]"),
+        Q!("    bit             " ventry4!() ".16b, " vtmp!() ".16b, " vmask!() ".16b"),
+
+        Q!("    ldr             " qtmp!() ", [" tbl!() ", #16 * 5]"),
+        Q!("    bit             " ventry5!() ".16b, " vtmp!() ".16b, " vmask!() ".16b"),
+
+        Q!("    ldr             " qtmp!() ", [" tbl!() ", #16 * 6]"),
+        Q!("    bit             " ventry6!() ".16b, " vtmp!() ".16b, " vmask!() ".16b"),
+
+        Q!("    ldr             " qtmp!() ", [" tbl!() ", #16 * 7]"),
+        Q!("    bit             " ventry7!() ".16b, " vtmp!() ".16b, " vmask!() ".16b"),
+
+        Q!("    add             " tbl!() ", " tbl!() ", #16 * 8"),
+
+        Q!("    add             " cnt!() ", " cnt!() ", #1"),
+        Q!("    subs            " "xzr, " height!() ", " cnt!()),
+        Q!("    b.ne            " Label!("bignum_copy_row_from_table_16_neon_loop", 2, Before)),
+
+        Q!(Label!("bignum_copy_row_from_table_16_neon_end", 3) ":"),
+
+        Q!("    str             " qentry0!() ", [" z!() ", #16 * 0]"),
+        Q!("    str             " qentry1!() ", [" z!() ", #16 * 1]"),
+        Q!("    str             " qentry2!() ", [" z!() ", #16 * 2]"),
+        Q!("    str             " qentry3!() ", [" z!() ", #16 * 3]"),
+        Q!("    str             " qentry4!() ", [" z!() ", #16 * 4]"),
+        Q!("    str             " qentry5!() ", [" z!() ", #16 * 5]"),
+        Q!("    str             " qentry6!() ", [" z!() ", #16 * 6]"),
+        Q!("    str             " qentry7!() ", [" z!() ", #16 * 7]"),
+
+        inout("x0") z.as_mut_ptr() => _,
+        inout("x1") table.as_ptr() => _,
+        inout("x2") height => _,
+        inout("x3") index => _,
+        // clobbers
+        out("v16") _,
+        out("v17") _,
+        out("v20") _,
+        out("v21") _,
+        out("v22") _,
+        out("v23") _,
+        out("v24") _,
+        out("v25") _,
+        out("v26") _,
+        out("v27") _,
+        out("x5") _,
+        out("x6") _,
+            )
+    };
+}
diff --git a/graviola/src/low/aarch64/bignum_copy_row_from_table_32_neon.rs b/graviola/src/low/aarch64/bignum_copy_row_from_table_32_neon.rs
new file mode 100644
index 00000000..c875ca9e
--- /dev/null
+++ b/graviola/src/low/aarch64/bignum_copy_row_from_table_32_neon.rs
@@ -0,0 +1,369 @@
+#![allow(non_upper_case_globals, unused_macros, unused_imports)]
+use crate::low::macros::*;
+
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0
+
+// ----------------------------------------------------------------------------
+// Given table: uint64_t[height*32], copy table[idx*32...(idx+1)*32-1]
+// into z[0..31].
+// This function is constant-time with respect to the value of `idx`. This is
+// achieved by reading the whole table and using bit-masking to select the
+// `idx`-th row.
+//
+//    extern void bignum_copy_row_from_table_32_neon
+//     (uint64_t *z, uint64_t *table, uint64_t height, uint64_t idx);
+//
+// Initial version written by Hanno Becker
+// Standard ARM ABI: X0 = z, X1 = table, X2 = height, X3 = idx
+// ----------------------------------------------------------------------------
+
+// *****************************************************
+// Main code
+// *****************************************************
+
+macro_rules! z {
+    () => {
+        Q!("x0")
+    };
+}
+macro_rules! tbl {
+    () => {
+        Q!("x1")
+    };
+}
+macro_rules! height {
+    () => {
+        Q!("x2")
+    };
+}
+macro_rules! idx {
+    () => {
+        Q!("x3")
+    };
+}
+
+macro_rules! mask {
+    () => {
+        Q!("x5")
+    };
+}
+macro_rules! cnt {
+    () => {
+        Q!("x6")
+    };
+}
+
+macro_rules! ventry0 {
+    () => {
+        Q!("v20")
+    };
+}
+macro_rules! qentry0 {
+    () => {
+        Q!("q20")
+    };
+}
+macro_rules! ventry1 {
+    () => {
+        Q!("v21")
+    };
+}
+macro_rules! qentry1 {
+    () => {
+        Q!("q21")
+    };
+}
+macro_rules! ventry2 {
+    () => {
+        Q!("v22")
+    };
+}
+macro_rules! qentry2 {
+    () => {
+        Q!("q22")
+    };
+}
+macro_rules! ventry3 {
+    () => {
+        Q!("v23")
+    };
+}
+macro_rules! qentry3 {
+    () => {
+        Q!("q23")
+    };
+}
+macro_rules! ventry4 {
+    () => {
+        Q!("v24")
+    };
+}
+macro_rules! qentry4 {
+    () => {
+        Q!("q24")
+    };
+}
+macro_rules! ventry5 {
+    () => {
+        Q!("v25")
+    };
+}
+macro_rules! qentry5 {
+    () => {
+        Q!("q25")
+    };
+}
+macro_rules! ventry6 {
+    () => {
+        Q!("v26")
+    };
+}
+macro_rules! qentry6 {
+    () => {
+        Q!("q26")
+    };
+}
+macro_rules! ventry7 {
+    () => {
+        Q!("v27")
+    };
+}
+macro_rules! qentry7 {
+    () => {
+        Q!("q27")
+    };
+}
+macro_rules! ventry8 {
+    () => {
+        Q!("v28")
+    };
+}
+macro_rules! qentry8 {
+    () => {
+        Q!("q28")
+    };
+}
+macro_rules! ventry9 {
+    () => {
+        Q!("v29")
+    };
+}
+macro_rules! qentry9 {
+    () => {
+        Q!("q29")
+    };
+}
+macro_rules! ventry10 {
+    () => {
+        Q!("v30")
+    };
+}
+macro_rules! qentry10 {
+    () => {
+        Q!("q30")
+    };
+}
+macro_rules! ventry11 {
+    () => {
+        Q!("v31")
+    };
+}
+macro_rules! qentry11 {
+    () => {
+        Q!("q31")
+    };
+}
+macro_rules! ventry12 {
+    () => {
+        Q!("v0")
+    };
+}
+macro_rules! qentry12 {
+    () => {
+        Q!("q0")
+    };
+}
+macro_rules! ventry13 {
+    () => {
+        Q!("v1")
+    };
+}
+macro_rules! qentry13 {
+    () => {
+        Q!("q1")
+    };
+}
+macro_rules! ventry14 {
+    () => {
+        Q!("v2")
+    };
+}
+macro_rules! qentry14 {
+    () => {
+        Q!("q2")
+    };
+}
+macro_rules! ventry15 {
+    () => {
+        Q!("v3")
+    };
+}
+macro_rules! qentry15 {
+    () => {
+        Q!("q3")
+    };
+}
+
+macro_rules! vtmp {
+    () => {
+        Q!("v16")
+    };
+}
+macro_rules! qtmp {
+    () => {
+        Q!("q16")
+    };
+}
+
+macro_rules! vmask {
+    () => {
+        Q!("v17")
+    };
+}
+
+pub fn bignum_copy_row_from_table_32_neon(z: &mut [u64], table: &[u64], height: u64, index: u64) {
+    debug_assert!(z.len() == 32);
+    debug_assert!(index < height);
+    unsafe {
+        core::arch::asm!(
+
+
+        // Clear accumulator
+        // Zeroing can be done via xor, but xor isn't formalized yet.
+        Q!("    dup             " ventry0!() ".2d, xzr"),
+        Q!("    mov             " ventry1!() ".16b, " ventry0!() ".16b"),
+        Q!("    mov             " ventry2!() ".16b, " ventry0!() ".16b"),
+        Q!("    mov             " ventry3!() ".16b, " ventry0!() ".16b"),
+        Q!("    mov             " ventry4!() ".16b, " ventry0!() ".16b"),
+        Q!("    mov             " ventry5!() ".16b, " ventry0!() ".16b"),
+        Q!("    mov             " ventry6!() ".16b, " ventry0!() ".16b"),
+        Q!("    mov             " ventry7!() ".16b, " ventry0!() ".16b"),
+        Q!("    mov             " ventry8!() ".16b, " ventry0!() ".16b"),
+        Q!("    mov             " ventry9!() ".16b, " ventry0!() ".16b"),
+        Q!("    mov             " ventry10!() ".16b, " ventry0!() ".16b"),
+        Q!("    mov             " ventry11!() ".16b, " ventry0!() ".16b"),
+        Q!("    mov             " ventry12!() ".16b, " ventry0!() ".16b"),
+        Q!("    mov             " ventry13!() ".16b, " ventry0!() ".16b"),
+        Q!("    mov             " ventry14!() ".16b, " ventry0!() ".16b"),
+        Q!("    mov             " ventry15!() ".16b, " ventry0!() ".16b"),
+
+        Q!("    mov             " cnt!() ", #0"),
+        Q!(Label!("bignum_copy_row_from_table_32_neon_loop", 2) ":"),
+
+        // Compute mask: Check if current index matches target index
+        Q!("    subs            " "xzr, " cnt!() ", " idx!()),
+        Q!("    cinv            " mask!() ", xzr, eq"),
+        Q!("    dup             " vmask!() ".2d, " mask!()),
+
+        Q!("    ldr             " qtmp!() ", [" tbl!() ", #16 * 0]"),
+        Q!("    bit             " ventry0!() ".16b, " vtmp!() ".16b, " vmask!() ".16b"),
+
+        Q!("    ldr             " qtmp!() ", [" tbl!() ", #16 * 1]"),
+        Q!("    bit             " ventry1!() ".16b, " vtmp!() ".16b, " vmask!() ".16b"),
+
+        Q!("    ldr             " qtmp!() ", [" tbl!() ", #16 * 2]"),
+        Q!("    bit             " ventry2!() ".16b, " vtmp!() ".16b, " vmask!() ".16b"),
+
+        Q!("    ldr             " qtmp!() ", [" tbl!() ", #16 * 3]"),
+        Q!("    bit             " ventry3!() ".16b, " vtmp!() ".16b, " vmask!() ".16b"),
+
+        Q!("    ldr             " qtmp!() ", [" tbl!() ", #16 * 4]"),
+        Q!("    bit             " ventry4!() ".16b, " vtmp!() ".16b, " vmask!() ".16b"),
+
+        Q!("    ldr             " qtmp!() ", [" tbl!() ", #16 * 5]"),
+        Q!("    bit             " ventry5!() ".16b, " vtmp!() ".16b, " vmask!() ".16b"),
+
+        Q!("    ldr             " qtmp!() ", [" tbl!() ", #16 * 6]"),
+        Q!("    bit             " ventry6!() ".16b, " vtmp!() ".16b, " vmask!() ".16b"),
+
+        Q!("    ldr             " qtmp!() ", [" tbl!() ", #16 * 7]"),
+        Q!("    bit             " ventry7!() ".16b, " vtmp!() ".16b, " vmask!() ".16b"),
+
+        Q!("    ldr             " qtmp!() ", [" tbl!() ", #16 * 8]"),
+        Q!("    bit             " ventry8!() ".16b, " vtmp!() ".16b, " vmask!() ".16b"),
+
+        Q!("    ldr             " qtmp!() ", [" tbl!() ", #16 * 9]"),
+        Q!("    bit             " ventry9!() ".16b, " vtmp!() ".16b, " vmask!() ".16b"),
+
+        Q!("    ldr             " qtmp!() ", [" tbl!() ", #16 * 10]"),
+        Q!("    bit             " ventry10!() ".16b, " vtmp!() ".16b, " vmask!() ".16b"),
+
+        Q!("    ldr             " qtmp!() ", [" tbl!() ", #16 * 11]"),
+        Q!("    bit             " ventry11!() ".16b, " vtmp!() ".16b, " vmask!() ".16b"),
+
+        Q!("    ldr             " qtmp!() ", [" tbl!() ", #16 * 12]"),
+        Q!("    bit             " ventry12!() ".16b, " vtmp!() ".16b, " vmask!() ".16b"),
+
+        Q!("    ldr             " qtmp!() ", [" tbl!() ", #16 * 13]"),
+        Q!("    bit             " ventry13!() ".16b, " vtmp!() ".16b, " vmask!() ".16b"),
+
+        Q!("    ldr             " qtmp!() ", [" tbl!() ", #16 * 14]"),
+        Q!("    bit             " ventry14!() ".16b, " vtmp!() ".16b, " vmask!() ".16b"),
+
+        Q!("    ldr             " qtmp!() ", [" tbl!() ", #16 * 15]"),
+        Q!("    bit             " ventry15!() ".16b, " vtmp!() ".16b, " vmask!() ".16b"),
+
+        Q!("    add             " tbl!() ", " tbl!() ", #32 * 8"),
+
+        Q!("    add             " cnt!() ", " cnt!() ", #1"),
+        Q!("    subs            " "xzr, " height!() ", " cnt!()),
+        Q!("    b.ne            " Label!("bignum_copy_row_from_table_32_neon_loop", 2, Before)),
+
+        Q!(Label!("bignum_copy_row_from_table_32_neon_end", 3) ":"),
+
+        Q!("    str             " qentry0!() ", [" z!() ", #16 * 0]"),
+        Q!("    str             " qentry1!() ", [" z!() ", #16 * 1]"),
+        Q!("    str             " qentry2!() ", [" z!() ", #16 * 2]"),
+        Q!("    str             " qentry3!() ", [" z!() ", #16 * 3]"),
+        Q!("    str             " qentry4!() ", [" z!() ", #16 * 4]"),
+        Q!("    str             " qentry5!() ", [" z!() ", #16 * 5]"),
+        Q!("    str             " qentry6!() ", [" z!() ", #16 * 6]"),
+        Q!("    str             " qentry7!() ", [" z!() ", #16 * 7]"),
+        Q!("    str             " qentry8!() ", [" z!() ", #16 * 8]"),
+        Q!("    str             " qentry9!() ", [" z!() ", #16 * 9]"),
+        Q!("    str             " qentry10!() ", [" z!() ", #16 * 10]"),
+        Q!("    str             " qentry11!() ", [" z!() ", #16 * 11]"),
+        Q!("    str             " qentry12!() ", [" z!() ", #16 * 12]"),
+        Q!("    str             " qentry13!() ", [" z!() ", #16 * 13]"),
+        Q!("    str             " qentry14!() ", [" z!() ", #16 * 14]"),
+        Q!("    str             " qentry15!() ", [" z!() ", #16 * 15]"),
+
+        inout("x0") z.as_mut_ptr() => _,
+        inout("x1") table.as_ptr() => _,
+        inout("x2") height => _,
+        inout("x3") index => _,
+        // clobbers
+        out("v0") _,
+        out("v1") _,
+        out("v16") _,
+        out("v17") _,
+        out("v2") _,
+        out("v20") _,
+        out("v21") _,
+        out("v22") _,
+        out("v23") _,
+        out("v24") _,
+        out("v25") _,
+        out("v26") _,
+        out("v27") _,
+        out("v28") _,
+        out("v29") _,
+        out("v3") _,
+        out("v30") _,
+        out("v31") _,
+        out("x5") _,
+        out("x6") _,
+            )
+    };
+}
diff --git a/graviola/src/low/aarch64/bignum_copy_row_from_table_8n_neon.rs b/graviola/src/low/aarch64/bignum_copy_row_from_table_8n_neon.rs
new file mode 100644
index 00000000..91b559c5
--- /dev/null
+++ b/graviola/src/low/aarch64/bignum_copy_row_from_table_8n_neon.rs
@@ -0,0 +1,158 @@
+#![allow(non_upper_case_globals, unused_macros, unused_imports)]
+use crate::low::macros::*;
+
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0
+
+// ----------------------------------------------------------------------------
+// Given table: uint64_t[height*width], copy table[idx*width...(idx+1)*width-1]
+// into z[0..width-1]. width must be a multiple of 8.
+// This function is constant-time with respect to the value of `idx`. This is
+// achieved by reading the whole table and using bit-masking to select the
+// `idx`-th row.
+//
+//    extern void bignum_copy_row_from_table_8n_neon
+//     (uint64_t *z, uint64_t *table, uint64_t height, uint64_t width, uint64_t idx);
+//
+// Standard ARM ABI: X0 = z, X1 = table, X2 = height, X3 = width, X4 = idx
+// ----------------------------------------------------------------------------
+
+macro_rules! z {
+    () => {
+        Q!("x0")
+    };
+}
+macro_rules! table {
+    () => {
+        Q!("x1")
+    };
+}
+macro_rules! height {
+    () => {
+        Q!("x2")
+    };
+}
+macro_rules! width {
+    () => {
+        Q!("x3")
+    };
+}
+macro_rules! idx {
+    () => {
+        Q!("x4")
+    };
+}
+
+macro_rules! i {
+    () => {
+        Q!("x5")
+    };
+}
+macro_rules! mask {
+    () => {
+        Q!("x6")
+    };
+}
+macro_rules! j {
+    () => {
+        Q!("x7")
+    };
+}
+
+macro_rules! vmask {
+    () => {
+        Q!("v16")
+    };
+}
+
+pub fn bignum_copy_row_from_table_8n_neon(
+    z: &mut [u64],
+    table: &[u64],
+    height: u64,
+    width: u64,
+    index: u64,
+) {
+    debug_assert!(z.len() as u64 == width);
+    debug_assert!(width % 8 == 0);
+    debug_assert!(index < height);
+    unsafe {
+        core::arch::asm!(
+
+
+        Q!("    cbz             " height!() ", " Label!("bignum_copy_row_from_table_8n_neon_end", 2, After)),
+        Q!("    cbz             " width!() ", " Label!("bignum_copy_row_from_table_8n_neon_end", 2, After)),
+        Q!("    mov             " i!() ", " width!()),
+        Q!("    mov             " "x6, " z!()),
+        Q!("    dup             " "v16.2d, xzr"),
+
+        Q!(Label!("bignum_copy_row_from_table_8n_neon_initzero", 3) ":"),
+        Q!("    str             " "q16, [x6]"),
+        Q!("    str             " "q16, [x6, #16]"),
+        Q!("    str             " "q16, [x6, #32]"),
+        Q!("    str             " "q16, [x6, #48]"),
+        Q!("    add             " "x6, x6, #64"),
+        Q!("    subs            " i!() ", " i!() ", #8"),
+        Q!("    bne             " Label!("bignum_copy_row_from_table_8n_neon_initzero", 3, Before)),
+
+        Q!("    mov             " i!() ", xzr"),
+        Q!("    mov             " "x8, " table!()),
+
+        Q!(Label!("bignum_copy_row_from_table_8n_neon_outerloop", 4) ":"),
+
+        Q!("    cmp             " i!() ", " idx!()),
+        Q!("    csetm           " mask!() ", eq"),
+        Q!("    dup             " vmask!() ".2d, " mask!()),
+
+        Q!("    mov             " j!() ", " width!()),
+        Q!("    mov             " "x9, " z!()),
+
+        Q!(Label!("bignum_copy_row_from_table_8n_neon_innerloop", 5) ":"),
+
+        Q!("    ldr             " "q17, [x8]"),
+        Q!("    ldr             " "q18, [x9]"),
+        Q!("    bit             " "v18.16b, v17.16b, " vmask!() ".16b"),
+        Q!("    str             " "q18, [x9]"),
+
+        Q!("    ldr             " "q17, [x8, #16]"),
+        Q!("    ldr             " "q18, [x9, #16]"),
+        Q!("    bit             " "v18.16b, v17.16b, " vmask!() ".16b"),
+        Q!("    str             " "q18, [x9, #16]"),
+
+        Q!("    ldr             " "q17, [x8, #32]"),
+        Q!("    ldr             " "q18, [x9, #32]"),
+        Q!("    bit             " "v18.16b, v17.16b, " vmask!() ".16b"),
+        Q!("    str             " "q18, [x9, #32]"),
+
+        Q!("    ldr             " "q17, [x8, #48]"),
+        Q!("    ldr             " "q18, [x9, #48]"),
+        Q!("    bit             " "v18.16b, v17.16b, " vmask!() ".16b"),
+        Q!("    str             " "q18, [x9, #48]"),
+
+        Q!("    add             " "x8, x8, #64"),
+        Q!("    add             " "x9, x9, #64"),
+        Q!("    subs            " j!() ", " j!() ", #8"),
+        Q!("    bne             " Label!("bignum_copy_row_from_table_8n_neon_innerloop", 5, Before)),
+
+        Q!(Label!("bignum_copy_row_from_table_8n_neon_innerloop_done", 6) ":"),
+        Q!("    add             " i!() ", " i!() ", #1"),
+        Q!("    cmp             " i!() ", " height!()),
+        Q!("    bne             " Label!("bignum_copy_row_from_table_8n_neon_outerloop", 4, Before)),
+
+        Q!(Label!("bignum_copy_row_from_table_8n_neon_end", 2) ":"),
+        inout("x0") z.as_mut_ptr() => _,
+        inout("x1") table.as_ptr() => _,
+        inout("x2") height => _,
+        inout("x3") width => _,
+        inout("x4") index => _,
+        // clobbers
+        out("v16") _,
+        out("v17") _,
+        out("v18") _,
+        out("x5") _,
+        out("x6") _,
+        out("x7") _,
+        out("x8") _,
+        out("x9") _,
+            )
+    };
+}
diff --git a/graviola/src/low/aarch64/bignum_copy_row_from_table_mux.rs b/graviola/src/low/aarch64/bignum_copy_row_from_table_mux.rs
new file mode 100644
index 00000000..f14f0471
--- /dev/null
+++ b/graviola/src/low/aarch64/bignum_copy_row_from_table_mux.rs
@@ -0,0 +1,41 @@
+// Written for Graviola by Joe Birr-Pixton, 2024.
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0
+
+/// Multiplex between NEON specialisations of `bignum_copy_row_from_table`.
+///
+/// Copies row `index` of `table` (`height` rows of `width` words each) into
+/// `z`.  A `width` of 32 or 16 selects the matching fixed-width NEON routine,
+/// any other multiple of 8 selects the `8n` NEON routine, and every remaining
+/// width is forwarded to `super::bignum_copy_row_from_table`.
+///
+/// The fixed-width routines are never told `width`, yet they load
+/// `height * width` words starting at `table.as_ptr()`.  The table length is
+/// therefore checked here; like the callee contract checks, this is a
+/// debug-build assertion only.
+#[inline]
+pub fn bignum_copy_row_from_table(
+    z: &mut [u64],
+    table: &[u64],
+    height: u64,
+    width: u64,
+    index: u64,
+) {
+    // Widen to u128 so that `height * width` cannot overflow.
+    debug_assert!(table.len() as u128 >= u128::from(height) * u128::from(width));
+    match width {
+        32 => super::bignum_copy_row_from_table_32_neon::bignum_copy_row_from_table_32_neon(
+            z, table, height, index,
+        ),
+        16 => super::bignum_copy_row_from_table_16_neon::bignum_copy_row_from_table_16_neon(
+            z, table, height, index,
+        ),
+        width if width % 8 == 0 => {
+            super::bignum_copy_row_from_table_8n_neon::bignum_copy_row_from_table_8n_neon(
+                z, table, height, width, index,
+            )
+        }
+        width => super::bignum_copy_row_from_table::bignum_copy_row_from_table(
+            z, table, height, width, index,
+        ),
+    }
+}
diff --git a/graviola/src/low/aarch64/mod.rs b/graviola/src/low/aarch64/mod.rs
index ace2d1f6..7873a622 100644
--- a/graviola/src/low/aarch64/mod.rs
+++ b/graviola/src/low/aarch64/mod.rs
@@ -8,6 +8,10 @@ pub(crate) mod bignum_add_p384;
 pub(crate) mod bignum_bitsize;
 pub(crate) mod bignum_cmp_lt;
 pub(crate) mod bignum_copy_row_from_table;
+pub(crate) mod bignum_copy_row_from_table_16_neon;
+pub(crate) mod bignum_copy_row_from_table_32_neon;
+pub(crate) mod bignum_copy_row_from_table_8n_neon;
+pub(crate) mod bignum_copy_row_from_table_mux;
 pub(crate) mod bignum_demont;
 pub(crate) mod bignum_demont_p256;
 pub(crate) mod bignum_demont_p384;
diff --git a/graviola/src/low/mod.rs b/graviola/src/low/mod.rs
index e0d47d46..7d5016fa 100644
--- a/graviola/src/low/mod.rs
+++ b/graviola/src/low/mod.rs
@@ -98,7 +98,7 @@ cfg_if::cfg_if! {
         pub(crate) use aarch64::bignum_add_p384::bignum_add_p384;
         pub(crate) use aarch64::bignum_bitsize::bignum_bitsize;
         pub(crate) use aarch64::bignum_cmp_lt::bignum_cmp_lt;
-        pub(crate) use aarch64::bignum_copy_row_from_table::bignum_copy_row_from_table;
+        pub(crate) use aarch64::bignum_copy_row_from_table_mux::bignum_copy_row_from_table;
         pub(crate) use aarch64::bignum_point_select_p256::{bignum_aff_point_select_p256, bignum_jac_point_select_p256};
         pub(crate) use aarch64::bignum_point_select_p384::bignum_jac_point_select_p384;
         pub(crate) use aarch64::bignum_demont::bignum_demont;