Skip to content

Commit

Permalink
Add avx2 width=16 bignum_copy_row_from_table specialisation
Browse files Browse the repository at this point in the history
  • Loading branch information
ctz committed Sep 28, 2024
1 parent 76a026d commit ff13cf8
Show file tree
Hide file tree
Showing 3 changed files with 61 additions and 0 deletions.
57 changes: 57 additions & 0 deletions graviola/src/low/x86_64/bignum_copy_row_from_table_16_avx2.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
// Written for Graviola by Joe Birr-Pixton, 2024.
// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0

use core::arch::x86_64::*;

pub fn bignum_copy_row_from_table_16_avx2(z: &mut [u64], table: &[u64], _height: u64, index: u64) {
debug_assert!(z.len() == 16);
debug_assert!(index < _height);
debug_assert!(table.len() == (_height as usize) * z.len());

unsafe { _bignum_copy_row_from_table_16_avx2(z, table, index) }
}

#[target_feature(enable = "avx,avx2")]
unsafe fn _bignum_copy_row_from_table_16_avx2(z: &mut [u64], table: &[u64], index: u64) {
_mm_prefetch(table.as_ptr().cast(), _MM_HINT_T0);
_mm_prefetch(table.as_ptr().add(16).cast(), _MM_HINT_T0);

let mut acc0 = _mm256_setzero_si256();
let mut acc1 = _mm256_setzero_si256();
let mut acc2 = _mm256_setzero_si256();
let mut acc3 = _mm256_setzero_si256();

let desired_index = _mm_set1_epi64x(index as i64);
let desired_index = _mm256_setr_m128i(desired_index, desired_index);

let index = _mm_set1_epi64x(0);
let mut index = _mm256_setr_m128i(index, index);

let ones = _mm_set1_epi64x(1);
let ones = _mm256_setr_m128i(ones, ones);

for row in table.chunks_exact(16) {
let mask = _mm256_cmpeq_epi64(index, desired_index);
index = _mm256_add_epi64(index, ones);

let row0 = _mm256_loadu_si256(row.as_ptr().add(0).cast());
let row1 = _mm256_loadu_si256(row.as_ptr().add(4).cast());
let row2 = _mm256_loadu_si256(row.as_ptr().add(8).cast());
let row3 = _mm256_loadu_si256(row.as_ptr().add(12).cast());

let row0 = _mm256_and_si256(row0, mask);
let row1 = _mm256_and_si256(row1, mask);
let row2 = _mm256_and_si256(row2, mask);
let row3 = _mm256_and_si256(row3, mask);

acc0 = _mm256_xor_si256(row0, acc0);
acc1 = _mm256_xor_si256(row1, acc1);
acc2 = _mm256_xor_si256(row2, acc2);
acc3 = _mm256_xor_si256(row3, acc3);
}

_mm256_storeu_si256(z.as_mut_ptr().add(0).cast(), acc0);
_mm256_storeu_si256(z.as_mut_ptr().add(4).cast(), acc1);
_mm256_storeu_si256(z.as_mut_ptr().add(8).cast(), acc2);
_mm256_storeu_si256(z.as_mut_ptr().add(12).cast(), acc3);
}
3 changes: 3 additions & 0 deletions graviola/src/low/x86_64/bignum_copy_row_from_table_mux.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,9 @@ pub fn bignum_copy_row_from_table(
index: u64,
) {
match width {
16 => super::bignum_copy_row_from_table_16_avx2::bignum_copy_row_from_table_16_avx2(
z, table, height, index,
),
width if width % 8 == 0 => {
super::bignum_copy_row_from_table_8n_avx2::bignum_copy_row_from_table_8n_avx2(
z, table, height, width, index,
Expand Down
1 change: 1 addition & 0 deletions graviola/src/low/x86_64/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ pub(crate) mod bignum_add_p384;
pub(crate) mod bignum_bitsize;
pub(crate) mod bignum_cmp_lt;
pub(crate) mod bignum_copy_row_from_table;
pub(crate) mod bignum_copy_row_from_table_16_avx2;
pub(crate) mod bignum_copy_row_from_table_8n_avx2;
pub(crate) mod bignum_copy_row_from_table_mux;
pub(crate) mod bignum_demont;
Expand Down

0 comments on commit ff13cf8

Please sign in to comment.