Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add AVX2 implementation of first_max_element #2806

Draft
wants to merge 1 commit into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions src/asm/x86/dist/hbd.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ macro_rules! satd_hbd_avx2 {
($(($W:expr, $H:expr)),*) => {
$(
paste::item! {
#[target_feature(enable = "avx2")]
#[target_feature(enable = "avx2,bmi1,bmi2")]
pub(crate) unsafe extern fn [<rav1e_satd_ $W x $H _hbd_avx2>](
src: *const u16, src_stride: isize, dst: *const u16, dst_stride: isize,
) -> u32 {
Expand Down Expand Up @@ -43,7 +43,7 @@ macro_rules! satd_kernel_hbd_avx2 {
($(($W:expr, $H:expr)),*) => {
$(
paste::item! {
#[target_feature(enable = "avx2")]
#[target_feature(enable = "avx2,bmi1,bmi2")]
unsafe extern fn [<satd_kernel_ $W x $H _hbd_avx2>](
src: *const u16, src_stride: isize, dst: *const u16, dst_stride: isize,
) -> u64 {
Expand Down
20 changes: 10 additions & 10 deletions src/asm/x86/lrf.rs
Original file line number Diff line number Diff line change
Expand Up @@ -158,7 +158,7 @@ static X_BY_XPLUS1: [u32; 256] = [
];

#[inline]
#[target_feature(enable = "avx2")]
#[target_feature(enable = "avx2,bmi1,bmi2")]
unsafe fn sgrproj_box_ab_8_avx2(
r: usize, af: &mut [u32], bf: &mut [u32], iimg: &[u32], iimg_sq: &[u32],
iimg_stride: usize, x: usize, y: usize, s: u32, bdm8: usize,
Expand All @@ -169,7 +169,7 @@ unsafe fn sgrproj_box_ab_8_avx2(

// Using an integral image, compute the sum of a square region
#[inline]
#[target_feature(enable = "avx2")]
#[target_feature(enable = "avx2,bmi1,bmi2")]
unsafe fn get_integral_square_avx2(
iimg: &[u32], stride: usize, x: usize, y: usize, size: usize,
) -> __m256i {
Expand Down Expand Up @@ -234,7 +234,7 @@ unsafe fn sgrproj_box_ab_8_avx2(
_mm256_storeu_si256(bf.as_mut_ptr().add(x) as *mut _, b);
}

#[target_feature(enable = "avx2")]
#[target_feature(enable = "avx2,bmi1,bmi2")]
pub(crate) unsafe fn sgrproj_box_ab_r1_avx2(
af: &mut [u32], bf: &mut [u32], iimg: &[u32], iimg_sq: &[u32],
iimg_stride: usize, y: usize, stripe_w: usize, s: u32, bdm8: usize,
Expand Down Expand Up @@ -293,7 +293,7 @@ pub(crate) unsafe fn sgrproj_box_ab_r1_avx2(
}
}

#[target_feature(enable = "avx2")]
#[target_feature(enable = "avx2,bmi1,bmi2")]
pub(crate) unsafe fn sgrproj_box_ab_r2_avx2(
af: &mut [u32], bf: &mut [u32], iimg: &[u32], iimg_sq: &[u32],
iimg_stride: usize, y: usize, stripe_w: usize, s: u32, bdm8: usize,
Expand Down Expand Up @@ -353,7 +353,7 @@ pub(crate) unsafe fn sgrproj_box_ab_r2_avx2(
}

#[inline]
#[target_feature(enable = "avx2")]
#[target_feature(enable = "avx2,bmi1,bmi2")]
unsafe fn sgrproj_box_f_r0_8_avx2<T: Pixel>(
f: &mut [u32], x: usize, y: usize, cdeffed: &PlaneSlice<T>,
) {
Expand All @@ -374,7 +374,7 @@ unsafe fn sgrproj_box_f_r0_8_avx2<T: Pixel>(
);
}

#[target_feature(enable = "avx2")]
#[target_feature(enable = "avx2,bmi1,bmi2")]
pub(crate) unsafe fn sgrproj_box_f_r0_avx2<T: Pixel>(
f: &mut [u32], y: usize, w: usize, cdeffed: &PlaneSlice<T>,
) {
Expand All @@ -396,7 +396,7 @@ pub(crate) unsafe fn sgrproj_box_f_r0_avx2<T: Pixel>(
}

#[inline]
#[target_feature(enable = "avx2")]
#[target_feature(enable = "avx2,bmi1,bmi2")]
unsafe fn sgrproj_box_f_r1_8_avx2<T: Pixel>(
af: &[&[u32]; 3], bf: &[&[u32]; 3], f: &mut [u32], x: usize, y: usize,
cdeffed: &PlaneSlice<T>,
Expand Down Expand Up @@ -496,7 +496,7 @@ unsafe fn sgrproj_box_f_r1_8_avx2<T: Pixel>(
);
}

#[target_feature(enable = "avx2")]
#[target_feature(enable = "avx2,bmi1,bmi2")]
pub(crate) unsafe fn sgrproj_box_f_r1_avx2<T: Pixel>(
af: &[&[u32]; 3], bf: &[&[u32]; 3], f: &mut [u32], y: usize, w: usize,
cdeffed: &PlaneSlice<T>,
Expand All @@ -519,7 +519,7 @@ pub(crate) unsafe fn sgrproj_box_f_r1_avx2<T: Pixel>(
}

#[inline]
#[target_feature(enable = "avx2")]
#[target_feature(enable = "avx2,bmi1,bmi2")]
unsafe fn sgrproj_box_f_r2_8_avx2<T: Pixel>(
af: &[&[u32]; 2], bf: &[&[u32]; 2], f0: &mut [u32], f1: &mut [u32],
x: usize, y: usize, cdeffed: &PlaneSlice<T>,
Expand Down Expand Up @@ -618,7 +618,7 @@ unsafe fn sgrproj_box_f_r2_8_avx2<T: Pixel>(
);
}

#[target_feature(enable = "avx2")]
#[target_feature(enable = "avx2,bmi1,bmi2")]
pub(crate) unsafe fn sgrproj_box_f_r2_avx2<T: Pixel>(
af: &[&[u32]; 2], bf: &[&[u32]; 2], f0: &mut [u32], f1: &mut [u32],
y: usize, w: usize, cdeffed: &PlaneSlice<T>,
Expand Down
2 changes: 1 addition & 1 deletion src/asm/x86/quantize.rs
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ pub fn dequantize<T: Coefficient>(
}
}

#[target_feature(enable = "avx2")]
#[target_feature(enable = "avx2,bmi1,bmi2")]
unsafe fn dequantize_avx2(
qindex: u8, coeffs_ptr: *const i16, _eob: usize, rcoeffs_ptr: *mut i16,
tx_size: TxSize, bit_depth: usize, dc_delta_q: i8, ac_delta_q: i8,
Expand Down
38 changes: 19 additions & 19 deletions src/asm/x86/transform/forward.rs
Original file line number Diff line number Diff line change
Expand Up @@ -63,27 +63,27 @@ struct I32X8 {
}

impl I32X8 {
#[target_feature(enable = "avx2")]
#[target_feature(enable = "avx2,bmi1,bmi2")]
#[inline]
const unsafe fn vec(self) -> __m256i {
self.data
}

#[target_feature(enable = "avx2")]
#[target_feature(enable = "avx2,bmi1,bmi2")]
#[inline]
const unsafe fn new(a: __m256i) -> I32X8 {
I32X8 { data: a }
}
}

impl TxOperations for I32X8 {
#[target_feature(enable = "avx2")]
#[target_feature(enable = "avx2,bmi1,bmi2")]
#[inline]
unsafe fn zero() -> Self {
I32X8::new(_mm256_setzero_si256())
}

#[target_feature(enable = "avx2")]
#[target_feature(enable = "avx2,bmi1,bmi2")]
#[inline]
unsafe fn tx_mul(self, mul: (i32, i32)) -> Self {
I32X8::new(_mm256_srav_epi32(
Expand All @@ -95,7 +95,7 @@ impl TxOperations for I32X8 {
))
}

#[target_feature(enable = "avx2")]
#[target_feature(enable = "avx2,bmi1,bmi2")]
#[inline]
unsafe fn rshift1(self) -> Self {
I32X8::new(_mm256_srai_epi32(
Expand All @@ -107,34 +107,34 @@ impl TxOperations for I32X8 {
))
}

#[target_feature(enable = "avx2")]
#[target_feature(enable = "avx2,bmi1,bmi2")]
#[inline]
unsafe fn add(self, b: Self) -> Self {
I32X8::new(_mm256_add_epi32(self.vec(), b.vec()))
}

#[target_feature(enable = "avx2")]
#[target_feature(enable = "avx2,bmi1,bmi2")]
#[inline]
unsafe fn sub(self, b: Self) -> Self {
I32X8::new(_mm256_sub_epi32(self.vec(), b.vec()))
}

#[target_feature(enable = "avx2")]
#[target_feature(enable = "avx2,bmi1,bmi2")]
#[inline]
unsafe fn add_avg(self, b: Self) -> Self {
I32X8::new(_mm256_srai_epi32(_mm256_add_epi32(self.vec(), b.vec()), 1))
}

#[target_feature(enable = "avx2")]
#[target_feature(enable = "avx2,bmi1,bmi2")]
#[inline]
unsafe fn sub_avg(self, b: Self) -> Self {
I32X8::new(_mm256_srai_epi32(_mm256_sub_epi32(self.vec(), b.vec()), 1))
}
}

impl_1d_tx!(target_feature(enable = "avx2"), unsafe);
impl_1d_tx!(target_feature(enable = "avx2,bmi1,bmi2"), unsafe);

#[target_feature(enable = "avx2")]
#[target_feature(enable = "avx2,bmi1,bmi2")]
unsafe fn transpose_8x8_avx2(
input: (I32X8, I32X8, I32X8, I32X8, I32X8, I32X8, I32X8, I32X8),
) -> (I32X8, I32X8, I32X8, I32X8, I32X8, I32X8, I32X8, I32X8) {
Expand Down Expand Up @@ -175,7 +175,7 @@ unsafe fn transpose_8x8_avx2(
)
}

#[target_feature(enable = "avx2")]
#[target_feature(enable = "avx2,bmi1,bmi2")]
unsafe fn transpose_8x4_avx2(
input: (I32X8, I32X8, I32X8, I32X8, I32X8, I32X8, I32X8, I32X8),
) -> (I32X8, I32X8, I32X8, I32X8) {
Expand Down Expand Up @@ -213,7 +213,7 @@ unsafe fn transpose_8x4_avx2(
)
}

#[target_feature(enable = "avx2")]
#[target_feature(enable = "avx2,bmi1,bmi2")]
unsafe fn transpose_4x8_avx2(
input: (I32X8, I32X8, I32X8, I32X8),
) -> (I32X8, I32X8, I32X8, I32X8, I32X8, I32X8, I32X8, I32X8) {
Expand Down Expand Up @@ -246,7 +246,7 @@ unsafe fn transpose_4x8_avx2(
)
}

#[target_feature(enable = "avx2")]
#[target_feature(enable = "avx2,bmi1,bmi2")]
unsafe fn transpose_4x4_avx2(
input: (I32X8, I32X8, I32X8, I32X8),
) -> (I32X8, I32X8, I32X8, I32X8) {
Expand All @@ -265,13 +265,13 @@ unsafe fn transpose_4x4_avx2(
)
}

#[target_feature(enable = "avx2")]
#[target_feature(enable = "avx2,bmi1,bmi2")]
#[inline]
unsafe fn shift_left(a: I32X8, shift: u8) -> I32X8 {
I32X8::new(_mm256_sllv_epi32(a.vec(), _mm256_set1_epi32(shift as i32)))
}

#[target_feature(enable = "avx2")]
#[target_feature(enable = "avx2,bmi1,bmi2")]
#[inline]
unsafe fn shift_right(a: I32X8, shift: u8) -> I32X8 {
I32X8::new(_mm256_srav_epi32(
Expand All @@ -280,7 +280,7 @@ unsafe fn shift_right(a: I32X8, shift: u8) -> I32X8 {
))
}

#[target_feature(enable = "avx2")]
#[target_feature(enable = "avx2,bmi1,bmi2")]
#[inline]
unsafe fn round_shift_array_avx2(arr: &mut [I32X8], size: usize, bit: i8) {
if bit == 0 {
Expand Down Expand Up @@ -328,7 +328,7 @@ impl SizeClass1D {
}

#[allow(clippy::identity_op, clippy::erasing_op)]
#[target_feature(enable = "avx2")]
#[target_feature(enable = "avx2,bmi1,bmi2")]
unsafe fn forward_transform_avx2<T: Coefficient>(
input: &[i16], output: &mut [T], stride: usize, tx_size: TxSize,
tx_type: TxType, bd: usize,
Expand All @@ -355,7 +355,7 @@ unsafe fn forward_transform_avx2<T: Coefficient>(
// Columns
for cg in (0..txfm_size_col).step_by(8) {
let shift = cfg.shift[0] as u8;
#[target_feature(enable = "avx2")]
#[target_feature(enable = "avx2,bmi1,bmi2")]
#[inline]
unsafe fn load_columns(input_ptr: *const i16, shift: u8) -> I32X8 {
// TODO: load 64-bits for x4 wide columns
Expand Down
82 changes: 64 additions & 18 deletions src/cdef.rs
Original file line number Diff line number Diff line change
Expand Up @@ -59,20 +59,57 @@ pub(crate) mod rust {
///
/// # Arguments
///
/// * `elems` - A non-empty slice of integers
///
/// # Panics
///
/// Panics if `elems` is empty
/// * `elems` - A slice of 8 `i32`s
#[inline]
fn first_max_element(elems: &[i32]) -> (usize, i32) {
// In case of a tie, the first element must be selected.
let (max_idx, max_value) = elems
.iter()
.enumerate()
.max_by_key(|&(i, v)| (v, -(i as isize)))
.unwrap();
(max_idx, *max_value)
fn first_max_element(
elems: &[i32; 8], cpu: CpuFeatureLevel,
) -> (usize, i32) {
// Same as `first_max_element`, but implemented with AVX2 intrinsics
#[inline]
#[cfg(nasm_x86_64)]
#[target_feature(enable = "avx2,bmi1,bmi2")]
unsafe fn first_max_element_avx2(elems: &[i32; 8]) -> (usize, i32) {
#[cfg(target_arch = "x86")]
use std::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;

// the compiler autovectorizes this
let max_val = *elems.iter().max().unwrap();

let cmp = _mm256_cmpeq_epi32(
_mm256_loadu_si256(elems as *const i32 as *const _),
_mm256_set1_epi32(max_val),
);
// this intrinsic is supposed to be for floating point, but it works
// fine on integer data as well
let mask = _mm256_movemask_ps(std::mem::transmute(cmp));

(mask.trailing_zeros() as usize, max_val)
}

#[inline]
fn first_max_element(elems: &[i32; 8]) -> (usize, i32) {
// In case of a tie, the first element must be selected.
let (max_idx, max_value) = elems
.iter()
.enumerate()
.max_by_key(|&(i, v)| (v, -(i as isize)))
.unwrap();
(max_idx, *max_value)
}

#[cfg(nasm_x86_64)]
if cpu >= CpuFeatureLevel::AVX2 {
let result = unsafe { first_max_element_avx2(elems) };

#[cfg(feature = "check_asm")]
assert_eq!(result, first_max_element(elems));

return result;
}

first_max_element(elems)
}

// Detect direction. 0 means 45-degree up-right, 2 is horizontal, and so on.
Expand All @@ -84,7 +121,7 @@ pub(crate) mod rust {
// http://jmvalin.ca/notes/intra_paint.pdf
pub fn cdef_find_dir<T: Pixel>(
img: &PlaneSlice<'_, T>, var: &mut u32, coeff_shift: usize,
_cpu: CpuFeatureLevel,
cpu: CpuFeatureLevel,
) -> i32 {
let mut cost: [i32; 8] = [0; 8];
let mut partial: [[i32; 15]; 8] = [[0; 15]; 8];
Expand Down Expand Up @@ -133,7 +170,7 @@ pub(crate) mod rust {
}
}

let (best_dir, best_cost) = first_max_element(&cost);
let (best_dir, best_cost) = first_max_element(&cost, cpu);
// Difference between the optimal variance and the variance along the
// orthogonal direction. Again, the sum(x^2) terms cancel out.
// We'd normally divide by 840, but dividing by 1024 is close enough
Expand Down Expand Up @@ -305,9 +342,18 @@ pub(crate) mod rust {

#[test]
fn check_max_element() {
assert_eq!(first_max_element(&[-1, -1, 1, 2, 3, 4, 6, 6]), (6, 6));
assert_eq!(first_max_element(&[-1, -1, 1, 2, 3, 4, 7, 6]), (6, 7));
assert_eq!(first_max_element(&[0, 0]), (0, 0));
assert_eq!(
first_max_element(&[-1, -1, 1, 2, 3, 4, 6, 6], CpuFeatureLevel::RUST),
(6, 6)
);
assert_eq!(
first_max_element(&[-1, -1, 1, 2, 3, 4, 7, 6], CpuFeatureLevel::RUST),
(6, 7)
);
assert_eq!(
first_max_element(&[0, 0, 0, 0, 0, 0, 0, 0], CpuFeatureLevel::RUST),
(0, 0)
);
}
}
}
Expand Down
5 changes: 4 additions & 1 deletion src/cpu_features/x86.rs
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,10 @@ impl Default for CpuFeatureLevel {
CpuFeatureLevel::AVX512ICL
} else if avx512_detected() {
CpuFeatureLevel::AVX512
} else if is_x86_feature_detected!("avx2") {
} else if is_x86_feature_detected!("avx2")
&& is_x86_feature_detected!("bmi1")
&& is_x86_feature_detected!("bmi2")
{
CpuFeatureLevel::AVX2
} else if is_x86_feature_detected!("sse4.1") {
CpuFeatureLevel::SSE4_1
Expand Down