Skip to content

Commit

Permalink
Add AVX2 implementation of first_max_element
Browse files Browse the repository at this point in the history
This also now requires BMI1 and BMI2 for AVX2 in `CpuFeatureLevel`.
  • Loading branch information
redzic committed Sep 15, 2021
1 parent 2ec4e67 commit 248542a
Show file tree
Hide file tree
Showing 6 changed files with 101 additions and 51 deletions.
4 changes: 2 additions & 2 deletions src/asm/x86/dist/hbd.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ macro_rules! satd_hbd_avx2 {
($(($W:expr, $H:expr)),*) => {
$(
paste::item! {
#[target_feature(enable = "avx2")]
#[target_feature(enable = "avx2,bmi1,bmi2")]
pub(crate) unsafe extern fn [<rav1e_satd_ $W x $H _hbd_avx2>](
src: *const u16, src_stride: isize, dst: *const u16, dst_stride: isize,
) -> u32 {
Expand Down Expand Up @@ -43,7 +43,7 @@ macro_rules! satd_kernel_hbd_avx2 {
($(($W:expr, $H:expr)),*) => {
$(
paste::item! {
#[target_feature(enable = "avx2")]
#[target_feature(enable = "avx2,bmi1,bmi2")]
unsafe extern fn [<satd_kernel_ $W x $H _hbd_avx2>](
src: *const u16, src_stride: isize, dst: *const u16, dst_stride: isize,
) -> u64 {
Expand Down
20 changes: 10 additions & 10 deletions src/asm/x86/lrf.rs
Original file line number Diff line number Diff line change
Expand Up @@ -158,7 +158,7 @@ static X_BY_XPLUS1: [u32; 256] = [
];

#[inline]
#[target_feature(enable = "avx2")]
#[target_feature(enable = "avx2,bmi1,bmi2")]
unsafe fn sgrproj_box_ab_8_avx2(
r: usize, af: &mut [u32], bf: &mut [u32], iimg: &[u32], iimg_sq: &[u32],
iimg_stride: usize, x: usize, y: usize, s: u32, bdm8: usize,
Expand All @@ -169,7 +169,7 @@ unsafe fn sgrproj_box_ab_8_avx2(

// Using an integral image, compute the sum of a square region
#[inline]
#[target_feature(enable = "avx2")]
#[target_feature(enable = "avx2,bmi1,bmi2")]
unsafe fn get_integral_square_avx2(
iimg: &[u32], stride: usize, x: usize, y: usize, size: usize,
) -> __m256i {
Expand Down Expand Up @@ -234,7 +234,7 @@ unsafe fn sgrproj_box_ab_8_avx2(
_mm256_storeu_si256(bf.as_mut_ptr().add(x) as *mut _, b);
}

#[target_feature(enable = "avx2")]
#[target_feature(enable = "avx2,bmi1,bmi2")]
pub(crate) unsafe fn sgrproj_box_ab_r1_avx2(
af: &mut [u32], bf: &mut [u32], iimg: &[u32], iimg_sq: &[u32],
iimg_stride: usize, y: usize, stripe_w: usize, s: u32, bdm8: usize,
Expand Down Expand Up @@ -293,7 +293,7 @@ pub(crate) unsafe fn sgrproj_box_ab_r1_avx2(
}
}

#[target_feature(enable = "avx2")]
#[target_feature(enable = "avx2,bmi1,bmi2")]
pub(crate) unsafe fn sgrproj_box_ab_r2_avx2(
af: &mut [u32], bf: &mut [u32], iimg: &[u32], iimg_sq: &[u32],
iimg_stride: usize, y: usize, stripe_w: usize, s: u32, bdm8: usize,
Expand Down Expand Up @@ -353,7 +353,7 @@ pub(crate) unsafe fn sgrproj_box_ab_r2_avx2(
}

#[inline]
#[target_feature(enable = "avx2")]
#[target_feature(enable = "avx2,bmi1,bmi2")]
unsafe fn sgrproj_box_f_r0_8_avx2<T: Pixel>(
f: &mut [u32], x: usize, y: usize, cdeffed: &PlaneSlice<T>,
) {
Expand All @@ -374,7 +374,7 @@ unsafe fn sgrproj_box_f_r0_8_avx2<T: Pixel>(
);
}

#[target_feature(enable = "avx2")]
#[target_feature(enable = "avx2,bmi1,bmi2")]
pub(crate) unsafe fn sgrproj_box_f_r0_avx2<T: Pixel>(
f: &mut [u32], y: usize, w: usize, cdeffed: &PlaneSlice<T>,
) {
Expand All @@ -396,7 +396,7 @@ pub(crate) unsafe fn sgrproj_box_f_r0_avx2<T: Pixel>(
}

#[inline]
#[target_feature(enable = "avx2")]
#[target_feature(enable = "avx2,bmi1,bmi2")]
unsafe fn sgrproj_box_f_r1_8_avx2<T: Pixel>(
af: &[&[u32]; 3], bf: &[&[u32]; 3], f: &mut [u32], x: usize, y: usize,
cdeffed: &PlaneSlice<T>,
Expand Down Expand Up @@ -496,7 +496,7 @@ unsafe fn sgrproj_box_f_r1_8_avx2<T: Pixel>(
);
}

#[target_feature(enable = "avx2")]
#[target_feature(enable = "avx2,bmi1,bmi2")]
pub(crate) unsafe fn sgrproj_box_f_r1_avx2<T: Pixel>(
af: &[&[u32]; 3], bf: &[&[u32]; 3], f: &mut [u32], y: usize, w: usize,
cdeffed: &PlaneSlice<T>,
Expand All @@ -519,7 +519,7 @@ pub(crate) unsafe fn sgrproj_box_f_r1_avx2<T: Pixel>(
}

#[inline]
#[target_feature(enable = "avx2")]
#[target_feature(enable = "avx2,bmi1,bmi2")]
unsafe fn sgrproj_box_f_r2_8_avx2<T: Pixel>(
af: &[&[u32]; 2], bf: &[&[u32]; 2], f0: &mut [u32], f1: &mut [u32],
x: usize, y: usize, cdeffed: &PlaneSlice<T>,
Expand Down Expand Up @@ -618,7 +618,7 @@ unsafe fn sgrproj_box_f_r2_8_avx2<T: Pixel>(
);
}

#[target_feature(enable = "avx2")]
#[target_feature(enable = "avx2,bmi1,bmi2")]
pub(crate) unsafe fn sgrproj_box_f_r2_avx2<T: Pixel>(
af: &[&[u32]; 2], bf: &[&[u32]; 2], f0: &mut [u32], f1: &mut [u32],
y: usize, w: usize, cdeffed: &PlaneSlice<T>,
Expand Down
2 changes: 1 addition & 1 deletion src/asm/x86/quantize.rs
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ pub fn dequantize<T: Coefficient>(
}
}

#[target_feature(enable = "avx2")]
#[target_feature(enable = "avx2,bmi1,bmi2")]
unsafe fn dequantize_avx2(
qindex: u8, coeffs_ptr: *const i16, _eob: usize, rcoeffs_ptr: *mut i16,
tx_size: TxSize, bit_depth: usize, dc_delta_q: i8, ac_delta_q: i8,
Expand Down
38 changes: 19 additions & 19 deletions src/asm/x86/transform/forward.rs
Original file line number Diff line number Diff line change
Expand Up @@ -63,27 +63,27 @@ struct I32X8 {
}

impl I32X8 {
#[target_feature(enable = "avx2")]
#[target_feature(enable = "avx2,bmi1,bmi2")]
#[inline]
const unsafe fn vec(self) -> __m256i {
self.data
}

#[target_feature(enable = "avx2")]
#[target_feature(enable = "avx2,bmi1,bmi2")]
#[inline]
const unsafe fn new(a: __m256i) -> I32X8 {
I32X8 { data: a }
}
}

impl TxOperations for I32X8 {
#[target_feature(enable = "avx2")]
#[target_feature(enable = "avx2,bmi1,bmi2")]
#[inline]
unsafe fn zero() -> Self {
I32X8::new(_mm256_setzero_si256())
}

#[target_feature(enable = "avx2")]
#[target_feature(enable = "avx2,bmi1,bmi2")]
#[inline]
unsafe fn tx_mul(self, mul: (i32, i32)) -> Self {
I32X8::new(_mm256_srav_epi32(
Expand All @@ -95,7 +95,7 @@ impl TxOperations for I32X8 {
))
}

#[target_feature(enable = "avx2")]
#[target_feature(enable = "avx2,bmi1,bmi2")]
#[inline]
unsafe fn rshift1(self) -> Self {
I32X8::new(_mm256_srai_epi32(
Expand All @@ -107,34 +107,34 @@ impl TxOperations for I32X8 {
))
}

#[target_feature(enable = "avx2")]
#[target_feature(enable = "avx2,bmi1,bmi2")]
#[inline]
unsafe fn add(self, b: Self) -> Self {
I32X8::new(_mm256_add_epi32(self.vec(), b.vec()))
}

#[target_feature(enable = "avx2")]
#[target_feature(enable = "avx2,bmi1,bmi2")]
#[inline]
unsafe fn sub(self, b: Self) -> Self {
I32X8::new(_mm256_sub_epi32(self.vec(), b.vec()))
}

#[target_feature(enable = "avx2")]
#[target_feature(enable = "avx2,bmi1,bmi2")]
#[inline]
unsafe fn add_avg(self, b: Self) -> Self {
I32X8::new(_mm256_srai_epi32(_mm256_add_epi32(self.vec(), b.vec()), 1))
}

#[target_feature(enable = "avx2")]
#[target_feature(enable = "avx2,bmi1,bmi2")]
#[inline]
unsafe fn sub_avg(self, b: Self) -> Self {
I32X8::new(_mm256_srai_epi32(_mm256_sub_epi32(self.vec(), b.vec()), 1))
}
}

impl_1d_tx!(target_feature(enable = "avx2"), unsafe);
impl_1d_tx!(target_feature(enable = "avx2,bmi1,bmi2"), unsafe);

#[target_feature(enable = "avx2")]
#[target_feature(enable = "avx2,bmi1,bmi2")]
unsafe fn transpose_8x8_avx2(
input: (I32X8, I32X8, I32X8, I32X8, I32X8, I32X8, I32X8, I32X8),
) -> (I32X8, I32X8, I32X8, I32X8, I32X8, I32X8, I32X8, I32X8) {
Expand Down Expand Up @@ -175,7 +175,7 @@ unsafe fn transpose_8x8_avx2(
)
}

#[target_feature(enable = "avx2")]
#[target_feature(enable = "avx2,bmi1,bmi2")]
unsafe fn transpose_8x4_avx2(
input: (I32X8, I32X8, I32X8, I32X8, I32X8, I32X8, I32X8, I32X8),
) -> (I32X8, I32X8, I32X8, I32X8) {
Expand Down Expand Up @@ -213,7 +213,7 @@ unsafe fn transpose_8x4_avx2(
)
}

#[target_feature(enable = "avx2")]
#[target_feature(enable = "avx2,bmi1,bmi2")]
unsafe fn transpose_4x8_avx2(
input: (I32X8, I32X8, I32X8, I32X8),
) -> (I32X8, I32X8, I32X8, I32X8, I32X8, I32X8, I32X8, I32X8) {
Expand Down Expand Up @@ -246,7 +246,7 @@ unsafe fn transpose_4x8_avx2(
)
}

#[target_feature(enable = "avx2")]
#[target_feature(enable = "avx2,bmi1,bmi2")]
unsafe fn transpose_4x4_avx2(
input: (I32X8, I32X8, I32X8, I32X8),
) -> (I32X8, I32X8, I32X8, I32X8) {
Expand All @@ -265,13 +265,13 @@ unsafe fn transpose_4x4_avx2(
)
}

#[target_feature(enable = "avx2")]
#[target_feature(enable = "avx2,bmi1,bmi2")]
#[inline]
unsafe fn shift_left(a: I32X8, shift: u8) -> I32X8 {
I32X8::new(_mm256_sllv_epi32(a.vec(), _mm256_set1_epi32(shift as i32)))
}

#[target_feature(enable = "avx2")]
#[target_feature(enable = "avx2,bmi1,bmi2")]
#[inline]
unsafe fn shift_right(a: I32X8, shift: u8) -> I32X8 {
I32X8::new(_mm256_srav_epi32(
Expand All @@ -280,7 +280,7 @@ unsafe fn shift_right(a: I32X8, shift: u8) -> I32X8 {
))
}

#[target_feature(enable = "avx2")]
#[target_feature(enable = "avx2,bmi1,bmi2")]
#[inline]
unsafe fn round_shift_array_avx2(arr: &mut [I32X8], size: usize, bit: i8) {
if bit == 0 {
Expand Down Expand Up @@ -328,7 +328,7 @@ impl SizeClass1D {
}

#[allow(clippy::identity_op, clippy::erasing_op)]
#[target_feature(enable = "avx2")]
#[target_feature(enable = "avx2,bmi1,bmi2")]
unsafe fn forward_transform_avx2<T: Coefficient>(
input: &[i16], output: &mut [T], stride: usize, tx_size: TxSize,
tx_type: TxType, bd: usize,
Expand All @@ -355,7 +355,7 @@ unsafe fn forward_transform_avx2<T: Coefficient>(
// Columns
for cg in (0..txfm_size_col).step_by(8) {
let shift = cfg.shift[0] as u8;
#[target_feature(enable = "avx2")]
#[target_feature(enable = "avx2,bmi1,bmi2")]
#[inline]
unsafe fn load_columns(input_ptr: *const i16, shift: u8) -> I32X8 {
// TODO: load 64-bits for x4 wide columns
Expand Down
83 changes: 65 additions & 18 deletions src/cdef.rs
Original file line number Diff line number Diff line change
Expand Up @@ -59,20 +59,58 @@ pub(crate) mod rust {
///
/// # Arguments
///
/// * `elems` - A non-empty slice of integers
///
/// # Panics
///
/// Panics if `elems` is empty
/// * `elems` - A slice of 8 `i32`s
#[inline]
fn first_max_element(elems: &[i32]) -> (usize, i32) {
// In case of a tie, the first element must be selected.
let (max_idx, max_value) = elems
.iter()
.enumerate()
.max_by_key(|&(i, v)| (v, -(i as isize)))
.unwrap();
(max_idx, *max_value)
#[allow(clippy::collapsible_if)]
fn first_max_element(
elems: &[i32; 8], cpu: CpuFeatureLevel,
) -> (usize, i32) {
// Same as `first_max_element`, but implemented with AVX2 intrinsics
#[inline]
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx2,bmi1,bmi2")]
unsafe fn first_max_element_avx2(elems: &[i32; 8]) -> (usize, i32) {
#[cfg(target_arch = "x86")]
use std::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;

// the compiler autovectorizes this
let max_val = *elems.iter().max().unwrap();

let cmp = _mm256_cmpeq_epi32(
_mm256_loadu_si256(elems as *const i32 as *const _),
_mm256_set1_epi32(max_val),
);
// this intrinsic is supposed to be for floating point, but it works
// fine on integer data as well
let mask = _mm256_movemask_ps(std::mem::transmute(cmp));

(mask.trailing_zeros() as usize, max_val)
}

#[inline]
fn _first_max_element(elems: &[i32; 8]) -> (usize, i32) {
// In case of a tie, the first element must be selected.
let (max_idx, max_value) = elems
.iter()
.enumerate()
.max_by_key(|&(i, v)| (v, -(i as isize)))
.unwrap();
(max_idx, *max_value)
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
if cpu >= CpuFeatureLevel::AVX2 {
let result = unsafe { first_max_element_avx2(elems) };

#[cfg(feature = "check_asm")]
assert_eq!(result, _first_max_element(elems));

return result;
}

_first_max_element(elems)
}

// Detect direction. 0 means 45-degree up-right, 2 is horizontal, and so on.
Expand All @@ -84,7 +122,7 @@ pub(crate) mod rust {
// http://jmvalin.ca/notes/intra_paint.pdf
pub fn cdef_find_dir<T: Pixel>(
img: &PlaneSlice<'_, T>, var: &mut u32, coeff_shift: usize,
_cpu: CpuFeatureLevel,
cpu: CpuFeatureLevel,
) -> i32 {
let mut cost: [i32; 8] = [0; 8];
let mut partial: [[i32; 15]; 8] = [[0; 15]; 8];
Expand Down Expand Up @@ -133,7 +171,7 @@ pub(crate) mod rust {
}
}

let (best_dir, best_cost) = first_max_element(&cost);
let (best_dir, best_cost) = first_max_element(&cost, cpu);
// Difference between the optimal variance and the variance along the
// orthogonal direction. Again, the sum(x^2) terms cancel out.
// We'd normally divide by 840, but dividing by 1024 is close enough
Expand Down Expand Up @@ -305,9 +343,18 @@ pub(crate) mod rust {

#[test]
fn check_max_element() {
assert_eq!(first_max_element(&[-1, -1, 1, 2, 3, 4, 6, 6]), (6, 6));
assert_eq!(first_max_element(&[-1, -1, 1, 2, 3, 4, 7, 6]), (6, 7));
assert_eq!(first_max_element(&[0, 0]), (0, 0));
assert_eq!(
first_max_element(&[-1, -1, 1, 2, 3, 4, 6, 6], CpuFeatureLevel::RUST),
(6, 6)
);
assert_eq!(
first_max_element(&[-1, -1, 1, 2, 3, 4, 7, 6], CpuFeatureLevel::RUST),
(6, 7)
);
assert_eq!(
first_max_element(&[0, 0, 0, 0, 0, 0, 0, 0], CpuFeatureLevel::RUST),
(0, 0)
);
}
}
}
Expand Down
5 changes: 4 additions & 1 deletion src/cpu_features/x86.rs
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,10 @@ impl Default for CpuFeatureLevel {
CpuFeatureLevel::AVX512ICL
} else if avx512_detected() {
CpuFeatureLevel::AVX512
} else if is_x86_feature_detected!("avx2") {
} else if is_x86_feature_detected!("avx2")
&& is_x86_feature_detected!("bmi1")
&& is_x86_feature_detected!("bmi2")
{
CpuFeatureLevel::AVX2
} else if is_x86_feature_detected!("sse4.1") {
CpuFeatureLevel::SSE4_1
Expand Down

0 comments on commit 248542a

Please sign in to comment.