xiph · redzic · Sep 15, 2021
diff --git a/src/asm/x86/dist/hbd.rs b/src/asm/x86/dist/hbd.rs
@@ -5,7 +5,7 @@ macro_rules! satd_hbd_avx2 {
   ($(($W:expr, $H:expr)),*) => {
     $(
       paste::item! {
-        #[target_feature(enable = "avx2")]
+        #[target_feature(enable = "avx2,bmi1,bmi2")]
         pub(crate) unsafe extern fn [<rav1e_satd_ $W x $H _hbd_avx2>](
           src: *const u16, src_stride: isize, dst: *const u16, dst_stride: isize,
         ) -> u32 {
@@ -43,7 +43,7 @@ macro_rules! satd_kernel_hbd_avx2 {
   ($(($W:expr, $H:expr)),*) => {
     $(
       paste::item! {
-        #[target_feature(enable = "avx2")]
+        #[target_feature(enable = "avx2,bmi1,bmi2")]
         unsafe extern fn [<satd_kernel_ $W x $H _hbd_avx2>](
           src: *const u16, src_stride: isize, dst: *const u16, dst_stride: isize,
         ) -> u64 {

diff --git a/src/asm/x86/lrf.rs b/src/asm/x86/lrf.rs
@@ -158,7 +158,7 @@ static X_BY_XPLUS1: [u32; 256] = [
 ];
 
 #[inline]
-#[target_feature(enable = "avx2")]
+#[target_feature(enable = "avx2,bmi1,bmi2")]
 unsafe fn sgrproj_box_ab_8_avx2(
   r: usize, af: &mut [u32], bf: &mut [u32], iimg: &[u32], iimg_sq: &[u32],
   iimg_stride: usize, x: usize, y: usize, s: u32, bdm8: usize,
@@ -169,7 +169,7 @@ unsafe fn sgrproj_box_ab_8_avx2(
 
   // Using an integral image, compute the sum of a square region
   #[inline]
-  #[target_feature(enable = "avx2")]
+  #[target_feature(enable = "avx2,bmi1,bmi2")]
   unsafe fn get_integral_square_avx2(
     iimg: &[u32], stride: usize, x: usize, y: usize, size: usize,
   ) -> __m256i {
@@ -234,7 +234,7 @@ unsafe fn sgrproj_box_ab_8_avx2(
   _mm256_storeu_si256(bf.as_mut_ptr().add(x) as *mut _, b);
 }
 
-#[target_feature(enable = "avx2")]
+#[target_feature(enable = "avx2,bmi1,bmi2")]
 pub(crate) unsafe fn sgrproj_box_ab_r1_avx2(
   af: &mut [u32], bf: &mut [u32], iimg: &[u32], iimg_sq: &[u32],
   iimg_stride: usize, y: usize, stripe_w: usize, s: u32, bdm8: usize,
@@ -293,7 +293,7 @@ pub(crate) unsafe fn sgrproj_box_ab_r1_avx2(
   }
 }
 
-#[target_feature(enable = "avx2")]
+#[target_feature(enable = "avx2,bmi1,bmi2")]
 pub(crate) unsafe fn sgrproj_box_ab_r2_avx2(
   af: &mut [u32], bf: &mut [u32], iimg: &[u32], iimg_sq: &[u32],
   iimg_stride: usize, y: usize, stripe_w: usize, s: u32, bdm8: usize,
@@ -353,7 +353,7 @@ pub(crate) unsafe fn sgrproj_box_ab_r2_avx2(
 }
 
 #[inline]
-#[target_feature(enable = "avx2")]
+#[target_feature(enable = "avx2,bmi1,bmi2")]
 unsafe fn sgrproj_box_f_r0_8_avx2<T: Pixel>(
   f: &mut [u32], x: usize, y: usize, cdeffed: &PlaneSlice<T>,
 ) {
@@ -374,7 +374,7 @@ unsafe fn sgrproj_box_f_r0_8_avx2<T: Pixel>(
   );
 }
 
-#[target_feature(enable = "avx2")]
+#[target_feature(enable = "avx2,bmi1,bmi2")]
 pub(crate) unsafe fn sgrproj_box_f_r0_avx2<T: Pixel>(
   f: &mut [u32], y: usize, w: usize, cdeffed: &PlaneSlice<T>,
 ) {
@@ -396,7 +396,7 @@ pub(crate) unsafe fn sgrproj_box_f_r0_avx2<T: Pixel>(
 }
 
 #[inline]
-#[target_feature(enable = "avx2")]
+#[target_feature(enable = "avx2,bmi1,bmi2")]
 unsafe fn sgrproj_box_f_r1_8_avx2<T: Pixel>(
   af: &[&[u32]; 3], bf: &[&[u32]; 3], f: &mut [u32], x: usize, y: usize,
   cdeffed: &PlaneSlice<T>,
@@ -496,7 +496,7 @@ unsafe fn sgrproj_box_f_r1_8_avx2<T: Pixel>(
   );
 }
 
-#[target_feature(enable = "avx2")]
+#[target_feature(enable = "avx2,bmi1,bmi2")]
 pub(crate) unsafe fn sgrproj_box_f_r1_avx2<T: Pixel>(
   af: &[&[u32]; 3], bf: &[&[u32]; 3], f: &mut [u32], y: usize, w: usize,
   cdeffed: &PlaneSlice<T>,
@@ -519,7 +519,7 @@ pub(crate) unsafe fn sgrproj_box_f_r1_avx2<T: Pixel>(
 }
 
 #[inline]
-#[target_feature(enable = "avx2")]
+#[target_feature(enable = "avx2,bmi1,bmi2")]
 unsafe fn sgrproj_box_f_r2_8_avx2<T: Pixel>(
   af: &[&[u32]; 2], bf: &[&[u32]; 2], f0: &mut [u32], f1: &mut [u32],
   x: usize, y: usize, cdeffed: &PlaneSlice<T>,
@@ -618,7 +618,7 @@ unsafe fn sgrproj_box_f_r2_8_avx2<T: Pixel>(
   );
 }
 
-#[target_feature(enable = "avx2")]
+#[target_feature(enable = "avx2,bmi1,bmi2")]
 pub(crate) unsafe fn sgrproj_box_f_r2_avx2<T: Pixel>(
   af: &[&[u32]; 2], bf: &[&[u32]; 2], f0: &mut [u32], f1: &mut [u32],
   y: usize, w: usize, cdeffed: &PlaneSlice<T>,

diff --git a/src/asm/x86/quantize.rs b/src/asm/x86/quantize.rs
@@ -85,7 +85,7 @@ pub fn dequantize<T: Coefficient>(
   }
 }
 
-#[target_feature(enable = "avx2")]
+#[target_feature(enable = "avx2,bmi1,bmi2")]
 unsafe fn dequantize_avx2(
   qindex: u8, coeffs_ptr: *const i16, _eob: usize, rcoeffs_ptr: *mut i16,
   tx_size: TxSize, bit_depth: usize, dc_delta_q: i8, ac_delta_q: i8,

diff --git a/src/asm/x86/transform/forward.rs b/src/asm/x86/transform/forward.rs
@@ -63,27 +63,27 @@ struct I32X8 {
 }
 
 impl I32X8 {
-  #[target_feature(enable = "avx2")]
+  #[target_feature(enable = "avx2,bmi1,bmi2")]
   #[inline]
   const unsafe fn vec(self) -> __m256i {
     self.data
   }
 
-  #[target_feature(enable = "avx2")]
+  #[target_feature(enable = "avx2,bmi1,bmi2")]
   #[inline]
   const unsafe fn new(a: __m256i) -> I32X8 {
     I32X8 { data: a }
   }
 }
 
 impl TxOperations for I32X8 {
-  #[target_feature(enable = "avx2")]
+  #[target_feature(enable = "avx2,bmi1,bmi2")]
   #[inline]
   unsafe fn zero() -> Self {
     I32X8::new(_mm256_setzero_si256())
   }
 
-  #[target_feature(enable = "avx2")]
+  #[target_feature(enable = "avx2,bmi1,bmi2")]
   #[inline]
   unsafe fn tx_mul(self, mul: (i32, i32)) -> Self {
     I32X8::new(_mm256_srav_epi32(
@@ -95,7 +95,7 @@ impl TxOperations for I32X8 {
     ))
   }
 
-  #[target_feature(enable = "avx2")]
+  #[target_feature(enable = "avx2,bmi1,bmi2")]
   #[inline]
   unsafe fn rshift1(self) -> Self {
     I32X8::new(_mm256_srai_epi32(
@@ -107,34 +107,34 @@ impl TxOperations for I32X8 {
     ))
   }
 
-  #[target_feature(enable = "avx2")]
+  #[target_feature(enable = "avx2,bmi1,bmi2")]
   #[inline]
   unsafe fn add(self, b: Self) -> Self {
     I32X8::new(_mm256_add_epi32(self.vec(), b.vec()))
   }
 
-  #[target_feature(enable = "avx2")]
+  #[target_feature(enable = "avx2,bmi1,bmi2")]
   #[inline]
   unsafe fn sub(self, b: Self) -> Self {
     I32X8::new(_mm256_sub_epi32(self.vec(), b.vec()))
   }
 
-  #[target_feature(enable = "avx2")]
+  #[target_feature(enable = "avx2,bmi1,bmi2")]
   #[inline]
   unsafe fn add_avg(self, b: Self) -> Self {
     I32X8::new(_mm256_srai_epi32(_mm256_add_epi32(self.vec(), b.vec()), 1))
   }
 
-  #[target_feature(enable = "avx2")]
+  #[target_feature(enable = "avx2,bmi1,bmi2")]
   #[inline]
   unsafe fn sub_avg(self, b: Self) -> Self {
     I32X8::new(_mm256_srai_epi32(_mm256_sub_epi32(self.vec(), b.vec()), 1))
   }
 }
 
-impl_1d_tx!(target_feature(enable = "avx2"), unsafe);
+impl_1d_tx!(target_feature(enable = "avx2,bmi1,bmi2"), unsafe);
 
-#[target_feature(enable = "avx2")]
+#[target_feature(enable = "avx2,bmi1,bmi2")]
 unsafe fn transpose_8x8_avx2(
   input: (I32X8, I32X8, I32X8, I32X8, I32X8, I32X8, I32X8, I32X8),
 ) -> (I32X8, I32X8, I32X8, I32X8, I32X8, I32X8, I32X8, I32X8) {
@@ -175,7 +175,7 @@ unsafe fn transpose_8x8_avx2(
   )
 }
 
-#[target_feature(enable = "avx2")]
+#[target_feature(enable = "avx2,bmi1,bmi2")]
 unsafe fn transpose_8x4_avx2(
   input: (I32X8, I32X8, I32X8, I32X8, I32X8, I32X8, I32X8, I32X8),
 ) -> (I32X8, I32X8, I32X8, I32X8) {
@@ -213,7 +213,7 @@ unsafe fn transpose_8x4_avx2(
   )
 }
 
-#[target_feature(enable = "avx2")]
+#[target_feature(enable = "avx2,bmi1,bmi2")]
 unsafe fn transpose_4x8_avx2(
   input: (I32X8, I32X8, I32X8, I32X8),
 ) -> (I32X8, I32X8, I32X8, I32X8, I32X8, I32X8, I32X8, I32X8) {
@@ -246,7 +246,7 @@ unsafe fn transpose_4x8_avx2(
   )
 }
 
-#[target_feature(enable = "avx2")]
+#[target_feature(enable = "avx2,bmi1,bmi2")]
 unsafe fn transpose_4x4_avx2(
   input: (I32X8, I32X8, I32X8, I32X8),
 ) -> (I32X8, I32X8, I32X8, I32X8) {
@@ -265,13 +265,13 @@ unsafe fn transpose_4x4_avx2(
   )
 }
 
-#[target_feature(enable = "avx2")]
+#[target_feature(enable = "avx2,bmi1,bmi2")]
 #[inline]
 unsafe fn shift_left(a: I32X8, shift: u8) -> I32X8 {
   I32X8::new(_mm256_sllv_epi32(a.vec(), _mm256_set1_epi32(shift as i32)))
 }
 
-#[target_feature(enable = "avx2")]
+#[target_feature(enable = "avx2,bmi1,bmi2")]
 #[inline]
 unsafe fn shift_right(a: I32X8, shift: u8) -> I32X8 {
   I32X8::new(_mm256_srav_epi32(
@@ -280,7 +280,7 @@ unsafe fn shift_right(a: I32X8, shift: u8) -> I32X8 {
   ))
 }
 
-#[target_feature(enable = "avx2")]
+#[target_feature(enable = "avx2,bmi1,bmi2")]
 #[inline]
 unsafe fn round_shift_array_avx2(arr: &mut [I32X8], size: usize, bit: i8) {
   if bit == 0 {
@@ -328,7 +328,7 @@ impl SizeClass1D {
 }
 
 #[allow(clippy::identity_op, clippy::erasing_op)]
-#[target_feature(enable = "avx2")]
+#[target_feature(enable = "avx2,bmi1,bmi2")]
 unsafe fn forward_transform_avx2<T: Coefficient>(
   input: &[i16], output: &mut [T], stride: usize, tx_size: TxSize,
   tx_type: TxType, bd: usize,
@@ -355,7 +355,7 @@ unsafe fn forward_transform_avx2<T: Coefficient>(
   // Columns
   for cg in (0..txfm_size_col).step_by(8) {
     let shift = cfg.shift[0] as u8;
-    #[target_feature(enable = "avx2")]
+    #[target_feature(enable = "avx2,bmi1,bmi2")]
     #[inline]
     unsafe fn load_columns(input_ptr: *const i16, shift: u8) -> I32X8 {
       // TODO: load 64-bits for x4 wide columns

diff --git a/src/cdef.rs b/src/cdef.rs
@@ -59,20 +59,57 @@ pub(crate) mod rust {
   ///
   /// # Arguments
   ///
-  /// * `elems` - A non-empty slice of integers
-  ///
-  /// # Panics
-  ///
-  /// Panics if `elems` is empty
+  /// * `elems` - A slice of 8 `i32`s
   #[inline]
-  fn first_max_element(elems: &[i32]) -> (usize, i32) {
-    // In case of a tie, the first element must be selected.
-    let (max_idx, max_value) = elems
-      .iter()
-      .enumerate()
-      .max_by_key(|&(i, v)| (v, -(i as isize)))
-      .unwrap();
-    (max_idx, *max_value)
+  fn first_max_element(
+    elems: &[i32; 8], cpu: CpuFeatureLevel,
+  ) -> (usize, i32) {
+    // Same as `first_max_element`, but implemented with AVX2 intrinsics
+    #[inline]
+    #[cfg(nasm_x86_64)]
+    #[target_feature(enable = "avx2,bmi1,bmi2")]
+    unsafe fn first_max_element_avx2(elems: &[i32; 8]) -> (usize, i32) {
+      #[cfg(target_arch = "x86")]
+      use std::arch::x86::*;
+      #[cfg(target_arch = "x86_64")]
+      use std::arch::x86_64::*;
+
+      // the compiler autovectorizes this
+      let max_val = *elems.iter().max().unwrap();
+
+      let cmp = _mm256_cmpeq_epi32(
+        _mm256_loadu_si256(elems as *const i32 as *const _),
+        _mm256_set1_epi32(max_val),
+      );
+      // this intrinsic is supposed to be for floating point, but it works
+      // fine on integer data as well
+      let mask = _mm256_movemask_ps(std::mem::transmute(cmp));
+
+      (mask.trailing_zeros() as usize, max_val)
+    }
+
+    #[inline]
+    fn first_max_element(elems: &[i32; 8]) -> (usize, i32) {
+      // In case of a tie, the first element must be selected.
+      let (max_idx, max_value) = elems
+        .iter()
+        .enumerate()
+        .max_by_key(|&(i, v)| (v, -(i as isize)))
+        .unwrap();
+      (max_idx, *max_value)
+    }
+
+    #[cfg(nasm_x86_64)]
+    if cpu >= CpuFeatureLevel::AVX2 {
+      let result = unsafe { first_max_element_avx2(elems) };
+
+      #[cfg(feature = "check_asm")]
+      assert_eq!(result, first_max_element(elems));
+
+      return result;
+    }
+
+    first_max_element(elems)
   }
 
   // Detect direction. 0 means 45-degree up-right, 2 is horizontal, and so on.
@@ -84,7 +121,7 @@ pub(crate) mod rust {
   // http://jmvalin.ca/notes/intra_paint.pdf
   pub fn cdef_find_dir<T: Pixel>(
     img: &PlaneSlice<'_, T>, var: &mut u32, coeff_shift: usize,
-    _cpu: CpuFeatureLevel,
+    cpu: CpuFeatureLevel,
   ) -> i32 {
     let mut cost: [i32; 8] = [0; 8];
     let mut partial: [[i32; 15]; 8] = [[0; 15]; 8];
@@ -133,7 +170,7 @@ pub(crate) mod rust {
       }
     }
 
-    let (best_dir, best_cost) = first_max_element(&cost);
+    let (best_dir, best_cost) = first_max_element(&cost, cpu);
     // Difference between the optimal variance and the variance along the
     // orthogonal direction. Again, the sum(x^2) terms cancel out.
     // We'd normally divide by 840, but dividing by 1024 is close enough
@@ -305,9 +342,18 @@ pub(crate) mod rust {
 
     #[test]
     fn check_max_element() {
-      assert_eq!(first_max_element(&[-1, -1, 1, 2, 3, 4, 6, 6]), (6, 6));
-      assert_eq!(first_max_element(&[-1, -1, 1, 2, 3, 4, 7, 6]), (6, 7));
-      assert_eq!(first_max_element(&[0, 0]), (0, 0));
+      assert_eq!(
+        first_max_element(&[-1, -1, 1, 2, 3, 4, 6, 6], CpuFeatureLevel::RUST),
+        (6, 6)
+      );
+      assert_eq!(
+        first_max_element(&[-1, -1, 1, 2, 3, 4, 7, 6], CpuFeatureLevel::RUST),
+        (6, 7)
+      );
+      assert_eq!(
+        first_max_element(&[0, 0, 0, 0, 0, 0, 0, 0], CpuFeatureLevel::RUST),
+        (0, 0)
+      );
     }
   }
 }

diff --git a/src/cpu_features/x86.rs b/src/cpu_features/x86.rs
@@ -62,7 +62,10 @@ impl Default for CpuFeatureLevel {
       CpuFeatureLevel::AVX512ICL
     } else if avx512_detected() {
       CpuFeatureLevel::AVX512
-    } else if is_x86_feature_detected!("avx2") {
+    } else if is_x86_feature_detected!("avx2")
+      && is_x86_feature_detected!("bmi1")
+      && is_x86_feature_detected!("bmi2")
+    {
       CpuFeatureLevel::AVX2
     } else if is_x86_feature_detected!("sse4.1") {
       CpuFeatureLevel::SSE4_1