SSE2 support

This deinterleaves 8-bit pairs via the PACKUSWB instruction: either shifting right to get the high 8 bits of every 16, or ANDing to get the low 8 bits of every 16. I took that idea from compiling the following code: https://godbolt.org/z/enaMY7v4o ```rust use std::arch::x86_64; use std::simd::i8x16; pub unsafe fn process( top_uyvy_addr: *const u8, bot_uyvy_addr: *const u8, top_y_addr: *mut u8, bot_y_addr: *mut u8, u_addr: *mut u8, v_addr: *mut u8, ) { let [top_uv, bot_uv] = [ (top_uyvy_addr, top_y_addr), (bot_uyvy_addr, bot_y_addr), ].map(|(uyvy_addr, y_addr)| { let uyvy = std::ptr::read_unaligned(uyvy_addr as *const [i8x16; 4]); let (uv_hi, y_hi) = uyvy[0].deinterleave(uyvy[1]); let (uv_lo, y_lo) = uyvy[2].deinterleave(uyvy[3]); std::ptr::write_unaligned(y_addr as *mut i8x16, y_hi); std::ptr::write_unaligned(y_addr.add(16) as *mut i8x16, y_lo); [uv_hi, uv_lo] }); let uv = [ i8x16::from(x86_64::_mm_avg_epu8(top_uv[0].into(), bot_uv[0].into())), i8x16::from(x86_64::_mm_avg_epu8(top_uv[1].into(), bot_uv[1].into())), ]; let (u, v) = uv[0].deinterleave(uv[1]); std::ptr::write_unaligned(u_addr as *mut i8x16, u); std::ptr::write_unaligned(v_addr as *mut i8x16, v); } ``` Its performance is surprisingly good: 24 GB/s cold, 73 GB/s hot. Some noise in all these measurements. ``` cold/memcpy_baseline time: [6.1385 ms 6.1503 ms 6.1639 ms] thrpt: [35.090 GiB/s 35.168 GiB/s 35.236 GiB/s] change: time: [+0.8184% +1.1318% +1.4649%] (p = 0.00 < 0.05) thrpt: [-1.4438% -1.1191% -0.8117%] Change within noise threshold. cold/libyuv time: [8.9479 ms 8.9586 ms 8.9708 ms] thrpt: [24.111 GiB/s 24.144 GiB/s 24.173 GiB/s] change: time: [+1.8972% +2.0382% +2.2017%] (p = 0.00 < 0.05) thrpt: [-2.1543% -1.9975% -1.8619%] Performance has regressed. cold/explicit_avx2_double time: [8.5369 ms 8.5513 ms 8.5665 ms] thrpt: [25.249 GiB/s 25.294 GiB/s 25.336 GiB/s] change: time: [+12.556% +12.813% +13.076%] (p = 0.00 < 0.05) thrpt: [-11.564% -11.358% -11.155%] Performance has regressed. 
cold/explicit_avx2_single time: [8.0669 ms 8.0752 ms 8.0852 ms] thrpt: [26.752 GiB/s 26.785 GiB/s 26.812 GiB/s] change: time: [+1.6387% +1.7825% +1.9258%] (p = 0.00 < 0.05) thrpt: [-1.8894% -1.7513% -1.6123%] Performance has regressed. cold/explicit_sse2 time: [8.9443 ms 8.9541 ms 8.9652 ms] thrpt: [24.126 GiB/s 24.156 GiB/s 24.182 GiB/s] cold/auto_avx2_64 time: [32.122 ms 32.139 ms 32.158 ms] thrpt: [6.7260 GiB/s 6.7300 GiB/s 6.7335 GiB/s] change: time: [+0.4201% +0.4922% +0.5622%] (p = 0.00 < 0.05) thrpt: [-0.5591% -0.4898% -0.4184%] Change within noise threshold. Benchmarking cold/auto_vanilla_64: Warming up for 3.0000 s Warning: Unable to complete 100 samples in 5.0s. You may wish to increase target time to 5.3s, or reduce sample count to 90. cold/auto_vanilla_64 time: [53.057 ms 53.092 ms 53.132 ms] thrpt: [4.0708 GiB/s 4.0739 GiB/s 4.0766 GiB/s] change: time: [-0.5970% -0.5149% -0.4217%] (p = 0.00 < 0.05) thrpt: [+0.4235% +0.5176% +0.6006%] Change within noise threshold. hot/memcpy_baseline time: [74.306 µs 74.385 µs 74.477 µs] thrpt: [90.755 GiB/s 90.867 GiB/s 90.964 GiB/s] change: time: [-0.1252% +0.1649% +0.5301%] (p = 0.39 > 0.05) thrpt: [-0.5273% -0.1646% +0.1254%] No change in performance detected. hot/libyuv time: [107.00 µs 107.04 µs 107.09 µs] thrpt: [63.116 GiB/s 63.145 GiB/s 63.170 GiB/s] change: time: [+4.9819% +5.1083% +5.2261%] (p = 0.00 < 0.05) thrpt: [-4.9665% -4.8600% -4.7455%] Performance has regressed. hot/explicit_avx2_double time: [90.068 µs 90.113 µs 90.155 µs] thrpt: [74.973 GiB/s 75.008 GiB/s 75.045 GiB/s] change: time: [+19.614% +20.304% +21.006%] (p = 0.00 < 0.05) thrpt: [-17.360% -16.877% -16.398%] Performance has regressed. hot/explicit_avx2_single time: [79.458 µs 79.556 µs 79.655 µs] thrpt: [84.856 GiB/s 84.961 GiB/s 85.066 GiB/s] change: time: [+6.9429% +7.3397% +7.6897%] (p = 0.00 < 0.05) thrpt: [-7.1406% -6.8378% -6.4921%] Performance has regressed. 
hot/explicit_sse2 time: [92.316 µs 92.406 µs 92.511 µs] thrpt: [73.063 GiB/s 73.146 GiB/s 73.218 GiB/s] hot/auto_avx2_64 time: [920.02 µs 920.20 µs 920.42 µs] thrpt: [7.3435 GiB/s 7.3453 GiB/s 7.3467 GiB/s] change: time: [+0.7899% +0.8556% +0.9185%] (p = 0.00 < 0.05) thrpt: [-0.9102% -0.8483% -0.7837%] Change within noise threshold. Benchmarking hot/auto_vanilla_64: Warming up for 3.0000 s Warning: Unable to complete 100 samples in 5.0s. You may wish to increase target time to 8.1s, enable flat sampling, or reduce sample count to 50. hot/auto_vanilla_64 time: [1.6063 ms 1.6069 ms 1.6075 ms] thrpt: [4.2048 GiB/s 4.2064 GiB/s 4.2078 GiB/s] change: time: [-0.8669% -0.8069% -0.7479%] (p = 0.00 < 0.05) thrpt: [+0.7536% +0.8134% +0.8745%] Change within noise threshold. ```
infiniteathlete · Sep 19, 2024 · 4beb1f8 · 4beb1f8
1 parent 3c5c422
commit 4beb1f8
Show file tree

Hide file tree

Showing 3 changed files with 168 additions and 41 deletions.
diff --git a/README.md b/README.md
@@ -11,9 +11,6 @@ Limitations and future work:
     [UYVY](https://fourcc.org/pixel-format/yuv-uyvy/) to
     [I420](https://fourcc.org/pixel-format/yuv-i420/).
     More will be added as needed.
-*   Returns `Err` on x86\_64 CPUs that don't support
-    AVX2. We'll likely add an SSE2 fallback later. As SSE2 is in the core
-    x86\_64 instruction set, this would mean all x86\_64 CPUs would be supported.
 *   Expects to process full horizontal lines. This is likely to
     change to allow working on cropped regions.
 *   Does not support output to a frame with padding, as required by some

diff --git a/benches/bench.rs b/benches/bench.rs
@@ -105,18 +105,14 @@ fn bench_common<const FRAMES_PER_ITER: usize>(
         (inputs.len() * (WIDTH * HEIGHT * 7) / 2) as u64,
     ));
     macro_rules! bench_block {
-        ($name:literal, $impl:ty) => {
+        ($name:literal, $p:expr) => {
+            let p = $p;
             g.bench_function($name, |b| {
                 b.iter(|| {
                     for i in &inputs {
-                        black_box(
-                            convert_with::<$impl, _, _>(
-                                i,
-                                &mut ConsecutiveFrame::new(PixelFormat::I420, WIDTH, HEIGHT)
-                                    .new_vec(),
-                            )
-                            .unwrap(),
-                        );
+                        let mut f =
+                            ConsecutiveFrame::new(PixelFormat::I420, WIDTH, HEIGHT).new_vec();
+                        black_box(convert_with(p, i, &mut f).unwrap());
                     }
                 })
             });
@@ -144,16 +140,24 @@ fn bench_common<const FRAMES_PER_ITER: usize>(
         })
     });
     #[cfg(target_arch = "x86_64")]
-    bench_block!("explicit_avx2_double", ExplicitAvx2DoubleBlock);
+    bench_block!(
+        "explicit_avx2_double",
+        ExplicitAvx2DoubleBlock::try_new().unwrap()
+    );
     #[cfg(target_arch = "x86_64")]
-    bench_block!("explicit_avx2_single", ExplicitAvx2SingleBlock);
+    bench_block!(
+        "explicit_avx2_single",
+        ExplicitAvx2SingleBlock::try_new().unwrap()
+    );
     #[cfg(target_arch = "x86_64")]
-    bench_block!("auto_avx2_64", AutoAvx2Block<64>);
+    bench_block!("explicit_sse2", ExplicitSse2::new());
+    #[cfg(target_arch = "x86_64")]
+    bench_block!("auto_avx2_64", AutoAvx2Block::<64>::try_new().unwrap());
     #[cfg(target_arch = "aarch64")]
-    bench_block!("explicit_neon", ExplicitNeon);
+    bench_block!("explicit_neon", ExplicitNeon::try_new().unwrap());
     #[cfg(target_arch = "aarch64")]
-    bench_block!("auto_neon_64", AutoNeonBlock<64>);
-    bench_block!("auto_vanilla_64", AutoVanillaBlock<64>);
+    bench_block!("auto_neon_64", AutoNeonBlock::<64>::try_new().unwrap());
+    bench_block!("auto_vanilla_64", AutoVanillaBlock::<64>::try_new().unwrap());
     g.finish();
 }
 

diff --git a/src/uyvy_to_i420.rs b/src/uyvy_to_i420.rs
@@ -21,9 +21,6 @@ use crate::{
 /// Processes a block of 2 rows.
 #[doc(hidden)]
 pub trait RowProcessor: Copy + Clone + Sized + Send + Sync {
-    /// Returns true if this block type is supported on this machine.
-    fn new() -> Result<Self, ConversionError>;
-
     /// Processes a block `width` pixels wide, two rows high.
     ///
     /// # Safety
@@ -54,17 +51,24 @@ pub fn convert<FI: Frame, FO: FrameMut>(
     yuv_out: &mut FO,
 ) -> Result<(), ConversionError> {
     #[cfg(target_arch = "x86_64")]
-    return convert_with::<ExplicitAvx2DoubleBlock, _, _>(uyvy_in, yuv_out);
+    {
+        if let Ok(avx2) = ExplicitAvx2DoubleBlock::try_new() {
+            return convert_with(avx2, uyvy_in, yuv_out);
+        }
+        return convert_with(ExplicitSse2::new(), uyvy_in, yuv_out);
+    }
 
+    // NEON is always supported on `aarch64`.
     #[cfg(target_arch = "aarch64")]
-    return convert_with::<ExplicitNeon, _, _>(uyvy_in, yuv_out);
+    return convert_with(ExplicitNeon::new(), uyvy_in, yuv_out);
 
-    #[allow(unused)]
+    #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
     Err(ConversionError("no block processor available"))
 }
 
 #[doc(hidden)]
 pub fn convert_with<P: RowProcessor, FI: Frame, FO: FrameMut>(
+    p: P,
     uyvy_in: &FI,
     yuv_out: &mut FO,
 ) -> Result<(), ConversionError> {
@@ -76,7 +80,6 @@ pub fn convert_with<P: RowProcessor, FI: Frame, FO: FrameMut>(
     {
         return Err(ConversionError("invalid arguments"));
     }
-    let p = P::new()?;
     let pixels = width * height;
     let uyvy_planes = uyvy_in.planes();
     let [uyvy_in] = &uyvy_planes[..] else {
@@ -191,16 +194,19 @@ unsafe fn fallback(
 pub struct ExplicitAvx2DoubleBlock(());
 
 #[cfg(target_arch = "x86_64")]
-impl RowProcessor for ExplicitAvx2DoubleBlock {
+impl ExplicitAvx2DoubleBlock {
     #[inline]
-    fn new() -> Result<Self, ConversionError> {
+    pub fn try_new() -> Result<Self, ConversionError> {
         if is_x86_feature_detected!("avx2") {
             Ok(Self(()))
         } else {
             Err(ConversionError("avx2 is not supported on this machine"))
         }
     }
+}
 
+#[cfg(target_arch = "x86_64")]
+impl RowProcessor for ExplicitAvx2DoubleBlock {
     #[target_feature(enable = "avx2")]
     #[inline(never)]
     unsafe fn process(
@@ -289,16 +295,19 @@ impl RowProcessor for ExplicitAvx2DoubleBlock {
 pub struct ExplicitAvx2SingleBlock(());
 
 #[cfg(target_arch = "x86_64")]
-impl RowProcessor for ExplicitAvx2SingleBlock {
+impl ExplicitAvx2SingleBlock {
     #[inline]
-    fn new() -> Result<Self, ConversionError> {
+    pub fn try_new() -> Result<Self, ConversionError> {
         if is_x86_feature_detected!("avx2") {
             Ok(Self(()))
         } else {
             Err(ConversionError("avx2 is not supported on this machine"))
         }
     }
+}
 
+#[cfg(target_arch = "x86_64")]
+impl RowProcessor for ExplicitAvx2SingleBlock {
     #[inline(never)]
     #[target_feature(enable = "avx2")]
     unsafe fn process(
@@ -372,21 +381,132 @@ impl RowProcessor for ExplicitAvx2SingleBlock {
     }
 }
 
+#[cfg(target_arch = "x86_64")]
+#[doc(hidden)]
+#[derive(Copy, Clone)]
+pub struct ExplicitSse2(());
+
+#[cfg(target_arch = "x86_64")]
+impl ExplicitSse2 {
+    #[inline]
+    pub fn new() -> Self {
+        // On x86_64 (unlike 32-bit x86), sse2 is mandatory.
+        Self(())
+    }
+
+    #[inline]
+    pub fn try_new() -> Result<Self, ConversionError> {
+        Ok(Self::new())
+    }
+}
+
+#[cfg(target_arch = "x86_64")]
+impl RowProcessor for ExplicitSse2 {
+    unsafe fn process(
+        self,
+        width: usize,
+        mut top_uyvy_addr: *const u8,
+        mut bot_uyvy_addr: *const u8,
+        mut top_y_addr: *mut u8,
+        mut bot_y_addr: *mut u8,
+        mut u_addr: *mut u8,
+        mut v_addr: *mut u8,
+    ) {
+        let mut i = 0;
+        const BLOCK_SIZE: usize = 32;
+        let low_bits = x86_64::_mm_set1_epi16(0xFF);
+        while i + BLOCK_SIZE <= width {
+            let load = |uyvy_addr: *const u8| -> [_; 4] {
+                std::array::from_fn(|i| x86_64::_mm_loadu_si128(uyvy_addr.add(16 * i) as _))
+            };
+            let top_uyvy = load(top_uyvy_addr);
+            let bot_uyvy = load(bot_uyvy_addr);
+            for (uyvy, y_addr) in [(top_uyvy, top_y_addr), (bot_uyvy, bot_y_addr)] {
+                x86_64::_mm_storeu_si128(
+                    y_addr as _,
+                    x86_64::_mm_packus_epi16(
+                        x86_64::_mm_srli_epi16(uyvy[0], 8),
+                        x86_64::_mm_srli_epi16(uyvy[1], 8),
+                    ),
+                );
+                x86_64::_mm_storeu_si128(
+                    y_addr.add(16) as _,
+                    x86_64::_mm_packus_epi16(
+                        x86_64::_mm_srli_epi16(uyvy[2], 8),
+                        x86_64::_mm_srli_epi16(uyvy[3], 8),
+                    ),
+                );
+            }
+            let uv = |uyvy: [x86_64::__m128i; 4]| {
+                [
+                    x86_64::_mm_packus_epi16(
+                        x86_64::_mm_and_si128(uyvy[0], low_bits),
+                        x86_64::_mm_and_si128(uyvy[1], low_bits),
+                    ),
+                    x86_64::_mm_packus_epi16(
+                        x86_64::_mm_and_si128(uyvy[2], low_bits),
+                        x86_64::_mm_and_si128(uyvy[3], low_bits),
+                    ),
+                ]
+            };
+            let top_uv = uv(top_uyvy);
+            let bot_uv = uv(bot_uyvy);
+            let uv = [
+                x86_64::_mm_avg_epu8(top_uv[0], bot_uv[0]),
+                x86_64::_mm_avg_epu8(top_uv[1], bot_uv[1]),
+            ];
+            let u = x86_64::_mm_packus_epi16(
+                x86_64::_mm_and_si128(uv[0], low_bits),
+                x86_64::_mm_and_si128(uv[1], low_bits),
+            );
+            x86_64::_mm_storeu_si128(u_addr as _, u);
+            let v = x86_64::_mm_packus_epi16(
+                x86_64::_mm_srli_epi16(uv[0], 8),
+                x86_64::_mm_srli_epi16(uv[1], 8),
+            );
+            x86_64::_mm_storeu_si128(v_addr as _, v);
+            i += BLOCK_SIZE;
+            top_uyvy_addr = top_uyvy_addr.add(2 * BLOCK_SIZE);
+            bot_uyvy_addr = bot_uyvy_addr.add(2 * BLOCK_SIZE);
+            top_y_addr = top_y_addr.add(BLOCK_SIZE);
+            bot_y_addr = bot_y_addr.add(BLOCK_SIZE);
+            u_addr = u_addr.add(BLOCK_SIZE / 2);
+            v_addr = v_addr.add(BLOCK_SIZE / 2);
+        }
+        if i < width {
+            fallback(
+                width - i,
+                top_uyvy_addr,
+                bot_uyvy_addr,
+                top_y_addr,
+                bot_y_addr,
+                u_addr,
+                v_addr,
+            );
+        }
+    }
+}
+
 #[cfg(target_arch = "aarch64")]
 #[doc(hidden)]
 #[derive(Copy, Clone)]
 pub struct ExplicitNeon(());
 
 #[cfg(target_arch = "aarch64")]
-impl RowProcessor for ExplicitNeon {
+impl ExplicitNeon {
-    fn new() -> Result<Self, ConversionError> {
+    pub fn new() -> Self {
-        if std::arch::is_aarch64_feature_detected!("neon") {
-            Ok(Self(()))
-        } else {
-            Err(ConversionError("neon unsupported on this machine"))
-        }
+        // On `aarch64` (unlike the 32-bit `arm`), NEON is mandatory.
+        Self(())
     }
 
+    #[inline]
+    pub fn try_new() -> Result<Self, ConversionError> {
+        Ok(Self::new())
+    }
+}
+
+#[cfg(target_arch = "aarch64")]
+impl RowProcessor for ExplicitNeon {
     #[inline(never)]
     #[target_feature(enable = "neon")]
     unsafe fn process(
@@ -447,16 +567,18 @@ macro_rules! auto {
         #[derive(Copy, Clone)]
         pub struct $ident<const PIXELS: usize>([(); PIXELS]);
 
-        impl<const PIXELS: usize> RowProcessor for $ident<PIXELS> {
+        impl<const PIXELS: usize> $ident<PIXELS> {
             #[inline(always)]
-            fn new() -> Result<Self, ConversionError> {
+            pub fn try_new() -> Result<Self, ConversionError> {
                 if true && $($supported)+ {
                     Ok(Self(std::array::from_fn(|_| ())))
                 } else {
                     Err(ConversionError(concat!(stringify!($ident), " unsupported on this machine")))
                 }
             }
+        }
 
+        impl<const PIXELS: usize> RowProcessor for $ident<PIXELS> {
             #[inline(never)]
             $(#[target_feature(enable = $feature)])*
             unsafe fn process(
@@ -545,7 +667,7 @@ mod tests {
                 /// Tests that a single `process` call produces the right `y` plane bytes.
                 #[test]
                 fn y() {
-                    let p = P::new().unwrap();
+                    let p = P::try_new().unwrap();
                     const PIXELS: usize = $pixels;
                     let mut top_in = vec![0xff; PIXELS * 4];
                     let mut bot_in = vec![0xff; PIXELS * 4];
@@ -577,7 +699,7 @@ mod tests {
                 /// Tests that a single `process` call produces the right `u` and `v` plane bytes.
                 #[test]
                 fn uv() {
-                    let p = P::new().unwrap();
+                    let p = P::try_new().unwrap();
                     const PIXELS: usize = $pixels;
                     let mut top_in = vec![0xff; PIXELS * 4];
                     let mut bot_in = vec![0xff; PIXELS * 4];
@@ -635,7 +757,7 @@ mod tests {
                         ConsecutiveFrame::new(PixelFormat::I420, WIDTH, HEIGHT).new_vec();
                     let expected_out = ConsecutiveFrame::new(PixelFormat::I420, WIDTH, HEIGHT)
                         .with_storage(&include_bytes!("testdata/out.yuv")[..]);
-                    super::super::convert_with::<P, _, _>(&uyvy_in, &mut actual_out).unwrap();
+                    super::super::convert_with(P::try_new().unwrap(), &uyvy_in, &mut actual_out).unwrap();
                     // `assert_eq!` output is unhelpful on these large binary arrays.
                     // On failure, it might be better to write to a file and diff with better tools,
                     // e.g.: `diff -u <(xxd src/testdata/out.yuv) <(xxd actual_out_auto.yuv)`
@@ -676,7 +798,7 @@ mod tests {
                     ][..]);
                     let mut actual_out =
                         ConsecutiveFrame::new(PixelFormat::I420, 3, 3).new_vec();
-                    super::super::convert_with::<P, _, _>(&uyvy_in, &mut actual_out).unwrap();
+                    super::super::convert_with(P::try_new().unwrap(), &uyvy_in, &mut actual_out).unwrap();
                     assert_eq!(expected_out.inner(), actual_out.inner());
                 }
             }
@@ -699,6 +821,10 @@ mod tests {
         32
     );
 
+    #[cfg(target_arch = "x86_64")]
+    #[cfg(not(miri))] // vendor intrinsics are unsupported on miri.
+    test_processor!(super::super::ExplicitSse2, explicit_sse2, 32);
+
     #[cfg(target_arch = "x86_64")]
     #[cfg(not(miri))] // vendor instrinsics are unsupported on miri.
     test_processor!(super::super::AutoAvx2Block<32>, auto_avx2, 32);