Skip to content

Commit

Permalink
SSE2 support
Browse files Browse the repository at this point in the history
This deinterleaves 8-bit pairs via the PACKUSWB instruction: either
shifting right to get the high 8 bits of every 16, or ANDing to get the
low 8 bits of every 16. I took that idea from compiling the following
code:

https://godbolt.org/z/enaMY7v4o

```rust

use std::arch::x86_64;
use std::simd::i8x16;

/// Converts one block of two UYVY rows into separate Y, U and V outputs.
///
/// Each call reads 64 bytes from `top_uyvy_addr` and from `bot_uyvy_addr`,
/// writes 32 bytes to `top_y_addr` and to `bot_y_addr`, and writes 16 bytes
/// to each of `u_addr` and `v_addr`.
///
/// # Safety
///
/// Every pointer must be valid for the read or write size listed above.
/// No alignment is required: all accesses go through
/// `read_unaligned` / `write_unaligned`.
pub unsafe fn process(
    top_uyvy_addr: *const u8,
    bot_uyvy_addr: *const u8,
    top_y_addr: *mut u8,
    bot_y_addr: *mut u8,
    u_addr: *mut u8,
    v_addr: *mut u8,
) {
    // Handle both rows with the same closure: it stores that row's Y bytes
    // as a side effect and returns the row's still-interleaved chroma bytes.
    let [top_uv, bot_uv] = [
        (top_uyvy_addr, top_y_addr),
        (bot_uyvy_addr, bot_y_addr),
    ].map(|(uyvy_addr, y_addr)| {
        // Four 16-byte vectors = 64 input bytes for this row.
        let uyvy = std::ptr::read_unaligned(uyvy_addr as *const [i8x16; 4]);
        // `deinterleave` splits even-indexed lanes (U/V bytes) from
        // odd-indexed lanes (Y bytes) across each pair of vectors.
        let (uv_hi, y_hi) = uyvy[0].deinterleave(uyvy[1]);
        let (uv_lo, y_lo) = uyvy[2].deinterleave(uyvy[3]);
        std::ptr::write_unaligned(y_addr as *mut i8x16, y_hi);
        std::ptr::write_unaligned(y_addr.add(16) as *mut i8x16, y_lo);
        [uv_hi, uv_lo]
    });
    // Average the chroma of the two rows byte-wise (unsigned, rounding up).
    let uv = [
        i8x16::from(x86_64::_mm_avg_epu8(top_uv[0].into(), bot_uv[0].into())),
        i8x16::from(x86_64::_mm_avg_epu8(top_uv[1].into(), bot_uv[1].into())),
    ];
    // A second deinterleave separates the averaged U bytes from the V bytes.
    let (u, v) = uv[0].deinterleave(uv[1]);
    std::ptr::write_unaligned(u_addr as *mut i8x16, u);
    std::ptr::write_unaligned(v_addr as *mut i8x16, v);
}
```

Its performance is surprisingly good: about 24 GiB/s cold and 73 GiB/s hot.
There is some noise in all of these measurements.

```
cold/memcpy_baseline    time:   [6.1385 ms 6.1503 ms 6.1639 ms]
                        thrpt:  [35.090 GiB/s 35.168 GiB/s 35.236 GiB/s]
                 change:
                        time:   [+0.8184% +1.1318% +1.4649%] (p = 0.00 < 0.05)
                        thrpt:  [-1.4438% -1.1191% -0.8117%]
                        Change within noise threshold.
cold/libyuv             time:   [8.9479 ms 8.9586 ms 8.9708 ms]
                        thrpt:  [24.111 GiB/s 24.144 GiB/s 24.173 GiB/s]
                 change:
                        time:   [+1.8972% +2.0382% +2.2017%] (p = 0.00 < 0.05)
                        thrpt:  [-2.1543% -1.9975% -1.8619%]
                        Performance has regressed.
cold/explicit_avx2_double
                        time:   [8.5369 ms 8.5513 ms 8.5665 ms]
                        thrpt:  [25.249 GiB/s 25.294 GiB/s 25.336 GiB/s]
                 change:
                        time:   [+12.556% +12.813% +13.076%] (p = 0.00 < 0.05)
                        thrpt:  [-11.564% -11.358% -11.155%]
                        Performance has regressed.
cold/explicit_avx2_single
                        time:   [8.0669 ms 8.0752 ms 8.0852 ms]
                        thrpt:  [26.752 GiB/s 26.785 GiB/s 26.812 GiB/s]
                 change:
                        time:   [+1.6387% +1.7825% +1.9258%] (p = 0.00 < 0.05)
                        thrpt:  [-1.8894% -1.7513% -1.6123%]
                        Performance has regressed.
cold/explicit_sse2      time:   [8.9443 ms 8.9541 ms 8.9652 ms]
                        thrpt:  [24.126 GiB/s 24.156 GiB/s 24.182 GiB/s]
cold/auto_avx2_64       time:   [32.122 ms 32.139 ms 32.158 ms]
                        thrpt:  [6.7260 GiB/s 6.7300 GiB/s 6.7335 GiB/s]
                 change:
                        time:   [+0.4201% +0.4922% +0.5622%] (p = 0.00 < 0.05)
                        thrpt:  [-0.5591% -0.4898% -0.4184%]
                        Change within noise threshold.
Benchmarking cold/auto_vanilla_64: Warming up for 3.0000 s
Warning: Unable to complete 100 samples in 5.0s. You may wish to increase target time to 5.3s, or reduce sample count to 90.
cold/auto_vanilla_64    time:   [53.057 ms 53.092 ms 53.132 ms]
                        thrpt:  [4.0708 GiB/s 4.0739 GiB/s 4.0766 GiB/s]
                 change:
                        time:   [-0.5970% -0.5149% -0.4217%] (p = 0.00 < 0.05)
                        thrpt:  [+0.4235% +0.5176% +0.6006%]
                        Change within noise threshold.

hot/memcpy_baseline     time:   [74.306 µs 74.385 µs 74.477 µs]
                        thrpt:  [90.755 GiB/s 90.867 GiB/s 90.964 GiB/s]
                 change:
                        time:   [-0.1252% +0.1649% +0.5301%] (p = 0.39 > 0.05)
                        thrpt:  [-0.5273% -0.1646% +0.1254%]
                        No change in performance detected.
hot/libyuv              time:   [107.00 µs 107.04 µs 107.09 µs]
                        thrpt:  [63.116 GiB/s 63.145 GiB/s 63.170 GiB/s]
                 change:
                        time:   [+4.9819% +5.1083% +5.2261%] (p = 0.00 < 0.05)
                        thrpt:  [-4.9665% -4.8600% -4.7455%]
                        Performance has regressed.
hot/explicit_avx2_double
                        time:   [90.068 µs 90.113 µs 90.155 µs]
                        thrpt:  [74.973 GiB/s 75.008 GiB/s 75.045 GiB/s]
                 change:
                        time:   [+19.614% +20.304% +21.006%] (p = 0.00 < 0.05)
                        thrpt:  [-17.360% -16.877% -16.398%]
                        Performance has regressed.
hot/explicit_avx2_single
                        time:   [79.458 µs 79.556 µs 79.655 µs]
                        thrpt:  [84.856 GiB/s 84.961 GiB/s 85.066 GiB/s]
                 change:
                        time:   [+6.9429% +7.3397% +7.6897%] (p = 0.00 < 0.05)
                        thrpt:  [-7.1406% -6.8378% -6.4921%]
                        Performance has regressed.
hot/explicit_sse2       time:   [92.316 µs 92.406 µs 92.511 µs]
                        thrpt:  [73.063 GiB/s 73.146 GiB/s 73.218 GiB/s]
hot/auto_avx2_64        time:   [920.02 µs 920.20 µs 920.42 µs]
                        thrpt:  [7.3435 GiB/s 7.3453 GiB/s 7.3467 GiB/s]
                 change:
                        time:   [+0.7899% +0.8556% +0.9185%] (p = 0.00 < 0.05)
                        thrpt:  [-0.9102% -0.8483% -0.7837%]
                        Change within noise threshold.
Benchmarking hot/auto_vanilla_64: Warming up for 3.0000 s
Warning: Unable to complete 100 samples in 5.0s. You may wish to increase target time to 8.1s, enable flat sampling, or reduce sample count to 50.
hot/auto_vanilla_64     time:   [1.6063 ms 1.6069 ms 1.6075 ms]
                        thrpt:  [4.2048 GiB/s 4.2064 GiB/s 4.2078 GiB/s]
                 change:
                        time:   [-0.8669% -0.8069% -0.7479%] (p = 0.00 < 0.05)
                        thrpt:  [+0.7536% +0.8134% +0.8745%]
                        Change within noise threshold.
```
  • Loading branch information
scottlamb committed Sep 19, 2024
1 parent 3c5c422 commit 4beb1f8
Show file tree
Hide file tree
Showing 3 changed files with 168 additions and 41 deletions.
3 changes: 0 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,6 @@ Limitations and future work:
[UYVY](https://fourcc.org/pixel-format/yuv-uyvy/) to
[I420](https://fourcc.org/pixel-format/yuv-i420/).
More will be added as needed.
* Returns `Err` on x86\_64 CPUs that don't support
AVX2. We'll likely add an SSE2 fallback later. As SSE2 is in the core
x86\_64 instruction set, this would mean all x86\_64 CPUs would be supported.
* Expects to process full horizontal lines. This is likely to
change to allow working on cropped regions.
* Does not support output to a frame with padding, as required by some
Expand Down
34 changes: 19 additions & 15 deletions benches/bench.rs
Original file line number Diff line number Diff line change
Expand Up @@ -105,18 +105,14 @@ fn bench_common<const FRAMES_PER_ITER: usize>(
(inputs.len() * (WIDTH * HEIGHT * 7) / 2) as u64,
));
macro_rules! bench_block {
($name:literal, $impl:ty) => {
($name:literal, $p:expr) => {
let p = $p;
g.bench_function($name, |b| {
b.iter(|| {
for i in &inputs {
black_box(
convert_with::<$impl, _, _>(
i,
&mut ConsecutiveFrame::new(PixelFormat::I420, WIDTH, HEIGHT)
.new_vec(),
)
.unwrap(),
);
let mut f =
ConsecutiveFrame::new(PixelFormat::I420, WIDTH, HEIGHT).new_vec();
black_box(convert_with(p, i, &mut f).unwrap());
}
})
});
Expand Down Expand Up @@ -144,16 +140,24 @@ fn bench_common<const FRAMES_PER_ITER: usize>(
})
});
#[cfg(target_arch = "x86_64")]
bench_block!("explicit_avx2_double", ExplicitAvx2DoubleBlock);
bench_block!(
"explicit_avx2_double",
ExplicitAvx2DoubleBlock::try_new().unwrap()
);
#[cfg(target_arch = "x86_64")]
bench_block!("explicit_avx2_single", ExplicitAvx2SingleBlock);
bench_block!(
"explicit_avx2_single",
ExplicitAvx2SingleBlock::try_new().unwrap()
);
#[cfg(target_arch = "x86_64")]
bench_block!("auto_avx2_64", AutoAvx2Block<64>);
bench_block!("explicit_sse2", ExplicitSse2::new());
#[cfg(target_arch = "x86_64")]
bench_block!("auto_avx2_64", AutoAvx2Block::<64>::try_new().unwrap());
#[cfg(target_arch = "aarch64")]
bench_block!("explicit_neon", ExplicitNeon);
bench_block!("explicit_neon", ExplicitNeon::try_new().unwrap());
#[cfg(target_arch = "aarch64")]
bench_block!("auto_neon_64", AutoNeonBlock<64>);
bench_block!("auto_vanilla_64", AutoVanillaBlock<64>);
bench_block!("auto_neon_64", AutoNeonBlock<64>::new().unwrap());
bench_block!("auto_vanilla_64", AutoVanillaBlock::<64>::try_new().unwrap());
g.finish();
}

Expand Down
172 changes: 149 additions & 23 deletions src/uyvy_to_i420.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,6 @@ use crate::{
/// Processes a block of 2 rows.
#[doc(hidden)]
pub trait RowProcessor: Copy + Clone + Sized + Send + Sync {
/// Returns true if this block type is supported on this machine.
fn new() -> Result<Self, ConversionError>;

/// Processes a block `width` pixels wide, two rows high.
///
/// # Safety
Expand Down Expand Up @@ -54,17 +51,24 @@ pub fn convert<FI: Frame, FO: FrameMut>(
yuv_out: &mut FO,
) -> Result<(), ConversionError> {
#[cfg(target_arch = "x86_64")]
return convert_with::<ExplicitAvx2DoubleBlock, _, _>(uyvy_in, yuv_out);
{
if let Ok(avx2) = ExplicitAvx2DoubleBlock::try_new() {
return convert_with(avx2, uyvy_in, yuv_out);
}
return convert_with(ExplicitSse2::new(), uyvy_in, yuv_out);
}

// NEON is always supported on `aarch64`.
#[cfg(target_arch = "aarch64")]
return convert_with::<ExplicitNeon, _, _>(uyvy_in, yuv_out);
return convert_with(ExplicitNeon::new(), uyvy_in, yuv_out);

#[allow(unused)]
#[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
Err(ConversionError("no block processor available"))
}

#[doc(hidden)]
pub fn convert_with<P: RowProcessor, FI: Frame, FO: FrameMut>(
p: P,
uyvy_in: &FI,
yuv_out: &mut FO,
) -> Result<(), ConversionError> {
Expand All @@ -76,7 +80,6 @@ pub fn convert_with<P: RowProcessor, FI: Frame, FO: FrameMut>(
{
return Err(ConversionError("invalid arguments"));
}
let p = P::new()?;
let pixels = width * height;
let uyvy_planes = uyvy_in.planes();
let [uyvy_in] = &uyvy_planes[..] else {
Expand Down Expand Up @@ -191,16 +194,19 @@ unsafe fn fallback(
pub struct ExplicitAvx2DoubleBlock(());

#[cfg(target_arch = "x86_64")]
impl RowProcessor for ExplicitAvx2DoubleBlock {
impl ExplicitAvx2DoubleBlock {
#[inline]
fn new() -> Result<Self, ConversionError> {
pub fn try_new() -> Result<Self, ConversionError> {
if is_x86_feature_detected!("avx2") {
Ok(Self(()))
} else {
Err(ConversionError("avx2 is not supported on this machine"))
}
}
}

#[cfg(target_arch = "x86_64")]
impl RowProcessor for ExplicitAvx2DoubleBlock {
#[target_feature(enable = "avx2")]
#[inline(never)]
unsafe fn process(
Expand Down Expand Up @@ -289,16 +295,19 @@ impl RowProcessor for ExplicitAvx2DoubleBlock {
pub struct ExplicitAvx2SingleBlock(());

#[cfg(target_arch = "x86_64")]
impl RowProcessor for ExplicitAvx2SingleBlock {
impl ExplicitAvx2SingleBlock {
#[inline]
fn new() -> Result<Self, ConversionError> {
pub fn try_new() -> Result<Self, ConversionError> {
if is_x86_feature_detected!("avx2") {
Ok(Self(()))
} else {
Err(ConversionError("avx2 is not supported on this machine"))
}
}
}

#[cfg(target_arch = "x86_64")]
impl RowProcessor for ExplicitAvx2SingleBlock {
#[inline(never)]
#[target_feature(enable = "avx2")]
unsafe fn process(
Expand Down Expand Up @@ -372,21 +381,132 @@ impl RowProcessor for ExplicitAvx2SingleBlock {
}
}

/// Row processor implemented with explicit SSE2 intrinsics.
///
/// The single private `()` field means code outside this module cannot build
/// the type with a struct literal and must go through `ExplicitSse2::new` or
/// `ExplicitSse2::try_new`.
#[doc(hidden)]
#[cfg(target_arch = "x86_64")]
#[derive(Clone, Copy)]
pub struct ExplicitSse2(());

#[cfg(target_arch = "x86_64")]
impl ExplicitSse2 {
#[inline]
pub fn new() -> Self {
// On x86_64 (unlike 32-bit x86), sse2 is mandatory.
Self(())
}

#[inline]
pub fn try_new() -> Result<Self, ConversionError> {
Ok(Self::new())
}
}

#[cfg(target_arch = "x86_64")]
impl RowProcessor for ExplicitSse2 {
    /// Processes a block `width` pixels wide and two rows high using SSE2.
    ///
    /// Work is done in chunks of 32 pixels. Each chunk reads 64 bytes from
    /// each input row, writes 32 bytes to each Y row, and writes 16 bytes to
    /// each of the U and V rows. Any pixels left over after the last full
    /// chunk are handed to `fallback`.
    ///
    /// Bytes are deinterleaved with PACKUSWB: every 16-bit lane is either
    /// shifted right by 8 (keeping its high byte) or masked with `0x00FF`
    /// (keeping its low byte) before two vectors are packed into one.
    ///
    /// # Safety
    ///
    /// The caller must uphold the contract documented on
    /// `RowProcessor::process`; the pointers are read and written as
    /// described above without any bounds checks.
    unsafe fn process(
        self,
        width: usize,
        mut top_uyvy_addr: *const u8,
        mut bot_uyvy_addr: *const u8,
        mut top_y_addr: *mut u8,
        mut bot_y_addr: *mut u8,
        mut u_addr: *mut u8,
        mut v_addr: *mut u8,
    ) {
        const BLOCK_SIZE: usize = 32;
        let byte_mask = x86_64::_mm_set1_epi16(0xFF);

        // Packs the high byte of every 16-bit lane of `a`, then of `b`.
        let odd_bytes = |a: x86_64::__m128i, b: x86_64::__m128i| -> x86_64::__m128i {
            x86_64::_mm_packus_epi16(x86_64::_mm_srli_epi16(a, 8), x86_64::_mm_srli_epi16(b, 8))
        };
        // Packs the low byte of every 16-bit lane of `a`, then of `b`.
        let even_bytes = |a: x86_64::__m128i, b: x86_64::__m128i| -> x86_64::__m128i {
            x86_64::_mm_packus_epi16(
                x86_64::_mm_and_si128(a, byte_mask),
                x86_64::_mm_and_si128(b, byte_mask),
            )
        };
        // Loads 64 consecutive input bytes as four unaligned 16-byte vectors.
        let load_block = |src: *const u8| -> [x86_64::__m128i; 4] {
            [
                x86_64::_mm_loadu_si128(src.cast()),
                x86_64::_mm_loadu_si128(src.add(16).cast()),
                x86_64::_mm_loadu_si128(src.add(32).cast()),
                x86_64::_mm_loadu_si128(src.add(48).cast()),
            ]
        };

        let mut remaining = width;
        while remaining >= BLOCK_SIZE {
            // Both rows are loaded before anything is stored.
            let top = load_block(top_uyvy_addr);
            let bot = load_block(bot_uyvy_addr);

            // Luma: the high byte of each 16-bit lane, stored per row.
            for (row, y_out) in [(&top, top_y_addr), (&bot, bot_y_addr)] {
                x86_64::_mm_storeu_si128(y_out.cast(), odd_bytes(row[0], row[1]));
                x86_64::_mm_storeu_si128(y_out.add(16).cast(), odd_bytes(row[2], row[3]));
            }

            // Chroma: the low byte of each lane (still interleaved),
            // averaged between the two rows.
            let chroma = [
                x86_64::_mm_avg_epu8(even_bytes(top[0], top[1]), even_bytes(bot[0], bot[1])),
                x86_64::_mm_avg_epu8(even_bytes(top[2], top[3]), even_bytes(bot[2], bot[3])),
            ];
            // A second deinterleave splits the averaged bytes: low bytes go
            // to the U output, high bytes to the V output.
            x86_64::_mm_storeu_si128(u_addr.cast(), even_bytes(chroma[0], chroma[1]));
            x86_64::_mm_storeu_si128(v_addr.cast(), odd_bytes(chroma[0], chroma[1]));

            remaining -= BLOCK_SIZE;
            top_uyvy_addr = top_uyvy_addr.add(2 * BLOCK_SIZE);
            bot_uyvy_addr = bot_uyvy_addr.add(2 * BLOCK_SIZE);
            top_y_addr = top_y_addr.add(BLOCK_SIZE);
            bot_y_addr = bot_y_addr.add(BLOCK_SIZE);
            u_addr = u_addr.add(BLOCK_SIZE / 2);
            v_addr = v_addr.add(BLOCK_SIZE / 2);
        }

        // Fewer than BLOCK_SIZE pixels are left; hand them to `fallback`.
        if remaining > 0 {
            fallback(
                remaining,
                top_uyvy_addr,
                bot_uyvy_addr,
                top_y_addr,
                bot_y_addr,
                u_addr,
                v_addr,
            );
        }
    }
}

#[cfg(target_arch = "aarch64")]
#[doc(hidden)]
#[derive(Copy, Clone)]
pub struct ExplicitNeon(());

#[cfg(target_arch = "aarch64")]
impl RowProcessor for ExplicitNeon {
impl ExplicitNeon {
fn new() -> Result<Self, ConversionError> {
if std::arch::is_aarch64_feature_detected!("neon") {
Ok(Self(()))
} else {
Err(ConversionError("neon unsupported on this machine"))
}
// On `aarch64` (unlike the 32-bit `arm`), NEON is mandatory.
Self(())
}

#[inline]
pub fn try_new() -> Result<Self, ConversionError> {
Ok(Self::new())
}
}

#[cfg(target_arch = "aarch64")]
impl RowProcessor for ExplicitNeon {
#[inline(never)]
#[target_feature(enable = "neon")]
unsafe fn process(
Expand Down Expand Up @@ -447,16 +567,18 @@ macro_rules! auto {
#[derive(Copy, Clone)]
pub struct $ident<const PIXELS: usize>([(); PIXELS]);

impl<const PIXELS: usize> RowProcessor for $ident<PIXELS> {
impl<const PIXELS: usize> $ident<PIXELS> {
#[inline(always)]
fn new() -> Result<Self, ConversionError> {
pub fn try_new() -> Result<Self, ConversionError> {
if true && $($supported)+ {
Ok(Self(std::array::from_fn(|_| ())))
} else {
Err(ConversionError(concat!(stringify!($ident), " unsupported on this machine")))
}
}
}

impl<const PIXELS: usize> RowProcessor for $ident<PIXELS> {
#[inline(never)]
$(#[target_feature(enable = $feature)])*
unsafe fn process(
Expand Down Expand Up @@ -545,7 +667,7 @@ mod tests {
/// Tests that a single `process` call produces the right `y` plane bytes.
#[test]
fn y() {
let p = P::new().unwrap();
let p = P::try_new().unwrap();
const PIXELS: usize = $pixels;
let mut top_in = vec![0xff; PIXELS * 4];
let mut bot_in = vec![0xff; PIXELS * 4];
Expand Down Expand Up @@ -577,7 +699,7 @@ mod tests {
/// Tests that a single `process` call produces the right `u` and `v` plane bytes.
#[test]
fn uv() {
let p = P::new().unwrap();
let p = P::try_new().unwrap();
const PIXELS: usize = $pixels;
let mut top_in = vec![0xff; PIXELS * 4];
let mut bot_in = vec![0xff; PIXELS * 4];
Expand Down Expand Up @@ -635,7 +757,7 @@ mod tests {
ConsecutiveFrame::new(PixelFormat::I420, WIDTH, HEIGHT).new_vec();
let expected_out = ConsecutiveFrame::new(PixelFormat::I420, WIDTH, HEIGHT)
.with_storage(&include_bytes!("testdata/out.yuv")[..]);
super::super::convert_with::<P, _, _>(&uyvy_in, &mut actual_out).unwrap();
super::super::convert_with(P::try_new().unwrap(), &uyvy_in, &mut actual_out).unwrap();
// `assert_eq!` output is unhelpful on these large binary arrays.
// On failure, it might be better to write to a file and diff with better tools,
// e.g.: `diff -u <(xxd src/testdata/out.yuv) <(xxd actual_out_auto.yuv)`
Expand Down Expand Up @@ -676,7 +798,7 @@ mod tests {
][..]);
let mut actual_out =
ConsecutiveFrame::new(PixelFormat::I420, 3, 3).new_vec();
super::super::convert_with::<P, _, _>(&uyvy_in, &mut actual_out).unwrap();
super::super::convert_with(P::try_new().unwrap(), &uyvy_in, &mut actual_out).unwrap();
assert_eq!(expected_out.inner(), actual_out.inner());
}
}
Expand All @@ -699,6 +821,10 @@ mod tests {
32
);

#[cfg(target_arch = "x86_64")]
#[cfg(not(miri))] // vendor intrinsics are unsupported on miri.
test_processor!(super::super::ExplicitSse2, explicit_sse2, 32);

#[cfg(target_arch = "x86_64")]
#[cfg(not(miri))] // vendor intrinsics are unsupported on miri.
test_processor!(super::super::AutoAvx2Block<32>, auto_avx2, 32);
Expand Down

0 comments on commit 4beb1f8

Please sign in to comment.