Skip to content

Commit

Permalink
Fix regression (#5)
Browse files Browse the repository at this point in the history
* fix performance regression

Currently if I run benchmarks as of e2320de I get the following result:

```
hot/explicit_avx2_double
                        time:   [1.1531 ms 1.1539 ms 1.1548 ms]
                        thrpt:  [5.8529 GiB/s 5.8577 GiB/s 5.8620 GiB/s]
```

If I disassemble the function (though only on the actual binary, not
with `cargo-show-asm`, annoyingly), I see out-of-line calls into
iterator helpers, along with what appears to be the associated spilling of
registers to memory around the call.

```
(lldb) disas
bench-1f3b1b5d384583ba`_$LT$pixelfmt..uyvy_to_i420..ExplicitAvx2DoubleBlock$u20$as$u20$pixelfmt..uyvy_to_i420..RowProcessor$GT$::process::h94bb8a0388c0d852:
...
    0x5555555c0f3b <+155>:  vzeroupper
    0x5555555c0f3e <+158>:  callq  0x5555555c0a80 ; core::array::drain::drain_array_with::hce9734ca2363f2b8
->  0x5555555c0f43 <+163>:  movq   0x48(%rsp), %rdx
    0x5555555c0f48 <+168>:  movq   %rbx, %r11
    0x5555555c0f4b <+171>:  movq   %r12, %r9
```

This doesn't happen at the previous commit.

With a slight refactor, this `callq` goes away and performance returns.

```
hot/explicit_avx2_double
                        time:   [72.312 µs 72.370 µs 72.436 µs]
                        thrpt:  [93.312 GiB/s 93.398 GiB/s 93.472 GiB/s]
                 change:
                        time:   [-93.730% -93.717% -93.703%] (p = 0.00 < 0.05)
                        thrpt:  [+1488.2% +1491.6% +1494.8%]
                        Performance has improved.
```

I didn't record the Rust version I was using when preparing that change,
so I'm unsure whether I simply missed the regression in the benchmark
results at the time, or whether it depends on the Rust version.

This is not entirely satisfying; I wanted to at least add a test against
the `cargo-show-asm` output asserting that there are no unexpected `call`
instructions, but that tool doesn't show them anyway...

* prettier loops

Not sure why I used `loop { if ... { break } }` rather than `while` to
begin with, but it is easy enough to fix. No performance impact.
  • Loading branch information
scottlamb authored Sep 19, 2024
1 parent e2320de commit 3c5c422
Showing 1 changed file with 49 additions and 53 deletions.
102 changes: 49 additions & 53 deletions src/uyvy_to_i420.rs
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,7 @@ unsafe fn hexprint(v: std::arch::x86_64::__m256i) -> impl std::fmt::Display {
)
}

#[inline(never)]
unsafe fn fallback(
width: usize,
top_uyvy_addr: *const u8,
Expand Down Expand Up @@ -205,12 +206,12 @@ impl RowProcessor for ExplicitAvx2DoubleBlock {
unsafe fn process(
self,
width: usize,
top_uyvy_addr: *const u8,
bot_uyvy_addr: *const u8,
top_y_addr: *mut u8,
bot_y_addr: *mut u8,
u_addr: *mut u8,
v_addr: *mut u8,
mut top_uyvy_addr: *const u8,
mut bot_uyvy_addr: *const u8,
mut top_y_addr: *mut u8,
mut bot_y_addr: *mut u8,
mut u_addr: *mut u8,
mut v_addr: *mut u8,
) {
// Put data[i] into 32-bit groups: lower 128-bits = (y0 y1 u0 v0) upper = (y2 y3 u1 v1).
// Source indexes, applied to each 128-bit lane within the 256-bit register.
Expand All @@ -220,26 +221,19 @@ impl RowProcessor for ExplicitAvx2DoubleBlock {
));

// Process the nice blocks.
const BLOCK_SIZE: usize = 64;
let mut i = 0;
loop {
let top_uyvy_addr = top_uyvy_addr.add(2 * i);
let bot_uyvy_addr = bot_uyvy_addr.add(2 * i);
let top_y_addr = top_y_addr.add(i);
let bot_y_addr = bot_y_addr.add(i);
let u_addr = u_addr.add(i / 2);
let v_addr = v_addr.add(i / 2);
if i + BLOCK_SIZE > width {
break;
}
let [top, bot] = [top_uyvy_addr, bot_uyvy_addr].map(|uyvy_addr| -> [_; 4] {
const BLOCK_SIZE: usize = 64;
while i + BLOCK_SIZE <= width {
let load = |uyvy_addr: *const u8| -> [_; 4] {
std::array::from_fn(|i| {
// VMOVDQU (YMM, M256) on Zen2: lat <8, cpi 0.5
let raw = x86_64::_mm256_loadu_si256(uyvy_addr.add(32 * i) as _);
// VPSHUFB (YMM, YMM, YMM) on Zen2: lat 1; cpi 0.5; ports 1*FP12.
x86_64::_mm256_shuffle_epi8(raw, shuf_indices)
})
});
};
let top = load(top_uyvy_addr);
let bot = load(bot_uyvy_addr);
for (data, addr) in [(top, top_y_addr), (bot, bot_y_addr)] {
for i in [0, 1] {
// Put into 64-groups: y0 y2 y1 y3.
Expand Down Expand Up @@ -268,16 +262,22 @@ impl RowProcessor for ExplicitAvx2DoubleBlock {
x86_64::_mm256_storeu_si256(u_addr as _, x86_64::_mm256_unpacklo_epi32(uv0prime, mix));
x86_64::_mm256_storeu_si256(v_addr as _, x86_64::_mm256_unpackhi_epi32(uv0prime, mix));
i += BLOCK_SIZE;
top_uyvy_addr = top_uyvy_addr.add(2 * BLOCK_SIZE);
bot_uyvy_addr = bot_uyvy_addr.add(2 * BLOCK_SIZE);
top_y_addr = top_y_addr.add(BLOCK_SIZE);
bot_y_addr = bot_y_addr.add(BLOCK_SIZE);
u_addr = u_addr.add(BLOCK_SIZE / 2);
v_addr = v_addr.add(BLOCK_SIZE / 2);
}
if i < width {
fallback(
width - i,
top_uyvy_addr.add(2 * i),
bot_uyvy_addr.add(2 * i),
top_y_addr.add(i),
bot_y_addr.add(i),
u_addr.add(i / 2),
v_addr.add(i / 2),
top_uyvy_addr,
bot_uyvy_addr,
top_y_addr,
bot_y_addr,
u_addr,
v_addr,
);
}
}
Expand All @@ -304,12 +304,12 @@ impl RowProcessor for ExplicitAvx2SingleBlock {
unsafe fn process(
self,
width: usize,
top_uyvy_addr: *const u8,
bot_uyvy_addr: *const u8,
top_y_addr: *mut u8,
bot_y_addr: *mut u8,
u_addr: *mut u8,
v_addr: *mut u8,
mut top_uyvy_addr: *const u8,
mut bot_uyvy_addr: *const u8,
mut top_y_addr: *mut u8,
mut bot_y_addr: *mut u8,
mut u_addr: *mut u8,
mut v_addr: *mut u8,
) {
// Put data[i] into 32-bit groups: lower 128-bits = (y0 y1 u0 v0) upper = (y2 y3 u1 v1).
// Source indexes, applied to each 128-bit lane within the 256-bit register.
Expand All @@ -320,24 +320,17 @@ impl RowProcessor for ExplicitAvx2SingleBlock {
// Process the nice blocks.
const BLOCK_SIZE: usize = 32;
let mut i = 0;
loop {
let top_uyvy_addr = top_uyvy_addr.add(2 * i);
let bot_uyvy_addr = bot_uyvy_addr.add(2 * i);
let top_y_addr = top_y_addr.add(i);
let bot_y_addr = bot_y_addr.add(i);
let u_addr = u_addr.add(i / 2);
let v_addr = v_addr.add(i / 2);
if i + BLOCK_SIZE > width {
break;
}
let [top, bot] = [top_uyvy_addr, bot_uyvy_addr].map(|uyvy_addr| -> [_; 2] {
while i + BLOCK_SIZE <= width {
let load = |uyvy_addr: *const u8| -> [_; 2] {
std::array::from_fn(|i| {
// VMOVDQU (YMM, M256) on Zen2: lat <8, cpi 0.5
let raw = x86_64::_mm256_loadu_si256(uyvy_addr.add(32 * i) as _);
// VPSHUFB (YMM, YMM, YMM) on Zen2: lat 1; cpi 0.5; ports 1*FP12.
x86_64::_mm256_shuffle_epi8(raw, shuf_indices)
})
});
};
let top = load(top_uyvy_addr);
let bot = load(bot_uyvy_addr);
for (data, y_addr) in [(top, top_y_addr), (bot, bot_y_addr)] {
let y = x86_64::_mm256_unpacklo_epi64(data[0], data[1]);
// VMOVDQU (M256, YMM) on Zen2: ports 1*FP2.
Expand All @@ -358,16 +351,22 @@ impl RowProcessor for ExplicitAvx2SingleBlock {
x86_64::_mm256_permutevar8x32_epi32(uv, p),
);
i += BLOCK_SIZE;
top_uyvy_addr = top_uyvy_addr.add(2 * BLOCK_SIZE);
bot_uyvy_addr = bot_uyvy_addr.add(2 * BLOCK_SIZE);
top_y_addr = top_y_addr.add(BLOCK_SIZE);
bot_y_addr = bot_y_addr.add(BLOCK_SIZE);
u_addr = u_addr.add(BLOCK_SIZE / 2);
v_addr = v_addr.add(BLOCK_SIZE / 2);
}
if i < width {
fallback(
width - i,
top_uyvy_addr.add(2 * i),
bot_uyvy_addr.add(2 * i),
top_y_addr.add(i),
bot_y_addr.add(i),
u_addr.add(i / 2),
v_addr.add(i / 2),
top_uyvy_addr,
bot_uyvy_addr,
top_y_addr,
bot_y_addr,
u_addr,
v_addr,
);
}
}
Expand Down Expand Up @@ -402,10 +401,7 @@ impl RowProcessor for ExplicitNeon {
) {
const BLOCK_SIZE: usize = 32;
let mut i = 0;
loop {
if i + BLOCK_SIZE > width {
break;
}
while i + BLOCK_SIZE <= width {
let top_uyvy = aarch64::vld4q_u8(top_uyvy_addr.add(2 * i));
let bot_uyvy = aarch64::vld4q_u8(bot_uyvy_addr.add(2 * i));
aarch64::vst2q_u8(
Expand Down

0 comments on commit 3c5c422

Please sign in to comment.