Skip to content

Commit

Permalink
prettier loops
Browse files: browse the repository at this point in the history
I'm not sure why I used `loop { if ... { break } }` rather than `while` to
begin with, but it's easy enough to fix. This change has no performance impact.
  • Loading branch information
scottlamb committed Sep 19, 2024
1 parent 974e6e9 commit f70c811
Showing 1 changed file with 40 additions and 49 deletions.
89 changes: 40 additions & 49 deletions src/uyvy_to_i420.rs
Original file line number Diff line number Diff line change
Expand Up @@ -206,12 +206,12 @@ impl RowProcessor for ExplicitAvx2DoubleBlock {
unsafe fn process(
self,
width: usize,
top_uyvy_addr: *const u8,
bot_uyvy_addr: *const u8,
top_y_addr: *mut u8,
bot_y_addr: *mut u8,
u_addr: *mut u8,
v_addr: *mut u8,
mut top_uyvy_addr: *const u8,
mut bot_uyvy_addr: *const u8,
mut top_y_addr: *mut u8,
mut bot_y_addr: *mut u8,
mut u_addr: *mut u8,
mut v_addr: *mut u8,
) {
// Put data[i] into 32-bit groups: lower 128-bits = (y0 y1 u0 v0) upper = (y2 y3 u1 v1).
// Source indexes, applied to each 128-bit lane within the 256-bit register.
Expand All @@ -221,18 +221,9 @@ impl RowProcessor for ExplicitAvx2DoubleBlock {
));

// Process the nice blocks.
const BLOCK_SIZE: usize = 64;
let mut i = 0;
loop {
let top_uyvy_addr = top_uyvy_addr.add(2 * i);
let bot_uyvy_addr = bot_uyvy_addr.add(2 * i);
let top_y_addr = top_y_addr.add(i);
let bot_y_addr = bot_y_addr.add(i);
let u_addr = u_addr.add(i / 2);
let v_addr = v_addr.add(i / 2);
if i + BLOCK_SIZE > width {
break;
}
const BLOCK_SIZE: usize = 64;
while i + BLOCK_SIZE <= width {
let load = |uyvy_addr: *const u8| -> [_; 4] {
std::array::from_fn(|i| {
// VMOVDQU (YMM, M256) on Zen2: lat <8, cpi 0.5
Expand Down Expand Up @@ -271,16 +262,22 @@ impl RowProcessor for ExplicitAvx2DoubleBlock {
x86_64::_mm256_storeu_si256(u_addr as _, x86_64::_mm256_unpacklo_epi32(uv0prime, mix));
x86_64::_mm256_storeu_si256(v_addr as _, x86_64::_mm256_unpackhi_epi32(uv0prime, mix));
i += BLOCK_SIZE;
top_uyvy_addr = top_uyvy_addr.add(2 * BLOCK_SIZE);
bot_uyvy_addr = bot_uyvy_addr.add(2 * BLOCK_SIZE);
top_y_addr = top_y_addr.add(BLOCK_SIZE);
bot_y_addr = bot_y_addr.add(BLOCK_SIZE);
u_addr = u_addr.add(BLOCK_SIZE / 2);
v_addr = v_addr.add(BLOCK_SIZE / 2);
}
if i < width {
fallback(
width - i,
top_uyvy_addr.add(2 * i),
bot_uyvy_addr.add(2 * i),
top_y_addr.add(i),
bot_y_addr.add(i),
u_addr.add(i / 2),
v_addr.add(i / 2),
top_uyvy_addr,
bot_uyvy_addr,
top_y_addr,
bot_y_addr,
u_addr,
v_addr,
);
}
}
Expand All @@ -307,12 +304,12 @@ impl RowProcessor for ExplicitAvx2SingleBlock {
unsafe fn process(
self,
width: usize,
top_uyvy_addr: *const u8,
bot_uyvy_addr: *const u8,
top_y_addr: *mut u8,
bot_y_addr: *mut u8,
u_addr: *mut u8,
v_addr: *mut u8,
mut top_uyvy_addr: *const u8,
mut bot_uyvy_addr: *const u8,
mut top_y_addr: *mut u8,
mut bot_y_addr: *mut u8,
mut u_addr: *mut u8,
mut v_addr: *mut u8,
) {
// Put data[i] into 32-bit groups: lower 128-bits = (y0 y1 u0 v0) upper = (y2 y3 u1 v1).
// Source indexes, applied to each 128-bit lane within the 256-bit register.
Expand All @@ -323,16 +320,7 @@ impl RowProcessor for ExplicitAvx2SingleBlock {
// Process the nice blocks.
const BLOCK_SIZE: usize = 32;
let mut i = 0;
loop {
let top_uyvy_addr = top_uyvy_addr.add(2 * i);
let bot_uyvy_addr = bot_uyvy_addr.add(2 * i);
let top_y_addr = top_y_addr.add(i);
let bot_y_addr = bot_y_addr.add(i);
let u_addr = u_addr.add(i / 2);
let v_addr = v_addr.add(i / 2);
if i + BLOCK_SIZE > width {
break;
}
while i + BLOCK_SIZE <= width {
let load = |uyvy_addr: *const u8| -> [_; 2] {
std::array::from_fn(|i| {
// VMOVDQU (YMM, M256) on Zen2: lat <8, cpi 0.5
Expand Down Expand Up @@ -363,16 +351,22 @@ impl RowProcessor for ExplicitAvx2SingleBlock {
x86_64::_mm256_permutevar8x32_epi32(uv, p),
);
i += BLOCK_SIZE;
top_uyvy_addr = top_uyvy_addr.add(2 * BLOCK_SIZE);
bot_uyvy_addr = bot_uyvy_addr.add(2 * BLOCK_SIZE);
top_y_addr = top_y_addr.add(BLOCK_SIZE);
bot_y_addr = bot_y_addr.add(BLOCK_SIZE);
u_addr = u_addr.add(BLOCK_SIZE / 2);
v_addr = v_addr.add(BLOCK_SIZE / 2);
}
if i < width {
fallback(
width - i,
top_uyvy_addr.add(2 * i),
bot_uyvy_addr.add(2 * i),
top_y_addr.add(i),
bot_y_addr.add(i),
u_addr.add(i / 2),
v_addr.add(i / 2),
top_uyvy_addr,
bot_uyvy_addr,
top_y_addr,
bot_y_addr,
u_addr,
v_addr,
);
}
}
Expand Down Expand Up @@ -407,10 +401,7 @@ impl RowProcessor for ExplicitNeon {
) {
const BLOCK_SIZE: usize = 32;
let mut i = 0;
loop {
if i + BLOCK_SIZE > width {
break;
}
while i + BLOCK_SIZE <= width {
let top_uyvy = aarch64::vld4q_u8(top_uyvy_addr.add(2 * i));
let bot_uyvy = aarch64::vld4q_u8(bot_uyvy_addr.add(2 * i));
aarch64::vst2q_u8(
Expand Down

0 comments on commit f70c811

Please sign in to comment.