Skip to content

Commit

Permalink
pixconv: sse42-optimize swizzling RGB to RGBA
Browse files Browse the repository at this point in the history
PNG (as a source pixel format) is in RGB or RGBA order. Wuffs' std/png
benchmarks, like those for the other image formats, decode to BGRA order.
*If* those benchmarks are altered to target RGBA instead, we get:

name                                                       old speed     new speed     delta

wuffs_png_decode_image_40k_24bpp/clang14                   279MB/s ± 0%  293MB/s ± 0%  +5.11%  (p=0.008 n=5+5)
wuffs_png_decode_image_4002k_24bpp/clang14                 289MB/s ± 0%  303MB/s ± 0%  +4.89%  (p=0.008 n=5+5)

wuffs_png_decode_image_40k_24bpp/gcc12                     311MB/s ± 0%  332MB/s ± 0%  +6.84%  (p=0.008 n=5+5)
wuffs_png_decode_image_4002k_24bpp/gcc12                   325MB/s ± 0%  339MB/s ± 0%  +4.49%  (p=0.008 n=5+5)

When starting from 3-channel RGB, ending with RGBA is now as fast as
ending with BGRA.

Updates #141
  • Loading branch information
nigeltao committed Mar 25, 2024
1 parent 44bb271 commit a667685
Show file tree
Hide file tree
Showing 2 changed files with 142 additions and 0 deletions.
71 changes: 71 additions & 0 deletions internal/cgen/base/pixconv-submodule-regular.c
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,14 @@

#if defined(WUFFS_BASE__CPU_ARCH__X86_FAMILY)
// Forward declaration of the SSE4.2 swizzler that expands 3-byte source pixels
// into 4-byte destination pixels without reordering the channels. Its body is
// in the "+x86_sse42" multi-file section further down.
//
// It returns a pixel count as a uint64_t. The dst_palette_ptr and
// dst_palette_len parameters are part of the signature but the definition
// does not read them.
WUFFS_BASE__MAYBE_ATTRIBUTE_TARGET("pclmul,popcnt,sse4.2")
static uint64_t //
wuffs_private_impl__swizzle_bgrw__bgr__x86_sse42(uint8_t* dst_ptr,
size_t dst_len,
uint8_t* dst_palette_ptr,
size_t dst_palette_len,
const uint8_t* src_ptr,
size_t src_len);

static uint64_t //
wuffs_private_impl__swizzle_bgrw__rgb__x86_sse42(uint8_t* dst_ptr,
size_t dst_len,
Expand Down Expand Up @@ -3308,6 +3316,59 @@ wuffs_private_impl__swizzle_bgrw__bgrx(uint8_t* dst_ptr,

// ‼ WUFFS MULTI-FILE SECTION +x86_sse42
#if defined(WUFFS_BASE__CPU_ARCH__X86_FAMILY)
// Expands packed 3-byte pixels from src_ptr into 4-byte pixels at dst_ptr. The
// three source bytes keep their order and the fourth destination byte of every
// pixel is set to 0xFF.
//
// The number of pixels converted is the smaller of (dst_len / 4) and
// (src_len / 3), and that count is the return value. The dst_palette_ptr and
// dst_palette_len arguments are not read.
WUFFS_BASE__MAYBE_ATTRIBUTE_TARGET("pclmul,popcnt,sse4.2")
static uint64_t  //
wuffs_private_impl__swizzle_bgrw__bgr__x86_sse42(uint8_t* dst_ptr,
                                                 size_t dst_len,
                                                 uint8_t* dst_palette_ptr,
                                                 size_t dst_palette_len,
                                                 const uint8_t* src_ptr,
                                                 size_t src_len) {
  const size_t dst_pixels = dst_len / 4;
  const size_t src_pixels = src_len / 3;
  const size_t num_pixels =
      (src_pixels <= dst_pixels) ? src_pixels : dst_pixels;

  // Byte-shuffle table (listed from byte 15 down to byte 0). Output pixel k
  // takes source bytes 3k+0, 3k+1 and 3k+2; its fourth byte is a placeholder
  // (a copy of source byte 0) that the OR mask below forces to 0xFF.
  const __m128i spread_3_to_4 = _mm_set_epi8(+0x00, +0x0B, +0x0A, +0x09,  //
                                             +0x00, +0x08, +0x07, +0x06,  //
                                             +0x00, +0x05, +0x04, +0x03,  //
                                             +0x00, +0x02, +0x01, +0x00);
  // 0xFF in the fourth byte of each 4-byte group, zero elsewhere.
  const __m128i fourth_byte_ff = _mm_set_epi8(-0x01, +0x00, +0x00, +0x00,  //
                                              -0x01, +0x00, +0x00, +0x00,  //
                                              -0x01, +0x00, +0x00, +0x00,  //
                                              -0x01, +0x00, +0x00, +0x00);

  const uint8_t* in = src_ptr;
  uint8_t* out = dst_ptr;
  size_t remaining = num_pixels;

  // Vector path: 4 pixels per iteration. Each iteration loads 16 source bytes
  // although only 12 are consumed, so it requires at least 6 pixels (18 bytes)
  // to be left; that keeps the load within the counted source pixels.
  for (; remaining >= 6; remaining -= 4, in += 4 * 3, out += 4 * 4) {
    const __m128i packed = _mm_lddqu_si128((const __m128i*)(const void*)in);
    const __m128i spread = _mm_shuffle_epi8(packed, spread_3_to_4);
    _mm_storeu_si128((__m128i*)(void*)out,
                     _mm_or_si128(spread, fourth_byte_ff));
  }

  // Scalar path for the leftover pixels. All three source bytes are read
  // before any destination byte is written.
  for (; remaining >= 1; remaining -= 1, in += 1 * 3, out += 1 * 4) {
    const uint8_t c0 = in[0];
    const uint8_t c1 = in[1];
    const uint8_t c2 = in[2];
    out[0] = c0;
    out[1] = c1;
    out[2] = c2;
    out[3] = 0xFF;
  }

  return num_pixels;
}

WUFFS_BASE__MAYBE_ATTRIBUTE_TARGET("pclmul,popcnt,sse4.2")
static uint64_t //
wuffs_private_impl__swizzle_bgrw__rgb__x86_sse42(uint8_t* dst_ptr,
Expand Down Expand Up @@ -4793,6 +4854,11 @@ wuffs_private_impl__pixel_swizzler__prepare__bgr(
case WUFFS_BASE__PIXEL_FORMAT__BGRA_PREMUL:
case WUFFS_BASE__PIXEL_FORMAT__BGRA_BINARY:
case WUFFS_BASE__PIXEL_FORMAT__BGRX:
#if defined(WUFFS_BASE__CPU_ARCH__X86_FAMILY)
if (wuffs_base__cpu_arch__have_x86_sse42()) {
return wuffs_private_impl__swizzle_bgrw__bgr__x86_sse42;
}
#endif
return wuffs_private_impl__swizzle_bgrw__bgr;

case WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL_4X16LE:
Expand Down Expand Up @@ -5167,6 +5233,11 @@ wuffs_private_impl__pixel_swizzler__prepare__rgb(
case WUFFS_BASE__PIXEL_FORMAT__RGBA_PREMUL:
case WUFFS_BASE__PIXEL_FORMAT__RGBA_BINARY:
case WUFFS_BASE__PIXEL_FORMAT__RGBX:
#if defined(WUFFS_BASE__CPU_ARCH__X86_FAMILY)
if (wuffs_base__cpu_arch__have_x86_sse42()) {
return wuffs_private_impl__swizzle_bgrw__bgr__x86_sse42;
}
#endif
return wuffs_private_impl__swizzle_bgrw__bgr;
}
return NULL;
Expand Down
71 changes: 71 additions & 0 deletions release/c/wuffs-unsupported-snapshot.c
Original file line number Diff line number Diff line change
Expand Up @@ -20953,6 +20953,14 @@ wuffs_base__magic_number_guess_fourcc(wuffs_base__slice_u8 prefix_data,

#if defined(WUFFS_BASE__CPU_ARCH__X86_FAMILY)
// Forward declaration of the SSE4.2 swizzler that expands 3-byte source pixels
// into 4-byte destination pixels without reordering the channels. Its body is
// in the "+x86_sse42" multi-file section further down.
//
// It returns a pixel count as a uint64_t. The dst_palette_ptr and
// dst_palette_len parameters are part of the signature but the definition
// does not read them.
WUFFS_BASE__MAYBE_ATTRIBUTE_TARGET("pclmul,popcnt,sse4.2")
static uint64_t //
wuffs_private_impl__swizzle_bgrw__bgr__x86_sse42(uint8_t* dst_ptr,
size_t dst_len,
uint8_t* dst_palette_ptr,
size_t dst_palette_len,
const uint8_t* src_ptr,
size_t src_len);

static uint64_t //
wuffs_private_impl__swizzle_bgrw__rgb__x86_sse42(uint8_t* dst_ptr,
size_t dst_len,
Expand Down Expand Up @@ -24249,6 +24257,59 @@ wuffs_private_impl__swizzle_bgrw__bgrx(uint8_t* dst_ptr,

// ‼ WUFFS MULTI-FILE SECTION +x86_sse42
#if defined(WUFFS_BASE__CPU_ARCH__X86_FAMILY)
// Expands packed 3-byte pixels from src_ptr into 4-byte pixels at dst_ptr. The
// three source bytes keep their order and the fourth destination byte of every
// pixel is set to 0xFF.
//
// The number of pixels converted is the smaller of (dst_len / 4) and
// (src_len / 3), and that count is the return value. The dst_palette_ptr and
// dst_palette_len arguments are not read.
WUFFS_BASE__MAYBE_ATTRIBUTE_TARGET("pclmul,popcnt,sse4.2")
static uint64_t  //
wuffs_private_impl__swizzle_bgrw__bgr__x86_sse42(uint8_t* dst_ptr,
                                                 size_t dst_len,
                                                 uint8_t* dst_palette_ptr,
                                                 size_t dst_palette_len,
                                                 const uint8_t* src_ptr,
                                                 size_t src_len) {
  const size_t dst_pixels = dst_len / 4;
  const size_t src_pixels = src_len / 3;
  const size_t num_pixels =
      (src_pixels <= dst_pixels) ? src_pixels : dst_pixels;

  // Byte-shuffle table (listed from byte 15 down to byte 0). Output pixel k
  // takes source bytes 3k+0, 3k+1 and 3k+2; its fourth byte is a placeholder
  // (a copy of source byte 0) that the OR mask below forces to 0xFF.
  const __m128i spread_3_to_4 = _mm_set_epi8(+0x00, +0x0B, +0x0A, +0x09,  //
                                             +0x00, +0x08, +0x07, +0x06,  //
                                             +0x00, +0x05, +0x04, +0x03,  //
                                             +0x00, +0x02, +0x01, +0x00);
  // 0xFF in the fourth byte of each 4-byte group, zero elsewhere.
  const __m128i fourth_byte_ff = _mm_set_epi8(-0x01, +0x00, +0x00, +0x00,  //
                                              -0x01, +0x00, +0x00, +0x00,  //
                                              -0x01, +0x00, +0x00, +0x00,  //
                                              -0x01, +0x00, +0x00, +0x00);

  const uint8_t* in = src_ptr;
  uint8_t* out = dst_ptr;
  size_t remaining = num_pixels;

  // Vector path: 4 pixels per iteration. Each iteration loads 16 source bytes
  // although only 12 are consumed, so it requires at least 6 pixels (18 bytes)
  // to be left; that keeps the load within the counted source pixels.
  for (; remaining >= 6; remaining -= 4, in += 4 * 3, out += 4 * 4) {
    const __m128i packed = _mm_lddqu_si128((const __m128i*)(const void*)in);
    const __m128i spread = _mm_shuffle_epi8(packed, spread_3_to_4);
    _mm_storeu_si128((__m128i*)(void*)out,
                     _mm_or_si128(spread, fourth_byte_ff));
  }

  // Scalar path for the leftover pixels. All three source bytes are read
  // before any destination byte is written.
  for (; remaining >= 1; remaining -= 1, in += 1 * 3, out += 1 * 4) {
    const uint8_t c0 = in[0];
    const uint8_t c1 = in[1];
    const uint8_t c2 = in[2];
    out[0] = c0;
    out[1] = c1;
    out[2] = c2;
    out[3] = 0xFF;
  }

  return num_pixels;
}

WUFFS_BASE__MAYBE_ATTRIBUTE_TARGET("pclmul,popcnt,sse4.2")
static uint64_t //
wuffs_private_impl__swizzle_bgrw__rgb__x86_sse42(uint8_t* dst_ptr,
Expand Down Expand Up @@ -25734,6 +25795,11 @@ wuffs_private_impl__pixel_swizzler__prepare__bgr(
case WUFFS_BASE__PIXEL_FORMAT__BGRA_PREMUL:
case WUFFS_BASE__PIXEL_FORMAT__BGRA_BINARY:
case WUFFS_BASE__PIXEL_FORMAT__BGRX:
#if defined(WUFFS_BASE__CPU_ARCH__X86_FAMILY)
if (wuffs_base__cpu_arch__have_x86_sse42()) {
return wuffs_private_impl__swizzle_bgrw__bgr__x86_sse42;
}
#endif
return wuffs_private_impl__swizzle_bgrw__bgr;

case WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL_4X16LE:
Expand Down Expand Up @@ -26108,6 +26174,11 @@ wuffs_private_impl__pixel_swizzler__prepare__rgb(
case WUFFS_BASE__PIXEL_FORMAT__RGBA_PREMUL:
case WUFFS_BASE__PIXEL_FORMAT__RGBA_BINARY:
case WUFFS_BASE__PIXEL_FORMAT__RGBX:
#if defined(WUFFS_BASE__CPU_ARCH__X86_FAMILY)
if (wuffs_base__cpu_arch__have_x86_sse42()) {
return wuffs_private_impl__swizzle_bgrw__bgr__x86_sse42;
}
#endif
return wuffs_private_impl__swizzle_bgrw__bgr;
}
return NULL;
Expand Down

0 comments on commit a667685

Please sign in to comment.