Skip to content

Commit

Permalink
Using the F16C instruction with LDC on Intel AVX
Browse files Browse the repository at this point in the history
  • Loading branch information
Guillaume Piolat committed Dec 1, 2024
1 parent a9cb0e6 commit 6910caa
Showing 1 changed file with 14 additions and 2 deletions.
16 changes: 14 additions & 2 deletions source/inteli/avxintrin.d
Original file line number Diff line number Diff line change
Expand Up @@ -4907,7 +4907,15 @@ __m128 _mm_cvtph_ps(__m128i a) pure @trusted
{
short8 sa = cast(short8)a;

// PERF F16C actual instruction
static if (LDC_with_F16C)
{
// Note: clang has a __builtin_ia32_vcvtph2ps256, but we don't have one.
// Note: LLVM IR fpext leads to a call to __extendhfsf2@PLT.
// Same with the pragma llvm.convert.from.fp16, so, not being sure
// what else to do, inline asm is used below.
return cast(__m128)__asm!(float4)("vcvtph2ps $1, $0", "=v,v", a);
}
else
{
// Reference: stb_image_resize2.h has F16C emulation.
// See:
Expand Down Expand Up @@ -4954,7 +4962,11 @@ unittest
/// Note: Preserve infinities, sign of zeroes, and NaN-ness.
__m256 _mm256_cvtph_ps(__m128i a) pure @trusted
{
// PERF F16C actual instruction
static if (LDC_with_F16C)
{
return __asm!(float8)("vcvtph2ps $1, $0", "=v,v", a);
}
else
{
// In stb_image_resize2.h, _mm_cvtph_ps is simply hand-inlined 2x
// so we do the same here.
Expand Down

0 comments on commit 6910caa

Please sign in to comment.