From 84b026690ded7f7728b6d1ba48b233b6ca8317eb Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Thu, 6 Jun 2024 16:44:07 +0200 Subject: [PATCH] DAG: Pass flags to FoldConstantArithmetic (#93663) There is simply way too much going on inside getNode. The complicated vector constant-folding path works by looking for build_vector operands, calling getNode on the scalar elements, and then checking whether the results are constants. As a side effect, this produces unused scalar operation nodes (previously, without flags). If the vector operation is later scalarized, it finds the flagless constant-folding temporary and loses the flags. I don't think this is a reasonable way for constant folding to operate, but for now fix this by ensuring the flags on the original operation are preserved in the temporary. This yields a clear code improvement for AMDGPU when f16 isn't legal. The Wasm cases switch from using a libcall to compare and select. We are evidently missing the fcmp+select to fminimum/fmaximum handling, but this would be further improved when that's handled. AArch64 also avoids the libcall, but looks worse and has a different call for some reason. 
--- llvm/include/llvm/CodeGen/SelectionDAG.h | 3 +- .../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 7 +- .../AArch64/vecreduce-fmax-legalization.ll | 14 +- .../AArch64/vecreduce-fmin-legalization.ll | 14 +- llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll | 138 ++++------- llvm/test/CodeGen/WebAssembly/simd-arith.ll | 220 ++++++++++-------- 6 files changed, 202 insertions(+), 194 deletions(-) diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h index 48cb0cdf851cf0..7b0e5e7d9504bf 100644 --- a/llvm/include/llvm/CodeGen/SelectionDAG.h +++ b/llvm/include/llvm/CodeGen/SelectionDAG.h @@ -1893,7 +1893,8 @@ class SelectionDAG { const SDNode *N2); SDValue FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL, EVT VT, - ArrayRef Ops); + ArrayRef Ops, + SDNodeFlags Flags = SDNodeFlags()); /// Fold floating-point operations when all operands are constants and/or /// undefined. diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 09cdec8adb2755..e176cf2cc2a6cf 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -6333,7 +6333,8 @@ bool SelectionDAG::isUndef(unsigned Opcode, ArrayRef Ops) { } SDValue SelectionDAG::FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL, - EVT VT, ArrayRef Ops) { + EVT VT, ArrayRef Ops, + SDNodeFlags Flags) { // If the opcode is a target-specific ISD node, there's nothing we can // do here and the operand rules may not line up with the below, so // bail early. @@ -6690,7 +6691,7 @@ SDValue SelectionDAG::FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL, } // Constant fold the scalar operands. - SDValue ScalarResult = getNode(Opcode, DL, SVT, ScalarOps); + SDValue ScalarResult = getNode(Opcode, DL, SVT, ScalarOps, Flags); // Legalize the (integer) scalar constant if necessary. 
if (LegalSVT != SVT) @@ -7261,7 +7262,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, } // Perform trivial constant folding. - if (SDValue SV = FoldConstantArithmetic(Opcode, DL, VT, {N1, N2})) + if (SDValue SV = FoldConstantArithmetic(Opcode, DL, VT, {N1, N2}, Flags)) return SV; // Canonicalize an UNDEF to the RHS, even over a constant. diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization.ll b/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization.ll index 4c02a5240ba6af..c993051ccebf7c 100644 --- a/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization.ll +++ b/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization.ll @@ -648,7 +648,19 @@ define float @test_v3f32_ninf(<3 x float> %a) nounwind { define fp128 @test_v2f128(<2 x fp128> %a) nounwind { ; CHECK-LABEL: test_v2f128: ; CHECK: // %bb.0: -; CHECK-NEXT: b fmaxl +; CHECK-NEXT: sub sp, sp, #48 +; CHECK-NEXT: str x30, [sp, #32] // 8-byte Folded Spill +; CHECK-NEXT: stp q0, q1, [sp] // 32-byte Folded Spill +; CHECK-NEXT: bl __gttf2 +; CHECK-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: cmp w0, #0 +; CHECK-NEXT: b.le .LBB18_2 +; CHECK-NEXT: // %bb.1: +; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: .LBB18_2: +; CHECK-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload +; CHECK-NEXT: add sp, sp, #48 +; CHECK-NEXT: ret %b = call nnan fp128 @llvm.vector.reduce.fmax.v2f128(<2 x fp128> %a) ret fp128 %b } diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fmin-legalization.ll b/llvm/test/CodeGen/AArch64/vecreduce-fmin-legalization.ll index 18d40cb18ba609..0116be51dd696a 100644 --- a/llvm/test/CodeGen/AArch64/vecreduce-fmin-legalization.ll +++ b/llvm/test/CodeGen/AArch64/vecreduce-fmin-legalization.ll @@ -648,7 +648,19 @@ define float @test_v3f32_ninf(<3 x float> %a) nounwind { define fp128 @test_v2f128(<2 x fp128> %a) nounwind { ; CHECK-LABEL: test_v2f128: ; CHECK: // %bb.0: -; CHECK-NEXT: b fminl +; CHECK-NEXT: sub sp, sp, #48 +; CHECK-NEXT: str 
x30, [sp, #32] // 8-byte Folded Spill +; CHECK-NEXT: stp q0, q1, [sp] // 32-byte Folded Spill +; CHECK-NEXT: bl __lttf2 +; CHECK-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: cmp w0, #0 +; CHECK-NEXT: b.ge .LBB18_2 +; CHECK-NEXT: // %bb.1: +; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: .LBB18_2: +; CHECK-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload +; CHECK-NEXT: add sp, sp, #48 +; CHECK-NEXT: ret %b = call nnan fp128 @llvm.vector.reduce.fmin.v2f128(<2 x fp128> %a) ret fp128 %b } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll index 7d7a462597104f..0c13c5348879b3 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll @@ -654,21 +654,16 @@ define <2 x half> @v_maximum_v2f16__nnan(<2 x half> %src0, <2 x half> %src1) { ; GFX7-LABEL: v_maximum_v2f16__nnan: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_mov_b32_e32 v5, 0x7fc00000 -; GFX7-NEXT: v_max_f32_e32 v4, v0, v2 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc -; GFX7-NEXT: v_max_f32_e32 v2, v1, v3 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v1, v3 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc +; GFX7-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX7-NEXT: v_max_f32_e32 v1, v1, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_maximum_v2f16__nnan: @@ -847,21 +842,16 @@ define <2 x half> @v_maximum_v2f16__nnan_nsz(<2 x half> %src0, <2 x half> %src1) ; GFX7-LABEL: v_maximum_v2f16__nnan_nsz: ; GFX7: ; %bb.0: ; GFX7-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_mov_b32_e32 v5, 0x7fc00000 -; GFX7-NEXT: v_max_f32_e32 v4, v0, v2 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc -; GFX7-NEXT: v_max_f32_e32 v2, v1, v3 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v1, v3 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc +; GFX7-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX7-NEXT: v_max_f32_e32 v1, v1, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_maximum_v2f16__nnan_nsz: @@ -1216,28 +1206,21 @@ define <3 x half> @v_maximum_v3f16__nnan(<3 x half> %src0, <3 x half> %src1) { ; GFX7-LABEL: v_maximum_v3f16__nnan: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_max_f32_e32 v6, v0, v3 -; GFX7-NEXT: v_mov_b32_e32 v7, 0x7fc00000 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v3 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc -; GFX7-NEXT: v_max_f32_e32 v3, v1, v4 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v1, v4 
-; GFX7-NEXT: v_cndmask_b32_e32 v1, v7, v3, vcc -; GFX7-NEXT: v_max_f32_e32 v3, v2, v5 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v2, v5 -; GFX7-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc +; GFX7-NEXT: v_max_f32_e32 v0, v0, v3 +; GFX7-NEXT: v_max_f32_e32 v1, v1, v4 +; GFX7-NEXT: v_max_f32_e32 v2, v2, v5 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_maximum_v3f16__nnan: @@ -1427,28 +1410,21 @@ define <3 x half> @v_maximum_v3f16__nnan_nsz(<3 x half> %src0, <3 x half> %src1) ; GFX7-LABEL: v_maximum_v3f16__nnan_nsz: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_max_f32_e32 v6, v0, v3 -; GFX7-NEXT: v_mov_b32_e32 v7, 0x7fc00000 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v3 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc -; GFX7-NEXT: v_max_f32_e32 v3, v1, v4 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v1, v4 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v7, v3, vcc -; GFX7-NEXT: v_max_f32_e32 v3, v2, v5 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v2, v5 -; GFX7-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc +; GFX7-NEXT: v_max_f32_e32 v0, v0, v3 +; GFX7-NEXT: v_max_f32_e32 v1, v1, v4 +; GFX7-NEXT: v_max_f32_e32 v2, v2, v5 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_maximum_v3f16__nnan_nsz: @@ -1671,35 +1647,26 @@ define <4 x half> @v_maximum_v4f16__nnan(<4 x half> %src0, <4 x half> %src1) { ; GFX7-LABEL: 
v_maximum_v4f16__nnan: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_max_f32_e32 v8, v0, v4 -; GFX7-NEXT: v_mov_b32_e32 v9, 0x7fc00000 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v4 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v9, v8, vcc -; GFX7-NEXT: v_max_f32_e32 v4, v1, v5 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v1, v5 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v9, v4, vcc -; GFX7-NEXT: v_max_f32_e32 v4, v2, v6 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v2, v6 -; GFX7-NEXT: v_cndmask_b32_e32 v2, v9, v4, vcc -; GFX7-NEXT: v_max_f32_e32 v4, v3, v7 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v3, v7 -; GFX7-NEXT: v_cndmask_b32_e32 v3, v9, v4, vcc +; GFX7-NEXT: v_max_f32_e32 v0, v0, v4 +; GFX7-NEXT: v_max_f32_e32 v1, v1, v5 +; GFX7-NEXT: v_max_f32_e32 v2, v2, v6 +; GFX7-NEXT: v_max_f32_e32 v3, v3, v7 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_maximum_v4f16__nnan: @@ -1924,35 +1891,26 @@ define <4 x half> @v_maximum_v4f16__nnan_nsz(<4 x half> %src0, <4 x half> %src1) ; GFX7-LABEL: v_maximum_v4f16__nnan_nsz: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_max_f32_e32 v8, v0, v4 -; GFX7-NEXT: v_mov_b32_e32 v9, 0x7fc00000 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v4 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v9, v8, vcc -; GFX7-NEXT: v_max_f32_e32 v4, v1, v5 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v1, v5 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v9, v4, vcc -; GFX7-NEXT: v_max_f32_e32 v4, v2, v6 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v2, v6 -; GFX7-NEXT: v_cndmask_b32_e32 v2, v9, v4, vcc -; GFX7-NEXT: v_max_f32_e32 v4, v3, v7 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v3, v7 -; GFX7-NEXT: v_cndmask_b32_e32 v3, v9, v4, vcc +; GFX7-NEXT: v_max_f32_e32 v0, v0, v4 +; GFX7-NEXT: v_max_f32_e32 v1, v1, v5 +; GFX7-NEXT: v_max_f32_e32 v2, v2, v6 +; GFX7-NEXT: v_max_f32_e32 v3, v3, v7 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_maximum_v4f16__nnan_nsz: diff --git a/llvm/test/CodeGen/WebAssembly/simd-arith.ll b/llvm/test/CodeGen/WebAssembly/simd-arith.ll index 761a75418a00f8..67388b688e3bb7 100644 --- a/llvm/test/CodeGen/WebAssembly/simd-arith.ll +++ b/llvm/test/CodeGen/WebAssembly/simd-arith.ll @@ 
-11788,27 +11788,35 @@ define <4 x float> @minnum_intrinsic_v4f32(<4 x float> %x, <4 x float> %y) { ; NO-SIMD128-LABEL: minnum_intrinsic_v4f32: ; NO-SIMD128: .functype minnum_intrinsic_v4f32 (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> () ; NO-SIMD128-NEXT: # %bb.0: -; NO-SIMD128-NEXT: call $push0=, fminf, $4, $8 -; NO-SIMD128-NEXT: f32.store 12($0), $pop0 -; NO-SIMD128-NEXT: call $push1=, fminf, $3, $7 -; NO-SIMD128-NEXT: f32.store 8($0), $pop1 -; NO-SIMD128-NEXT: call $push2=, fminf, $2, $6 -; NO-SIMD128-NEXT: f32.store 4($0), $pop2 -; NO-SIMD128-NEXT: call $push3=, fminf, $1, $5 -; NO-SIMD128-NEXT: f32.store 0($0), $pop3 +; NO-SIMD128-NEXT: f32.lt $push0=, $4, $8 +; NO-SIMD128-NEXT: f32.select $push1=, $4, $8, $pop0 +; NO-SIMD128-NEXT: f32.store 12($0), $pop1 +; NO-SIMD128-NEXT: f32.lt $push2=, $3, $7 +; NO-SIMD128-NEXT: f32.select $push3=, $3, $7, $pop2 +; NO-SIMD128-NEXT: f32.store 8($0), $pop3 +; NO-SIMD128-NEXT: f32.lt $push4=, $2, $6 +; NO-SIMD128-NEXT: f32.select $push5=, $2, $6, $pop4 +; NO-SIMD128-NEXT: f32.store 4($0), $pop5 +; NO-SIMD128-NEXT: f32.lt $push6=, $1, $5 +; NO-SIMD128-NEXT: f32.select $push7=, $1, $5, $pop6 +; NO-SIMD128-NEXT: f32.store 0($0), $pop7 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: minnum_intrinsic_v4f32: ; NO-SIMD128-FAST: .functype minnum_intrinsic_v4f32 (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> () ; NO-SIMD128-FAST-NEXT: # %bb.0: -; NO-SIMD128-FAST-NEXT: call $push0=, fminf, $1, $5 -; NO-SIMD128-FAST-NEXT: f32.store 0($0), $pop0 -; NO-SIMD128-FAST-NEXT: call $push1=, fminf, $2, $6 -; NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop1 -; NO-SIMD128-FAST-NEXT: call $push2=, fminf, $3, $7 -; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop2 -; NO-SIMD128-FAST-NEXT: call $push3=, fminf, $4, $8 -; NO-SIMD128-FAST-NEXT: f32.store 12($0), $pop3 +; NO-SIMD128-FAST-NEXT: f32.lt $push0=, $1, $5 +; NO-SIMD128-FAST-NEXT: f32.select $push1=, $1, $5, $pop0 +; NO-SIMD128-FAST-NEXT: f32.store 0($0), $pop1 +; NO-SIMD128-FAST-NEXT: 
f32.lt $push2=, $2, $6 +; NO-SIMD128-FAST-NEXT: f32.select $push3=, $2, $6, $pop2 +; NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop3 +; NO-SIMD128-FAST-NEXT: f32.lt $push4=, $3, $7 +; NO-SIMD128-FAST-NEXT: f32.select $push5=, $3, $7, $pop4 +; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop5 +; NO-SIMD128-FAST-NEXT: f32.lt $push6=, $4, $8 +; NO-SIMD128-FAST-NEXT: f32.select $push7=, $4, $8, $pop6 +; NO-SIMD128-FAST-NEXT: f32.store 12($0), $pop7 ; NO-SIMD128-FAST-NEXT: return %a = call nnan <4 x float> @llvm.minnum.v4f32(<4 x float> %x, <4 x float> %y) ret <4 x float> %a @@ -11830,26 +11838,26 @@ define <4 x float> @minnum_nsz_intrinsic_v4f32(<4 x float> %x, <4 x float> %y) { ; NO-SIMD128-LABEL: minnum_nsz_intrinsic_v4f32: ; NO-SIMD128: .functype minnum_nsz_intrinsic_v4f32 (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> () ; NO-SIMD128-NEXT: # %bb.0: -; NO-SIMD128-NEXT: call $push0=, fminf, $4, $8 +; NO-SIMD128-NEXT: f32.min $push0=, $4, $8 ; NO-SIMD128-NEXT: f32.store 12($0), $pop0 -; NO-SIMD128-NEXT: call $push1=, fminf, $3, $7 +; NO-SIMD128-NEXT: f32.min $push1=, $3, $7 ; NO-SIMD128-NEXT: f32.store 8($0), $pop1 -; NO-SIMD128-NEXT: call $push2=, fminf, $2, $6 +; NO-SIMD128-NEXT: f32.min $push2=, $2, $6 ; NO-SIMD128-NEXT: f32.store 4($0), $pop2 -; NO-SIMD128-NEXT: call $push3=, fminf, $1, $5 +; NO-SIMD128-NEXT: f32.min $push3=, $1, $5 ; NO-SIMD128-NEXT: f32.store 0($0), $pop3 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: minnum_nsz_intrinsic_v4f32: ; NO-SIMD128-FAST: .functype minnum_nsz_intrinsic_v4f32 (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> () ; NO-SIMD128-FAST-NEXT: # %bb.0: -; NO-SIMD128-FAST-NEXT: call $push0=, fminf, $1, $5 +; NO-SIMD128-FAST-NEXT: f32.min $push0=, $1, $5 ; NO-SIMD128-FAST-NEXT: f32.store 0($0), $pop0 -; NO-SIMD128-FAST-NEXT: call $push1=, fminf, $2, $6 +; NO-SIMD128-FAST-NEXT: f32.min $push1=, $2, $6 ; NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop1 -; NO-SIMD128-FAST-NEXT: call $push2=, fminf, $3, $7 +; NO-SIMD128-FAST-NEXT: f32.min 
$push2=, $3, $7 ; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop2 -; NO-SIMD128-FAST-NEXT: call $push3=, fminf, $4, $8 +; NO-SIMD128-FAST-NEXT: f32.min $push3=, $4, $8 ; NO-SIMD128-FAST-NEXT: f32.store 12($0), $pop3 ; NO-SIMD128-FAST-NEXT: return %a = call nnan nsz <4 x float> @llvm.minnum.v4f32(<4 x float> %x, <4 x float> %y) @@ -11875,16 +11883,16 @@ define <4 x float> @fminnumv432_non_zero_intrinsic(<4 x float> %x) { ; NO-SIMD128: .functype fminnumv432_non_zero_intrinsic (i32, f32, f32, f32, f32) -> () ; NO-SIMD128-NEXT: # %bb.0: ; NO-SIMD128-NEXT: f32.const $push0=, -0x1p0 -; NO-SIMD128-NEXT: call $push1=, fminf, $4, $pop0 +; NO-SIMD128-NEXT: f32.min $push1=, $4, $pop0 ; NO-SIMD128-NEXT: f32.store 12($0), $pop1 ; NO-SIMD128-NEXT: f32.const $push7=, -0x1p0 -; NO-SIMD128-NEXT: call $push2=, fminf, $3, $pop7 +; NO-SIMD128-NEXT: f32.min $push2=, $3, $pop7 ; NO-SIMD128-NEXT: f32.store 8($0), $pop2 ; NO-SIMD128-NEXT: f32.const $push6=, -0x1p0 -; NO-SIMD128-NEXT: call $push3=, fminf, $2, $pop6 +; NO-SIMD128-NEXT: f32.min $push3=, $2, $pop6 ; NO-SIMD128-NEXT: f32.store 4($0), $pop3 ; NO-SIMD128-NEXT: f32.const $push5=, -0x1p0 -; NO-SIMD128-NEXT: call $push4=, fminf, $1, $pop5 +; NO-SIMD128-NEXT: f32.min $push4=, $1, $pop5 ; NO-SIMD128-NEXT: f32.store 0($0), $pop4 ; NO-SIMD128-NEXT: return ; @@ -11892,16 +11900,16 @@ define <4 x float> @fminnumv432_non_zero_intrinsic(<4 x float> %x) { ; NO-SIMD128-FAST: .functype fminnumv432_non_zero_intrinsic (i32, f32, f32, f32, f32) -> () ; NO-SIMD128-FAST-NEXT: # %bb.0: ; NO-SIMD128-FAST-NEXT: f32.const $push0=, -0x1p0 -; NO-SIMD128-FAST-NEXT: call $push1=, fminf, $1, $pop0 +; NO-SIMD128-FAST-NEXT: f32.min $push1=, $1, $pop0 ; NO-SIMD128-FAST-NEXT: f32.store 0($0), $pop1 ; NO-SIMD128-FAST-NEXT: f32.const $push7=, -0x1p0 -; NO-SIMD128-FAST-NEXT: call $push2=, fminf, $2, $pop7 +; NO-SIMD128-FAST-NEXT: f32.min $push2=, $2, $pop7 ; NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop2 ; NO-SIMD128-FAST-NEXT: f32.const $push6=, -0x1p0 -; 
NO-SIMD128-FAST-NEXT: call $push3=, fminf, $3, $pop6 +; NO-SIMD128-FAST-NEXT: f32.min $push3=, $3, $pop6 ; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop3 ; NO-SIMD128-FAST-NEXT: f32.const $push5=, -0x1p0 -; NO-SIMD128-FAST-NEXT: call $push4=, fminf, $4, $pop5 +; NO-SIMD128-FAST-NEXT: f32.min $push4=, $4, $pop5 ; NO-SIMD128-FAST-NEXT: f32.store 12($0), $pop4 ; NO-SIMD128-FAST-NEXT: return %a = call nnan <4 x float> @llvm.minnum.v4f32(<4 x float> %x, <4 x float>) @@ -11979,34 +11987,38 @@ define <4 x float> @fminnumv432_one_zero_intrinsic(<4 x float> %x) { ; NO-SIMD128: .functype fminnumv432_one_zero_intrinsic (i32, f32, f32, f32, f32) -> () ; NO-SIMD128-NEXT: # %bb.0: ; NO-SIMD128-NEXT: f32.const $push0=, -0x1p0 -; NO-SIMD128-NEXT: call $push1=, fminf, $4, $pop0 +; NO-SIMD128-NEXT: f32.min $push1=, $4, $pop0 ; NO-SIMD128-NEXT: f32.store 12($0), $pop1 -; NO-SIMD128-NEXT: f32.const $push7=, -0x1p0 -; NO-SIMD128-NEXT: call $push2=, fminf, $3, $pop7 +; NO-SIMD128-NEXT: f32.const $push9=, -0x1p0 +; NO-SIMD128-NEXT: f32.min $push2=, $3, $pop9 ; NO-SIMD128-NEXT: f32.store 8($0), $pop2 ; NO-SIMD128-NEXT: f32.const $push3=, 0x0p0 -; NO-SIMD128-NEXT: call $push4=, fminf, $2, $pop3 -; NO-SIMD128-NEXT: f32.store 4($0), $pop4 -; NO-SIMD128-NEXT: f32.const $push6=, -0x1p0 -; NO-SIMD128-NEXT: call $push5=, fminf, $1, $pop6 -; NO-SIMD128-NEXT: f32.store 0($0), $pop5 +; NO-SIMD128-NEXT: f32.const $push8=, 0x0p0 +; NO-SIMD128-NEXT: f32.lt $push4=, $2, $pop8 +; NO-SIMD128-NEXT: f32.select $push5=, $2, $pop3, $pop4 +; NO-SIMD128-NEXT: f32.store 4($0), $pop5 +; NO-SIMD128-NEXT: f32.const $push7=, -0x1p0 +; NO-SIMD128-NEXT: f32.min $push6=, $1, $pop7 +; NO-SIMD128-NEXT: f32.store 0($0), $pop6 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: fminnumv432_one_zero_intrinsic: ; NO-SIMD128-FAST: .functype fminnumv432_one_zero_intrinsic (i32, f32, f32, f32, f32) -> () ; NO-SIMD128-FAST-NEXT: # %bb.0: ; NO-SIMD128-FAST-NEXT: f32.const $push0=, -0x1p0 -; NO-SIMD128-FAST-NEXT: call $push1=, 
fminf, $1, $pop0 +; NO-SIMD128-FAST-NEXT: f32.min $push1=, $1, $pop0 ; NO-SIMD128-FAST-NEXT: f32.store 0($0), $pop1 -; NO-SIMD128-FAST-NEXT: f32.const $push2=, 0x0p0 -; NO-SIMD128-FAST-NEXT: call $push3=, fminf, $2, $pop2 -; NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop3 +; NO-SIMD128-FAST-NEXT: f32.const $push9=, -0x1p0 +; NO-SIMD128-FAST-NEXT: f32.min $push2=, $3, $pop9 +; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop2 +; NO-SIMD128-FAST-NEXT: f32.const $push3=, 0x0p0 +; NO-SIMD128-FAST-NEXT: f32.const $push8=, 0x0p0 +; NO-SIMD128-FAST-NEXT: f32.lt $push4=, $2, $pop8 +; NO-SIMD128-FAST-NEXT: f32.select $push5=, $2, $pop3, $pop4 +; NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop5 ; NO-SIMD128-FAST-NEXT: f32.const $push7=, -0x1p0 -; NO-SIMD128-FAST-NEXT: call $push4=, fminf, $3, $pop7 -; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop4 -; NO-SIMD128-FAST-NEXT: f32.const $push6=, -0x1p0 -; NO-SIMD128-FAST-NEXT: call $push5=, fminf, $4, $pop6 -; NO-SIMD128-FAST-NEXT: f32.store 12($0), $pop5 +; NO-SIMD128-FAST-NEXT: f32.min $push6=, $4, $pop7 +; NO-SIMD128-FAST-NEXT: f32.store 12($0), $pop6 ; NO-SIMD128-FAST-NEXT: return %a = call nnan <4 x float> @llvm.minnum.v4f32(<4 x float> %x, <4 x float>) ret <4 x float> %a @@ -12126,27 +12138,35 @@ define <4 x float> @maxnum_intrinsic_v4f32(<4 x float> %x, <4 x float> %y) { ; NO-SIMD128-LABEL: maxnum_intrinsic_v4f32: ; NO-SIMD128: .functype maxnum_intrinsic_v4f32 (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> () ; NO-SIMD128-NEXT: # %bb.0: -; NO-SIMD128-NEXT: call $push0=, fmaxf, $4, $8 -; NO-SIMD128-NEXT: f32.store 12($0), $pop0 -; NO-SIMD128-NEXT: call $push1=, fmaxf, $3, $7 -; NO-SIMD128-NEXT: f32.store 8($0), $pop1 -; NO-SIMD128-NEXT: call $push2=, fmaxf, $2, $6 -; NO-SIMD128-NEXT: f32.store 4($0), $pop2 -; NO-SIMD128-NEXT: call $push3=, fmaxf, $1, $5 -; NO-SIMD128-NEXT: f32.store 0($0), $pop3 +; NO-SIMD128-NEXT: f32.gt $push0=, $4, $8 +; NO-SIMD128-NEXT: f32.select $push1=, $4, $8, $pop0 +; NO-SIMD128-NEXT: f32.store 12($0), $pop1 
+; NO-SIMD128-NEXT: f32.gt $push2=, $3, $7 +; NO-SIMD128-NEXT: f32.select $push3=, $3, $7, $pop2 +; NO-SIMD128-NEXT: f32.store 8($0), $pop3 +; NO-SIMD128-NEXT: f32.gt $push4=, $2, $6 +; NO-SIMD128-NEXT: f32.select $push5=, $2, $6, $pop4 +; NO-SIMD128-NEXT: f32.store 4($0), $pop5 +; NO-SIMD128-NEXT: f32.gt $push6=, $1, $5 +; NO-SIMD128-NEXT: f32.select $push7=, $1, $5, $pop6 +; NO-SIMD128-NEXT: f32.store 0($0), $pop7 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: maxnum_intrinsic_v4f32: ; NO-SIMD128-FAST: .functype maxnum_intrinsic_v4f32 (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> () ; NO-SIMD128-FAST-NEXT: # %bb.0: -; NO-SIMD128-FAST-NEXT: call $push0=, fmaxf, $1, $5 -; NO-SIMD128-FAST-NEXT: f32.store 0($0), $pop0 -; NO-SIMD128-FAST-NEXT: call $push1=, fmaxf, $2, $6 -; NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop1 -; NO-SIMD128-FAST-NEXT: call $push2=, fmaxf, $3, $7 -; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop2 -; NO-SIMD128-FAST-NEXT: call $push3=, fmaxf, $4, $8 -; NO-SIMD128-FAST-NEXT: f32.store 12($0), $pop3 +; NO-SIMD128-FAST-NEXT: f32.gt $push0=, $1, $5 +; NO-SIMD128-FAST-NEXT: f32.select $push1=, $1, $5, $pop0 +; NO-SIMD128-FAST-NEXT: f32.store 0($0), $pop1 +; NO-SIMD128-FAST-NEXT: f32.gt $push2=, $2, $6 +; NO-SIMD128-FAST-NEXT: f32.select $push3=, $2, $6, $pop2 +; NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop3 +; NO-SIMD128-FAST-NEXT: f32.gt $push4=, $3, $7 +; NO-SIMD128-FAST-NEXT: f32.select $push5=, $3, $7, $pop4 +; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop5 +; NO-SIMD128-FAST-NEXT: f32.gt $push6=, $4, $8 +; NO-SIMD128-FAST-NEXT: f32.select $push7=, $4, $8, $pop6 +; NO-SIMD128-FAST-NEXT: f32.store 12($0), $pop7 ; NO-SIMD128-FAST-NEXT: return %a = call nnan <4 x float> @llvm.maxnum.v4f32(<4 x float> %x, <4 x float> %y) ret <4 x float> %a @@ -12168,26 +12188,26 @@ define <4 x float> @maxnum_nsz_intrinsic_v4f32(<4 x float> %x, <4 x float> %y) { ; NO-SIMD128-LABEL: maxnum_nsz_intrinsic_v4f32: ; NO-SIMD128: .functype maxnum_nsz_intrinsic_v4f32 (i32, 
f32, f32, f32, f32, f32, f32, f32, f32) -> () ; NO-SIMD128-NEXT: # %bb.0: -; NO-SIMD128-NEXT: call $push0=, fmaxf, $4, $8 +; NO-SIMD128-NEXT: f32.max $push0=, $4, $8 ; NO-SIMD128-NEXT: f32.store 12($0), $pop0 -; NO-SIMD128-NEXT: call $push1=, fmaxf, $3, $7 +; NO-SIMD128-NEXT: f32.max $push1=, $3, $7 ; NO-SIMD128-NEXT: f32.store 8($0), $pop1 -; NO-SIMD128-NEXT: call $push2=, fmaxf, $2, $6 +; NO-SIMD128-NEXT: f32.max $push2=, $2, $6 ; NO-SIMD128-NEXT: f32.store 4($0), $pop2 -; NO-SIMD128-NEXT: call $push3=, fmaxf, $1, $5 +; NO-SIMD128-NEXT: f32.max $push3=, $1, $5 ; NO-SIMD128-NEXT: f32.store 0($0), $pop3 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: maxnum_nsz_intrinsic_v4f32: ; NO-SIMD128-FAST: .functype maxnum_nsz_intrinsic_v4f32 (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> () ; NO-SIMD128-FAST-NEXT: # %bb.0: -; NO-SIMD128-FAST-NEXT: call $push0=, fmaxf, $1, $5 +; NO-SIMD128-FAST-NEXT: f32.max $push0=, $1, $5 ; NO-SIMD128-FAST-NEXT: f32.store 0($0), $pop0 -; NO-SIMD128-FAST-NEXT: call $push1=, fmaxf, $2, $6 +; NO-SIMD128-FAST-NEXT: f32.max $push1=, $2, $6 ; NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop1 -; NO-SIMD128-FAST-NEXT: call $push2=, fmaxf, $3, $7 +; NO-SIMD128-FAST-NEXT: f32.max $push2=, $3, $7 ; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop2 -; NO-SIMD128-FAST-NEXT: call $push3=, fmaxf, $4, $8 +; NO-SIMD128-FAST-NEXT: f32.max $push3=, $4, $8 ; NO-SIMD128-FAST-NEXT: f32.store 12($0), $pop3 ; NO-SIMD128-FAST-NEXT: return %a = call nnan nsz <4 x float> @llvm.maxnum.v4f32(<4 x float> %x, <4 x float> %y) @@ -12265,34 +12285,38 @@ define <4 x float> @maxnum_one_zero_intrinsic_v4f32(<4 x float> %x, <4 x float> ; NO-SIMD128: .functype maxnum_one_zero_intrinsic_v4f32 (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> () ; NO-SIMD128-NEXT: # %bb.0: ; NO-SIMD128-NEXT: f32.const $push0=, -0x1p0 -; NO-SIMD128-NEXT: call $push1=, fmaxf, $4, $pop0 +; NO-SIMD128-NEXT: f32.max $push1=, $4, $pop0 ; NO-SIMD128-NEXT: f32.store 12($0), $pop1 -; NO-SIMD128-NEXT: 
f32.const $push7=, -0x1p0 -; NO-SIMD128-NEXT: call $push2=, fmaxf, $3, $pop7 +; NO-SIMD128-NEXT: f32.const $push9=, -0x1p0 +; NO-SIMD128-NEXT: f32.max $push2=, $3, $pop9 ; NO-SIMD128-NEXT: f32.store 8($0), $pop2 ; NO-SIMD128-NEXT: f32.const $push3=, 0x0p0 -; NO-SIMD128-NEXT: call $push4=, fmaxf, $2, $pop3 -; NO-SIMD128-NEXT: f32.store 4($0), $pop4 -; NO-SIMD128-NEXT: f32.const $push6=, -0x1p0 -; NO-SIMD128-NEXT: call $push5=, fmaxf, $1, $pop6 -; NO-SIMD128-NEXT: f32.store 0($0), $pop5 +; NO-SIMD128-NEXT: f32.const $push8=, 0x0p0 +; NO-SIMD128-NEXT: f32.gt $push4=, $2, $pop8 +; NO-SIMD128-NEXT: f32.select $push5=, $2, $pop3, $pop4 +; NO-SIMD128-NEXT: f32.store 4($0), $pop5 +; NO-SIMD128-NEXT: f32.const $push7=, -0x1p0 +; NO-SIMD128-NEXT: f32.max $push6=, $1, $pop7 +; NO-SIMD128-NEXT: f32.store 0($0), $pop6 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: maxnum_one_zero_intrinsic_v4f32: ; NO-SIMD128-FAST: .functype maxnum_one_zero_intrinsic_v4f32 (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> () ; NO-SIMD128-FAST-NEXT: # %bb.0: ; NO-SIMD128-FAST-NEXT: f32.const $push0=, -0x1p0 -; NO-SIMD128-FAST-NEXT: call $push1=, fmaxf, $1, $pop0 +; NO-SIMD128-FAST-NEXT: f32.max $push1=, $1, $pop0 ; NO-SIMD128-FAST-NEXT: f32.store 0($0), $pop1 -; NO-SIMD128-FAST-NEXT: f32.const $push2=, 0x0p0 -; NO-SIMD128-FAST-NEXT: call $push3=, fmaxf, $2, $pop2 -; NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop3 +; NO-SIMD128-FAST-NEXT: f32.const $push9=, -0x1p0 +; NO-SIMD128-FAST-NEXT: f32.max $push2=, $3, $pop9 +; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop2 +; NO-SIMD128-FAST-NEXT: f32.const $push3=, 0x0p0 +; NO-SIMD128-FAST-NEXT: f32.const $push8=, 0x0p0 +; NO-SIMD128-FAST-NEXT: f32.gt $push4=, $2, $pop8 +; NO-SIMD128-FAST-NEXT: f32.select $push5=, $2, $pop3, $pop4 +; NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop5 ; NO-SIMD128-FAST-NEXT: f32.const $push7=, -0x1p0 -; NO-SIMD128-FAST-NEXT: call $push4=, fmaxf, $3, $pop7 -; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop4 -; NO-SIMD128-FAST-NEXT: 
f32.const $push6=, -0x1p0 -; NO-SIMD128-FAST-NEXT: call $push5=, fmaxf, $4, $pop6 -; NO-SIMD128-FAST-NEXT: f32.store 12($0), $pop5 +; NO-SIMD128-FAST-NEXT: f32.max $push6=, $4, $pop7 +; NO-SIMD128-FAST-NEXT: f32.store 12($0), $pop6 ; NO-SIMD128-FAST-NEXT: return %a = call nnan <4 x float> @llvm.maxnum.v4f32(<4 x float> %x, <4 x float>) ret <4 x float> %a @@ -12317,16 +12341,16 @@ define <4 x float> @maxnum_non_zero_intrinsic_v4f32(<4 x float> %x, <4 x float> ; NO-SIMD128: .functype maxnum_non_zero_intrinsic_v4f32 (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> () ; NO-SIMD128-NEXT: # %bb.0: ; NO-SIMD128-NEXT: f32.const $push0=, -0x1p0 -; NO-SIMD128-NEXT: call $push1=, fmaxf, $4, $pop0 +; NO-SIMD128-NEXT: f32.max $push1=, $4, $pop0 ; NO-SIMD128-NEXT: f32.store 12($0), $pop1 ; NO-SIMD128-NEXT: f32.const $push7=, -0x1p0 -; NO-SIMD128-NEXT: call $push2=, fmaxf, $3, $pop7 +; NO-SIMD128-NEXT: f32.max $push2=, $3, $pop7 ; NO-SIMD128-NEXT: f32.store 8($0), $pop2 ; NO-SIMD128-NEXT: f32.const $push3=, 0x1p0 -; NO-SIMD128-NEXT: call $push4=, fmaxf, $2, $pop3 +; NO-SIMD128-NEXT: f32.max $push4=, $2, $pop3 ; NO-SIMD128-NEXT: f32.store 4($0), $pop4 ; NO-SIMD128-NEXT: f32.const $push6=, -0x1p0 -; NO-SIMD128-NEXT: call $push5=, fmaxf, $1, $pop6 +; NO-SIMD128-NEXT: f32.max $push5=, $1, $pop6 ; NO-SIMD128-NEXT: f32.store 0($0), $pop5 ; NO-SIMD128-NEXT: return ; @@ -12334,16 +12358,16 @@ define <4 x float> @maxnum_non_zero_intrinsic_v4f32(<4 x float> %x, <4 x float> ; NO-SIMD128-FAST: .functype maxnum_non_zero_intrinsic_v4f32 (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> () ; NO-SIMD128-FAST-NEXT: # %bb.0: ; NO-SIMD128-FAST-NEXT: f32.const $push0=, -0x1p0 -; NO-SIMD128-FAST-NEXT: call $push1=, fmaxf, $1, $pop0 +; NO-SIMD128-FAST-NEXT: f32.max $push1=, $1, $pop0 ; NO-SIMD128-FAST-NEXT: f32.store 0($0), $pop1 ; NO-SIMD128-FAST-NEXT: f32.const $push2=, 0x1p0 -; NO-SIMD128-FAST-NEXT: call $push3=, fmaxf, $2, $pop2 +; NO-SIMD128-FAST-NEXT: f32.max $push3=, $2, $pop2 ; 
NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop3 ; NO-SIMD128-FAST-NEXT: f32.const $push7=, -0x1p0 -; NO-SIMD128-FAST-NEXT: call $push4=, fmaxf, $3, $pop7 +; NO-SIMD128-FAST-NEXT: f32.max $push4=, $3, $pop7 ; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop4 ; NO-SIMD128-FAST-NEXT: f32.const $push6=, -0x1p0 -; NO-SIMD128-FAST-NEXT: call $push5=, fmaxf, $4, $pop6 +; NO-SIMD128-FAST-NEXT: f32.max $push5=, $4, $pop6 ; NO-SIMD128-FAST-NEXT: f32.store 12($0), $pop5 ; NO-SIMD128-FAST-NEXT: return %a = call nnan <4 x float> @llvm.maxnum.v4f32(<4 x float> %x, <4 x float>)