Skip to content

Commit

Permalink
DAG: Pass flags to FoldConstantArithmetic (llvm#93663)
Browse files Browse the repository at this point in the history
There is simply way too much going on inside getNode. The complicated
constant folding of vector operations works by looking for build_vector
operands, calling getNode on the scalar elements, and then checking
whether constants were the result. As a side effect, this produces unused
scalar operation nodes (previously, without flags). If the vector operation
were later scalarized, it would find the flagless constant-folding
temporary and lose the flags. I don't think this is a reasonable way for
constant folding to operate, but for now fix this by ensuring the flags
on the original operation are preserved in the temporary.
    
This yields a clear code improvement for AMDGPU when f16 isn't legal.
The Wasm cases switch from using a libcall to using a compare and select.
We are evidently missing the fcmp+select to fminimum/fmaximum handling,
but this would be further improved once that's handled. AArch64 also
avoids the libcall, but the result looks worse and has a different call
for some reason.
  • Loading branch information
arsenm authored Jun 6, 2024
1 parent 083a266 commit 84b0266
Show file tree
Hide file tree
Showing 6 changed files with 202 additions and 194 deletions.
3 changes: 2 additions & 1 deletion llvm/include/llvm/CodeGen/SelectionDAG.h
Original file line number Diff line number Diff line change
Expand Up @@ -1893,7 +1893,8 @@ class SelectionDAG {
const SDNode *N2);

SDValue FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL, EVT VT,
ArrayRef<SDValue> Ops);
ArrayRef<SDValue> Ops,
SDNodeFlags Flags = SDNodeFlags());

/// Fold floating-point operations when all operands are constants and/or
/// undefined.
Expand Down
7 changes: 4 additions & 3 deletions llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6333,7 +6333,8 @@ bool SelectionDAG::isUndef(unsigned Opcode, ArrayRef<SDValue> Ops) {
}

SDValue SelectionDAG::FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL,
EVT VT, ArrayRef<SDValue> Ops) {
EVT VT, ArrayRef<SDValue> Ops,
SDNodeFlags Flags) {
// If the opcode is a target-specific ISD node, there's nothing we can
// do here and the operand rules may not line up with the below, so
// bail early.
Expand Down Expand Up @@ -6690,7 +6691,7 @@ SDValue SelectionDAG::FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL,
}

// Constant fold the scalar operands.
SDValue ScalarResult = getNode(Opcode, DL, SVT, ScalarOps);
SDValue ScalarResult = getNode(Opcode, DL, SVT, ScalarOps, Flags);

// Legalize the (integer) scalar constant if necessary.
if (LegalSVT != SVT)
Expand Down Expand Up @@ -7261,7 +7262,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
}

// Perform trivial constant folding.
if (SDValue SV = FoldConstantArithmetic(Opcode, DL, VT, {N1, N2}))
if (SDValue SV = FoldConstantArithmetic(Opcode, DL, VT, {N1, N2}, Flags))
return SV;

// Canonicalize an UNDEF to the RHS, even over a constant.
Expand Down
14 changes: 13 additions & 1 deletion llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization.ll
Original file line number Diff line number Diff line change
Expand Up @@ -648,7 +648,19 @@ define float @test_v3f32_ninf(<3 x float> %a) nounwind {
define fp128 @test_v2f128(<2 x fp128> %a) nounwind {
; CHECK-LABEL: test_v2f128:
; CHECK: // %bb.0:
; CHECK-NEXT: b fmaxl
; CHECK-NEXT: sub sp, sp, #48
; CHECK-NEXT: str x30, [sp, #32] // 8-byte Folded Spill
; CHECK-NEXT: stp q0, q1, [sp] // 32-byte Folded Spill
; CHECK-NEXT: bl __gttf2
; CHECK-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload
; CHECK-NEXT: cmp w0, #0
; CHECK-NEXT: b.le .LBB18_2
; CHECK-NEXT: // %bb.1:
; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
; CHECK-NEXT: .LBB18_2:
; CHECK-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload
; CHECK-NEXT: add sp, sp, #48
; CHECK-NEXT: ret
%b = call nnan fp128 @llvm.vector.reduce.fmax.v2f128(<2 x fp128> %a)
ret fp128 %b
}
14 changes: 13 additions & 1 deletion llvm/test/CodeGen/AArch64/vecreduce-fmin-legalization.ll
Original file line number Diff line number Diff line change
Expand Up @@ -648,7 +648,19 @@ define float @test_v3f32_ninf(<3 x float> %a) nounwind {
define fp128 @test_v2f128(<2 x fp128> %a) nounwind {
; CHECK-LABEL: test_v2f128:
; CHECK: // %bb.0:
; CHECK-NEXT: b fminl
; CHECK-NEXT: sub sp, sp, #48
; CHECK-NEXT: str x30, [sp, #32] // 8-byte Folded Spill
; CHECK-NEXT: stp q0, q1, [sp] // 32-byte Folded Spill
; CHECK-NEXT: bl __lttf2
; CHECK-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload
; CHECK-NEXT: cmp w0, #0
; CHECK-NEXT: b.ge .LBB18_2
; CHECK-NEXT: // %bb.1:
; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
; CHECK-NEXT: .LBB18_2:
; CHECK-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload
; CHECK-NEXT: add sp, sp, #48
; CHECK-NEXT: ret
%b = call nnan fp128 @llvm.vector.reduce.fmin.v2f128(<2 x fp128> %a)
ret fp128 %b
}
138 changes: 48 additions & 90 deletions llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll
Original file line number Diff line number Diff line change
Expand Up @@ -654,21 +654,16 @@ define <2 x half> @v_maximum_v2f16__nnan(<2 x half> %src0, <2 x half> %src1) {
; GFX7-LABEL: v_maximum_v2f16__nnan:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX7-NEXT: v_mov_b32_e32 v5, 0x7fc00000
; GFX7-NEXT: v_max_f32_e32 v4, v0, v2
; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v2
; GFX7-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc
; GFX7-NEXT: v_max_f32_e32 v2, v1, v3
; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v1, v3
; GFX7-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc
; GFX7-NEXT: v_max_f32_e32 v0, v0, v2
; GFX7-NEXT: v_max_f32_e32 v1, v1, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_maximum_v2f16__nnan:
Expand Down Expand Up @@ -847,21 +842,16 @@ define <2 x half> @v_maximum_v2f16__nnan_nsz(<2 x half> %src0, <2 x half> %src1)
; GFX7-LABEL: v_maximum_v2f16__nnan_nsz:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX7-NEXT: v_mov_b32_e32 v5, 0x7fc00000
; GFX7-NEXT: v_max_f32_e32 v4, v0, v2
; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v2
; GFX7-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc
; GFX7-NEXT: v_max_f32_e32 v2, v1, v3
; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v1, v3
; GFX7-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc
; GFX7-NEXT: v_max_f32_e32 v0, v0, v2
; GFX7-NEXT: v_max_f32_e32 v1, v1, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_maximum_v2f16__nnan_nsz:
Expand Down Expand Up @@ -1216,28 +1206,21 @@ define <3 x half> @v_maximum_v3f16__nnan(<3 x half> %src0, <3 x half> %src1) {
; GFX7-LABEL: v_maximum_v3f16__nnan:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5
; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5
; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4
; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4
; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX7-NEXT: v_max_f32_e32 v6, v0, v3
; GFX7-NEXT: v_mov_b32_e32 v7, 0x7fc00000
; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v3
; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc
; GFX7-NEXT: v_max_f32_e32 v3, v1, v4
; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v1, v4
; GFX7-NEXT: v_cndmask_b32_e32 v1, v7, v3, vcc
; GFX7-NEXT: v_max_f32_e32 v3, v2, v5
; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v2, v5
; GFX7-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc
; GFX7-NEXT: v_max_f32_e32 v0, v0, v3
; GFX7-NEXT: v_max_f32_e32 v1, v1, v4
; GFX7-NEXT: v_max_f32_e32 v2, v2, v5
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_maximum_v3f16__nnan:
Expand Down Expand Up @@ -1427,28 +1410,21 @@ define <3 x half> @v_maximum_v3f16__nnan_nsz(<3 x half> %src0, <3 x half> %src1)
; GFX7-LABEL: v_maximum_v3f16__nnan_nsz:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5
; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5
; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4
; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4
; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX7-NEXT: v_max_f32_e32 v6, v0, v3
; GFX7-NEXT: v_mov_b32_e32 v7, 0x7fc00000
; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v3
; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc
; GFX7-NEXT: v_max_f32_e32 v3, v1, v4
; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v1, v4
; GFX7-NEXT: v_cndmask_b32_e32 v1, v7, v3, vcc
; GFX7-NEXT: v_max_f32_e32 v3, v2, v5
; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v2, v5
; GFX7-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc
; GFX7-NEXT: v_max_f32_e32 v0, v0, v3
; GFX7-NEXT: v_max_f32_e32 v1, v1, v4
; GFX7-NEXT: v_max_f32_e32 v2, v2, v5
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_maximum_v3f16__nnan_nsz:
Expand Down Expand Up @@ -1671,35 +1647,26 @@ define <4 x half> @v_maximum_v4f16__nnan(<4 x half> %src0, <4 x half> %src1) {
; GFX7-LABEL: v_maximum_v4f16__nnan:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7
; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6
; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5
; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5
; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6
; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7
; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7
; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6
; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6
; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7
; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
; GFX7-NEXT: v_max_f32_e32 v8, v0, v4
; GFX7-NEXT: v_mov_b32_e32 v9, 0x7fc00000
; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v4
; GFX7-NEXT: v_cndmask_b32_e32 v0, v9, v8, vcc
; GFX7-NEXT: v_max_f32_e32 v4, v1, v5
; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v1, v5
; GFX7-NEXT: v_cndmask_b32_e32 v1, v9, v4, vcc
; GFX7-NEXT: v_max_f32_e32 v4, v2, v6
; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v2, v6
; GFX7-NEXT: v_cndmask_b32_e32 v2, v9, v4, vcc
; GFX7-NEXT: v_max_f32_e32 v4, v3, v7
; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v3, v7
; GFX7-NEXT: v_cndmask_b32_e32 v3, v9, v4, vcc
; GFX7-NEXT: v_max_f32_e32 v0, v0, v4
; GFX7-NEXT: v_max_f32_e32 v1, v1, v5
; GFX7-NEXT: v_max_f32_e32 v2, v2, v6
; GFX7-NEXT: v_max_f32_e32 v3, v3, v7
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_maximum_v4f16__nnan:
Expand Down Expand Up @@ -1924,35 +1891,26 @@ define <4 x half> @v_maximum_v4f16__nnan_nsz(<4 x half> %src0, <4 x half> %src1)
; GFX7-LABEL: v_maximum_v4f16__nnan_nsz:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7
; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6
; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5
; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5
; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6
; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7
; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7
; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6
; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6
; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7
; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
; GFX7-NEXT: v_max_f32_e32 v8, v0, v4
; GFX7-NEXT: v_mov_b32_e32 v9, 0x7fc00000
; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v4
; GFX7-NEXT: v_cndmask_b32_e32 v0, v9, v8, vcc
; GFX7-NEXT: v_max_f32_e32 v4, v1, v5
; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v1, v5
; GFX7-NEXT: v_cndmask_b32_e32 v1, v9, v4, vcc
; GFX7-NEXT: v_max_f32_e32 v4, v2, v6
; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v2, v6
; GFX7-NEXT: v_cndmask_b32_e32 v2, v9, v4, vcc
; GFX7-NEXT: v_max_f32_e32 v4, v3, v7
; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v3, v7
; GFX7-NEXT: v_cndmask_b32_e32 v3, v9, v4, vcc
; GFX7-NEXT: v_max_f32_e32 v0, v0, v4
; GFX7-NEXT: v_max_f32_e32 v1, v1, v5
; GFX7-NEXT: v_max_f32_e32 v2, v2, v6
; GFX7-NEXT: v_max_f32_e32 v3, v3, v7
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_maximum_v4f16__nnan_nsz:
Expand Down
Loading

0 comments on commit 84b0266

Please sign in to comment.