From 39d15a7d3bc6bca4c2ad0fc432ba757eb9b8338c Mon Sep 17 00:00:00 2001
From: Jon Roelofs
Date: Fri, 1 Dec 2023 07:34:22 -0800
Subject: [PATCH] [AArch64][SME] Remove implicit-def's on smstart (#69012)

When we lower calls, the sequence of argument copy-to-reg nodes is glued
to the smstart. In the InstrEmitter, these glued copies are turned into
implicit defs, since the actual call instruction uses those physregs,
resulting in the register allocator adding unnecessary copies of regs
that are preserved anyway.
---
 .../Target/AArch64/AArch64ISelLowering.cpp    | 16 ++++
 llvm/lib/Target/AArch64/AArch64ISelLowering.h |  3 ++
 .../lib/Target/AArch64/AArch64SMEInstrInfo.td |  4 +-
 llvm/lib/Target/AArch64/SMEInstrFormats.td    |  1 +
 .../sme-streaming-compatible-interface.ll     | 53 +++++++++++++++++++
 .../AArch64/sme-streaming-interface.ll        | 41 +++++++++++---
 6 files changed, 109 insertions(+), 9 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 68fa58dea5beb1..011aedeba1eb79 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -7398,6 +7398,22 @@ static bool checkZExtBool(SDValue Arg, const SelectionDAG &DAG) {
   return ZExtBool;
 }
 
+void AArch64TargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
+                                                          SDNode *Node) const {
+  // Live-in physreg copies that are glued to SMSTART are applied as
+  // implicit-def's in the InstrEmitter. Here we remove them, allowing the
+  // register allocator to pass call args in callee saved regs, without extra
+  // copies to avoid these fake clobbers of actually-preserved GPRs.
+  if (MI.getOpcode() == AArch64::MSRpstatesvcrImm1 ||
+      MI.getOpcode() == AArch64::MSRpstatePseudo)
+    for (unsigned I = MI.getNumOperands() - 1; I > 0; --I)
+      if (MachineOperand &MO = MI.getOperand(I);
+          MO.isReg() && MO.isImplicit() && MO.isDef() &&
+          (AArch64::GPR32RegClass.contains(MO.getReg()) ||
+           AArch64::GPR64RegClass.contains(MO.getReg())))
+        MI.removeOperand(I);
+}
+
 SDValue AArch64TargetLowering::changeStreamingMode(
     SelectionDAG &DAG, SDLoc DL, bool Enable, SDValue Chain, SDValue InGlue,
     SDValue PStateSM, bool Entry) const {
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 2b16d2471770d0..e6d62f1704726b 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -999,6 +999,9 @@ class AArch64TargetLowering : public TargetLowering {
                                const SDLoc &DL, SelectionDAG &DAG,
                                SmallVectorImpl<SDValue> &InVals) const override;
 
+  void AdjustInstrPostInstrSelection(MachineInstr &MI,
+                                     SDNode *Node) const override;
+
   SDValue LowerCall(CallLoweringInfo & /*CLI*/,
                     SmallVectorImpl<SDValue> &InVals) const override;
 
diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
index 84ec88d4fd49b6..a58799116003dd 100644
--- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
@@ -203,7 +203,9 @@ def : Pat<(i64 (int_aarch64_sme_get_tpidr2)),
 
 def MSRpstatePseudo :
   Pseudo<(outs), (ins svcr_op:$pstatefield, timm0_1:$imm, GPR64:$rtpstate, timm0_1:$expected_pstate, variable_ops), []>,
-  Sched<[WriteSys]>;
+  Sched<[WriteSys]> {
+  let hasPostISelHook = 1;
+}
 
 def : Pat<(AArch64_smstart (i32 svcr_op:$pstate), (i64 GPR64:$rtpstate), (i64 timm0_1:$expected_pstate)),
           (MSRpstatePseudo svcr_op:$pstate, 0b1, GPR64:$rtpstate, timm0_1:$expected_pstate)>;
diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td
index c13c1b4e81faad..408b897070af0b 100644
--- a/llvm/lib/Target/AArch64/SMEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td
@@ -222,6 +222,7 @@ def MSRpstatesvcrImm1
   let Inst{11-9} = pstatefield;
   let Inst{8} = imm;
   let Inst{7-5} = 0b011; // op2
+  let hasPostISelHook = 1;
 }
 
 def : InstAlias<"smstart", (MSRpstatesvcrImm1 0b011, 0b1)>;
diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll b/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll
index 1ad6b189d6fa5c..5d0c9127d3ebb2 100644
--- a/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll
+++ b/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll
@@ -436,3 +436,56 @@ define void @disable_tailcallopt() "aarch64_pstate_sm_compatible" nounwind {
   tail call void @normal_callee();
   ret void;
 }
+
+define void @call_to_non_streaming_pass_args(ptr nocapture noundef readnone %ptr, i64 %long1, i64 %long2, i32 %int1, i32 %int2, float %float1, float %float2, double %double1, double %double2) "aarch64_pstate_sm_compatible" {
+; CHECK-LABEL: call_to_non_streaming_pass_args:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sub sp, sp, #112
+; CHECK-NEXT:    stp d15, d14, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d13, d12, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d11, d10, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d9, d8, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #96] // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 112
+; CHECK-NEXT:    .cfi_offset w19, -8
+; CHECK-NEXT:    .cfi_offset w30, -16
+; CHECK-NEXT:    .cfi_offset b8, -24
+; CHECK-NEXT:    .cfi_offset b9, -32
+; CHECK-NEXT:    .cfi_offset b10, -40
+; CHECK-NEXT:    .cfi_offset b11, -48
+; CHECK-NEXT:    .cfi_offset b12, -56
+; CHECK-NEXT:    .cfi_offset b13, -64
+; CHECK-NEXT:    .cfi_offset b14, -72
+; CHECK-NEXT:    .cfi_offset b15, -80
+; CHECK-NEXT:    stp d2, d3, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    mov x8, x1
+; CHECK-NEXT:    mov x9, x0
+; CHECK-NEXT:    stp s0, s1, [sp, #8] // 8-byte Folded Spill
+; CHECK-NEXT:    bl __arm_sme_state
+; CHECK-NEXT:    and x19, x0, #0x1
+; CHECK-NEXT:    tbz w19, #0, .LBB10_2
+; CHECK-NEXT:  // %bb.1: // %entry
+; CHECK-NEXT:    smstop sm
+; CHECK-NEXT:  .LBB10_2: // %entry
+; CHECK-NEXT:    ldp s0, s1, [sp, #8] // 8-byte Folded Reload
+; CHECK-NEXT:    mov x0, x9
+; CHECK-NEXT:    ldp d2, d3, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    mov x1, x8
+; CHECK-NEXT:    bl bar
+; CHECK-NEXT:    tbz w19, #0, .LBB10_4
+; CHECK-NEXT:  // %bb.3: // %entry
+; CHECK-NEXT:    smstart sm
+; CHECK-NEXT:  .LBB10_4: // %entry
+; CHECK-NEXT:    ldp x30, x19, [sp, #96] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d9, d8, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d11, d10, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d13, d12, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d15, d14, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT:    add sp, sp, #112
+; CHECK-NEXT:    ret
+entry:
+  call void @bar(ptr noundef nonnull %ptr, i64 %long1, i64 %long2, i32 %int1, i32 %int2, float %float1, float %float2, double %double1, double %double2)
+  ret void
+}
+
+declare void @bar(ptr noundef, i64 noundef, i64 noundef, i32 noundef, i32 noundef, float noundef, float noundef, double noundef, double noundef)
diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll b/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll
index 102ed896ce7b3e..dd7d6470ad7b08 100644
--- a/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll
+++ b/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll
@@ -368,15 +368,11 @@ define i8 @call_to_non_streaming_pass_sve_objects(ptr nocapture noundef readnone
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-3
-; CHECK-NEXT:    rdsvl x8, #1
-; CHECK-NEXT:    addvl x9, sp, #2
-; CHECK-NEXT:    addvl x10, sp, #1
-; CHECK-NEXT:    mov x11, sp
+; CHECK-NEXT:    rdsvl x3, #1
+; CHECK-NEXT:    addvl x0, sp, #2
+; CHECK-NEXT:    addvl x1, sp, #1
+; CHECK-NEXT:    mov x2, sp
 ; CHECK-NEXT:    smstop sm
-; CHECK-NEXT:    mov x0, x9
-; CHECK-NEXT:    mov x1, x10
-; CHECK-NEXT:    mov x2, x11
-; CHECK-NEXT:    mov x3, x8
 ; CHECK-NEXT:    bl foo
 ; CHECK-NEXT:    smstart sm
 ; CHECK-NEXT:    ptrue p0.b
@@ -400,8 +396,37 @@ entry:
   ret i8 %vecext
 }
 
+define void @call_to_non_streaming_pass_args(ptr nocapture noundef readnone %ptr, i64 %long1, i64 %long2, i32 %int1, i32 %int2, float %float1, float %float2, double %double1, double %double2) #0 {
+; CHECK-LABEL: call_to_non_streaming_pass_args:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sub sp, sp, #112
+; CHECK-NEXT:    stp d15, d14, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d13, d12, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d11, d10, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d9, d8, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT:    str x30, [sp, #96] // 8-byte Folded Spill
+; CHECK-NEXT:    stp d2, d3, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    stp s0, s1, [sp, #8] // 8-byte Folded Spill
+; CHECK-NEXT:    smstop sm
+; CHECK-NEXT:    ldp s0, s1, [sp, #8] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp d2, d3, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    bl bar
+; CHECK-NEXT:    smstart sm
+; CHECK-NEXT:    ldp d9, d8, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x30, [sp, #96] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp d11, d10, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d13, d12, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d15, d14, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT:    add sp, sp, #112
+; CHECK-NEXT:    ret
+entry:
+  call void @bar(ptr noundef nonnull %ptr, i64 %long1, i64 %long2, i32 %int1, i32 %int2, float %float1, float %float2, double %double1, double %double2)
+  ret void
+}
+
 declare i64 @llvm.aarch64.sme.cntsb()
 
 declare void @foo(ptr noundef, ptr noundef, ptr noundef, i64 noundef)
+declare void @bar(ptr noundef, i64 noundef, i64 noundef, i32 noundef, i32 noundef, float noundef, float noundef, double noundef, double noundef)
 
 attributes #0 = { nounwind vscale_range(1,16) "aarch64_pstate_sm_enabled" }
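
The codegen effect is easiest to see in the sme-streaming-interface.ll hunk above: the arguments to @foo are now materialized directly in x0-x3 instead of being staged in x8-x11 and copied over after the smstop. Below is a trimmed before/after excerpt, condensed from the CHECK lines in that hunk for illustration only (it is not a literal CHECK sequence):

    // Before: args staged in x8-x11, then copied into x0-x3 around smstop.
    rdsvl   x8, #1
    addvl   x9, sp, #2
    addvl   x10, sp, #1
    mov     x11, sp
    smstop  sm
    mov     x0, x9
    mov     x1, x10
    mov     x2, x11
    mov     x3, x8
    bl      foo

    // After: args are placed directly in x0-x3, so the extra moves disappear.
    rdsvl   x3, #1
    addvl   x0, sp, #2
    addvl   x1, sp, #1
    mov     x2, sp
    smstop  sm
    bl      foo

With the fake implicit-defs removed from the smstart/smstop pseudos, the register allocator is free to place the outgoing arguments in their final registers before the streaming-mode change.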