Skip to content

Commit

Permalink
[AMDGPU] Support wide register or subregister access when emitting s_…
Browse files Browse the repository at this point in the history
…singleuse_vdst instructions. (llvm#88520)

Single-use producer and consumer instructions that access wide registers
or subregisters are now tracked correctly and are eligible to be marked
as single use.
  • Loading branch information
ScottEgerton authored Apr 18, 2024
1 parent d3993ac commit af0b69f
Show file tree
Hide file tree
Showing 2 changed files with 119 additions and 8 deletions.
25 changes: 19 additions & 6 deletions llvm/lib/Target/AMDGPU/AMDGPUInsertSingleUseVDST.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -64,13 +64,19 @@ class AMDGPUInsertSingleUseVDST : public MachineFunctionPass {
bool InstructionEmitted = false;

for (MachineBasicBlock &MBB : MF) {
DenseMap<MCPhysReg, unsigned> RegisterUseCount; // TODO: MCRegUnits
DenseMap<MCRegUnit, unsigned> RegisterUseCount;

// Handle boundaries at the end of basic block separately to avoid
// false positives. If they are live at the end of a basic block then
// assume it has more uses later on.
for (const auto &Liveouts : MBB.liveouts())
RegisterUseCount[Liveouts.PhysReg] = 2;
for (const auto &Liveout : MBB.liveouts()) {
for (MCRegUnitMaskIterator Units(Liveout.PhysReg, TRI); Units.isValid();
++Units) {
const auto [Unit, Mask] = *Units;
if ((Mask & Liveout.LaneMask).any())
RegisterUseCount[Unit] = 2;
}
}

for (MachineInstr &MI : reverse(MBB.instrs())) {
// All registers in all operands need to be single use for an
Expand All @@ -84,7 +90,8 @@ class AMDGPUInsertSingleUseVDST : public MachineFunctionPass {

// Count the number of times each register is read.
if (Operand.readsReg())
RegisterUseCount[Reg]++;
for (const MCRegUnit &Unit : TRI->regunits(Reg))
RegisterUseCount[Unit]++;

// Do not attempt to optimise across exec mask changes.
if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) {
Expand All @@ -96,10 +103,16 @@ class AMDGPUInsertSingleUseVDST : public MachineFunctionPass {
// check if the operands are single use.
if (!MI.modifiesRegister(Reg, TRI))
continue;
if (RegisterUseCount[Reg] > 1)

const auto RegUnits = TRI->regunits(Reg);
if (any_of(RegUnits, [&RegisterUseCount](const MCRegUnit &Unit) {
return RegisterUseCount[Unit] > 1;
}))
AllProducerOperandsAreSingleUse = false;

// Reset uses count when a register is no longer live.
RegisterUseCount.erase(Reg);
for (const MCRegUnit &Unit : RegUnits)
RegisterUseCount.erase(Unit);
}
if (AllProducerOperandsAreSingleUse && SIInstrInfo::isVALU(MI)) {
// TODO: Replace with candidate logging for instruction grouping
Expand Down
102 changes: 100 additions & 2 deletions llvm/test/CodeGen/AMDGPU/insert-singleuse-vdst.mir
Original file line number Diff line number Diff line change
Expand Up @@ -521,9 +521,7 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: S_SINGLEUSE_VDST 1
; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
; CHECK-NEXT: S_SINGLEUSE_VDST 1
; CHECK-NEXT: $vgpr1_lo16 = V_MOV_B16_t16_e32 $vgpr0_lo16, implicit $exec
; CHECK-NEXT: S_SINGLEUSE_VDST 1
; CHECK-NEXT: $vgpr1_hi16 = V_MOV_B16_t16_e32 $vgpr0_hi16, implicit $exec
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.1:
Expand Down Expand Up @@ -582,6 +580,31 @@ body: |
liveins: $vgpr1
...

# Write low 16-bits and then read 32-bit vgpr twice.
# The 32-bit $vgpr0 that contains the written half is read by two later
# instructions, so the expected output has no S_SINGLEUSE_VDST in bb.0.
---
name: write_lo_read_full_twice
tracksRegLiveness: true
body: |
; CHECK-LABEL: name: write_lo_read_full_twice
; CHECK: bb.0:
; CHECK-NEXT: successors: %bb.1(0x80000000)
; CHECK-NEXT: liveins: $vgpr0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: $vgpr0_lo16 = V_MOV_B16_t16_e32 0, implicit $exec
; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec
; CHECK-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.1:
; CHECK-NEXT: liveins: $vgpr1, $vgpr2
bb.0:
liveins: $vgpr0
$vgpr0_lo16 = V_MOV_B16_t16_e32 0, implicit $exec
$vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec
$vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec
bb.1:
liveins: $vgpr1, $vgpr2
...

# Write high 16-bits and then read 32-bit vgpr.
---
name: write_hi_read_full
Expand All @@ -605,3 +628,78 @@ body: |
bb.1:
liveins: $vgpr1
...

# Write high 16-bits and then read 32-bit vgpr twice.
# The 32-bit $vgpr0 that contains the written half is read by two later
# instructions, so the expected output has no S_SINGLEUSE_VDST in bb.0.
---
name: write_hi_read_full_twice
tracksRegLiveness: true
body: |
; CHECK-LABEL: name: write_hi_read_full_twice
; CHECK: bb.0:
; CHECK-NEXT: successors: %bb.1(0x80000000)
; CHECK-NEXT: liveins: $vgpr0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: $vgpr0_hi16 = V_MOV_B16_t16_e32 0, implicit $exec
; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec
; CHECK-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.1:
; CHECK-NEXT: liveins: $vgpr1, $vgpr2
bb.0:
liveins: $vgpr0
$vgpr0_hi16 = V_MOV_B16_t16_e32 0, implicit $exec
$vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec
$vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec
bb.1:
liveins: $vgpr1, $vgpr2
...

# Write low 16-bits and then write high 16-bits and then read 32-bit vgpr.
# Each 16-bit half is written once and the combined 32-bit $vgpr0 is read by a
# single instruction, so the expected output has S_SINGLEUSE_VDST 1 before both
# half-register writes.
---
name: write_both_read_full
tracksRegLiveness: true
body: |
; CHECK-LABEL: name: write_both_read_full
; CHECK: bb.0:
; CHECK-NEXT: successors: %bb.1(0x80000000)
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: S_SINGLEUSE_VDST 1
; CHECK-NEXT: $vgpr0_lo16 = V_MOV_B16_t16_e32 0, implicit $exec
; CHECK-NEXT: S_SINGLEUSE_VDST 1
; CHECK-NEXT: $vgpr0_hi16 = V_MOV_B16_t16_e32 0, implicit $exec
; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.1:
; CHECK-NEXT: liveins: $vgpr1
bb.0:
$vgpr0_lo16 = V_MOV_B16_t16_e32 0, implicit $exec
$vgpr0_hi16 = V_MOV_B16_t16_e32 0, implicit $exec
$vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec
bb.1:
liveins: $vgpr1
...

# Write low 16-bits and then write high 16-bits and then read 32-bit vgpr twice.
# The combined 32-bit $vgpr0 is read by two later instructions, so the expected
# output has no S_SINGLEUSE_VDST before either half-register write.
---
name: write_both_read_full_twice
tracksRegLiveness: true
body: |
; CHECK-LABEL: name: write_both_read_full_twice
; CHECK: bb.0:
; CHECK-NEXT: successors: %bb.1(0x80000000)
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: $vgpr0_lo16 = V_MOV_B16_t16_e32 0, implicit $exec
; CHECK-NEXT: $vgpr0_hi16 = V_MOV_B16_t16_e32 0, implicit $exec
; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec
; CHECK-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.1:
; CHECK-NEXT: liveins: $vgpr1, $vgpr2
bb.0:
$vgpr0_lo16 = V_MOV_B16_t16_e32 0, implicit $exec
$vgpr0_hi16 = V_MOV_B16_t16_e32 0, implicit $exec
$vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec
$vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec
bb.1:
liveins: $vgpr1, $vgpr2
...

0 comments on commit af0b69f

Please sign in to comment.