Skip to content

Commit

Permalink
[AMDGPU] Avoid unneeded waitcounts before spill stores (llvm#108303)
Browse files Browse the repository at this point in the history
Implicit defs and uses on spill stores were accounted as real defs and
uses, while only exist for liveness accounting. As a result unneded
waits were generated.

Fixes: SWDEV-484177
  • Loading branch information
rampitec committed Sep 14, 2024
1 parent 918972b commit 18f1c98
Show file tree
Hide file tree
Showing 19 changed files with 566 additions and 729 deletions.
11 changes: 10 additions & 1 deletion llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -901,7 +901,16 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
}
} else /* LGKM_CNT || EXP_CNT || VS_CNT || NUM_INST_CNTS */ {
// Match the score to the destination registers.
for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
//
// Check only explicit operands. Stores, especially spill stores, include
// implicit uses and defs of their super registers which would create an
// artificial dependency, while these are there only for register liveness
// accounting purposes.
//
// Special cases where implicit register defs and uses exists, such as
// M0, FLAT_SCR or VCC, but the wait will be generated earlier in the
// generateWaitcntInstBefore() if that was loaded from memory.
for (unsigned I = 0, E = Inst.getNumExplicitOperands(); I != E; ++I) {
auto &Op = Inst.getOperand(I);
if (!Op.isReg() || !Op.isDef())
continue;
Expand Down
6 changes: 3 additions & 3 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll
Original file line number Diff line number Diff line change
Expand Up @@ -268,7 +268,7 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace(
; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: buffer_store_dword v5, off, s[12:15], 0 ; 4-byte Folded Spill
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: s_nop 0
; GFX906-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
; GFX906-NEXT: buffer_store_dword v7, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
; GFX906-NEXT: buffer_store_dword v8, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill
Expand All @@ -294,7 +294,7 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace(
; GFX906-NEXT: global_load_dwordx4 v[0:3], v4, s[6:7]
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: s_nop 0
; GFX906-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
; GFX906-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
; GFX906-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill
Expand All @@ -317,7 +317,7 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace(
; GFX906-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: s_nop 0
; GFX906-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill
; GFX906-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill
; GFX906-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill
Expand Down
6 changes: 2 additions & 4 deletions llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,7 @@ define void @no_free_vgprs_at_agpr_to_agpr_copy(float %v0, float %v1) #0 {
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 2
; GFX90A-NEXT: buffer_store_dword a0, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_nop 0
; GFX90A-NEXT: buffer_store_dword a1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX90A-NEXT: buffer_store_dword a2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GFX90A-NEXT: buffer_store_dword a3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
Expand Down Expand Up @@ -215,7 +215,6 @@ define void @no_free_vgprs_at_agpr_to_agpr_copy(float %v0, float %v1) #0 {
; GFX90A-NEXT: buffer_load_dword a7, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
; GFX90A-NEXT: buffer_load_dword a8, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
; GFX90A-NEXT: buffer_load_dword a9, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
; GFX90A-NEXT: s_waitcnt vmcnt(9)
; GFX90A-NEXT: v_accvgpr_write_b32 a10, v39 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_write_b32 a11, v38 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_write_b32 a12, v37 ; Reload Reuse
Expand Down Expand Up @@ -1093,7 +1092,7 @@ define void @no_free_vgprs_at_sgpr_to_agpr_copy(float %v0, float %v1) #0 {
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 2
; GFX90A-NEXT: buffer_store_dword a0, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_nop 0
; GFX90A-NEXT: buffer_store_dword a1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX90A-NEXT: buffer_store_dword a2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GFX90A-NEXT: buffer_store_dword a3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
Expand Down Expand Up @@ -1124,7 +1123,6 @@ define void @no_free_vgprs_at_sgpr_to_agpr_copy(float %v0, float %v1) #0 {
; GFX90A-NEXT: buffer_load_dword a8, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
; GFX90A-NEXT: buffer_load_dword a9, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
; GFX90A-NEXT: buffer_load_dword a10, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
; GFX90A-NEXT: s_waitcnt vmcnt(10)
; GFX90A-NEXT: v_accvgpr_write_b32 a11, v39 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_write_b32 a12, v38 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_write_b32 a13, v37 ; Reload Reuse
Expand Down
8 changes: 1 addition & 7 deletions llvm/test/CodeGen/AMDGPU/collapse-endcf.ll
Original file line number Diff line number Diff line change
Expand Up @@ -708,7 +708,6 @@ define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) {
; GCN-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
; GCN-O0-NEXT: v_mov_b32_e32 v6, v2
; GCN-O0-NEXT: buffer_store_dword v5, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b32 s1, 0xf000
; GCN-O0-NEXT: s_mov_b32 s2, 0
Expand All @@ -722,6 +721,7 @@ define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) {
; GCN-O0-NEXT: s_mov_b64 s[2:3], exec
; GCN-O0-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1]
; GCN-O0-NEXT: s_xor_b64 s[2:3], s[0:1], s[2:3]
; GCN-O0-NEXT: s_waitcnt vmcnt(4)
; GCN-O0-NEXT: v_writelane_b32 v0, s2, 0
; GCN-O0-NEXT: v_writelane_b32 v0, s3, 1
; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
Expand Down Expand Up @@ -1159,7 +1159,6 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 {
; GCN-O0-NEXT: v_mov_b32_e32 v3, s10
; GCN-O0-NEXT: v_mov_b32_e32 v4, s11
; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
Expand Down Expand Up @@ -1193,7 +1192,6 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 {
; GCN-O0-NEXT: v_mov_b32_e32 v3, s10
; GCN-O0-NEXT: v_mov_b32_e32 v4, s11
; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
Expand Down Expand Up @@ -1225,7 +1223,6 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 {
; GCN-O0-NEXT: v_mov_b32_e32 v2, s6
; GCN-O0-NEXT: v_mov_b32_e32 v3, s7
; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
Expand All @@ -1247,7 +1244,6 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 {
; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
Expand All @@ -1269,7 +1265,6 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 {
; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
Expand Down Expand Up @@ -1343,7 +1338,6 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 {
; GCN-O0-NEXT: s_mov_b64 exec, s[14:15]
; GCN-O0-NEXT: s_waitcnt vmcnt(1)
; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
Expand Down
Loading

0 comments on commit 18f1c98

Please sign in to comment.