
GFX12: Add LoopDataPrefetchPass (llvm#75625)
It is currently disabled by default. It will need experiments on real
hardware to tune it and decide on its profitability.

---------

Co-authored-by: Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>
mariusz-sikora-at-amd and rampitec authored Dec 19, 2023
1 parent e8d98fa commit a018c8c
Showing 6 changed files with 221 additions and 1 deletion.
7 changes: 7 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -345,6 +345,11 @@ static cl::opt<bool> EnableImageIntrinsicOptimizer(
cl::desc("Enable image intrinsic optimizer pass"), cl::init(true),
cl::Hidden);

static cl::opt<bool>
    EnableLoopPrefetch("amdgpu-loop-prefetch",
                       cl::desc("Enable loop data prefetch on AMDGPU"),
                       cl::Hidden, cl::init(false));

static cl::opt<bool> EnableMaxIlpSchedStrategy(
"amdgpu-enable-max-ilp-scheduling-strategy",
cl::desc("Enable scheduling strategy to maximize ILP for a single wave."),
@@ -982,6 +987,8 @@ void AMDGPUPassConfig::addEarlyCSEOrGVNPass() {
}

void AMDGPUPassConfig::addStraightLineScalarOptimizationPasses() {
  if (isPassEnabled(EnableLoopPrefetch, CodeGenOptLevel::Aggressive))
    addPass(createLoopDataPrefetchPass());
  addPass(createSeparateConstOffsetFromGEPPass());
  // ReassociateGEPs exposes more opportunities for SLSR. See
  // the example in reassociate-geps-and-slsr.ll.
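For context: the new pass is gated through isPassEnabled, so an explicit
-amdgpu-loop-prefetch on the command line always wins, and otherwise the
flag's default applies only at CodeGenOptLevel::Aggressive (-O3). A minimal
sketch of that gating, paraphrasing the helper declared in
AMDGPUTargetMachine.h (treat the exact body as an assumption, not verbatim
upstream code):

  // Sketch of AMDGPUPassConfig::isPassEnabled. An explicit occurrence of
  // the cl::opt flag overrides the default; otherwise the flag's cl::init
  // value is honored only at or above the requested optimization level.
  bool isPassEnabled(const cl::opt<bool> &Opt,
                     CodeGenOptLevel Level = CodeGenOptLevel::Default) const {
    if (Opt.getNumOccurrences()) // flag given explicitly on the command line
      return Opt;
    if (TM->getOptLevel() < Level) // below the required level: stay off
      return false;
    return Opt; // fall back to the cl::init default
  }

Because EnableLoopPrefetch is cl::init(false), the pass never runs unless
-amdgpu-loop-prefetch is passed explicitly, even at -O3.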
8 changes: 8 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -1345,3 +1345,11 @@ GCNTTIImpl::getTypeLegalizationCost(Type *Ty) const {
  Cost.first += (Size + 255) / 256;
  return Cost;
}

unsigned GCNTTIImpl::getPrefetchDistance() const {
  return ST->hasPrefetch() ? 128 : 0;
}

bool GCNTTIImpl::shouldPrefetchAddressSpace(unsigned AS) const {
  return AMDGPU::isFlatGlobalAddrSpace(AS);
}
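These two hooks are what the generic LoopDataPrefetch pass queries before
rewriting a loop: a prefetch distance of 0 (any subtarget where hasPrefetch()
is false, i.e. pre-GFX12) disables the transformation, and
shouldPrefetchAddressSpace restricts it to flat/global pointers, which is why
the LDS test case below gets no prefetch. A hedged sketch of the consumer
side; the real logic lives in llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp
and additionally uses SCEV to form an address roughly getPrefetchDistance()
IR instructions ahead, and the helper name here is illustrative:

  #include "llvm/Analysis/TargetTransformInfo.h"
  #include "llvm/IR/IRBuilder.h"
  #include "llvm/IR/Intrinsics.h"
  using namespace llvm;

  // Hypothetical helper mirroring how LoopDataPrefetch uses the two hooks.
  static void emitPrefetchBefore(Instruction *MemI, Value *PrefPtr,
                                 const TargetTransformInfo &TTI) {
    if (!TTI.getPrefetchDistance()) // 0 disables prefetching entirely
      return;
    unsigned AS = PrefPtr->getType()->getPointerAddressSpace();
    if (!TTI.shouldPrefetchAddressSpace(AS)) // e.g. skip LDS on AMDGPU
      return;
    IRBuilder<> Builder(MemI);
    // llvm.prefetch(ptr, rw = 0 (read), locality = 3, cache type = 1 (data))
    Builder.CreateIntrinsic(Intrinsic::prefetch, {PrefPtr->getType()},
                            {PrefPtr, Builder.getInt32(0),
                             Builder.getInt32(3), Builder.getInt32(1)});
  }

On GFX12 the resulting llvm.prefetch calls are selected to s_prefetch_data,
which is what the loop-prefetch-data.ll test below checks.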
10 changes: 10 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -254,6 +254,16 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
  InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
                                         FastMathFlags FMF,
                                         TTI::TargetCostKind CostKind);

  /// Data cache line size for the LoopDataPrefetch pass. Not used before
  /// GFX12.
  unsigned getCacheLineSize() const override { return 128; }

  /// How far ahead of a load the prefetch instruction should be placed.
  /// This is currently measured in number of IR instructions.
  unsigned getPrefetchDistance() const override;

  /// \return true if the target wants to issue a prefetch in address space
  /// \p AS.
  bool shouldPrefetchAddressSpace(unsigned AS) const override;
};

} // end namespace llvm
4 changes: 4 additions & 0 deletions llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -245,6 +245,10 @@ bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
  if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad())
    return false;

  // A mayLoad instruction without a def is not a load. Likely a prefetch.
  if (!get(Opc0).getNumDefs() || !get(Opc1).getNumDefs())
    return false;

  if (isDS(Opc0) && isDS(Opc1)) {

    // FIXME: Handle this case:
8 changes: 7 additions & 1 deletion llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -4,7 +4,7 @@
; RUN: llc -O1 -mtriple=amdgcn--amdhsa -disable-verify -debug-pass=Structure < %s 2>&1 \
; RUN: | FileCheck -match-full-lines -strict-whitespace -check-prefix=GCN-O1 %s
; RUN: llc -O1 -mtriple=amdgcn--amdhsa -disable-verify -amdgpu-scalar-ir-passes -amdgpu-sdwa-peephole \
; RUN: -amdgpu-load-store-vectorizer -amdgpu-enable-pre-ra-optimizations -debug-pass=Structure < %s 2>&1 \
; RUN: -amdgpu-load-store-vectorizer -amdgpu-enable-pre-ra-optimizations -amdgpu-loop-prefetch -debug-pass=Structure < %s 2>&1 \
; RUN: | FileCheck -match-full-lines -strict-whitespace -check-prefix=GCN-O1-OPTS %s
; RUN: llc -O2 -mtriple=amdgcn--amdhsa -disable-verify -debug-pass=Structure < %s 2>&1 \
; RUN: | FileCheck -match-full-lines -strict-whitespace -check-prefix=GCN-O2 %s
@@ -461,6 +461,12 @@
; GCN-O1-OPTS-NEXT: AMDGPU Promote Alloca
; GCN-O1-OPTS-NEXT: Dominator Tree Construction
; GCN-O1-OPTS-NEXT: Natural Loop Information
; GCN-O1-OPTS-NEXT: Canonicalize natural loops
; GCN-O1-OPTS-NEXT: Lazy Branch Probability Analysis
; GCN-O1-OPTS-NEXT: Lazy Block Frequency Analysis
; GCN-O1-OPTS-NEXT: Optimization Remark Emitter
; GCN-O1-OPTS-NEXT: Scalar Evolution Analysis
; GCN-O1-OPTS-NEXT: Loop Data Prefetch
; GCN-O1-OPTS-NEXT: Split GEPs to a variadic base and a constant offset for better CSE
; GCN-O1-OPTS-NEXT: Scalar Evolution Analysis
; GCN-O1-OPTS-NEXT: Straight line strength reduction
185 changes: 185 additions & 0 deletions llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll
@@ -0,0 +1,185 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=gfx1200 -amdgpu-loop-prefetch < %s | FileCheck --check-prefix=GCN %s

define amdgpu_kernel void @copy_flat(ptr nocapture %d, ptr nocapture readonly %s, i32 %n) {
; GCN-LABEL: copy_flat:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_b32 s4, s[0:1], 0x34
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_cmp_eq_u32 s4, 0
; GCN-NEXT: s_cbranch_scc1 .LBB0_3
; GCN-NEXT: ; %bb.1: ; %for.body.preheader
; GCN-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_add_nc_u64 s[2:3], s[2:3], 0xb0
; GCN-NEXT: .p2align 6
; GCN-NEXT: .LBB0_2: ; %for.body
; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GCN-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GCN-NEXT: s_prefetch_data s[2:3], 0x0, null, 0
; GCN-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0
; GCN-NEXT: s_add_co_i32 s4, s4, -1
; GCN-NEXT: flat_load_b128 v[0:3], v[0:1] offset:-176
; GCN-NEXT: s_add_nc_u64 s[2:3], s[2:3], 16
; GCN-NEXT: s_cmp_lg_u32 s4, 0
; GCN-NEXT: s_add_nc_u64 s[0:1], s[0:1], 16
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN-NEXT: flat_store_b128 v[4:5], v[0:3]
; GCN-NEXT: s_cbranch_scc1 .LBB0_2
; GCN-NEXT: .LBB0_3: ; %for.end
; GCN-NEXT: s_endpgm
entry:
%cmp6.not = icmp eq i32 %n, 0
br i1 %cmp6.not, label %for.end, label %for.body

for.body: ; preds = %entry, %for.body
%i.07 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
%idxprom = zext i32 %i.07 to i64
%arrayidx = getelementptr inbounds <4 x i32>, ptr %s, i64 %idxprom
%ld = load <4 x i32>, ptr %arrayidx, align 4
%arrayidx2 = getelementptr inbounds <4 x i32>, ptr %d, i64 %idxprom
store <4 x i32> %ld, ptr %arrayidx2, align 4
%inc = add nuw i32 %i.07, 1
%exitcond.not = icmp eq i32 %inc, %n
br i1 %exitcond.not, label %for.end, label %for.body

for.end: ; preds = %for.body, %entry
ret void
}

define amdgpu_kernel void @copy_global(ptr addrspace(1) nocapture %d, ptr addrspace(1) nocapture readonly %s, i32 %n) {
; GCN-LABEL: copy_global:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_b32 s4, s[0:1], 0x34
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_cmp_eq_u32 s4, 0
; GCN-NEXT: s_cbranch_scc1 .LBB1_3
; GCN-NEXT: ; %bb.1: ; %for.body.preheader
; GCN-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_add_nc_u64 s[2:3], s[2:3], 0xb0
; GCN-NEXT: .LBB1_2: ; %for.body
; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN-NEXT: global_load_b128 v[1:4], v0, s[2:3] offset:-176
; GCN-NEXT: s_prefetch_data s[2:3], 0x0, null, 0
; GCN-NEXT: s_add_co_i32 s4, s4, -1
; GCN-NEXT: s_add_nc_u64 s[2:3], s[2:3], 16
; GCN-NEXT: s_cmp_lg_u32 s4, 0
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: global_store_b128 v0, v[1:4], s[0:1]
; GCN-NEXT: s_add_nc_u64 s[0:1], s[0:1], 16
; GCN-NEXT: s_cbranch_scc1 .LBB1_2
; GCN-NEXT: .LBB1_3: ; %for.end
; GCN-NEXT: s_nop 0
; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GCN-NEXT: s_endpgm
entry:
%cmp6.not = icmp eq i32 %n, 0
br i1 %cmp6.not, label %for.end, label %for.body

for.body: ; preds = %entry, %for.body
%i.07 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
%idxprom = zext i32 %i.07 to i64
%arrayidx = getelementptr inbounds <4 x i32>, ptr addrspace(1) %s, i64 %idxprom
%ld = load <4 x i32>, ptr addrspace(1) %arrayidx, align 4
%arrayidx2 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %d, i64 %idxprom
store <4 x i32> %ld, ptr addrspace(1) %arrayidx2, align 4
%inc = add nuw i32 %i.07, 1
%exitcond.not = icmp eq i32 %inc, %n
br i1 %exitcond.not, label %for.end, label %for.body

for.end: ; preds = %for.body, %entry
ret void
}

define amdgpu_kernel void @copy_constant(ptr addrspace(1) nocapture %d, ptr addrspace(4) nocapture readonly %s, i32 %n) {
; GCN-LABEL: copy_constant:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_b32 s4, s[0:1], 0x34
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_cmp_eq_u32 s4, 0
; GCN-NEXT: s_cbranch_scc1 .LBB2_3
; GCN-NEXT: ; %bb.1: ; %for.body.preheader
; GCN-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: .LBB2_2: ; %for.body
; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_load_b128 s[8:11], s[2:3], 0x0
; GCN-NEXT: s_prefetch_data s[2:3], 0xb0, null, 0
; GCN-NEXT: s_add_co_i32 s4, s4, -1
; GCN-NEXT: s_add_nc_u64 s[2:3], s[2:3], 16
; GCN-NEXT: s_cmp_lg_u32 s4, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_dual_mov_b32 v1, s8 :: v_dual_mov_b32 v2, s9
; GCN-NEXT: v_dual_mov_b32 v3, s10 :: v_dual_mov_b32 v4, s11
; GCN-NEXT: global_store_b128 v0, v[1:4], s[0:1]
; GCN-NEXT: s_add_nc_u64 s[0:1], s[0:1], 16
; GCN-NEXT: s_cbranch_scc1 .LBB2_2
; GCN-NEXT: .LBB2_3: ; %for.end
; GCN-NEXT: s_nop 0
; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GCN-NEXT: s_endpgm
entry:
%cmp6.not = icmp eq i32 %n, 0
br i1 %cmp6.not, label %for.end, label %for.body

for.body: ; preds = %entry, %for.body
%i.07 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
%idxprom = zext i32 %i.07 to i64
%arrayidx = getelementptr inbounds <4 x i32>, ptr addrspace(4) %s, i64 %idxprom
%ld = load <4 x i32>, ptr addrspace(4) %arrayidx, align 4
%arrayidx2 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %d, i64 %idxprom
store <4 x i32> %ld, ptr addrspace(1) %arrayidx2, align 4
%inc = add nuw i32 %i.07, 1
%exitcond.not = icmp eq i32 %inc, %n
br i1 %exitcond.not, label %for.end, label %for.body

for.end: ; preds = %for.body, %entry
ret void
}

define amdgpu_kernel void @copy_local(ptr addrspace(3) nocapture %d, ptr addrspace(3) nocapture readonly %s, i32 %n) {
; GCN-LABEL: copy_local:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_cmp_eq_u32 s2, 0
; GCN-NEXT: s_cbranch_scc1 .LBB3_2
; GCN-NEXT: .LBB3_1: ; %for.body
; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN-NEXT: v_mov_b32_e32 v2, s1
; GCN-NEXT: v_mov_b32_e32 v4, s0
; GCN-NEXT: s_add_co_i32 s2, s2, -1
; GCN-NEXT: s_add_co_i32 s0, s0, 16
; GCN-NEXT: s_add_co_i32 s1, s1, 16
; GCN-NEXT: ds_load_2addr_b32 v[0:1], v2 offset0:2 offset1:3
; GCN-NEXT: ds_load_2addr_b32 v[2:3], v2 offset1:1
; GCN-NEXT: s_cmp_lg_u32 s2, 0
; GCN-NEXT: s_waitcnt lgkmcnt(1)
; GCN-NEXT: ds_store_2addr_b32 v4, v0, v1 offset0:2 offset1:3
; GCN-NEXT: s_waitcnt lgkmcnt(1)
; GCN-NEXT: ds_store_2addr_b32 v4, v2, v3 offset1:1
; GCN-NEXT: s_cbranch_scc1 .LBB3_1
; GCN-NEXT: .LBB3_2: ; %for.end
; GCN-NEXT: s_endpgm
entry:
%cmp6.not = icmp eq i32 %n, 0
br i1 %cmp6.not, label %for.end, label %for.body

for.body: ; preds = %entry, %for.body
%i.07 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
%idxprom = zext i32 %i.07 to i64
%arrayidx = getelementptr inbounds <4 x i32>, ptr addrspace(3) %s, i64 %idxprom
%ld = load <4 x i32>, ptr addrspace(3) %arrayidx, align 4
%arrayidx2 = getelementptr inbounds <4 x i32>, ptr addrspace(3) %d, i64 %idxprom
store <4 x i32> %ld, ptr addrspace(3) %arrayidx2, align 4
%inc = add nuw i32 %i.07, 1
%exitcond.not = icmp eq i32 %inc, %n
br i1 %exitcond.not, label %for.end, label %for.body

for.end: ; preds = %for.body, %entry
ret void
}
