[AMDGPU] Add target intrinsic for s_prefetch_data (llvm#107133)
rampitec committed Sep 5, 2024
1 parent 1e98aa4 commit bd840a4
Showing 10 changed files with 230 additions and 3 deletions.
1 change: 1 addition & 0 deletions clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -448,6 +448,7 @@ TARGET_BUILTIN(__builtin_amdgcn_s_barrier_join, "vi", "n", "gfx12-insts")
TARGET_BUILTIN(__builtin_amdgcn_s_wakeup_barrier, "vi", "n", "gfx12-insts")
TARGET_BUILTIN(__builtin_amdgcn_s_barrier_leave, "b", "n", "gfx12-insts")
TARGET_BUILTIN(__builtin_amdgcn_s_get_barrier_state, "Uii", "n", "gfx12-insts")
TARGET_BUILTIN(__builtin_amdgcn_s_prefetch_data, "vvC*Ui", "nc", "gfx12-insts")

TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b64_v2i32, "V2iV2i*1", "nc", "gfx12-insts,wavefrontsize32")
TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v8i16, "V8sV8s*1", "nc", "gfx12-insts,wavefrontsize32")
3 changes: 3 additions & 0 deletions clang/lib/CodeGen/CGBuiltin.cpp
@@ -19608,6 +19608,9 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
F, {EmitScalarExpr(E->getArg(0)), EmitScalarExpr(E->getArg(1)),
EmitScalarExpr(E->getArg(2)), EmitScalarExpr(E->getArg(3))});
}
case AMDGPU::BI__builtin_amdgcn_s_prefetch_data:
return emitBuiltinWithOneOverloadedType<2>(
*this, E, Intrinsic::amdgcn_s_prefetch_data);
default:
return nullptr;
}
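
With this mapping in place, a call to the builtin lowers to the address-space-overloaded intrinsic. A minimal sketch of the correspondence, assuming a global float pointer gp and a constant char pointer cp as in the clang test below:

  ; __builtin_amdgcn_s_prefetch_data(gp, len) with a global pointer becomes
  call void @llvm.amdgcn.s.prefetch.data.p1(ptr addrspace(1) %gp, i32 %len)
  ; __builtin_amdgcn_s_prefetch_data(cp, 31) with a constant pointer becomes
  call void @llvm.amdgcn.s.prefetch.data.p4(ptr addrspace(4) %cp, i32 31)
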
26 changes: 25 additions & 1 deletion clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl
@@ -256,4 +256,28 @@ void test_s_ttracedata_imm()
__builtin_amdgcn_s_ttracedata_imm(1);
}


// CHECK-LABEL: @test_s_prefetch_data(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[FP_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
// CHECK-NEXT: [[GP_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// CHECK-NEXT: [[CP_ADDR:%.*]] = alloca ptr addrspace(4), align 8, addrspace(5)
// CHECK-NEXT: [[LEN_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: store ptr [[FP:%.*]], ptr addrspace(5) [[FP_ADDR]], align 8
// CHECK-NEXT: store ptr addrspace(1) [[GP:%.*]], ptr addrspace(5) [[GP_ADDR]], align 8
// CHECK-NEXT: store ptr addrspace(4) [[CP:%.*]], ptr addrspace(5) [[CP_ADDR]], align 8
// CHECK-NEXT: store i32 [[LEN:%.*]], ptr addrspace(5) [[LEN_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr addrspace(5) [[FP_ADDR]], align 8
// CHECK-NEXT: call void @llvm.amdgcn.s.prefetch.data.p0(ptr [[TMP0]], i32 0)
// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[GP_ADDR]], align 8
// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[LEN_ADDR]], align 4
// CHECK-NEXT: call void @llvm.amdgcn.s.prefetch.data.p1(ptr addrspace(1) [[TMP1]], i32 [[TMP2]])
// CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(4), ptr addrspace(5) [[CP_ADDR]], align 8
// CHECK-NEXT: call void @llvm.amdgcn.s.prefetch.data.p4(ptr addrspace(4) [[TMP3]], i32 31)
// CHECK-NEXT: ret void
//
void test_s_prefetch_data(int *fp, global float *gp, constant char *cp, unsigned int len)
{
__builtin_amdgcn_s_prefetch_data(fp, 0);
__builtin_amdgcn_s_prefetch_data(gp, len);
__builtin_amdgcn_s_prefetch_data(cp, 31);
}
8 changes: 8 additions & 0 deletions llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2689,6 +2689,14 @@ def int_amdgcn_global_load_tr_b128 : AMDGPULoadIntrinsic<global_ptr_ty>;
def int_amdgcn_wave_id :
DefaultAttrsIntrinsic<[llvm_i32_ty], [], [NoUndef<RetIndex>, IntrNoMem, IntrSpeculatable]>;

def int_amdgcn_s_prefetch_data :
Intrinsic<[],
[llvm_anyptr_ty, // Pointer to constant or global memory
llvm_i32_ty], // Length to prefetch 0-31 (1-32 chunks, units of 128 bytes)
[IntrInaccessibleMemOrArgMemOnly, IntrWillReturn, NoCapture<ArgIndex<0>>, IntrNoCallback, IntrNoFree],
"", [SDNPMemOperand]
>;

//===----------------------------------------------------------------------===//
// Deep learning intrinsics.
//===----------------------------------------------------------------------===//
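
The intrinsic is overloaded on the pointer's address space, and a length operand of N requests N+1 chunks of 128 bytes, so 0 prefetches 128 bytes and 31 prefetches 4 KB. A minimal IR sketch under that reading, with illustrative pointer names:

  ; one declaration per address space, e.g. constant memory:
  declare void @llvm.amdgcn.s.prefetch.data.p4(ptr addrspace(4), i32)
  ; len = 31 -> 32 chunks of 128 bytes (4 KB)
  call void @llvm.amdgcn.s.prefetch.data.p4(ptr addrspace(4) %cp, i32 31)
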
7 changes: 6 additions & 1 deletion llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -5541,7 +5541,12 @@ void AMDGPUInstructionSelector::renderPopcntImm(MachineInstrBuilder &MIB,
void AMDGPUInstructionSelector::renderTruncTImm(MachineInstrBuilder &MIB,
const MachineInstr &MI,
int OpIdx) const {
MIB.addImm(MI.getOperand(OpIdx).getImm());
const MachineOperand &Op = MI.getOperand(OpIdx);
int64_t Imm;
if (Op.isReg() && mi_match(Op.getReg(), *MRI, m_ICst(Imm)))
MIB.addImm(Imm);
else
MIB.addImm(Op.getImm());
}

void AMDGPUInstructionSelector::renderOpSelTImm(MachineInstrBuilder &MIB,
15 changes: 15 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -3290,6 +3290,16 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
constrainOpWithReadfirstlane(B, MI, 2);
return;
}
case Intrinsic::amdgcn_s_prefetch_data: {
Register PtrReg = MI.getOperand(1).getReg();
unsigned AS = MRI.getType(PtrReg).getAddressSpace();
if (AMDGPU::isFlatGlobalAddrSpace(AS)) {
constrainOpWithReadfirstlane(B, MI, 1);
constrainOpWithReadfirstlane(B, MI, 2);
} else
MI.eraseFromParent();
return;
}
default: {
if (const AMDGPU::RsrcIntrinsic *RSrcIntrin =
AMDGPU::lookupRsrcIntrinsic(IntrID)) {
@@ -5151,6 +5161,11 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
}
case Intrinsic::amdgcn_pops_exiting_wave_id:
return getDefaultMappingSOP(MI);
case Intrinsic::amdgcn_s_prefetch_data: {
OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
break;
}
default:
return getInvalidInstructionMapping();
}
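
Because s_prefetch_data takes only scalar operands, a divergent base pointer or length is made uniform with v_readfirstlane here, and a prefetch of a non-global address space is deleted on this path. A sketch of that uniformization, mirroring the prefetch_data_vgpr_base_sgpr_len test below:

  ; a VGPR base pointer is read back into SGPRs before the scalar prefetch:
  call void @llvm.amdgcn.s.prefetch.data.p4(ptr addrspace(4) %divergent_ptr, i32 %len)
  ;   v_readfirstlane_b32 s2, v0
  ;   v_readfirstlane_b32 s3, v1
  ;   s_prefetch_data s[2:3], 0x0, s0, 0
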
13 changes: 13 additions & 0 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1430,6 +1430,13 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
return true;
}
case Intrinsic::amdgcn_s_prefetch_data: {
Info.opc = ISD::INTRINSIC_VOID;
Info.memVT = EVT::getIntegerVT(CI.getContext(), 8);
Info.ptrVal = CI.getArgOperand(0);
Info.flags |= MachineMemOperand::MOLoad;
return true;
}
default:
return false;
}
@@ -9921,6 +9928,12 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
auto NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
return SDValue(NewMI, 0);
}
case Intrinsic::amdgcn_s_prefetch_data: {
// For non-global address space preserve the chain and remove the call.
if (!AMDGPU::isFlatGlobalAddrSpace(cast<MemSDNode>(Op)->getAddressSpace()))
return Op.getOperand(0);
return Op;
}
default: {
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
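
The net effect of these two hunks is that the intrinsic is modelled as a load-like memory operation for selection, but when the pointer is not in a flat, global, or constant address space the node is dropped and only its chain is preserved. A small sketch of the dropped case, assuming an LDS pointer as in the prefetch_data_sgpr_base_imm_len_local test below:

  ; addrspace(3) (LDS) is not reachable by s_prefetch_data, so this call
  ; lowers to nothing and the function body reduces to s_endpgm
  call void @llvm.amdgcn.s.prefetch.data.p3(ptr addrspace(3) %lds, i32 31)
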
2 changes: 1 addition & 1 deletion llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -6231,7 +6231,7 @@ void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI,
SBase->setReg(SGPR);
}
MachineOperand *SOff = getNamedOperand(MI, AMDGPU::OpName::soffset);
if (SOff && !RI.isSGPRClass(MRI.getRegClass(SOff->getReg()))) {
if (SOff && !RI.isSGPRReg(MRI, SOff->getReg())) {
Register SGPR = readlaneVGPRToSGPR(SOff->getReg(), MI, MRI);
SOff->setReg(SGPR);
}
22 changes: 22 additions & 0 deletions llvm/lib/Target/AMDGPU/SMInstructions.td
@@ -1152,6 +1152,28 @@ multiclass SMPrefetchPat<string type, TImmLeaf cache_type> {
defm : SMPrefetchPat<"INST", i32imm_zero>;
defm : SMPrefetchPat<"DATA", i32imm_one>;

let SubtargetPredicate = isGFX12Plus in {
def : GCNPat <
(int_amdgcn_s_prefetch_data (SMRDImm i64:$sbase, i32:$offset), (i32 SReg_32:$len)),
(S_PREFETCH_DATA $sbase, $offset, $len, 0)
>;

def : GCNPat <
(int_amdgcn_s_prefetch_data (i64 SReg_64:$sbase), (i32 SReg_32:$len)),
(S_PREFETCH_DATA $sbase, 0, $len, 0)
>;

def : GCNPat <
(int_amdgcn_s_prefetch_data (SMRDImm i64:$sbase, i32:$offset), imm:$len),
(S_PREFETCH_DATA $sbase, $offset, (i32 SGPR_NULL), (as_i8timm $len))
>;

def : GCNPat <
(int_amdgcn_s_prefetch_data (i64 SReg_64:$sbase), imm:$len),
(S_PREFETCH_DATA $sbase, 0, (i32 SGPR_NULL), (as_i8timm $len))
>;
} // End let SubtargetPredicate = isGFX12Plus

//===----------------------------------------------------------------------===//
// GFX10.
//===----------------------------------------------------------------------===//
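
These patterns cover the combinations of a base with a folded immediate offset vs. a plain 64-bit SGPR base, and an SGPR length vs. an immediate length; a constant length is encoded in the instruction itself and the soffset operand becomes null. A short sketch of the expected selection, following the tests below:

  ; immediate length -> encoded directly, soffset = null
  call void @llvm.amdgcn.s.prefetch.data.p4(ptr addrspace(4) %p, i32 31)
  ;   s_prefetch_data s[0:1], 0x0, null, 31
  ; dynamic length -> passed in an SGPR, immediate length field = 0
  call void @llvm.amdgcn.s.prefetch.data.p4(ptr addrspace(4) %p, i32 %len)
  ;   s_prefetch_data s[0:1], 0x0, s2, 0
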
136 changes: 136 additions & 0 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.prefetch.data.ll
@@ -0,0 +1,136 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GCN,SDAG %s
; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GCN,GISEL %s

define amdgpu_ps void @prefetch_data_sgpr_base_sgpr_len(ptr addrspace(4) inreg %ptr, i32 inreg %len) {
; GCN-LABEL: prefetch_data_sgpr_base_sgpr_len:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_prefetch_data s[0:1], 0x0, s2, 0
; GCN-NEXT: s_endpgm
entry:
tail call void @llvm.amdgcn.s.prefetch.data.p4(ptr addrspace(4) %ptr, i32 %len)
ret void
}

define amdgpu_ps void @prefetch_data_sgpr_imm_base_sgpr_len(ptr addrspace(4) inreg %ptr, i32 inreg %len) {
; GCN-LABEL: prefetch_data_sgpr_imm_base_sgpr_len:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_prefetch_data s[0:1], 0x200, s2, 0
; GCN-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr addrspace(4) %ptr, i32 128
tail call void @llvm.amdgcn.s.prefetch.data.p4(ptr addrspace(4) %gep, i32 %len)
ret void
}

define amdgpu_ps void @prefetch_data_sgpr_base_imm_len(ptr addrspace(4) inreg %ptr) {
; GCN-LABEL: prefetch_data_sgpr_base_imm_len:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_prefetch_data s[0:1], 0x0, null, 31
; GCN-NEXT: s_endpgm
entry:
tail call void @llvm.amdgcn.s.prefetch.data.p4(ptr addrspace(4) %ptr, i32 31)
ret void
}

define amdgpu_ps void @prefetch_data_sgpr_imm_base_imm_len(ptr addrspace(4) inreg %ptr) {
; GCN-LABEL: prefetch_data_sgpr_imm_base_imm_len:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_prefetch_data s[0:1], 0x200, null, 31
; GCN-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr addrspace(4) %ptr, i32 128
tail call void @llvm.amdgcn.s.prefetch.data.p4(ptr addrspace(4) %gep, i32 31)
ret void
}

define amdgpu_ps void @prefetch_data_vgpr_base_sgpr_len(ptr addrspace(4) %ptr, i32 inreg %len) {
; GCN-LABEL: prefetch_data_vgpr_base_sgpr_len:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: v_readfirstlane_b32 s2, v0
; GCN-NEXT: v_readfirstlane_b32 s3, v1
; GCN-NEXT: s_prefetch_data s[2:3], 0x0, s0, 0
; GCN-NEXT: s_endpgm
entry:
tail call void @llvm.amdgcn.s.prefetch.data.p4(ptr addrspace(4) %ptr, i32 %len)
ret void
}

define amdgpu_ps void @prefetch_data_vgpr_imm_base_sgpr_len(ptr addrspace(4) %ptr, i32 inreg %len) {
; SDAG-LABEL: prefetch_data_vgpr_imm_base_sgpr_len:
; SDAG: ; %bb.0: ; %entry
; SDAG-NEXT: v_readfirstlane_b32 s2, v0
; SDAG-NEXT: v_readfirstlane_b32 s3, v1
; SDAG-NEXT: s_prefetch_data s[2:3], 0x200, s0, 0
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: prefetch_data_vgpr_imm_base_sgpr_len:
; GISEL: ; %bb.0: ; %entry
; GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x200, v0
; GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GISEL-NEXT: v_readfirstlane_b32 s2, v0
; GISEL-NEXT: v_readfirstlane_b32 s3, v1
; GISEL-NEXT: s_prefetch_data s[2:3], 0x0, s0, 0
; GISEL-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr addrspace(4) %ptr, i32 128
tail call void @llvm.amdgcn.s.prefetch.data.p4(ptr addrspace(4) %gep, i32 %len)
ret void
}

define amdgpu_ps void @prefetch_data_sgpr_base_vgpr_len(ptr addrspace(4) inreg %ptr, i32 %len) {
; GCN-LABEL: prefetch_data_sgpr_base_vgpr_len:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: v_readfirstlane_b32 s2, v0
; GCN-NEXT: s_prefetch_data s[0:1], 0x0, s2, 0
; GCN-NEXT: s_endpgm
entry:
tail call void @llvm.amdgcn.s.prefetch.data.p4(ptr addrspace(4) %ptr, i32 %len)
ret void
}

define amdgpu_ps void @prefetch_data_sgpr_base_imm_len_global(ptr addrspace(1) inreg %ptr) {
; GCN-LABEL: prefetch_data_sgpr_base_imm_len_global:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_prefetch_data s[0:1], 0x0, null, 31
; GCN-NEXT: s_endpgm
entry:
tail call void @llvm.amdgcn.s.prefetch.data.p1(ptr addrspace(1) %ptr, i32 31)
ret void
}

define amdgpu_ps void @prefetch_data_sgpr_base_imm_len_flat(ptr inreg %ptr) {
; GCN-LABEL: prefetch_data_sgpr_base_imm_len_flat:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_prefetch_data s[0:1], 0x0, null, 31
; GCN-NEXT: s_endpgm
entry:
tail call void @llvm.amdgcn.s.prefetch.data.p0(ptr %ptr, i32 31)
ret void
}

define amdgpu_ps void @prefetch_data_sgpr_base_imm_len_local(ptr addrspace(3) inreg %ptr) {
; GCN-LABEL: prefetch_data_sgpr_base_imm_len_local:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_endpgm
entry:
tail call void @llvm.amdgcn.s.prefetch.data.p3(ptr addrspace(3) %ptr, i32 31)
ret void
}

define amdgpu_ps void @prefetch_data_vgpr_base_imm_len(ptr addrspace(4) %ptr) {
; GCN-LABEL: prefetch_data_vgpr_base_imm_len:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: v_readfirstlane_b32 s0, v0
; GCN-NEXT: v_readfirstlane_b32 s1, v1
; GCN-NEXT: s_prefetch_data s[0:1], 0x0, null, 0
; GCN-NEXT: s_endpgm
entry:
tail call void @llvm.amdgcn.s.prefetch.data.p4(ptr addrspace(4) %ptr, i32 0)
ret void
}

declare void @llvm.amdgcn.s.prefetch.data.p4(ptr addrspace(4) %ptr, i32 %len)
declare void @llvm.amdgcn.s.prefetch.data.p1(ptr addrspace(1) %ptr, i32 %len)
declare void @llvm.amdgcn.s.prefetch.data.p0(ptr %ptr, i32 %len)
