From bd840a40042c2c67f56079493d0bcdbfc70325ba Mon Sep 17 00:00:00 2001 From: Stanislav Mekhanoshin Date: Thu, 5 Sep 2024 15:14:31 -0700 Subject: [PATCH] [AMDGPU] Add target intrinsic for s_prefetch_data (#107133) --- clang/include/clang/Basic/BuiltinsAMDGPU.def | 1 + clang/lib/CodeGen/CGBuiltin.cpp | 3 + .../CodeGenOpenCL/builtins-amdgcn-gfx12.cl | 26 +++- llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 8 ++ .../AMDGPU/AMDGPUInstructionSelector.cpp | 7 +- .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 15 ++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 13 ++ llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 2 +- llvm/lib/Target/AMDGPU/SMInstructions.td | 22 +++ .../AMDGPU/llvm.amdgcn.s.prefetch.data.ll | 136 ++++++++++++++++++ 10 files changed, 230 insertions(+), 3 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.prefetch.data.ll diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index ab29ef38f7792f..5060647d357641 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -448,6 +448,7 @@ TARGET_BUILTIN(__builtin_amdgcn_s_barrier_join, "vi", "n", "gfx12-insts") TARGET_BUILTIN(__builtin_amdgcn_s_wakeup_barrier, "vi", "n", "gfx12-insts") TARGET_BUILTIN(__builtin_amdgcn_s_barrier_leave, "b", "n", "gfx12-insts") TARGET_BUILTIN(__builtin_amdgcn_s_get_barrier_state, "Uii", "n", "gfx12-insts") +TARGET_BUILTIN(__builtin_amdgcn_s_prefetch_data, "vvC*Ui", "nc", "gfx12-insts") TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b64_v2i32, "V2iV2i*1", "nc", "gfx12-insts,wavefrontsize32") TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v8i16, "V8sV8s*1", "nc", "gfx12-insts,wavefrontsize32") diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 02d8726baa4210..da7a1a55da5313 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -19608,6 +19608,9 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID, F, {EmitScalarExpr(E->getArg(0)), EmitScalarExpr(E->getArg(1)), EmitScalarExpr(E->getArg(2)), EmitScalarExpr(E->getArg(3))}); } + case AMDGPU::BI__builtin_amdgcn_s_prefetch_data: + return emitBuiltinWithOneOverloadedType<2>( + *this, E, Intrinsic::amdgcn_s_prefetch_data); default: return nullptr; } diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl index d9ec258e644c9d..34ee44afe0f104 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl @@ -256,4 +256,28 @@ void test_s_ttracedata_imm() __builtin_amdgcn_s_ttracedata_imm(1); } - +// CHECK-LABEL: @test_s_prefetch_data( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[FP_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[GP_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: [[CP_ADDR:%.*]] = alloca ptr addrspace(4), align 8, addrspace(5) +// CHECK-NEXT: [[LEN_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: store ptr [[FP:%.*]], ptr addrspace(5) [[FP_ADDR]], align 8 +// CHECK-NEXT: store ptr addrspace(1) [[GP:%.*]], ptr addrspace(5) [[GP_ADDR]], align 8 +// CHECK-NEXT: store ptr addrspace(4) [[CP:%.*]], ptr addrspace(5) [[CP_ADDR]], align 8 +// CHECK-NEXT: store i32 [[LEN:%.*]], ptr addrspace(5) [[LEN_ADDR]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr addrspace(5) [[FP_ADDR]], align 8 +// CHECK-NEXT: call void @llvm.amdgcn.s.prefetch.data.p0(ptr [[TMP0]], i32 0) +// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[GP_ADDR]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[LEN_ADDR]], align 4 +// CHECK-NEXT: call void @llvm.amdgcn.s.prefetch.data.p1(ptr addrspace(1) [[TMP1]], i32 [[TMP2]]) +// CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(4), ptr addrspace(5) [[CP_ADDR]], align 8 +// CHECK-NEXT: call void @llvm.amdgcn.s.prefetch.data.p4(ptr addrspace(4) [[TMP3]], i32 31) +// CHECK-NEXT: 
ret void +// +void test_s_prefetch_data(int *fp, global float *gp, constant char *cp, unsigned int len) +{ + __builtin_amdgcn_s_prefetch_data(fp, 0); + __builtin_amdgcn_s_prefetch_data(gp, len); + __builtin_amdgcn_s_prefetch_data(cp, 31); +} diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index dc13a35c66f9ab..a5259ba9eec36e 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -2689,6 +2689,14 @@ def int_amdgcn_global_load_tr_b128 : AMDGPULoadIntrinsic; def int_amdgcn_wave_id : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [NoUndef<RetIndex>, IntrNoMem, IntrSpeculatable]>; +def int_amdgcn_s_prefetch_data : + Intrinsic<[], + [llvm_anyptr_ty, // Pointer to constant/global memory + llvm_i32_ty], // Length to prefetch 0-31 (1-32 chunks, units of 128 bytes) + [IntrInaccessibleMemOrArgMemOnly, IntrWillReturn, NoCapture<ArgIndex<0>>, IntrNoCallback, IntrNoFree], + "", [SDNPMemOperand] + >; + //===----------------------------------------------------------------------===// // Deep learning intrinsics. 
//===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 3fcb364fc2c536..9bebd418bb426e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -5541,7 +5541,12 @@ void AMDGPUInstructionSelector::renderPopcntImm(MachineInstrBuilder &MIB, void AMDGPUInstructionSelector::renderTruncTImm(MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const { - MIB.addImm(MI.getOperand(OpIdx).getImm()); + const MachineOperand &Op = MI.getOperand(OpIdx); + int64_t Imm; + if (Op.isReg() && mi_match(Op.getReg(), *MRI, m_ICst(Imm))) + MIB.addImm(Imm); + else + MIB.addImm(Op.getImm()); } void AMDGPUInstructionSelector::renderOpSelTImm(MachineInstrBuilder &MIB, diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index 4737a322c255f4..a2e6842b760f65 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -3290,6 +3290,16 @@ void AMDGPURegisterBankInfo::applyMappingImpl( constrainOpWithReadfirstlane(B, MI, 2); return; } + case Intrinsic::amdgcn_s_prefetch_data: { + Register PtrReg = MI.getOperand(1).getReg(); + unsigned AS = MRI.getType(PtrReg).getAddressSpace(); + if (AMDGPU::isFlatGlobalAddrSpace(AS)) { + constrainOpWithReadfirstlane(B, MI, 1); + constrainOpWithReadfirstlane(B, MI, 2); + } else + MI.eraseFromParent(); + return; + } default: { if (const AMDGPU::RsrcIntrinsic *RSrcIntrin = AMDGPU::lookupRsrcIntrinsic(IntrID)) { @@ -5151,6 +5161,11 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { } case Intrinsic::amdgcn_pops_exiting_wave_id: return getDefaultMappingSOP(MI); + case Intrinsic::amdgcn_s_prefetch_data: { + OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); + OpdsMapping[2] = 
getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); + break; + } default: return getInvalidInstructionMapping(); } diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 81b52935ddf397..accc3084217f2b 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -1430,6 +1430,13 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore; return true; } + case Intrinsic::amdgcn_s_prefetch_data: { + Info.opc = ISD::INTRINSIC_VOID; + Info.memVT = EVT::getIntegerVT(CI.getContext(), 8); + Info.ptrVal = CI.getArgOperand(0); + Info.flags |= MachineMemOperand::MOLoad; + return true; + } default: return false; } @@ -9921,6 +9928,12 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, auto NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops); return SDValue(NewMI, 0); } + case Intrinsic::amdgcn_s_prefetch_data: { + // For non-global address space preserve the chain and remove the call. 
+ if (!AMDGPU::isFlatGlobalAddrSpace(cast(Op)->getAddressSpace())) + return Op.getOperand(0); + return Op; + } default: { if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = AMDGPU::getImageDimIntrinsicInfo(IntrinsicID)) diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 844f62abc26717..90e11df500bc9b 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -6231,7 +6231,7 @@ void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI, SBase->setReg(SGPR); } MachineOperand *SOff = getNamedOperand(MI, AMDGPU::OpName::soffset); - if (SOff && !RI.isSGPRClass(MRI.getRegClass(SOff->getReg()))) { + if (SOff && !RI.isSGPRReg(MRI, SOff->getReg())) { Register SGPR = readlaneVGPRToSGPR(SOff->getReg(), MI, MRI); SOff->setReg(SGPR); } diff --git a/llvm/lib/Target/AMDGPU/SMInstructions.td b/llvm/lib/Target/AMDGPU/SMInstructions.td index 9fc570bb85f24e..e7db4f49d9e549 100644 --- a/llvm/lib/Target/AMDGPU/SMInstructions.td +++ b/llvm/lib/Target/AMDGPU/SMInstructions.td @@ -1152,6 +1152,28 @@ multiclass SMPrefetchPat { defm : SMPrefetchPat<"INST", i32imm_zero>; defm : SMPrefetchPat<"DATA", i32imm_one>; +let SubtargetPredicate = isGFX12Plus in { + def : GCNPat < + (int_amdgcn_s_prefetch_data (SMRDImm i64:$sbase, i32:$offset), (i32 SReg_32:$len)), + (S_PREFETCH_DATA $sbase, $offset, $len, 0) + >; + + def : GCNPat < + (int_amdgcn_s_prefetch_data (i64 SReg_64:$sbase), (i32 SReg_32:$len)), + (S_PREFETCH_DATA $sbase, 0, $len, 0) + >; + + def : GCNPat < + (int_amdgcn_s_prefetch_data (SMRDImm i64:$sbase, i32:$offset), imm:$len), + (S_PREFETCH_DATA $sbase, $offset, (i32 SGPR_NULL), (as_i8timm $len)) + >; + + def : GCNPat < + (int_amdgcn_s_prefetch_data (i64 SReg_64:$sbase), imm:$len), + (S_PREFETCH_DATA $sbase, 0, (i32 SGPR_NULL), (as_i8timm $len)) + >; +} // End let SubtargetPredicate = isGFX12Plus + //===----------------------------------------------------------------------===// // GFX10. 
//===----------------------------------------------------------------------===// diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.prefetch.data.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.prefetch.data.ll new file mode 100644 index 00000000000000..54c39d78adb583 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.prefetch.data.ll @@ -0,0 +1,136 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GCN,SDAG %s +; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GCN,GISEL %s + +define amdgpu_ps void @prefetch_data_sgpr_base_sgpr_len(ptr addrspace(4) inreg %ptr, i32 inreg %len) { +; GCN-LABEL: prefetch_data_sgpr_base_sgpr_len: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_prefetch_data s[0:1], 0x0, s2, 0 +; GCN-NEXT: s_endpgm +entry: + tail call void @llvm.amdgcn.s.prefetch.data.p4(ptr addrspace(4) %ptr, i32 %len) + ret void +} + +define amdgpu_ps void @prefetch_data_sgpr_imm_base_sgpr_len(ptr addrspace(4) inreg %ptr, i32 inreg %len) { +; GCN-LABEL: prefetch_data_sgpr_imm_base_sgpr_len: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_prefetch_data s[0:1], 0x200, s2, 0 +; GCN-NEXT: s_endpgm +entry: + %gep = getelementptr i32, ptr addrspace(4) %ptr, i32 128 + tail call void @llvm.amdgcn.s.prefetch.data.p4(ptr addrspace(4) %gep, i32 %len) + ret void +} + +define amdgpu_ps void @prefetch_data_sgpr_base_imm_len(ptr addrspace(4) inreg %ptr) { +; GCN-LABEL: prefetch_data_sgpr_base_imm_len: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_prefetch_data s[0:1], 0x0, null, 31 +; GCN-NEXT: s_endpgm +entry: + tail call void @llvm.amdgcn.s.prefetch.data.p4(ptr addrspace(4) %ptr, i32 31) + ret void +} + +define amdgpu_ps void @prefetch_data_sgpr_imm_base_imm_len(ptr addrspace(4) inreg %ptr) { +; GCN-LABEL: prefetch_data_sgpr_imm_base_imm_len: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_prefetch_data s[0:1], 0x200, null, 
31 +; GCN-NEXT: s_endpgm +entry: + %gep = getelementptr i32, ptr addrspace(4) %ptr, i32 128 + tail call void @llvm.amdgcn.s.prefetch.data.p4(ptr addrspace(4) %gep, i32 31) + ret void +} + +define amdgpu_ps void @prefetch_data_vgpr_base_sgpr_len(ptr addrspace(4) %ptr, i32 inreg %len) { +; GCN-LABEL: prefetch_data_vgpr_base_sgpr_len: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: v_readfirstlane_b32 s2, v0 +; GCN-NEXT: v_readfirstlane_b32 s3, v1 +; GCN-NEXT: s_prefetch_data s[2:3], 0x0, s0, 0 +; GCN-NEXT: s_endpgm +entry: + tail call void @llvm.amdgcn.s.prefetch.data.p4(ptr addrspace(4) %ptr, i32 %len) + ret void +} + +define amdgpu_ps void @prefetch_data_vgpr_imm_base_sgpr_len(ptr addrspace(4) %ptr, i32 inreg %len) { +; SDAG-LABEL: prefetch_data_vgpr_imm_base_sgpr_len: +; SDAG: ; %bb.0: ; %entry +; SDAG-NEXT: v_readfirstlane_b32 s2, v0 +; SDAG-NEXT: v_readfirstlane_b32 s3, v1 +; SDAG-NEXT: s_prefetch_data s[2:3], 0x200, s0, 0 +; SDAG-NEXT: s_endpgm +; +; GISEL-LABEL: prefetch_data_vgpr_imm_base_sgpr_len: +; GISEL: ; %bb.0: ; %entry +; GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x200, v0 +; GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GISEL-NEXT: v_readfirstlane_b32 s2, v0 +; GISEL-NEXT: v_readfirstlane_b32 s3, v1 +; GISEL-NEXT: s_prefetch_data s[2:3], 0x0, s0, 0 +; GISEL-NEXT: s_endpgm +entry: + %gep = getelementptr i32, ptr addrspace(4) %ptr, i32 128 + tail call void @llvm.amdgcn.s.prefetch.data.p4(ptr addrspace(4) %gep, i32 %len) + ret void +} + +define amdgpu_ps void @prefetch_data_sgpr_base_vgpr_len(ptr addrspace(4) inreg %ptr, i32 %len) { +; GCN-LABEL: prefetch_data_sgpr_base_vgpr_len: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: v_readfirstlane_b32 s2, v0 +; GCN-NEXT: s_prefetch_data s[0:1], 0x0, s2, 0 +; GCN-NEXT: s_endpgm +entry: + tail call void @llvm.amdgcn.s.prefetch.data.p4(ptr addrspace(4) %ptr, i32 %len) + ret void +} + +define amdgpu_ps void 
@prefetch_data_sgpr_base_imm_len_global(ptr addrspace(1) inreg %ptr) { +; GCN-LABEL: prefetch_data_sgpr_base_imm_len_global: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_prefetch_data s[0:1], 0x0, null, 31 +; GCN-NEXT: s_endpgm +entry: + tail call void @llvm.amdgcn.s.prefetch.data.p1(ptr addrspace(1) %ptr, i32 31) + ret void +} + +define amdgpu_ps void @prefetch_data_sgpr_base_imm_len_flat(ptr inreg %ptr) { +; GCN-LABEL: prefetch_data_sgpr_base_imm_len_flat: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_prefetch_data s[0:1], 0x0, null, 31 +; GCN-NEXT: s_endpgm +entry: + tail call void @llvm.amdgcn.s.prefetch.data.p0(ptr %ptr, i32 31) + ret void +} + +define amdgpu_ps void @prefetch_data_sgpr_base_imm_len_local(ptr addrspace(3) inreg %ptr) { +; GCN-LABEL: prefetch_data_sgpr_base_imm_len_local: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_endpgm +entry: + tail call void @llvm.amdgcn.s.prefetch.data.p3(ptr addrspace(3) %ptr, i32 31) + ret void +} + +define amdgpu_ps void @prefetch_data_vgpr_base_imm_len(ptr addrspace(4) %ptr) { +; GCN-LABEL: prefetch_data_vgpr_base_imm_len: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: v_readfirstlane_b32 s0, v0 +; GCN-NEXT: v_readfirstlane_b32 s1, v1 +; GCN-NEXT: s_prefetch_data s[0:1], 0x0, null, 0 +; GCN-NEXT: s_endpgm +entry: + tail call void @llvm.amdgcn.s.prefetch.data.p4(ptr addrspace(4) %ptr, i32 0) + ret void +} + +declare void @llvm.amdgcn.s.prefetch.data.p4(ptr addrspace(4) %ptr, i32 %len) +declare void @llvm.amdgcn.s.prefetch.data.p1(ptr addrspace(1) %ptr, i32 %len) +declare void @llvm.amdgcn.s.prefetch.data.p0(ptr %ptr, i32 %len)