diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index 7c93c19a410e43..539410f1ed05e6 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -2996,10 +2996,6 @@ multiclass AMDGPUMFp8SmfmacIntrinsic { def NAME#"_"#kind : AMDGPUMFp8SmfmacIntrinsic; } -// bf16 atomics use v2i16 argument since there is no bf16 data type in the llvm. -def int_amdgcn_global_atomic_fadd_v2bf16 : AMDGPUAtomicRtn; -def int_amdgcn_flat_atomic_fadd_v2bf16 : AMDGPUAtomicRtn; - defset list AMDGPUMFMAIntrinsics940 = { def int_amdgcn_mfma_i32_16x16x32_i8 : AMDGPUMfmaIntrinsic; def int_amdgcn_mfma_i32_32x32x16_i8 : AMDGPUMfmaIntrinsic; diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp index e24d119b781628..c6963edf5288ae 100644 --- a/llvm/lib/IR/AutoUpgrade.cpp +++ b/llvm/lib/IR/AutoUpgrade.cpp @@ -1034,7 +1034,9 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn, } if (Name.starts_with("ds.fadd") || Name.starts_with("ds.fmin") || - Name.starts_with("ds.fmax")) { + Name.starts_with("ds.fmax") || + Name.starts_with("global.atomic.fadd.v2bf16") || + Name.starts_with("flat.atomic.fadd.v2bf16")) { // Replaced with atomicrmw fadd/fmin/fmax, so there's no new // declaration. NewFn = nullptr; @@ -4042,7 +4044,9 @@ static Value *upgradeAMDGCNIntrinsicCall(StringRef Name, CallBase *CI, .StartsWith("ds.fmin", AtomicRMWInst::FMin) .StartsWith("ds.fmax", AtomicRMWInst::FMax) .StartsWith("atomic.inc.", AtomicRMWInst::UIncWrap) - .StartsWith("atomic.dec.", AtomicRMWInst::UDecWrap); + .StartsWith("atomic.dec.", AtomicRMWInst::UDecWrap) + .StartsWith("global.atomic.fadd", AtomicRMWInst::FAdd) + .StartsWith("flat.atomic.fadd", AtomicRMWInst::FAdd); unsigned NumOperands = CI->getNumOperands(); if (NumOperands < 3) // Malformed bitcode. @@ -4097,8 +4101,10 @@ static Value *upgradeAMDGCNIntrinsicCall(StringRef Name, CallBase *CI, Builder.CreateAtomicRMW(RMWOp, Ptr, Val, std::nullopt, Order, SSID); if (PtrTy->getAddressSpace() != 3) { - RMW->setMetadata("amdgpu.no.fine.grained.memory", - MDNode::get(F->getContext(), {})); + MDNode *EmptyMD = MDNode::get(F->getContext(), {}); + RMW->setMetadata("amdgpu.no.fine.grained.memory", EmptyMD); + if (RMWOp == AtomicRMWInst::FAdd && RetTy->isFloatTy()) + RMW->setMetadata("amdgpu.ignore.denormal.mode", EmptyMD); } if (IsVolatile) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td index c6dbc58395e48f..db8b44149cf47e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td @@ -620,12 +620,10 @@ multiclass local_addr_space_atomic_op { defm int_amdgcn_flat_atomic_fadd : noret_op; defm int_amdgcn_flat_atomic_fadd : flat_addr_space_atomic_op; -defm int_amdgcn_flat_atomic_fadd_v2bf16 : noret_op; defm int_amdgcn_flat_atomic_fmin : noret_op; defm int_amdgcn_flat_atomic_fmax : noret_op; defm int_amdgcn_global_atomic_fadd : global_addr_space_atomic_op; defm int_amdgcn_flat_atomic_fadd : global_addr_space_atomic_op; -defm int_amdgcn_global_atomic_fadd_v2bf16 : noret_op; defm int_amdgcn_global_atomic_fmin : noret_op; defm int_amdgcn_global_atomic_fmax : noret_op; defm int_amdgcn_global_atomic_csub : noret_op; diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index 5add368c05646a..12aa6ee2a2536a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -4907,8 +4907,6 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case Intrinsic::amdgcn_flat_atomic_fmax: case Intrinsic::amdgcn_flat_atomic_fmin_num: case Intrinsic::amdgcn_flat_atomic_fmax_num: - case Intrinsic::amdgcn_global_atomic_fadd_v2bf16: - case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16: case Intrinsic::amdgcn_atomic_cond_sub_u32: case Intrinsic::amdgcn_global_atomic_ordered_add_b64: case Intrinsic::amdgcn_global_load_tr_b64: diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td index 5c4d2b8d030e1d..48fb786ed97206 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td @@ -250,8 +250,6 @@ def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; -def : SourceOfDivergence; -def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td index 351563657aeb55..8067090636a9aa 100644 --- a/llvm/lib/Target/AMDGPU/FLATInstructions.td +++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -1674,13 +1674,11 @@ defm : FlatAtomicIntrPat <"FLAT_ATOMIC_ADD_F32", "int_amdgcn_flat_atomic_fadd", let OtherPredicates = [HasAtomicFlatPkAdd16Insts] in { defm : FlatAtomicIntrPat <"FLAT_ATOMIC_PK_ADD_F16", "int_amdgcn_flat_atomic_fadd", v2f16>; -defm : FlatAtomicIntrPat <"FLAT_ATOMIC_PK_ADD_BF16", "int_amdgcn_flat_atomic_fadd_v2bf16", v2i16>; defm : FlatAtomicPat <"FLAT_ATOMIC_PK_ADD_F16", "atomic_load_fadd_flat", v2f16>; defm : FlatAtomicPat <"FLAT_ATOMIC_PK_ADD_BF16", "atomic_load_fadd_flat", v2bf16>; } let OtherPredicates = [HasAtomicGlobalPkAddBF16Inst] in -defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_PK_ADD_BF16", "int_amdgcn_global_atomic_fadd_v2bf16", v2i16>; defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_PK_ADD_BF16", "atomic_load_fadd_global", v2bf16>; } // End OtherPredicates = [HasFlatGlobalInsts], AddedComplexity = 10 diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index ae55b56fbf43fb..d02d0bbb52e567 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -1362,9 +1362,7 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, case Intrinsic::amdgcn_flat_atomic_fmax: case Intrinsic::amdgcn_flat_atomic_fmin_num: case Intrinsic::amdgcn_flat_atomic_fmax_num: - case Intrinsic::amdgcn_global_atomic_fadd_v2bf16: - case Intrinsic::amdgcn_atomic_cond_sub_u32: - case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16: { + case Intrinsic::amdgcn_atomic_cond_sub_u32: { Info.opc = ISD::INTRINSIC_W_CHAIN; Info.memVT = MVT::getVT(CI.getType()); Info.ptrVal = CI.getOperand(0); @@ -1467,14 +1465,12 @@ bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II, case Intrinsic::amdgcn_ds_ordered_add: case Intrinsic::amdgcn_ds_ordered_swap: case Intrinsic::amdgcn_flat_atomic_fadd: - case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16: case Intrinsic::amdgcn_flat_atomic_fmax: case Intrinsic::amdgcn_flat_atomic_fmax_num: case Intrinsic::amdgcn_flat_atomic_fmin: case Intrinsic::amdgcn_flat_atomic_fmin_num: case Intrinsic::amdgcn_global_atomic_csub: case Intrinsic::amdgcn_global_atomic_fadd: - case Intrinsic::amdgcn_global_atomic_fadd_v2bf16: case Intrinsic::amdgcn_global_atomic_fmax: case Intrinsic::amdgcn_global_atomic_fmax_num: case Intrinsic::amdgcn_global_atomic_fmin: diff --git a/llvm/test/Bitcode/amdgcn-atomic.ll b/llvm/test/Bitcode/amdgcn-atomic.ll index a114c27bafd4a2..9563d178e64330 100644 --- a/llvm/test/Bitcode/amdgcn-atomic.ll +++ b/llvm/test/Bitcode/amdgcn-atomic.ll @@ -300,4 +300,26 @@ define float @upgrade_amdgcn_ds_fmax_f32_no_suffix(ptr addrspace(3) %ptr, float ret float %result0 } +declare <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0(ptr, <2 x i16>) + +define <2 x i16> @upgrade_amdgcn_flat_atomic_fadd_v2bf16_p0(ptr %ptr, <2 x i16> %data) { + ; CHECK: [[BC0:%.+]] = bitcast <2 x i16> %data to <2 x bfloat> + ; CHECK-NEXT: [[ATOMIC:%.+]] = atomicrmw fadd ptr %ptr, <2 x bfloat> [[BC0]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+$}} + ; CHECK-NEXT: [[BC1:%.+]] = bitcast <2 x bfloat> [[ATOMIC]] to <2 x i16> + ; CHECK-NEXT: ret <2 x i16> [[BC1]] + %result = call <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0(ptr %ptr, <2 x i16> %data) + ret <2 x i16> %result +} + +declare <2 x i16> @llvm.amdgcn.global.atomic.fadd.v2bf16.p1(ptr addrspace(1), <2 x i16>) + +define <2 x i16> @upgrade_amdgcn_global_atomic_fadd_v2bf16_p1(ptr addrspace(1) %ptr, <2 x i16> %data) { + ; CHECK: [[BC0:%.+]] = bitcast <2 x i16> %data to <2 x bfloat> + ; CHECK-NEXT: [[ATOMIC:%.+]] = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> [[BC0]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+$}} + ; CHECK-NEXT: [[BC1:%.+]] = bitcast <2 x bfloat> [[ATOMIC]] to <2 x i16> + ; CHECK-NEXT: ret <2 x i16> [[BC1]] + %result = call <2 x i16> @llvm.amdgcn.global.atomic.fadd.v2bf16.p1(ptr addrspace(1) %ptr, <2 x i16> %data) + ret <2 x i16> %result +} + attributes #0 = { argmemonly nounwind willreturn } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll index 085da8bc4f8d99..031a3633bd3757 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll @@ -4,12 +4,6 @@ declare float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr %ptr, float %data) declare <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %ptr, <2 x half> %data) -; bf16 atomics use v2i16 argument since there is no bf16 data type in the llvm. -declare <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0(ptr %ptr, <2 x i16> %data) -declare <2 x i16> @llvm.amdgcn.global.atomic.fadd.v2bf16.p1(ptr addrspace(1) %ptr, <2 x i16> %data) -declare <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %ptr, <2 x half> %data, i32, i32, i1) -declare <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %ptr, <2 x i16> %data) - define amdgpu_kernel void @flat_atomic_fadd_f32_noret(ptr %ptr, float %data) { ; GFX940-LABEL: flat_atomic_fadd_f32_noret: ; GFX940: ; %bb.0: @@ -106,106 +100,6 @@ define <2 x half> @flat_atomic_fadd_v2f16_rtn(ptr %ptr, <2 x half> %data) { ret <2 x half> %ret } -define amdgpu_kernel void @flat_atomic_fadd_v2bf16_noret(ptr %ptr, <2 x i16> %data) { -; GFX940-LABEL: flat_atomic_fadd_v2bf16_noret: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX940-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v2, s4 -; GFX940-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 -; GFX940-NEXT: s_endpgm - %ret = call <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0(ptr %ptr, <2 x i16> %data) - ret void -} - -define <2 x i16> @flat_atomic_fadd_v2bf16_rtn(ptr %ptr, <2 x i16> %data) { -; GFX940-LABEL: flat_atomic_fadd_v2bf16_rtn: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] - %ret = call <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0(ptr %ptr, <2 x i16> %data) - ret <2 x i16> %ret -} - -define amdgpu_kernel void @global_atomic_fadd_v2bf16_noret(ptr addrspace(1) %ptr, <2 x i16> %data) { -; GFX940-LABEL: global_atomic_fadd_v2bf16_noret: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX940-NEXT: v_mov_b32_e32 v1, 0 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v0, s4 -; GFX940-NEXT: global_atomic_pk_add_bf16 v1, v0, s[0:1] -; GFX940-NEXT: s_endpgm - %ret = call <2 x i16> @llvm.amdgcn.global.atomic.fadd.v2bf16.p1(ptr addrspace(1) %ptr, <2 x i16> %data) - ret void -} - -define <2 x i16> @global_atomic_fadd_v2bf16_rtn(ptr addrspace(1) %ptr, <2 x i16> %data) { -; GFX940-LABEL: global_atomic_fadd_v2bf16_rtn: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] - %ret = call <2 x i16> @llvm.amdgcn.global.atomic.fadd.v2bf16.p1(ptr addrspace(1) %ptr, <2 x i16> %data) - ret <2 x i16> %ret -} - -define amdgpu_kernel void @local_atomic_fadd_v2f16_noret(ptr addrspace(3) %ptr, <2 x half> %data) { -; GFX940-LABEL: local_atomic_fadd_v2f16_noret: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v0, s0 -; GFX940-NEXT: v_mov_b32_e32 v1, s1 -; GFX940-NEXT: ds_pk_add_f16 v0, v1 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: s_endpgm - %ret = call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %ptr, <2 x half> %data, i32 0, i32 0, i1 0) - ret void -} - -define <2 x half> @local_atomic_fadd_v2f16_rtn(ptr addrspace(3) %ptr, <2 x half> %data) { -; GFX940-LABEL: local_atomic_fadd_v2f16_rtn: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ds_pk_add_rtn_f16 v0, v0, v1 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] - %ret = call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %ptr, <2 x half> %data, i32 0, i32 0, i1 0) - ret <2 x half> %ret -} - -define amdgpu_kernel void @local_atomic_fadd_v2bf16_noret(ptr addrspace(3) %ptr, <2 x i16> %data) { -; GFX940-LABEL: local_atomic_fadd_v2bf16_noret: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v0, s0 -; GFX940-NEXT: v_mov_b32_e32 v1, s1 -; GFX940-NEXT: ds_pk_add_f16 v0, v1 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: s_endpgm - %ret = call <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %ptr, <2 x i16> %data) - ret void -} - -define <2 x i16> @local_atomic_fadd_v2bf16_rtn(ptr addrspace(3) %ptr, <2 x i16> %data) { -; GFX940-LABEL: local_atomic_fadd_v2bf16_rtn: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ds_pk_add_rtn_f16 v0, v0, v1 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] - %ret = call <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %ptr, <2 x i16> %data) - ret <2 x i16> %ret -} - define <2 x half> @local_atomic_fadd_ret_v2f16_offset(ptr addrspace(3) %ptr, <2 x half> %val) { ; GFX940-LABEL: local_atomic_fadd_ret_v2f16_offset: ; GFX940: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll index 1914b74be1909b..05259b4f51310d 100644 --- a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll +++ b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll @@ -7,127 +7,7 @@ declare <2 x bfloat> @llvm.amdgcn.struct.buffer.atomic.fadd.v2bf16(<2 x bfloat>, declare <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half>, <4 x i32>, i32, i32, i32) declare <2 x bfloat> @llvm.amdgcn.raw.buffer.atomic.fadd.v2bf16(<2 x bfloat> %val, <4 x i32> %rsrc, i32, i32, i32) declare <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %ptr, <2 x half> %data) -declare <2 x i16> @llvm.amdgcn.global.atomic.fadd.v2bf16.p1(ptr addrspace(1) %ptr, <2 x i16> %data) -declare <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %ptr, <2 x half> %data, i32, i32, i1) -declare <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %ptr, <2 x i16> %data) declare <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %ptr, <2 x half> %data) -declare <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0(ptr %ptr, <2 x i16> %data) - -define amdgpu_kernel void @local_atomic_fadd_v2f16_noret(ptr addrspace(3) %ptr, <2 x half> %data) { -; GFX12-SDAG-LABEL: local_atomic_fadd_v2f16_noret: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-SDAG-NEXT: global_wb scope:SCOPE_SE -; GFX12-SDAG-NEXT: ds_pk_add_f16 v0, v1 -; GFX12-SDAG-NEXT: s_wait_dscnt 0x0 -; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SE -; GFX12-SDAG-NEXT: s_endpgm -; -; GFX12-GISEL-LABEL: local_atomic_fadd_v2f16_noret: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-GISEL-NEXT: global_wb scope:SCOPE_SE -; GFX12-GISEL-NEXT: ds_pk_add_f16 v0, v1 -; GFX12-GISEL-NEXT: s_wait_dscnt 0x0 -; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SE -; GFX12-GISEL-NEXT: s_endpgm - %ret = call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %ptr, <2 x half> %data, i32 0, i32 0, i1 0) - ret void -} - -define amdgpu_kernel void @local_atomic_fadd_v2bf16_noret(ptr addrspace(3) %ptr, <2 x i16> %data) { -; GFX12-SDAG-LABEL: local_atomic_fadd_v2bf16_noret: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-SDAG-NEXT: global_wb scope:SCOPE_SE -; GFX12-SDAG-NEXT: ds_pk_add_bf16 v0, v1 -; GFX12-SDAG-NEXT: s_wait_dscnt 0x0 -; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SE -; GFX12-SDAG-NEXT: s_endpgm -; -; GFX12-GISEL-LABEL: local_atomic_fadd_v2bf16_noret: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-GISEL-NEXT: global_wb scope:SCOPE_SE -; GFX12-GISEL-NEXT: ds_pk_add_f16 v0, v1 -; GFX12-GISEL-NEXT: s_wait_dscnt 0x0 -; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SE -; GFX12-GISEL-NEXT: s_endpgm - %ret = call <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %ptr, <2 x i16> %data) - ret void -} - -define <2 x half> @local_atomic_fadd_v2f16_rtn(ptr addrspace(3) %ptr, <2 x half> %data) { -; GFX12-SDAG-LABEL: local_atomic_fadd_v2f16_rtn: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 -; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 -; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: global_wb scope:SCOPE_SE -; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 -; GFX12-SDAG-NEXT: ds_pk_add_rtn_f16 v0, v0, v1 -; GFX12-SDAG-NEXT: s_wait_dscnt 0x0 -; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SE -; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-GISEL-LABEL: local_atomic_fadd_v2f16_rtn: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 -; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 -; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: global_wb scope:SCOPE_SE -; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 -; GFX12-GISEL-NEXT: ds_pk_add_rtn_f16 v0, v0, v1 -; GFX12-GISEL-NEXT: s_wait_dscnt 0x0 -; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SE -; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] - %ret = call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %ptr, <2 x half> %data, i32 0, i32 0, i1 0) - ret <2 x half> %ret -} - -define <2 x i16> @local_atomic_fadd_v2bf16_rtn(ptr addrspace(3) %ptr, <2 x i16> %data) { -; GFX12-SDAG-LABEL: local_atomic_fadd_v2bf16_rtn: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 -; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 -; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: global_wb scope:SCOPE_SE -; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 -; GFX12-SDAG-NEXT: ds_pk_add_rtn_bf16 v0, v0, v1 -; GFX12-SDAG-NEXT: s_wait_dscnt 0x0 -; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SE -; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-GISEL-LABEL: local_atomic_fadd_v2bf16_rtn: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 -; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 -; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: global_wb scope:SCOPE_SE -; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 -; GFX12-GISEL-NEXT: ds_pk_add_rtn_f16 v0, v0, v1 -; GFX12-GISEL-NEXT: s_wait_dscnt 0x0 -; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SE -; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] - %ret = call <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %ptr, <2 x i16> %data) - ret <2 x i16> %ret -} define amdgpu_kernel void @flat_atomic_fadd_v2f16_noret(ptr %ptr, <2 x half> %data) { ; GFX12-SDAG-LABEL: flat_atomic_fadd_v2f16_noret: @@ -177,104 +57,6 @@ define <2 x half> @flat_atomic_fadd_v2f16_rtn(ptr %ptr, <2 x half> %data) { ret <2 x half> %ret } -define amdgpu_kernel void @flat_atomic_fadd_v2bf16_noret(ptr %ptr, <2 x i16> %data) { -; GFX12-SDAG-LABEL: flat_atomic_fadd_v2bf16_noret: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s2 -; GFX12-SDAG-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 -; GFX12-SDAG-NEXT: s_endpgm -; -; GFX12-GISEL-LABEL: flat_atomic_fadd_v2bf16_noret: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s2 -; GFX12-GISEL-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 -; GFX12-GISEL-NEXT: s_endpgm - %ret = call <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0(ptr %ptr, <2 x i16> %data) - ret void -} - -define <2 x i16> @flat_atomic_fadd_v2bf16_rtn(ptr %ptr, <2 x i16> %data) { -; GFX12-SDAG-LABEL: flat_atomic_fadd_v2bf16_rtn: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 -; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 -; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN -; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-GISEL-LABEL: flat_atomic_fadd_v2bf16_rtn: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 -; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 -; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN -; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] - %ret = call <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0(ptr %ptr, <2 x i16> %data) - ret <2 x i16> %ret -} - -define amdgpu_kernel void @global_atomic_fadd_v2bf16_noret(ptr addrspace(1) %ptr, <2 x i16> %data) { -; GFX12-SDAG-LABEL: global_atomic_fadd_v2bf16_noret: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 -; GFX12-SDAG-NEXT: global_atomic_pk_add_bf16 v0, v1, s[0:1] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-SDAG-NEXT: s_endpgm -; -; GFX12-GISEL-LABEL: global_atomic_fadd_v2bf16_noret: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 -; GFX12-GISEL-NEXT: global_atomic_pk_add_bf16 v1, v0, s[0:1] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-GISEL-NEXT: s_endpgm - %ret = call <2 x i16> @llvm.amdgcn.global.atomic.fadd.v2bf16.p1(ptr addrspace(1) %ptr, <2 x i16> %data) - ret void -} - -define <2 x i16> @global_atomic_fadd_v2bf16_rtn(ptr addrspace(1) %ptr, <2 x i16> %data) { -; GFX12-SDAG-LABEL: global_atomic_fadd_v2bf16_rtn: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 -; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 -; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN -; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 -; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-GISEL-LABEL: global_atomic_fadd_v2bf16_rtn: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 -; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 -; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN -; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 -; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] - %ret = call <2 x i16> @llvm.amdgcn.global.atomic.fadd.v2bf16.p1(ptr addrspace(1) %ptr, <2 x i16> %data) - ret <2 x i16> %ret -} - define void @global_atomic_pk_add_v2f16(ptr addrspace(1) %ptr, <2 x half> %data) { ; GFX12-SDAG-LABEL: global_atomic_pk_add_v2f16: ; GFX12-SDAG: ; %bb.0: ; %main_body diff --git a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll index 1be934d517ef71..5322a283d3de4d 100644 --- a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll +++ b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll @@ -4,10 +4,6 @@ declare float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr %ptr, float %data) declare <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %ptr, <2 x half> %data) - -; bf16 atomics use v2i16 argument since there is no bf16 data type in the llvm. -declare <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0(ptr %ptr, <2 x i16> %data) -declare <2 x i16> @llvm.amdgcn.global.atomic.fadd.v2bf16.p1(ptr addrspace(1) %ptr, <2 x i16> %data) declare <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %ptr, <2 x half> %data, i32, i32, i1) declare <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %ptr, <2 x i16> %data) @@ -186,97 +182,6 @@ define <2 x half> @flat_atomic_fadd_v2f16_rtn(ptr %ptr, <2 x half> %data) { ret <2 x half> %ret } -define amdgpu_kernel void @flat_atomic_fadd_v2bf16_noret(ptr %ptr, <2 x i16> %data) { -; GFX940-LABEL: flat_atomic_fadd_v2bf16_noret: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX940-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v2, s4 -; GFX940-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 -; GFX940-NEXT: s_endpgm -; -; GFX12-LABEL: flat_atomic_fadd_v2bf16_noret: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: v_mov_b32_e32 v2, s2 -; GFX12-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 -; GFX12-NEXT: s_endpgm - %ret = call <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0(ptr %ptr, <2 x i16> %data) - ret void -} - -define <2 x i16> @flat_atomic_fadd_v2bf16_rtn(ptr %ptr, <2 x i16> %data) { -; GFX940-LABEL: flat_atomic_fadd_v2bf16_rtn: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-LABEL: flat_atomic_fadd_v2bf16_rtn: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] - %ret = call <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0(ptr %ptr, <2 x i16> %data) - ret <2 x i16> %ret -} - -define amdgpu_kernel void @global_atomic_fadd_v2bf16_noret(ptr addrspace(1) %ptr, <2 x i16> %data) { -; GFX940-LABEL: global_atomic_fadd_v2bf16_noret: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX940-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, s4 -; GFX940-NEXT: global_atomic_pk_add_bf16 v0, v1, s[0:1] -; GFX940-NEXT: s_endpgm -; -; GFX12-LABEL: global_atomic_fadd_v2bf16_noret: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 -; GFX12-NEXT: global_atomic_pk_add_bf16 v0, v1, s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-NEXT: s_endpgm - %ret = call <2 x i16> @llvm.amdgcn.global.atomic.fadd.v2bf16.p1(ptr addrspace(1) %ptr, <2 x i16> %data) - ret void -} - -define <2 x i16> @global_atomic_fadd_v2bf16_rtn(ptr addrspace(1) %ptr, <2 x i16> %data) { -; GFX940-LABEL: global_atomic_fadd_v2bf16_rtn: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-LABEL: global_atomic_fadd_v2bf16_rtn: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] - %ret = call <2 x i16> @llvm.amdgcn.global.atomic.fadd.v2bf16.p1(ptr addrspace(1) %ptr, <2 x i16> %data) - ret <2 x i16> %ret -} - define amdgpu_kernel void @local_atomic_fadd_v2f16_noret(ptr addrspace(3) %ptr, <2 x half> %data) { ; GFX940-LABEL: local_atomic_fadd_v2f16_noret: ; GFX940: ; %bb.0: @@ -573,104 +478,6 @@ define void @flat_atomic_fadd_v2f16_intrinsic_noret__negoffset(ptr %ptr, <2 x ha ret void } -define <2 x i16> @flat_atomic_fadd_v2bf16_intrinsic_ret__posoffset(ptr %ptr, <2 x i16> %data) { -; GFX940-LABEL: flat_atomic_fadd_v2bf16_intrinsic_ret__posoffset: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 offset:4092 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-LABEL: flat_atomic_fadd_v2bf16_intrinsic_ret__posoffset: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 offset:4092 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x i16>, ptr %ptr, i64 1023 - %result = call <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0.v2bf16(ptr %gep, <2 x i16> %data) - ret <2 x i16> %result -} - -define <2 x i16> @flat_atomic_fadd_v2bf16_intrinsic_ret__negoffset(ptr %ptr, <2 x i16> %data) { -; GFX940-LABEL: flat_atomic_fadd_v2bf16_intrinsic_ret__negoffset: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffffc00, v0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX940-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-LABEL: flat_atomic_fadd_v2bf16_intrinsic_ret__negoffset: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 offset:-1024 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x i16>, ptr %ptr, i64 -256 - %result = call <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0.v2bf16(ptr %gep, <2 x i16> %data) - ret <2 x i16> %result -} - -define void @flat_atomic_fadd_v2bf16_intrinsic_noret__posoffset(ptr %ptr, <2 x i16> %data) { -; GFX940-LABEL: flat_atomic_fadd_v2bf16_intrinsic_noret__posoffset: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 offset:4092 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-LABEL: flat_atomic_fadd_v2bf16_intrinsic_noret__posoffset: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 offset:4092 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x i16>, ptr %ptr, i64 1023 - %unused = call <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0.v2bf16(ptr %gep, <2 x i16> %data) - ret void -} - -define void @flat_atomic_fadd_v2bf16_intrinsic_noret__negoffset(ptr %ptr, <2 x i16> %data) { -; GFX940-LABEL: flat_atomic_fadd_v2bf16_intrinsic_noret__negoffset: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffffc00, v0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX940-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-LABEL: flat_atomic_fadd_v2bf16_intrinsic_noret__negoffset: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 offset:-1024 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x i16>, ptr %ptr, i64 -256 - %unused = call <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0.v2bf16(ptr %gep, <2 x i16> %data) - ret void -} - attributes #0 = { "denormal-fp-math-f32"="ieee,ieee" } !0 = !{}