Skip to content

Commit

Permalink
clang/AMDGPU: Emit atomicrmw for global/flat fadd v2bf16 builtins (ll…
Browse files Browse the repository at this point in the history
  • Loading branch information
arsenm authored Aug 20, 2024
1 parent 0a22655 commit 5822cc2
Show file tree
Hide file tree
Showing 3 changed files with 30 additions and 24 deletions.
26 changes: 8 additions & 18 deletions clang/lib/CodeGen/CGBuiltin.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18955,22 +18955,6 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
CGM.getIntrinsic(IID, {ArgTy, Addr->getType(), Val->getType()});
return Builder.CreateCall(F, {Addr, Val});
}
case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2bf16:
case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16: {
Intrinsic::ID IID;
switch (BuiltinID) {
case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2bf16:
IID = Intrinsic::amdgcn_global_atomic_fadd_v2bf16;
break;
case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16:
IID = Intrinsic::amdgcn_flat_atomic_fadd_v2bf16;
break;
}
llvm::Value *Addr = EmitScalarExpr(E->getArg(0));
llvm::Value *Val = EmitScalarExpr(E->getArg(1));
llvm::Function *F = CGM.getIntrinsic(IID, {Addr->getType()});
return Builder.CreateCall(F, {Addr, Val});
}
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_i32:
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_v2i32:
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4i16:
Expand Down Expand Up @@ -19352,7 +19336,9 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2f16:
case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f32:
case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f64: {
case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f64:
case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2bf16:
case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16: {
llvm::AtomicRMWInst::BinOp BinOp;
switch (BuiltinID) {
case AMDGPU::BI__builtin_amdgcn_atomic_inc32:
Expand All @@ -19374,6 +19360,8 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2f16:
case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f32:
case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f64:
case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2bf16:
case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16:
BinOp = llvm::AtomicRMWInst::FAdd;
break;
case AMDGPU::BI__builtin_amdgcn_ds_fminf:
Expand Down Expand Up @@ -19418,7 +19406,9 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
AO = AtomicOrdering::Monotonic;

// The v2bf16 builtins use i16 instead of a natural bfloat type, so the operand is bitcast first.
if (BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16) {
if (BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16 ||
BuiltinID == AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2bf16 ||
BuiltinID == AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16) {
llvm::Type *V2BF16Ty = FixedVectorType::get(
llvm::Type::getBFloatTy(Builder.getContext()), 2);
Val = Builder.CreateBitCast(Val, V2BF16Ty);
Expand Down
18 changes: 14 additions & 4 deletions clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx12.cl
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ typedef short __attribute__((ext_vector_type(2))) short2;

// CHECK-LABEL: test_local_add_2bf16
// CHECK: [[BC0:%.+]] = bitcast <2 x i16> {{.+}} to <2 x bfloat>
// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x bfloat> [[BC0]] syncscope("agent") monotonic, align 4
// CHECK-NEXT: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x bfloat> [[BC0]] syncscope("agent") monotonic, align 4
// CHECK-NEXT: bitcast <2 x bfloat> [[RMW]] to <2 x i16>

// GFX12-LABEL: test_local_add_2bf16
Expand Down Expand Up @@ -57,7 +57,10 @@ half2 test_flat_add_2f16(__generic half2 *addr, half2 x) {
}

// CHECK-LABEL: test_flat_add_2bf16
// CHECK: call <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0(ptr %{{.*}}, <2 x i16> %{{.*}})
// CHECK: [[BC:%.+]] = bitcast <2 x i16> %{{.+}} to <2 x bfloat>
// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr %{{.+}}, <2 x bfloat> [[BC]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
// CHECK: bitcast <2 x bfloat> [[RMW]] to <2 x i16>

// GFX12-LABEL: test_flat_add_2bf16
// GFX12: flat_atomic_pk_add_bf16
short2 test_flat_add_2bf16(__generic short2 *addr, short2 x) {
Expand All @@ -84,7 +87,11 @@ void test_global_add_half2_noret(__global half2 *addr, half2 x) {
}

// CHECK-LABEL: test_global_add_2bf16
// CHECK: call <2 x i16> @llvm.amdgcn.global.atomic.fadd.v2bf16.p1(ptr addrspace(1) %{{.*}}, <2 x i16> %{{.*}})
// CHECK: [[BC:%.+]] = bitcast <2 x i16> %{{.+}} to <2 x bfloat>
// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(1) %{{.+}}, <2 x bfloat> [[BC]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
// CHECK: bitcast <2 x bfloat> [[RMW]] to <2 x i16>


// GFX12-LABEL: test_global_add_2bf16
// GFX12: global_atomic_pk_add_bf16 v2, v[0:1], v2, off th:TH_ATOMIC_RETURN
void test_global_add_2bf16(__global short2 *addr, short2 x) {
Expand All @@ -93,7 +100,10 @@ void test_global_add_2bf16(__global short2 *addr, short2 x) {
}

// CHECK-LABEL: test_global_add_2bf16_noret
// CHECK: call <2 x i16> @llvm.amdgcn.global.atomic.fadd.v2bf16.p1(ptr addrspace(1) %{{.*}}, <2 x i16> %{{.*}})
// CHECK: [[BC:%.+]] = bitcast <2 x i16> %{{.+}} to <2 x bfloat>
// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(1) %{{.+}}, <2 x bfloat> [[BC]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
// CHECK: bitcast <2 x bfloat> [[RMW]] to <2 x i16>

// GFX12-LABEL: test_global_add_2bf16_noret
// GFX12: global_atomic_pk_add_bf16 v[0:1], v2, off
void test_global_add_2bf16_noret(__global short2 *addr, short2 x) {
Expand Down
10 changes: 8 additions & 2 deletions clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx940.cl
Original file line number Diff line number Diff line change
Expand Up @@ -28,15 +28,21 @@ half2 test_flat_add_2f16(__generic half2 *addr, half2 x) {
}

// CHECK-LABEL: test_flat_add_2bf16
// CHECK: call <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0(ptr %{{.*}}, <2 x i16> %{{.*}})
// CHECK: [[BC0:%.+]] = bitcast <2 x i16> {{.+}} to <2 x bfloat>
// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr %{{.+}}, <2 x bfloat> [[BC0]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
// CHECK-NEXT: bitcast <2 x bfloat> [[RMW]] to <2 x i16>

// GFX940-LABEL: test_flat_add_2bf16
// GFX940: flat_atomic_pk_add_bf16
// Calls __builtin_amdgcn_flat_atomic_fadd_v2bf16 on a __generic short2 pointer
// and returns the builtin's result, so the returned value stays live.
short2 test_flat_add_2bf16(__generic short2 *addr, short2 x) {
return __builtin_amdgcn_flat_atomic_fadd_v2bf16(addr, x);
}

// CHECK-LABEL: test_global_add_2bf16
// CHECK: call <2 x i16> @llvm.amdgcn.global.atomic.fadd.v2bf16.p1(ptr addrspace(1) %{{.*}}, <2 x i16> %{{.*}})
// CHECK: [[BC0:%.+]] = bitcast <2 x i16> {{.+}} to <2 x bfloat>
// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(1) %{{.+}}, <2 x bfloat> [[BC0]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
// CHECK-NEXT: bitcast <2 x bfloat> [[RMW]] to <2 x i16>

// GFX940-LABEL: test_global_add_2bf16
// GFX940: global_atomic_pk_add_bf16
short2 test_global_add_2bf16(__global short2 *addr, short2 x) {
Expand Down

0 comments on commit 5822cc2

Please sign in to comment.