Skip to content

Commit

Permalink
[SYCL] Change check_device_code HIP tests to use SYCL_EXTERNAL (#13990)
Browse files Browse the repository at this point in the history
Changed the HIP sycl/test/check_device_code lit tests to use
SYCL_EXTERNAL functions instead of writing entire programs.
  • Loading branch information
ianayl committed Jun 10, 2024
1 parent 5e269c8 commit 849299f
Show file tree
Hide file tree
Showing 5 changed files with 217 additions and 262 deletions.
74 changes: 33 additions & 41 deletions sycl/test/check_device_code/hip/atomic/amdgpu_unsafe_atomics.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,45 +5,37 @@

#include <sycl/sycl.hpp>

class intKernel;
class fpKernel;

int main() {
int *i;
float *f;
double *d;
sycl::queue{}.single_task<intKernel>([=] {
sycl::atomic_ref<int, sycl::memory_order_relaxed, sycl::memory_scope_device>
atomicInt(*i);
atomicInt.fetch_xor(1);
atomicInt.fetch_and(1);
atomicInt.fetch_or(1);
// CHECK: amdgpu_kernel void{{.*}}intKernel
// CHECK-SAFE: cmpxchg volatile
// CHECK-SAFE-NOT: atomicrmw
// CHECK-UNSAFE: atomicrmw volatile xor
// CHECK-UNSAFE: atomicrmw volatile and
// CHECK-UNSAFE: atomicrmw volatile or
// CHECK-UNSAFE-NOT: cmpxchg
});
sycl::queue{}.single_task<fpKernel>([=] {
sycl::atomic_ref<float, sycl::memory_order_relaxed,
sycl::memory_scope_device,
sycl::access::address_space::global_space>(*f)
.fetch_add(1.0f);
// CHECK: amdgpu_kernel void{{.*}}fpKernel
// CHECK-SAFE: atomicrmw volatile fadd
// CHECK-SAFE-NOT: llvm.amdgcn.global.atomic.fadd.f32
// CHECK-UNSAFE-FP: llvm.amdgcn.global.atomic.fadd.f32
// CHECK-UNSAFE-FP-NOT: atomicrmw volatile fadd
sycl::atomic_ref<double, sycl::memory_order_relaxed,
sycl::memory_scope_device,
sycl::access::address_space::global_space>(*d)
.fetch_add(1.0);
// CHECK-SAFE: cmpxchg
// CHECK-SAFE-NOT: llvm.amdgcn.global.atomic.fadd.f64
// CHECK-UNSAFE-FP: llvm.amdgcn.global.atomic.fadd.f64
// CHECK-UNSAFE-FP-NOT: cmpxchg
// CHECK: __CLANG_OFFLOAD_BUNDLE____END__ sycl-amdgcn-amd-amdhsa-
});
SYCL_EXTERNAL void intAtomicFunc(int *i) {
sycl::atomic_ref<int, sycl::memory_order_relaxed, sycl::memory_scope_device>
atomicInt(*i);
atomicInt.fetch_xor(1);
atomicInt.fetch_and(1);
atomicInt.fetch_or(1);
// CHECK: void{{.*}}intAtomicFunc
// CHECK-SAFE: cmpxchg volatile
// CHECK-SAFE-NOT: atomicrmw
// CHECK-UNSAFE: atomicrmw volatile xor
// CHECK-UNSAFE: atomicrmw volatile and
// CHECK-UNSAFE: atomicrmw volatile or
// CHECK-UNSAFE-NOT: cmpxchg
}

SYCL_EXTERNAL void fpAtomicFunc(float *f, double *d) {
sycl::atomic_ref<float, sycl::memory_order_relaxed, sycl::memory_scope_device,
sycl::access::address_space::global_space>(*f)
.fetch_add(1.0f);
// CHECK: void{{.*}}fpAtomicFunc
// CHECK-SAFE: atomicrmw volatile fadd
// CHECK-SAFE-NOT: llvm.amdgcn.global.atomic.fadd.f32
// CHECK-UNSAFE-FP: llvm.amdgcn.global.atomic.fadd.f32
// CHECK-UNSAFE-FP-NOT: atomicrmw volatile fadd
sycl::atomic_ref<double, sycl::memory_order_relaxed,
sycl::memory_scope_device,
sycl::access::address_space::global_space>(*d)
.fetch_add(1.0);
// CHECK-SAFE: cmpxchg
// CHECK-SAFE-NOT: llvm.amdgcn.global.atomic.fadd.f64
// CHECK-UNSAFE-FP: llvm.amdgcn.global.atomic.fadd.f64
// CHECK-UNSAFE-FP-NOT: cmpxchg
// CHECK: __CLANG_OFFLOAD_BUNDLE____END__ sycl-amdgcn-amd-amdhsa-
}
Original file line number Diff line number Diff line change
Expand Up @@ -7,62 +7,55 @@ using namespace sycl;
using namespace sycl::ext::oneapi::experimental::matrix;
using sycl::ext::oneapi::bfloat16;

int main() {
buffer<bfloat16, 1> bufA(nullptr, range<1>(1));
buffer<bfloat16, 1> bufB(nullptr, range<1>(1));
buffer<float, 1> bufC(nullptr, range<1>(1));
buffer<float, 1> bufD(nullptr, range<1>(1));
queue q;

q.submit([&](handler &cgh) {
sycl::accessor<bfloat16, 1, sycl::access::mode::read_write,
sycl::target::device>
accA(bufA, cgh);
sycl::accessor<bfloat16, 1, sycl::access::mode::read_write,
sycl::target::device>
accB(bufB, cgh);
sycl::accessor<float, 1, sycl::access::mode::read_write,
sycl::target::device>
accC(bufC, cgh);
sycl::accessor<float, 1, sycl::access::mode::read_write,
sycl::target::device>
accD(bufD, cgh);

cgh.parallel_for<class row_row_m16n16k16>(
nd_range<2>({1, 64}, {1, 64}),
[=](nd_item<2> item) [[sycl::reqd_work_group_size(1, 1, 64)]] {
sycl::sub_group sg = item.get_sub_group();

joint_matrix<sub_group, float, use::accumulator, 16, 16> sub_c{};
joint_matrix<sub_group, bfloat16, use::a, 16, 16, layout::row_major>
sub_a{};
joint_matrix<sub_group, bfloat16, use::b, 16, 16, layout::row_major>
sub_b{};
// CHECK: tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16bf16.1k(<4 x i16> zeroinitializer, <4 x i16> zeroinitializer, <4 x float> zeroinitializer, i32 0, i32 0, i32 0)
joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
joint_matrix_store(
sg, sub_c, accD.template get_multi_ptr<access::decorated::yes>(),
16, layout::row_major);
});

cgh.parallel_for<class row_col_m32n32k8>(
nd_range<2>({1, 64}, {1, 64}),
[=](nd_item<2> item) [[sycl::reqd_work_group_size(1, 1, 64)]] {
sycl::sub_group sg = item.get_sub_group();

joint_matrix<sub_group, float, use::accumulator, 32, 32> sub_c{};
joint_matrix<sub_group, bfloat16, use::a, 32, 8, layout::row_major>
sub_a{};
joint_matrix<sub_group, bfloat16, use::b, 8, 32, layout::col_major>
sub_b{};

// CHECK: tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x8bf16.1k(<4 x i16> zeroinitializer, <4 x i16> zeroinitializer, <16 x float> zeroinitializer, i32 0, i32 0, i32 0)
joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
joint_matrix_store(
sg, sub_c, accD.template get_multi_ptr<access::decorated::yes>(),
32, layout::row_major);
});
});

return 0;
};
SYCL_EXTERNAL [[sycl::reqd_work_group_size(1, 1, 64)]] void
row_row_m16n16k16(sycl::accessor<bfloat16, 1, sycl::access::mode::read_write,
sycl::target::device>
accA,
sycl::accessor<bfloat16, 1, sycl::access::mode::read_write,
sycl::target::device>
accB,
sycl::accessor<float, 1, sycl::access::mode::read_write,
sycl::target::device>
accC,
sycl::accessor<float, 1, sycl::access::mode::read_write,
sycl::target::device>
accD,
nd_item<2> item) {
sycl::sub_group sg = item.get_sub_group();

joint_matrix<sub_group, float, use::accumulator, 16, 16> sub_c{};
joint_matrix<sub_group, bfloat16, use::a, 16, 16, layout::row_major> sub_a{};
joint_matrix<sub_group, bfloat16, use::b, 16, 16, layout::row_major> sub_b{};
// CHECK: tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16bf16.1k(<4 x i16> zeroinitializer, <4 x i16> zeroinitializer, <4 x float> zeroinitializer, i32 0, i32 0, i32 0)
joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
joint_matrix_store(sg, sub_c,
accD.template get_multi_ptr<access::decorated::yes>(), 16,
layout::row_major);
}

SYCL_EXTERNAL [[sycl::reqd_work_group_size(1, 1, 64)]] void
row_col_m32n32k8(sycl::accessor<bfloat16, 1, sycl::access::mode::read_write,
sycl::target::device>
accA,
sycl::accessor<bfloat16, 1, sycl::access::mode::read_write,
sycl::target::device>
accB,
sycl::accessor<float, 1, sycl::access::mode::read_write,
sycl::target::device>
accC,
sycl::accessor<float, 1, sycl::access::mode::read_write,
sycl::target::device>
accD,
nd_item<2> item) {
sycl::sub_group sg = item.get_sub_group();

joint_matrix<sub_group, float, use::accumulator, 32, 32> sub_c{};
joint_matrix<sub_group, bfloat16, use::a, 32, 8, layout::row_major> sub_a{};
joint_matrix<sub_group, bfloat16, use::b, 8, 32, layout::col_major> sub_b{};

// CHECK: tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x8bf16.1k(<4 x i16> zeroinitializer, <4 x i16> zeroinitializer, <16 x float> zeroinitializer, i32 0, i32 0, i32 0)
joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
joint_matrix_store(sg, sub_c,
accD.template get_multi_ptr<access::decorated::yes>(), 32,
layout::row_major);
}
Original file line number Diff line number Diff line change
Expand Up @@ -6,45 +6,29 @@
using namespace sycl;
using namespace sycl::ext::oneapi::experimental::matrix;

int main() {
buffer<double, 1> bufA(nullptr, range<1>(1));
buffer<double, 1> bufB(nullptr, range<1>(1));
buffer<double, 1> bufC(nullptr, range<1>(1));
buffer<double, 1> bufD(nullptr, range<1>(1));
queue q;

q.submit([&](handler &cgh) {
sycl::accessor<double, 1, sycl::access::mode::read_write,
sycl::target::device>
accA(bufA, cgh);
sycl::accessor<double, 1, sycl::access::mode::read_write,
sycl::target::device>
accB(bufB, cgh);
sycl::accessor<double, 1, sycl::access::mode::read_write,
sycl::target::device>
accC(bufC, cgh);
sycl::accessor<double, 1, sycl::access::mode::read_write,
sycl::target::device>
accD(bufD, cgh);

cgh.parallel_for<class row_row_m16n16k4>(
nd_range<2>({1, 64}, {1, 64}),
[=](nd_item<2> item) [[sycl::reqd_work_group_size(1, 1, 64)]] {
sycl::sub_group sg = item.get_sub_group();

joint_matrix<sub_group, double, use::accumulator, 16, 16> sub_c{};
joint_matrix<sub_group, double, use::a, 16, 4, layout::row_major>
sub_a{};
joint_matrix<sub_group, double, use::b, 4, 16, layout::row_major>
sub_b{};

// CHECK: tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double {{.*}}, double {{.*}}, <4 x double> zeroinitializer, i32 0, i32 0, i32 0)
joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
joint_matrix_store(
sg, sub_c, accD.template get_multi_ptr<access::decorated::yes>(),
16, layout::row_major);
});
});

return 0;
};
SYCL_EXTERNAL [[sycl::reqd_work_group_size(1, 1, 64)]] void
row_row_m16n16k4(sycl::accessor<double, 1, sycl::access::mode::read_write,
sycl::target::device>
accA,
sycl::accessor<double, 1, sycl::access::mode::read_write,
sycl::target::device>
accB,
sycl::accessor<double, 1, sycl::access::mode::read_write,
sycl::target::device>
accC,
sycl::accessor<double, 1, sycl::access::mode::read_write,
sycl::target::device>
accD,
nd_item<2> item) {
sycl::sub_group sg = item.get_sub_group();

joint_matrix<sub_group, double, use::accumulator, 16, 16> sub_c{};
joint_matrix<sub_group, double, use::a, 16, 4, layout::row_major> sub_a{};
joint_matrix<sub_group, double, use::b, 4, 16, layout::row_major> sub_b{};

// CHECK: tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double {{.*}}, double {{.*}}, <4 x double> zeroinitializer, i32 0, i32 0, i32 0)
joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
joint_matrix_store(sg, sub_c,
accD.template get_multi_ptr<access::decorated::yes>(), 16,
layout::row_major);
}
113 changes: 53 additions & 60 deletions sycl/test/check_device_code/hip/matrix/matrix-hip-half-float-test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,63 +6,56 @@
using namespace sycl;
using namespace sycl::ext::oneapi::experimental::matrix;

int main() {
buffer<half, 1> bufA(nullptr, range<1>(1));
buffer<half, 1> bufB(nullptr, range<1>(1));
buffer<float, 1> bufC(nullptr, range<1>(1));
buffer<float, 1> bufD(nullptr, range<1>(1));
queue q;

q.submit([&](handler &cgh) {
sycl::accessor<half, 1, sycl::access::mode::read_write,
sycl::target::device>
accA(bufA, cgh);
sycl::accessor<half, 1, sycl::access::mode::read_write,
sycl::target::device>
accB(bufB, cgh);
sycl::accessor<float, 1, sycl::access::mode::read_write,
sycl::target::device>
accC(bufC, cgh);
sycl::accessor<float, 1, sycl::access::mode::read_write,
sycl::target::device>
accD(bufD, cgh);

cgh.parallel_for<class row_row_m16n16k16>(
nd_range<2>({1, 64}, {1, 64}),
[=](nd_item<2> item) [[sycl::reqd_work_group_size(1, 1, 64)]] {
sycl::sub_group sg = item.get_sub_group();

joint_matrix<sub_group, float, use::accumulator, 16, 16> sub_c{};
joint_matrix<sub_group, half, use::a, 16, 16, layout::row_major>
sub_a{};
joint_matrix<sub_group, half, use::b, 16, 16, layout::row_major>
sub_b{};

// CHECK: tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> zeroinitializer, <4 x half> zeroinitializer, <4 x float> zeroinitializer, i32 0, i32 0, i32 0)
joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
joint_matrix_store(
sg, sub_c, accD.template get_multi_ptr<access::decorated::yes>(),
16, layout::row_major);
});

cgh.parallel_for<class row_col_m32n32k8>(
nd_range<2>({1, 64}, {1, 64}),
[=](nd_item<2> item) [[sycl::reqd_work_group_size(1, 1, 64)]] {
sycl::sub_group sg = item.get_sub_group();

joint_matrix<sub_group, float, use::accumulator, 32, 32> sub_c{};
joint_matrix<sub_group, half, use::a, 32, 8, layout::row_major>
sub_a{};
joint_matrix<sub_group, half, use::b, 8, 32, layout::col_major>
sub_b{};

// CHECK: tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x8f16(<4 x half> zeroinitializer, <4 x half> zeroinitializer, <16 x float> zeroinitializer, i32 0, i32 0, i32 0)
joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
joint_matrix_store(
sg, sub_c, accD.template get_multi_ptr<access::decorated::yes>(),
32, layout::row_major);
});
});

return 0;
};
SYCL_EXTERNAL [[sycl::reqd_work_group_size(1, 1, 64)]] void
row_row_m16n16k16(sycl::accessor<half, 1, sycl::access::mode::read_write,
sycl::target::device>
accA,
sycl::accessor<half, 1, sycl::access::mode::read_write,
sycl::target::device>
accB,
sycl::accessor<float, 1, sycl::access::mode::read_write,
sycl::target::device>
accC,
sycl::accessor<float, 1, sycl::access::mode::read_write,
sycl::target::device>
accD,
nd_item<2> item) {
sycl::sub_group sg = item.get_sub_group();

joint_matrix<sub_group, float, use::accumulator, 16, 16> sub_c{};
joint_matrix<sub_group, half, use::a, 16, 16, layout::row_major> sub_a{};
joint_matrix<sub_group, half, use::b, 16, 16, layout::row_major> sub_b{};

// CHECK: tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> zeroinitializer, <4 x half> zeroinitializer, <4 x float> zeroinitializer, i32 0, i32 0, i32 0)
joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
joint_matrix_store(sg, sub_c,
accD.template get_multi_ptr<access::decorated::yes>(), 16,
layout::row_major);
}

SYCL_EXTERNAL [[sycl::reqd_work_group_size(1, 1, 64)]] void
row_col_m32n32k8(sycl::accessor<half, 1, sycl::access::mode::read_write,
sycl::target::device>
accA,
sycl::accessor<half, 1, sycl::access::mode::read_write,
sycl::target::device>
accB,
sycl::accessor<float, 1, sycl::access::mode::read_write,
sycl::target::device>
accC,
sycl::accessor<float, 1, sycl::access::mode::read_write,
sycl::target::device>
accD,
nd_item<2> item) {
sycl::sub_group sg = item.get_sub_group();

joint_matrix<sub_group, float, use::accumulator, 32, 32> sub_c{};
joint_matrix<sub_group, half, use::a, 32, 8, layout::row_major> sub_a{};
joint_matrix<sub_group, half, use::b, 8, 32, layout::col_major> sub_b{};

// CHECK: tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x8f16(<4 x half> zeroinitializer, <4 x half> zeroinitializer, <16 x float> zeroinitializer, i32 0, i32 0, i32 0)
joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
joint_matrix_store(sg, sub_c,
accD.template get_multi_ptr<access::decorated::yes>(), 32,
layout::row_major);
}
Loading

0 comments on commit 849299f

Please sign in to comment.