From 0bcabaeef56b86790b730f0af6f0c3427e977dd5 Mon Sep 17 00:00:00 2001 From: mmoadeli Date: Thu, 4 Apr 2024 12:31:23 +0100 Subject: [PATCH] [SYCL][HIP] Update amd joint_matrix tests to reflect changes to joint_matrix_mad API. (#13250) - The` joint_matrix_mad` API has been modified to accept the output as an argument to the function. This pull request updates the relevant tests to accommodate this change for amd gpu. - Minor update to check joint_matrix parameters in compile time. --- .../hip/matrix/compile-query-hip-gfx90a.cpp | 20 +++++++------------ .../matrix/matrix-hip-bfloat16-float-test.cpp | 13 ++++-------- .../matrix/matrix-hip-double-double-test.cpp | 8 ++------ .../hip/matrix/matrix-hip-half-float-test.cpp | 12 ++++------- .../hip/matrix/matrix-hip-int8-int32-test.cpp | 10 +++------- 5 files changed, 20 insertions(+), 43 deletions(-) diff --git a/sycl/test/check_device_code/hip/matrix/compile-query-hip-gfx90a.cpp b/sycl/test/check_device_code/hip/matrix/compile-query-hip-gfx90a.cpp index 8e590b8d537d4..ede01163dc3db 100644 --- a/sycl/test/check_device_code/hip/matrix/compile-query-hip-gfx90a.cpp +++ b/sycl/test/check_device_code/hip/matrix/compile-query-hip-gfx90a.cpp @@ -1,5 +1,4 @@ // REQUIRES: hip - // RUN: %clangxx -fsycl -fsycl-targets=amd_gpu_gfx90a %s -o compile-query-hip #include @@ -14,21 +13,16 @@ int main() { using myparams = matrix_params; - size_t dmsize = myparams::M; - size_t dnsize = myparams::N; - size_t dksize = myparams::K; - std::cout - << "sizes of AMD gpu gfx90a matrix_params chosen by the user are: M " - << dmsize << " N " << dnsize << " K " << dksize << std::endl; + static_assert(myparams::M == 32); + static_assert(myparams::N == 32); + static_assert(myparams::K == 8); // Sizes-only compile-time query: types are given, generate default sizes using myparams2 = matrix_params; - myparams2 p; - dmsize = myparams2::M; - dnsize = myparams2::N; - dksize = myparams2::K; - std::cout << "default AMD gpu gfx90a sizes matrix_params are: M " << dmsize - << " N " << dnsize << " K " << dksize << std::endl; + static_assert(myparams2::M == 16); + static_assert(myparams2::N == 16); + static_assert(myparams2::K == 4); + return 0; }; diff --git a/sycl/test/check_device_code/hip/matrix/matrix-hip-bfloat16-float-test.cpp b/sycl/test/check_device_code/hip/matrix/matrix-hip-bfloat16-float-test.cpp index 310df8cbe07ce..29843ac50f114 100644 --- a/sycl/test/check_device_code/hip/matrix/matrix-hip-bfloat16-float-test.cpp +++ b/sycl/test/check_device_code/hip/matrix/matrix-hip-bfloat16-float-test.cpp @@ -1,6 +1,4 @@ // REQUIRES: hip -// XFAIL: hip - // RUN: %clangxx -fsycl-device-only -fsycl-targets=amd_gpu_gfx90a -S -Xclang -emit-llvm %s -o -| FileCheck %s #include @@ -10,12 +8,10 @@ using namespace sycl::ext::oneapi::experimental::matrix; using sycl::ext::oneapi::bfloat16; int main() { - buffer bufA(nullptr, range<1>(1)); buffer bufB(nullptr, range<1>(1)); buffer bufC(nullptr, range<1>(1)); buffer bufD(nullptr, range<1>(1)); - queue q; q.submit([&](handler &cgh) { @@ -42,9 +38,8 @@ int main() { sub_a{}; joint_matrix sub_b{}; - - // CHECK: tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16bf16.1k(<4 x i16> %{{.*}}, <4 x i16> %{{.*}} <4 x float> zeroinitializer, i32 0, i32 0, i32 0) - sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c); + // CHECK: tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16bf16.1k(<4 x i16> zeroinitializer, <4 x i16> zeroinitializer, <4 x float> zeroinitializer, i32 0, i32 0, i32 0) + joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); joint_matrix_store( sg, sub_c, accD.template get_multi_ptr(), 16, layout::row_major); @@ -61,8 +56,8 @@ int main() { joint_matrix sub_b{}; - // CHECK: tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x8bf16.1k(<4 x i16> {{.*}}, <4 x i16> {{.*}}, <16 x float> zeroinitializer, i32 0, i32 0, i32 0) - sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c); + // CHECK: tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x8bf16.1k(<4 x i16> zeroinitializer, <4 x i16> zeroinitializer, <16 x float> zeroinitializer, i32 0, i32 0, i32 0) + joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); joint_matrix_store( sg, sub_c, accD.template get_multi_ptr(), 32, layout::row_major); diff --git a/sycl/test/check_device_code/hip/matrix/matrix-hip-double-double-test.cpp b/sycl/test/check_device_code/hip/matrix/matrix-hip-double-double-test.cpp index 4516e90969ade..e82e6fd0337db 100644 --- a/sycl/test/check_device_code/hip/matrix/matrix-hip-double-double-test.cpp +++ b/sycl/test/check_device_code/hip/matrix/matrix-hip-double-double-test.cpp @@ -1,6 +1,4 @@ // REQUIRES: hip -// XFAIL: hip - // RUN: %clangxx -fsycl-device-only -fsycl-targets=amd_gpu_gfx90a -S -Xclang -emit-llvm %s -o -| FileCheck %s #include @@ -9,12 +7,10 @@ using namespace sycl; using namespace sycl::ext::oneapi::experimental::matrix; int main() { - buffer bufA(nullptr, range<1>(1)); buffer bufB(nullptr, range<1>(1)); buffer bufC(nullptr, range<1>(1)); buffer bufD(nullptr, range<1>(1)); - queue q; q.submit([&](handler &cgh) { @@ -42,8 +38,8 @@ int main() { joint_matrix sub_b{}; - // CHECK: tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %{{.*}}, double %{{.*}}, <4 x double> zeroinitializer, i32 0, i32 0, i32 0) - sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c); + // CHECK: tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double {{.*}}, double {{.*}}, <4 x double> zeroinitializer, i32 0, i32 0, i32 0) + joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); joint_matrix_store( sg, sub_c, accD.template get_multi_ptr(), 16, layout::row_major); diff --git a/sycl/test/check_device_code/hip/matrix/matrix-hip-half-float-test.cpp b/sycl/test/check_device_code/hip/matrix/matrix-hip-half-float-test.cpp index b7703a040207f..2afe666034bf5 100644 --- a/sycl/test/check_device_code/hip/matrix/matrix-hip-half-float-test.cpp +++ b/sycl/test/check_device_code/hip/matrix/matrix-hip-half-float-test.cpp @@ -1,6 +1,4 @@ // REQUIRES: hip -// XFAIL: hip - // RUN: %clangxx -fsycl-device-only -fsycl-targets=amd_gpu_gfx90a -S -Xclang -emit-llvm %s -o -| FileCheck %s #include @@ -9,12 +7,10 @@ using namespace sycl; using namespace sycl::ext::oneapi::experimental::matrix; int main() { - buffer bufA(nullptr, range<1>(1)); buffer bufB(nullptr, range<1>(1)); buffer bufC(nullptr, range<1>(1)); buffer bufD(nullptr, range<1>(1)); - queue q; q.submit([&](handler &cgh) { @@ -42,8 +38,8 @@ int main() { joint_matrix sub_b{}; - // CHECK: tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %{{.*}}, <4 x half> %{{.*}}, <4 x float> zeroinitializer, i32 0, i32 0, i32 0) - sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c); + // CHECK: tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> zeroinitializer, <4 x half> zeroinitializer, <4 x float> zeroinitializer, i32 0, i32 0, i32 0) + joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); joint_matrix_store( sg, sub_c, accD.template get_multi_ptr(), 16, layout::row_major); @@ -60,8 +56,8 @@ int main() { joint_matrix sub_b{}; - // CHECK: tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x8f16(<4 x half> {{.*}}, <4 x half> {{.*}}, <16 x float> zeroinitializer, i32 0, i32 0, i32 0) - sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c); + // CHECK: tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x8f16(<4 x half> zeroinitializer, <4 x half> zeroinitializer, <16 x float> zeroinitializer, i32 0, i32 0, i32 0) + joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); joint_matrix_store( sg, sub_c, accD.template get_multi_ptr(), 32, layout::row_major); diff --git a/sycl/test/check_device_code/hip/matrix/matrix-hip-int8-int32-test.cpp b/sycl/test/check_device_code/hip/matrix/matrix-hip-int8-int32-test.cpp index dad30a221ac45..d39f7a8772717 100644 --- a/sycl/test/check_device_code/hip/matrix/matrix-hip-int8-int32-test.cpp +++ b/sycl/test/check_device_code/hip/matrix/matrix-hip-int8-int32-test.cpp @@ -1,6 +1,4 @@ // REQUIRES: hip -// XFAIL: hip - // RUN: %clangxx -fsycl-device-only -fsycl-targets=amd_gpu_gfx90a -S -Xclang -emit-llvm %s -o -| FileCheck %s #include @@ -9,12 +7,10 @@ using namespace sycl; using namespace sycl::ext::oneapi::experimental::matrix; int main() { - buffer bufA(nullptr, range<1>(1)); buffer bufB(nullptr, range<1>(1)); buffer bufC(nullptr, range<1>(1)); buffer bufD(nullptr, range<1>(1)); - queue q; q.submit([&](handler &cgh) { @@ -42,8 +38,8 @@ int main() { joint_matrix sub_b{}; - // CHECK: tail call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x16i8(i32 %{{.*}}, i32 %{{.*}}, <4 x i32> zeroinitializer, i32 0, i32 0, i32 0) - sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c); + // CHECK: tail call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x16i8(i32 {{.*}}, i32 {{.*}}, <4 x i32> zeroinitializer, i32 0, i32 0, i32 0) + joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); joint_matrix_store( sg, sub_c, accD.template get_multi_ptr(), 16, layout::row_major); @@ -61,7 +57,7 @@ int main() { sub_b{}; // CHECK: tail call <16 x i32> @llvm.amdgcn.mfma.i32.32x32x8i8(i32 {{.*}}, i32 {{.*}}, <16 x i32> zeroinitializer, i32 0, i32 0, i32 0) - sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c); + joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); joint_matrix_store( sg, sub_c, accD.template get_multi_ptr(), 32, layout::row_major);