From ed0619b4caa24af8e78053ecef2e5e808e0e2b08 Mon Sep 17 00:00:00 2001 From: Yury Plyakhin Date: Tue, 23 Apr 2024 23:46:01 -0700 Subject: [PATCH] [SYCL][Joint Matrix] Support 1x64x16 bf16 combination (#13391) - add support in device_info - add support in tests --- sycl/source/detail/device_info.hpp | 2 + .../SG32/joint_matrix_bfloat16_32x64x16.cpp | 26 ----------- .../SG32/joint_matrix_bfloat16_32x64x32.cpp | 26 ----------- ....cpp => joint_matrix_bfloat16_packedB.cpp} | 9 +--- .../Matrix/joint_matrix_bfloat16_32x64x16.cpp | 26 ----------- .../Matrix/joint_matrix_bfloat16_32x64x32.cpp | 25 ----------- ....cpp => joint_matrix_bfloat16_packedB.cpp} | 9 +--- .../joint_matrix_bfloat16_packedB_impl.hpp | 43 +++++++++++++++++-- sycl/test-e2e/Matrix/runtime_query_pvc.cpp | 2 + 9 files changed, 45 insertions(+), 123 deletions(-) delete mode 100644 sycl/test-e2e/Matrix/SG32/joint_matrix_bfloat16_32x64x16.cpp delete mode 100644 sycl/test-e2e/Matrix/SG32/joint_matrix_bfloat16_32x64x32.cpp rename sycl/test-e2e/Matrix/SG32/{joint_matrix_bfloat16_16x16x16.cpp => joint_matrix_bfloat16_packedB.cpp} (68%) delete mode 100644 sycl/test-e2e/Matrix/joint_matrix_bfloat16_32x64x16.cpp delete mode 100644 sycl/test-e2e/Matrix/joint_matrix_bfloat16_32x64x32.cpp rename sycl/test-e2e/Matrix/{joint_matrix_bfloat16_16x16x16.cpp => joint_matrix_bfloat16_packedB.cpp} (67%) diff --git a/sycl/source/detail/device_info.hpp b/sycl/source/detail/device_info.hpp index 807887053446f..8acef8a4511d3 100644 --- a/sycl/source/detail/device_info.hpp +++ b/sycl/source/detail/device_info.hpp @@ -806,6 +806,8 @@ struct get_device_info_impl< matrix_type::fp32, matrix_type::fp32}, {0, 0, 0, 16, 16, 16, matrix_type::bf16, matrix_type::bf16, matrix_type::fp32, matrix_type::fp32}, + {0, 0, 0, 1, 64, 16, matrix_type::bf16, matrix_type::bf16, + matrix_type::fp32, matrix_type::fp32}, {0, 0, 0, 32, 64, 16, matrix_type::bf16, matrix_type::bf16, matrix_type::fp32, matrix_type::fp32}, {8, 0, 0, 0, 16, 8, matrix_type::tf32, matrix_type::tf32, diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_bfloat16_32x64x16.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_bfloat16_32x64x16.cpp deleted file mode 100644 index dcb3477103e15..0000000000000 --- a/sycl/test-e2e/Matrix/SG32/joint_matrix_bfloat16_32x64x16.cpp +++ /dev/null @@ -1,26 +0,0 @@ -//==----- joint_matrix_bfloat16_32x64x16.cpp - DPC++ joint_matrix----------==// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// REQUIRES: matrix -// REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 - -// RUN: %{build} -o %t.out -// RUN: %{run} %t.out - -// XFAIL: * - -#include "../common.hpp" - -using namespace sycl; -using namespace sycl::ext::oneapi::experimental::matrix; - -#define SG_SZ 32 -constexpr size_t TM = 32; -constexpr size_t TN = 64; -constexpr size_t TK = 16; - -#include "../joint_matrix_bfloat16_packedB_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_bfloat16_32x64x32.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_bfloat16_32x64x32.cpp deleted file mode 100644 index 8c0afa88b045e..0000000000000 --- a/sycl/test-e2e/Matrix/SG32/joint_matrix_bfloat16_32x64x32.cpp +++ /dev/null @@ -1,26 +0,0 @@ -//==----- joint_matrix_bfloat16_32x64x32.cpp - DPC++ joint_matrix----------==// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// REQUIRES: matrix -// REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 - -// RUN: %{build} -o %t.out -// RUN: %{run} %t.out - -// XFAIL: * - -#include "../common.hpp" - -using namespace sycl; -using namespace sycl::ext::oneapi::experimental::matrix; - -#define SG_SZ 32 -constexpr size_t TM = 32; -constexpr size_t TN = 64; -constexpr size_t TK = 32; - -#include "../joint_matrix_bfloat16_packedB_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_bfloat16_16x16x16.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_bfloat16_packedB.cpp similarity index 68% rename from sycl/test-e2e/Matrix/SG32/joint_matrix_bfloat16_16x16x16.cpp rename to sycl/test-e2e/Matrix/SG32/joint_matrix_bfloat16_packedB.cpp index c5ae61395cf8f..d2ba1cdbdd38e 100644 --- a/sycl/test-e2e/Matrix/SG32/joint_matrix_bfloat16_16x16x16.cpp +++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_bfloat16_packedB.cpp @@ -1,4 +1,4 @@ -//==----- joint_matrix_bfloat16_16x16x16.cpp - DPC++ joint_matrix----------==// +//==----- joint_matrix_bfloat16_packedB.cpp - DPC++ joint_matrix----------==// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -15,12 +15,5 @@ #include "../common.hpp" -using namespace sycl; -using namespace sycl::ext::oneapi::experimental::matrix; - #define SG_SZ 32 -constexpr size_t TM = 16; -constexpr size_t TN = 16; -constexpr size_t TK = 16; - #include "../joint_matrix_bfloat16_packedB_impl.hpp" diff --git a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_32x64x16.cpp b/sycl/test-e2e/Matrix/joint_matrix_bfloat16_32x64x16.cpp deleted file mode 100644 index c0dc58ec07492..0000000000000 --- a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_32x64x16.cpp +++ /dev/null @@ -1,26 +0,0 @@ -//==----- joint_matrix_bfloat16_32x64x16.cpp - DPC++ joint_matrix----------==// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// REQUIRES: matrix -// REQUIRES-INTEL-DRIVER: lin: 27868, win: 101.5181 - -// RUN: %{build} -o %t.out -// RUN: %{run} %t.out - -// XFAIL: cpu - -#include "common.hpp" - -using namespace sycl; -using namespace sycl::ext::oneapi::experimental::matrix; - -#define SG_SZ 16 -constexpr size_t TM = 32; -constexpr size_t TN = 64; -constexpr size_t TK = 16; - -#include "joint_matrix_bfloat16_packedB_impl.hpp" diff --git a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_32x64x32.cpp b/sycl/test-e2e/Matrix/joint_matrix_bfloat16_32x64x32.cpp deleted file mode 100644 index a7746ab9cca17..0000000000000 --- a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_32x64x32.cpp +++ /dev/null @@ -1,25 +0,0 @@ -//==----- joint_matrix_bfloat16_32x64x32.cpp - DPC++ joint_matrix----------==// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// REQUIRES: matrix - -// RUN: %{build} -o %t.out -// RUN: %{run} %t.out - -// XFAIL: * - -#include "common.hpp" - -using namespace sycl; -using namespace sycl::ext::oneapi::experimental::matrix; - -#define SG_SZ 16 -constexpr size_t TM = 32; -constexpr size_t TN = 64; -constexpr size_t TK = 32; - -#include "joint_matrix_bfloat16_packedB_impl.hpp" diff --git a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_16x16x16.cpp b/sycl/test-e2e/Matrix/joint_matrix_bfloat16_packedB.cpp similarity index 67% rename from sycl/test-e2e/Matrix/joint_matrix_bfloat16_16x16x16.cpp rename to sycl/test-e2e/Matrix/joint_matrix_bfloat16_packedB.cpp index a8de2ff40546d..3e80168752545 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_16x16x16.cpp +++ b/sycl/test-e2e/Matrix/joint_matrix_bfloat16_packedB.cpp @@ -1,4 +1,4 @@ -//==----- joint_matrix_bfloat16_16x16x16.cpp - DPC++ joint_matrix----------==// +//==----- joint_matrix_bfloat16_packedB.cpp - DPC++ joint_matrix----------==// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -13,12 +13,5 @@ #include "common.hpp" -using namespace sycl; -using namespace sycl::ext::oneapi::experimental::matrix; - #define SG_SZ 16 -constexpr size_t TM = 16; -constexpr size_t TN = 16; -constexpr size_t TK = 16; - #include "joint_matrix_bfloat16_packedB_impl.hpp" diff --git a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_packedB_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_bfloat16_packedB_impl.hpp index 772a74417b3fb..91156c3fcc128 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_packedB_impl.hpp +++ b/sycl/test-e2e/Matrix/joint_matrix_bfloat16_packedB_impl.hpp @@ -1,4 +1,13 @@ -template +//=----- joint_matrix_bfloat16_packedB_impl.hpp - DPC++ joint_matrix -------=// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//=-------------------------------------------------------------------------=// + +template void matrix_multiply(big_matrix &C, big_matrix &A, big_matrix &B) { size_t NDRangeM = M / TM; @@ -13,7 +22,7 @@ void matrix_multiply(big_matrix &C, big_matrix &A, auto accA = bufA.get_access(cgh); auto accB = bufB.get_access(cgh); - cgh.parallel_for( + cgh.parallel_for( nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}), [=](nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]] @@ -63,7 +72,7 @@ void matrix_multiply(big_matrix &C, big_matrix &A, }).wait(); } -int main() { +template int test() { static constexpr size_t MATRIX_M = TM * 2; static constexpr size_t MATRIX_N = TN * 2; static constexpr size_t MATRIX_K = TK * 2; @@ -83,12 +92,38 @@ int main() { big_matrix MD((float *)&D); big_matrix MA((bfloat16 *)&A); big_matrix MB((bfloat16 *)&B); - matrix_multiply(MC, MA, MB); + matrix_multiply(MC, MA, MB); matrix_multiply_ref( (bfloat16 *)A, (bfloat16 *)B, (float *)D, MATRIX_M, MATRIX_N, MATRIX_K / 2); bool res = matrix_compare(MATRIX_M, MATRIX_N, (float *)C, (float *)D); + std::cout << TM << "x" << TN << "x" << TK << " "; std::cout << (res ? "passed" : "failed") << std::endl; return !res; } + +int main() { + queue q; + std::vector combinations = + q.get_device() + .get_info(); + + int ret = 0; + for (auto &combination : combinations) { + if (combination.nsize == 0) { // Intel AMX + ret += test<16, 16, 16, class amx16x16x16>(); + break; + } + + if (combination.nsize == 16) { // architecture::intel_gpu_pvc + ret += test<16, 16, 16, class pvc16x16x16>(); + ret += test<32, 64, 16, class pvc32x64x16>(); + ret += test<1, 64, 16, class pvc1x64x16>(); + break; + } + } + + return ret; +} diff --git a/sycl/test-e2e/Matrix/runtime_query_pvc.cpp b/sycl/test-e2e/Matrix/runtime_query_pvc.cpp index 027a2ad7bdc6c..a1f150c2e89d5 100644 --- a/sycl/test-e2e/Matrix/runtime_query_pvc.cpp +++ b/sycl/test-e2e/Matrix/runtime_query_pvc.cpp @@ -31,6 +31,8 @@ int main() { matrix_type::fp32, matrix_type::fp32}, {0, 0, 0, 16, 16, 16, matrix_type::bf16, matrix_type::bf16, matrix_type::fp32, matrix_type::fp32}, + {0, 0, 0, 1, 64, 16, matrix_type::bf16, matrix_type::bf16, + matrix_type::fp32, matrix_type::fp32}, {0, 0, 0, 32, 64, 16, matrix_type::bf16, matrix_type::bf16, matrix_type::fp32, matrix_type::fp32}, {8, 0, 0, 0, 16, 8, matrix_type::tf32, matrix_type::tf32,