From c873fe2441414e36d82486347b1e4bd8ff79ab1a Mon Sep 17 00:00:00 2001
From: Dounia Khaldi
Date: Mon, 29 Jan 2024 08:29:12 -0600
Subject: [PATCH] [SYCL][joint matrix] add missing licence to test and add
 combination-based query (#12489)

---
 .../Matrix/SG32/element_wise_all_ops.cpp      |  15 +-
 .../Matrix/XMX8/element_wise_all_ops.cpp      |  25 ----
 sycl/test-e2e/Matrix/common.hpp               |  31 +++++
 sycl/test-e2e/Matrix/element_wise_all_ops.cpp |  16 +--
 .../Matrix/element_wise_all_ops_impl.hpp      | 131 ++++++++++--------
 5 files changed, 111 insertions(+), 107 deletions(-)
 delete mode 100644 sycl/test-e2e/Matrix/XMX8/element_wise_all_ops.cpp

diff --git a/sycl/test-e2e/Matrix/SG32/element_wise_all_ops.cpp b/sycl/test-e2e/Matrix/SG32/element_wise_all_ops.cpp
index 91b36ee032e27..e2831636da0d8 100644
--- a/sycl/test-e2e/Matrix/SG32/element_wise_all_ops.cpp
+++ b/sycl/test-e2e/Matrix/SG32/element_wise_all_ops.cpp
@@ -5,21 +5,16 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-// REQUIRES: matrix
+// REQUIRES: cpu, gpu
 // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943
+// SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2
+// UNSUPPORTED: gpu-intel-dg2
 
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
-#include <iostream>
-#include <random>
-#include <sycl/sycl.hpp>
+#include "../common.hpp"
 
-using namespace sycl;
-using namespace sycl::ext::oneapi::experimental::matrix;
-using bfloat16 = sycl::ext::oneapi::bfloat16;
-
-constexpr size_t SG_SZ = 32;
-constexpr size_t TN = 16;
+#define SG_SZ 32
 
 #include "../element_wise_all_ops_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/XMX8/element_wise_all_ops.cpp b/sycl/test-e2e/Matrix/XMX8/element_wise_all_ops.cpp
deleted file mode 100644
index f1f7bf84899a4..0000000000000
--- a/sycl/test-e2e/Matrix/XMX8/element_wise_all_ops.cpp
+++ /dev/null
@@ -1,25 +0,0 @@
-//==------------ element_wise_all_ops.cpp - DPC++ joint_matrix-------------==//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-// REQUIRES: matrix-xmx8
-// REQUIRES: TEMPORARY_DISBLED
-
-// RUN: %{build} -o %t.out
-// RUN: %{run} %t.out
-
-#include <iostream>
-#include <random>
-#include <sycl/sycl.hpp>
-
-using namespace sycl;
-using namespace sycl::ext::oneapi::experimental::matrix;
-using bfloat16 = sycl::ext::oneapi::bfloat16;
-
-#define SG_SZ 8
-constexpr size_t TN = 8;
-
-#include "../element_wise_all_ops_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/common.hpp b/sycl/test-e2e/Matrix/common.hpp
index 675261a17f3cb..155cf012d9754 100644
--- a/sycl/test-e2e/Matrix/common.hpp
+++ b/sycl/test-e2e/Matrix/common.hpp
@@ -1,3 +1,10 @@
+//==------------------ common.hpp - DPC++ joint_matrix---------------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
 #include <iostream>
 #include <random>
 #include <sycl/sycl.hpp>
@@ -173,3 +180,27 @@ bool matrix_compare(unsigned int rows, unsigned int cols, T1 *src, T2 *ref) {
   }
   return true;
 }
+
+bool is_type_supported_by_device(queue q, matrix_type type) {
+  std::vector<combination> combinations =
+      q.get_device()
+          .get_info<sycl::ext::oneapi::experimental::info::device::
+                        matrix_combinations>();
+  for (int i = 0; i < combinations.size(); i++) {
+    if (combinations[i].atype == type) {
+      return true;
+    }
+  }
+  return false;
+}
+
+template <typename KernelName> size_t get_sg_size(queue q) {
+  auto KernelID = get_kernel_id<KernelName>();
+  auto KB =
+      get_kernel_bundle<bundle_state::executable>(q.get_context(), {KernelID});
+  auto kernel = KB.get_kernel(KernelID);
+
+  return kernel
+      .template get_info<info::kernel_device_specific::max_sub_group_size>(
+          q.get_device());
+}
diff --git a/sycl/test-e2e/Matrix/element_wise_all_ops.cpp b/sycl/test-e2e/Matrix/element_wise_all_ops.cpp
index fd3648664a52c..4ee8383154e85 100644
--- a/sycl/test-e2e/Matrix/element_wise_all_ops.cpp
+++ b/sycl/test-e2e/Matrix/element_wise_all_ops.cpp
@@ -5,20 +5,14 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-// REQUIRES: matrix
+// REQUIRES: cpu, gpu
+// Test is flaky/timeouts on some variants of DG2 and temporary disabled. Needs
+// to be investigated.
+// UNSUPPORTED: gpu-intel-dg2
 
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
-#include <iostream>
-#include <random>
-#include <sycl/sycl.hpp>
-
-using namespace sycl;
-using namespace sycl::ext::oneapi::experimental::matrix;
-using bfloat16 = sycl::ext::oneapi::bfloat16;
-
-#define SG_SZ 16
-constexpr size_t TN = 16;
+#include "common.hpp"
 
 #include "element_wise_all_ops_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/element_wise_all_ops_impl.hpp b/sycl/test-e2e/Matrix/element_wise_all_ops_impl.hpp
index b11d3093bf08d..55d1162ebd3af 100644
--- a/sycl/test-e2e/Matrix/element_wise_all_ops_impl.hpp
+++ b/sycl/test-e2e/Matrix/element_wise_all_ops_impl.hpp
@@ -5,24 +5,6 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-
-static float make_fp32(bfloat16 x) {
-  unsigned int y = *((int *)&x);
-  y = y << 16;
-  float *res = reinterpret_cast<float *>(&y);
-  return *res;
-}
-
-template <typename T, size_t NUM_ROWS, size_t NUM_COLS> struct big_matrix {
-public:
-  T *mat;
-
-public:
-  T *get_data() { return mat; }
-  void set_data(T *data) { mat = data; }
-  big_matrix(T *data) : mat(data) {}
-};
-
 template <typename T, size_t M, size_t N>
 void assert_ops_ref(host_accessor<T, 2, access::mode::read> mat,
                     const float ref) {
@@ -39,7 +21,7 @@ void assert_ops_ref(host_accessor<T, 2, access::mode::read> mat,
 }
 
 template <typename T, size_t NUM_ROWS, size_t NUM_COLS, size_t SUB_ROWS,
-          size_t SUB_COLS, typename OP>
+          size_t SUB_COLS, class kernel_name, typename OP>
 void verify_op_a(const T l, const T r, const float ref, OP op) {
   T mat[NUM_ROWS][NUM_COLS];
   big_matrix<T, NUM_ROWS, NUM_COLS> big_mat((T *)&mat);
@@ -47,12 +29,17 @@ void verify_op_a(const T l, const T r, const float ref, OP op) {
 
   buffer<T, 2> bufMat(big_mat.get_data(), range<2>(NUM_ROWS, NUM_COLS));
   queue q;
+  size_t sg_size = get_sg_size<kernel_name>(q);
   q.submit([&](handler &cgh) {
     sycl::accessor accessMat{bufMat, cgh, sycl::read_write};
-    cgh.parallel_for(
-        nd_range<2>({NUM_ROWS / SUB_ROWS, NUM_COLS / SUB_COLS * SG_SZ},
-                    {1, 1 * SG_SZ}),
-        [=](nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]] {
+    cgh.parallel_for<kernel_name>(
+        nd_range<2>({NUM_ROWS / SUB_ROWS, NUM_COLS / SUB_COLS * sg_size},
+                    {1, 1 * sg_size}),
+        [=](nd_item<2> spmd_item)
+#ifdef SG_SZ
+        [[intel::reqd_sub_group_size(SG_SZ)]]
+#endif
+        {
           const auto global_idx = spmd_item.get_global_id(0);
          const auto global_idy = spmd_item.get_global_id(1);
          const auto sg_startx = global_idx - spmd_item.get_local_id(0);
@@ -68,7 +55,7 @@ void verify_op_a(const T l, const T r, const float ref, OP op) {
             sg, sub_mat,
             accessMat.template get_multi_ptr<access::decorated::no>() +
                 (sg_startx * SUB_ROWS) * NUM_COLS +
-                sg_starty / SG_SZ * SUB_COLS,
+                sg_starty / sg_size * SUB_COLS,
             NUM_COLS);
       }); // parallel for
     }).wait();
@@ -76,20 +63,24 @@ void verify_op_a(const T l, const T r, const float ref, OP op) {
 }
 
 template <typename T, size_t NUM_ROWS, size_t NUM_COLS, size_t SUB_ROWS,
-          size_t SUB_COLS, typename OP>
+          size_t SUB_COLS, class kernel_name, typename OP>
 void verify_op_c(const T l, const T r, const float ref, OP op) {
   T mat[NUM_ROWS][NUM_COLS];
   big_matrix<T, NUM_ROWS, NUM_COLS> big_mat((T *)&mat);
 
   buffer<T, 2> bufMat(big_mat.get_data(), range<2>(NUM_ROWS, NUM_COLS));
-
   queue q;
+  size_t sg_size = get_sg_size<kernel_name>(q);
   q.submit([&](handler &cgh) {
     sycl::accessor accessMat{bufMat, cgh, sycl::read_write};
-    cgh.parallel_for(
-        nd_range<2>({NUM_ROWS / SUB_ROWS, NUM_COLS / SUB_COLS * SG_SZ},
-                    {1, 1 * SG_SZ}),
-        [=](nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]] {
+    cgh.parallel_for<kernel_name>(
+        nd_range<2>({NUM_ROWS / SUB_ROWS, NUM_COLS / SUB_COLS * sg_size},
+                    {1, 1 * sg_size}),
+        [=](nd_item<2> spmd_item)
+#ifdef SG_SZ
+        [[intel::reqd_sub_group_size(SG_SZ)]]
+#endif
+        {
          const auto global_idx = spmd_item.get_global_id(0);
          const auto global_idy = spmd_item.get_global_id(1);
          const auto sg_startx = global_idx - spmd_item.get_local_id(0);
@@ -105,85 +96,103 @@ void verify_op_c(const T l, const T r, const float ref, OP op) {
             sg, sub_mat,
             accessMat.template get_multi_ptr<access::decorated::no>() +
                 (sg_startx * SUB_ROWS) * NUM_COLS +
-                sg_starty / SG_SZ * SUB_COLS,
+                sg_starty / sg_size * SUB_COLS,
             NUM_COLS, layout::row_major);
       }); // parallel for
     }).wait();
   assert_ops_ref<T, NUM_ROWS, NUM_COLS>(bufMat.get_host_access(read_only), ref);
 }
 
+// Avoid same kernel name for different types
+template <typename T, class name> class ewops_a {};
 template <typename T, size_t M, size_t N> void test_ewops_a() {
-  verify_op_a<T, M, N, 8, 16>(
+  verify_op_a<T, M, N, 8, 16, ewops_a<T, class a_add>>(
       T(5.0), T(2.0), 7.0, [](auto l, auto r) { return l + r; });
-  verify_op_a<T, M, N, 8, 16>(
+  verify_op_a<T, M, N, 8, 16, ewops_a<T, class a_sub>>(
       T(5.0), T(2.0), 3.0, [](auto l, auto r) { return l - r; });
-  verify_op_a<T, M, N, 8, 16>(
+  verify_op_a<T, M, N, 8, 16, ewops_a<T, class a_mul>>(
      T(5.0), T(2.0), 10.0, [](auto l, auto r) { return l * r; });
-  verify_op_a<T, M, N, 8, 16>(
+  verify_op_a<T, M, N, 8, 16, ewops_a<T, class a_div>>(
      T(5.0), T(2.0), 2.5, [](auto l, auto r) { return l / r; });
-  verify_op_a<T, M, N, 8, 16>(
+  verify_op_a<T, M, N, 8, 16, ewops_a<T, class a_equal>>(
      T(5.0), T(5.0), 5.0, [](auto l, auto r) { return l == r ? l : T(1.0); });
-  verify_op_a<T, M, N, 8, 16>(
+  verify_op_a<T, M, N, 8, 16, ewops_a<T, class a_equal_select>>(
      T(5.0), T(4.0), 4.0, [](auto l, auto r) { return l == r ? l : r; });
-  verify_op_a<T, M, N, 8, 16>(
+  verify_op_a<T, M, N, 8, 16, ewops_a<T, class a_not_equal>>(
      T(5.0), T(5.0), 1.0, [](auto l, auto r) { return l != r ? l : T(1.0); });
-  verify_op_a<T, M, N, 8, 16>(
+  verify_op_a<T, M, N, 8, 16, ewops_a<T, class a_greater>>(
      T(5.0), T(2.0), 3.0,
      [](auto l, auto r) { return l > r ? T(3.0) : T(2.0); });
-  verify_op_a<T, M, N, 8, 16>(
+  verify_op_a<T, M, N, 8, 16, ewops_a<T, class a_less>>(
      T(5.0), T(2.0), 2.0,
      [](auto l, auto r) { return l < r ? T(3.0) : T(2.0); });
-  verify_op_a<T, M, N, 8, 16>(
+  verify_op_a<T, M, N, 8, 16, ewops_a<T, class a_greater_equal>>(
      T(5.0), T(2.0), 3.0,
      [](auto l, auto r) { return l >= r ? T(3.0) : T(2.0); });
-  verify_op_a<T, M, N, 8, 16>(
+  verify_op_a<T, M, N, 8, 16, ewops_a<T, class a_less_equal>>(
      T(5.0), T(2.0), 2.0,
      [](auto l, auto r) { return l <= r ? T(3.0) : T(2.0); });
 }
-
+// Avoid same kernel name for different types and numbers of columns
+template <typename T, size_t COLS, class name> class ewops_c {};
 template <typename T, size_t M, size_t N> void test_ewops_c() {
-  verify_op_c<T, M, N, 8, TN>(
+  verify_op_c<T, M, N, 8, N / 2, ewops_c<T, N, class c_add>>(
      T(5.0), T(2.0), 7.0, [](auto l, auto r) { return l + r; });
-  verify_op_c<T, M, N, 8, TN>(
+  verify_op_c<T, M, N, 8, N / 2, ewops_c<T, N, class c_sub>>(
      T(5.0), T(2.0), 3.0, [](auto l, auto r) { return l - r; });
-  verify_op_c<T, M, N, 8, TN>(
+  verify_op_c<T, M, N, 8, N / 2, ewops_c<T, N, class c_mul>>(
      T(5.0), T(2.0), 10.0, [](auto l, auto r) { return l * r; });
-  verify_op_c<T, M, N, 8, TN>(
+  verify_op_c<T, M, N, 8, N / 2, ewops_c<T, N, class c_div>>(
      T(5.0), T(2.0), 2.5, [](auto l, auto r) { return l / r; });
-  verify_op_c<T, M, N, 8, TN>(
+  verify_op_c<T, M, N, 8, N / 2, ewops_c<T, N, class c_equal>>(
      T(5.0), T(5.0), 5.0, [](auto l, auto r) { return l == r ? l : T(1.0); });
-  verify_op_c<T, M, N, 8, TN>(
+  verify_op_c<T, M, N, 8, N / 2, ewops_c<T, N, class c_equal_select>>(
      T(5.0), T(4.0), 4.0, [](auto l, auto r) { return l == r ? l : r; });
-  verify_op_c<T, M, N, 8, TN>(
+  verify_op_c<T, M, N, 8, N / 2, ewops_c<T, N, class c_not_equal>>(
      T(5.0), T(5.0), 1.0, [](auto l, auto r) { return l != r ? l : T(1.0); });
-  verify_op_c<T, M, N, 8, TN>(
+  verify_op_c<T, M, N, 8, N / 2, ewops_c<T, N, class c_greater>>(
      T(5.0), T(2.0), 3.0,
      [](auto l, auto r) { return l > r ? T(3.0) : T(2.0); });
-  verify_op_c<T, M, N, 8, TN>(
+  verify_op_c<T, M, N, 8, N / 2, ewops_c<T, N, class c_less>>(
      T(5.0), T(2.0), 2.0,
      [](auto l, auto r) { return l < r ? T(3.0) : T(2.0); });
-  verify_op_c<T, M, N, 8, TN>(
+  verify_op_c<T, M, N, 8, N / 2, ewops_c<T, N, class c_greater_equal>>(
      T(5.0), T(2.0), 3.0,
      [](auto l, auto r) { return l >= r ? T(3.0) : T(2.0); });
-  verify_op_c<T, M, N, 8, TN>(
+  verify_op_c<T, M, N, 8, N / 2, ewops_c<T, N, class c_less_equal>>(
      T(5.0), T(2.0), 2.0,
      [](auto l, auto r) { return l <= r ? T(3.0) : T(2.0); });
 }
 
 int main() {
   static constexpr size_t TM = 8;
-  static constexpr size_t TK = 16;
 
   static constexpr size_t MATRIX_M = TM * 2;
-  static constexpr size_t MATRIX_N = TN * 2;
-  static constexpr size_t MATRIX_K = TK * 2;
-
-  test_ewops_a<bfloat16, MATRIX_M, MATRIX_K>();
-  test_ewops_c<float, MATRIX_M, MATRIX_N>();
-
+  static constexpr size_t MATRIX_N = 32;
+  static constexpr size_t MATRIX_K = 32;
+  queue q;
+  std::vector<combination> combinations =
+      q.get_device()
+          .get_info<sycl::ext::oneapi::experimental::info::device::
+                        matrix_combinations>();
+  for (unsigned int i = 0; i < combinations.size(); i++) {
+    if (combinations[i].atype == matrix_type::bf16) {
+      if (combinations[i].nsize == 0 || combinations[i].nsize == 16) {
+        test_ewops_a<bfloat16, MATRIX_M, MATRIX_K>();
+        test_ewops_c<float, MATRIX_M, MATRIX_N>();
+        break;
+      }
+      if (combinations[i].nsize == 8) {
+        test_ewops_a<bfloat16, MATRIX_M, MATRIX_K>();
+        test_ewops_c<float, MATRIX_M, 16>();
+        break;
+      }
+    }
+  }
  return 0;
 }
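
For reference, the two runtime queries this patch switches the test to (the combination-based joint_matrix query and the per-kernel sub-group-size query) can be exercised on their own roughly as in the sketch below. This is a minimal, hedged example assuming the sycl_ext_oneapi_matrix extension is available on the implementation; the kernel name `probe` and the printed fields are illustrative only and are not part of the patch.

#include <iostream>
#include <vector>

#include <sycl/sycl.hpp>

using namespace sycl;
using namespace sycl::ext::oneapi::experimental::matrix;

// Named kernel so its sub-group size can be queried; the name is illustrative.
class probe;

int main() {
  queue q;

  // Combination-based query: enumerate the joint_matrix shape/type
  // combinations the device supports instead of hard-coding TN or SG_SZ.
  std::vector<combination> combinations =
      q.get_device()
          .get_info<sycl::ext::oneapi::experimental::info::device::
                        matrix_combinations>();
  for (const auto &c : combinations)
    if (c.atype == matrix_type::bf16)
      std::cout << "bf16 combination: M=" << c.msize << " N=" << c.nsize
                << " K=" << c.ksize << "\n";

  // Sub-group size query for a named kernel, mirroring get_sg_size() above.
  auto kernel_id = get_kernel_id<probe>();
  auto bundle =
      get_kernel_bundle<bundle_state::executable>(q.get_context(), {kernel_id});
  size_t sg_size =
      bundle.get_kernel(kernel_id)
          .get_info<info::kernel_device_specific::max_sub_group_size>(
              q.get_device());
  std::cout << "max sub-group size: " << sg_size << "\n";

  // The named kernel must exist in the application for the queries above.
  q.submit([&](handler &cgh) { cgh.single_task<probe>([]() {}); }).wait();
  return 0;
}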