From c873fe2441414e36d82486347b1e4bd8ff79ab1a Mon Sep 17 00:00:00 2001
From: Dounia Khaldi
Date: Mon, 29 Jan 2024 08:29:12 -0600
Subject: [PATCH] [SYCL][joint matrix] add missing licence to test and add
 combination-based query (#12489)

---
 .../Matrix/SG32/element_wise_all_ops.cpp      |  15 +-
 .../Matrix/XMX8/element_wise_all_ops.cpp      |  25 ----
 sycl/test-e2e/Matrix/common.hpp               |  31 +++++
 sycl/test-e2e/Matrix/element_wise_all_ops.cpp |  16 +--
 .../Matrix/element_wise_all_ops_impl.hpp      | 131 ++++++++++--------
 5 files changed, 111 insertions(+), 107 deletions(-)
 delete mode 100644 sycl/test-e2e/Matrix/XMX8/element_wise_all_ops.cpp

diff --git a/sycl/test-e2e/Matrix/SG32/element_wise_all_ops.cpp b/sycl/test-e2e/Matrix/SG32/element_wise_all_ops.cpp
index 91b36ee032e27..e2831636da0d8 100644
--- a/sycl/test-e2e/Matrix/SG32/element_wise_all_ops.cpp
+++ b/sycl/test-e2e/Matrix/SG32/element_wise_all_ops.cpp
@@ -5,21 +5,16 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-// REQUIRES: matrix
+// REQUIRES: cpu, gpu
 // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943
+// SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2
+// UNSUPPORTED: gpu-intel-dg2
 
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
-#include <iostream>
-#include <random>
-#include <sycl/sycl.hpp>
+#include "../common.hpp"
 
-using namespace sycl;
-using namespace sycl::ext::oneapi::experimental::matrix;
-using bfloat16 = sycl::ext::oneapi::bfloat16;
-
-constexpr size_t SG_SZ = 32;
-constexpr size_t TN = 16;
+#define SG_SZ 32
 
 #include "../element_wise_all_ops_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/XMX8/element_wise_all_ops.cpp b/sycl/test-e2e/Matrix/XMX8/element_wise_all_ops.cpp
deleted file mode 100644
index f1f7bf84899a4..0000000000000
--- a/sycl/test-e2e/Matrix/XMX8/element_wise_all_ops.cpp
+++ /dev/null
@@ -1,25 +0,0 @@
-//==------------ element_wise_all_ops.cpp - DPC++ joint_matrix-------------==//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-// REQUIRES: matrix-xmx8
-// REQUIRES: TEMPORARY_DISBLED
-
-// RUN: %{build} -o %t.out
-// RUN: %{run} %t.out
-
-#include <iostream>
-#include <random>
-#include <sycl/sycl.hpp>
-
-using namespace sycl;
-using namespace sycl::ext::oneapi::experimental::matrix;
-using bfloat16 = sycl::ext::oneapi::bfloat16;
-
-#define SG_SZ 8
-constexpr size_t TN = 8;
-
-#include "../element_wise_all_ops_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/common.hpp b/sycl/test-e2e/Matrix/common.hpp
index 675261a17f3cb..155cf012d9754 100644
--- a/sycl/test-e2e/Matrix/common.hpp
+++ b/sycl/test-e2e/Matrix/common.hpp
@@ -1,3 +1,10 @@
+//==------------------ common.hpp - DPC++ joint_matrix---------------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
 #include <iostream>
 #include <random>
 #include <sycl/sycl.hpp>
@@ -173,3 +180,27 @@ bool matrix_compare(unsigned int rows, unsigned int cols, T1 *src, T2 *ref) {
   }
   return true;
 }
+
+bool is_type_supported_by_device(queue q, matrix_type type) {
+  std::vector<combination> combinations =
+      q.get_device()
+          .get_info<sycl::ext::oneapi::experimental::info::device::
+                        matrix_combinations>();
+  for (int i = 0; i < combinations.size(); i++) {
+    if (combinations[i].atype == type) {
+      return true;
+    }
+  }
+  return false;
+}
+
+template <typename KernelName> size_t get_sg_size(queue q) {
+  auto KernelID = get_kernel_id<KernelName>();
+  auto KB =
+      get_kernel_bundle<bundle_state::executable>(q.get_context(), {KernelID});
+  auto kernel = KB.get_kernel(KernelID);
+
+  return kernel
+      .template get_info<info::kernel_device_specific::max_sub_group_size>(
+          q.get_device());
+}
diff --git a/sycl/test-e2e/Matrix/element_wise_all_ops.cpp b/sycl/test-e2e/Matrix/element_wise_all_ops.cpp
index fd3648664a52c..4ee8383154e85 100644
--- a/sycl/test-e2e/Matrix/element_wise_all_ops.cpp
+++ b/sycl/test-e2e/Matrix/element_wise_all_ops.cpp
@@ -5,20 +5,14 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-// REQUIRES: matrix
+// REQUIRES: cpu, gpu
+// Test is flaky/timeouts on some variants of DG2 and temporary disabled. Needs
+// to be investigated.
+// UNSUPPORTED: gpu-intel-dg2
 
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
-#include <iostream>
-#include <random>
-#include <sycl/sycl.hpp>
-
-using namespace sycl;
-using namespace sycl::ext::oneapi::experimental::matrix;
-using bfloat16 = sycl::ext::oneapi::bfloat16;
-
-#define SG_SZ 16
-constexpr size_t TN = 16;
+#include "common.hpp"
 
 #include "element_wise_all_ops_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/element_wise_all_ops_impl.hpp b/sycl/test-e2e/Matrix/element_wise_all_ops_impl.hpp
index b11d3093bf08d..55d1162ebd3af 100644
--- a/sycl/test-e2e/Matrix/element_wise_all_ops_impl.hpp
+++ b/sycl/test-e2e/Matrix/element_wise_all_ops_impl.hpp
@@ -5,24 +5,6 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-
-static float make_fp32(bfloat16 x) {
-  unsigned int y = *((int *)&x);
-  y = y << 16;
-  float *res = reinterpret_cast<float *>(&y);
-  return *res;
-}
-
-template <typename T, size_t NUM_ROWS, size_t NUM_COLS> struct big_matrix {
-public:
-  T *mat;
-
-public:
-  T *get_data() { return mat; }
-  void set_data(T *data) { mat = data; }
-  big_matrix(T *data) : mat(data) {}
-};
-
 template <typename T, size_t M, size_t N>
 void assert_ops_ref(host_accessor<T, 2, access::mode::read> mat,
                     const float ref) {
@@ -39,7 +21,7 @@ void assert_ops_ref(host_accessor<T, 2, access::mode::read> mat,
 }
 
 template <typename T, size_t NUM_ROWS, size_t NUM_COLS, size_t SUB_ROWS,
-          size_t SUB_COLS, typename OP>
+          size_t SUB_COLS, class kernel_name, typename OP>
 void verify_op_a(const T l, const T r, const float ref, OP op) {
   T mat[NUM_ROWS][NUM_COLS];
   big_matrix<T, NUM_ROWS, NUM_COLS> big_mat((T *)&mat);
@@ -47,12 +29,17 @@ void verify_op_a(const T l, const T r, const float ref, OP op) {
 
   buffer<T, 2> bufMat(big_mat.get_data(), range<2>(NUM_ROWS, NUM_COLS));
   queue q;
+  size_t sg_size = get_sg_size<kernel_name>(q);
   q.submit([&](handler &cgh) {
     sycl::accessor accessMat{bufMat, cgh, sycl::read_write};
-    cgh.parallel_for(
-        nd_range<2>({NUM_ROWS / SUB_ROWS, NUM_COLS / SUB_COLS * SG_SZ},
-                    {1, 1 * SG_SZ}),
-        [=](nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]] {
+    cgh.parallel_for<kernel_name>(
+        nd_range<2>({NUM_ROWS / SUB_ROWS, NUM_COLS / SUB_COLS * sg_size},
+                    {1, 1 * sg_size}),
+        [=](nd_item<2> spmd_item)
+#ifdef SG_SZ
+        [[intel::reqd_sub_group_size(SG_SZ)]]
+#endif
+        {
           const auto global_idx = spmd_item.get_global_id(0);
          const auto global_idy = spmd_item.get_global_id(1);
          const auto sg_startx = global_idx - spmd_item.get_local_id(0);
@@ -68,7 +55,7 @@ void verify_op_a(const T l, const T r, const float ref, OP op) {
             sg, sub_mat,
             accessMat.template get_multi_ptr<access::decorated::no>() +
                 (sg_startx * SUB_ROWS) * NUM_COLS +
-                sg_starty / SG_SZ * SUB_COLS,
+                sg_starty / sg_size * SUB_COLS,
             NUM_COLS);
       }); // parallel for
     }).wait();
@@ -76,20 +63,24 @@ void verify_op_a(const T l, const T r, const float ref, OP op) {
 }
 
 template <typename T, size_t NUM_ROWS, size_t NUM_COLS, size_t SUB_ROWS,
-          size_t SUB_COLS, typename OP>
+          size_t SUB_COLS, class kernel_name, typename OP>
 void verify_op_c(const T l, const T r, const float ref, OP op) {
   T mat[NUM_ROWS][NUM_COLS];
   big_matrix<T, NUM_ROWS, NUM_COLS> big_mat((T *)&mat);
 
   buffer<T, 2> bufMat(big_mat.get_data(), range<2>(NUM_ROWS, NUM_COLS));
-
   queue q;
+  size_t sg_size = get_sg_size<kernel_name>(q);
   q.submit([&](handler &cgh) {
     sycl::accessor accessMat{bufMat, cgh, sycl::read_write};
-    cgh.parallel_for(
-        nd_range<2>({NUM_ROWS / SUB_ROWS, NUM_COLS / SUB_COLS * SG_SZ},
-                    {1, 1 * SG_SZ}),
-        [=](nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]] {
+    cgh.parallel_for<kernel_name>(
+        nd_range<2>({NUM_ROWS / SUB_ROWS, NUM_COLS / SUB_COLS * sg_size},
+                    {1, 1 * sg_size}),
+        [=](nd_item<2> spmd_item)
+#ifdef SG_SZ
+        [[intel::reqd_sub_group_size(SG_SZ)]]
+#endif
+        {
          const auto global_idx = spmd_item.get_global_id(0);
          const auto global_idy = spmd_item.get_global_id(1);
          const auto sg_startx = global_idx - spmd_item.get_local_id(0);
@@ -105,85 +96,103 @@ void verify_op_c(const T l, const T r, const float ref, OP op) {
             sg, sub_mat,
             accessMat.template get_multi_ptr<access::decorated::no>() +
                 (sg_startx * SUB_ROWS) * NUM_COLS +
-                sg_starty / SG_SZ * SUB_COLS,
+                sg_starty / sg_size * SUB_COLS,
             NUM_COLS, layout::row_major);
       }); // parallel for
     }).wait();
   assert_ops_ref<T, NUM_ROWS, NUM_COLS>(bufMat.get_host_access(read_only), ref);
 }
 
+// Avoid same kernel name for different types
+template <typename T, class name> class ewops_a {};
 template <typename T, size_t M, size_t N> void test_ewops_a() {
-  verify_op_a<T, M, N, 8, 16>(
+  verify_op_a<T, M, N, 8, 16, ewops_a<T, class a_add>>(
       T(5.0), T(2.0), 7.0, [](auto l, auto r) { return l + r; });
-  verify_op_a<T, M, N, 8, 16>(
+  verify_op_a<T, M, N, 8, 16, ewops_a<T, class a_sub>>(
       T(5.0), T(2.0), 3.0, [](auto l, auto r) { return l - r; });
-  verify_op_a<T, M, N, 8, 16>(
+  verify_op_a<T, M, N, 8, 16, ewops_a<T, class a_mul>>(
      T(5.0), T(2.0), 10.0, [](auto l, auto r) { return l * r; });
-  verify_op_a<T, M, N, 8, 16>(
+  verify_op_a<T, M, N, 8, 16, ewops_a<T, class a_div>>(
      T(5.0), T(2.0), 2.5, [](auto l, auto r) { return l / r; });
-  verify_op_a<T, M, N, 8, 16>(
+  verify_op_a<T, M, N, 8, 16, ewops_a<T, class a_equal>>(
      T(5.0), T(5.0), 5.0, [](auto l, auto r) { return l == r ? l : T(1.0); });
-  verify_op_a<T, M, N, 8, 16>(
+  verify_op_a<T, M, N, 8, 16, ewops_a<T, class a_equal_select>>(
      T(5.0), T(4.0), 4.0, [](auto l, auto r) { return l == r ? l : r; });
-  verify_op_a<T, M, N, 8, 16>(
+  verify_op_a<T, M, N, 8, 16, ewops_a<T, class a_not_equal>>(
      T(5.0), T(5.0), 1.0, [](auto l, auto r) { return l != r ? l : T(1.0); });
-  verify_op_a<T, M, N, 8, 16>(
+  verify_op_a<T, M, N, 8, 16, ewops_a<T, class a_greater>>(
      T(5.0), T(2.0), 3.0,
      [](auto l, auto r) { return l > r ? T(3.0) : T(2.0); });
-  verify_op_a<T, M, N, 8, 16>(
+  verify_op_a<T, M, N, 8, 16, ewops_a<T, class a_less>>(
      T(5.0), T(2.0), 2.0,
      [](auto l, auto r) { return l < r ? T(3.0) : T(2.0); });
-  verify_op_a<T, M, N, 8, 16>(
+  verify_op_a<T, M, N, 8, 16, ewops_a<T, class a_greater_equal>>(
      T(5.0), T(2.0), 3.0,
      [](auto l, auto r) { return l >= r ? T(3.0) : T(2.0); });
-  verify_op_a<T, M, N, 8, 16>(
+  verify_op_a<T, M, N, 8, 16, ewops_a<T, class a_less_equal>>(
      T(5.0), T(2.0), 2.0,
      [](auto l, auto r) { return l <= r ? T(3.0) : T(2.0); });
 }
-
+// Avoid same kernel name for different types and numbers of columns
+template <typename T, size_t COLS, class name> class ewops_c {};
 template <typename T, size_t M, size_t N> void test_ewops_c() {
-  verify_op_c<T, M, N, 8, TN>(
+  verify_op_c<T, M, N, 8, N / 2, ewops_c<T, N, class c_add>>(
      T(5.0), T(2.0), 7.0, [](auto l, auto r) { return l + r; });
-  verify_op_c<T, M, N, 8, TN>(
+  verify_op_c<T, M, N, 8, N / 2, ewops_c<T, N, class c_sub>>(
      T(5.0), T(2.0), 3.0, [](auto l, auto r) { return l - r; });
-  verify_op_c<T, M, N, 8, TN>(
+  verify_op_c<T, M, N, 8, N / 2, ewops_c<T, N, class c_mul>>(
      T(5.0), T(2.0), 10.0, [](auto l, auto r) { return l * r; });
-  verify_op_c<T, M, N, 8, TN>(
+  verify_op_c<T, M, N, 8, N / 2, ewops_c<T, N, class c_div>>(
      T(5.0), T(2.0), 2.5, [](auto l, auto r) { return l / r; });
-  verify_op_c<T, M, N, 8, TN>(
+  verify_op_c<T, M, N, 8, N / 2, ewops_c<T, N, class c_equal>>(
      T(5.0), T(5.0), 5.0, [](auto l, auto r) { return l == r ? l : T(1.0); });
-  verify_op_c<T, M, N, 8, TN>(
+  verify_op_c<T, M, N, 8, N / 2, ewops_c<T, N, class c_equal_select>>(
      T(5.0), T(4.0), 4.0, [](auto l, auto r) { return l == r ? l : r; });
-  verify_op_c<T, M, N, 8, TN>(
+  verify_op_c<T, M, N, 8, N / 2, ewops_c<T, N, class c_not_equal>>(
      T(5.0), T(5.0), 1.0, [](auto l, auto r) { return l != r ? l : T(1.0); });
-  verify_op_c<T, M, N, 8, TN>(
+  verify_op_c<T, M, N, 8, N / 2, ewops_c<T, N, class c_greater>>(
      T(5.0), T(2.0), 3.0,
      [](auto l, auto r) { return l > r ? T(3.0) : T(2.0); });
-  verify_op_c<T, M, N, 8, TN>(
+  verify_op_c<T, M, N, 8, N / 2, ewops_c<T, N, class c_less>>(
      T(5.0), T(2.0), 2.0,
      [](auto l, auto r) { return l < r ? T(3.0) : T(2.0); });
-  verify_op_c<T, M, N, 8, TN>(
+  verify_op_c<T, M, N, 8, N / 2, ewops_c<T, N, class c_greater_equal>>(
      T(5.0), T(2.0), 3.0,
      [](auto l, auto r) { return l >= r ? T(3.0) : T(2.0); });
-  verify_op_c<T, M, N, 8, TN>(
+  verify_op_c<T, M, N, 8, N / 2, ewops_c<T, N, class c_less_equal>>(
      T(5.0), T(2.0), 2.0,
      [](auto l, auto r) { return l <= r ? T(3.0) : T(2.0); });
 }
 
 int main() {
   static constexpr size_t TM = 8;
-  static constexpr size_t TK = 16;
 
   static constexpr size_t MATRIX_M = TM * 2;
-  static constexpr size_t MATRIX_N = TN * 2;
-  static constexpr size_t MATRIX_K = TK * 2;
-
-  test_ewops_a<bfloat16, MATRIX_M, MATRIX_K>();
-  test_ewops_c<float, MATRIX_M, MATRIX_N>();
-
+  static constexpr size_t MATRIX_N = 32;
+  static constexpr size_t MATRIX_K = 32;
+  queue q;
+  std::vector<combination> combinations =
+      q.get_device()
+          .get_info<sycl::ext::oneapi::experimental::info::device::
+                        matrix_combinations>();
+  for (unsigned int i = 0; i < combinations.size(); i++) {
+    if (combinations[i].atype == matrix_type::bf16) {
+      if (combinations[i].nsize == 0 || combinations[i].nsize == 16) {
+        test_ewops_a<bfloat16, MATRIX_M, MATRIX_K>();
+        test_ewops_c<float, MATRIX_M, MATRIX_N>();
+        break;
+      }
+      if (combinations[i].nsize == 8) {
+        test_ewops_a<bfloat16, MATRIX_M, MATRIX_K>();
+        test_ewops_c<float, MATRIX_M, 16>();
+        break;
+      }
+    }
+  }
  return 0;
 }
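
For reference, the two runtime queries this patch switches the test to (the combination-based joint_matrix query and the per-kernel sub-group-size query) can be exercised on their own roughly as in the sketch below. This is a minimal, hedged example assuming the sycl_ext_oneapi_matrix extension is available on the implementation; the kernel name `probe` and the printed fields are illustrative only and are not part of the patch.

#include <iostream>
#include <vector>

#include <sycl/sycl.hpp>

using namespace sycl;
using namespace sycl::ext::oneapi::experimental::matrix;

// Named kernel so its sub-group size can be queried; the name is illustrative.
class probe;

int main() {
  queue q;

  // Combination-based query: enumerate the joint_matrix shape/type
  // combinations the device supports instead of hard-coding TN or SG_SZ.
  std::vector<combination> combinations =
      q.get_device()
          .get_info<sycl::ext::oneapi::experimental::info::device::
                        matrix_combinations>();
  for (const auto &c : combinations)
    if (c.atype == matrix_type::bf16)
      std::cout << "bf16 combination: M=" << c.msize << " N=" << c.nsize
                << " K=" << c.ksize << "\n";

  // Sub-group size query for a named kernel, mirroring get_sg_size() above.
  auto kernel_id = get_kernel_id<probe>();
  auto bundle =
      get_kernel_bundle<bundle_state::executable>(q.get_context(), {kernel_id});
  size_t sg_size =
      bundle.get_kernel(kernel_id)
          .get_info<info::kernel_device_specific::max_sub_group_size>(
              q.get_device());
  std::cout << "max sub-group size: " << sg_size << "\n";

  // The named kernel must exist in the application for the queries above.
  q.submit([&](handler &cgh) { cgh.single_task<probe>([]() {}); }).wait();
  return 0;
}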