intel · steffenlarsen · Mar 22, 2024 · Feb 1, 2024 · Mar 21, 2024 · Mar 21, 2024
@@ -5,19 +5,16 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-// REQUIRES: matrix
+// REQUIRES: aspect-ext_intel_matrix
 // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943
+// SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2
+// UNSUPPORTED: gpu-intel-dg2
 
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
-#include <iostream>
-#include <sycl/sycl.hpp>
+#include "../common.hpp"
 
-using namespace sycl;
-using namespace sycl::ext::oneapi::experimental::matrix;
-
-constexpr size_t SG_SZ = 32;
-constexpr size_t TN = 16;
+#define SG_SZ 32
 
 #include "../element_wise_ops_impl.hpp"
@@ -42,21 +42,21 @@ float make_fp32(bfloat16 x) {
  return *res;
 }
 
-template <typename Ta, typename Tb, typename Tc, unsigned int VF = 1>
+template <typename Ta, typename Tb, typename Tc, unsigned int VF = 1,
+ typename F = std::nullptr_t>
 void matrix_multiply_ref(Ta *A, Tb *B, Tc *C, int M, int N, int K,
  bool transpose_c = false, bool colmajor_a = false,
- bool colmajor_b = false) {
+ bool colmajor_b = false, F &&lambda = {}) {
  for (unsigned int m = 0; m < M; m++) {
  for (unsigned int n = 0; n < N; n++) {
- for (unsigned int k = 0; k < K; k++) {
+ int c_ind = transpose_c ? (n * M + m) : m * N + n;
+ Tc acc = *(C + c_ind);
 
+ for (unsigned int k = 0; k < K; k++) {
  int a_ind = colmajor_a ? (k * M + m) : m * K + k;
  int b_ind = colmajor_b ? (n * K + k) : k * N + n;
- int c_ind = transpose_c ? (n * M + m) : m * N + n;
-
  Ta *va = (Ta *)(A + a_ind * VF);
  Tb *vb = (Tb *)(B + b_ind * VF);
- Tc acc = *(C + c_ind);
 
  for (unsigned int i = 0; i < VF; i++) {
  if constexpr (std::is_same_v<Ta, bfloat16> &&
@@ -74,9 +74,12 @@ void matrix_multiply_ref(Ta *A, Tb *B, Tc *C, int M, int N, int K,
  else
  assert(false && "Unsupported type in matrix_multiply_ref.");
  }
+ }
 
- *(C + c_ind) = acc;
+ if constexpr (!std::is_same_v<F, std::nullptr_t>) {
+ lambda(acc);
  }
+ *(C + c_ind) = acc;
  }
  }
 }
@@ -132,8 +135,7 @@ void matrix_rand(unsigned int rows, unsigned int cols, T *src, T val) {
  if constexpr (std::is_same_v<T, bfloat16> || std::is_same_v<T, float> ||
  std::is_same_v<T, double>) {
  src[i * cols + j] = T(fdistr(dev));
- } else if constexpr (std::is_same_v<T, int8_t> ||
- std::is_same_v<T, int32_t>) {
+ } else if constexpr (std::is_integral_v<T>) {
  src[i * cols + j] = T(idistr(dev));
  } else {
  assert(false && "Unsupported type in matrix_rand.");
@@ -170,8 +172,9 @@ bool matrix_compare(unsigned int rows, unsigned int cols, T1 *src, T2 *ref) {
  }
  } else if constexpr (exact || std::is_same_v<T1, int32_t>) {
  if (src[i * cols + j] != ref[i * cols + j]) {
- std::cout << "Incorrect result in matrix." << "i: " << i
- << ", j: " << j << ", Ref: " << ref[i * cols + j]
+ std::cout << "Incorrect result in matrix."
+ << "i: " << i << ", j: " << j
+ << ", Ref: " << ref[i * cols + j]
  << ", Val: " << src[i * cols + j] << "\n";
  return false;
  }

@@ -14,5 +14,4 @@
 // RUN: %{run} %t.out
 
 #include "common.hpp"
-
 #include "element_wise_all_ops_impl.hpp"
@@ -5,6 +5,7 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
+
 template <typename T, size_t NUM_ROWS, size_t NUM_COLS>
 void assert_ops_ref(host_accessor<T, 2, access::mode::read> mat,
  const float ref) {
@@ -105,8 +106,11 @@ void verify_op_c(const T l, const T r, const float ref, OP op) {
 
 // Avoid same kernel name for different types
 template <typename T, class name> class ewops_a {};
-template <typename T, size_t NROWS, size_t NCOLS, size_t SROWS, size_t SCOLS>
-void test_ewops_a() {
+template <typename T, size_t SROWS, size_t SCOLS> void test_ewops_a() {
+ std::cout << "Test A " << SROWS << "x" << SCOLS << "\n";
+
+ static constexpr size_t NROWS = SROWS * 2;
+ static constexpr size_t NCOLS = SCOLS * 2;
 
  verify_op_a<T, NROWS, NCOLS, SROWS, SCOLS, ewops_a<T, class a_add>>(
  T(5.0), T(2.0), 7.0, [](auto l, auto r) { return l + r; });
@@ -135,64 +139,87 @@ void test_ewops_a() {
  T(5.0), T(2.0), 2.0,
  [](auto l, auto r) { return l <= r ? T(3.0) : T(2.0); });
 }
+
 // Avoid same kernel name for different types and numbers of columns
-template <typename T, size_t COLS, class name> class ewops_c {};
-template <typename T, size_t NROWS, size_t NCOLS, size_t SROWS, size_t SCOLS>
-void test_ewops_c() {
+template <typename T, size_t ROWS, size_t COLS, class name> class ewops_c {};
+template <typename T, size_t SROWS, size_t SCOLS> void test_ewops_c() {
+ std::cout << "Test C " << SROWS << "x" << SCOLS << "\n";
 
- verify_op_c<T, NROWS, NCOLS, SROWS, SCOLS, ewops_c<T, SCOLS, class c_add>>(
+ static constexpr size_t NROWS = SROWS * 2;
+ static constexpr size_t NCOLS = SCOLS * 2;
+
+ verify_op_c<T, NROWS, NCOLS, SROWS, SCOLS,
+ ewops_c<T, SROWS, SCOLS, class c_add>>(
  T(5.0), T(2.0), 7.0, [](auto l, auto r) { return l + r; });
- verify_op_c<T, NROWS, NCOLS, SROWS, SCOLS, ewops_c<T, SCOLS, class c_sub>>(
+ verify_op_c<T, NROWS, NCOLS, SROWS, SCOLS,
+ ewops_c<T, SROWS, SCOLS, class c_sub>>(
  T(5.0), T(2.0), 3.0, [](auto l, auto r) { return l - r; });
- verify_op_c<T, NROWS, NCOLS, SROWS, SCOLS, ewops_c<T, SCOLS, class c_mul>>(
+ verify_op_c<T, NROWS, NCOLS, SROWS, SCOLS,
+ ewops_c<T, SROWS, SCOLS, class c_mul>>(
  T(5.0), T(2.0), 10.0, [](auto l, auto r) { return l * r; });
- verify_op_c<T, NROWS, NCOLS, SROWS, SCOLS, ewops_c<T, SCOLS, class c_div>>(
+ verify_op_c<T, NROWS, NCOLS, SROWS, SCOLS,
+ ewops_c<T, SROWS, SCOLS, class c_div>>(
  T(5.0), T(2.0), 2.5, [](auto l, auto r) { return l / r; });
  verify_op_c<T, NROWS, NCOLS, SROWS, SCOLS,
- ewops_c<T, SCOLS, class c_logical>>(
+ ewops_c<T, SROWS, SCOLS, class c_logical>>(
  T(5.0), T(5.0), 5.0, [](auto l, auto r) { return l == r ? l : T(1.0); });
- verify_op_c<T, NROWS, NCOLS, SROWS, SCOLS, ewops_c<T, SCOLS, class c_eq>>(
+ verify_op_c<T, NROWS, NCOLS, SROWS, SCOLS,
+ ewops_c<T, SROWS, SCOLS, class c_eq>>(
  T(5.0), T(4.0), 4.0, [](auto l, auto r) { return l == r ? l : r; });
- verify_op_c<T, NROWS, NCOLS, SROWS, SCOLS, ewops_c<T, SCOLS, class c_ne>>(
+ verify_op_c<T, NROWS, NCOLS, SROWS, SCOLS,
+ ewops_c<T, SROWS, SCOLS, class c_ne>>(
  T(5.0), T(5.0), 1.0, [](auto l, auto r) { return l != r ? l : T(1.0); });
- verify_op_c<T, NROWS, NCOLS, SROWS, SCOLS, ewops_c<T, SCOLS, class c_gt>>(
+ verify_op_c<T, NROWS, NCOLS, SROWS, SCOLS,
+ ewops_c<T, SROWS, SCOLS, class c_gt>>(
  T(5.0), T(2.0), 3.0,
  [](auto l, auto r) { return l > r ? T(3.0) : T(2.0); });
- verify_op_c<T, NROWS, NCOLS, SROWS, SCOLS, ewops_c<T, SCOLS, class c_lt>>(
+ verify_op_c<T, NROWS, NCOLS, SROWS, SCOLS,
+ ewops_c<T, SROWS, SCOLS, class c_lt>>(
  T(5.0), T(2.0), 2.0,
  [](auto l, auto r) { return l < r ? T(3.0) : T(2.0); });
- verify_op_c<T, NROWS, NCOLS, SROWS, SCOLS, ewops_c<T, SCOLS, class c_ge>>(
+ verify_op_c<T, NROWS, NCOLS, SROWS, SCOLS,
+ ewops_c<T, SROWS, SCOLS, class c_ge>>(
  T(5.0), T(2.0), 3.0,
  [](auto l, auto r) { return l >= r ? T(3.0) : T(2.0); });
- verify_op_c<T, NROWS, NCOLS, SROWS, SCOLS, ewops_c<T, SCOLS, class c_le>>(
+ verify_op_c<T, NROWS, NCOLS, SROWS, SCOLS,
+ ewops_c<T, SROWS, SCOLS, class c_le>>(
  T(5.0), T(2.0), 2.0,
  [](auto l, auto r) { return l <= r ? T(3.0) : T(2.0); });
 }
 
 int main() {
- static constexpr size_t TM = 8;
-
- static constexpr size_t MATRIX_M = TM * 2;
- static constexpr size_t MATRIX_N = 32;
- static constexpr size_t MATRIX_K = 32;
  queue q;
  std::vector<combination> combinations =
  q.get_device()
  .get_info<sycl::ext::oneapi::experimental::info::device::
  matrix_combinations>();
+
  for (unsigned int i = 0; i < combinations.size(); i++) {
  if (combinations[i].atype == matrix_type::bf16) {
- if (combinations[i].nsize == 0 || combinations[i].nsize == 16) {
- test_ewops_a<bfloat16, MATRIX_M, MATRIX_K, TM, 16>();
- test_ewops_c<float, MATRIX_M, MATRIX_N, TM, 16>();
- break;
+
+ if (combinations[i].nsize == 0 ||
+ (combinations[i].msize == 0 && combinations[i].nsize == 16)) {
+ test_ewops_a<bfloat16, 8, 16>();
+ test_ewops_c<float, 8, 16>();
+ }
+
+ if (combinations[i].msize == 16 && combinations[i].nsize == 16) {
+ test_ewops_c<float, 16, 16>();
+ }
+
+// This combination is not currently supported for sub-group size = 32 in IGC
+#if (!defined(SG_SZ) || SG_SZ != 32)
+ if (combinations[i].msize == 32 && combinations[i].nsize == 64) {
+ test_ewops_c<float, 32, 64>();
  }
+#endif
+
  if (combinations[i].nsize == 8) {
- test_ewops_a<bfloat16, MATRIX_M, MATRIX_K, TM, 16>();
- test_ewops_c<float, MATRIX_M, MATRIX_N, TM, 8>();
- break;
+ test_ewops_a<bfloat16, 8, 16>();
+ test_ewops_c<float, 8, 8>();
  }
  }
  }
+
  return 0;
 }
@@ -5,18 +5,10 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-// REQUIRES: matrix
+// REQUIRES: aspect-ext_intel_matrix
 
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
-#include <iostream>
-#include <sycl/sycl.hpp>
-
-using namespace sycl;
-using namespace sycl::ext::oneapi::experimental::matrix;
-
-#define SG_SZ 16
-constexpr size_t TN = 16;
-
+#include "common.hpp"
 #include "element_wise_ops_impl.hpp"