From a184cbc9cc0dcfe82c7ed1e2ad0e35ccbdd97a87 Mon Sep 17 00:00:00 2001
From: "Zhang, Yixing"
Date: Thu, 19 Sep 2024 15:07:08 -0700
Subject: [PATCH] add test for runtime matrix dimension

---
 ...t_matrix_bf16_fill_k_cache_runtime_dim.cpp |  22 ++
 ...rix_bf16_fill_k_cache_runtime_dim_impl.hpp | 249 ++++++++++++++++++
 2 files changed, 271 insertions(+)
 create mode 100644 sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_runtime_dim.cpp
 create mode 100644 sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_runtime_dim_impl.hpp

diff --git a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_runtime_dim.cpp b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_runtime_dim.cpp
new file mode 100644
index 0000000000000..30ba7db3d227e
--- /dev/null
+++ b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_runtime_dim.cpp
@@ -0,0 +1,22 @@
+//==--- joint_matrix_bf16_fill_k_cache_runtime_dim.cpp - DPC++ joint_matrix--------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// REQUIRES: aspect-ext_intel_matrix
+
+// https://jira.devtools.intel.com/browse/GSD-9716
+// XFAIL: arch-intel_gpu_pvc
+
+// RUN: %{build} -o %t_runtime_dim_vnni.out -ffp-model=precise -DVNNI
+// RUN: %{run} %t_runtime_dim_vnni.out
+
+// RUN: %{build} -o %t_runtime_dim.out -ffp-model=precise
+// RUN: %{run} %t_runtime_dim.out
+
+// -ffp-model=precise is added to not depend on compiler defaults.
+
+#include "common.hpp"
+#include "joint_matrix_bf16_fill_k_cache_runtime_dim_impl.hpp"
\ No newline at end of file
diff --git a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_runtime_dim_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_runtime_dim_impl.hpp
new file mode 100644
index 0000000000000..cbfb4de8f18aa
--- /dev/null
+++ b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_runtime_dim_impl.hpp
@@ -0,0 +1,249 @@
+//------------------------------------------------------------------------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===-------------------------------------------------------------------------===//
+
+#include <chrono>
+#include <sycl/usm.hpp>
+
+// number of test iterations
+constexpr unsigned int testIterations = 100;
+// start recording time after X iterations
+constexpr unsigned int recordThresh = 10;
+
+template <size_t TM, size_t TN, size_t TK> class MatMul;
+
+template <unsigned int vnniFactor, typename TOperand, typename TResult,
+          size_t TM, size_t TN, size_t TK, size_t MCache1, size_t NCache1,
+          size_t KCache1, size_t MCache2, size_t NCache2, size_t KCache2>
+double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i,
+                    size_t rowsA, size_t colsA, size_t rowsB, size_t colsB) {
+
+  size_t sgSize = get_sg_size<MatMul<TM, TN, TK>>(q);
+  range<2> global{rowsA / MCache1, (colsB / NCache1) * sgSize};
+  range<2> cachelocal{MCache2 / MCache1, NCache2 / NCache1 * sgSize};
+
+  // throw error if padding needed
+  assert(colsA == rowsB);
+  assert(rowsA % TM == 0);
+  assert(colsA % TK == 0);
+  assert(colsB % TN == 0);
+  // submit main kernel
+  std::chrono::high_resolution_clock::time_point start =
+      std::chrono::high_resolution_clock::now();
+
+  q.submit([&](handler &h) {
+    h.parallel_for<MatMul<TM, TN, TK>>( // cache layer#1
+        nd_range<2>{global, cachelocal},
+        // loop global
+        // loop localrange
+        [=](nd_item<2> it)
+        {
+          // sg::load and sg::store expect decorations to be ON
+          auto pA =
+              address_space_cast<sycl::access::address_space::global_space,
+                                 sycl::access::decorated::yes>(A);
+          auto pB =
+              address_space_cast<sycl::access::address_space::global_space,
+                                 sycl::access::decorated::yes>(B);
+          auto pC =
+              address_space_cast<sycl::access::address_space::global_space,
+                                 sycl::access::decorated::yes>(C);
+          auto m2 = it.get_group(0);
+          auto n2 = it.get_group(1);
+          auto m1 = it.get_local_id(0);
+          auto n1 = it.get_local_id(1) / sgSize;
+          auto sg = it.get_sub_group();
+
+          joint_matrix<sub_group, TResult, use::accumulator, TM, TN>
+              tC[MCache1 / TM][NCache1 / TN];
+
+          for (unsigned int m = 0; m < MCache1 / TM; m++) {
+            for (unsigned int n = 0; n < NCache1 / TN; n++) {
+              joint_matrix_fill(sg, tC[m][n], 0);
+            }
+          }
+
+          for (unsigned int k2 = 0; k2 < colsA / KCache2; k2++) {
+            joint_matrix<sub_group, TOperand, use::a, TM, TK,
+                         layout::row_major>
+                tA[MCache1 / TM][KCache2 / KCache1];
+#ifdef VNNI
+            joint_matrix<sub_group, TOperand, use::b, TK, TN,
+                         layout::ext_intel_packed>
+                tB[NCache1 / TN][KCache2 / KCache1];
+#else // VNNI
+            joint_matrix<sub_group, TOperand, use::b, TK, TN,
+                         layout::row_major>
+                tB[NCache1 / TN][KCache2 / KCache1];
+#endif // VNNI
+
+            for (unsigned int k1 = 0; k1 < KCache2 / KCache1; k1++) {
+              unsigned int k = (k2 * KCache2 + k1 * KCache1) / TK;
+              for (unsigned int m = 0; m < MCache1 / TM; m++) {
+                joint_matrix_load(
+                    sg, tA[m][k1],
+                    pA + (m2 * MCache2 + m1 * MCache1 + m * TM) * colsA +
+                        k * TK,
+                    colsA);
+              }
+              for (unsigned int n = 0; n < NCache1 / TN; n++) {
+                joint_matrix_load(
+                    sg, tB[n][k1],
+                    pB + (k * TK / vnniFactor) * (colsB * vnniFactor) +
+                        (n2 * NCache2 + n1 * NCache1 + n * TN) * vnniFactor,
+                    colsB * vnniFactor);
+              } // n
+              for (unsigned int m = 0; m < MCache1 / TM; m++) {
+                for (unsigned int n = 0; n < NCache1 / TN; n++) {
+                  joint_matrix_mad(sg, tC[m][n], tA[m][k1], tB[n][k1],
+                                   tC[m][n]);
+                } // n
+              }   // m
+            }     // k1
+          }       // for k2
+
+          for (unsigned int m = 0; m < MCache1 / TM; m++) {
+            for (unsigned int n = 0; n < NCache1 / TN; n++) {
+              joint_matrix_store(
+                  sg, tC[m][n],
+                  pC + (m2 * MCache2 + m1 * MCache1 + m * TM) * colsB +
+                      (n2 * NCache2 + n1 * NCache1 + n * TN),
+                  colsB, layout::row_major);
+            } // n
+          }   // m
+        });   // parallel_for
+  });         // queue.submit
+
+  if (i == testIterations - 1)
+    q.wait();
+  std::chrono::duration<double, std::milli> duration =
+      std::chrono::high_resolution_clock::now() - start;
+
+  return duration.count();
+}
+
+template <typename T, typename TResult, unsigned int vnniFactor, size_t TM,
+          size_t TN, size_t TK, size_t MCache1, size_t NCache1, size_t KCache1,
+          size_t MCache2, size_t NCache2, size_t KCache2>
+void test(size_t matrix_size) {
+  assert(matrix_size >= TM && matrix_size >= TK && matrix_size >= TN &&
+         "invalid matrix size");
+  assert((matrix_size % TM) == 0 && (matrix_size % TN) == 0 &&
+         (matrix_size % TK) == 0 &&
+         "invalid matrix size detected: not a multiple of <TM, TN, TK>");
+
+  std::cout << "Testing: " << TM << " x " << TN << " x " << TK
+            << " [TM x TN x TK]" << std::endl;
+
+  queue q;
+  T *A = malloc_shared<T>(matrix_size * matrix_size, q);
+  T *B = malloc_shared<T>(matrix_size * matrix_size, q);
+  TResult *C = malloc_shared<TResult>(matrix_size * matrix_size, q);
+  TResult *refC = malloc_shared<TResult>(matrix_size * matrix_size, q);
+
+  matrix_rand<T>(matrix_size, matrix_size, A, T(1));
+  matrix_rand<T>(matrix_size, matrix_size, B, T(1));
+
+  matrix_multiply_ref<T, T, TResult, 1>(A, B, refC, matrix_size, matrix_size,
+                                        matrix_size);
+
+#ifdef VNNI
+  T *vnniB = malloc_shared<T>(matrix_size * matrix_size, q);
+  matrix_vnni<T>(matrix_size, matrix_size, B, vnniB, vnniFactor);
+  free(B, q);
+  B = vnniB;
+#endif
+
+  // run testIterations times, aggregate and calculate average run time
+  double totalDuration = 0;
+  for (unsigned int i = 0; i < testIterations; i++) {
+
+    double duration =
+        joint_matmul<vnniFactor, T, TResult, TM, TN, TK, MCache1, NCache1,
+                     KCache1, MCache2, NCache2, KCache2>(
+            A, B, C, q, i, matrix_size, matrix_size, matrix_size, matrix_size);
+
+    if (i >= recordThresh) {
+      totalDuration += duration;
+    }
+  }
+
+  assert(matrix_compare(matrix_size, matrix_size, C, refC));
+
+  double msecPerMatrixMul =
+      totalDuration / static_cast<double>(testIterations - recordThresh);
+  double gflops = (2.f * matrix_size * matrix_size * matrix_size * 1.0e-9f) /
+                  (msecPerMatrixMul / 1000.f);
+
+  std::cout << "DONE for size " << matrix_size << std::endl;
+  std::cout << "GOPS is " << gflops << " Gop/s" << std::endl;
+
+  free(A, q);
+  free(B, q);
+  free(C, q);
+  free(refC, q);
+}
+
+int main(int argc, char *argv[]) {
+  // The matrix dimension is a runtime value; default to 256 (a multiple of
+  // all tile sizes used below) when no size is passed on the command line.
+  size_t matrix_size = 256;
+  if (argc > 1)
+    matrix_size = std::stoul(argv[1]);
+
+  queue q;
+  std::vector<combination> combinations =
+      q.get_device()
+          .get_info<sycl::ext::oneapi::experimental::info::device::
+                        matrix_combinations>();
+
+  constexpr size_t MCache1 = 32;
+  constexpr size_t MCache2 = 256;
+  constexpr size_t NCache2 = 256;
+  constexpr size_t KCache2 = 32;
+
+#ifdef VNNI
+  constexpr unsigned int VnniFactor = 2;
+#else // VNNI
+  constexpr unsigned int VnniFactor = 1;
+#endif // VNNI
+
+  for (unsigned int i = 0; i < combinations.size(); i++) {
+    if (combinations[i].nsize == 0) { // Intel AMX
+      constexpr size_t NCache1 = 32;
+      constexpr size_t KCache1 = 32;
+      test<bfloat16, float, VnniFactor, /*TM*/ 16, /*TN*/ 16, /*TK*/ 32,
+           MCache1, NCache1, KCache1, MCache2, NCache2, KCache2>(matrix_size);
+      break;
+    }
+
+    if (combinations[i].nsize == 16) { // architecture::intel_gpu_pvc
+      constexpr size_t NCache1 = 4 * /*TN*/ 16;
+      constexpr size_t KCache1 = 16;
+      test<bfloat16, float, VnniFactor, /*TM*/ 8, /*TN*/ 16, /*TK*/ 16, MCache1,
+           NCache1, KCache1, MCache2, NCache2, KCache2>(matrix_size);
+#if (!defined(SG_SZ) || SG_SZ != 32)
+      // These combinations are not currently supported for subgroup size = 32
+      // in IGC
+      test<bfloat16, float, VnniFactor, /*TM*/ 16, /*TN*/ 16, /*TK*/ 16,
+           MCache1, NCache1, KCache1, MCache2, NCache2, KCache2>(matrix_size);
+      test<bfloat16, float, VnniFactor, /*TM*/ 32, /*TN*/ 64, /*TK*/ 16,
+           MCache1, NCache1, KCache1, MCache2, NCache2, KCache2>(matrix_size);
+#endif
+      break;
+    }
+
+    if (combinations[i].nsize == 8) { // architecture::intel_gpu_dg2*
+      constexpr size_t NCache1 = 4 * /*TN*/ 8;
+      constexpr size_t KCache1 = 16;
+
+      test<bfloat16, float, VnniFactor, /*TM*/ 8, /*TN*/ 8, /*TK*/ 16, MCache1,
+           NCache1, KCache1, MCache2, NCache2, KCache2>(matrix_size);
+      // test(matrix_size);
+      break;
+    }
+  }
+  return 0;
+}