[SYCL][E2E][Joint Matrix] update performance test to add SLM (#15229)

dkhaldi committed Sep 10, 2024
1 parent 594ae74 commit 37d1d51
Showing 6 changed files with 217 additions and 51 deletions.
23 changes: 23 additions & 0 deletions sycl/test-e2e/Matrix/SG32/joint_matrix_bf16_fill_k_cache_SLM.cpp
@@ -0,0 +1,23 @@
//==--- joint_matrix_bf16_fill_k_cache_SLM.cpp - DPC++ joint_matrix--------==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2
// UNSUPPORTED: gpu-intel-dg2
// REQUIRES: aspect-ext_intel_matrix, gpu

// RUN: %{build} -o %t_gpu_vnni.out -ffp-model=precise -DSLM -DVNNI
// RUN: %{run} %t_gpu_vnni.out

// TODO: add a row-major compilation and run once sub-group size 32
// support for row major becomes available in IGC.

// -ffp-model=precise is added so the test does not depend on compiler defaults.

#include "../common.hpp"
#define SG_SZ 32

#include "../joint_matrix_bf16_fill_k_cache_impl.hpp"
@@ -0,0 +1,23 @@
//==--- joint_matrix_bf16_fill_k_cache_SLM.cpp - DPC++ joint_matrix--------==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2
// UNSUPPORTED: gpu-intel-dg2
// REQUIRES: aspect-ext_intel_matrix, gpu

// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t_gpu_vnni.out -ffp-model=precise -DSLM -DVNNI
// RUN: %{run} %t_gpu_vnni.out

// TODO: add a row-major compilation and run once sub-group size 32
// support for row major becomes available in IGC.

// -ffp-model=precise is added so the test does not depend on compiler defaults.

#include "../../common.hpp"
#define SG_SZ 32

#include "../../joint_matrix_bf16_fill_k_cache_impl.hpp"
@@ -0,0 +1,19 @@
//==--- joint_matrix_bf16_fill_k_cache_SLM.cpp - DPC++ joint_matrix--------==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// REQUIRES: aspect-ext_intel_matrix, gpu

// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t_gpu_vnni.out -ffp-model=precise -DSLM -DVNNI
// RUN: %{run} %t_gpu_vnni.out

// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t_gpu.out -ffp-model=precise -DSLM
// RUN: %{run} %t_gpu.out

// -ffp-model=precise is added so the test does not depend on compiler defaults.

#include "../common.hpp"
#include "../joint_matrix_bf16_fill_k_cache_impl.hpp"
19 changes: 19 additions & 0 deletions sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_SLM.cpp
@@ -0,0 +1,19 @@
//==--- joint_matrix_bf16_fill_k_cache_SLM.cpp - DPC++ joint_matrix--------==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// REQUIRES: aspect-ext_intel_matrix, gpu

// RUN: %{build} -o %t_gpu_vnni.out -ffp-model=precise -DSLM -DVNNI
// RUN: %{run} %t_gpu_vnni.out

// RUN: %{build} -o %t_gpu.out -ffp-model=precise -DSLM
// RUN: %{run} %t_gpu.out

// -ffp-model=precise is added so the test does not depend on compiler defaults.

#include "common.hpp"
#include "joint_matrix_bf16_fill_k_cache_impl.hpp"
114 changes: 63 additions & 51 deletions sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_impl.hpp
@@ -9,6 +9,10 @@
#include <random>
#include <sycl/usm.hpp>

#ifdef SLM
#include "slm_utils.hpp"
#endif

// number of test iterations
constexpr unsigned int testIterations = 100;
// start recording time after X iterations
@@ -51,6 +55,12 @@ double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) {
std::chrono::high_resolution_clock::now();

q.submit([&](handler &h) {
#ifdef SLM
local_accessor<TOperand, 2> tileA{{MCache2, KCache2}, h};
local_accessor<TOperand, 2> tileB{
{KCache2 / vnniFactor, NCache2 * vnniFactor}, h};
#endif

h.parallel_for<MatMul<TM, TN, TK>>( // cache layer#1
nd_range<2>{global, cachelocal},
// loop global
@@ -60,15 +70,16 @@ double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) {
[[intel::reqd_sub_group_size(SG_SZ)]]
#endif // SG_SZ
{
// sg::load and sg::store expect decorations to be ON
auto pA =
address_space_cast<sycl::access::address_space::global_space,
sycl::access::decorated::no>(A);
sycl::access::decorated::yes>(A);
auto pB =
address_space_cast<sycl::access::address_space::global_space,
sycl::access::decorated::no>(B);
sycl::access::decorated::yes>(B);
auto pC =
address_space_cast<sycl::access::address_space::global_space,
sycl::access::decorated::no>(C);
sycl::access::decorated::yes>(C);
auto m2 = it.get_group(0);
auto n2 = it.get_group(1);
auto m1 = it.get_local_id(0);
@@ -112,7 +123,6 @@ double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) {
colsA, layout::row_major,
syclex::properties{syclex::prefetch_hint_L1});

#ifdef VNNI
for (int p = 0; p < prefDistance; p++)
joint_matrix_prefetch<prefRow, prefCol>(
sg,
@@ -122,15 +132,6 @@ double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) {
(n2 * NCache2 * vnniFactor + pn1B * prefCol),
colsB * vnniFactor, layout::row_major,
syclex::properties{syclex::prefetch_hint_L1});
#else // VNNI
for (int p = 0; p < prefDistance; p++)
joint_matrix_prefetch<prefRow, prefCol>(
sg,
B + (p * KCache2 + pm1B * prefRow) * colsB + n2 * NCache2 +
pn1B * prefCol,
colsB, layout::row_major,
syclex::properties{syclex::prefetch_hint_L1});
#endif // VNNI
#endif // PREFETCH

joint_matrix<sub_group, TResult, use::accumulator, TM, TN>
@@ -157,7 +158,16 @@ double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) {
}
#endif // MANUAL_UNROLL

#ifdef SLM
constexpr unsigned int SGs =
(MCache2 / MCache1) * (NCache2 / NCache1);
#endif // SLM
for (unsigned int k2 = 0; k2 < colsA / KCache2; k2++) {
#ifdef SLM
slm_read_write<colsA, colsB, MCache2, NCache2, KCache2, vnniFactor,
SGs>(pA, pB, tileA, tileB, sg, k2, m2, n2, sgSize);
it.barrier(access::fence_space::local_space);
#endif // SLM
joint_matrix<sub_group, TOperand, use::a, TM, TK, layout::row_major>
tA[MCache1 / TM][KCache2 / KCache1]
#ifdef INIT_LIST
@@ -192,6 +202,14 @@ double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) {
#else // MANUAL_UNROLL
for (unsigned int m = 0; m < MCache1 / TM; m++) {
#endif // MANUAL_UNROLL
#ifdef SLM
joint_matrix_load(sg, tA[m][k1],
tileA.template get_multi_ptr<
sycl::access::decorated::no>() +
(m1 * MCache1 + m * TM) * KCache2 +
k1 * TK,
KCache2);
#else // SLM
#ifdef OOB
ext::intel::experimental::matrix::joint_matrix_load_checked(
sg, tA[m][k1], pA, colsA, rowsA, colsA,
Expand All @@ -203,6 +221,7 @@ double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) {
k * TK,
colsA);
#endif // OOB
#endif // SLM
#ifdef MANUAL_UNROLL
}); // m
#else // MANUAL_UNROLL
@@ -213,32 +232,28 @@ double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) {
#else // MANUAL_UNROLL
for (unsigned int n = 0; n < NCache1 / TN; n++) {
#endif // MANUAL_UNROLL
#ifdef SLM
joint_matrix_load(sg, tB[n][k1],
tileB.template get_multi_ptr<
sycl::access::decorated::no>() +
(k1 * TK / vnniFactor) *
(NCache2 * vnniFactor) +
(n1 * NCache1 + n * TN) * vnniFactor,
NCache2 * vnniFactor);
#else // SLM
#ifdef OOB
#ifdef VNNI
ext::intel::experimental::matrix::joint_matrix_load_checked(
sg, tB[n][k1], pB, colsB * vnniFactor, rowsB / vnniFactor,
colsB * vnniFactor, k * TK / vnniFactor,
(n2 * NCache2 + n1 * NCache1 + n * TN) * vnniFactor);
#else // VNNI
ext::intel::experimental::matrix::joint_matrix_load_checked(
sg, tB[n][k1], pB, colsB, rowsB, colsB, k * TK,
n2 * NCache2 + n1 * NCache1 + n * TN);

#endif // VNNI
#else // OOB
#ifdef VNNI
joint_matrix_load(
sg, tB[n][k1],
pB + (k * TK / vnniFactor) * (colsB * vnniFactor) +
(n2 * NCache2 + n1 * NCache1 + n * TN) * vnniFactor,
colsB * vnniFactor);
#else // VNNI
joint_matrix_load(sg, tB[n][k1],
pB + (k * TK) * (colsB) +
(n2 * NCache2 + n1 * NCache1 + n * TN),
colsB);
#endif // VNNI
#endif // OOB
#endif // SLM
#ifdef MANUAL_UNROLL
}); // n
#else // MANUAL_UNROLL
@@ -266,6 +281,9 @@ double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) {
} // m
} // k1
#endif // MANUAL_UNROLL
#ifdef SLM
it.barrier(access::fence_space::local_space);
#endif // SLM
#ifdef PREFETCH
auto prefetch_offsetA = (m2 * MCache2 + sgId * prefRow) * colsA +
(k2 + prefDistance) * prefCol;
@@ -275,7 +293,6 @@ double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) {
sg, A + prefetch_offsetA, colsA, layout::row_major,
syclex::properties{syclex::prefetch_hint_L1});

#ifdef VNNI
auto prefetch_offsetB =
((k2 + prefDistance) * (KCache2 / vnniFactor) +
pm1B * prefRow) *
@@ -287,16 +304,6 @@ double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) {
sg, B + prefetch_offsetB, colsB * vnniFactor,
layout::row_major,
syclex::properties{syclex::prefetch_hint_L1});
#else // VNNI
auto prefetch_offsetB =
((k2 + prefDistance) * KCache2 + pm1B * prefRow) * (colsB) +
(n2 * NCache2 + pn1B * prefCol);
if ((prefetch_offsetB + (prefRow * MATRIX_SIZE) + prefCol) <
(MATRIX_SIZE * MATRIX_SIZE))
joint_matrix_prefetch<prefRow, prefCol>(
sg, B + prefetch_offsetB, colsB, layout::row_major,
syclex::properties{syclex::prefetch_hint_L1});
#endif // VNNI
#endif // PREFETCH
} // for k2
#ifdef MANUAL_UNROLL
@@ -411,29 +418,33 @@ int main() {
constexpr size_t NCache2 = 256;
constexpr size_t KCache2 = 32;

#ifdef VNNI
constexpr unsigned int VnniFactor = 2;
#else // VNNI
constexpr unsigned int VnniFactor = 1;
#endif // VNNI

for (unsigned int i = 0; i < combinations.size(); i++) {
if (combinations[i].nsize == 0) { // Intel AMX
constexpr size_t NCache1 = 32;
constexpr size_t KCache1 = 32;

test<bfloat16, float, 2, /*TM*/ 16, /*TN*/ 16, /*TK*/ 32, MCache1,
NCache1, KCache1, MCache2, NCache2, KCache2>();
test<bfloat16, float, VnniFactor, /*TM*/ 16, /*TN*/ 16, /*TK*/ 32,
MCache1, NCache1, KCache1, MCache2, NCache2, KCache2>();
break;
}

if (combinations[i].nsize == 16) { // architecture::intel_gpu_pvc
constexpr size_t NCache1 = 4 * /*TN*/ 16;
constexpr size_t KCache1 = 16;

test<bfloat16, float, 2, /*TM*/ 8, /*TN*/ 16, /*TK*/ 16, MCache1, NCache1,
KCache1, MCache2, NCache2, KCache2>();
test<bfloat16, float, VnniFactor, /*TM*/ 8, /*TN*/ 16, /*TK*/ 16, MCache1,
NCache1, KCache1, MCache2, NCache2, KCache2>();
#if (!defined(SG_SZ) || SG_SZ != 32)
          // These combinations are not currently supported for sub-group
          // size = 32 in IGC.
test<bfloat16, float, 2, /*TM*/ 16, /*TN*/ 16, /*TK*/ 16, MCache1,
NCache1, KCache1, MCache2, NCache2, KCache2>();
test<bfloat16, float, 2, /*TM*/ 32, /*TN*/ 64, /*TK*/ 16, MCache1,
NCache1, KCache1, MCache2, NCache2, KCache2>();
test<bfloat16, float, VnniFactor, /*TM*/ 16, /*TN*/ 16, /*TK*/ 16,
MCache1, NCache1, KCache1, MCache2, NCache2, KCache2>();
test<bfloat16, float, VnniFactor, /*TM*/ 32, /*TN*/ 64, /*TK*/ 16,
MCache1, NCache1, KCache1, MCache2, NCache2, KCache2>();
#endif
break;
}
Expand All @@ -442,9 +453,10 @@ int main() {
constexpr size_t NCache1 = 4 * /*TN*/ 8;
constexpr size_t KCache1 = 16;

test<bfloat16, float, 2, /*TM*/ 8, /*TN*/ 8, /*TK*/ 16, MCache1, NCache1,
KCache1, MCache2, NCache2, KCache2>();
// test<bfloat16, float, 2, /*TM*/ 32, /*TN*/ 32, /*TK*/ 16, MCache1,
test<bfloat16, float, VnniFactor, /*TM*/ 8, /*TN*/ 8, /*TK*/ 16, MCache1,
NCache1, KCache1, MCache2, NCache2, KCache2>();
// test<bfloat16, float, VnniFactor, /*TM*/ 32, /*TN*/ 32, /*TK*/ 16,
// MCache1,
// NCache1, KCache1, MCache2, NCache2, KCache2>();
break;
}
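The sixth changed file is not rendered above; given the new #include "slm_utils.hpp" in the impl, it presumably defines the slm_read_write helper called once per k2 iteration. Below is a plausible sketch under that assumption: the template parameters and arguments mirror the call site in the impl, while the strided-copy strategy and every name inside the body are hypothetical.

#include <sycl/sycl.hpp>
using namespace sycl;

// Hypothetical sketch of slm_read_write (the real helper lives in
// slm_utils.hpp, whose diff is not shown on this page). Every work-item in
// the work-group copies a strided slice of the current K-panels of A and
// VNNI-packed B from global memory into the SLM tiles.
template <unsigned int colsA, unsigned int colsB, unsigned int MCache2,
          unsigned int NCache2, unsigned int KCache2, unsigned int vnniFactor,
          unsigned int SGs, typename TOperand, typename GlobalPtr>
void slm_read_write(GlobalPtr pA, GlobalPtr pB,
                    local_accessor<TOperand, 2> tileA,
                    local_accessor<TOperand, 2> tileB, sub_group sg,
                    unsigned int k2, size_t m2, size_t n2, size_t sgSize) {
  // Linear id of this work-item within the work-group.
  size_t linear = sg.get_group_linear_id() * sgSize + sg.get_local_linear_id();
  size_t wiPerWG = SGs * sgSize; // work-items cooperating on the copy

  // Stage the MCache2 x KCache2 panel of A (row-major).
  for (size_t i = linear; i < MCache2 * KCache2; i += wiPerWG) {
    size_t r = i / KCache2, c = i % KCache2;
    tileA[r][c] = pA[(m2 * MCache2 + r) * colsA + k2 * KCache2 + c];
  }
  // Stage the (KCache2 / vnniFactor) x (NCache2 * vnniFactor) panel of
  // VNNI-packed B, whose packed row pitch is colsB * vnniFactor.
  constexpr size_t bRows = KCache2 / vnniFactor;
  constexpr size_t bCols = NCache2 * vnniFactor;
  for (size_t i = linear; i < bRows * bCols; i += wiPerWG) {
    size_t r = i / bCols, c = i % bCols;
    tileB[r][c] = pB[(k2 * bRows + r) * (colsB * vnniFactor) + n2 * bCols + c];
  }
}

At the call site the impl instantiates SGs = (MCache2 / MCache1) * (NCache2 / NCache1) and follows the call with a local barrier, so no sub-group starts loading from the tiles before the copy completes.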
