From 37d1d51ca8b1efa3858b2ddb64cba0563578e4ac Mon Sep 17 00:00:00 2001
From: Dounia Khaldi
Date: Tue, 10 Sep 2024 10:12:27 -0500
Subject: [PATCH] [SYCL][E2E][Joint Matrix] update performance test to add SLM (#15229)

---
 .../joint_matrix_bf16_fill_k_cache_SLM.cpp    |  23 ++++
 .../joint_matrix_bf16_fill_k_cache_SLM.cpp    |  23 ++++
 .../joint_matrix_bf16_fill_k_cache_SLM.cpp    |  19 +++
 .../joint_matrix_bf16_fill_k_cache_SLM.cpp    |  19 +++
 .../joint_matrix_bf16_fill_k_cache_impl.hpp   | 114 ++++++++++--------
 sycl/test-e2e/Matrix/slm_utils.hpp            |  70 +++++++++++
 6 files changed, 217 insertions(+), 51 deletions(-)
 create mode 100644 sycl/test-e2e/Matrix/SG32/joint_matrix_bf16_fill_k_cache_SLM.cpp
 create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bf16_fill_k_cache_SLM.cpp
 create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache_SLM.cpp
 create mode 100644 sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_SLM.cpp
 create mode 100644 sycl/test-e2e/Matrix/slm_utils.hpp

diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_bf16_fill_k_cache_SLM.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_bf16_fill_k_cache_SLM.cpp
new file mode 100644
index 0000000000000..bd22fa19354b1
--- /dev/null
+++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_bf16_fill_k_cache_SLM.cpp
@@ -0,0 +1,23 @@
+//==--- joint_matrix_bf16_fill_k_cache_SLM.cpp - DPC++ joint_matrix--------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2
+// UNSUPPORTED: gpu-intel-dg2
+// REQUIRES: aspect-ext_intel_matrix, gpu
+
+// RUN: %{build} -o %t_gpu_vnni.out -ffp-model=precise -DSLM -DVNNI
+// RUN: %{run} %t_gpu_vnni.out
+
+// TODO: add row major compilation and run once Sub-group size 32
+// support becomes available in IGC for row major
+
+// -ffp-model=precise is added to not depend on compiler defaults.
+
+#include "../common.hpp"
+#define SG_SZ 32
+
+#include "../joint_matrix_bf16_fill_k_cache_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bf16_fill_k_cache_SLM.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bf16_fill_k_cache_SLM.cpp
new file mode 100644
index 0000000000000..62cef33b3beb7
--- /dev/null
+++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bf16_fill_k_cache_SLM.cpp
@@ -0,0 +1,23 @@
+//==--- joint_matrix_bf16_fill_k_cache_SLM.cpp - DPC++ joint_matrix--------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2
+// UNSUPPORTED: gpu-intel-dg2
+// REQUIRES: aspect-ext_intel_matrix, gpu
+
+// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t_gpu_vnni.out -ffp-model=precise -DSLM -DVNNI
+// RUN: %{run} %t_gpu_vnni.out
+
+// TODO: add row major compilation and run once Sub-group size 32
+// support becomes available in IGC for row major
+
+// -ffp-model=precise is added to not depend on compiler defaults.
+
+#include "../../common.hpp"
+#define SG_SZ 32
+
+#include "../../joint_matrix_bf16_fill_k_cache_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache_SLM.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache_SLM.cpp
new file mode 100644
index 0000000000000..d81e7dbd685ba
--- /dev/null
+++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache_SLM.cpp
@@ -0,0 +1,19 @@
+//==--- joint_matrix_bf16_fill_k_cache_SLM.cpp - DPC++ joint_matrix--------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// REQUIRES: aspect-ext_intel_matrix, gpu
+
+// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t_gpu_vnni.out -ffp-model=precise -DSLM -DVNNI
+// RUN: %{run} %t_gpu_vnni.out
+
+// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t_gpu.out -ffp-model=precise -DSLM
+// RUN: %{run} %t_gpu.out
+
+// -ffp-model=precise is added to not depend on compiler defaults.
+
+#include "../common.hpp"
+#include "../joint_matrix_bf16_fill_k_cache_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_SLM.cpp b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_SLM.cpp
new file mode 100644
index 0000000000000..a30d6320038a8
--- /dev/null
+++ b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_SLM.cpp
@@ -0,0 +1,19 @@
+//==--- joint_matrix_bf16_fill_k_cache_SLM.cpp - DPC++ joint_matrix--------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// REQUIRES: aspect-ext_intel_matrix, gpu
+
+// RUN: %{build} -o %t_gpu_vnni.out -ffp-model=precise -DSLM -DVNNI
+// RUN: %{run} %t_gpu_vnni.out
+
+// RUN: %{build} -o %t_gpu.out -ffp-model=precise -DSLM
+// RUN: %{run} %t_gpu.out
+
+// -ffp-model=precise is added to not depend on compiler defaults.
+
+#include "common.hpp"
+#include "joint_matrix_bf16_fill_k_cache_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_impl.hpp
index 1b31a993bb179..b561bd073038a 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_impl.hpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_impl.hpp
@@ -9,6 +9,10 @@
 #include <random>
 #include <sycl/usm.hpp>
 
+#ifdef SLM
+#include "slm_utils.hpp"
+#endif
+
 // number of test iterations
 constexpr unsigned int testIterations = 100;
 // start recording time after X iterations
@@ -51,6 +55,12 @@ double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) {
       std::chrono::high_resolution_clock::now();
 
   q.submit([&](handler &h) {
+#ifdef SLM
+    local_accessor<TOperand, 2> tileA{{MCache2, KCache2}, h};
+    local_accessor<TOperand, 2> tileB{
+        {KCache2 / vnniFactor, NCache2 * vnniFactor}, h};
+#endif
+
     h.parallel_for<MatMul<TM, TN, TK>>( // cache layer#1
         nd_range<2>{global, cachelocal},
         // loop global
@@ -60,15 +70,16 @@ double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) {
         [[intel::reqd_sub_group_size(SG_SZ)]]
 #endif // SG_SZ
         {
+          // sg::load and sg::store expect decorations to be ON
           auto pA =
               address_space_cast<sycl::access::address_space::global_space,
-                                 sycl::access::decorated::no>(A);
+                                 sycl::access::decorated::yes>(A);
           auto pB =
               address_space_cast<sycl::access::address_space::global_space,
-                                 sycl::access::decorated::no>(B);
+                                 sycl::access::decorated::yes>(B);
           auto pC =
               address_space_cast<sycl::access::address_space::global_space,
-                                 sycl::access::decorated::no>(C);
+                                 sycl::access::decorated::yes>(C);
           auto m2 = it.get_group(0);
           auto n2 = it.get_group(1);
           auto m1 = it.get_local_id(0);
@@ -112,7 +123,6 @@ double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) {
                 colsA, layout::row_major,
                 syclex::properties{syclex::prefetch_hint_L1});
 
-#ifdef VNNI
           for (int p = 0; p < prefDistance; p++)
             joint_matrix_prefetch<prefRow, prefCol>(
                 sg,
@@ -122,15 +132,6 @@ double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) {
                     (n2 * NCache2 * vnniFactor + pn1B * prefCol),
                 colsB * vnniFactor, layout::row_major,
                 syclex::properties{syclex::prefetch_hint_L1});
-#else  // VNNI
-          for (int p = 0; p < prefDistance; p++)
-            joint_matrix_prefetch<prefRow, prefCol>(
-                sg,
-                B + (p * KCache2 + pm1B * prefRow) * colsB + n2 * NCache2 +
-                    pn1B * prefCol,
-                colsB, layout::row_major,
-                syclex::properties{syclex::prefetch_hint_L1});
-#endif // VNNI
 #endif // PREFETCH
 
           joint_matrix<sub_group, TResult, use::accumulator, TM, TN>
@@ -157,7 +158,16 @@ double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) {
             }
 #endif // MANUAL_UNROLL
 
+#ifdef SLM
+          constexpr unsigned int SGs =
+              (MCache2 / MCache1) * (NCache2 / NCache1);
+#endif // SLM
           for (unsigned int k2 = 0; k2 < colsA / KCache2; k2++) {
+#ifdef SLM
+            slm_read_write<colsA, colsB, MCache2, NCache2, KCache2, vnniFactor,
+                           SGs>(pA, pB, tileA, tileB, sg, k2, m2, n2, sgSize);
+            it.barrier(access::fence_space::local_space);
+#endif // SLM
            joint_matrix<sub_group, TOperand, use::a, TM, TK, layout::row_major>
                 tA[MCache1 / TM][KCache2 / KCache1]
 #ifdef INIT_LIST
@@ -192,6 +202,14 @@ double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) {
 #else // MANUAL_UNROLL
           for (unsigned int m = 0; m < MCache1 / TM; m++) {
 #endif // MANUAL_UNROLL
+#ifdef SLM
+              joint_matrix_load(sg, tA[m][k1],
+                                tileA.template get_multi_ptr<
+                                    sycl::access::decorated::no>() +
+                                    (m1 * MCache1 + m * TM) * KCache2 +
+                                    k1 * TK,
+                                KCache2);
+#else  // SLM
 #ifdef OOB
             ext::intel::experimental::matrix::joint_matrix_load_checked(
                 sg, tA[m][k1], pA, colsA, rowsA, colsA,
@@ -203,6 +221,7 @@ double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) {
                     k * TK,
                 colsA);
 #endif // OOB
+#endif // SLM
 #ifdef MANUAL_UNROLL
           }); // m
 #else // MANUAL_UNROLL
@@ -213,32 +232,28 @@ double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) {
 #else // MANUAL_UNROLL
         for (unsigned int n = 0; n < NCache1 / TN; n++) {
 #endif // MANUAL_UNROLL
+#ifdef SLM
+              joint_matrix_load(sg, tB[n][k1],
+                                tileB.template get_multi_ptr<
+                                    sycl::access::decorated::no>() +
+                                    (k1 * TK / vnniFactor) *
+                                        (NCache2 * vnniFactor) +
+                                    (n1 * NCache1 + n * TN) * vnniFactor,
+                                NCache2 * vnniFactor);
+#else  // SLM
 #ifdef OOB
-#ifdef VNNI
             ext::intel::experimental::matrix::joint_matrix_load_checked(
                 sg, tB[n][k1], pB, colsB * vnniFactor, rowsB / vnniFactor,
                 colsB * vnniFactor, k * TK / vnniFactor,
                 (n2 * NCache2 + n1 * NCache1 + n * TN) * vnniFactor);
-#else  // VNNI
-            ext::intel::experimental::matrix::joint_matrix_load_checked(
-                sg, tB[n][k1], pB, colsB, rowsB, colsB, k * TK,
-                n2 * NCache2 + n1 * NCache1 + n * TN);
-
-#endif // VNNI
 #else // OOB
-#ifdef VNNI
             joint_matrix_load(
                 sg, tB[n][k1],
                 pB + (k * TK / vnniFactor) * (colsB * vnniFactor) +
                     (n2 * NCache2 + n1 * NCache1 + n * TN) * vnniFactor,
                 colsB * vnniFactor);
-#else  // VNNI
-            joint_matrix_load(sg, tB[n][k1],
-                              pB + (k * TK) * (colsB) +
-                                  (n2 * NCache2 + n1 * NCache1 + n * TN),
-                              colsB);
-#endif // VNNI
 #endif // OOB
+#endif // SLM
 #ifdef MANUAL_UNROLL
           }); // n
 #else // MANUAL_UNROLL
@@ -266,6 +281,9 @@ double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) {
             } // m
           } // k1
 #endif // MANUAL_UNROLL
+#ifdef SLM
+          it.barrier(access::fence_space::local_space);
+#endif // SLM
 #ifdef PREFETCH
           auto prefetch_offsetA = (m2 * MCache2 + sgId * prefRow) * colsA +
                                   (k2 + prefDistance) * prefCol;
@@ -275,7 +293,6 @@ double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) {
             joint_matrix_prefetch<prefRow, prefCol>(
                 sg, A + prefetch_offsetA, colsA, layout::row_major,
                 syclex::properties{syclex::prefetch_hint_L1});
-#ifdef VNNI
           auto prefetch_offsetB =
               ((k2 + prefDistance) * (KCache2 / vnniFactor) + pm1B * prefRow) *
                   (colsB * vnniFactor) +
@@ -287,16 +304,6 @@ double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) {
                 sg, B + prefetch_offsetB, colsB * vnniFactor,
                 layout::row_major,
                 syclex::properties{syclex::prefetch_hint_L1});
-#else  // VNNI
-          auto prefetch_offsetB =
-              ((k2 + prefDistance) * KCache2 + pm1B * prefRow) * (colsB) +
-              (n2 * NCache2 + pn1B * prefCol);
-          if ((prefetch_offsetB + (prefRow * MATRIX_SIZE) + prefCol) <
-              (MATRIX_SIZE * MATRIX_SIZE))
-            joint_matrix_prefetch<prefRow, prefCol>(
-                sg, B + prefetch_offsetB, colsB, layout::row_major,
-                syclex::properties{syclex::prefetch_hint_L1});
-#endif // VNNI
 #endif // PREFETCH
         } // for k2
 #ifdef MANUAL_UNROLL
@@ -411,29 +418,33 @@ int main() {
   constexpr size_t NCache2 = 256;
   constexpr size_t KCache2 = 32;
 
+#ifdef VNNI
+  constexpr unsigned int VnniFactor = 2;
+#else  // VNNI
+  constexpr unsigned int VnniFactor = 1;
+#endif // VNNI
+
   for (unsigned int i = 0; i < combinations.size(); i++) {
     if (combinations[i].nsize == 0) { // Intel AMX
       constexpr size_t NCache1 = 32;
       constexpr size_t KCache1 = 32;
-
-      test<bfloat16, float, /*TM*/ 16, /*TN*/ 16, /*TK*/ 32, MCache1, NCache1,
-           KCache1, MCache2, NCache2, KCache2>();
+      test<bfloat16, float, VnniFactor, /*TM*/ 16, /*TN*/ 16, /*TK*/ 32,
+           MCache1, NCache1, KCache1, MCache2, NCache2, KCache2>();
       break;
     }
 
     if (combinations[i].nsize == 16) { // architecture::intel_gpu_pvc
       constexpr size_t NCache1 = 4 * /*TN*/ 16;
       constexpr size_t KCache1 = 16;
-
-      test<bfloat16, float, /*TM*/ 8, /*TN*/ 16, /*TK*/ 16, MCache1, NCache1,
-           KCache1, MCache2, NCache2, KCache2>();
+      test<bfloat16, float, VnniFactor, /*TM*/ 8, /*TN*/ 16, /*TK*/ 16,
+           MCache1, NCache1, KCache1, MCache2, NCache2, KCache2>();
 #if (!defined(SG_SZ) || SG_SZ != 32)
       // These combination are not currently supported for subgroup size = 32 in
       // IGC
-      test<bfloat16, float, /*TM*/ 16, /*TN*/ 16, /*TK*/ 16, MCache1, NCache1,
-           KCache1, MCache2, NCache2, KCache2>();
-      test<bfloat16, float, /*TM*/ 32, /*TN*/ 64, /*TK*/ 16, MCache1, NCache1,
-           KCache1, MCache2, NCache2, KCache2>();
+      test<bfloat16, float, VnniFactor, /*TM*/ 16, /*TN*/ 16, /*TK*/ 16,
+           MCache1, NCache1, KCache1, MCache2, NCache2, KCache2>();
+      test<bfloat16, float, VnniFactor, /*TM*/ 32, /*TN*/ 64, /*TK*/ 16,
+           MCache1, NCache1, KCache1, MCache2, NCache2, KCache2>();
 #endif
       break;
     }
@@ -442,9 +453,10 @@ int main() {
       constexpr size_t NCache1 = 4 * /*TN*/ 8;
      constexpr size_t KCache1 = 16;
 
-      test<bfloat16, float, /*TM*/ 8, /*TN*/ 8, /*TK*/ 16, MCache1, NCache1,
-           KCache1, MCache2, NCache2, KCache2>();
-      // test<bfloat16, float, /*TM*/ 32, /*TN*/ 32, /*TK*/ 16, ...>();
+      test<bfloat16, float, VnniFactor, /*TM*/ 8, /*TN*/ 8, /*TK*/ 16, MCache1,
+           NCache1, KCache1, MCache2, NCache2, KCache2>();
+      // test<bfloat16, float, VnniFactor, /*TM*/ 32, /*TN*/ 32, /*TK*/ 16,
+      //      MCache1, NCache1, KCache1, MCache2, NCache2, KCache2>();
       break;
     }
diff --git a/sycl/test-e2e/Matrix/slm_utils.hpp b/sycl/test-e2e/Matrix/slm_utils.hpp
new file mode 100644
index 0000000000000..28ac1264b8cc0
--- /dev/null
+++ b/sycl/test-e2e/Matrix/slm_utils.hpp
@@ -0,0 +1,70 @@
+//==------------------ slm_utils.hpp - DPC++ joint_matrix------------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+template <unsigned int colsA, unsigned int colsB, unsigned int MCache2,
+          unsigned int NCache2, unsigned int KCache2, unsigned int vnniFactor,
+          unsigned int SGs, typename TOperand>
+inline void
+slm_read_write(multi_ptr<TOperand, sycl::access::address_space::global_space,
+                         sycl::access::decorated::yes> pA,
+               multi_ptr<TOperand, sycl::access::address_space::global_space,
+                         sycl::access::decorated::yes> pB,
+               local_accessor<TOperand, 2> tileA,
+               local_accessor<TOperand, 2> tileB, sub_group sg, unsigned int k2,
+               size_t m2, size_t n2, size_t sgSize) {
+  // Number of elements to be loaded into SLM per WI
+  size_t elemsPerLoadA = KCache2 / sgSize;
+  for (int i = 0; i < MCache2 / SGs; i++) {
+    size_t GlOffsetA =
+        (m2 * MCache2 + sg.get_group_id() * (MCache2 / SGs) + i) * colsA +
+        k2 * KCache2;
+    size_t LocOffsetA = (sg.get_group_id() * (MCache2 / SGs) + i) * KCache2;
+
+    if (elemsPerLoadA == 2) {
+      vec<TOperand, 2> slmVecA = sg.load<2>(pA + GlOffsetA);
+      sg.store<2>(tileA.template get_multi_ptr<sycl::access::decorated::yes>() +
+                      LocOffsetA,
+                  slmVecA);
+    } else if (elemsPerLoadA == 4) {
+      vec<TOperand, 4> slmVecA = sg.load<4>(pA + GlOffsetA);
+      sg.store<4>(tileA.template get_multi_ptr<sycl::access::decorated::yes>() +
+                      LocOffsetA,
+                  slmVecA);
+    } else if (elemsPerLoadA == 1) {
+      TOperand slmScaA = sg.load(pA + GlOffsetA);
+
+      sg.store(tileA.template get_multi_ptr<sycl::access::decorated::yes>() +
+                   LocOffsetA,
+               slmScaA);
+    } else
+      assert(elemsPerLoadA == 1 || elemsPerLoadA == 2 || elemsPerLoadA == 4);
+  }
+  // how much each SG will load to SLM --> has to be contiguous
+  // NCache2*KCache2/(SGs*SG_SIZE) = 16
+  size_t elemsPerLoadB = NCache2 * KCache2 / (SGs * sgSize);
+  size_t sgsPerRow = (NCache2 * vnniFactor) / (elemsPerLoadB * sgSize);
+  size_t GlOffsetB =
+      (k2 * (KCache2 / vnniFactor) + (uint)(sg.get_group_id() / sgsPerRow)) *
+          (colsB * vnniFactor) +
+      n2 * NCache2 * vnniFactor +
+      (sg.get_group_id() % sgsPerRow) * (elemsPerLoadB * sgSize);
+  size_t LocOffsetB =
+      ((uint)(sg.get_group_id() / sgsPerRow)) * NCache2 * vnniFactor +
+      (sg.get_group_id() % sgsPerRow) * elemsPerLoadB * sgSize;
+  if (elemsPerLoadB == 16) {
+    vec<TOperand, 16> slmVecB = sg.load<16>(pB + GlOffsetB);
+
+    sg.store<16>(tileB.template get_multi_ptr<sycl::access::decorated::yes>() +
+                     LocOffsetB,
+                 slmVecB);
+  } else if (elemsPerLoadB == 8) {
+    vec<TOperand, 8> slmVecB = sg.load<8>(pB + GlOffsetB);
+
+    sg.store<8>(tileB.template get_multi_ptr<sycl::access::decorated::yes>() +
+                    LocOffsetB,
+                slmVecB);
+  } else
+    assert(elemsPerLoadB == 8 || elemsPerLoadB == 16);
+}
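Background on the technique used above, for readers unfamiliar with it: slm_utils.hpp has each sub-group cooperatively stage a block of A and of B from global memory into shared local memory once per K-block, and the kernel then serves every joint_matrix_load in that work-group from SLM, with a local barrier separating the fill phase from the consume phase. The stand-alone sketch below is illustrative only and is not part of the patch; the names tileSize and wgSize and the plain float element type are made-up stand-ins for the MCache2/KCache2 bfloat16 tiles used by the real test, and it deliberately omits joint_matrix so that only the fill/barrier/consume structure is shown.

#include <sycl/sycl.hpp>
#include <iostream>

int main() {
  // Made-up sizes; the real test stages MCache2 x KCache2 bfloat16 tiles.
  constexpr size_t tileSize = 256; // elements staged in SLM per work-group
  constexpr size_t wgSize = 64;    // work-items cooperating on the copy

  sycl::queue q;
  float *A = sycl::malloc_shared<float>(tileSize, q);
  float *out = sycl::malloc_shared<float>(tileSize, q);
  for (size_t i = 0; i < tileSize; i++)
    A[i] = float(i);

  q.submit([&](sycl::handler &h) {
     // Shared local memory tile, playing the same role as tileA/tileB above.
     sycl::local_accessor<float, 1> tile{sycl::range<1>{tileSize}, h};
     h.parallel_for(
         sycl::nd_range<1>{sycl::range<1>{wgSize}, sycl::range<1>{wgSize}},
         [=](sycl::nd_item<1> it) {
           size_t lid = it.get_local_id(0);
           // Fill phase: work-items cooperatively copy the tile into SLM.
           for (size_t i = lid; i < tileSize; i += wgSize)
             tile[i] = A[i];
           // Everyone must see the complete tile before anyone consumes it.
           it.barrier(sycl::access::fence_space::local_space);
           // Consume phase: reads now hit SLM instead of global memory
           // (the patch feeds such reads into joint_matrix_load).
           for (size_t i = lid; i < tileSize; i += wgSize)
             out[i] = tile[i] * 2.0f;
         });
   }).wait();

  std::cout << "out[10] = " << out[10] << std::endl;
  sycl::free(A, q);
  sycl::free(out, q);
  return 0;
}

A work-group-wide barrier, rather than a sub-group one, is what makes the staged tile visible to all sub-groups in the group; that is why the patch places it.barrier(access::fence_space::local_space) immediately after slm_read_write and again before the next K-block overwrites the tiles.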