From 37d1d51ca8b1efa3858b2ddb64cba0563578e4ac Mon Sep 17 00:00:00 2001
From: Dounia Khaldi
Date: Tue, 10 Sep 2024 10:12:27 -0500
Subject: [PATCH] [SYCL][E2E][Joint Matrix] update performance test to add SLM (#15229)

---
 .../joint_matrix_bf16_fill_k_cache_SLM.cpp    |  23 ++++
 .../joint_matrix_bf16_fill_k_cache_SLM.cpp    |  23 ++++
 .../joint_matrix_bf16_fill_k_cache_SLM.cpp    |  19 +++
 .../joint_matrix_bf16_fill_k_cache_SLM.cpp    |  19 +++
 .../joint_matrix_bf16_fill_k_cache_impl.hpp   | 114 ++++++++++--------
 sycl/test-e2e/Matrix/slm_utils.hpp            |  70 +++++++++++
 6 files changed, 217 insertions(+), 51 deletions(-)
 create mode 100644 sycl/test-e2e/Matrix/SG32/joint_matrix_bf16_fill_k_cache_SLM.cpp
 create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bf16_fill_k_cache_SLM.cpp
 create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache_SLM.cpp
 create mode 100644 sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_SLM.cpp
 create mode 100644 sycl/test-e2e/Matrix/slm_utils.hpp

diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_bf16_fill_k_cache_SLM.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_bf16_fill_k_cache_SLM.cpp
new file mode 100644
index 0000000000000..bd22fa19354b1
--- /dev/null
+++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_bf16_fill_k_cache_SLM.cpp
@@ -0,0 +1,23 @@
+//==--- joint_matrix_bf16_fill_k_cache_SLM.cpp - DPC++ joint_matrix--------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2
+// UNSUPPORTED: gpu-intel-dg2
+// REQUIRES: aspect-ext_intel_matrix, gpu
+
+// RUN: %{build} -o %t_gpu_vnni.out -ffp-model=precise -DSLM -DVNNI
+// RUN: %{run} %t_gpu_vnni.out
+
+// TODO: add row major compilation and run once Sub-group size 32
+// support becomes available in IGC for row major
+
+// -ffp-model=precise is added to not depend on compiler defaults.
+
+#include "../common.hpp"
+#define SG_SZ 32
+
+#include "../joint_matrix_bf16_fill_k_cache_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bf16_fill_k_cache_SLM.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bf16_fill_k_cache_SLM.cpp
new file mode 100644
index 0000000000000..62cef33b3beb7
--- /dev/null
+++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bf16_fill_k_cache_SLM.cpp
@@ -0,0 +1,23 @@
+//==--- joint_matrix_bf16_fill_k_cache_SLM.cpp - DPC++ joint_matrix--------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2
+// UNSUPPORTED: gpu-intel-dg2
+// REQUIRES: aspect-ext_intel_matrix, gpu
+
+// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t_gpu_vnni.out -ffp-model=precise -DSLM -DVNNI
+// RUN: %{run} %t_gpu_vnni.out
+
+// TODO: add row major compilation and run once Sub-group size 32
+// support becomes available in IGC for row major
+
+// -ffp-model=precise is added to not depend on compiler defaults.
+
+#include "../../common.hpp"
+#define SG_SZ 32
+
+#include "../../joint_matrix_bf16_fill_k_cache_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache_SLM.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache_SLM.cpp
new file mode 100644
index 0000000000000..d81e7dbd685ba
--- /dev/null
+++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache_SLM.cpp
@@ -0,0 +1,19 @@
+//==--- joint_matrix_bf16_fill_k_cache_SLM.cpp - DPC++ joint_matrix--------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// REQUIRES: aspect-ext_intel_matrix, gpu
+
+// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t_gpu_vnni.out -ffp-model=precise -DSLM -DVNNI
+// RUN: %{run} %t_gpu_vnni.out
+
+// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t_gpu.out -ffp-model=precise -DSLM
+// RUN: %{run} %t_gpu.out
+
+// -ffp-model=precise is added to not depend on compiler defaults.
+
+#include "../common.hpp"
+#include "../joint_matrix_bf16_fill_k_cache_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_SLM.cpp b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_SLM.cpp
new file mode 100644
index 0000000000000..a30d6320038a8
--- /dev/null
+++ b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_SLM.cpp
@@ -0,0 +1,19 @@
+//==--- joint_matrix_bf16_fill_k_cache_SLM.cpp - DPC++ joint_matrix--------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// REQUIRES: aspect-ext_intel_matrix, gpu
+
+// RUN: %{build} -o %t_gpu_vnni.out -ffp-model=precise -DSLM -DVNNI
+// RUN: %{run} %t_gpu_vnni.out
+
+// RUN: %{build} -o %t_gpu.out -ffp-model=precise -DSLM
+// RUN: %{run} %t_gpu.out
+
+// -ffp-model=precise is added to not depend on compiler defaults.
+
+#include "common.hpp"
+#include "joint_matrix_bf16_fill_k_cache_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_impl.hpp
index 1b31a993bb179..b561bd073038a 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_impl.hpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_impl.hpp
@@ -9,6 +9,10 @@
 #include <random>
 #include <sycl/usm.hpp>
 
+#ifdef SLM
+#include "slm_utils.hpp"
+#endif
+
 // number of test iterations
 constexpr unsigned int testIterations = 100;
 // start recording time after X iterations
@@ -51,6 +55,12 @@ double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) {
       std::chrono::high_resolution_clock::now();
 
   q.submit([&](handler &h) {
+#ifdef SLM
+    local_accessor<TOperand, 2> tileA{{MCache2, KCache2}, h};
+    local_accessor<TOperand, 2> tileB{
+        {KCache2 / vnniFactor, NCache2 * vnniFactor}, h};
+#endif
+
     h.parallel_for<MatMul<TM, TN, TK>>( // cache layer#1
         nd_range<2>{global, cachelocal},
         // loop global
@@ -60,15 +70,16 @@ double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) {
         [[intel::reqd_sub_group_size(SG_SZ)]]
 #endif // SG_SZ
         {
+          // sg::load and sg::store expect decorations to be ON
           auto pA =
               address_space_cast<sycl::access::address_space::global_space,
-                                 sycl::access::decorated::no>(A);
+                                 sycl::access::decorated::yes>(A);
           auto pB =
               address_space_cast<sycl::access::address_space::global_space,
-                                 sycl::access::decorated::no>(B);
+                                 sycl::access::decorated::yes>(B);
           auto pC =
               address_space_cast<sycl::access::address_space::global_space,
-                                 sycl::access::decorated::no>(C);
+                                 sycl::access::decorated::yes>(C);
           auto m2 = it.get_group(0);
           auto n2 = it.get_group(1);
           auto m1 = it.get_local_id(0);
@@ -112,7 +123,6 @@ double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) {
                 colsA, layout::row_major,
                 syclex::properties{syclex::prefetch_hint_L1});
 
-#ifdef VNNI
           for (int p = 0; p < prefDistance; p++)
             joint_matrix_prefetch<prefRow, prefCol>(
                 sg,
@@ -122,15 +132,6 @@ double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) {
                     (n2 * NCache2 * vnniFactor + pn1B * prefCol),
                 colsB * vnniFactor, layout::row_major,
                 syclex::properties{syclex::prefetch_hint_L1});
-#else  // VNNI
-          for (int p = 0; p < prefDistance; p++)
-            joint_matrix_prefetch<prefRow, prefCol>(
-                sg,
-                B + (p * KCache2 + pm1B * prefRow) * colsB + n2 * NCache2 +
-                    pn1B * prefCol,
-                colsB, layout::row_major,
-                syclex::properties{syclex::prefetch_hint_L1});
-#endif // VNNI
 #endif // PREFETCH
 
           joint_matrix<sub_group, TResult, use::accumulator, TM, TN>
@@ -157,7 +158,16 @@ double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) {
             }
 #endif // MANUAL_UNROLL
 
+#ifdef SLM
+          constexpr unsigned int SGs =
+              (MCache2 / MCache1) * (NCache2 / NCache1);
+#endif // SLM
           for (unsigned int k2 = 0; k2 < colsA / KCache2; k2++) {
+#ifdef SLM
+            slm_read_write<colsA, colsB, MCache2, NCache2, KCache2, vnniFactor,
+                           SGs>(pA, pB, tileA, tileB, sg, k2, m2, n2, sgSize);
+            it.barrier(access::fence_space::local_space);
+#endif // SLM
            joint_matrix<sub_group, TOperand, use::a, TM, TK, layout::row_major>
                 tA[MCache1 / TM][KCache2 / KCache1]
 #ifdef INIT_LIST
@@ -192,6 +202,14 @@ double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) {
 #else // MANUAL_UNROLL
           for (unsigned int m = 0; m < MCache1 / TM; m++) {
 #endif // MANUAL_UNROLL
+#ifdef SLM
+              joint_matrix_load(sg, tA[m][k1],
+                                tileA.template get_multi_ptr<
+                                    sycl::access::decorated::no>() +
+                                    (m1 * MCache1 + m * TM) * KCache2 +
+                                    k1 * TK,
+                                KCache2);
+#else  // SLM
 #ifdef OOB
             ext::intel::experimental::matrix::joint_matrix_load_checked(
                 sg, tA[m][k1], pA, colsA, rowsA, colsA,
@@ -203,6 +221,7 @@ double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) {
                     k * TK,
                 colsA);
 #endif // OOB
+#endif // SLM
 #ifdef MANUAL_UNROLL
           }); // m
 #else // MANUAL_UNROLL
@@ -213,32 +232,28 @@ double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) {
 #else // MANUAL_UNROLL
         for (unsigned int n = 0; n < NCache1 / TN; n++) {
 #endif // MANUAL_UNROLL
+#ifdef SLM
+              joint_matrix_load(sg, tB[n][k1],
+                                tileB.template get_multi_ptr<
+                                    sycl::access::decorated::no>() +
+                                    (k1 * TK / vnniFactor) *
+                                        (NCache2 * vnniFactor) +
+                                    (n1 * NCache1 + n * TN) * vnniFactor,
+                                NCache2 * vnniFactor);
+#else  // SLM
 #ifdef OOB
-#ifdef VNNI
             ext::intel::experimental::matrix::joint_matrix_load_checked(
                 sg, tB[n][k1], pB, colsB * vnniFactor, rowsB / vnniFactor,
                 colsB * vnniFactor, k * TK / vnniFactor,
                 (n2 * NCache2 + n1 * NCache1 + n * TN) * vnniFactor);
-#else  // VNNI
-            ext::intel::experimental::matrix::joint_matrix_load_checked(
-                sg, tB[n][k1], pB, colsB, rowsB, colsB, k * TK,
-                n2 * NCache2 + n1 * NCache1 + n * TN);
-
-#endif // VNNI
 #else // OOB
-#ifdef VNNI
             joint_matrix_load(
                 sg, tB[n][k1],
                 pB + (k * TK / vnniFactor) * (colsB * vnniFactor) +
                     (n2 * NCache2 + n1 * NCache1 + n * TN) * vnniFactor,
                 colsB * vnniFactor);
-#else  // VNNI
-            joint_matrix_load(sg, tB[n][k1],
-                              pB + (k * TK) * (colsB) +
-                                  (n2 * NCache2 + n1 * NCache1 + n * TN),
-                              colsB);
-#endif // VNNI
 #endif // OOB
+#endif // SLM
 #ifdef MANUAL_UNROLL
           }); // n
 #else // MANUAL_UNROLL
@@ -266,6 +281,9 @@ double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) {
             } // m
           } // k1
 #endif // MANUAL_UNROLL
+#ifdef SLM
+          it.barrier(access::fence_space::local_space);
+#endif // SLM
 #ifdef PREFETCH
           auto prefetch_offsetA = (m2 * MCache2 + sgId * prefRow) * colsA +
                                   (k2 + prefDistance) * prefCol;
@@ -275,7 +293,6 @@ double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) {
             joint_matrix_prefetch<prefRow, prefCol>(
                 sg, A + prefetch_offsetA, colsA, layout::row_major,
                 syclex::properties{syclex::prefetch_hint_L1});
-#ifdef VNNI
           auto prefetch_offsetB =
               ((k2 + prefDistance) * (KCache2 / vnniFactor) + pm1B * prefRow) *
                   (colsB * vnniFactor) +
@@ -287,16 +304,6 @@ double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) {
                 sg, B + prefetch_offsetB, colsB * vnniFactor,
                 layout::row_major,
                 syclex::properties{syclex::prefetch_hint_L1});
-#else  // VNNI
-          auto prefetch_offsetB =
-              ((k2 + prefDistance) * KCache2 + pm1B * prefRow) * (colsB) +
-              (n2 * NCache2 + pn1B * prefCol);
-          if ((prefetch_offsetB + (prefRow * MATRIX_SIZE) + prefCol) <
-              (MATRIX_SIZE * MATRIX_SIZE))
-            joint_matrix_prefetch<prefRow, prefCol>(
-                sg, B + prefetch_offsetB, colsB, layout::row_major,
-                syclex::properties{syclex::prefetch_hint_L1});
-#endif // VNNI
 #endif // PREFETCH
         } // for k2
 #ifdef MANUAL_UNROLL
@@ -411,29 +418,33 @@ int main() {
   constexpr size_t NCache2 = 256;
   constexpr size_t KCache2 = 32;
 
+#ifdef VNNI
+  constexpr unsigned int VnniFactor = 2;
+#else  // VNNI
+  constexpr unsigned int VnniFactor = 1;
+#endif // VNNI
+
   for (unsigned int i = 0; i < combinations.size(); i++) {
     if (combinations[i].nsize == 0) { // Intel AMX
       constexpr size_t NCache1 = 32;
       constexpr size_t KCache1 = 32;
-
-      test<bfloat16, float, /*TM*/ 16, /*TN*/ 16, /*TK*/ 32, MCache1, NCache1,
-           KCache1, MCache2, NCache2, KCache2>();
+      test<bfloat16, float, VnniFactor, /*TM*/ 16, /*TN*/ 16, /*TK*/ 32,
+           MCache1, NCache1, KCache1, MCache2, NCache2, KCache2>();
       break;
     }
 
     if (combinations[i].nsize == 16) { // architecture::intel_gpu_pvc
       constexpr size_t NCache1 = 4 * /*TN*/ 16;
       constexpr size_t KCache1 = 16;
-
-      test<bfloat16, float, /*TM*/ 8, /*TN*/ 16, /*TK*/ 16, MCache1, NCache1,
-           KCache1, MCache2, NCache2, KCache2>();
+      test<bfloat16, float, VnniFactor, /*TM*/ 8, /*TN*/ 16, /*TK*/ 16,
+           MCache1, NCache1, KCache1, MCache2, NCache2, KCache2>();
 #if (!defined(SG_SZ) || SG_SZ != 32)
       // These combination are not currently supported for subgroup size = 32 in
       // IGC
-      test<bfloat16, float, /*TM*/ 16, /*TN*/ 16, /*TK*/ 16, MCache1, NCache1,
-           KCache1, MCache2, NCache2, KCache2>();
-      test<bfloat16, float, /*TM*/ 32, /*TN*/ 64, /*TK*/ 16, MCache1, NCache1,
-           KCache1, MCache2, NCache2, KCache2>();
+      test<bfloat16, float, VnniFactor, /*TM*/ 16, /*TN*/ 16, /*TK*/ 16,
+           MCache1, NCache1, KCache1, MCache2, NCache2, KCache2>();
+      test<bfloat16, float, VnniFactor, /*TM*/ 32, /*TN*/ 64, /*TK*/ 16,
+           MCache1, NCache1, KCache1, MCache2, NCache2, KCache2>();
 #endif
       break;
     }
@@ -442,9 +453,10 @@ int main() {
       constexpr size_t NCache1 = 4 * /*TN*/ 8;
      constexpr size_t KCache1 = 16;
 
-      test<bfloat16, float, /*TM*/ 8, /*TN*/ 8, /*TK*/ 16, MCache1, NCache1,
-           KCache1, MCache2, NCache2, KCache2>();
-      // test<bfloat16, float, /*TM*/ 32, /*TN*/ 32, /*TK*/ 16, ...>();
+      test<bfloat16, float, VnniFactor, /*TM*/ 8, /*TN*/ 8, /*TK*/ 16, MCache1,
+           NCache1, KCache1, MCache2, NCache2, KCache2>();
+      // test<bfloat16, float, VnniFactor, /*TM*/ 32, /*TN*/ 32, /*TK*/ 16,
+      //      MCache1, NCache1, KCache1, MCache2, NCache2, KCache2>();
       break;
     }
diff --git a/sycl/test-e2e/Matrix/slm_utils.hpp b/sycl/test-e2e/Matrix/slm_utils.hpp
new file mode 100644
index 0000000000000..28ac1264b8cc0
--- /dev/null
+++ b/sycl/test-e2e/Matrix/slm_utils.hpp
@@ -0,0 +1,70 @@
+//==------------------ slm_utils.hpp - DPC++ joint_matrix------------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+template <unsigned int colsA, unsigned int colsB, unsigned int MCache2,
+          unsigned int NCache2, unsigned int KCache2, unsigned int vnniFactor,
+          unsigned int SGs, typename TOperand>
+inline void
+slm_read_write(multi_ptr<TOperand, sycl::access::address_space::global_space,
+                         sycl::access::decorated::yes> pA,
+               multi_ptr<TOperand, sycl::access::address_space::global_space,
+                         sycl::access::decorated::yes> pB,
+               local_accessor<TOperand, 2> tileA,
+               local_accessor<TOperand, 2> tileB, sub_group sg, unsigned int k2,
+               size_t m2, size_t n2, size_t sgSize) {
+  // Number of elements to be loaded into SLM per WI
+  size_t elemsPerLoadA = KCache2 / sgSize;
+  for (int i = 0; i < MCache2 / SGs; i++) {
+    size_t GlOffsetA =
+        (m2 * MCache2 + sg.get_group_id() * (MCache2 / SGs) + i) * colsA +
+        k2 * KCache2;
+    size_t LocOffsetA = (sg.get_group_id() * (MCache2 / SGs) + i) * KCache2;
+
+    if (elemsPerLoadA == 2) {
+      vec<TOperand, 2> slmVecA = sg.load<2>(pA + GlOffsetA);
+      sg.store<2>(tileA.template get_multi_ptr<sycl::access::decorated::yes>() +
+                      LocOffsetA,
+                  slmVecA);
+    } else if (elemsPerLoadA == 4) {
+      vec<TOperand, 4> slmVecA = sg.load<4>(pA + GlOffsetA);
+      sg.store<4>(tileA.template get_multi_ptr<sycl::access::decorated::yes>() +
+                      LocOffsetA,
+                  slmVecA);
+    } else if (elemsPerLoadA == 1) {
+      TOperand slmScaA = sg.load(pA + GlOffsetA);
+
+      sg.store(tileA.template get_multi_ptr<sycl::access::decorated::yes>() +
+                   LocOffsetA,
+               slmScaA);
+    } else
+      assert(elemsPerLoadA == 1 || elemsPerLoadA == 2 || elemsPerLoadA == 4);
+  }
+  // how much each SG will load to SLM --> has to be contiguous
+  // NCache2*KCache2/(SGs*SG_SIZE) = 16
+  size_t elemsPerLoadB = NCache2 * KCache2 / (SGs * sgSize);
+  size_t sgsPerRow = (NCache2 * vnniFactor) / (elemsPerLoadB * sgSize);
+  size_t GlOffsetB =
+      (k2 * (KCache2 / vnniFactor) + (uint)(sg.get_group_id() / sgsPerRow)) *
+          (colsB * vnniFactor) +
+      n2 * NCache2 * vnniFactor +
+      (sg.get_group_id() % sgsPerRow) * (elemsPerLoadB * sgSize);
+  size_t LocOffsetB =
+      ((uint)(sg.get_group_id() / sgsPerRow)) * NCache2 * vnniFactor +
+      (sg.get_group_id() % sgsPerRow) * elemsPerLoadB * sgSize;
+  if (elemsPerLoadB == 16) {
+    vec<TOperand, 16> slmVecB = sg.load<16>(pB + GlOffsetB);
+
+    sg.store<16>(tileB.template get_multi_ptr<sycl::access::decorated::yes>() +
+                     LocOffsetB,
+                 slmVecB);
+  } else if (elemsPerLoadB == 8) {
+    vec<TOperand, 8> slmVecB = sg.load<8>(pB + GlOffsetB);
+
+    sg.store<8>(tileB.template get_multi_ptr<sycl::access::decorated::yes>() +
+                    LocOffsetB,
+                slmVecB);
+  } else
+    assert(elemsPerLoadB == 8 || elemsPerLoadB == 16);
+}
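Background on the technique used above, for readers unfamiliar with it: slm_utils.hpp has each sub-group cooperatively stage a block of A and of B from global memory into shared local memory once per K-block, and the kernel then serves every joint_matrix_load in that work-group from SLM, with a local barrier separating the fill phase from the consume phase. The stand-alone sketch below is illustrative only and is not part of the patch; the names tileSize and wgSize and the plain float element type are made-up stand-ins for the MCache2/KCache2 bfloat16 tiles used by the real test, and it deliberately omits joint_matrix so that only the fill/barrier/consume structure is shown.

#include <sycl/sycl.hpp>
#include <iostream>

int main() {
  // Made-up sizes; the real test stages MCache2 x KCache2 bfloat16 tiles.
  constexpr size_t tileSize = 256; // elements staged in SLM per work-group
  constexpr size_t wgSize = 64;    // work-items cooperating on the copy

  sycl::queue q;
  float *A = sycl::malloc_shared<float>(tileSize, q);
  float *out = sycl::malloc_shared<float>(tileSize, q);
  for (size_t i = 0; i < tileSize; i++)
    A[i] = float(i);

  q.submit([&](sycl::handler &h) {
     // Shared local memory tile, playing the same role as tileA/tileB above.
     sycl::local_accessor<float, 1> tile{sycl::range<1>{tileSize}, h};
     h.parallel_for(
         sycl::nd_range<1>{sycl::range<1>{wgSize}, sycl::range<1>{wgSize}},
         [=](sycl::nd_item<1> it) {
           size_t lid = it.get_local_id(0);
           // Fill phase: work-items cooperatively copy the tile into SLM.
           for (size_t i = lid; i < tileSize; i += wgSize)
             tile[i] = A[i];
           // Everyone must see the complete tile before anyone consumes it.
           it.barrier(sycl::access::fence_space::local_space);
           // Consume phase: reads now hit SLM instead of global memory
           // (the patch feeds such reads into joint_matrix_load).
           for (size_t i = lid; i < tileSize; i += wgSize)
             out[i] = tile[i] * 2.0f;
         });
   }).wait();

  std::cout << "out[10] = " << out[10] << std::endl;
  sycl::free(A, q);
  sycl::free(out, q);
  return 0;
}

A work-group-wide barrier, rather than a sub-group one, is what makes the staged tile visible to all sub-groups in the group; that is why the patch places it.barrier(access::fence_space::local_space) immediately after slm_read_write and again before the next K-block overwrites the tiles.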