[SYCL][E2E][Joint Matrix] update performance test to add SLM (#15229)

dkhaldi committed Sep 10, 2024
1 parent 594ae74 commit 37d1d51
Showing 6 changed files with 217 additions and 51 deletions.
23 changes: 23 additions & 0 deletions sycl/test-e2e/Matrix/SG32/joint_matrix_bf16_fill_k_cache_SLM.cpp
@@ -0,0 +1,23 @@
//==--- joint_matrix_bf16_fill_k_cache_SLM.cpp - DPC++ joint_matrix--------==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2
// UNSUPPORTED: gpu-intel-dg2
// REQUIRES: aspect-ext_intel_matrix, gpu

// RUN: %{build} -o %t_gpu_vnni.out -ffp-model=precise -DSLM -DVNNI
// RUN: %{run} %t_gpu_vnni.out

// TODO: add a row-major compilation and run once sub-group size 32
// support for row major becomes available in IGC.

// -ffp-model=precise is added so the test does not depend on compiler defaults.

#include "../common.hpp"
#define SG_SZ 32

#include "../joint_matrix_bf16_fill_k_cache_impl.hpp"
@@ -0,0 +1,23 @@
//==--- joint_matrix_bf16_fill_k_cache_SLM.cpp - DPC++ joint_matrix--------==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2
// UNSUPPORTED: gpu-intel-dg2
// REQUIRES: aspect-ext_intel_matrix, gpu

// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t_gpu_vnni.out -ffp-model=precise -DSLM -DVNNI
// RUN: %{run} %t_gpu_vnni.out

// TODO: add a row-major compilation and run once sub-group size 32
// support for row major becomes available in IGC.

// -ffp-model=precise is added so the test does not depend on compiler defaults.

#include "../../common.hpp"
#define SG_SZ 32

#include "../../joint_matrix_bf16_fill_k_cache_impl.hpp"
@@ -0,0 +1,19 @@
//==--- joint_matrix_bf16_fill_k_cache_SLM.cpp - DPC++ joint_matrix--------==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// REQUIRES: aspect-ext_intel_matrix, gpu

// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t_gpu_vnni.out -ffp-model=precise -DSLM -DVNNI
// RUN: %{run} %t_gpu_vnni.out

// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t_gpu.out -ffp-model=precise -DSLM
// RUN: %{run} %t_gpu.out

// -ffp-model=precise is added so the test does not depend on compiler defaults.

#include "../common.hpp"
#include "../joint_matrix_bf16_fill_k_cache_impl.hpp"
19 changes: 19 additions & 0 deletions sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_SLM.cpp
@@ -0,0 +1,19 @@
//==--- joint_matrix_bf16_fill_k_cache_SLM.cpp - DPC++ joint_matrix--------==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// REQUIRES: aspect-ext_intel_matrix, gpu

// RUN: %{build} -o %t_gpu_vnni.out -ffp-model=precise -DSLM -DVNNI
// RUN: %{run} %t_gpu_vnni.out

// RUN: %{build} -o %t_gpu.out -ffp-model=precise -DSLM
// RUN: %{run} %t_gpu.out

// -ffp-model=precise is added so the test does not depend on compiler defaults.

#include "common.hpp"
#include "joint_matrix_bf16_fill_k_cache_impl.hpp"
114 changes: 63 additions & 51 deletions sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_impl.hpp
@@ -9,6 +9,10 @@
#include <random>
#include <sycl/usm.hpp>

#ifdef SLM
#include "slm_utils.hpp"
#endif

// number of test iterations
constexpr unsigned int testIterations = 100;
// start recording time after X iterations
@@ -51,6 +55,12 @@ double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) {
std::chrono::high_resolution_clock::now();

q.submit([&](handler &h) {
#ifdef SLM
local_accessor<TOperand, 2> tileA{{MCache2, KCache2}, h};
local_accessor<TOperand, 2> tileB{
{KCache2 / vnniFactor, NCache2 * vnniFactor}, h};
#endif

h.parallel_for<MatMul<TM, TN, TK>>( // cache layer#1
nd_range<2>{global, cachelocal},
// loop global
@@ -60,15 +70,16 @@ double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) {
[[intel::reqd_sub_group_size(SG_SZ)]]
#endif // SG_SZ
{
// sg::load and sg::store expect decorations to be ON
auto pA =
address_space_cast<sycl::access::address_space::global_space,
sycl::access::decorated::no>(A);
sycl::access::decorated::yes>(A);
auto pB =
address_space_cast<sycl::access::address_space::global_space,
sycl::access::decorated::no>(B);
sycl::access::decorated::yes>(B);
auto pC =
address_space_cast<sycl::access::address_space::global_space,
sycl::access::decorated::no>(C);
sycl::access::decorated::yes>(C);
auto m2 = it.get_group(0);
auto n2 = it.get_group(1);
auto m1 = it.get_local_id(0);
@@ -112,7 +123,6 @@ double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) {
colsA, layout::row_major,
syclex::properties{syclex::prefetch_hint_L1});

#ifdef VNNI
for (int p = 0; p < prefDistance; p++)
joint_matrix_prefetch<prefRow, prefCol>(
sg,
@@ -122,15 +132,6 @@ double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) {
(n2 * NCache2 * vnniFactor + pn1B * prefCol),
colsB * vnniFactor, layout::row_major,
syclex::properties{syclex::prefetch_hint_L1});
#else // VNNI
for (int p = 0; p < prefDistance; p++)
joint_matrix_prefetch<prefRow, prefCol>(
sg,
B + (p * KCache2 + pm1B * prefRow) * colsB + n2 * NCache2 +
pn1B * prefCol,
colsB, layout::row_major,
syclex::properties{syclex::prefetch_hint_L1});
#endif // VNNI
#endif // PREFETCH

joint_matrix<sub_group, TResult, use::accumulator, TM, TN>
@@ -157,7 +158,16 @@ double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) {
}
#endif // MANUAL_UNROLL

#ifdef SLM
constexpr unsigned int SGs =
(MCache2 / MCache1) * (NCache2 / NCache1);
#endif // SLM
for (unsigned int k2 = 0; k2 < colsA / KCache2; k2++) {
#ifdef SLM
slm_read_write<colsA, colsB, MCache2, NCache2, KCache2, vnniFactor,
SGs>(pA, pB, tileA, tileB, sg, k2, m2, n2, sgSize);
it.barrier(access::fence_space::local_space);
#endif // SLM
joint_matrix<sub_group, TOperand, use::a, TM, TK, layout::row_major>
tA[MCache1 / TM][KCache2 / KCache1]
#ifdef INIT_LIST
@@ -192,6 +202,14 @@ double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) {
#else // MANUAL_UNROLL
for (unsigned int m = 0; m < MCache1 / TM; m++) {
#endif // MANUAL_UNROLL
#ifdef SLM
joint_matrix_load(sg, tA[m][k1],
tileA.template get_multi_ptr<
sycl::access::decorated::no>() +
(m1 * MCache1 + m * TM) * KCache2 +
k1 * TK,
KCache2);
#else // SLM
#ifdef OOB
ext::intel::experimental::matrix::joint_matrix_load_checked(
sg, tA[m][k1], pA, colsA, rowsA, colsA,
Expand All @@ -203,6 +221,7 @@ double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) {
k * TK,
colsA);
#endif // OOB
#endif // SLM
#ifdef MANUAL_UNROLL
}); // m
#else // MANUAL_UNROLL
@@ -213,32 +232,28 @@ double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) {
#else // MANUAL_UNROLL
for (unsigned int n = 0; n < NCache1 / TN; n++) {
#endif // MANUAL_UNROLL
#ifdef SLM
joint_matrix_load(sg, tB[n][k1],
tileB.template get_multi_ptr<
sycl::access::decorated::no>() +
(k1 * TK / vnniFactor) *
(NCache2 * vnniFactor) +
(n1 * NCache1 + n * TN) * vnniFactor,
NCache2 * vnniFactor);
#else // SLM
#ifdef OOB
#ifdef VNNI
ext::intel::experimental::matrix::joint_matrix_load_checked(
sg, tB[n][k1], pB, colsB * vnniFactor, rowsB / vnniFactor,
colsB * vnniFactor, k * TK / vnniFactor,
(n2 * NCache2 + n1 * NCache1 + n * TN) * vnniFactor);
#else // VNNI
ext::intel::experimental::matrix::joint_matrix_load_checked(
sg, tB[n][k1], pB, colsB, rowsB, colsB, k * TK,
n2 * NCache2 + n1 * NCache1 + n * TN);

#endif // VNNI
#else // OOB
#ifdef VNNI
joint_matrix_load(
sg, tB[n][k1],
pB + (k * TK / vnniFactor) * (colsB * vnniFactor) +
(n2 * NCache2 + n1 * NCache1 + n * TN) * vnniFactor,
colsB * vnniFactor);
#else // VNNI
joint_matrix_load(sg, tB[n][k1],
pB + (k * TK) * (colsB) +
(n2 * NCache2 + n1 * NCache1 + n * TN),
colsB);
#endif // VNNI
#endif // OOB
#endif // SLM
#ifdef MANUAL_UNROLL
}); // n
#else // MANUAL_UNROLL
@@ -266,6 +281,9 @@ double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) {
} // m
} // k1
#endif // MANUAL_UNROLL
#ifdef SLM
it.barrier(access::fence_space::local_space);
#endif // SLM
#ifdef PREFETCH
auto prefetch_offsetA = (m2 * MCache2 + sgId * prefRow) * colsA +
(k2 + prefDistance) * prefCol;
@@ -275,7 +293,6 @@ double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) {
sg, A + prefetch_offsetA, colsA, layout::row_major,
syclex::properties{syclex::prefetch_hint_L1});

#ifdef VNNI
auto prefetch_offsetB =
((k2 + prefDistance) * (KCache2 / vnniFactor) +
pm1B * prefRow) *
@@ -287,16 +304,6 @@ double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) {
sg, B + prefetch_offsetB, colsB * vnniFactor,
layout::row_major,
syclex::properties{syclex::prefetch_hint_L1});
#else // VNNI
auto prefetch_offsetB =
((k2 + prefDistance) * KCache2 + pm1B * prefRow) * (colsB) +
(n2 * NCache2 + pn1B * prefCol);
if ((prefetch_offsetB + (prefRow * MATRIX_SIZE) + prefCol) <
(MATRIX_SIZE * MATRIX_SIZE))
joint_matrix_prefetch<prefRow, prefCol>(
sg, B + prefetch_offsetB, colsB, layout::row_major,
syclex::properties{syclex::prefetch_hint_L1});
#endif // VNNI
#endif // PREFETCH
} // for k2
#ifdef MANUAL_UNROLL
@@ -411,29 +418,33 @@ int main() {
constexpr size_t NCache2 = 256;
constexpr size_t KCache2 = 32;

#ifdef VNNI
constexpr unsigned int VnniFactor = 2;
#else // VNNI
constexpr unsigned int VnniFactor = 1;
#endif // VNNI

for (unsigned int i = 0; i < combinations.size(); i++) {
if (combinations[i].nsize == 0) { // Intel AMX
constexpr size_t NCache1 = 32;
constexpr size_t KCache1 = 32;

test<bfloat16, float, 2, /*TM*/ 16, /*TN*/ 16, /*TK*/ 32, MCache1,
NCache1, KCache1, MCache2, NCache2, KCache2>();
test<bfloat16, float, VnniFactor, /*TM*/ 16, /*TN*/ 16, /*TK*/ 32,
MCache1, NCache1, KCache1, MCache2, NCache2, KCache2>();
break;
}

if (combinations[i].nsize == 16) { // architecture::intel_gpu_pvc
constexpr size_t NCache1 = 4 * /*TN*/ 16;
constexpr size_t KCache1 = 16;

test<bfloat16, float, 2, /*TM*/ 8, /*TN*/ 16, /*TK*/ 16, MCache1, NCache1,
KCache1, MCache2, NCache2, KCache2>();
test<bfloat16, float, VnniFactor, /*TM*/ 8, /*TN*/ 16, /*TK*/ 16, MCache1,
NCache1, KCache1, MCache2, NCache2, KCache2>();
#if (!defined(SG_SZ) || SG_SZ != 32)
          // These combinations are not currently supported for sub-group
          // size = 32 in IGC.
test<bfloat16, float, 2, /*TM*/ 16, /*TN*/ 16, /*TK*/ 16, MCache1,
NCache1, KCache1, MCache2, NCache2, KCache2>();
test<bfloat16, float, 2, /*TM*/ 32, /*TN*/ 64, /*TK*/ 16, MCache1,
NCache1, KCache1, MCache2, NCache2, KCache2>();
test<bfloat16, float, VnniFactor, /*TM*/ 16, /*TN*/ 16, /*TK*/ 16,
MCache1, NCache1, KCache1, MCache2, NCache2, KCache2>();
test<bfloat16, float, VnniFactor, /*TM*/ 32, /*TN*/ 64, /*TK*/ 16,
MCache1, NCache1, KCache1, MCache2, NCache2, KCache2>();
#endif
break;
}
Expand All @@ -442,9 +453,10 @@ int main() {
constexpr size_t NCache1 = 4 * /*TN*/ 8;
constexpr size_t KCache1 = 16;

test<bfloat16, float, 2, /*TM*/ 8, /*TN*/ 8, /*TK*/ 16, MCache1, NCache1,
KCache1, MCache2, NCache2, KCache2>();
// test<bfloat16, float, 2, /*TM*/ 32, /*TN*/ 32, /*TK*/ 16, MCache1,
test<bfloat16, float, VnniFactor, /*TM*/ 8, /*TN*/ 8, /*TK*/ 16, MCache1,
NCache1, KCache1, MCache2, NCache2, KCache2>();
// test<bfloat16, float, VnniFactor, /*TM*/ 32, /*TN*/ 32, /*TK*/ 16,
// MCache1,
// NCache1, KCache1, MCache2, NCache2, KCache2>();
break;
}
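The sixth changed file is not rendered above; given the new #include "slm_utils.hpp" in the impl, it presumably defines the slm_read_write helper called once per k2 iteration. Below is a plausible sketch under that assumption: the template parameters and arguments mirror the call site in the impl, while the strided-copy strategy and every name inside the body are hypothetical.

#include <sycl/sycl.hpp>
using namespace sycl;

// Hypothetical sketch of slm_read_write (the real helper lives in
// slm_utils.hpp, whose diff is not shown on this page). Every work-item in
// the work-group copies a strided slice of the current K-panels of A and
// VNNI-packed B from global memory into the SLM tiles.
template <unsigned int colsA, unsigned int colsB, unsigned int MCache2,
          unsigned int NCache2, unsigned int KCache2, unsigned int vnniFactor,
          unsigned int SGs, typename TOperand, typename GlobalPtr>
void slm_read_write(GlobalPtr pA, GlobalPtr pB,
                    local_accessor<TOperand, 2> tileA,
                    local_accessor<TOperand, 2> tileB, sub_group sg,
                    unsigned int k2, size_t m2, size_t n2, size_t sgSize) {
  // Linear id of this work-item within the work-group.
  size_t linear = sg.get_group_linear_id() * sgSize + sg.get_local_linear_id();
  size_t wiPerWG = SGs * sgSize; // work-items cooperating on the copy

  // Stage the MCache2 x KCache2 panel of A (row-major).
  for (size_t i = linear; i < MCache2 * KCache2; i += wiPerWG) {
    size_t r = i / KCache2, c = i % KCache2;
    tileA[r][c] = pA[(m2 * MCache2 + r) * colsA + k2 * KCache2 + c];
  }
  // Stage the (KCache2 / vnniFactor) x (NCache2 * vnniFactor) panel of
  // VNNI-packed B, whose packed row pitch is colsB * vnniFactor.
  constexpr size_t bRows = KCache2 / vnniFactor;
  constexpr size_t bCols = NCache2 * vnniFactor;
  for (size_t i = linear; i < bRows * bCols; i += wiPerWG) {
    size_t r = i / bCols, c = i % bCols;
    tileB[r][c] = pB[(k2 * bRows + r) * (colsB * vnniFactor) + n2 * bCols + c];
  }
}

At the call site the impl instantiates SGs = (MCache2 / MCache1) * (NCache2 / NCache1) and follows the call with a local barrier, so no sub-group starts loading from the tiles before the copy completes.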
