add test for runtime matrix dimension
YixingZhang007 committed Sep 19, 2024
1 parent b56a34c commit a184cbc
Showing 2 changed files with 271 additions and 0 deletions.
joint_matrix_bf16_fill_k_cache_runtime_dim.cpp
@@ -0,0 +1,22 @@
//==--- joint_matrix_bf16_fill_k_cache_runtime_dim.cpp - DPC++ joint_matrix--------==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// REQUIRES: aspect-ext_intel_matrix

// https://jira.devtools.intel.com/browse/GSD-9716
// XFAIL: arch-intel_gpu_pvc

// RUN: %{build} -o %t_runtime_dim_vnni.out -ffp-model=precise -DVNNI
// RUN: %{run} %t_runtime_dim_vnni.out

// RUN: %{build} -o %t_runtime_dim.out -ffp-model=precise
// RUN: %{run} %t_runtime_dim.out

// -ffp-model=precise is used so results do not depend on the compiler's
// default floating-point model.

#include "common.hpp"
#include "joint_matrix_bf16_fill_k_cache_runtime_dim_impl.hpp"
joint_matrix_bf16_fill_k_cache_runtime_dim_impl.hpp
@@ -0,0 +1,249 @@
//==--- joint_matrix_bf16_fill_k_cache_runtime_dim_impl.hpp - DPC++ joint_matrix ---==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===-------------------------------------------------------------------------===//

#include <chrono>
#include <iostream>
#include <random>
#include <sycl/usm.hpp>

// number of test iterations
constexpr unsigned int testIterations = 100;
// warm-up iterations excluded from the timing average
constexpr unsigned int recordThresh = 10;

template <size_t TM, size_t TN, size_t TK> class MatMul;

template <size_t vnniFactor, typename TOperand, typename TResult, size_t TM,
size_t TN, size_t TK, size_t MCache1, size_t NCache1, size_t KCache1,
size_t MCache2, size_t NCache2, size_t KCache2>
double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i,
size_t rowsA, size_t colsA, size_t rowsB, size_t colsB) {

size_t sgSize = get_sg_size<MatMul<TM, TN, TK>>(q);
range<2> global{rowsA / MCache1, (colsB / NCache1) * sgSize};
range<2> cachelocal{MCache2 / MCache1, NCache2 / NCache1 * sgSize};

  // fail via assert instead of padding when the dimensions do not divide evenly
assert(colsA == rowsB);
assert(rowsA % TM == 0);
assert(colsA % TK == 0);
assert(colsB % TN == 0);
// submit main kernel
std::chrono::high_resolution_clock::time_point start =
std::chrono::high_resolution_clock::now();

q.submit([&](handler &h) {
h.parallel_for<MatMul<TM, TN, TK>>( // cache layer#1
nd_range<2>{global, cachelocal},
        // global range: one sub-group per MCache1 x NCache1 tile of C
        // local range: each work-group covers an MCache2 x NCache2 tile
[=](nd_item<2> it)
{
// sg::load and sg::store expect decorations to be ON
auto pA =
address_space_cast<sycl::access::address_space::global_space,
sycl::access::decorated::yes>(A);
auto pB =
address_space_cast<sycl::access::address_space::global_space,
sycl::access::decorated::yes>(B);
auto pC =
address_space_cast<sycl::access::address_space::global_space,
sycl::access::decorated::yes>(C);
auto m2 = it.get_group(0);
auto n2 = it.get_group(1);
auto m1 = it.get_local_id(0);
auto n1 = it.get_local_id(1) / sgSize;
auto sg = it.get_sub_group();

joint_matrix<sub_group, TResult, use::accumulator, TM, TN>
tC[MCache1 / TM][NCache1 / TN];

for (unsigned int m = 0; m < MCache1 / TM; m++) {
for (unsigned int n = 0; n < NCache1 / TN; n++) {
joint_matrix_fill(sg, tC[m][n], 0);
}
}
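
          // walk K in KCache2-sized blocks (cache level 2); each block is
          // further split into KCache1-sized steps (cache level 1) below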

for (unsigned int k2 = 0; k2 < colsA / KCache2; k2++) {
joint_matrix<sub_group, TOperand, use::a, TM, TK, layout::row_major>
tA[MCache1 / TM][KCache2 / KCache1];
#ifdef VNNI
joint_matrix<sub_group, TOperand, use::b, TK, TN,
layout::ext_intel_packed>
tB[NCache1 / TN][KCache2 / KCache1];
#else // VNNI
joint_matrix<sub_group, TOperand, use::b, TK, TN,
layout::row_major>
tB[NCache1 / TN][KCache2 / KCache1];
#endif // VNNI
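
          // with -DVNNI, B was repacked on the host so that vnniFactor
          // consecutive K-elements of each column are contiguous;
          // layout::ext_intel_packed tells joint_matrix_load to expect this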

for (unsigned int k1 = 0; k1 < KCache2 / KCache1; k1++) {
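            // k counts TK-wide steps along the K dimension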
unsigned int k = (k2 * KCache2 + k1 * KCache1) / TK;
for (unsigned int m = 0; m < MCache1 / TM; m++) {
joint_matrix_load(
sg, tA[m][k1],
pA + (m2 * MCache2 + m1 * MCache1 + m * TM) * colsA +
k * TK,
colsA);
}
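            // packed-B indexing: TK / vnniFactor packed rows per K step, a
            // leading dimension of colsB * vnniFactor, and column offsets
            // scaled by vnniFactor (all of this degenerates to plain
            // row-major when vnniFactor == 1)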
for (unsigned int n = 0; n < NCache1 / TN; n++) {
joint_matrix_load(
sg, tB[n][k1],
pB + (k * TK / vnniFactor) * (colsB * vnniFactor) +
(n2 * NCache2 + n1 * NCache1 + n * TN) * vnniFactor,
colsB * vnniFactor);
} // n
for (unsigned int m = 0; m < MCache1 / TM; m++) {
for (unsigned int n = 0; n < NCache1 / TN; n++) {
joint_matrix_mad(sg, tC[m][n], tA[m][k1], tB[n][k1],
tC[m][n]);
} // n
} // m
} // k1
} // for k2

for (unsigned int m = 0; m < MCache1 / TM; m++) {
for (unsigned int n = 0; n < NCache1 / TN; n++) {
joint_matrix_store(
sg, tC[m][n],
pC + (m2 * MCache2 + m1 * MCache1 + m * TM) * colsB +
(n2 * NCache2 + n1 * NCache1 + n * TN),
colsB, layout::row_major);
} // n
} // m
}); // parallel_for
}); // queue.submit

  // only the last iteration synchronizes with the device, so earlier
  // iterations time kernel submission rather than completion
  if (i == testIterations - 1)
    q.wait();
std::chrono::duration<double, std::milli> duration =
std::chrono::high_resolution_clock::now() - start;

return duration.count();
}

template <typename T, typename TResult, size_t vnniFactor, size_t TM, size_t TN,
size_t TK, size_t MCache1, size_t NCache1, size_t KCache1,
size_t MCache2, size_t NCache2, size_t KCache2>
void test(size_t matrix_size) {
assert(matrix_size >= TM && matrix_size >= TK && matrix_size >= TN &&
"invalid matrix size");
assert((matrix_size % TM) == 0 && (matrix_size % TN) == 0 &&
(matrix_size % TK) == 0 &&
"invalid matrix size detected: not a multiple of <TM,TN,TK>");

std::cout << "Testing: " << TM << " x " << TN << " x " << TK
<< " [TM x TN x TK]" << std::endl;

queue q;
T *A = malloc_shared<T>(matrix_size * matrix_size, q);
T *B = malloc_shared<T>(matrix_size * matrix_size, q);
TResult *C = malloc_shared<TResult>(matrix_size * matrix_size, q);
TResult *refC = malloc_shared<TResult>(matrix_size * matrix_size, q);

matrix_rand<T>(matrix_size, matrix_size, A, T(1));
matrix_rand<T>(matrix_size, matrix_size, B, T(1));

matrix_multiply_ref<T, T, TResult, 1>(A, B, refC, matrix_size, matrix_size,
matrix_size);

#ifdef VNNI
T *vnniB = malloc_shared<T>(matrix_size * matrix_size, q);
matrix_vnni<T>(matrix_size, matrix_size, B, vnniB, vnniFactor);
free(B, q);
B = vnniB;
#endif
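
// For example, with vnniFactor == 2 the repacking interleaves rows 2i and
// 2i+1 of row-major B element-wise into packed row i, turning a K x N
// matrix into a (K/2) x (2N) one with each bf16 pair stored contiguously.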

// run testIterations time, aggregate and calculate average run time
double totalDuration = 0;
for (unsigned int i = 0; i < testIterations; i++) {

double duration =
joint_matmul<vnniFactor, T, TResult, TM, TN, TK, MCache1, NCache1,
KCache1, MCache2, NCache2, KCache2>
(A, B, C, q, i, matrix_size, matrix_size, matrix_size, matrix_size);

if (i >= recordThresh) {
totalDuration += duration;
}
}

assert(matrix_compare(matrix_size, matrix_size, C, refC));
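
  // average over the timed iterations; an n x n x n matmul performs
  // 2 * n^3 floating-point operations, which gives the GFLOPS figure below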

double msecPerMatrixMul =
totalDuration / static_cast<double>(testIterations - recordThresh);
double gflops = (2.f * matrix_size * matrix_size * matrix_size * 1.0e-9f) /
(msecPerMatrixMul / 1000.f);

std::cout << "DONE for size " << matrix_size << std::endl;
std::cout << "GOPS is " << gflops << " Gop/s" << std::endl;

free(A, q);
free(B, q);
free(C, q);
free(refC, q);
}

int main(int argc, char *argv[]) {
  // the matrix dimension is supplied at runtime on the command line
  if (argc < 2) {
    std::cerr << "Usage: " << argv[0] << " <matrix_size>" << std::endl;
    return 1;
  }
  size_t matrix_size = std::stoul(argv[1]);

queue q;
std::vector<combination> combinations =
q.get_device()
.get_info<sycl::ext::oneapi::experimental::info::device::
matrix_combinations>();
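
  // each combination describes one joint_matrix shape supported by the
  // device; nsize distinguishes Intel AMX (0), PVC (16) and DG2 (8) below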

constexpr size_t MCache1 = 32;
constexpr size_t MCache2 = 256;
constexpr size_t NCache2 = 256;
constexpr size_t KCache2 = 32;
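
  // the *Cache2 constants fix the per-work-group tile and MCache1 the
  // per-sub-group M tile; NCache1 and KCache1 are chosen per architecture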

#ifdef VNNI
constexpr unsigned int VnniFactor = 2;
#else // VNNI
constexpr unsigned int VnniFactor = 1;
#endif // VNNI

for (unsigned int i = 0; i < combinations.size(); i++) {
if (combinations[i].nsize == 0) { // Intel AMX
constexpr size_t NCache1 = 32;
constexpr size_t KCache1 = 32;
test<bfloat16, float, VnniFactor, /*TM*/ 16, /*TN*/ 16, /*TK*/ 32,
MCache1, NCache1, KCache1, MCache2, NCache2, KCache2>(matrix_size);
break;
}

if (combinations[i].nsize == 16) { // architecture::intel_gpu_pvc
constexpr size_t NCache1 = 4 * /*TN*/ 16;
constexpr size_t KCache1 = 16;
test<bfloat16, float, VnniFactor, /*TM*/ 8, /*TN*/ 16, /*TK*/ 16, MCache1,
NCache1, KCache1, MCache2, NCache2, KCache2>(matrix_size);
#if (!defined(SG_SZ) || SG_SZ != 32)
    // These combinations are not currently supported for sub-group size 32
    // in IGC
test<bfloat16, float, VnniFactor, /*TM*/ 16, /*TN*/ 16, /*TK*/ 16,
MCache1, NCache1, KCache1, MCache2, NCache2, KCache2>(matrix_size);
test<bfloat16, float, VnniFactor, /*TM*/ 32, /*TN*/ 64, /*TK*/ 16,
MCache1, NCache1, KCache1, MCache2, NCache2, KCache2>(matrix_size);
#endif
break;
}

if (combinations[i].nsize == 8) { // architecture::intel_gpu_dg2*
constexpr size_t NCache1 = 4 * /*TN*/ 8;
constexpr size_t KCache1 = 16;

test<bfloat16, float, VnniFactor, /*TM*/ 8, /*TN*/ 8, /*TK*/ 16, MCache1,
NCache1, KCache1, MCache2, NCache2, KCache2>(matrix_size);
// test<bfloat16, float, VnniFactor, /*TM*/ 32, /*TN*/ 32, /*TK*/ 16,
// MCache1,
// NCache1, KCache1, MCache2, NCache2, KCache2>(matrix_size);
break;
}
}
return 0;
}
