Commit a184cbc
add test for runtime matrix dimension
1 parent: b56a34c
Showing 2 changed files with 271 additions and 0 deletions.

sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_runtime_dim.cpp
22 additions, 0 deletions
@@ -0,0 +1,22 @@
//==--- joint_matrix_bf16_fill_k_cache_runtime_dim.cpp - DPC++ joint_matrix -==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// REQUIRES: aspect-ext_intel_matrix

// https://jira.devtools.intel.com/browse/GSD-9716
// XFAIL: arch-intel_gpu_pvc

// RUN: %{build} -o %t_runtime_dim_vnni.out -ffp-model=precise -DVNNI
// RUN: %{run} %t_runtime_dim_vnni.out

// RUN: %{build} -o %t_runtime_dim.out -ffp-model=precise
// RUN: %{run} %t_runtime_dim.out

// -ffp-model=precise is used so the test does not depend on compiler
// floating-point defaults.

#include "common.hpp"
#include "joint_matrix_bf16_fill_k_cache_runtime_dim_impl.hpp"

sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_runtime_dim_impl.hpp
249 additions, 0 deletions
@@ -0,0 +1,249 @@
//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include <random>
#include <sycl/usm.hpp>

// number of test iterations
constexpr unsigned int testIterations = 100;
// start recording time after X iterations
constexpr unsigned int recordThresh = 10;

template <size_t TM, size_t TN, size_t TK> class MatMul;
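// Forward declaration used as the kernel name, so each (TM, TN, TK)
// instantiation of joint_matmul gets a distinct kernel.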

template <size_t vnniFactor, typename TOperand, typename TResult, size_t TM,
          size_t TN, size_t TK, size_t MCache1, size_t NCache1, size_t KCache1,
          size_t MCache2, size_t NCache2, size_t KCache2>
double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i,
                    size_t rowsA, size_t colsA, size_t rowsB, size_t colsB) {

  size_t sgSize = get_sg_size<MatMul<TM, TN, TK>>(q);
  range<2> global{rowsA / MCache1, (colsB / NCache1) * sgSize};
  range<2> cachelocal{MCache2 / MCache1, NCache2 / NCache1 * sgSize};
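  // Each sub-group computes an MCache1 x NCache1 tile of C; a work-group of
  // (MCache2 / MCache1) x (NCache2 / NCache1) sub-groups covers an
  // MCache2 x NCache2 block, so the global range spans the whole C matrix.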

  // assert if padding would be needed: the kernel assumes dimensions that
  // tile evenly
  assert(colsA == rowsB);
  assert(rowsA % TM == 0);
  assert(colsA % TK == 0);
  assert(colsB % TN == 0);
  // submit main kernel
  std::chrono::high_resolution_clock::time_point start =
      std::chrono::high_resolution_clock::now();

  q.submit([&](handler &h) {
    h.parallel_for<MatMul<TM, TN, TK>>( // cache layer#1
        nd_range<2>{global, cachelocal},
        // loop global
        // loop localrange
        [=](nd_item<2> it) {
          // sg::load and sg::store expect decorations to be ON
          auto pA =
              address_space_cast<sycl::access::address_space::global_space,
                                 sycl::access::decorated::yes>(A);
          auto pB =
              address_space_cast<sycl::access::address_space::global_space,
                                 sycl::access::decorated::yes>(B);
          auto pC =
              address_space_cast<sycl::access::address_space::global_space,
                                 sycl::access::decorated::yes>(C);
          auto m2 = it.get_group(0);
          auto n2 = it.get_group(1);
          auto m1 = it.get_local_id(0);
          auto n1 = it.get_local_id(1) / sgSize;
          auto sg = it.get_sub_group();
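          // m2/n2 index the work-group's MCache2 x NCache2 block of C;
          // m1/n1 index this sub-group's MCache1 x NCache1 tile within it.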

          joint_matrix<sub_group, TResult, use::accumulator, TM, TN>
              tC[MCache1 / TM][NCache1 / TN];

          for (unsigned int m = 0; m < MCache1 / TM; m++) {
            for (unsigned int n = 0; n < NCache1 / TN; n++) {
              joint_matrix_fill(sg, tC[m][n], 0);
            }
          }

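          // Two-level blocking over K: k2 walks KCache2-wide chunks of the K
          // dimension and k1 walks KCache1-wide sub-chunks; the tA/tB tiles
          // loaded for one sub-chunk are reused across the m and n loops.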
          for (unsigned int k2 = 0; k2 < colsA / KCache2; k2++) {
            joint_matrix<sub_group, TOperand, use::a, TM, TK, layout::row_major>
                tA[MCache1 / TM][KCache2 / KCache1];
#ifdef VNNI
            joint_matrix<sub_group, TOperand, use::b, TK, TN,
                         layout::ext_intel_packed>
                tB[NCache1 / TN][KCache2 / KCache1];
#else // VNNI
            joint_matrix<sub_group, TOperand, use::b, TK, TN, layout::row_major>
                tB[NCache1 / TN][KCache2 / KCache1];
#endif // VNNI
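            // With -DVNNI, B was pre-packed on the host (see matrix_vnni in
            // test()); layout::ext_intel_packed tells joint_matrix_load how
            // to interpret the interleaved rows.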

            for (unsigned int k1 = 0; k1 < KCache2 / KCache1; k1++) {
              unsigned int k = (k2 * KCache2 + k1 * KCache1) / TK;
              for (unsigned int m = 0; m < MCache1 / TM; m++) {
                joint_matrix_load(
                    sg, tA[m][k1],
                    pA + (m2 * MCache2 + m1 * MCache1 + m * TM) * colsA +
                        k * TK,
                    colsA);
              }
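              // B is addressed in packed form: row k * TK / vnniFactor of a
              // matrix whose row pitch is colsB * vnniFactor (with
              // vnniFactor == 1 this reduces to plain row-major indexing).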
              for (unsigned int n = 0; n < NCache1 / TN; n++) {
                joint_matrix_load(
                    sg, tB[n][k1],
                    pB + (k * TK / vnniFactor) * (colsB * vnniFactor) +
                        (n2 * NCache2 + n1 * NCache1 + n * TN) * vnniFactor,
                    colsB * vnniFactor);
              } // n
              for (unsigned int m = 0; m < MCache1 / TM; m++) {
                for (unsigned int n = 0; n < NCache1 / TN; n++) {
                  joint_matrix_mad(sg, tC[m][n], tA[m][k1], tB[n][k1],
                                   tC[m][n]);
                } // n
              } // m
            } // k1
          } // for k2

          for (unsigned int m = 0; m < MCache1 / TM; m++) {
            for (unsigned int n = 0; n < NCache1 / TN; n++) {
              joint_matrix_store(
                  sg, tC[m][n],
                  pC + (m2 * MCache2 + m1 * MCache1 + m * TM) * colsB +
                      (n2 * NCache2 + n1 * NCache1 + n * TN),
                  colsB, layout::row_major);
            } // n
          } // m
        }); // parallel_for
  });       // queue.submit

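  // Block only on the final iteration; earlier calls return after submission,
  // letting kernels from successive iterations overlap on the device.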
  if (i == testIterations - 1)
    q.wait();
  std::chrono::duration<double, std::milli> duration =
      std::chrono::high_resolution_clock::now() - start;

  return duration.count();
}

template <typename T, typename TResult, size_t vnniFactor, size_t TM, size_t TN,
          size_t TK, size_t MCache1, size_t NCache1, size_t KCache1,
          size_t MCache2, size_t NCache2, size_t KCache2>
void test(size_t matrix_size) {
  assert(matrix_size >= TM && matrix_size >= TK && matrix_size >= TN &&
         "invalid matrix size");
  assert((matrix_size % TM) == 0 && (matrix_size % TN) == 0 &&
         (matrix_size % TK) == 0 &&
         "invalid matrix size detected: not a multiple of <TM,TN,TK>");

  std::cout << "Testing: " << TM << " x " << TN << " x " << TK
            << " [TM x TN x TK]" << std::endl;

  queue q;
  T *A = malloc_shared<T>(matrix_size * matrix_size, q);
  T *B = malloc_shared<T>(matrix_size * matrix_size, q);
  TResult *C = malloc_shared<TResult>(matrix_size * matrix_size, q);
  TResult *refC = malloc_shared<TResult>(matrix_size * matrix_size, q);

  matrix_rand<T>(matrix_size, matrix_size, A, T(1));
  matrix_rand<T>(matrix_size, matrix_size, B, T(1));

  matrix_multiply_ref<T, T, TResult, 1>(A, B, refC, matrix_size, matrix_size,
                                        matrix_size);

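  // Repack B for VNNI: matrix_vnni interleaves vnniFactor consecutive rows
  // element-wise, e.g. for vnniFactor == 2 rows r and r + 1 become one packed
  // row B[r][0], B[r+1][0], B[r][1], B[r+1][1], ...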
#ifdef VNNI
  T *vnniB = malloc_shared<T>(matrix_size * matrix_size, q);
  matrix_vnni<T>(matrix_size, matrix_size, B, vnniB, vnniFactor);
  free(B, q);
  B = vnniB;
#endif

  // run testIterations times; aggregate and average the run time below
  double totalDuration = 0;
  for (unsigned int i = 0; i < testIterations; i++) {

    double duration =
        joint_matmul<vnniFactor, T, TResult, TM, TN, TK, MCache1, NCache1,
                     KCache1, MCache2, NCache2, KCache2>(
            A, B, C, q, i, matrix_size, matrix_size, matrix_size, matrix_size);

    if (i >= recordThresh) {
      totalDuration += duration;
    }
  }

  assert(matrix_compare(matrix_size, matrix_size, C, refC));

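  // One n x n matmul performs 2 * n^3 flops (a multiply and an add per
  // inner-product term); dividing by runtime in seconds gives Gflop/s.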
  double msecPerMatrixMul =
      totalDuration / static_cast<double>(testIterations - recordThresh);
  double gflops = (2.f * matrix_size * matrix_size * matrix_size * 1.0e-9f) /
                  (msecPerMatrixMul / 1000.f);

  std::cout << "DONE for size " << matrix_size << std::endl;
  std::cout << "GOPS is " << gflops << " Gop/s" << std::endl;

  free(A, q);
  free(B, q);
  free(C, q);
  free(refC, q);
}

int main(int argc, char *argv[]) {
  // the matrix dimension is supplied at runtime on the command line
  if (argc < 2) {
    std::cerr << "Usage: " << argv[0] << " <matrix_size>" << std::endl;
    return 1;
  }
  size_t matrix_size = std::stoul(argv[1]);

  queue q;
  std::vector<combination> combinations =
      q.get_device()
          .get_info<sycl::ext::oneapi::experimental::info::device::
                        matrix_combinations>();

  constexpr size_t MCache1 = 32;
  constexpr size_t MCache2 = 256;
  constexpr size_t NCache2 = 256;
  constexpr size_t KCache2 = 32;

#ifdef VNNI
  constexpr unsigned int VnniFactor = 2;
#else // VNNI
  constexpr unsigned int VnniFactor = 1;
#endif // VNNI
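  // bfloat16 is a 16-bit type, so the VNNI transform packs pairs of rows
  // (VnniFactor == 2); without -DVNNI, B stays row-major (factor 1).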

  for (unsigned int i = 0; i < combinations.size(); i++) {
    if (combinations[i].nsize == 0) { // Intel AMX
      constexpr size_t NCache1 = 32;
      constexpr size_t KCache1 = 32;
      test<bfloat16, float, VnniFactor, /*TM*/ 16, /*TN*/ 16, /*TK*/ 32,
           MCache1, NCache1, KCache1, MCache2, NCache2, KCache2>(matrix_size);
      break;
    }

    if (combinations[i].nsize == 16) { // architecture::intel_gpu_pvc
      constexpr size_t NCache1 = 4 * /*TN*/ 16;
      constexpr size_t KCache1 = 16;
      test<bfloat16, float, VnniFactor, /*TM*/ 8, /*TN*/ 16, /*TK*/ 16, MCache1,
           NCache1, KCache1, MCache2, NCache2, KCache2>(matrix_size);
#if (!defined(SG_SZ) || SG_SZ != 32)
      // These combinations are not currently supported for subgroup size = 32
      // in IGC
      test<bfloat16, float, VnniFactor, /*TM*/ 16, /*TN*/ 16, /*TK*/ 16,
           MCache1, NCache1, KCache1, MCache2, NCache2, KCache2>(matrix_size);
      test<bfloat16, float, VnniFactor, /*TM*/ 32, /*TN*/ 64, /*TK*/ 16,
           MCache1, NCache1, KCache1, MCache2, NCache2, KCache2>(matrix_size);
#endif
      break;
    }

    if (combinations[i].nsize == 8) { // architecture::intel_gpu_dg2*
      constexpr size_t NCache1 = 4 * /*TN*/ 8;
      constexpr size_t KCache1 = 16;

      test<bfloat16, float, VnniFactor, /*TM*/ 8, /*TN*/ 8, /*TK*/ 16, MCache1,
           NCache1, KCache1, MCache2, NCache2, KCache2>(matrix_size);
      // test<bfloat16, float, VnniFactor, /*TM*/ 32, /*TN*/ 32, /*TK*/ 16,
      //      MCache1, NCache1, KCache1, MCache2, NCache2, KCache2>(matrix_size);
      break;
    }
  }
  return 0;
}