add test for runtime matrix dimension
YixingZhang007 committed Sep 19, 2024
1 parent b56a34c commit a184cbc
Showing 2 changed files with 271 additions and 0 deletions.
joint_matrix_bf16_fill_k_cache_runtime_dim.cpp
@@ -0,0 +1,22 @@
//==--- joint_matrix_bf16_fill_k_cache_runtime_dim.cpp - DPC++ joint_matrix--------==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// REQUIRES: aspect-ext_intel_matrix

// https://jira.devtools.intel.com/browse/GSD-9716
// XFAIL: arch-intel_gpu_pvc

// RUN: %{build} -o %t_runtime_dim_vnni.out -ffp-model=precise -DVNNI
// RUN: %{run} %t_runtime_dim_vnni.out

// RUN: %{build} -o %t_runtime_dim.out -ffp-model=precise
// RUN: %{run} %t_runtime_dim.out

// -ffp-model=precise is used so results do not depend on the compiler's
// default floating-point model.

#include "common.hpp"
#include "joint_matrix_bf16_fill_k_cache_runtime_dim_impl.hpp"
joint_matrix_bf16_fill_k_cache_runtime_dim_impl.hpp
@@ -0,0 +1,249 @@
//==--- joint_matrix_bf16_fill_k_cache_runtime_dim_impl.hpp - DPC++ joint_matrix ---==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===-------------------------------------------------------------------------===//

#include <chrono>
#include <iostream>
#include <random>
#include <sycl/usm.hpp>

// number of test iterations
constexpr unsigned int testIterations = 100;
// warm-up iterations excluded from the timing average
constexpr unsigned int recordThresh = 10;

template <size_t TM, size_t TN, size_t TK> class MatMul;

template <size_t vnniFactor, typename TOperand, typename TResult, size_t TM,
size_t TN, size_t TK, size_t MCache1, size_t NCache1, size_t KCache1,
size_t MCache2, size_t NCache2, size_t KCache2>
double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i,
size_t rowsA, size_t colsA, size_t rowsB, size_t colsB) {

size_t sgSize = get_sg_size<MatMul<TM, TN, TK>>(q);
range<2> global{rowsA / MCache1, (colsB / NCache1) * sgSize};
range<2> cachelocal{MCache2 / MCache1, NCache2 / NCache1 * sgSize};

  // fail via assert instead of padding when the dimensions do not divide evenly
assert(colsA == rowsB);
assert(rowsA % TM == 0);
assert(colsA % TK == 0);
assert(colsB % TN == 0);
// submit main kernel
std::chrono::high_resolution_clock::time_point start =
std::chrono::high_resolution_clock::now();

q.submit([&](handler &h) {
h.parallel_for<MatMul<TM, TN, TK>>( // cache layer#1
nd_range<2>{global, cachelocal},
        // global range: one sub-group per MCache1 x NCache1 tile of C
        // local range: each work-group covers an MCache2 x NCache2 tile
[=](nd_item<2> it)
{
// sg::load and sg::store expect decorations to be ON
auto pA =
address_space_cast<sycl::access::address_space::global_space,
sycl::access::decorated::yes>(A);
auto pB =
address_space_cast<sycl::access::address_space::global_space,
sycl::access::decorated::yes>(B);
auto pC =
address_space_cast<sycl::access::address_space::global_space,
sycl::access::decorated::yes>(C);
auto m2 = it.get_group(0);
auto n2 = it.get_group(1);
auto m1 = it.get_local_id(0);
auto n1 = it.get_local_id(1) / sgSize;
auto sg = it.get_sub_group();

joint_matrix<sub_group, TResult, use::accumulator, TM, TN>
tC[MCache1 / TM][NCache1 / TN];

for (unsigned int m = 0; m < MCache1 / TM; m++) {
for (unsigned int n = 0; n < NCache1 / TN; n++) {
joint_matrix_fill(sg, tC[m][n], 0);
}
}
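
          // walk K in KCache2-sized blocks (cache level 2); each block is
          // further split into KCache1-sized steps (cache level 1) below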

for (unsigned int k2 = 0; k2 < colsA / KCache2; k2++) {
joint_matrix<sub_group, TOperand, use::a, TM, TK, layout::row_major>
tA[MCache1 / TM][KCache2 / KCache1];
#ifdef VNNI
joint_matrix<sub_group, TOperand, use::b, TK, TN,
layout::ext_intel_packed>
tB[NCache1 / TN][KCache2 / KCache1];
#else // VNNI
joint_matrix<sub_group, TOperand, use::b, TK, TN,
layout::row_major>
tB[NCache1 / TN][KCache2 / KCache1];
#endif // VNNI
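
          // with -DVNNI, B was repacked on the host so that vnniFactor
          // consecutive K-elements of each column are contiguous;
          // layout::ext_intel_packed tells joint_matrix_load to expect this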

for (unsigned int k1 = 0; k1 < KCache2 / KCache1; k1++) {
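            // k counts TK-wide steps along the K dimension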
unsigned int k = (k2 * KCache2 + k1 * KCache1) / TK;
for (unsigned int m = 0; m < MCache1 / TM; m++) {
joint_matrix_load(
sg, tA[m][k1],
pA + (m2 * MCache2 + m1 * MCache1 + m * TM) * colsA +
k * TK,
colsA);
}
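            // packed-B indexing: TK / vnniFactor packed rows per K step, a
            // leading dimension of colsB * vnniFactor, and column offsets
            // scaled by vnniFactor (all of this degenerates to plain
            // row-major when vnniFactor == 1)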
for (unsigned int n = 0; n < NCache1 / TN; n++) {
joint_matrix_load(
sg, tB[n][k1],
pB + (k * TK / vnniFactor) * (colsB * vnniFactor) +
(n2 * NCache2 + n1 * NCache1 + n * TN) * vnniFactor,
colsB * vnniFactor);
} // n
for (unsigned int m = 0; m < MCache1 / TM; m++) {
for (unsigned int n = 0; n < NCache1 / TN; n++) {
joint_matrix_mad(sg, tC[m][n], tA[m][k1], tB[n][k1],
tC[m][n]);
} // n
} // m
} // k1
} // for k2

for (unsigned int m = 0; m < MCache1 / TM; m++) {
for (unsigned int n = 0; n < NCache1 / TN; n++) {
joint_matrix_store(
sg, tC[m][n],
pC + (m2 * MCache2 + m1 * MCache1 + m * TM) * colsB +
(n2 * NCache2 + n1 * NCache1 + n * TN),
colsB, layout::row_major);
} // n
} // m
}); // parallel_for
}); // queue.submit

  // only the last iteration synchronizes with the device, so earlier
  // iterations time kernel submission rather than completion
  if (i == testIterations - 1)
    q.wait();
std::chrono::duration<double, std::milli> duration =
std::chrono::high_resolution_clock::now() - start;

return duration.count();
}

template <typename T, typename TResult, size_t vnniFactor, size_t TM, size_t TN,
size_t TK, size_t MCache1, size_t NCache1, size_t KCache1,
size_t MCache2, size_t NCache2, size_t KCache2>
void test(size_t matrix_size) {
assert(matrix_size >= TM && matrix_size >= TK && matrix_size >= TN &&
"invalid matrix size");
assert((matrix_size % TM) == 0 && (matrix_size % TN) == 0 &&
(matrix_size % TK) == 0 &&
"invalid matrix size detected: not a multiple of <TM,TN,TK>");

std::cout << "Testing: " << TM << " x " << TN << " x " << TK
<< " [TM x TN x TK]" << std::endl;

queue q;
T *A = malloc_shared<T>(matrix_size * matrix_size, q);
T *B = malloc_shared<T>(matrix_size * matrix_size, q);
TResult *C = malloc_shared<TResult>(matrix_size * matrix_size, q);
TResult *refC = malloc_shared<TResult>(matrix_size * matrix_size, q);

matrix_rand<T>(matrix_size, matrix_size, A, T(1));
matrix_rand<T>(matrix_size, matrix_size, B, T(1));

matrix_multiply_ref<T, T, TResult, 1>(A, B, refC, matrix_size, matrix_size,
matrix_size);

#ifdef VNNI
T *vnniB = malloc_shared<T>(matrix_size * matrix_size, q);
matrix_vnni<T>(matrix_size, matrix_size, B, vnniB, vnniFactor);
free(B, q);
B = vnniB;
#endif
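
// For example, with vnniFactor == 2 the repacking interleaves rows 2i and
// 2i+1 of row-major B element-wise into packed row i, turning a K x N
// matrix into a (K/2) x (2N) one with each bf16 pair stored contiguously.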

// run testIterations time, aggregate and calculate average run time
double totalDuration = 0;
for (unsigned int i = 0; i < testIterations; i++) {

double duration =
joint_matmul<vnniFactor, T, TResult, TM, TN, TK, MCache1, NCache1,
KCache1, MCache2, NCache2, KCache2>
(A, B, C, q, i, matrix_size, matrix_size, matrix_size, matrix_size);

if (i >= recordThresh) {
totalDuration += duration;
}
}

assert(matrix_compare(matrix_size, matrix_size, C, refC));
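
  // average over the timed iterations; an n x n x n matmul performs
  // 2 * n^3 floating-point operations, which gives the GFLOPS figure below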

double msecPerMatrixMul =
totalDuration / static_cast<double>(testIterations - recordThresh);
double gflops = (2.f * matrix_size * matrix_size * matrix_size * 1.0e-9f) /
(msecPerMatrixMul / 1000.f);

std::cout << "DONE for size " << matrix_size << std::endl;
std::cout << "GOPS is " << gflops << " Gop/s" << std::endl;

free(A, q);
free(B, q);
free(C, q);
free(refC, q);
}

int main(int argc, char *argv[]) {
  // the matrix dimension is supplied at runtime on the command line
  if (argc < 2) {
    std::cerr << "Usage: " << argv[0] << " <matrix_size>" << std::endl;
    return 1;
  }
  size_t matrix_size = std::stoul(argv[1]);

queue q;
std::vector<combination> combinations =
q.get_device()
.get_info<sycl::ext::oneapi::experimental::info::device::
matrix_combinations>();
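
  // each combination describes one joint_matrix shape supported by the
  // device; nsize distinguishes Intel AMX (0), PVC (16) and DG2 (8) below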

constexpr size_t MCache1 = 32;
constexpr size_t MCache2 = 256;
constexpr size_t NCache2 = 256;
constexpr size_t KCache2 = 32;
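
  // the *Cache2 constants fix the per-work-group tile and MCache1 the
  // per-sub-group M tile; NCache1 and KCache1 are chosen per architecture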

#ifdef VNNI
constexpr unsigned int VnniFactor = 2;
#else // VNNI
constexpr unsigned int VnniFactor = 1;
#endif // VNNI

for (unsigned int i = 0; i < combinations.size(); i++) {
if (combinations[i].nsize == 0) { // Intel AMX
constexpr size_t NCache1 = 32;
constexpr size_t KCache1 = 32;
test<bfloat16, float, VnniFactor, /*TM*/ 16, /*TN*/ 16, /*TK*/ 32,
MCache1, NCache1, KCache1, MCache2, NCache2, KCache2>(matrix_size);
break;
}

if (combinations[i].nsize == 16) { // architecture::intel_gpu_pvc
constexpr size_t NCache1 = 4 * /*TN*/ 16;
constexpr size_t KCache1 = 16;
test<bfloat16, float, VnniFactor, /*TM*/ 8, /*TN*/ 16, /*TK*/ 16, MCache1,
NCache1, KCache1, MCache2, NCache2, KCache2>(matrix_size);
#if (!defined(SG_SZ) || SG_SZ != 32)
    // These combinations are not currently supported for sub-group size 32
    // in IGC
test<bfloat16, float, VnniFactor, /*TM*/ 16, /*TN*/ 16, /*TK*/ 16,
MCache1, NCache1, KCache1, MCache2, NCache2, KCache2>(matrix_size);
test<bfloat16, float, VnniFactor, /*TM*/ 32, /*TN*/ 64, /*TK*/ 16,
MCache1, NCache1, KCache1, MCache2, NCache2, KCache2>(matrix_size);
#endif
break;
}

if (combinations[i].nsize == 8) { // architecture::intel_gpu_dg2*
constexpr size_t NCache1 = 4 * /*TN*/ 8;
constexpr size_t KCache1 = 16;

test<bfloat16, float, VnniFactor, /*TM*/ 8, /*TN*/ 8, /*TK*/ 16, MCache1,
NCache1, KCache1, MCache2, NCache2, KCache2>(matrix_size);
// test<bfloat16, float, VnniFactor, /*TM*/ 32, /*TN*/ 32, /*TK*/ 16,
// MCache1,
// NCache1, KCache1, MCache2, NCache2, KCache2>(matrix_size);
break;
}
}
return 0;
}
