From a184cbc9cc0dcfe82c7ed1e2ad0e35ccbdd97a87 Mon Sep 17 00:00:00 2001
From: "Zhang, Yixing"
Date: Thu, 19 Sep 2024 15:07:08 -0700
Subject: [PATCH] add test for runtime matrix dimension

---
 ...t_matrix_bf16_fill_k_cache_runtime_dim.cpp |  22 ++
 ...rix_bf16_fill_k_cache_runtime_dim_impl.hpp | 249 ++++++++++++++++++
 2 files changed, 271 insertions(+)
 create mode 100644 sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_runtime_dim.cpp
 create mode 100644 sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_runtime_dim_impl.hpp

diff --git a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_runtime_dim.cpp b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_runtime_dim.cpp
new file mode 100644
index 0000000000000..30ba7db3d227e
--- /dev/null
+++ b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_runtime_dim.cpp
@@ -0,0 +1,22 @@
+//==--- joint_matrix_bf16_fill_k_cache_runtime_dim.cpp - DPC++ joint_matrix--------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// REQUIRES: aspect-ext_intel_matrix
+
+// https://jira.devtools.intel.com/browse/GSD-9716
+// XFAIL: arch-intel_gpu_pvc
+
+// RUN: %{build} -o %t_runtime_dim_vnni.out -ffp-model=precise -DVNNI
+// RUN: %{run} %t_runtime_dim_vnni.out
+
+// RUN: %{build} -o %t_runtime_dim.out -ffp-model=precise
+// RUN: %{run} %t_runtime_dim.out
+
+// -ffp-model=precise is added to not depend on compiler defaults.
+
+#include "common.hpp"
+#include "joint_matrix_bf16_fill_k_cache_runtime_dim_impl.hpp"
\ No newline at end of file
diff --git a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_runtime_dim_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_runtime_dim_impl.hpp
new file mode 100644
index 0000000000000..cbfb4de8f18aa
--- /dev/null
+++ b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_runtime_dim_impl.hpp
@@ -0,0 +1,249 @@
+//------------------------------------------------------------------------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===-------------------------------------------------------------------------===//
+
+#include <chrono>
+#include <sycl/usm.hpp>
+
+// number of test iterations
+constexpr unsigned int testIterations = 100;
+// start recording time after X iterations
+constexpr unsigned int recordThresh = 10;
+
+template <size_t TM, size_t TN, size_t TK> class MatMul;
+
+template <unsigned int vnniFactor, typename TOperand, typename TResult,
+          size_t TM, size_t TN, size_t TK, size_t MCache1, size_t NCache1,
+          size_t KCache1, size_t MCache2, size_t NCache2, size_t KCache2>
+double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i,
+                    size_t rowsA, size_t colsA, size_t rowsB, size_t colsB) {
+
+  size_t sgSize = get_sg_size<MatMul<TM, TN, TK>>(q);
+  range<2> global{rowsA / MCache1, (colsB / NCache1) * sgSize};
+  range<2> cachelocal{MCache2 / MCache1, NCache2 / NCache1 * sgSize};
+
+  // throw error if padding needed
+  assert(colsA == rowsB);
+  assert(rowsA % TM == 0);
+  assert(colsA % TK == 0);
+  assert(colsB % TN == 0);
+  // submit main kernel
+  std::chrono::high_resolution_clock::time_point start =
+      std::chrono::high_resolution_clock::now();
+
+  q.submit([&](handler &h) {
+    h.parallel_for<MatMul<TM, TN, TK>>( // cache layer#1
+        nd_range<2>{global, cachelocal},
+        // loop global
+        // loop localrange
+        [=](nd_item<2> it)
+        {
+          // sg::load and sg::store expect decorations to be ON
+          auto pA =
+              address_space_cast<sycl::access::address_space::global_space,
+                                 sycl::access::decorated::yes>(A);
+          auto pB =
+              address_space_cast<sycl::access::address_space::global_space,
+                                 sycl::access::decorated::yes>(B);
+          auto pC =
+              address_space_cast<sycl::access::address_space::global_space,
+                                 sycl::access::decorated::yes>(C);
+          auto m2 = it.get_group(0);
+          auto n2 = it.get_group(1);
+          auto m1 = it.get_local_id(0);
+          auto n1 = it.get_local_id(1) / sgSize;
+          auto sg = it.get_sub_group();
+
+          joint_matrix<sub_group, TResult, use::accumulator, TM, TN>
+              tC[MCache1 / TM][NCache1 / TN];
+
+          for (unsigned int m = 0; m < MCache1 / TM; m++) {
+            for (unsigned int n = 0; n < NCache1 / TN; n++) {
+              joint_matrix_fill(sg, tC[m][n], 0);
+            }
+          }
+
+          for (unsigned int k2 = 0; k2 < colsA / KCache2; k2++) {
+            joint_matrix<sub_group, TOperand, use::a, TM, TK,
+                         layout::row_major>
+                tA[MCache1 / TM][KCache2 / KCache1];
+#ifdef VNNI
+            joint_matrix<sub_group, TOperand, use::b, TK, TN,
+                         layout::ext_intel_packed>
+                tB[NCache1 / TN][KCache2 / KCache1];
+#else // VNNI
+            joint_matrix<sub_group, TOperand, use::b, TK, TN,
+                         layout::row_major>
+                tB[NCache1 / TN][KCache2 / KCache1];
+#endif // VNNI
+
+            for (unsigned int k1 = 0; k1 < KCache2 / KCache1; k1++) {
+              unsigned int k = (k2 * KCache2 + k1 * KCache1) / TK;
+              for (unsigned int m = 0; m < MCache1 / TM; m++) {
+                joint_matrix_load(
+                    sg, tA[m][k1],
+                    pA + (m2 * MCache2 + m1 * MCache1 + m * TM) * colsA +
+                        k * TK,
+                    colsA);
+              }
+              for (unsigned int n = 0; n < NCache1 / TN; n++) {
+                joint_matrix_load(
+                    sg, tB[n][k1],
+                    pB + (k * TK / vnniFactor) * (colsB * vnniFactor) +
+                        (n2 * NCache2 + n1 * NCache1 + n * TN) * vnniFactor,
+                    colsB * vnniFactor);
+              } // n
+              for (unsigned int m = 0; m < MCache1 / TM; m++) {
+                for (unsigned int n = 0; n < NCache1 / TN; n++) {
+                  joint_matrix_mad(sg, tC[m][n], tA[m][k1], tB[n][k1],
+                                   tC[m][n]);
+                } // n
+              }   // m
+            }     // k1
+          }       // for k2
+
+          for (unsigned int m = 0; m < MCache1 / TM; m++) {
+            for (unsigned int n = 0; n < NCache1 / TN; n++) {
+              joint_matrix_store(
+                  sg, tC[m][n],
+                  pC + (m2 * MCache2 + m1 * MCache1 + m * TM) * colsB +
+                      (n2 * NCache2 + n1 * NCache1 + n * TN),
+                  colsB, layout::row_major);
+            } // n
+          }   // m
+        });   // parallel_for
+  });         // queue.submit
+
+  if (i == testIterations - 1)
+    q.wait();
+  std::chrono::duration<double, std::milli> duration =
+      std::chrono::high_resolution_clock::now() - start;
+
+  return duration.count();
+}
+
+template <typename T, typename TResult, unsigned int vnniFactor, size_t TM,
+          size_t TN, size_t TK, size_t MCache1, size_t NCache1, size_t KCache1,
+          size_t MCache2, size_t NCache2, size_t KCache2>
+void test(size_t matrix_size) {
+  assert(matrix_size >= TM && matrix_size >= TK && matrix_size >= TN &&
+         "invalid matrix size");
+  assert((matrix_size % TM) == 0 && (matrix_size % TN) == 0 &&
+         (matrix_size % TK) == 0 &&
+         "invalid matrix size detected: not a multiple of <TM, TN, TK>");
+
+  std::cout << "Testing: " << TM << " x " << TN << " x " << TK
+            << " [TM x TN x TK]" << std::endl;
+
+  queue q;
+  T *A = malloc_shared<T>(matrix_size * matrix_size, q);
+  T *B = malloc_shared<T>(matrix_size * matrix_size, q);
+  TResult *C = malloc_shared<TResult>(matrix_size * matrix_size, q);
+  TResult *refC = malloc_shared<TResult>(matrix_size * matrix_size, q);
+
+  matrix_rand<T>(matrix_size, matrix_size, A, T(1));
+  matrix_rand<T>(matrix_size, matrix_size, B, T(1));
+
+  matrix_multiply_ref<T, T, TResult, 1>(A, B, refC, matrix_size, matrix_size,
+                                        matrix_size);
+
+#ifdef VNNI
+  T *vnniB = malloc_shared<T>(matrix_size * matrix_size, q);
+  matrix_vnni<T>(matrix_size, matrix_size, B, vnniB, vnniFactor);
+  free(B, q);
+  B = vnniB;
+#endif
+
+  // run testIterations times, aggregate and calculate average run time
+  double totalDuration = 0;
+  for (unsigned int i = 0; i < testIterations; i++) {
+
+    double duration =
+        joint_matmul<vnniFactor, T, TResult, TM, TN, TK, MCache1, NCache1,
+                     KCache1, MCache2, NCache2, KCache2>(
+            A, B, C, q, i, matrix_size, matrix_size, matrix_size, matrix_size);
+
+    if (i >= recordThresh) {
+      totalDuration += duration;
+    }
+  }
+
+  assert(matrix_compare(matrix_size, matrix_size, C, refC));
+
+  double msecPerMatrixMul =
+      totalDuration / static_cast<double>(testIterations - recordThresh);
+  double gflops = (2.f * matrix_size * matrix_size * matrix_size * 1.0e-9f) /
+                  (msecPerMatrixMul / 1000.f);
+
+  std::cout << "DONE for size " << matrix_size << std::endl;
+  std::cout << "GOPS is " << gflops << " Gop/s" << std::endl;
+
+  free(A, q);
+  free(B, q);
+  free(C, q);
+  free(refC, q);
+}
+
+int main(int argc, char *argv[]) {
+  // The matrix dimension is a runtime value; default to 256 (a multiple of
+  // all tile sizes used below) when no size is passed on the command line.
+  size_t matrix_size = 256;
+  if (argc > 1)
+    matrix_size = std::stoul(argv[1]);
+
+  queue q;
+  std::vector<combination> combinations =
+      q.get_device()
+          .get_info<sycl::ext::oneapi::experimental::info::device::
+                        matrix_combinations>();
+
+  constexpr size_t MCache1 = 32;
+  constexpr size_t MCache2 = 256;
+  constexpr size_t NCache2 = 256;
+  constexpr size_t KCache2 = 32;
+
+#ifdef VNNI
+  constexpr unsigned int VnniFactor = 2;
+#else // VNNI
+  constexpr unsigned int VnniFactor = 1;
+#endif // VNNI
+
+  for (unsigned int i = 0; i < combinations.size(); i++) {
+    if (combinations[i].nsize == 0) { // Intel AMX
+      constexpr size_t NCache1 = 32;
+      constexpr size_t KCache1 = 32;
+      test<bfloat16, float, VnniFactor, /*TM*/ 16, /*TN*/ 16, /*TK*/ 32,
+           MCache1, NCache1, KCache1, MCache2, NCache2, KCache2>(matrix_size);
+      break;
+    }
+
+    if (combinations[i].nsize == 16) { // architecture::intel_gpu_pvc
+      constexpr size_t NCache1 = 4 * /*TN*/ 16;
+      constexpr size_t KCache1 = 16;
+      test<bfloat16, float, VnniFactor, /*TM*/ 8, /*TN*/ 16, /*TK*/ 16, MCache1,
+           NCache1, KCache1, MCache2, NCache2, KCache2>(matrix_size);
+#if (!defined(SG_SZ) || SG_SZ != 32)
+      // These combinations are not currently supported for subgroup size = 32
+      // in IGC
+      test<bfloat16, float, VnniFactor, /*TM*/ 16, /*TN*/ 16, /*TK*/ 16,
+           MCache1, NCache1, KCache1, MCache2, NCache2, KCache2>(matrix_size);
+      test<bfloat16, float, VnniFactor, /*TM*/ 32, /*TN*/ 64, /*TK*/ 16,
+           MCache1, NCache1, KCache1, MCache2, NCache2, KCache2>(matrix_size);
+#endif
+      break;
+    }
+
+    if (combinations[i].nsize == 8) { // architecture::intel_gpu_dg2*
+      constexpr size_t NCache1 = 4 * /*TN*/ 8;
+      constexpr size_t KCache1 = 16;
+
+      test<bfloat16, float, VnniFactor, /*TM*/ 8, /*TN*/ 8, /*TK*/ 16, MCache1,
+           NCache1, KCache1, MCache2, NCache2, KCache2>(matrix_size);
+      // test(matrix_size);
+      break;
+    }
+  }
+  return 0;
+}