From bb1f4e6fc44576689609f1b9d4c3dc17b427e15b Mon Sep 17 00:00:00 2001
From: mehmet yusufoglu
Date: Thu, 18 Jul 2024 12:05:54 +0200
Subject: [PATCH] Add a matrix multiplication example using mdspan

---
 example/CMakeLists.txt                     |   1 +
 example/matrixMulWithMdspan/CMakeLists.txt |  53 +++++
 .../src/matrixMulMdSpan.cpp                | 211 ++++++++++++++++++
 3 files changed, 265 insertions(+)
 create mode 100644 example/matrixMulWithMdspan/CMakeLists.txt
 create mode 100644 example/matrixMulWithMdspan/src/matrixMulMdSpan.cpp

diff --git a/example/CMakeLists.txt b/example/CMakeLists.txt
index d69bafc3eb76..d981136dbb29 100644
--- a/example/CMakeLists.txt
+++ b/example/CMakeLists.txt
@@ -26,6 +26,7 @@ add_subdirectory("helloWorld/")
 add_subdirectory("helloWorldLambda/")
 add_subdirectory("kernelSpecialization/")
 add_subdirectory("ls/")
+add_subdirectory("matrixMulWithMdspan/")
 add_subdirectory("monteCarloIntegration/")
 add_subdirectory("openMPSchedule/")
 add_subdirectory("parallelLoopPatterns/")
diff --git a/example/matrixMulWithMdspan/CMakeLists.txt b/example/matrixMulWithMdspan/CMakeLists.txt
new file mode 100644
index 000000000000..7fa21254c08b
--- /dev/null
+++ b/example/matrixMulWithMdspan/CMakeLists.txt
@@ -0,0 +1,53 @@
+#
+# Copyright 2023 Simeon Ehrig, Mehmet Yusufoglu
+# SPDX-License-Identifier: MPL-2.0
+#
+
+################################################################################
+# Required CMake version.
+
+cmake_minimum_required(VERSION 3.22)
+
+set_property(GLOBAL PROPERTY USE_FOLDERS ON)
+
+################################################################################
+# Project.
+
+set(_TARGET_NAME matrixMulMdSpan)
+
+project(${_TARGET_NAME} LANGUAGES CXX)
+
+#-------------------------------------------------------------------------------
+# Find alpaka.
+
+if(NOT TARGET alpaka::alpaka)
+    option(alpaka_USE_SOURCE_TREE "Use alpaka's source tree instead of an alpaka installation" OFF)
+
+    if(alpaka_USE_SOURCE_TREE)
+        # Don't build the examples recursively.
+        set(alpaka_BUILD_EXAMPLES OFF)
+        add_subdirectory("${CMAKE_CURRENT_LIST_DIR}/../.." "${CMAKE_BINARY_DIR}/alpaka")
+    else()
+        find_package(alpaka REQUIRED)
+    endif()
+endif()
+
+if(alpaka_USE_MDSPAN STREQUAL "OFF")
+    message(STATUS "The matrixMulMdSpan example requires mdspan. Please set alpaka_USE_MDSPAN accordingly. Example disabled.")
+    return()
+endif()
+
+#-------------------------------------------------------------------------------
+# Add executable.
+
+alpaka_add_executable(
+    ${_TARGET_NAME}
+    src/matrixMulMdSpan.cpp)
+target_link_libraries(
+    ${_TARGET_NAME}
+    PUBLIC alpaka::alpaka)
+
+set_target_properties(${_TARGET_NAME} PROPERTIES FOLDER example)
+
+add_test(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME})
diff --git a/example/matrixMulWithMdspan/src/matrixMulMdSpan.cpp b/example/matrixMulWithMdspan/src/matrixMulMdSpan.cpp
new file mode 100644
index 000000000000..ea8db88157e7
--- /dev/null
+++ b/example/matrixMulWithMdspan/src/matrixMulMdSpan.cpp
@@ -0,0 +1,211 @@
+/* Copyright 2024 Mehmet Yusufoglu, Simeon Ehrig, Andrea Bocci
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#include <alpaka/alpaka.hpp>
+// Needed for running the example for all available backends, one by one.
+#include <alpaka/example/ExecuteForEachAccTag.hpp>
+
+#include <cstdlib>
+#include <iostream>
+
+//! Matrix multiplication example using the mdspan data structure.
+
+//! Some simple type traits for checking the types.
+//! IsMdspan checks whether a type is a specialization of std::experimental::mdspan.
+//! Primary template for IsMdspan (defaults to false).
+template<typename T>
+struct IsMdspan : std::false_type
+{
+};
+
+//! Specialization for mdspan: matches any std::experimental::mdspan instantiation via its
+//! four template arguments (element type, extents, layout policy, accessor policy).
+template<typename T, typename TExtents, typename TLayoutPolicy, typename TAccessorPolicy>
+struct IsMdspan<std::experimental::mdspan<T, TExtents, TLayoutPolicy, TAccessorPolicy>> : std::true_type
+{
+};
+
+template<typename T>
+inline constexpr bool is_mdspan = IsMdspan<T>::value;
+
+// Index type
+using Idx = std::size_t;
+// Set data type
+using DataType = float;
+
+/**
+ * @brief Kernel for performing the multiplication of two 2D matrices. Each element of the result is computed by a
+ * different thread. The mdspan data structure is used to pass the data to and from the kernel.
+ */
+struct MatrixMulKernel
+{
+    //! \tparam TAcc Accelerator type
+    //! \tparam TMdSpan The type of the multidimensional span (mdspan)
+    //! \param acc Accelerator
+    //! \param A First input matrix
+    //! \param B Second input matrix
+    //! \param C Output matrix where the result of A * B will be stored
+    //! The shared dimension K between A and B is taken from the extents of A.
+    template<typename TAcc, typename TMdSpan>
+    ALPAKA_FN_ACC void operator()(TAcc const& acc, TMdSpan A, TMdSpan B, TMdSpan C) const
+    {
+        // compile time checks
+        static_assert(is_mdspan<TMdSpan>, "The type TMdSpan should be a std::experimental::mdspan");
+        static_assert(TMdSpan::rank() == 2);
+
+        // A is MxK and B is KxN
+        auto const K = static_cast<Idx>(A.extent(1));
+
+        auto const i = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0];
+        auto const j = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[1];
+
+        if(i < C.extent(0) && j < C.extent(1))
+        {
+            DataType sum = 0.0f;
+            for(Idx k = 0; k < K; ++k)
+            {
+                sum += A(i, k) * B(k, j);
+            }
+            C(i, j) = sum;
+        }
+    }
+};
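+
+//! Illustrative usage of the IsMdspan trait defined above; a minimal sketch kept as a
+//! comment, mirroring the compile-time check inside the kernel (the types shown are
+//! examples, not part of this program):
+//! \code{.cpp}
+//! static_assert(is_mdspan<std::experimental::mdspan<float, std::experimental::dextents<std::size_t, 2>>>);
+//! static_assert(!is_mdspan<float*>);
+//! \endcode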
+
+// In standard projects, you typically do not execute the code with any available accelerator.
+// Instead, a single accelerator is selected once from the active accelerators and the kernels are executed with the
+// selected accelerator only. If you use this example as the starting point for your project, you can rename the
+// example() function to main() and move the accelerator tag into the function body.
+template<typename TAccTag>
+auto example(TAccTag const&) -> int
+{
+    // Set the number of dimensions (i.e. 2) as a type.
+    using Dim = alpaka::DimInt<2>;
+
+    // Define the matrix dimensions: A is MxK and B is KxN.
+    const Idx M = 1024;
+    const Idx N = 512;
+    const Idx K = 1024;
+
+    // Define the accelerator, queue and vector types.
+    using Acc = alpaka::TagToAcc<TAccTag, Dim, Idx>;
+    using Queue = alpaka::Queue<Acc, alpaka::Blocking>;
+    using Vec = alpaka::Vec<Dim, Idx>;
+
+    auto const platformHost = alpaka::PlatformCpu{};
+    auto const devHost = alpaka::getDevByIdx(platformHost, 0);
+    auto const platformAcc = alpaka::Platform<Acc>{};
+    auto const devAcc = alpaka::getDevByIdx(platformAcc, 0);
+
+    Queue queue(devAcc);
+
+    // Define the 2D extents (dimensions).
+    Vec const extentA(static_cast<Idx>(M), static_cast<Idx>(K));
+    Vec const extentB(static_cast<Idx>(K), static_cast<Idx>(N));
+    Vec const extentC(static_cast<Idx>(M), static_cast<Idx>(N));
+
+    // Allocate host memory.
+    auto bufHostA = alpaka::allocBuf<DataType, Idx>(devHost, extentA);
+    auto bufHostB = alpaka::allocBuf<DataType, Idx>(devHost, extentB);
+    auto bufHostC = alpaka::allocBuf<DataType, Idx>(devHost, extentC);
+
+    // Create mdspan views onto bufHostA and bufHostB using alpaka::experimental::getMdSpan to fill the host buffers.
+    auto mdHostA = alpaka::experimental::getMdSpan(bufHostA);
+    auto mdHostB = alpaka::experimental::getMdSpan(bufHostB);
+
+    // Initialize the host matrices.
+    for(Idx i = 0; i < M; ++i)
+    {
+        for(Idx j = 0; j < K; ++j)
+        {
+            // fill with some data
+            mdHostA(i, j) = static_cast<DataType>(i * K + j);
+        }
+    }
+    for(Idx i = 0; i < K; ++i)
+    {
+        for(Idx j = 0; j < N; ++j)
+        {
+            // fill with some data
+            mdHostB(i, j) = static_cast<DataType>(i * N + j);
+        }
+    }
+
+    // Allocate device memory.
+    auto bufDevA = alpaka::allocBuf<DataType, Idx>(devAcc, extentA);
+    auto bufDevB = alpaka::allocBuf<DataType, Idx>(devAcc, extentB);
+    auto bufDevC = alpaka::allocBuf<DataType, Idx>(devAcc, extentC);
+
+    // Copy the data to the device, using the host buffers directly (not the mdspans that were used to fill them).
+    alpaka::memcpy(queue, bufDevA, bufHostA);
+    alpaka::memcpy(queue, bufDevB, bufHostB);
+    alpaka::wait(queue);
+
+    // Create mdspan views onto the device buffers using alpaka::experimental::getMdSpan.
+    auto mdDevA = alpaka::experimental::getMdSpan(bufDevA);
+    auto mdDevB = alpaka::experimental::getMdSpan(bufDevB);
+    auto mdDevC = alpaka::experimental::getMdSpan(bufDevC);
+
+    MatrixMulKernel kernel;
+    auto const& bundledKernel = alpaka::KernelBundle(kernel, mdDevA, mdDevB, mdDevC);
+
+    // Let alpaka calculate good block and grid sizes given our full problem extent.
+    auto const workDiv = alpaka::getValidWorkDivForKernel<Acc>(
+        devAcc,
+        bundledKernel,
+        extentC,
+        Vec::ones(),
+        false,
+        alpaka::GridBlockExtentSubDivRestrictions::Unrestricted);
+
+    // Execute the kernel.
+    alpaka::exec<Acc>(queue, workDiv, kernel, mdDevA, mdDevB, mdDevC);
+
+    // Copy the result back to the host.
+    alpaka::memcpy(queue, bufHostC, bufDevC);
+    alpaka::wait(queue);
+
+    // Verify the result.
+    bool success = true;
+    auto mdHostC = alpaka::experimental::getMdSpan(bufHostC);
+    for(Idx i = 0; i < M; ++i)
+    {
+        for(Idx j = 0; j < N; ++j)
+        {
+            DataType expectedValue = 0.0f;
+            for(Idx k = 0; k < K; ++k)
+            {
+                expectedValue += mdHostA(i, k) * mdHostB(k, j);
+            }
+            if(mdHostC(i, j) != expectedValue)
+            {
+                success = false;
+                break;
+            }
+        }
+    }
+
+    std::cout << "Multiplication of matrices of size " << M << "x" << K << " and " << K << "x" << N << " using mdspan "
+              << (success ? "succeeded" : "failed") << "!" << std::endl;
+    if(!success)
+    {
+        return EXIT_FAILURE;
+    }
+
+    return EXIT_SUCCESS;
+}
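+
+// For comparison, a work division could also be specified by hand instead of calling
+// getValidWorkDivForKernel as done above. A minimal sketch, kept as a comment; the
+// 16x16 block extent is an assumption rather than a tuned value, and the bounds check
+// in the kernel makes the rounded-up grid extent safe:
+// \code{.cpp}
+// auto const blockExtent = Vec{Idx{16}, Idx{16}};
+// auto const gridExtent = Vec{(M + Idx{15}) / Idx{16}, (N + Idx{15}) / Idx{16}};
+// auto const manualWorkDiv = alpaka::WorkDivMembers<Dim, Idx>{gridExtent, blockExtent, Vec::ones()};
+// alpaka::exec<Acc>(queue, manualWorkDiv, kernel, mdDevA, mdDevB, mdDevC);
+// \endcode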
+
+auto main() -> int
+{
+    // Execute the example once for each enabled accelerator.
+    // If you would like to execute it for a single accelerator only, you can use the following code:
+    // \code{.cpp}
+    // auto tag = alpaka::TagCpuSerial{};
+    // return example(tag);
+    // \endcode
+    //
+    // valid tags:
+    // TagCpuSerial, TagGpuHipRt, TagGpuCudaRt, TagCpuOmp2Blocks, TagCpuTbbBlocks,
+    // TagCpuOmp2Threads, TagCpuSycl, TagCpuThreads,
+    // TagFpgaSyclIntel, TagGenericSycl, TagGpuSyclIntel
+    return alpaka::executeForEachAccTag([=](auto const& tag) { return example(tag); });
+}
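
Note: the example is only configured when alpaka's mdspan support is enabled (see the
alpaka_USE_MDSPAN check above). Assuming a CMake build from the repository root, one
possible configuration is:

  cmake -S . -B build -Dalpaka_BUILD_EXAMPLES=ON -Dalpaka_USE_MDSPAN=FETCH
  cmake --build build --target matrixMulMdSpan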