From bb1f4e6fc44576689609f1b9d4c3dc17b427e15b Mon Sep 17 00:00:00 2001
From: mehmet yusufoglu
Date: Thu, 18 Jul 2024 12:05:54 +0200
Subject: [PATCH] Add a matrix multiplication example using mdspan

---
 example/CMakeLists.txt                     |   1 +
 example/matrixMulWithMdspan/CMakeLists.txt |  53 +++++
 .../src/matrixMulMdSpan.cpp                | 211 ++++++++++++++++++
 3 files changed, 265 insertions(+)
 create mode 100644 example/matrixMulWithMdspan/CMakeLists.txt
 create mode 100644 example/matrixMulWithMdspan/src/matrixMulMdSpan.cpp

diff --git a/example/CMakeLists.txt b/example/CMakeLists.txt
index d69bafc3eb76..d981136dbb29 100644
--- a/example/CMakeLists.txt
+++ b/example/CMakeLists.txt
@@ -26,6 +26,7 @@ add_subdirectory("helloWorld/")
 add_subdirectory("helloWorldLambda/")
 add_subdirectory("kernelSpecialization/")
 add_subdirectory("ls/")
+add_subdirectory("matrixMulWithMdspan/")
 add_subdirectory("monteCarloIntegration/")
 add_subdirectory("openMPSchedule/")
 add_subdirectory("parallelLoopPatterns/")
diff --git a/example/matrixMulWithMdspan/CMakeLists.txt b/example/matrixMulWithMdspan/CMakeLists.txt
new file mode 100644
index 000000000000..7fa21254c08b
--- /dev/null
+++ b/example/matrixMulWithMdspan/CMakeLists.txt
@@ -0,0 +1,53 @@
+#
+# Copyright 2023 Simeon Ehrig, Mehmet Yusufoglu
+# SPDX-License-Identifier: MPL-2.0
+#
+
+################################################################################
+# Required CMake version.
+
+cmake_minimum_required(VERSION 3.22)
+
+set_property(GLOBAL PROPERTY USE_FOLDERS ON)
+
+################################################################################
+# Project.
+
+set(_TARGET_NAME matrixMulMdSpan)
+
+project(${_TARGET_NAME} LANGUAGES CXX)
+
+#-------------------------------------------------------------------------------
+# Find alpaka.
+
+if(NOT TARGET alpaka::alpaka)
+    option(alpaka_USE_SOURCE_TREE "Use alpaka's source tree instead of an alpaka installation" OFF)
+
+    if(alpaka_USE_SOURCE_TREE)
+        # Don't build the examples recursively.
+        set(alpaka_BUILD_EXAMPLES OFF)
+        add_subdirectory("${CMAKE_CURRENT_LIST_DIR}/../.." "${CMAKE_BINARY_DIR}/alpaka")
+    else()
+        find_package(alpaka REQUIRED)
+    endif()
+endif()
+
+if(alpaka_USE_MDSPAN STREQUAL "OFF")
+    message(STATUS "The matrixMulMdSpan example requires mdspan. Please set alpaka_USE_MDSPAN accordingly. Example disabled.")
+    return()
+endif()
+
+#-------------------------------------------------------------------------------
+# Add executable.
+
+alpaka_add_executable(
+    ${_TARGET_NAME}
+    src/matrixMulMdSpan.cpp)
+target_link_libraries(
+    ${_TARGET_NAME}
+    PUBLIC alpaka::alpaka)
+
+set_target_properties(${_TARGET_NAME} PROPERTIES FOLDER example)
+
+add_test(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME})
diff --git a/example/matrixMulWithMdspan/src/matrixMulMdSpan.cpp b/example/matrixMulWithMdspan/src/matrixMulMdSpan.cpp
new file mode 100644
index 000000000000..ea8db88157e7
--- /dev/null
+++ b/example/matrixMulWithMdspan/src/matrixMulMdSpan.cpp
@@ -0,0 +1,211 @@
+/* Copyright 2024 Mehmet Yusufoglu, Simeon Ehrig, Andrea Bocci
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#include <alpaka/alpaka.hpp>
+// Needed for running the example for all available backends, one by one.
+#include <alpaka/example/ExecuteForEachAccTag.hpp>
+
+#include <cstdlib>
+#include <iostream>
+
+//! Matrix multiplication example using the mdspan data structure.
+
+//! Some simple type traits for checking the types.
+//! IsMdspan checks whether a type is a specialization of std::experimental::mdspan.
+//! Primary template for IsMdspan (defaults to false).
+template<typename T>
+struct IsMdspan : std::false_type
+{
+};
+
+//! Specialization for mdspan: matches any std::experimental::mdspan instantiation via its
+//! four template arguments (element type, extents, layout policy, accessor policy).
+template<typename T, typename TExtents, typename TLayoutPolicy, typename TAccessorPolicy>
+struct IsMdspan<std::experimental::mdspan<T, TExtents, TLayoutPolicy, TAccessorPolicy>> : std::true_type
+{
+};
+
+template<typename T>
+inline constexpr bool is_mdspan = IsMdspan<T>::value;
+
+// Index type
+using Idx = std::size_t;
+// Set data type
+using DataType = float;
+
+/**
+ * @brief Kernel for performing the multiplication of two 2D matrices. Each element of the result is computed by a
+ * different thread. The mdspan data structure is used to pass the data to and from the kernel.
+ */
+struct MatrixMulKernel
+{
+    //! \tparam TAcc Accelerator type
+    //! \tparam TMdSpan The type of the multidimensional span (mdspan)
+    //! \param acc Accelerator
+    //! \param A First input matrix
+    //! \param B Second input matrix
+    //! \param C Output matrix where the result of A * B will be stored
+    //! The shared dimension K between A and B is taken from the extents of A.
+    template<typename TAcc, typename TMdSpan>
+    ALPAKA_FN_ACC void operator()(TAcc const& acc, TMdSpan A, TMdSpan B, TMdSpan C) const
+    {
+        // compile time checks
+        static_assert(is_mdspan<TMdSpan>, "The type TMdSpan should be a std::experimental::mdspan");
+        static_assert(TMdSpan::rank() == 2);
+
+        // A is MxK and B is KxN
+        auto const K = static_cast<Idx>(A.extent(1));
+
+        auto const i = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0];
+        auto const j = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[1];
+
+        if(i < C.extent(0) && j < C.extent(1))
+        {
+            DataType sum = 0.0f;
+            for(Idx k = 0; k < K; ++k)
+            {
+                sum += A(i, k) * B(k, j);
+            }
+            C(i, j) = sum;
+        }
+    }
+};
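+
+//! Illustrative usage of the IsMdspan trait defined above; a minimal sketch kept as a
+//! comment, mirroring the compile-time check inside the kernel (the types shown are
+//! examples, not part of this program):
+//! \code{.cpp}
+//! static_assert(is_mdspan<std::experimental::mdspan<float, std::experimental::dextents<std::size_t, 2>>>);
+//! static_assert(!is_mdspan<float*>);
+//! \endcode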
+
+// In standard projects, you typically do not execute the code with any available accelerator.
+// Instead, a single accelerator is selected once from the active accelerators and the kernels are executed with the
+// selected accelerator only. If you use this example as the starting point for your project, you can rename the
+// example() function to main() and move the accelerator tag into the function body.
+template<typename TAccTag>
+auto example(TAccTag const&) -> int
+{
+    // Set the number of dimensions (i.e. 2) as a type.
+    using Dim = alpaka::DimInt<2>;
+
+    // Define the matrix dimensions: A is MxK and B is KxN.
+    const Idx M = 1024;
+    const Idx N = 512;
+    const Idx K = 1024;
+
+    // Define the accelerator, queue and vector types.
+    using Acc = alpaka::TagToAcc<TAccTag, Dim, Idx>;
+    using Queue = alpaka::Queue<Acc, alpaka::Blocking>;
+    using Vec = alpaka::Vec<Dim, Idx>;
+
+    auto const platformHost = alpaka::PlatformCpu{};
+    auto const devHost = alpaka::getDevByIdx(platformHost, 0);
+    auto const platformAcc = alpaka::Platform<Acc>{};
+    auto const devAcc = alpaka::getDevByIdx(platformAcc, 0);
+
+    Queue queue(devAcc);
+
+    // Define the 2D extents (dimensions).
+    Vec const extentA(static_cast<Idx>(M), static_cast<Idx>(K));
+    Vec const extentB(static_cast<Idx>(K), static_cast<Idx>(N));
+    Vec const extentC(static_cast<Idx>(M), static_cast<Idx>(N));
+
+    // Allocate host memory.
+    auto bufHostA = alpaka::allocBuf<DataType, Idx>(devHost, extentA);
+    auto bufHostB = alpaka::allocBuf<DataType, Idx>(devHost, extentB);
+    auto bufHostC = alpaka::allocBuf<DataType, Idx>(devHost, extentC);
+
+    // Create mdspan views onto bufHostA and bufHostB using alpaka::experimental::getMdSpan to fill the host buffers.
+    auto mdHostA = alpaka::experimental::getMdSpan(bufHostA);
+    auto mdHostB = alpaka::experimental::getMdSpan(bufHostB);
+
+    // Initialize the host matrices.
+    for(Idx i = 0; i < M; ++i)
+    {
+        for(Idx j = 0; j < K; ++j)
+        {
+            // fill with some data
+            mdHostA(i, j) = static_cast<DataType>(i * K + j);
+        }
+    }
+    for(Idx i = 0; i < K; ++i)
+    {
+        for(Idx j = 0; j < N; ++j)
+        {
+            // fill with some data
+            mdHostB(i, j) = static_cast<DataType>(i * N + j);
+        }
+    }
+
+    // Allocate device memory.
+    auto bufDevA = alpaka::allocBuf<DataType, Idx>(devAcc, extentA);
+    auto bufDevB = alpaka::allocBuf<DataType, Idx>(devAcc, extentB);
+    auto bufDevC = alpaka::allocBuf<DataType, Idx>(devAcc, extentC);
+
+    // Copy the data to the device, using the host buffers directly (not the mdspans that were used to fill them).
+    alpaka::memcpy(queue, bufDevA, bufHostA);
+    alpaka::memcpy(queue, bufDevB, bufHostB);
+    alpaka::wait(queue);
+
+    // Create mdspan views onto the device buffers using alpaka::experimental::getMdSpan.
+    auto mdDevA = alpaka::experimental::getMdSpan(bufDevA);
+    auto mdDevB = alpaka::experimental::getMdSpan(bufDevB);
+    auto mdDevC = alpaka::experimental::getMdSpan(bufDevC);
+
+    MatrixMulKernel kernel;
+    auto const& bundledKernel = alpaka::KernelBundle(kernel, mdDevA, mdDevB, mdDevC);
+
+    // Let alpaka calculate good block and grid sizes given our full problem extent.
+    auto const workDiv = alpaka::getValidWorkDivForKernel<Acc>(
+        devAcc,
+        bundledKernel,
+        extentC,
+        Vec::ones(),
+        false,
+        alpaka::GridBlockExtentSubDivRestrictions::Unrestricted);
+
+    // Execute the kernel.
+    alpaka::exec<Acc>(queue, workDiv, kernel, mdDevA, mdDevB, mdDevC);
+
+    // Copy the result back to the host.
+    alpaka::memcpy(queue, bufHostC, bufDevC);
+    alpaka::wait(queue);
+
+    // Verify the result.
+    bool success = true;
+    auto mdHostC = alpaka::experimental::getMdSpan(bufHostC);
+    for(Idx i = 0; i < M; ++i)
+    {
+        for(Idx j = 0; j < N; ++j)
+        {
+            DataType expectedValue = 0.0f;
+            for(Idx k = 0; k < K; ++k)
+            {
+                expectedValue += mdHostA(i, k) * mdHostB(k, j);
+            }
+            if(mdHostC(i, j) != expectedValue)
+            {
+                success = false;
+                break;
+            }
+        }
+    }
+
+    std::cout << "Multiplication of matrices of size " << M << "x" << K << " and " << K << "x" << N << " using mdspan "
+              << (success ? "succeeded" : "failed") << "!" << std::endl;
+    if(!success)
+    {
+        return EXIT_FAILURE;
+    }
+
+    return EXIT_SUCCESS;
+}
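+
+// For comparison, a work division could also be specified by hand instead of calling
+// getValidWorkDivForKernel as done above. A minimal sketch, kept as a comment; the
+// 16x16 block extent is an assumption rather than a tuned value, and the bounds check
+// in the kernel makes the rounded-up grid extent safe:
+// \code{.cpp}
+// auto const blockExtent = Vec{Idx{16}, Idx{16}};
+// auto const gridExtent = Vec{(M + Idx{15}) / Idx{16}, (N + Idx{15}) / Idx{16}};
+// auto const manualWorkDiv = alpaka::WorkDivMembers<Dim, Idx>{gridExtent, blockExtent, Vec::ones()};
+// alpaka::exec<Acc>(queue, manualWorkDiv, kernel, mdDevA, mdDevB, mdDevC);
+// \endcode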
+
+auto main() -> int
+{
+    // Execute the example once for each enabled accelerator.
+    // If you would like to execute it for a single accelerator only, you can use the following code:
+    // \code{.cpp}
+    // auto tag = alpaka::TagCpuSerial{};
+    // return example(tag);
+    // \endcode
+    //
+    // valid tags:
+    // TagCpuSerial, TagGpuHipRt, TagGpuCudaRt, TagCpuOmp2Blocks, TagCpuTbbBlocks,
+    // TagCpuOmp2Threads, TagCpuSycl, TagCpuThreads,
+    // TagFpgaSyclIntel, TagGenericSycl, TagGpuSyclIntel
+    return alpaka::executeForEachAccTag([=](auto const& tag) { return example(tag); });
+}
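
Note: the example is only configured when alpaka's mdspan support is enabled (see the
alpaka_USE_MDSPAN check above). Assuming a CMake build from the repository root, one
possible configuration is:

  cmake -S . -B build -Dalpaka_BUILD_EXAMPLES=ON -Dalpaka_USE_MDSPAN=FETCH
  cmake --build build --target matrixMulMdSpan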