Matrix Multiplication example using MdSpan #2317

Merged
1 change: 1 addition & 0 deletions example/CMakeLists.txt
@@ -26,6 +26,7 @@ add_subdirectory("helloWorld/")
add_subdirectory("helloWorldLambda/")
add_subdirectory("kernelSpecialization/")
add_subdirectory("ls/")
add_subdirectory("matrixMulWithMdspan/")
add_subdirectory("monteCarloIntegration/")
add_subdirectory("openMPSchedule/")
add_subdirectory("parallelLoopPatterns/")
53 changes: 53 additions & 0 deletions example/matrixMulWithMdspan/CMakeLists.txt
@@ -0,0 +1,53 @@
#
# Copyright 2023 Simeon Ehrig, Mehmet Yusufoglu
# SPDX-License-Identifier: MPL-2.0
#

################################################################################
# Required CMake version.

cmake_minimum_required(VERSION 3.22)

set_property(GLOBAL PROPERTY USE_FOLDERS ON)

################################################################################
# Project.

set(_TARGET_NAME matrixMulMdSpan)

project(${_TARGET_NAME} LANGUAGES CXX)

#-------------------------------------------------------------------------------
# Find alpaka.

if(NOT TARGET alpaka::alpaka)
    option(alpaka_USE_SOURCE_TREE "Use alpaka's source tree instead of an alpaka installation" OFF)

    if(alpaka_USE_SOURCE_TREE)
        # Don't build the examples recursively
        set(alpaka_BUILD_EXAMPLES OFF)
        add_subdirectory("${CMAKE_CURRENT_LIST_DIR}/../.." "${CMAKE_BINARY_DIR}/alpaka")
    else()
        find_package(alpaka REQUIRED)
    endif()
endif()


if(alpaka_USE_MDSPAN STREQUAL "OFF")
    message(STATUS "The matrixMulMdSpan example requires mdspan. Please set alpaka_USE_MDSPAN accordingly. Example disabled.")
    return()
endif()

#-------------------------------------------------------------------------------
# Add executable.

alpaka_add_executable(
    ${_TARGET_NAME}
    src/matrixMulMdSpan.cpp)
target_link_libraries(
    ${_TARGET_NAME}
    PUBLIC alpaka::alpaka)

set_target_properties(${_TARGET_NAME} PROPERTIES FOLDER example)

add_test(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME})
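Note: this example is skipped at configure time unless alpaka's mdspan support is enabled. A minimal configure-and-test sequence might look like the following sketch (the FETCH value for alpaka_USE_MDSPAN is an assumption about the available option values; adjust to your setup):

    cmake -S . -B build -Dalpaka_USE_MDSPAN=FETCH
    cmake --build build --target matrixMulMdSpan
    ctest --test-dir build -R matrixMulMdSpan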
211 changes: 211 additions & 0 deletions example/matrixMulWithMdspan/src/matrixMulMdSpan.cpp
@@ -0,0 +1,211 @@
/* Copyright 2024 Mehmet Yusufoglu, Simeon Ehrig, Andrea Bocci
* SPDX-License-Identifier: MPL-2.0
*/

#include <alpaka/alpaka.hpp>
// Needed to run the example for all available backends, one by one
#include <alpaka/example/ExecuteForEachAccTag.hpp>

#include <experimental/mdspan>
#include <iostream>

//! Matrix multiplication example using the mdspan data structure

//! Some simple type traits for checking the types.
//! IsMdspan checks whether a type is a std::experimental::mdspan or not.
//! Primary template for IsMdspan (defaults to false)
template<typename T>
struct IsMdspan : std::false_type
{
};

//! Specialization for mdspan with four template arguments
template<typename ElementType, typename Extents, typename LayoutPolicy, typename AccessorPolicy>
struct IsMdspan<std::experimental::mdspan<ElementType, Extents, LayoutPolicy, AccessorPolicy>> : std::true_type
{
};

template<typename T>
inline constexpr bool is_mdspan = IsMdspan<T>::value;
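
// A quick compile-time illustration of the trait (illustrative only, with hypothetical
// instantiations; both assertions should hold with the reference mdspan implementation
// behind <experimental/mdspan>):
static_assert(
    is_mdspan<std::experimental::mdspan<float, std::experimental::dextents<std::size_t, 2>>>,
    "a 2D dynamic-extent mdspan is detected");
static_assert(!is_mdspan<float*>, "a raw pointer is not an mdspan");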

// Index type
using Idx = std::size_t;
// Set data type
using DataType = float;

/**
* @brief Kernel for performing multiplication of two 2D matrices. Each element is computed by a different thread.
* MdSpan data structure is used to pass the data to and from the kernel.
*/
struct MatrixMulKernel
{
//! \tparam TAcc Accelerator type
//! \tparam TMdSpan The type of the multidimensional span (mdspan)
//! \param acc Accelerator
//! \param A First input matrix (MxK)
//! \param B Second input matrix (KxN)
//! \param C Output matrix where the result of A * B will be stored (MxN); the shared dimension K is derived from A
    template<typename TAcc, typename TMdSpan>
    ALPAKA_FN_ACC void operator()(TAcc const& acc, TMdSpan A, TMdSpan B, TMdSpan C) const
    {
        // compile time checks
        static_assert(is_mdspan<TMdSpan>, "The type TMdSpan should be an std mdspan");
        static_assert(TMdSpan::rank() == 2);

        // A is MxK and B is KxN
        auto const K = static_cast<Idx>(A.extent(1));

        auto const i = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0];
        auto const j = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[1];

        if(i < C.extent(0) && j < C.extent(1))
        {
            DataType sum = 0.0f;
            for(Idx k = 0; k < K; ++k)
            {
                sum += A(i, k) * B(k, j);
            }
            C(i, j) = sum;
        }
    }
};

// In standard projects, you typically do not execute the code with every available accelerator.
// Instead, a single accelerator is selected once from the active accelerators and the kernels are executed with the
// selected accelerator only. If you use this example as the starting point for your project, you can rename the
// example() function to main() and move the accelerator tag to the function body.
template<typename TAccTag>
auto example(TAccTag const&) -> int
{
    // Set the number of dimensions (i.e. 2) as a type
    using Dim = alpaka::DimInt<2>;

    // Define matrix dimensions, A is MxK and B is KxN
    const Idx M = 1024;
    const Idx N = 512;
    const Idx K = 1024;

    // Define the accelerator and queue types. The accelerator is derived from the tag so that the example
    // runs once per enabled backend (hard-coding e.g. alpaka::AccCpuSerial<Dim, Idx> here would ignore the tag).
    using Acc = alpaka::TagToAcc<TAccTag, Dim, Idx>;
    using Queue = alpaka::Queue<Acc, alpaka::Blocking>;
    using Vec = alpaka::Vec<Dim, Idx>;

    auto const platformHost = alpaka::PlatformCpu{};
    auto const devHost = alpaka::getDevByIdx(platformHost, 0);
    auto const platformAcc = alpaka::Platform<Acc>{};
    auto const devAcc = alpaka::getDevByIdx(platformAcc, 0);

    Queue queue(devAcc);

    // Define the 2D extents (dimensions)
    Vec const extentA(static_cast<Idx>(M), static_cast<Idx>(K));
    Vec const extentB(static_cast<Idx>(K), static_cast<Idx>(N));
    Vec const extentC(static_cast<Idx>(M), static_cast<Idx>(N));

    // Allocate host memory
    auto bufHostA = alpaka::allocBuf<DataType, Idx>(devHost, extentA);
    auto bufHostB = alpaka::allocBuf<DataType, Idx>(devHost, extentB);
    auto bufHostC = alpaka::allocBuf<DataType, Idx>(devHost, extentC);

    // Create mdspan views for bufHostA and bufHostB using alpaka::experimental::getMdSpan to fill the host buffers
    auto mdHostA = alpaka::experimental::getMdSpan(bufHostA);
    auto mdHostB = alpaka::experimental::getMdSpan(bufHostB);

    // Initialize host matrices
    for(Idx i = 0; i < M; ++i)
    {
        for(Idx j = 0; j < K; ++j)
        {
            // fill with some data
            mdHostA(i, j) = static_cast<DataType>(i * K + j);
        }
    }
    for(Idx i = 0; i < K; ++i)
    {
        for(Idx j = 0; j < N; ++j)
        {
            // fill with some data
            mdHostB(i, j) = static_cast<DataType>(i * N + j);
        }
    }

    // Allocate device memory
    auto bufDevA = alpaka::allocBuf<DataType, Idx>(devAcc, extentA);
    auto bufDevB = alpaka::allocBuf<DataType, Idx>(devAcc, extentB);
    auto bufDevC = alpaka::allocBuf<DataType, Idx>(devAcc, extentC);

    // Copy data to the device, using the host buffers directly (not the mdspan views used to fill them)
    alpaka::memcpy(queue, bufDevA, bufHostA);
    alpaka::memcpy(queue, bufDevB, bufHostB);
    alpaka::wait(queue);

    // Create mdspan views for the device buffers using alpaka::experimental::getMdSpan
    auto mdDevA = alpaka::experimental::getMdSpan(bufDevA);
    auto mdDevB = alpaka::experimental::getMdSpan(bufDevB);
    auto mdDevC = alpaka::experimental::getMdSpan(bufDevC);

    MatrixMulKernel kernel;
    auto const& bundledKernel = alpaka::KernelBundle(kernel, mdDevA, mdDevB, mdDevC);

    // Let alpaka calculate good block and grid sizes given our full problem extent
    auto const workDiv = alpaka::getValidWorkDivForKernel<Acc>(
        devAcc,
        bundledKernel,
        extentC,
        Vec::ones(),
        false,
        alpaka::GridBlockExtentSubDivRestrictions::Unrestricted);
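    // For comparison, a work division could also be specified by hand. A sketch (illustrative
    // 16x16 blocks, not tuned; assumes the extents are divisible by the block size):
    //
    //   auto const manualWorkDiv = alpaka::WorkDivMembers<Dim, Idx>{
    //       Vec{M / 16, N / 16}, // blocks in the grid
    //       Vec{16, 16}, // threads per block
    //       Vec::ones()}; // elements per thread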

    // Execute the kernel
    alpaka::exec<Acc>(queue, workDiv, MatrixMulKernel{}, mdDevA, mdDevB, mdDevC);

    // Copy result back to host
    alpaka::memcpy(queue, bufHostC, bufDevC);
    alpaka::wait(queue);

    // Verify the result
    bool success = true;
    auto mdHostC = alpaka::experimental::getMdSpan(bufHostC);
    for(Idx i = 0; i < M; ++i)
    {
        for(Idx j = 0; j < N; ++j)
        {
            DataType expectedValue = 0.0f;
            for(Idx k = 0; k < K; ++k)
            {
                expectedValue += mdHostA(i, k) * mdHostB(k, j);
            }
            if(mdHostC(i, j) != expectedValue)
            {
                success = false;
                break;
            }
        }
    }
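    // Note: the exact floating-point comparison above assumes the host and the device accumulate
    // in the same order. A tolerance-based check is more robust; a sketch (not part of the PR,
    // needs <cmath> and <limits>):
    //
    //   if(std::abs(mdHostC(i, j) - expectedValue)
    //      > std::numeric_limits<DataType>::epsilon() * static_cast<DataType>(K) * std::abs(expectedValue))
    //       success = false;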

    std::cout << "Multiplication of matrices of size " << M << "x" << K << " and " << K << "x" << N << " using mdspan "
              << (success ? "succeeded" : "failed") << "!" << std::endl;
    if(!success)
    {
        return EXIT_FAILURE;
    }

    return EXIT_SUCCESS;
[Review comment — Member]
if success is false you should end with EXIT_FAILURE

[Reply — @mehmetyusufoglu (Contributor, Author), Jul 25, 2024]
ok, added. Thanks a lot. (Tested by ctest for both the true-positive and true-negative cases.) Other examples like convolution1d, 2d, 2dWithMdSpan are correct in this sense.

}

auto main() -> int
{
    // Execute the example once for each enabled accelerator.
    // If you would like to execute it for a single accelerator only you can use the following code.
    // \code{.cpp}
    // auto tag = TagCpuSerial;
    // return example(tag);
    // \endcode
    //
    // valid tags:
    // TagCpuSerial, TagGpuHipRt, TagGpuCudaRt, TagCpuOmp2Blocks, TagCpuTbbBlocks,
    // TagCpuOmp2Threads, TagCpuSycl, TagCpuThreads,
    // TagFpgaSyclIntel, TagGenericSycl, TagGpuSyclIntel
    return alpaka::executeForEachAccTag([=](auto const& tag) { return example(tag); });
}