-
Notifications
You must be signed in to change notification settings - Fork 74
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Matrix Multiplication example using MdSpan #2317
Merged
psychocoderHPC
merged 1 commit into
alpaka-group:develop
from
mehmetyusufoglu:matrixExamplesMdSpan
Jul 26, 2024
Merged
Changes from all commits
Commits
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,53 @@ | ||
#
# Copyright 2023 Simeon Ehrig, Mehmet Yusufoglu
# SPDX-License-Identifier: MPL-2.0
#

################################################################################
# Required CMake version.

cmake_minimum_required(VERSION 3.22)

set_property(GLOBAL PROPERTY USE_FOLDERS ON)

################################################################################
# Project.

set(_TARGET_NAME matrixMulMdSpan)

project(${_TARGET_NAME} LANGUAGES CXX)

#-------------------------------------------------------------------------------
# Find alpaka.

if(NOT TARGET alpaka::alpaka)
    option(alpaka_USE_SOURCE_TREE "Use alpaka's source tree instead of an alpaka installation" OFF)

    if(alpaka_USE_SOURCE_TREE)
        # Don't build the examples recursively
        set(alpaka_BUILD_EXAMPLES OFF)
        add_subdirectory("${CMAKE_CURRENT_LIST_DIR}/../.." "${CMAKE_BINARY_DIR}/alpaka")
    else()
        find_package(alpaka REQUIRED)
    endif()
endif()

# Skip this example when mdspan support is not enabled. Testing the variable's
# truthiness (instead of `STREQUAL "OFF"`) also disables the example when
# alpaka_USE_MDSPAN is unset or empty (e.g. with an installed alpaka that was
# built without the option), not only when it is literally "OFF".
if(NOT alpaka_USE_MDSPAN)
    message(STATUS "The matrixMulMdSpan example requires mdspan. Please set alpaka_USE_MDSPAN accordingly. Example disabled.")
    return()
endif()

#-------------------------------------------------------------------------------
# Add executable.

alpaka_add_executable(
    ${_TARGET_NAME}
    src/matrixMulMdSpan.cpp)
target_link_libraries(
    ${_TARGET_NAME}
    PUBLIC alpaka::alpaka)

set_target_properties(${_TARGET_NAME} PROPERTIES FOLDER example)

add_test(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME})
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,211 @@ | ||
/* Copyright 2024 Mehmet Yusufoglu, Simeon Ehrig, Andrea Bocci | ||
* SPDX-License-Identifier: MPL-2.0 | ||
*/ | ||
|
||
#include <alpaka/alpaka.hpp> | ||
// Needed for running example for all backends available; one by one | ||
#include <alpaka/example/ExecuteForEachAccTag.hpp> | ||
|
||
#include <experimental/mdspan> | ||
#include <iostream> | ||
|
||
//! Matrix multiplication example by using mdspan data structure | ||
|
||
//! Some simple type traits for checking the types | ||
//! isMdspan simply checks if a type is of type std::experimental::mdspan or not | ||
//! Primary template for is_mdspan (defaults to false) | ||
template<typename T> | ||
struct IsMdspan : std::false_type | ||
{ | ||
}; | ||
|
||
//! Specialization for mdspan with four template arguments | ||
template<typename ElementType, typename Extents, typename LayoutPolicy, typename AccessorPolicy> | ||
struct IsMdspan<std::experimental::mdspan<ElementType, Extents, LayoutPolicy, AccessorPolicy>> : std::true_type | ||
{ | ||
}; | ||
|
||
template<typename T> | ||
inline constexpr bool is_mdspan = IsMdspan<T>::value; | ||
|
||
// Index type | ||
using Idx = std::size_t; | ||
// Set data type | ||
using DataType = float; | ||
|
||
/**
 * @brief Kernel for performing multiplication of two 2D matrices. Each element of the output matrix is computed by
 * a different thread. MdSpan data structures are used to pass the data to and from the kernel.
 */
struct MatrixMulKernel
{
    //! \tparam TAcc Accelerator type
    //! \tparam TMdSpan The type of the multidimensional span (mdspan); must be a rank-2 std::experimental::mdspan
    //! \param acc Accelerator
    //! \param A First input matrix, shape MxK
    //! \param B Second input matrix, shape KxN (the shared dimension K is read from A.extent(1))
    //! \param C Output matrix, shape MxN, where the result of A * B will be stored
    template<typename TAcc, typename TMdSpan>
    ALPAKA_FN_ACC void operator()(TAcc const& acc, TMdSpan A, TMdSpan B, TMdSpan C) const
    {
        // Compile-time checks: the kernel only accepts rank-2 std mdspans.
        static_assert(is_mdspan<TMdSpan>, "The type TMdSpan should be an std mdspan");
        static_assert(TMdSpan::rank() == 2);

        // A is MxK and B is KxN; K is the inner (shared) dimension of the product.
        auto const K = static_cast<Idx>(A.extent(1));

        // Each thread's global grid index selects the output element (i, j) it computes.
        auto const i = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0];
        auto const j = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[1];

        // Guard: the launch grid may be larger than C, so out-of-range threads do nothing.
        if(i < C.extent(0) && j < C.extent(1))
        {
            // Dot product of row i of A with column j of B.
            DataType sum = 0.0f;
            for(Idx k = 0; k < K; ++k)
            {
                sum += A(i, k) * B(k, j);
            }
            C(i, j) = sum;
        }
    }
};
|
||
// In standard projects, you typically do not execute the code with any available accelerator. | ||
// Instead, a single accelerator is selected once from the active accelerators and the kernels are executed with the | ||
// selected accelerator only. If you use the example as the starting point for your project, you can rename the | ||
// example() function to main() and move the accelerator tag to the function body. | ||
template<typename TAccTag> | ||
auto example(TAccTag const&) -> int | ||
{ | ||
// Set number of dimensions (i.e 2) as a type | ||
using Dim = alpaka::DimInt<2>; | ||
|
||
// Define matrix dimensions, A is MxK and B is KxN | ||
const Idx M = 1024; | ||
const Idx N = 512; | ||
const Idx K = 1024; | ||
|
||
// Define device and queue | ||
using Acc = alpaka::AccCpuSerial<Dim, Idx>; | ||
using Queue = alpaka::Queue<Acc, alpaka::Blocking>; | ||
using Vec = alpaka::Vec<Dim, Idx>; | ||
|
||
auto const platformHost = alpaka::PlatformCpu{}; | ||
auto const devHost = alpaka::getDevByIdx(platformHost, 0); | ||
auto const platformAcc = alpaka::Platform<Acc>{}; | ||
auto const devAcc = alpaka::getDevByIdx(platformAcc, 0); | ||
|
||
Queue queue(devAcc); | ||
|
||
// Define the 2D extents (dimensions) | ||
Vec const extentA(static_cast<Idx>(M), static_cast<Idx>(K)); | ||
Vec const extentB(static_cast<Idx>(K), static_cast<Idx>(N)); | ||
Vec const extentC(static_cast<Idx>(M), static_cast<Idx>(N)); | ||
|
||
// Allocate host memory | ||
auto bufHostA = alpaka::allocBuf<DataType, Idx>(devHost, extentA); | ||
auto bufHostB = alpaka::allocBuf<DataType, Idx>(devHost, extentB); | ||
auto bufHostC = alpaka::allocBuf<DataType, Idx>(devHost, extentC); | ||
|
||
// Create mdspan view for bufHostA and bufHostB using alpaka::experimental::getMdSpan to fill the host buffers | ||
auto mdHostA = alpaka::experimental::getMdSpan(bufHostA); | ||
auto mdHostB = alpaka::experimental::getMdSpan(bufHostB); | ||
|
||
// Initialize host matrices | ||
for(Idx i = 0; i < M; ++i) | ||
{ | ||
for(Idx j = 0; j < K; ++j) | ||
{ | ||
// fill with some data | ||
mdHostA(i, j) = static_cast<DataType>(i * K + j); | ||
} | ||
} | ||
for(Idx i = 0; i < K; ++i) | ||
{ | ||
for(Idx j = 0; j < N; ++j) | ||
{ | ||
// fill with some data | ||
mdHostB(i, j) = static_cast<DataType>(i * N + j); | ||
} | ||
} | ||
|
||
// Allocate device memory | ||
auto bufDevA = alpaka::allocBuf<DataType, Idx>(devAcc, extentA); | ||
auto bufDevB = alpaka::allocBuf<DataType, Idx>(devAcc, extentB); | ||
auto bufDevC = alpaka::allocBuf<DataType, Idx>(devAcc, extentC); | ||
|
||
// Copy data to device, use directly host buffers (not mdspans used to fill the data) | ||
alpaka::memcpy(queue, bufDevA, bufHostA); | ||
alpaka::memcpy(queue, bufDevB, bufHostB); | ||
alpaka::wait(queue); | ||
|
||
// Create mdspan views for device buffers using alpaka::experimental::getMdSpan | ||
auto mdDevA = alpaka::experimental::getMdSpan(bufDevA); | ||
auto mdDevB = alpaka::experimental::getMdSpan(bufDevB); | ||
auto mdDevC = alpaka::experimental::getMdSpan(bufDevC); | ||
|
||
MatrixMulKernel kernel; | ||
auto const& bundeledKernel = alpaka::KernelBundle(kernel, mdDevA, mdDevB, mdDevC); | ||
|
||
// Let alpaka calculate good block and grid sizes given our full problem extent | ||
auto const workDiv = alpaka::getValidWorkDivForKernel<Acc>( | ||
devAcc, | ||
bundeledKernel, | ||
extentC, | ||
Vec::ones(), | ||
false, | ||
alpaka::GridBlockExtentSubDivRestrictions::Unrestricted); | ||
|
||
// Execute the kernel | ||
alpaka::exec<Acc>(queue, workDiv, MatrixMulKernel{}, mdDevA, mdDevB, mdDevC); | ||
|
||
// Copy result back to host | ||
alpaka::memcpy(queue, bufHostC, bufDevC); | ||
alpaka::wait(queue); | ||
|
||
// Verify the result | ||
bool success = true; | ||
auto mdHostC = alpaka::experimental::getMdSpan(bufHostC); | ||
for(Idx i = 0; i < M; ++i) | ||
{ | ||
for(Idx j = 0; j < N; ++j) | ||
{ | ||
DataType expectedValue = 0.0f; | ||
for(Idx k = 0; k < K; ++k) | ||
{ | ||
expectedValue += mdHostA(i, k) * mdHostB(k, j); | ||
} | ||
if(mdHostC(i, j) != expectedValue) | ||
{ | ||
success = false; | ||
break; | ||
} | ||
} | ||
} | ||
|
||
std::cout << "Multiplication of matrices of size " << M << "x" << K << " and " << K << "x" << N << " using mdspan " | ||
<< (success ? "succeeded" : "failed") << "!" << std::endl; | ||
if(!success) | ||
{ | ||
return EXIT_FAILURE; | ||
} | ||
|
||
return EXIT_SUCCESS; | ||
} | ||
|
||
auto main() -> int | ||
{ | ||
// Execute the example once for each enabled accelerator. | ||
// If you would like to execute it for a single accelerator only you can use the following code. | ||
// \code{.cpp} | ||
// auto tag = TagCpuSerial; | ||
// return example(tag); | ||
// \endcode | ||
// | ||
// valid tags: | ||
// TagCpuSerial, TagGpuHipRt, TagGpuCudaRt, TagCpuOmp2Blocks, TagCpuTbbBlocks, | ||
// TagCpuOmp2Threads, TagCpuSycl, TagCpuTbbBlocks, TagCpuThreads, | ||
// TagFpgaSyclIntel, TagGenericSycl, TagGpuSyclIntel | ||
return alpaka::executeForEachAccTag([=](auto const& tag) { return example(tag); }); | ||
} |
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If `success` is false, you should end with EXIT_FAILURE.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ok, added — thanks a lot. (Tested with `ctest` for both the true-positive and true-negative cases.) Other examples such as convolution1d, convolution2d, and convolution2dWithMdSpan are already correct in this sense.