From 31481cea2c7657ab16774aff4082b11c2d4373d3 Mon Sep 17 00:00:00 2001 From: mmoadeli Date: Mon, 30 Oct 2023 17:12:17 +0000 Subject: [PATCH] [SYCL][HIP] Support of AMD matrix core instructions (#11485) * Support one block AMD matrix core instructions for `__gfx90a__` architecture. * Supports `__builtin_amdgcn_mfma_i32_32x32x8i8`, `__builtin_amdgcn_mfma_i32_16x16x16i8`, `__builtin_amdgcn_mfma_f64_16x16x4f64`, `__builtin_amdgcn_mfma_f32_32x32x8bf16_1k`, `__builtin_amdgcn_mfma_f32_16x16x16bf16_1k`, `__builtin_amdgcn_mfma_f32_32x32x8f16` and `__builtin_amdgcn_mfma_f32_16x16x16f16` instructions. * Add HIP matrix core support into joint_matrix documentation. Should be merged after - https://github.com/intel/llvm/pull/11215 --------- Co-authored-by: Bing1 Yu Co-authored-by: mmoadeli --- .../sycl_ext_oneapi_matrix.asciidoc | 38 +- sycl/include/sycl/detail/defines.hpp | 8 +- .../sycl/ext/oneapi/matrix/matrix-hip.hpp | 404 ++++++++++++++++++ .../sycl/ext/oneapi/matrix/matrix-intel.hpp | 4 +- .../sycl/ext/oneapi/matrix/matrix-unified.hpp | 79 +++- .../Matrix/joint_matrix_hip_apply.hpp | 99 +++++ .../test-e2e/Matrix/joint_matrix_hip_copy.hpp | 110 +++++ .../test-e2e/Matrix/joint_matrix_hip_fill.hpp | 87 ++++ .../Matrix/joint_matrix_hip_gfx90a.cpp | 52 +++ .../Matrix/joint_matrix_hip_half_gfx90a.cpp | 35 ++ .../test-e2e/Matrix/joint_matrix_hip_mfma.hpp | 115 +++++ sycl/test-e2e/lit.cfg.py | 2 + .../matrix/matrix-hip-bfloat16-float-test.cpp | 72 ++++ .../matrix/matrix-hip-double-double-test.cpp | 53 +++ .../hip/matrix/matrix-hip-half-float-test.cpp | 71 +++ .../hip/matrix/matrix-hip-int8-int32-test.cpp | 71 +++ 16 files changed, 1268 insertions(+), 32 deletions(-) create mode 100644 sycl/include/sycl/ext/oneapi/matrix/matrix-hip.hpp create mode 100644 sycl/test-e2e/Matrix/joint_matrix_hip_apply.hpp create mode 100644 sycl/test-e2e/Matrix/joint_matrix_hip_copy.hpp create mode 100644 sycl/test-e2e/Matrix/joint_matrix_hip_fill.hpp create mode 100644 sycl/test-e2e/Matrix/joint_matrix_hip_gfx90a.cpp create mode 100644 sycl/test-e2e/Matrix/joint_matrix_hip_half_gfx90a.cpp create mode 100644 sycl/test-e2e/Matrix/joint_matrix_hip_mfma.hpp create mode 100644 sycl/test/check_device_code/hip/matrix/matrix-hip-bfloat16-float-test.cpp create mode 100644 sycl/test/check_device_code/hip/matrix/matrix-hip-double-double-test.cpp create mode 100644 sycl/test/check_device_code/hip/matrix/matrix-hip-half-float-test.cpp create mode 100644 sycl/test/check_device_code/hip/matrix/matrix-hip-int8-int32-test.cpp diff --git a/sycl/doc/extensions/experimental/sycl_ext_matrix/sycl_ext_oneapi_matrix.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_matrix/sycl_ext_oneapi_matrix.asciidoc index aa103ebb3d282..ae92ffc505265 100644 --- a/sycl/doc/extensions/experimental/sycl_ext_matrix/sycl_ext_oneapi_matrix.asciidoc +++ b/sycl/doc/extensions/experimental/sycl_ext_matrix/sycl_ext_oneapi_matrix.asciidoc @@ -50,7 +50,7 @@ specification.* This extension is currently implemented in {dpcpp} only for devices that contain a matrix hardware, specifically Intel(R) Advanced Matrix Extensions (Intel(R) AMX), Intel(R) Xe Matrix Extensions (Intel(R) -XMX) and Nvidia(R) Tensor Cores. +XMX), Nvidia(R) Tensor Cores and AMD Matrix Cores(R). The `joint_matrix` type and the `joint_matrix_mad` function are optional kernel features as defined in section 5.7 of the core SYCL @@ -67,8 +67,8 @@ implementation throws a synchronous exception with the == Overview Joint matrix is a SYCL extension for matrix hardware programming. It -unifies targets like Intel AMX in CPUs, Intel XMX in Intel GPUs and -Nvidia Tensor Cores. This provides a portable and performant API for +unifies targets like Intel AMX in CPUs, Intel XMX in Intel GPUs, +Nvidia Tensor Cores and AMD Matrix Cores(R). This provides a portable and performant API for users who want to build their own neural networks applications, perform custom optimizations, or experiment with new operations in a timely and performing manner. @@ -921,7 +921,8 @@ the type of the A matrix must be the same as the type of the B matrix. IMPORTANT: When compiling for the `ext_oneapi_cuda` backend the target -arch backend flag, `-Xsycl-target-backend --cuda-gpu-arch=sm_xx`, must +arch backend flag, `-fsycl-targets=nvidia_gpu_sm_xx` +(or equivalents, e.g. `-Xsycl-target-backend --cuda-gpu-arch=sm_xx`), must be used, where `sm_xx` must be a Compute Capability that is equal to or greater than the appropriate Minimum Compute Capability. When an executable has been compiled for `sm_xx`, if the executable is run on @@ -971,6 +972,34 @@ multiple of 4 when `T` is `float`; where `T` is the type of the `joint_matrix` elements. When `T` is not `half` or `float` there are no restrictions to `stride`. +==== AMD Matrix Cores Supported Combinations +The complete set of matrix data types and dimensions that are supported by +the `ext_oneapi_hip` backend are represented in the following +table. In this architecture's implementation, A and B matrices must have the same type. +Similarly, C and D matrices must share the same type. + +IMPORTANT: The supported instructions may be run on GFX90A (MI200, MI210, MI250 and MI250X GPUs) +architecture. When compiling for the `ext_oneapi_hip` backend the +target arch backend flag, `-fsycl-targets=amd_gpu_gfx90a`, must +be used. An attempt to run the compiled code on an unsupported architecture will throw an error. + + +[frame="none",options="header"] +|====================== +| A and B type | C and D type | M | N | K +.2+| `matrix_type::fp16` .2+| `matrix_type::fp32` +|32 |32 |8 +|16 |16 |16 +.2+| `matrix_type::sint8` .2+| `matrix_type::sint32` +|32 |32 |8 +|16 |16 |16 +.2+|`matrix_type::bf16` .2+|`matrix_type::fp32` +|32 |32 |8 +|16 |16 |16 +.1+|`matrix_type::fp64` .1+| `matrix_type::fp64` +|16 |16 |4 +|====================== + === Revision History [frame="none",options="header"] @@ -990,4 +1019,5 @@ the Intel-specifics to a separate extension document type, runtime query, and supported combinations appendix for Intel AMX and Intel XMX |7 |2023-04-11 |Jack Kirk |Add Nvidia Tensor Cores supported combinations +|8 |2023-10-05 |Mahmoud Moadeli |Add AMD Matrix Core supported combinations |====================== diff --git a/sycl/include/sycl/detail/defines.hpp b/sycl/include/sycl/detail/defines.hpp index 5d44727d71fb1..de2de047528b1 100644 --- a/sycl/include/sycl/detail/defines.hpp +++ b/sycl/include/sycl/detail/defines.hpp @@ -39,9 +39,11 @@ #define __SYCL_TYPE(x) #endif -// joint matrix should only be included by default for SPIR or NVPTX backends -#if defined __SPIR__ || defined __NVPTX__ || !defined __SYCL_DEVICE_ONLY__ +// joint matrix should only be included by default for SPIR, NVPTX or HIP(GFX90A +// only) backends +#if defined __SPIR__ || defined __NVPTX__ || !defined __SYCL_DEVICE_ONLY__ || \ + defined __gfx90a__ #ifndef SYCL_EXT_ONEAPI_MATRIX_VERSION #define SYCL_EXT_ONEAPI_MATRIX_VERSION 4 #endif // SYCL_EXT_ONEAPI_MATRIX_VERSION -#endif // __SPIR__ || __NVPTX__ || !__SYCL_DEVICE_ONLY +#endif // __SPIR__ || __NVPTX__ || !__SYCL_DEVICE_ONLY || __gfx90a__ diff --git a/sycl/include/sycl/ext/oneapi/matrix/matrix-hip.hpp b/sycl/include/sycl/ext/oneapi/matrix/matrix-hip.hpp new file mode 100644 index 0000000000000..7f9f9b1219cf4 --- /dev/null +++ b/sycl/include/sycl/ext/oneapi/matrix/matrix-hip.hpp @@ -0,0 +1,404 @@ + +//===-------- matrix-hip.hpp - matrix ext impl ---*- C++ -*-------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// ===-------------------------------------------------------------------=== // + +#pragma once +#include "matrix-unified-utils.hpp" +#include + +#define __HIP_PLATFORM_AMD_MFMA__ + +namespace sycl { +inline namespace _V1 { +namespace ext { +namespace oneapi { +namespace detail { + +constexpr int WAVEFRONT_SIZE = 64; + +template +struct joint_matrix_hip; + +using bfloat16x4 = __attribute__((__vector_size__(4 * sizeof(__bf16)))) __fp16; +using float16x4 = __attribute__((__vector_size__(4 * sizeof(__fp16)))) __fp16; +using floatx4 = __attribute__((__vector_size__(4 * sizeof(float)))) float; +using floatx16 = __attribute__((__vector_size__(16 * sizeof(float)))) float; +using int32x4 = __attribute__((__vector_size__(4 * sizeof(int32_t)))) int; +using int32x16 = __attribute__((__vector_size__(16 * sizeof(int32_t)))) int; +using doublex4 = __attribute__((__vector_size__(4 * sizeof(double)))) double; + +template struct to_hip_type { + using type = T; +}; + +template <> struct to_hip_type { + using type = __bf16; +}; + +template <> struct to_hip_type { + using type = __fp16; +}; + +template <> struct to_hip_type { + using type = int32_t; +}; + +#undef __SYCL_JOINT_MATRIX_OVERLOAD_ARR + +#define __SYCL_JOINT_MATRIX_OVERLOAD_ARR(TYPE, USE, M, N, SIZE) \ + template \ + struct joint_matrix_hip< \ + TYPE, sycl::ext::oneapi::experimental::matrix::use::USE, M, N, Layout, \ + typename std::enable_if_t< \ + Layout == \ + sycl::ext::oneapi::experimental::matrix::layout::row_major || \ + Layout == \ + sycl::ext::oneapi::experimental::matrix::layout::col_major>> { \ + sycl::marray wi_marray; \ + }; + +__SYCL_JOINT_MATRIX_OVERLOAD_ARR(bfloat16, a, 16, 16, 4) +__SYCL_JOINT_MATRIX_OVERLOAD_ARR(bfloat16, b, 16, 16, 4) +__SYCL_JOINT_MATRIX_OVERLOAD_ARR(bfloat16, a, 32, 8, 4) +__SYCL_JOINT_MATRIX_OVERLOAD_ARR(bfloat16, b, 8, 32, 4) + +__SYCL_JOINT_MATRIX_OVERLOAD_ARR(half, a, 16, 16, 4) +__SYCL_JOINT_MATRIX_OVERLOAD_ARR(half, b, 16, 16, 4) +__SYCL_JOINT_MATRIX_OVERLOAD_ARR(half, a, 32, 8, 4) +__SYCL_JOINT_MATRIX_OVERLOAD_ARR(half, b, 8, 32, 4) + +__SYCL_JOINT_MATRIX_OVERLOAD_ARR(double, a, 16, 4, 1) +__SYCL_JOINT_MATRIX_OVERLOAD_ARR(double, b, 4, 16, 1) + +__SYCL_JOINT_MATRIX_OVERLOAD_ARR(int8_t, a, 32, 8, 4) +__SYCL_JOINT_MATRIX_OVERLOAD_ARR(int8_t, b, 8, 32, 4) +__SYCL_JOINT_MATRIX_OVERLOAD_ARR(int8_t, a, 16, 16, 4) +__SYCL_JOINT_MATRIX_OVERLOAD_ARR(int8_t, b, 16, 16, 4) + +#undef __SYCL_JOINT_MATRIX_OVERLOAD_ARR + +#define __SYCL_JOINT_MATRIX_OVERLOAD_ARR_ACC(TYPE, M, N) \ + template <> \ + struct joint_matrix_hip< \ + TYPE, sycl::ext::oneapi::experimental::matrix::use::accumulator, M, N, \ + sycl::ext::oneapi::experimental::matrix::layout::dynamic> { \ + sycl::marray wi_marray; \ + }; + +__SYCL_JOINT_MATRIX_OVERLOAD_ARR_ACC(float, 16, 16) +__SYCL_JOINT_MATRIX_OVERLOAD_ARR_ACC(float, 32, 32) +__SYCL_JOINT_MATRIX_OVERLOAD_ARR_ACC(double, 16, 16) +__SYCL_JOINT_MATRIX_OVERLOAD_ARR_ACC(int32_t, 32, 32) +__SYCL_JOINT_MATRIX_OVERLOAD_ARR_ACC(int32_t, 16, 16) + +#undef __SYCL_JOINT_MATRIX_OVERLOAD_ARR_ACC + +template +void load_accumulator_layoutT( + joint_matrix_hip< + S, sycl::ext::oneapi::experimental::matrix::use::accumulator, M, N, + sycl::ext::oneapi::experimental::matrix::layout::dynamic> &res, + multi_ptr src, size_t stride, Group &sg) { + const auto idx = sg.get_group_linear_id() * sg.get_local_range()[0] + + sg.get_local_linear_id(); + + if constexpr (std::is_same_v) { + const auto thread_x = idx % N; + const auto thread_y = idx / N; + + if constexpr (Layout == + sycl::ext::oneapi::experimental::matrix::layout::row_major) { + for (int i = 0; i < 4; ++i) { + const int s_idx = thread_x + i * 4 * stride + thread_y * stride; + res.wi_marray[i] = src[s_idx]; + } + } else { + for (int i = 0; i < 4; ++i) { + const int s_idx = i * 4 + thread_x * stride + thread_y; + res.wi_marray[i] = src[s_idx]; + } + } + } else if constexpr (std::is_same_v || std::is_same_v) { + if constexpr (M == 16 && N == 16) { + const auto thread_x = idx % N; + const auto thread_y = idx / N; + + if constexpr (Layout == sycl::ext::oneapi::experimental::matrix::layout:: + row_major) { + for (int i = 0; i < 4; ++i) { + const int s_idx = thread_x + i * stride + thread_y * 4 * stride; + res.wi_marray[i] = src[s_idx]; + } + } else { + for (int i = 0; i < 4; ++i) { + const int s_idx = i + thread_x * stride + thread_y * 4; + res.wi_marray[i] = src[s_idx]; + } + } + } else if constexpr (M == 32 && N == 32) { + const auto thread_x = idx % N; + const auto thread_y = idx / N; + + if constexpr (Layout == sycl::ext::oneapi::experimental::matrix::layout:: + row_major) { + for (int j = 0; j < 4; ++j) { + for (int i = 0; i < 4; ++i) { + const int s_idx = + thread_x + i * stride + thread_y * 4 * stride + j * 8 * N; + res.wi_marray[i + 4 * j] = src[s_idx]; + } + } + } else { + for (int j = 0; j < 4; ++j) { + for (int i = 0; i < 4; ++i) { + const int s_idx = i + thread_x * stride + thread_y * 4 + j * 8; + res.wi_marray[i + 4 * j] = src[s_idx]; + } + } + } + } + } +} + +template < + typename Group, typename S, typename T, size_t M, size_t N, + access::address_space Space, access::decorated IsDecorated, + typename = std::enable_if_t>>> +void load_accumulator_hip( + joint_matrix_hip< + S, sycl::ext::oneapi::experimental::matrix::use::accumulator, M, N, + sycl::ext::oneapi::experimental::matrix::layout::dynamic> &res, + multi_ptr src, size_t stride, + sycl::ext::oneapi::experimental::matrix::layout layout, Group &sg) { + if (layout == sycl::ext::oneapi::experimental::matrix::layout::row_major) + load_accumulator_layoutT< + sycl::ext::oneapi::experimental::matrix::layout::row_major>(res, src, + stride, sg); + else + load_accumulator_layoutT< + sycl::ext::oneapi::experimental::matrix::layout::col_major>(res, src, + stride, sg); +} + +template < + typename Group, typename S, typename T, size_t M, size_t N, + sycl::ext::oneapi::experimental::matrix::use Use, + sycl::ext::oneapi::experimental::matrix::layout Layout, + access::address_space Space, access::decorated IsDecorated, + typename = typename std::enable_if_t< + (Layout == sycl::ext::oneapi::experimental::matrix::layout::row_major || + Layout == + sycl::ext::oneapi::experimental::matrix::layout::col_major) && + std::is_same_v>>> +void load_multiplicand_hip(joint_matrix_hip &res, + multi_ptr src, size_t stride, + Group &sg) { + const auto idx = sg.get_group_linear_id() * sg.get_local_range()[0] + + sg.get_local_linear_id(); + + if constexpr (std::is_same_v) { + if constexpr (Layout == + sycl::ext::oneapi::experimental::matrix::layout::row_major) { + res.wi_marray[0] = src[idx]; + } else { + res.wi_marray[0] = src[(idx % M) * stride + idx / M]; + } + } else { + constexpr int Dim = (M == 16) ? 16 : 32; + + const auto thread_x = idx % Dim; + const auto thread_y = idx / Dim; + + if constexpr (Layout == + sycl::ext::oneapi::experimental::matrix::layout::col_major) { + for (int i = 0; i < 4; ++i) { + const int c_idx = thread_x * stride + i + thread_y * 4; + res.wi_marray[i] = src[c_idx]; + } + } else { + for (int i = 0; i < 4; ++i) { + const int r_idx = thread_x + i * stride + thread_y * stride * 4; + res.wi_marray[i] = src[r_idx]; + } + } + } +} + +template +void store_layoutT( + const joint_matrix_hip< + T, sycl::ext::oneapi::experimental::matrix::use::accumulator, M, N, + sycl::ext::oneapi::experimental::matrix::layout::dynamic> &src, + multi_ptr dst, size_t stride, Group &sg) { + const auto idx = sg.get_group_linear_id() * sg.get_local_range()[0] + + sg.get_local_linear_id(); + + if constexpr (std::is_same_v) { + const auto thread_x = idx % N; + const auto thread_y = idx / N; + + if constexpr (Layout == + sycl::ext::oneapi::experimental::matrix::layout::row_major) { + for (int i = 0; i < 4; ++i) { + const int d_idx = thread_x + i * 4 * stride + thread_y * stride; + dst[d_idx] = src.wi_marray[i]; + } + } else { + for (int i = 0; i < 4; ++i) { + const int d_idx = i * 4 + thread_x * stride + thread_y; + dst[d_idx] = src.wi_marray[i]; + } + } + } else if constexpr (std::is_same_v || std::is_same_v) { + if constexpr (M == 16 && N == 16) { + const auto thread_x = idx % N; + const auto thread_y = idx / N; + + if constexpr (Layout == sycl::ext::oneapi::experimental::matrix::layout:: + row_major) { + for (int i = 0; i < 4; ++i) { + const int d_idx = thread_x + i * stride + thread_y * 4 * stride; + dst[d_idx] = src.wi_marray[i]; + } + } else { + for (int i = 0; i < 4; ++i) { + const int d_idx = i + thread_x * stride + thread_y * 4; + dst[d_idx] = src.wi_marray[i]; + } + } + } else if constexpr (M == 32 && N == 32) { + const auto thread_x = idx % N; + const auto thread_y = idx / N; + + if constexpr (Layout == sycl::ext::oneapi::experimental::matrix::layout:: + row_major) { + for (int j = 0; j < 4; ++j) { + for (int i = 0; i < 4; ++i) { + const int d_idx = + thread_x + i * stride + thread_y * 4 * stride + j * 8 * stride; + dst[d_idx] = src.wi_marray[i + 4 * j]; + } + } + } else { + for (int j = 0; j < 4; ++j) { + for (int i = 0; i < 4; ++i) { + const int d_idx = i + thread_x * stride + thread_y * 4 + j * 8; + dst[d_idx] = src.wi_marray[i + 4 * j]; + } + } + } + } + } +} + +template +void joint_matrix_store_hip( + const joint_matrix_hip< + T, sycl::ext::oneapi::experimental::matrix::use::accumulator, M, N, + sycl::ext::oneapi::experimental::matrix::layout::dynamic> &src, + multi_ptr dst, size_t stride, + sycl::ext::oneapi::experimental::matrix::layout layout, Group &sg) { + if (sycl::ext::oneapi::experimental::matrix::layout::row_major == layout) { + store_layoutT( + src, dst, stride, sg); + } else { + store_layoutT( + src, dst, stride, sg); + } +} + +template +void joint_matrix_mad_hip( + joint_matrix_hip< + Tc, sycl::ext::oneapi::experimental::matrix::use::accumulator, M, N, + sycl::ext::oneapi::experimental::matrix::layout::dynamic> &D, + const joint_matrix_hip &A, + const joint_matrix_hip &B, + const joint_matrix_hip< + Tc, sycl::ext::oneapi::experimental::matrix::use::accumulator, M, N, + sycl::ext::oneapi::experimental::matrix::layout::dynamic> &C) { + if constexpr (std::is_same_v) { + if constexpr (M == 16 && N == 16) { + auto result = __builtin_amdgcn_mfma_f32_16x16x16f16( + *reinterpret_cast(&A.wi_marray), + *reinterpret_cast(&B.wi_marray), + *reinterpret_cast(&C.wi_marray), 0, 0, 0); + for (int i = 0; i < 4; ++i) + D.wi_marray[i] = result[i]; + } else if constexpr (M == 32 && N == 32) { + auto result = __builtin_amdgcn_mfma_f32_32x32x8f16( + *reinterpret_cast(&A.wi_marray), + *reinterpret_cast(&B.wi_marray), + *reinterpret_cast(&C.wi_marray), 0, 0, 0); + for (int i = 0; i < 16; ++i) + D.wi_marray[i] = result[i]; + } + } else if constexpr (std::is_same_v) { + if constexpr (M == 16 && N == 16) { + auto result = __builtin_amdgcn_mfma_f32_16x16x16bf16_1k( + *reinterpret_cast(&A.wi_marray), + *reinterpret_cast(&B.wi_marray), + *reinterpret_cast(&C.wi_marray), 0, 0, 0); + for (int i = 0; i < 4; ++i) + D.wi_marray[i] = result[i]; + } else if constexpr (M == 32 && N == 32) { + auto result = __builtin_amdgcn_mfma_f32_32x32x8bf16_1k( + *reinterpret_cast(&A.wi_marray), + *reinterpret_cast(&B.wi_marray), + *reinterpret_cast(&C.wi_marray), 0, 0, 0); + for (int i = 0; i < 16; ++i) + D.wi_marray[i] = result[i]; + } + } else if constexpr (std::is_same_v) { + if constexpr (M == 16 && N == 16) { + auto result = __builtin_amdgcn_mfma_f64_16x16x4f64( + A.wi_marray[0], B.wi_marray[0], + *reinterpret_cast(&C.wi_marray), 0, 0, 0); + for (int i = 0; i < 4; ++i) + D.wi_marray[i] = result[i]; + } + } else if constexpr (std::is_same_v) { + if constexpr (M == 16 && N == 16) { + auto result = __builtin_amdgcn_mfma_i32_16x16x16i8( + *reinterpret_cast(&A.wi_marray), + *reinterpret_cast(&B.wi_marray), + *reinterpret_cast(&C.wi_marray), 0, 0, 0); + for (int i = 0; i < 4; ++i) + D.wi_marray[i] = result[i]; + } else if constexpr (M == 32 && N == 32) { + auto result = __builtin_amdgcn_mfma_i32_32x32x8i8( + *reinterpret_cast(&A.wi_marray), + *reinterpret_cast(&B.wi_marray), + *reinterpret_cast(&C.wi_marray), 0, 0, 0); + for (int i = 0; i < 16; ++i) + D.wi_marray[i] = result[i]; + } + } +} + +} // namespace detail +} // namespace oneapi +} // namespace ext +} // namespace _V1 +} // namespace sycl diff --git a/sycl/include/sycl/ext/oneapi/matrix/matrix-intel.hpp b/sycl/include/sycl/ext/oneapi/matrix/matrix-intel.hpp index b852e3f1ff3f5..e1efa463939dd 100644 --- a/sycl/include/sycl/ext/oneapi/matrix/matrix-intel.hpp +++ b/sycl/include/sycl/ext/oneapi/matrix/matrix-intel.hpp @@ -533,8 +533,8 @@ inline __SYCL_ALWAYS_INLINE void joint_matrix_apply( #if defined(__SYCL_DEVICE_ONLY__) #if defined(__NVPTX__) std::ignore = sg; - for (int i = 0; i < jm.cuda_impl.wi_marray.size(); i++) { - lambda(jm.cuda_impl.wi_marray[i]); + for (int i = 0; i < jm.matrix_impl.wi_marray.size(); i++) { + lambda(jm.matrix_impl.wi_marray[i]); } #else // NVPTX using storage_element_type = diff --git a/sycl/include/sycl/ext/oneapi/matrix/matrix-unified.hpp b/sycl/include/sycl/ext/oneapi/matrix/matrix-unified.hpp index 3d128633c0735..d8a751680900a 100644 --- a/sycl/include/sycl/ext/oneapi/matrix/matrix-unified.hpp +++ b/sycl/include/sycl/ext/oneapi/matrix/matrix-unified.hpp @@ -10,9 +10,13 @@ #include "matrix-intel.hpp" -#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) +#if defined(__SYCL_DEVICE_ONLY__) +#if defined(__NVPTX__) #include "matrix-tensorcores.hpp" -#endif +#elif defined(__gfx90a__) +#include "matrix-hip.hpp" +#endif // defined(__NVPTX__) +#endif // defined(__SYCL_DEVICE_ONLY__) #include // for address_space #include // for __SYCL_ALWAYS_... @@ -42,15 +46,17 @@ struct joint_matrix { #if defined(__SYCL_DEVICE_ONLY__) #if defined(__NVPTX__) sycl::ext::oneapi::detail::joint_matrix_cuda - cuda_impl; + matrix_impl; +#elif defined(__HIP_PLATFORM_AMD_MFMA__) + sycl::ext::oneapi::detail::joint_matrix_hip + matrix_impl; #elif defined(__SPIR__) __spv::__spirv_JointMatrixINTEL< T, Rows, Cols, spv_matrix_layout_traits::value, spv_scope_traits::value, spv_matrix_use_traits::value> *spvm; #else - static_assert( - false, - "The joint_matrix API is only supported by the Intel and CUDA backends"); + static_assert(false, "The joint_matrix API is only supported by the Intel, " + "CUDA and HIP (GFX90A) backends"); #endif // defined(__NVPTX__) #endif // defined(__SYCL_DEVICE_ONLY__) @@ -81,10 +87,10 @@ inline __SYCL_ALWAYS_INLINE void joint_matrix_apply(Group sg, joint_matrix &jm, F &&lambda) { #if defined(__SYCL_DEVICE_ONLY__) -#if defined(__NVPTX__) +#if defined(__NVPTX__) || defined(__HIP_PLATFORM_AMD_MFMA__) std::ignore = sg; - for (int i = 0; i < jm.cuda_impl.wi_marray.size(); i++) { - lambda(jm.cuda_impl.wi_marray[i]); + for (int i = 0; i < jm.matrix_impl.wi_marray.size(); i++) { + lambda(jm.matrix_impl.wi_marray[i]); } #else // NVPTX using storage_element_type = @@ -114,8 +120,8 @@ joint_matrix_fill(Group, joint_matrix &res, const T2 &v) { #if defined(__SYCL_DEVICE_ONLY__) -#if defined(__NVPTX__) - res.cuda_impl.wi_marray = v; +#if defined(__NVPTX__) || defined(__HIP_PLATFORM_AMD_MFMA__) + res.matrix_impl.wi_marray = v; #else using storage_element_type = typename oneapi::detail::jm_type_interpretation_helper_trait< @@ -140,7 +146,7 @@ template < std::enable_if_t>::value, bool> = true> inline __SYCL_ALWAYS_INLINE void joint_matrix_load( - Group, + Group &sg, joint_matrix &res, multi_ptr src, size_t stride, @@ -149,9 +155,14 @@ inline __SYCL_ALWAYS_INLINE void joint_matrix_load( static_assert(Space != access::address_space::private_space, "Joint Matrix doesn't support load from private memory!"); #if defined(__NVPTX__) - sycl::ext::oneapi::detail::load_accumulator_cuda(res.cuda_impl, src, stride, + std::ignore = sg; + sycl::ext::oneapi::detail::load_accumulator_cuda(res.matrix_impl, src, stride, Layout); +#elif defined(__HIP_PLATFORM_AMD_MFMA__) + sycl::ext::oneapi::detail::load_accumulator_hip(res.matrix_impl, src, stride, + Layout, sg); #else + std::ignore = sg; using DecorT = typename sycl::detail::DecoratedType::type; DecorT *Ptr = sycl::detail::getDecorated(src); switch (Layout) { @@ -184,6 +195,7 @@ inline __SYCL_ALWAYS_INLINE void joint_matrix_load( } #endif // defined(__NVPTX__) #else + std::ignore = sg; std::ignore = res; std::ignore = src; std::ignore = stride; @@ -202,17 +214,23 @@ template < std::is_same, float>::value), bool> = true> inline __SYCL_ALWAYS_INLINE void -joint_matrix_load(Group, +joint_matrix_load(Group &sg, joint_matrix &res, multi_ptr src, size_t stride) { #if defined(__SYCL_DEVICE_ONLY__) static_assert(Space != access::address_space::private_space, "Joint Matrix doesn't support load from private memory!"); #if defined(__NVPTX__) + std::ignore = sg; sycl::ext::oneapi::detail::load_multiplicand_cuda( - res.cuda_impl, src, stride); + res.matrix_impl, src, stride); +#elif defined(__HIP_PLATFORM_AMD_MFMA__) + sycl::ext::oneapi::detail::load_multiplicand_hip( + res.matrix_impl, src, stride, sg); #else + std::ignore = sg; using DecorT = typename sycl::detail::DecoratedType::type; DecorT *Ptr = sycl::detail::getDecorated(src); res.spvm = @@ -223,6 +241,7 @@ joint_matrix_load(Group, spv_scope_traits::value); #endif // defined(__NVPTX__) #else + std::ignore = sg; std::ignore = res; std::ignore = src; std::ignore = stride; @@ -234,7 +253,7 @@ joint_matrix_load(Group, template inline __SYCL_ALWAYS_INLINE void joint_matrix_store( - Group, + Group &sg, const joint_matrix &src, @@ -244,10 +263,16 @@ inline __SYCL_ALWAYS_INLINE void joint_matrix_store( static_assert(Space != access::address_space::private_space, "Joint Matrix doesn't support store to private memory!"); #if defined(__NVPTX__) + std::ignore = sg; sycl::ext::oneapi::detail::joint_matrix_store_cuda(src.cuda_impl, dst, - stride, Layout); + Space>( + src.matrix_impl, dst, stride, Layout); +#elif defined(__HIP_PLATFORM_AMD_MFMA__) + sycl::ext::oneapi::detail::joint_matrix_store_hip(src.matrix_impl, dst, + stride, Layout, sg); #else + std::ignore = sg; using DecorT = typename sycl::detail::DecoratedType::type; DecorT *Ptr = sycl::detail::getDecorated(dst); switch (Layout) { @@ -280,6 +305,7 @@ inline __SYCL_ALWAYS_INLINE void joint_matrix_store( } #endif // defined(__NVPTX__) #else + std::ignore = sg; std::ignore = src; std::ignore = dst; std::ignore = stride; @@ -318,11 +344,20 @@ joint_matrix_mad( if constexpr (std::is_same::value) { sycl::ext::oneapi::detail::joint_matrix_mad_cuda( - D.cuda_impl, A.cuda_impl, B.cuda_impl, C.cuda_impl); + D.matrix_impl, A.matrix_impl, B.matrix_impl, C.matrix_impl); } else { assert(false && "Ta != Tb : In the CUDA backend joint_matrix_mad " "requires that joint_matrix data types Ta and Tb match"); } +#elif defined(__HIP_PLATFORM_AMD_MFMA__) + if constexpr (std::is_same::value) { + sycl::ext::oneapi::detail::joint_matrix_mad_hip( + D.matrix_impl, A.matrix_impl, B.matrix_impl, C.matrix_impl); + } else { + assert(false && "Ta != Tb : In the HIP backend joint_matrix_mad " + "requires that joint_matrix data types Ta and Tb match"); + } #else if constexpr (std::is_same::value && std::is_same::value && @@ -353,11 +388,9 @@ void joint_matrix_copy( Group sg, joint_matrix &src, joint_matrix &dst) { #if defined(__SYCL_DEVICE_ONLY__) -#if defined(__NVPTX__) +#if defined(__NVPTX__) || defined(__HIP_PLATFORM_AMD_MFMA__) std::ignore = sg; - for (int i = 0; i < src.cuda_impl.wi_marray.size(); i++) { - dst.cuda_impl.wi_marray[i] = src.cuda_impl.wi_marray[i]; - } + dst.matrix_impl.wi_marray = src.matrix_impl.wi_marray; #else using storage_element_type = typename oneapi::detail::jm_type_interpretation_helper_trait< diff --git a/sycl/test-e2e/Matrix/joint_matrix_hip_apply.hpp b/sycl/test-e2e/Matrix/joint_matrix_hip_apply.hpp new file mode 100644 index 0000000000000..296128915a136 --- /dev/null +++ b/sycl/test-e2e/Matrix/joint_matrix_hip_apply.hpp @@ -0,0 +1,99 @@ + +#include + +#include +#include + +using namespace sycl; +using namespace sycl::ext::oneapi::experimental::matrix; +using sycl::ext::oneapi::bfloat16; + +template +void hip_matrix_apply() { + InType A[M * K]; + InType B[K * N]; + OutType C[M * N]; + OutType D[M * N]; + OutType E[M * N]; + + for (auto i = 0; i < M * K; ++i) { + A[i] = 1; + } + + for (auto i = 0; i < K * N; ++i) { + B[i] = 2; + } + + for (auto i = 0; i < M * N; ++i) { + D[i] = 0; + C[i] = 3; + E[i] = 3; + } + + try { + auto defaultQueue = sycl::queue{}; + + auto bufA = sycl::buffer{A, sycl::range{M * K}}; + auto bufB = sycl::buffer{B, sycl::range{K * N}}; + auto bufC = sycl::buffer{C, sycl::range{M * N}}; + auto bufD = sycl::buffer{D, sycl::range{M * N}}; + + defaultQueue + .submit([&](sycl::handler &cgh) { + sycl::accessor accA{bufA, cgh, sycl::read_write}; + sycl::accessor accB{bufB, cgh, sycl::read_write}; + sycl::accessor accC{bufC, cgh, sycl::read_only}; + sycl::accessor accD{bufD, cgh, sycl::write_only}; + + cgh.parallel_for( + sycl::nd_range<2>{{4, 16}, {4, 16}}, [=](sycl::nd_item<2> idx) { + auto sg = idx.get_sub_group(); + joint_matrix sub_c; + joint_matrix + sub_b; + joint_matrix + sub_a; + + joint_matrix_load( + sg, sub_a, + accA.template get_multi_ptr(), K); + joint_matrix_load( + sg, sub_b, + accB.template get_multi_ptr(), N); + joint_matrix_load( + sg, sub_c, + accC.template get_multi_ptr(), N, + layout::row_major); + + joint_matrix_apply(sg, sub_a, [=](InType &v) { v *= 2; }); + joint_matrix_apply(sg, sub_b, [=](InType &v) { v *= 3; }); + joint_matrix_apply(sg, sub_c, [=](OutType &v) { v *= 4; }); + + joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); + + joint_matrix_store( + sg, sub_c, + accD.template get_multi_ptr(), N, + layout::row_major); + }); + }) + .wait(); + + defaultQueue.throw_asynchronous(); + } catch (const sycl::exception &e) { + std::cout << "Exception caught: " << e.what() << std::endl; + } + + for (int m = 0; m < M; m++) { + for (int n = 0; n < N; n++) { + E[m * N + n] *= 4; + for (int k = 0; k < K; k++) { + E[m * N + n] += A[m * K + k] * 2 * B[k * N + n] * 3; + } + } + } + + for (int i = 0; i < M * N; ++i) { + assert(D[i] == E[i] && "Unexpected difference"); + } +}; diff --git a/sycl/test-e2e/Matrix/joint_matrix_hip_copy.hpp b/sycl/test-e2e/Matrix/joint_matrix_hip_copy.hpp new file mode 100644 index 0000000000000..b1ed65bc8fc93 --- /dev/null +++ b/sycl/test-e2e/Matrix/joint_matrix_hip_copy.hpp @@ -0,0 +1,110 @@ +#include + +#include +#include +#include + +using namespace sycl; +using namespace sycl::ext::oneapi::experimental::matrix; +using sycl::ext::oneapi::bfloat16; + +template +void hip_matrix_copy() { + InType A[M * K]; + InType B[K * N]; + OutType C[M * N]; + OutType D[M * N]; + OutType E[M * N]; + + std::mt19937 gen(0); + std::uniform_real_distribution dist(-10, 10); + + for (auto i = 0; i < M * K; ++i) { + A[i] = static_cast(dist(gen)); + } + + for (auto i = 0; i < K * N; ++i) { + B[i] = static_cast(dist(gen)); + } + + for (auto i = 0; i < M * N; ++i) { + D[i] = 0; + C[i] = static_cast(dist(gen)); + if (OutLayout == layout::row_major) + E[i] = C[i]; + else + E[(i % N) * M + int(i / M)] = C[i]; + } + + try { + auto defaultQueue = sycl::queue{}; + + auto bufA = sycl::buffer{A, sycl::range{M * K}}; + auto bufB = sycl::buffer{B, sycl::range{K * N}}; + auto bufC = sycl::buffer{C, sycl::range{M * N}}; + auto bufD = sycl::buffer{D, sycl::range{M * N}}; + + defaultQueue + .submit([&](sycl::handler &cgh) { + sycl::accessor accA{bufA, cgh, sycl::read_only}; + sycl::accessor accB{bufB, cgh, sycl::read_only}; + sycl::accessor accC{bufC, cgh, sycl::read_only}; + sycl::accessor accD{bufD, cgh, sycl::write_only}; + + cgh.parallel_for( + sycl::nd_range<2>{{4, 16}, {4, 16}}, [=](sycl::nd_item<2> idx) { + auto sg = idx.get_sub_group(); + joint_matrix sub_c, + sub_c_copy; + joint_matrix + sub_b, sub_b_copy; + joint_matrix + sub_a, sub_a_copy; + + joint_matrix_load( + sg, sub_a, + accA.template get_multi_ptr(), K); + joint_matrix_load( + sg, sub_b, + accB.template get_multi_ptr(), N); + joint_matrix_load( + sg, sub_c, + accC.template get_multi_ptr(), N, + layout::row_major); + + joint_matrix_copy(sg, sub_c, sub_c_copy); + joint_matrix_copy(sg, sub_a, sub_a_copy); + joint_matrix_copy(sg, sub_b, sub_b_copy); + + joint_matrix_mad(sg, sub_c_copy, sub_a_copy, sub_b_copy, + sub_c_copy); + + joint_matrix_store( + sg, sub_c_copy, + accD.template get_multi_ptr(), N, + OutLayout); + }); + }) + .wait(); + + defaultQueue.throw_asynchronous(); + } catch (const sycl::exception &e) { + std::cout << "Exception caught: " << e.what() << std::endl; + } + + for (auto m = 0; m < M; m++) { + for (auto n = 0; n < N; n++) { + for (auto k = 0; k < K; k++) { + if (OutLayout == layout::row_major) + E[m * N + n] += A[m * K + k] * B[k * N + n]; + else + E[n * M + m] += A[m * K + k] * B[k * N + n]; + } + } + } + + for (auto i = 0; i < M * N; ++i) { + assert(abs(D[i] - E[i]) < 100 && "Unexpected difference"); + } +}; diff --git a/sycl/test-e2e/Matrix/joint_matrix_hip_fill.hpp b/sycl/test-e2e/Matrix/joint_matrix_hip_fill.hpp new file mode 100644 index 0000000000000..642562aed4de9 --- /dev/null +++ b/sycl/test-e2e/Matrix/joint_matrix_hip_fill.hpp @@ -0,0 +1,87 @@ +#include + +#include +#include + +using namespace sycl; +using namespace sycl::ext::oneapi::experimental::matrix; +using sycl::ext::oneapi::bfloat16; + +template +void hip_matrix_fill() { + InType A[M * K]; + InType B[K * N]; + OutType C[M * N]; + OutType D[M * N]; + OutType E[M * N]; + + for (auto i = 0; i < M * K; ++i) { + A[i] = 1; + } + + for (auto i = 0; i < K * N; ++i) { + B[i] = 2; + } + + for (auto i = 0; i < M * N; ++i) { + D[i] = 0; + C[i] = 3; + E[i] = 3; + } + + try { + auto defaultQueue = sycl::queue{}; + + auto bufA = sycl::buffer{A, sycl::range{M * K}}; + auto bufB = sycl::buffer{B, sycl::range{K * N}}; + auto bufC = sycl::buffer{C, sycl::range{M * N}}; + auto bufD = sycl::buffer{D, sycl::range{M * N}}; + + defaultQueue + .submit([&](sycl::handler &cgh) { + sycl::accessor accA{bufA, cgh, sycl::read_only}; + sycl::accessor accB{bufB, cgh, sycl::read_only}; + sycl::accessor accC{bufC, cgh, sycl::read_only}; + sycl::accessor accD{bufD, cgh, sycl::write_only}; + + cgh.parallel_for( + sycl::nd_range<2>{{4, 16}, {4, 16}}, [=](sycl::nd_item<2> idx) { + auto sg = idx.get_sub_group(); + joint_matrix + sub_c{}; + joint_matrix + sub_b{}; + joint_matrix + sub_a{}; + + joint_matrix_fill(sg, sub_a, 1); + joint_matrix_fill(sg, sub_b, 2); + joint_matrix_fill(sg, sub_c, 3); + + joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); + + joint_matrix_store( + sg, sub_c, + accD.template get_multi_ptr(), N, + layout::row_major); + }); + }) + .wait(); + + defaultQueue.throw_asynchronous(); + } catch (const sycl::exception &e) { + std::cout << "Exception caught: " << e.what() << std::endl; + } + + for (auto m = 0; m < M; m++) { + for (auto n = 0; n < N; n++) { + for (auto k = 0; k < K; k++) { + E[m * N + n] += A[m * K + k] * B[k * N + n]; + } + } + } + + for (auto i = 0; i < M * N; ++i) { + assert(D[i] == E[i] && "Unexpected difference"); + } +}; diff --git a/sycl/test-e2e/Matrix/joint_matrix_hip_gfx90a.cpp b/sycl/test-e2e/Matrix/joint_matrix_hip_gfx90a.cpp new file mode 100644 index 0000000000000..e9dc8659e69ae --- /dev/null +++ b/sycl/test-e2e/Matrix/joint_matrix_hip_gfx90a.cpp @@ -0,0 +1,52 @@ +// RUN: %{build} -fsycl -fsycl-targets=amd_gpu_gfx90a %s -o %t.out +// RUN: %{run} %t.out + +// REQUIRES: gpu-amd-gfx90a + +#include "joint_matrix_hip_apply.hpp" +#include "joint_matrix_hip_copy.hpp" +#include "joint_matrix_hip_fill.hpp" +#include "joint_matrix_hip_mfma.hpp" + +template void matrix_mfma() { + hip_matrix_mfma(); + hip_matrix_mfma(); + hip_matrix_mfma(); + hip_matrix_mfma(); + hip_matrix_mfma(); + hip_matrix_mfma(); + hip_matrix_mfma(); + hip_matrix_mfma(); + hip_matrix_mfma(); + hip_matrix_mfma(); +} + +int main() { + matrix_mfma<1>(); + matrix_mfma<2>(); + matrix_mfma<3>(); + matrix_mfma<4>(); + + hip_matrix_copy(); + hip_matrix_copy(); + hip_matrix_copy(); + hip_matrix_copy(); + hip_matrix_copy(); + hip_matrix_copy(); + hip_matrix_copy(); + hip_matrix_copy(); + hip_matrix_copy(); + hip_matrix_copy(); + + hip_matrix_fill(); + hip_matrix_fill(); + hip_matrix_fill(); + hip_matrix_fill(); + hip_matrix_fill(); + + hip_matrix_apply(); + hip_matrix_apply(); + hip_matrix_apply(); + hip_matrix_apply(); + hip_matrix_apply(); +} diff --git a/sycl/test-e2e/Matrix/joint_matrix_hip_half_gfx90a.cpp b/sycl/test-e2e/Matrix/joint_matrix_hip_half_gfx90a.cpp new file mode 100644 index 0000000000000..96aacbac9c280 --- /dev/null +++ b/sycl/test-e2e/Matrix/joint_matrix_hip_half_gfx90a.cpp @@ -0,0 +1,35 @@ +// RUN: %{build} -fsycl -fsycl-targets=amd_gpu_gfx90a %s -o %t.out +// RUN: %{run} %t.out + +// REQUIRES: gpu-amd-gfx90a +// REQUIRES: aspect-fp16 + +#include "joint_matrix_hip_apply.hpp" +#include "joint_matrix_hip_copy.hpp" +#include "joint_matrix_hip_fill.hpp" +#include "joint_matrix_hip_mfma.hpp" + +template void half_matrix_mfma() { + hip_matrix_mfma(); + hip_matrix_mfma(); + hip_matrix_mfma(); + hip_matrix_mfma(); +} + +int main() { + half_matrix_mfma<1>(); + half_matrix_mfma<2>(); + half_matrix_mfma<3>(); + half_matrix_mfma<4>(); + + hip_matrix_copy(); + hip_matrix_copy(); + hip_matrix_copy(); + hip_matrix_copy(); + + hip_matrix_fill(); + hip_matrix_fill(); + + hip_matrix_apply(); + hip_matrix_apply(); +} diff --git a/sycl/test-e2e/Matrix/joint_matrix_hip_mfma.hpp b/sycl/test-e2e/Matrix/joint_matrix_hip_mfma.hpp new file mode 100644 index 0000000000000..650bcbaa0908b --- /dev/null +++ b/sycl/test-e2e/Matrix/joint_matrix_hip_mfma.hpp @@ -0,0 +1,115 @@ + +#include + +#include +#include +#include + +using namespace sycl; +using namespace sycl::ext::oneapi::experimental::matrix; +using sycl::ext::oneapi::bfloat16; + +template +void hip_matrix_mfma() { + InType A[M * K * KX]; + InType B[K * N * KX]; + OutType C[M * N]; + OutType D[M * N]; + OutType E[M * N]; + + std::mt19937 gen(0); + std::uniform_real_distribution dist(-10, 10); + + for (auto i = 0; i < M * K * KX; ++i) { + A[i] = static_cast(dist(gen)); + } + + for (auto i = 0; i < K * N * KX; ++i) { + B[i] = static_cast(dist(gen)); + } + + for (auto i = 0; i < M * N; ++i) { + D[i] = 0; + C[i] = static_cast(dist(gen)); + if (OutLayout == layout::row_major) + E[i] = C[i]; + else + E[(i % N) * M + int(i / M)] = C[i]; + } + + try { + auto defaultQueue = sycl::queue{}; + + auto bufA = sycl::buffer{A, sycl::range{M * K * KX}}; + auto bufB = sycl::buffer{B, sycl::range{K * N * KX}}; + auto bufC = sycl::buffer{C, sycl::range{M * N}}; + auto bufD = sycl::buffer{D, sycl::range{M * N}}; + + defaultQueue + .submit([&](sycl::handler &cgh) { + sycl::accessor accA{bufA, cgh, sycl::read_only}; + sycl::accessor accB{bufB, cgh, sycl::read_only}; + sycl::accessor accC{bufC, cgh, sycl::read_only}; + sycl::accessor accD{bufD, cgh, sycl::write_only}; + + cgh.parallel_for( + sycl::nd_range<2>{{4, 16}, {4, 16}}, [=](sycl::nd_item<2> idx) { + auto sg = idx.get_sub_group(); + joint_matrix sub_c; + joint_matrix + sub_b; + joint_matrix + sub_a; + + joint_matrix_load( + sg, sub_c, + accC.template get_multi_ptr(), N, + layout::row_major); + + for (auto kx = 0; kx < KX; ++kx) { + joint_matrix_load( + sg, sub_a, + accA.template get_multi_ptr() + + kx * K, + K * KX); + joint_matrix_load( + sg, sub_b, + accB.template get_multi_ptr() + + kx * K * N, + N); + joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); + } + + joint_matrix_store( + sg, sub_c, + accD.template get_multi_ptr(), N, + OutLayout); + }); + }) + .wait(); + + defaultQueue.throw_asynchronous(); + } catch (const sycl::exception &e) { + std::cout << "Exception caught: " << e.what() << std::endl; + } + + constexpr int LDA = K * KX; + + for (auto m = 0; m < M; m++) { + for (auto n = 0; n < N; n++) { + OutType e = 0; + for (auto k = 0; k < LDA; k++) { + e += A[m * LDA + k] * B[k * N + n]; + } + if (OutLayout == layout::row_major) + E[m * N + n] += e; + else + E[n * M + m] += e; + } + } + + for (auto i = 0; i < M * N; ++i) { + assert(abs(D[i] - E[i]) < 100 && "Unexpected difference"); + } +}; diff --git a/sycl/test-e2e/lit.cfg.py b/sycl/test-e2e/lit.cfg.py index 14c9dc4ba654a..a25323c6e50aa 100644 --- a/sycl/test-e2e/lit.cfg.py +++ b/sycl/test-e2e/lit.cfg.py @@ -275,6 +275,8 @@ devices = set() sp = subprocess.check_output(sycl_ls, text=True) for line in sp.splitlines(): + if "gfx90a" in line: + config.available_features.add("gpu-amd-gfx90a") (backend, device, _) = line[1:].split(':', 2) devices.add('{}:{}'.format(backend, device)) config.sycl_devices = list(devices) diff --git a/sycl/test/check_device_code/hip/matrix/matrix-hip-bfloat16-float-test.cpp b/sycl/test/check_device_code/hip/matrix/matrix-hip-bfloat16-float-test.cpp new file mode 100644 index 0000000000000..9f3454d5ef83a --- /dev/null +++ b/sycl/test/check_device_code/hip/matrix/matrix-hip-bfloat16-float-test.cpp @@ -0,0 +1,72 @@ +// REQUIRES: hip + +// RUN: %clangxx -fsycl-device-only -fsycl-targets=amd_gpu_gfx90a -S -Xclang -emit-llvm %s -o -| FileCheck %s + +#include + +using namespace sycl; +using namespace sycl::ext::oneapi::experimental::matrix; +using sycl::ext::oneapi::bfloat16; + +int main() { + + buffer bufA(nullptr, range<1>(1)); + buffer bufB(nullptr, range<1>(1)); + buffer bufC(nullptr, range<1>(1)); + buffer bufD(nullptr, range<1>(1)); + + queue q; + + q.submit([&](handler &cgh) { + sycl::accessor + accA(bufA, cgh); + sycl::accessor + accB(bufB, cgh); + sycl::accessor + accC(bufC, cgh); + sycl::accessor + accD(bufD, cgh); + + cgh.parallel_for( + nd_range<2>({1, 64}, {1, 64}), + [=](nd_item<2> item) [[sycl::reqd_work_group_size(1, 1, 64)]] { + sycl::sub_group sg = item.get_sub_group(); + + joint_matrix sub_c{}; + joint_matrix + sub_a{}; + joint_matrix + sub_b{}; + + // CHECK: tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16bf16.1k(<4 x i16> %{{.*}}, <4 x i16> %{{.*}} <4 x float> zeroinitializer, i32 0, i32 0, i32 0) + sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c); + joint_matrix_store( + sg, sub_c, accD.template get_multi_ptr(), + 16, layout::row_major); + }); + + cgh.parallel_for( + nd_range<2>({1, 64}, {1, 64}), + [=](nd_item<2> item) [[sycl::reqd_work_group_size(1, 1, 64)]] { + sycl::sub_group sg = item.get_sub_group(); + + joint_matrix sub_c{}; + joint_matrix + sub_a{}; + joint_matrix + sub_b{}; + + // CHECK: tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x8bf16.1k(<4 x i16> {{.*}}, <4 x i16> {{.*}}, <16 x float> zeroinitializer, i32 0, i32 0, i32 0) + sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c); + joint_matrix_store( + sg, sub_c, accD.template get_multi_ptr(), + 32, layout::row_major); + }); + }); + + return 0; +}; diff --git a/sycl/test/check_device_code/hip/matrix/matrix-hip-double-double-test.cpp b/sycl/test/check_device_code/hip/matrix/matrix-hip-double-double-test.cpp new file mode 100644 index 0000000000000..8475afee205b7 --- /dev/null +++ b/sycl/test/check_device_code/hip/matrix/matrix-hip-double-double-test.cpp @@ -0,0 +1,53 @@ +// REQUIRES: hip + +// RUN: %clangxx -fsycl-device-only -fsycl-targets=amd_gpu_gfx90a -S -Xclang -emit-llvm %s -o -| FileCheck %s + +#include + +using namespace sycl; +using namespace sycl::ext::oneapi::experimental::matrix; + +int main() { + + buffer bufA(nullptr, range<1>(1)); + buffer bufB(nullptr, range<1>(1)); + buffer bufC(nullptr, range<1>(1)); + buffer bufD(nullptr, range<1>(1)); + + queue q; + + q.submit([&](handler &cgh) { + sycl::accessor + accA(bufA, cgh); + sycl::accessor + accB(bufB, cgh); + sycl::accessor + accC(bufC, cgh); + sycl::accessor + accD(bufD, cgh); + + cgh.parallel_for( + nd_range<2>({1, 64}, {1, 64}), + [=](nd_item<2> item) [[sycl::reqd_work_group_size(1, 1, 64)]] { + sycl::sub_group sg = item.get_sub_group(); + + joint_matrix sub_c{}; + joint_matrix + sub_a{}; + joint_matrix + sub_b{}; + + // CHECK: tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %{{.*}}, double %{{.*}}, <4 x double> zeroinitializer, i32 0, i32 0, i32 0) + sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c); + joint_matrix_store( + sg, sub_c, accD.template get_multi_ptr(), + 16, layout::row_major); + }); + }); + + return 0; +}; diff --git a/sycl/test/check_device_code/hip/matrix/matrix-hip-half-float-test.cpp b/sycl/test/check_device_code/hip/matrix/matrix-hip-half-float-test.cpp new file mode 100644 index 0000000000000..9019233a1fa8b --- /dev/null +++ b/sycl/test/check_device_code/hip/matrix/matrix-hip-half-float-test.cpp @@ -0,0 +1,71 @@ +// REQUIRES: hip + +// RUN: %clangxx -fsycl-device-only -fsycl-targets=amd_gpu_gfx90a -S -Xclang -emit-llvm %s -o -| FileCheck %s + +#include + +using namespace sycl; +using namespace sycl::ext::oneapi::experimental::matrix; + +int main() { + + buffer bufA(nullptr, range<1>(1)); + buffer bufB(nullptr, range<1>(1)); + buffer bufC(nullptr, range<1>(1)); + buffer bufD(nullptr, range<1>(1)); + + queue q; + + q.submit([&](handler &cgh) { + sycl::accessor + accA(bufA, cgh); + sycl::accessor + accB(bufB, cgh); + sycl::accessor + accC(bufC, cgh); + sycl::accessor + accD(bufD, cgh); + + cgh.parallel_for( + nd_range<2>({1, 64}, {1, 64}), + [=](nd_item<2> item) [[sycl::reqd_work_group_size(1, 1, 64)]] { + sycl::sub_group sg = item.get_sub_group(); + + joint_matrix sub_c{}; + joint_matrix + sub_a{}; + joint_matrix + sub_b{}; + + // CHECK: tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %{{.*}}, <4 x half> %{{.*}}, <4 x float> zeroinitializer, i32 0, i32 0, i32 0) + sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c); + joint_matrix_store( + sg, sub_c, accD.template get_multi_ptr(), + 16, layout::row_major); + }); + + cgh.parallel_for( + nd_range<2>({1, 64}, {1, 64}), + [=](nd_item<2> item) [[sycl::reqd_work_group_size(1, 1, 64)]] { + sycl::sub_group sg = item.get_sub_group(); + + joint_matrix sub_c{}; + joint_matrix + sub_a{}; + joint_matrix + sub_b{}; + + // CHECK: tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x8f16(<4 x half> {{.*}}, <4 x half> {{.*}}, <16 x float> zeroinitializer, i32 0, i32 0, i32 0) + sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c); + joint_matrix_store( + sg, sub_c, accD.template get_multi_ptr(), + 32, layout::row_major); + }); + }); + + return 0; +}; diff --git a/sycl/test/check_device_code/hip/matrix/matrix-hip-int8-int32-test.cpp b/sycl/test/check_device_code/hip/matrix/matrix-hip-int8-int32-test.cpp new file mode 100644 index 0000000000000..f2c7b1ec8c08c --- /dev/null +++ b/sycl/test/check_device_code/hip/matrix/matrix-hip-int8-int32-test.cpp @@ -0,0 +1,71 @@ +// REQUIRES: hip + +// RUN: %clangxx -fsycl-device-only -fsycl-targets=amd_gpu_gfx90a -S -Xclang -emit-llvm %s -o -| FileCheck %s + +#include + +using namespace sycl; +using namespace sycl::ext::oneapi::experimental::matrix; + +int main() { + + buffer bufA(nullptr, range<1>(1)); + buffer bufB(nullptr, range<1>(1)); + buffer bufC(nullptr, range<1>(1)); + buffer bufD(nullptr, range<1>(1)); + + queue q; + + q.submit([&](handler &cgh) { + sycl::accessor + accA(bufA, cgh); + sycl::accessor + accB(bufB, cgh); + sycl::accessor + accC(bufC, cgh); + sycl::accessor + accD(bufD, cgh); + + cgh.parallel_for( + nd_range<2>({1, 64}, {1, 64}), + [=](nd_item<2> item) [[sycl::reqd_work_group_size(1, 1, 64)]] { + sycl::sub_group sg = item.get_sub_group(); + + joint_matrix sub_c{}; + joint_matrix + sub_a{}; + joint_matrix + sub_b{}; + + // CHECK: tail call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x16i8(i32 %{{.*}}, i32 %{{.*}}, <4 x i32> zeroinitializer, i32 0, i32 0, i32 0) + sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c); + joint_matrix_store( + sg, sub_c, accD.template get_multi_ptr(), + 16, layout::row_major); + }); + + cgh.parallel_for( + nd_range<2>({1, 64}, {1, 64}), + [=](nd_item<2> item) [[sycl::reqd_work_group_size(1, 1, 64)]] { + sycl::sub_group sg = item.get_sub_group(); + + joint_matrix sub_c{}; + joint_matrix + sub_a{}; + joint_matrix + sub_b{}; + + // CHECK: tail call <16 x i32> @llvm.amdgcn.mfma.i32.32x32x8i8(i32 {{.*}}, i32 {{.*}}, <16 x i32> zeroinitializer, i32 0, i32 0, i32 0) + sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c); + joint_matrix_store( + sg, sub_c, accD.template get_multi_ptr(), + 32, layout::row_major); + }); + }); + + return 0; +};