diff --git a/sycl/doc/extensions/experimental/sycl_ext_matrix/sycl_ext_oneapi_matrix.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_matrix/sycl_ext_oneapi_matrix.asciidoc
index aa103ebb3d282..ae92ffc505265 100644
--- a/sycl/doc/extensions/experimental/sycl_ext_matrix/sycl_ext_oneapi_matrix.asciidoc
+++ b/sycl/doc/extensions/experimental/sycl_ext_matrix/sycl_ext_oneapi_matrix.asciidoc
@@ -50,7 +50,7 @@ specification.*
 This extension is currently implemented in {dpcpp} only for devices
 that contain a matrix hardware, specifically Intel(R) Advanced Matrix
 Extensions (Intel(R) AMX), Intel(R) Xe Matrix Extensions (Intel(R)
-XMX) and Nvidia(R) Tensor Cores.
+XMX), Nvidia(R) Tensor Cores and AMD Matrix Cores(R).
 
 The `joint_matrix` type and the `joint_matrix_mad` function are
 optional kernel features as defined in section 5.7 of the core SYCL
@@ -67,8 +67,8 @@ implementation throws a synchronous exception with the
 
 == Overview
 Joint matrix is a SYCL extension for matrix hardware programming. It
-unifies targets like Intel AMX in CPUs, Intel XMX in Intel GPUs and
-Nvidia Tensor Cores. This provides a portable and performant API for
+unifies targets like Intel AMX in CPUs, Intel XMX in Intel GPUs,
+Nvidia Tensor Cores and AMD Matrix Cores(R). This provides a portable and performant API for
 users who want to build their own neural networks applications,
 perform custom optimizations, or experiment with new operations in a
 timely and performing manner.
@@ -921,7 +921,8 @@ the type of the A matrix must be the same as the type of the B
 matrix.
 
 IMPORTANT: When compiling for the `ext_oneapi_cuda` backend the target
-arch backend flag, `-Xsycl-target-backend --cuda-gpu-arch=sm_xx`, must
+arch backend flag, `-fsycl-targets=nvidia_gpu_sm_xx`
+(or equivalents, e.g. `-Xsycl-target-backend --cuda-gpu-arch=sm_xx`), must
 be used, where `sm_xx` must be a Compute Capability that is equal to
 or greater than the appropriate Minimum Compute Capability. When an
 executable has been compiled for `sm_xx`, if the executable is run on
@@ -971,6 +972,34 @@ multiple of 4 when `T` is `float`; where `T` is the type of the
 `joint_matrix` elements. When `T` is not `half` or `float` there are
 no restrictions to `stride`.
 
+==== AMD Matrix Cores Supported Combinations
+The complete set of matrix data types and dimensions that are supported by
+the `ext_oneapi_hip` backend are represented in the following
+table. In this architecture's implementation, A and B matrices must have the same type. 
+Similarly, C and D matrices must share the same type.
+
+IMPORTANT: The supported instructions may be run on GFX90A (MI200, MI210, MI250 and MI250X GPUs)
+architecture. When compiling for the `ext_oneapi_hip` backend the 
+target arch backend flag, `-fsycl-targets=amd_gpu_gfx90a`, must
+be used. An attempt to run the compiled code on an unsupported architecture will throw an error. 
+
+
+[frame="none",options="header"]
+|======================
+| A and B type | C and D type | M | N | K
+.2+| `matrix_type::fp16`  .2+| `matrix_type::fp32`
+|32 |32 |8 
+|16 |16 |16
+.2+| `matrix_type::sint8`  .2+| `matrix_type::sint32`
+|32 |32 |8 
+|16 |16 |16
+.2+|`matrix_type::bf16`  .2+|`matrix_type::fp32`
+|32 |32 |8 
+|16 |16 |16
+.1+|`matrix_type::fp64`  .1+| `matrix_type::fp64`
+|16 |16 |4
+|======================
+
 === Revision History
 
 [frame="none",options="header"]
@@ -990,4 +1019,5 @@ the Intel-specifics to a separate extension document
 type, runtime query, and supported combinations appendix for Intel AMX
 and Intel XMX
 |7   |2023-04-11 |Jack Kirk |Add Nvidia Tensor Cores supported combinations
+|8   |2023-10-05 |Mahmoud Moadeli |Add AMD Matrix Core supported combinations
 |======================
diff --git a/sycl/include/sycl/detail/defines.hpp b/sycl/include/sycl/detail/defines.hpp
index 5d44727d71fb1..de2de047528b1 100644
--- a/sycl/include/sycl/detail/defines.hpp
+++ b/sycl/include/sycl/detail/defines.hpp
@@ -39,9 +39,11 @@
 #define __SYCL_TYPE(x)
 #endif
 
-// joint matrix should only be included by default for SPIR or NVPTX backends
-#if defined __SPIR__ || defined __NVPTX__ || !defined __SYCL_DEVICE_ONLY__
+// joint matrix should only be included by default for SPIR, NVPTX or HIP(GFX90A
+// only) backends
+#if defined __SPIR__ || defined __NVPTX__ || !defined __SYCL_DEVICE_ONLY__ ||  \
+    defined __gfx90a__
 #ifndef SYCL_EXT_ONEAPI_MATRIX_VERSION
 #define SYCL_EXT_ONEAPI_MATRIX_VERSION 4
 #endif // SYCL_EXT_ONEAPI_MATRIX_VERSION
-#endif // __SPIR__ || __NVPTX__ || !__SYCL_DEVICE_ONLY
+#endif // __SPIR__ || __NVPTX__ || !__SYCL_DEVICE_ONLY || __gfx90a__
diff --git a/sycl/include/sycl/ext/oneapi/matrix/matrix-hip.hpp b/sycl/include/sycl/ext/oneapi/matrix/matrix-hip.hpp
new file mode 100644
index 0000000000000..7f9f9b1219cf4
--- /dev/null
+++ b/sycl/include/sycl/ext/oneapi/matrix/matrix-hip.hpp
@@ -0,0 +1,404 @@
+
+//===-------- matrix-hip.hpp - matrix ext impl ---*- C++ -*-------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// ===-------------------------------------------------------------------=== //
+
+#pragma once
+#include "matrix-unified-utils.hpp"
+#include <sycl/ext/oneapi/bfloat16.hpp>
+
+#define __HIP_PLATFORM_AMD_MFMA__
+
+namespace sycl {
+inline namespace _V1 {
+namespace ext {
+namespace oneapi {
+namespace detail {
+
+constexpr int WAVEFRONT_SIZE = 64;
+
+template <typename T, sycl::ext::oneapi::experimental::matrix::use Use,
+          size_t Rows, size_t Cols,
+          sycl::ext::oneapi::experimental::matrix::layout Layout =
+              sycl::ext::oneapi::experimental::matrix::layout::dynamic,
+          typename Cond = void>
+struct joint_matrix_hip;
+
+using bfloat16x4 = __attribute__((__vector_size__(4 * sizeof(__bf16)))) __fp16;
+using float16x4 = __attribute__((__vector_size__(4 * sizeof(__fp16)))) __fp16;
+using floatx4 = __attribute__((__vector_size__(4 * sizeof(float)))) float;
+using floatx16 = __attribute__((__vector_size__(16 * sizeof(float)))) float;
+using int32x4 = __attribute__((__vector_size__(4 * sizeof(int32_t)))) int;
+using int32x16 = __attribute__((__vector_size__(16 * sizeof(int32_t)))) int;
+using doublex4 = __attribute__((__vector_size__(4 * sizeof(double)))) double;
+
+template <typename T> struct to_hip_type {
+  using type = T;
+};
+
+template <> struct to_hip_type<bfloat16> {
+  using type = __bf16;
+};
+
+template <> struct to_hip_type<half> {
+  using type = __fp16;
+};
+
+template <> struct to_hip_type<int8_t> {
+  using type = int32_t;
+};
+
+#undef __SYCL_JOINT_MATRIX_OVERLOAD_ARR
+
+#define __SYCL_JOINT_MATRIX_OVERLOAD_ARR(TYPE, USE, M, N, SIZE)                \
+  template <sycl::ext::oneapi::experimental::matrix::layout Layout>            \
+  struct joint_matrix_hip<                                                     \
+      TYPE, sycl::ext::oneapi::experimental::matrix::use::USE, M, N, Layout,   \
+      typename std::enable_if_t<                                               \
+          Layout ==                                                            \
+              sycl::ext::oneapi::experimental::matrix::layout::row_major ||    \
+          Layout ==                                                            \
+              sycl::ext::oneapi::experimental::matrix::layout::col_major>> {   \
+    sycl::marray<TYPE, SIZE> wi_marray;                                        \
+  };
+
+__SYCL_JOINT_MATRIX_OVERLOAD_ARR(bfloat16, a, 16, 16, 4)
+__SYCL_JOINT_MATRIX_OVERLOAD_ARR(bfloat16, b, 16, 16, 4)
+__SYCL_JOINT_MATRIX_OVERLOAD_ARR(bfloat16, a, 32, 8, 4)
+__SYCL_JOINT_MATRIX_OVERLOAD_ARR(bfloat16, b, 8, 32, 4)
+
+__SYCL_JOINT_MATRIX_OVERLOAD_ARR(half, a, 16, 16, 4)
+__SYCL_JOINT_MATRIX_OVERLOAD_ARR(half, b, 16, 16, 4)
+__SYCL_JOINT_MATRIX_OVERLOAD_ARR(half, a, 32, 8, 4)
+__SYCL_JOINT_MATRIX_OVERLOAD_ARR(half, b, 8, 32, 4)
+
+__SYCL_JOINT_MATRIX_OVERLOAD_ARR(double, a, 16, 4, 1)
+__SYCL_JOINT_MATRIX_OVERLOAD_ARR(double, b, 4, 16, 1)
+
+__SYCL_JOINT_MATRIX_OVERLOAD_ARR(int8_t, a, 32, 8, 4)
+__SYCL_JOINT_MATRIX_OVERLOAD_ARR(int8_t, b, 8, 32, 4)
+__SYCL_JOINT_MATRIX_OVERLOAD_ARR(int8_t, a, 16, 16, 4)
+__SYCL_JOINT_MATRIX_OVERLOAD_ARR(int8_t, b, 16, 16, 4)
+
+#undef __SYCL_JOINT_MATRIX_OVERLOAD_ARR
+
+#define __SYCL_JOINT_MATRIX_OVERLOAD_ARR_ACC(TYPE, M, N)                       \
+  template <>                                                                  \
+  struct joint_matrix_hip<                                                     \
+      TYPE, sycl::ext::oneapi::experimental::matrix::use::accumulator, M, N,   \
+      sycl::ext::oneapi::experimental::matrix::layout::dynamic> {              \
+    sycl::marray<TYPE, (M * N) / WAVEFRONT_SIZE> wi_marray;                    \
+  };
+
+__SYCL_JOINT_MATRIX_OVERLOAD_ARR_ACC(float, 16, 16)
+__SYCL_JOINT_MATRIX_OVERLOAD_ARR_ACC(float, 32, 32)
+__SYCL_JOINT_MATRIX_OVERLOAD_ARR_ACC(double, 16, 16)
+__SYCL_JOINT_MATRIX_OVERLOAD_ARR_ACC(int32_t, 32, 32)
+__SYCL_JOINT_MATRIX_OVERLOAD_ARR_ACC(int32_t, 16, 16)
+
+#undef __SYCL_JOINT_MATRIX_OVERLOAD_ARR_ACC
+
+template <sycl::ext::oneapi::experimental::matrix::layout Layout, typename S,
+          typename T, size_t M, size_t N, access::address_space Space,
+          access::decorated IsDecorated, typename Group>
+void load_accumulator_layoutT(
+    joint_matrix_hip<
+        S, sycl::ext::oneapi::experimental::matrix::use::accumulator, M, N,
+        sycl::ext::oneapi::experimental::matrix::layout::dynamic> &res,
+    multi_ptr<T, Space, IsDecorated> src, size_t stride, Group &sg) {
+  const auto idx = sg.get_group_linear_id() * sg.get_local_range()[0] +
+                   sg.get_local_linear_id();
+
+  if constexpr (std::is_same_v<S, double>) {
+    const auto thread_x = idx % N;
+    const auto thread_y = idx / N;
+
+    if constexpr (Layout ==
+                  sycl::ext::oneapi::experimental::matrix::layout::row_major) {
+      for (int i = 0; i < 4; ++i) {
+        const int s_idx = thread_x + i * 4 * stride + thread_y * stride;
+        res.wi_marray[i] = src[s_idx];
+      }
+    } else {
+      for (int i = 0; i < 4; ++i) {
+        const int s_idx = i * 4 + thread_x * stride + thread_y;
+        res.wi_marray[i] = src[s_idx];
+      }
+    }
+  } else if constexpr (std::is_same_v<S, float> || std::is_same_v<S, int32_t>) {
+    if constexpr (M == 16 && N == 16) {
+      const auto thread_x = idx % N;
+      const auto thread_y = idx / N;
+
+      if constexpr (Layout == sycl::ext::oneapi::experimental::matrix::layout::
+                                  row_major) {
+        for (int i = 0; i < 4; ++i) {
+          const int s_idx = thread_x + i * stride + thread_y * 4 * stride;
+          res.wi_marray[i] = src[s_idx];
+        }
+      } else {
+        for (int i = 0; i < 4; ++i) {
+          const int s_idx = i + thread_x * stride + thread_y * 4;
+          res.wi_marray[i] = src[s_idx];
+        }
+      }
+    } else if constexpr (M == 32 && N == 32) {
+      const auto thread_x = idx % N;
+      const auto thread_y = idx / N;
+
+      if constexpr (Layout == sycl::ext::oneapi::experimental::matrix::layout::
+                                  row_major) {
+        for (int j = 0; j < 4; ++j) {
+          for (int i = 0; i < 4; ++i) {
+            const int s_idx =
+                thread_x + i * stride + thread_y * 4 * stride + j * 8 * N;
+            res.wi_marray[i + 4 * j] = src[s_idx];
+          }
+        }
+      } else {
+        for (int j = 0; j < 4; ++j) {
+          for (int i = 0; i < 4; ++i) {
+            const int s_idx = i + thread_x * stride + thread_y * 4 + j * 8;
+            res.wi_marray[i + 4 * j] = src[s_idx];
+          }
+        }
+      }
+    }
+  }
+}
+
+template <
+    typename Group, typename S, typename T, size_t M, size_t N,
+    access::address_space Space, access::decorated IsDecorated,
+    typename = std::enable_if_t<std::is_same_v<S, std::remove_const_t<T>>>>
+void load_accumulator_hip(
+    joint_matrix_hip<
+        S, sycl::ext::oneapi::experimental::matrix::use::accumulator, M, N,
+        sycl::ext::oneapi::experimental::matrix::layout::dynamic> &res,
+    multi_ptr<T, Space, IsDecorated> src, size_t stride,
+    sycl::ext::oneapi::experimental::matrix::layout layout, Group &sg) {
+  if (layout == sycl::ext::oneapi::experimental::matrix::layout::row_major)
+    load_accumulator_layoutT<
+        sycl::ext::oneapi::experimental::matrix::layout::row_major>(res, src,
+                                                                    stride, sg);
+  else
+    load_accumulator_layoutT<
+        sycl::ext::oneapi::experimental::matrix::layout::col_major>(res, src,
+                                                                    stride, sg);
+}
+
+template <
+    typename Group, typename S, typename T, size_t M, size_t N,
+    sycl::ext::oneapi::experimental::matrix::use Use,
+    sycl::ext::oneapi::experimental::matrix::layout Layout,
+    access::address_space Space, access::decorated IsDecorated,
+    typename = typename std::enable_if_t<
+        (Layout == sycl::ext::oneapi::experimental::matrix::layout::row_major ||
+         Layout ==
+             sycl::ext::oneapi::experimental::matrix::layout::col_major) &&
+        std::is_same_v<S, std::remove_const_t<T>>>>
+void load_multiplicand_hip(joint_matrix_hip<S, Use, M, N, Layout> &res,
+                           multi_ptr<T, Space, IsDecorated> src, size_t stride,
+                           Group &sg) {
+  const auto idx = sg.get_group_linear_id() * sg.get_local_range()[0] +
+                   sg.get_local_linear_id();
+
+  if constexpr (std::is_same_v<S, double>) {
+    if constexpr (Layout ==
+                  sycl::ext::oneapi::experimental::matrix::layout::row_major) {
+      res.wi_marray[0] = src[idx];
+    } else {
+      res.wi_marray[0] = src[(idx % M) * stride + idx / M];
+    }
+  } else {
+    constexpr int Dim = (M == 16) ? 16 : 32;
+
+    const auto thread_x = idx % Dim;
+    const auto thread_y = idx / Dim;
+
+    if constexpr (Layout ==
+                  sycl::ext::oneapi::experimental::matrix::layout::col_major) {
+      for (int i = 0; i < 4; ++i) {
+        const int c_idx = thread_x * stride + i + thread_y * 4;
+        res.wi_marray[i] = src[c_idx];
+      }
+    } else {
+      for (int i = 0; i < 4; ++i) {
+        const int r_idx = thread_x + i * stride + thread_y * stride * 4;
+        res.wi_marray[i] = src[r_idx];
+      }
+    }
+  }
+}
+
+template <typename Group,
+          sycl::ext::oneapi::experimental::matrix::layout Layout, typename T,
+          size_t M, size_t N, access::address_space Space,
+          access::decorated IsDecorated>
+void store_layoutT(
+    const joint_matrix_hip<
+        T, sycl::ext::oneapi::experimental::matrix::use::accumulator, M, N,
+        sycl::ext::oneapi::experimental::matrix::layout::dynamic> &src,
+    multi_ptr<T, Space, IsDecorated> dst, size_t stride, Group &sg) {
+  const auto idx = sg.get_group_linear_id() * sg.get_local_range()[0] +
+                   sg.get_local_linear_id();
+
+  if constexpr (std::is_same_v<T, double>) {
+    const auto thread_x = idx % N;
+    const auto thread_y = idx / N;
+
+    if constexpr (Layout ==
+                  sycl::ext::oneapi::experimental::matrix::layout::row_major) {
+      for (int i = 0; i < 4; ++i) {
+        const int d_idx = thread_x + i * 4 * stride + thread_y * stride;
+        dst[d_idx] = src.wi_marray[i];
+      }
+    } else {
+      for (int i = 0; i < 4; ++i) {
+        const int d_idx = i * 4 + thread_x * stride + thread_y;
+        dst[d_idx] = src.wi_marray[i];
+      }
+    }
+  } else if constexpr (std::is_same_v<T, float> || std::is_same_v<T, int32_t>) {
+    if constexpr (M == 16 && N == 16) {
+      const auto thread_x = idx % N;
+      const auto thread_y = idx / N;
+
+      if constexpr (Layout == sycl::ext::oneapi::experimental::matrix::layout::
+                                  row_major) {
+        for (int i = 0; i < 4; ++i) {
+          const int d_idx = thread_x + i * stride + thread_y * 4 * stride;
+          dst[d_idx] = src.wi_marray[i];
+        }
+      } else {
+        for (int i = 0; i < 4; ++i) {
+          const int d_idx = i + thread_x * stride + thread_y * 4;
+          dst[d_idx] = src.wi_marray[i];
+        }
+      }
+    } else if constexpr (M == 32 && N == 32) {
+      const auto thread_x = idx % N;
+      const auto thread_y = idx / N;
+
+      if constexpr (Layout == sycl::ext::oneapi::experimental::matrix::layout::
+                                  row_major) {
+        for (int j = 0; j < 4; ++j) {
+          for (int i = 0; i < 4; ++i) {
+            const int d_idx =
+                thread_x + i * stride + thread_y * 4 * stride + j * 8 * stride;
+            dst[d_idx] = src.wi_marray[i + 4 * j];
+          }
+        }
+      } else {
+        for (int j = 0; j < 4; ++j) {
+          for (int i = 0; i < 4; ++i) {
+            const int d_idx = i + thread_x * stride + thread_y * 4 + j * 8;
+            dst[d_idx] = src.wi_marray[i + 4 * j];
+          }
+        }
+      }
+    }
+  }
+}
+
+template <typename Group, typename T, size_t M, size_t N,
+          access::address_space Space, access::decorated IsDecorated>
+void joint_matrix_store_hip(
+    const joint_matrix_hip<
+        T, sycl::ext::oneapi::experimental::matrix::use::accumulator, M, N,
+        sycl::ext::oneapi::experimental::matrix::layout::dynamic> &src,
+    multi_ptr<T, Space, IsDecorated> dst, size_t stride,
+    sycl::ext::oneapi::experimental::matrix::layout layout, Group &sg) {
+  if (sycl::ext::oneapi::experimental::matrix::layout::row_major == layout) {
+    store_layoutT<Group,
+                  sycl::ext::oneapi::experimental::matrix::layout::row_major>(
+        src, dst, stride, sg);
+  } else {
+    store_layoutT<Group,
+                  sycl::ext::oneapi::experimental::matrix::layout::col_major>(
+        src, dst, stride, sg);
+  }
+}
+
+template <typename Tm, typename Tc, std::size_t M, std::size_t K, std::size_t N,
+          sycl::ext::oneapi::experimental::matrix::layout LayoutA,
+          sycl::ext::oneapi::experimental::matrix::layout LayoutB>
+void joint_matrix_mad_hip(
+    joint_matrix_hip<
+        Tc, sycl::ext::oneapi::experimental::matrix::use::accumulator, M, N,
+        sycl::ext::oneapi::experimental::matrix::layout::dynamic> &D,
+    const joint_matrix_hip<Tm, sycl::ext::oneapi::experimental::matrix::use::a,
+                           M, K, LayoutA> &A,
+    const joint_matrix_hip<Tm, sycl::ext::oneapi::experimental::matrix::use::b,
+                           K, N, LayoutB> &B,
+    const joint_matrix_hip<
+        Tc, sycl::ext::oneapi::experimental::matrix::use::accumulator, M, N,
+        sycl::ext::oneapi::experimental::matrix::layout::dynamic> &C) {
+  if constexpr (std::is_same_v<Tm, sycl::half>) {
+    if constexpr (M == 16 && N == 16) {
+      auto result = __builtin_amdgcn_mfma_f32_16x16x16f16(
+          *reinterpret_cast<const float16x4 *>(&A.wi_marray),
+          *reinterpret_cast<const float16x4 *>(&B.wi_marray),
+          *reinterpret_cast<const floatx4 *>(&C.wi_marray), 0, 0, 0);
+      for (int i = 0; i < 4; ++i)
+        D.wi_marray[i] = result[i];
+    } else if constexpr (M == 32 && N == 32) {
+      auto result = __builtin_amdgcn_mfma_f32_32x32x8f16(
+          *reinterpret_cast<const float16x4 *>(&A.wi_marray),
+          *reinterpret_cast<const float16x4 *>(&B.wi_marray),
+          *reinterpret_cast<const floatx16 *>(&C.wi_marray), 0, 0, 0);
+      for (int i = 0; i < 16; ++i)
+        D.wi_marray[i] = result[i];
+    }
+  } else if constexpr (std::is_same_v<Tm, bfloat16>) {
+    if constexpr (M == 16 && N == 16) {
+      auto result = __builtin_amdgcn_mfma_f32_16x16x16bf16_1k(
+          *reinterpret_cast<const bfloat16x4 *>(&A.wi_marray),
+          *reinterpret_cast<const bfloat16x4 *>(&B.wi_marray),
+          *reinterpret_cast<const floatx4 *>(&C.wi_marray), 0, 0, 0);
+      for (int i = 0; i < 4; ++i)
+        D.wi_marray[i] = result[i];
+    } else if constexpr (M == 32 && N == 32) {
+      auto result = __builtin_amdgcn_mfma_f32_32x32x8bf16_1k(
+          *reinterpret_cast<const bfloat16x4 *>(&A.wi_marray),
+          *reinterpret_cast<const bfloat16x4 *>(&B.wi_marray),
+          *reinterpret_cast<const floatx16 *>(&C.wi_marray), 0, 0, 0);
+      for (int i = 0; i < 16; ++i)
+        D.wi_marray[i] = result[i];
+    }
+  } else if constexpr (std::is_same_v<Tm, double>) {
+    if constexpr (M == 16 && N == 16) {
+      auto result = __builtin_amdgcn_mfma_f64_16x16x4f64(
+          A.wi_marray[0], B.wi_marray[0],
+          *reinterpret_cast<const doublex4 *>(&C.wi_marray), 0, 0, 0);
+      for (int i = 0; i < 4; ++i)
+        D.wi_marray[i] = result[i];
+    }
+  } else if constexpr (std::is_same_v<Tm, int8_t>) {
+    if constexpr (M == 16 && N == 16) {
+      auto result = __builtin_amdgcn_mfma_i32_16x16x16i8(
+          *reinterpret_cast<const Tc *>(&A.wi_marray),
+          *reinterpret_cast<const Tc *>(&B.wi_marray),
+          *reinterpret_cast<const int32x4 *>(&C.wi_marray), 0, 0, 0);
+      for (int i = 0; i < 4; ++i)
+        D.wi_marray[i] = result[i];
+    } else if constexpr (M == 32 && N == 32) {
+      auto result = __builtin_amdgcn_mfma_i32_32x32x8i8(
+          *reinterpret_cast<const Tc *>(&A.wi_marray),
+          *reinterpret_cast<const Tc *>(&B.wi_marray),
+          *reinterpret_cast<const int32x16 *>(&C.wi_marray), 0, 0, 0);
+      for (int i = 0; i < 16; ++i)
+        D.wi_marray[i] = result[i];
+    }
+  }
+}
+
+} // namespace detail
+} // namespace oneapi
+} // namespace ext
+} // namespace _V1
+} // namespace sycl
diff --git a/sycl/include/sycl/ext/oneapi/matrix/matrix-intel.hpp b/sycl/include/sycl/ext/oneapi/matrix/matrix-intel.hpp
index b852e3f1ff3f5..e1efa463939dd 100644
--- a/sycl/include/sycl/ext/oneapi/matrix/matrix-intel.hpp
+++ b/sycl/include/sycl/ext/oneapi/matrix/matrix-intel.hpp
@@ -533,8 +533,8 @@ inline __SYCL_ALWAYS_INLINE void joint_matrix_apply(
 #if defined(__SYCL_DEVICE_ONLY__)
 #if defined(__NVPTX__)
   std::ignore = sg;
-  for (int i = 0; i < jm.cuda_impl.wi_marray.size(); i++) {
-    lambda(jm.cuda_impl.wi_marray[i]);
+  for (int i = 0; i < jm.matrix_impl.wi_marray.size(); i++) {
+    lambda(jm.matrix_impl.wi_marray[i]);
   }
 #else // NVPTX
   using storage_element_type =
diff --git a/sycl/include/sycl/ext/oneapi/matrix/matrix-unified.hpp b/sycl/include/sycl/ext/oneapi/matrix/matrix-unified.hpp
index 3d128633c0735..d8a751680900a 100644
--- a/sycl/include/sycl/ext/oneapi/matrix/matrix-unified.hpp
+++ b/sycl/include/sycl/ext/oneapi/matrix/matrix-unified.hpp
@@ -10,9 +10,13 @@
 
 #include "matrix-intel.hpp"
 
-#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)
+#if defined(__SYCL_DEVICE_ONLY__)
+#if defined(__NVPTX__)
 #include "matrix-tensorcores.hpp"
-#endif
+#elif defined(__gfx90a__)
+#include "matrix-hip.hpp"
+#endif // defined(__NVPTX__)
+#endif // defined(__SYCL_DEVICE_ONLY__)
 
 #include <sycl/access/access.hpp>             // for address_space
 #include <sycl/detail/defines_elementary.hpp> // for __SYCL_ALWAYS_...
@@ -42,15 +46,17 @@ struct joint_matrix {
 #if defined(__SYCL_DEVICE_ONLY__)
 #if defined(__NVPTX__)
   sycl::ext::oneapi::detail::joint_matrix_cuda<T, Use, Rows, Cols, Layout>
-      cuda_impl;
+      matrix_impl;
+#elif defined(__HIP_PLATFORM_AMD_MFMA__)
+  sycl::ext::oneapi::detail::joint_matrix_hip<T, Use, Rows, Cols, Layout>
+      matrix_impl;
 #elif defined(__SPIR__)
   __spv::__spirv_JointMatrixINTEL<
       T, Rows, Cols, spv_matrix_layout_traits<Layout>::value,
       spv_scope_traits<Group>::value, spv_matrix_use_traits<Use>::value> *spvm;
 #else
-  static_assert(
-      false,
-      "The joint_matrix API is only supported by the Intel and CUDA backends");
+  static_assert(false, "The joint_matrix API is only supported by the Intel, "
+                       "CUDA and HIP (GFX90A) backends");
 #endif // defined(__NVPTX__)
 #endif // defined(__SYCL_DEVICE_ONLY__)
 
@@ -81,10 +87,10 @@ inline __SYCL_ALWAYS_INLINE void
 joint_matrix_apply(Group sg, joint_matrix<Group, T, Use, M, N, Layout> &jm,
                    F &&lambda) {
 #if defined(__SYCL_DEVICE_ONLY__)
-#if defined(__NVPTX__)
+#if defined(__NVPTX__) || defined(__HIP_PLATFORM_AMD_MFMA__)
   std::ignore = sg;
-  for (int i = 0; i < jm.cuda_impl.wi_marray.size(); i++) {
-    lambda(jm.cuda_impl.wi_marray[i]);
+  for (int i = 0; i < jm.matrix_impl.wi_marray.size(); i++) {
+    lambda(jm.matrix_impl.wi_marray[i]);
   }
 #else // NVPTX
   using storage_element_type =
@@ -114,8 +120,8 @@ joint_matrix_fill(Group,
                   joint_matrix<Group, T, Use, NumRows, NumCols, Layout> &res,
                   const T2 &v) {
 #if defined(__SYCL_DEVICE_ONLY__)
-#if defined(__NVPTX__)
-  res.cuda_impl.wi_marray = v;
+#if defined(__NVPTX__) || defined(__HIP_PLATFORM_AMD_MFMA__)
+  res.matrix_impl.wi_marray = v;
 #else
   using storage_element_type =
       typename oneapi::detail::jm_type_interpretation_helper_trait<
@@ -140,7 +146,7 @@ template <
     std::enable_if_t<std::is_same<S, std::remove_const_t<T>>::value, bool> =
         true>
 inline __SYCL_ALWAYS_INLINE void joint_matrix_load(
-    Group,
+    Group &sg,
     joint_matrix<Group, S, use::accumulator, NumRows, NumCols,
                  sycl::ext::oneapi::experimental::matrix::layout::dynamic> &res,
     multi_ptr<T, Space, IsDecorated> src, size_t stride,
@@ -149,9 +155,14 @@ inline __SYCL_ALWAYS_INLINE void joint_matrix_load(
   static_assert(Space != access::address_space::private_space,
                 "Joint Matrix doesn't support load from private memory!");
 #if defined(__NVPTX__)
-  sycl::ext::oneapi::detail::load_accumulator_cuda(res.cuda_impl, src, stride,
+  std::ignore = sg;
+  sycl::ext::oneapi::detail::load_accumulator_cuda(res.matrix_impl, src, stride,
                                                    Layout);
+#elif defined(__HIP_PLATFORM_AMD_MFMA__)
+  sycl::ext::oneapi::detail::load_accumulator_hip(res.matrix_impl, src, stride,
+                                                  Layout, sg);
 #else
+  std::ignore = sg;
   using DecorT = typename sycl::detail::DecoratedType<T, Space>::type;
   DecorT *Ptr = sycl::detail::getDecorated<DecorT>(src);
   switch (Layout) {
@@ -184,6 +195,7 @@ inline __SYCL_ALWAYS_INLINE void joint_matrix_load(
   }
 #endif // defined(__NVPTX__)
 #else
+  std::ignore = sg;
   std::ignore = res;
   std::ignore = src;
   std::ignore = stride;
@@ -202,17 +214,23 @@ template <
                           std::is_same<std::remove_const_t<T>, float>::value),
                      bool> = true>
 inline __SYCL_ALWAYS_INLINE void
-joint_matrix_load(Group,
+joint_matrix_load(Group &sg,
                   joint_matrix<Group, S, Use, NumRows, NumCols, Layout> &res,
                   multi_ptr<T, Space, IsDecorated> src, size_t stride) {
 #if defined(__SYCL_DEVICE_ONLY__)
   static_assert(Space != access::address_space::private_space,
                 "Joint Matrix doesn't support load from private memory!");
 #if defined(__NVPTX__)
+  std::ignore = sg;
   sycl::ext::oneapi::detail::load_multiplicand_cuda<S, T, NumRows, NumCols, Use,
                                                     Layout, Space>(
-      res.cuda_impl, src, stride);
+      res.matrix_impl, src, stride);
+#elif defined(__HIP_PLATFORM_AMD_MFMA__)
+  sycl::ext::oneapi::detail::load_multiplicand_hip<Group, S, T, NumRows,
+                                                   NumCols, Use, Layout, Space>(
+      res.matrix_impl, src, stride, sg);
 #else
+  std::ignore = sg;
   using DecorT = typename sycl::detail::DecoratedType<T, Space>::type;
   DecorT *Ptr = sycl::detail::getDecorated<DecorT>(src);
   res.spvm =
@@ -223,6 +241,7 @@ joint_matrix_load(Group,
           spv_scope_traits<Group>::value);
 #endif // defined(__NVPTX__)
 #else
+  std::ignore = sg;
   std::ignore = res;
   std::ignore = src;
   std::ignore = stride;
@@ -234,7 +253,7 @@ joint_matrix_load(Group,
 template <typename Group, typename T, size_t NumRows, size_t NumCols,
           access::address_space Space, access::decorated IsDecorated>
 inline __SYCL_ALWAYS_INLINE void joint_matrix_store(
-    Group,
+    Group &sg,
     const joint_matrix<Group, T, use::accumulator, NumRows, NumCols,
                        sycl::ext::oneapi::experimental::matrix::layout::dynamic>
         &src,
@@ -244,10 +263,16 @@ inline __SYCL_ALWAYS_INLINE void joint_matrix_store(
   static_assert(Space != access::address_space::private_space,
                 "Joint Matrix doesn't support store to private memory!");
 #if defined(__NVPTX__)
+  std::ignore = sg;
   sycl::ext::oneapi::detail::joint_matrix_store_cuda<T, NumRows, NumCols,
-                                                     Space>(src.cuda_impl, dst,
-                                                            stride, Layout);
+                                                     Space>(
+      src.matrix_impl, dst, stride, Layout);
+#elif defined(__HIP_PLATFORM_AMD_MFMA__)
+  sycl::ext::oneapi::detail::joint_matrix_store_hip<Group, T, NumRows, NumCols,
+                                                    Space>(src.matrix_impl, dst,
+                                                           stride, Layout, sg);
 #else
+  std::ignore = sg;
   using DecorT = typename sycl::detail::DecoratedType<T, Space>::type;
   DecorT *Ptr = sycl::detail::getDecorated<DecorT>(dst);
   switch (Layout) {
@@ -280,6 +305,7 @@ inline __SYCL_ALWAYS_INLINE void joint_matrix_store(
   }
 #endif // defined(__NVPTX__)
 #else
+  std::ignore = sg;
   std::ignore = src;
   std::ignore = dst;
   std::ignore = stride;
@@ -318,11 +344,20 @@ joint_matrix_mad(
   if constexpr (std::is_same<Ta, Tb>::value) {
     sycl::ext::oneapi::detail::joint_matrix_mad_cuda<Ta, Tc, Td, M, K, N,
                                                      LayoutA, LayoutB>(
-        D.cuda_impl, A.cuda_impl, B.cuda_impl, C.cuda_impl);
+        D.matrix_impl, A.matrix_impl, B.matrix_impl, C.matrix_impl);
   } else {
     assert(false && "Ta != Tb : In the CUDA backend joint_matrix_mad "
                     "requires that joint_matrix data types Ta and Tb match");
   }
+#elif defined(__HIP_PLATFORM_AMD_MFMA__)
+  if constexpr (std::is_same<Ta, Tb>::value) {
+    sycl::ext::oneapi::detail::joint_matrix_mad_hip<Ta, Tc, M, K, N, LayoutA,
+                                                    LayoutB>(
+        D.matrix_impl, A.matrix_impl, B.matrix_impl, C.matrix_impl);
+  } else {
+    assert(false && "Ta != Tb : In the HIP backend joint_matrix_mad "
+                    "requires that joint_matrix data types Ta and Tb match");
+  }
 #else
   if constexpr (std::is_same<Ta, uint16_t>::value &&
                 std::is_same<Tb, uint16_t>::value &&
@@ -353,11 +388,9 @@ void joint_matrix_copy(
     Group sg, joint_matrix<Group, T1, Use1, Rows, Cols, Layout1> &src,
     joint_matrix<Group, T2, Use2, Rows, Cols, Layout2> &dst) {
 #if defined(__SYCL_DEVICE_ONLY__)
-#if defined(__NVPTX__)
+#if defined(__NVPTX__) || defined(__HIP_PLATFORM_AMD_MFMA__)
   std::ignore = sg;
-  for (int i = 0; i < src.cuda_impl.wi_marray.size(); i++) {
-    dst.cuda_impl.wi_marray[i] = src.cuda_impl.wi_marray[i];
-  }
+  dst.matrix_impl.wi_marray = src.matrix_impl.wi_marray;
 #else
   using storage_element_type =
       typename oneapi::detail::jm_type_interpretation_helper_trait<
diff --git a/sycl/test-e2e/Matrix/joint_matrix_hip_apply.hpp b/sycl/test-e2e/Matrix/joint_matrix_hip_apply.hpp
new file mode 100644
index 0000000000000..296128915a136
--- /dev/null
+++ b/sycl/test-e2e/Matrix/joint_matrix_hip_apply.hpp
@@ -0,0 +1,99 @@
+
+#include <sycl/sycl.hpp>
+
+#include <cstddef>
+#include <cstdint>
+
+using namespace sycl;
+using namespace sycl::ext::oneapi::experimental::matrix;
+using sycl::ext::oneapi::bfloat16;
+
+template <typename InType, typename OutType, size_t M, size_t N, size_t K>
+void hip_matrix_apply() {
+  InType A[M * K];
+  InType B[K * N];
+  OutType C[M * N];
+  OutType D[M * N];
+  OutType E[M * N];
+
+  for (auto i = 0; i < M * K; ++i) {
+    A[i] = 1;
+  }
+
+  for (auto i = 0; i < K * N; ++i) {
+    B[i] = 2;
+  }
+
+  for (auto i = 0; i < M * N; ++i) {
+    D[i] = 0;
+    C[i] = 3;
+    E[i] = 3;
+  }
+
+  try {
+    auto defaultQueue = sycl::queue{};
+
+    auto bufA = sycl::buffer{A, sycl::range{M * K}};
+    auto bufB = sycl::buffer{B, sycl::range{K * N}};
+    auto bufC = sycl::buffer{C, sycl::range{M * N}};
+    auto bufD = sycl::buffer{D, sycl::range{M * N}};
+
+    defaultQueue
+        .submit([&](sycl::handler &cgh) {
+          sycl::accessor accA{bufA, cgh, sycl::read_write};
+          sycl::accessor accB{bufB, cgh, sycl::read_write};
+          sycl::accessor accC{bufC, cgh, sycl::read_only};
+          sycl::accessor accD{bufD, cgh, sycl::write_only};
+
+          cgh.parallel_for(
+              sycl::nd_range<2>{{4, 16}, {4, 16}}, [=](sycl::nd_item<2> idx) {
+                auto sg = idx.get_sub_group();
+                joint_matrix<sub_group, OutType, use::accumulator, M, N> sub_c;
+                joint_matrix<sub_group, InType, use::b, K, N, layout::row_major>
+                    sub_b;
+                joint_matrix<sub_group, InType, use::a, M, K, layout::col_major>
+                    sub_a;
+
+                joint_matrix_load(
+                    sg, sub_a,
+                    accA.template get_multi_ptr<access::decorated::yes>(), K);
+                joint_matrix_load(
+                    sg, sub_b,
+                    accB.template get_multi_ptr<access::decorated::yes>(), N);
+                joint_matrix_load(
+                    sg, sub_c,
+                    accC.template get_multi_ptr<access::decorated::yes>(), N,
+                    layout::row_major);
+
+                joint_matrix_apply(sg, sub_a, [=](InType &v) { v *= 2; });
+                joint_matrix_apply(sg, sub_b, [=](InType &v) { v *= 3; });
+                joint_matrix_apply(sg, sub_c, [=](OutType &v) { v *= 4; });
+
+                joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
+
+                joint_matrix_store(
+                    sg, sub_c,
+                    accD.template get_multi_ptr<access::decorated::yes>(), N,
+                    layout::row_major);
+              });
+        })
+        .wait();
+
+    defaultQueue.throw_asynchronous();
+  } catch (const sycl::exception &e) {
+    std::cout << "Exception caught: " << e.what() << std::endl;
+  }
+
+  for (int m = 0; m < M; m++) {
+    for (int n = 0; n < N; n++) {
+      E[m * N + n] *= 4;
+      for (int k = 0; k < K; k++) {
+        E[m * N + n] += A[m * K + k] * 2 * B[k * N + n] * 3;
+      }
+    }
+  }
+
+  for (int i = 0; i < M * N; ++i) {
+    assert(D[i] == E[i] && "Unexpected difference");
+  }
+};
diff --git a/sycl/test-e2e/Matrix/joint_matrix_hip_copy.hpp b/sycl/test-e2e/Matrix/joint_matrix_hip_copy.hpp
new file mode 100644
index 0000000000000..b1ed65bc8fc93
--- /dev/null
+++ b/sycl/test-e2e/Matrix/joint_matrix_hip_copy.hpp
@@ -0,0 +1,110 @@
+#include <sycl/sycl.hpp>
+
+#include <cstddef>
+#include <cstdint>
+#include <random>
+
+using namespace sycl;
+using namespace sycl::ext::oneapi::experimental::matrix;
+using sycl::ext::oneapi::bfloat16;
+
+template <typename InType, typename OutType, size_t M, size_t N, size_t K,
+          layout OutLayout>
+void hip_matrix_copy() {
+  InType A[M * K];
+  InType B[K * N];
+  OutType C[M * N];
+  OutType D[M * N];
+  OutType E[M * N];
+
+  std::mt19937 gen(0);
+  std::uniform_real_distribution<float> dist(-10, 10);
+
+  for (auto i = 0; i < M * K; ++i) {
+    A[i] = static_cast<InType>(dist(gen));
+  }
+
+  for (auto i = 0; i < K * N; ++i) {
+    B[i] = static_cast<InType>(dist(gen));
+  }
+
+  for (auto i = 0; i < M * N; ++i) {
+    D[i] = 0;
+    C[i] = static_cast<OutType>(dist(gen));
+    if (OutLayout == layout::row_major)
+      E[i] = C[i];
+    else
+      E[(i % N) * M + int(i / M)] = C[i];
+  }
+
+  try {
+    auto defaultQueue = sycl::queue{};
+
+    auto bufA = sycl::buffer{A, sycl::range{M * K}};
+    auto bufB = sycl::buffer{B, sycl::range{K * N}};
+    auto bufC = sycl::buffer{C, sycl::range{M * N}};
+    auto bufD = sycl::buffer{D, sycl::range{M * N}};
+
+    defaultQueue
+        .submit([&](sycl::handler &cgh) {
+          sycl::accessor accA{bufA, cgh, sycl::read_only};
+          sycl::accessor accB{bufB, cgh, sycl::read_only};
+          sycl::accessor accC{bufC, cgh, sycl::read_only};
+          sycl::accessor accD{bufD, cgh, sycl::write_only};
+
+          cgh.parallel_for(
+              sycl::nd_range<2>{{4, 16}, {4, 16}}, [=](sycl::nd_item<2> idx) {
+                auto sg = idx.get_sub_group();
+                joint_matrix<sub_group, OutType, use::accumulator, M, N> sub_c,
+                    sub_c_copy;
+                joint_matrix<sub_group, InType, use::b, K, N, layout::row_major>
+                    sub_b, sub_b_copy;
+                joint_matrix<sub_group, InType, use::a, M, K, layout::col_major>
+                    sub_a, sub_a_copy;
+
+                joint_matrix_load(
+                    sg, sub_a,
+                    accA.template get_multi_ptr<access::decorated::yes>(), K);
+                joint_matrix_load(
+                    sg, sub_b,
+                    accB.template get_multi_ptr<access::decorated::yes>(), N);
+                joint_matrix_load(
+                    sg, sub_c,
+                    accC.template get_multi_ptr<access::decorated::yes>(), N,
+                    layout::row_major);
+
+                joint_matrix_copy(sg, sub_c, sub_c_copy);
+                joint_matrix_copy(sg, sub_a, sub_a_copy);
+                joint_matrix_copy(sg, sub_b, sub_b_copy);
+
+                joint_matrix_mad(sg, sub_c_copy, sub_a_copy, sub_b_copy,
+                                 sub_c_copy);
+
+                joint_matrix_store(
+                    sg, sub_c_copy,
+                    accD.template get_multi_ptr<access::decorated::yes>(), N,
+                    OutLayout);
+              });
+        })
+        .wait();
+
+    defaultQueue.throw_asynchronous();
+  } catch (const sycl::exception &e) {
+    std::cout << "Exception caught: " << e.what() << std::endl;
+  }
+
+  for (auto m = 0; m < M; m++) {
+    for (auto n = 0; n < N; n++) {
+      for (auto k = 0; k < K; k++) {
+        if (OutLayout == layout::row_major)
+          E[m * N + n] += A[m * K + k] * B[k * N + n];
+        else
+          E[n * M + m] += A[m * K + k] * B[k * N + n];
+      }
+    }
+  }
+
+  for (auto i = 0; i < M * N; ++i) {
+    assert(abs(D[i] - E[i]) < 100 && "Unexpected difference");
+  }
+};
diff --git a/sycl/test-e2e/Matrix/joint_matrix_hip_fill.hpp b/sycl/test-e2e/Matrix/joint_matrix_hip_fill.hpp
new file mode 100644
index 0000000000000..642562aed4de9
--- /dev/null
+++ b/sycl/test-e2e/Matrix/joint_matrix_hip_fill.hpp
@@ -0,0 +1,87 @@
+#include <sycl/sycl.hpp>
+
+#include <cstddef>
+#include <cstdint>
+
+using namespace sycl;
+using namespace sycl::ext::oneapi::experimental::matrix;
+using sycl::ext::oneapi::bfloat16;
+
+template <typename InType, typename OutType, size_t M, size_t N, size_t K>
+void hip_matrix_fill() {
+  InType A[M * K];
+  InType B[K * N];
+  OutType C[M * N];
+  OutType D[M * N];
+  OutType E[M * N];
+
+  for (auto i = 0; i < M * K; ++i) {
+    A[i] = 1;
+  }
+
+  for (auto i = 0; i < K * N; ++i) {
+    B[i] = 2;
+  }
+
+  for (auto i = 0; i < M * N; ++i) {
+    D[i] = 0;
+    C[i] = 3;
+    E[i] = 3;
+  }
+
+  try {
+    auto defaultQueue = sycl::queue{};
+
+    auto bufA = sycl::buffer{A, sycl::range{M * K}};
+    auto bufB = sycl::buffer{B, sycl::range{K * N}};
+    auto bufC = sycl::buffer{C, sycl::range{M * N}};
+    auto bufD = sycl::buffer{D, sycl::range{M * N}};
+
+    defaultQueue
+        .submit([&](sycl::handler &cgh) {
+          sycl::accessor accA{bufA, cgh, sycl::read_only};
+          sycl::accessor accB{bufB, cgh, sycl::read_only};
+          sycl::accessor accC{bufC, cgh, sycl::read_only};
+          sycl::accessor accD{bufD, cgh, sycl::write_only};
+
+          cgh.parallel_for(
+              sycl::nd_range<2>{{4, 16}, {4, 16}}, [=](sycl::nd_item<2> idx) {
+                auto sg = idx.get_sub_group();
+                joint_matrix<sub_group, OutType, use::accumulator, M, N>
+                    sub_c{};
+                joint_matrix<sub_group, InType, use::b, K, N, layout::row_major>
+                    sub_b{};
+                joint_matrix<sub_group, InType, use::a, M, K, layout::col_major>
+                    sub_a{};
+
+                joint_matrix_fill(sg, sub_a, 1);
+                joint_matrix_fill(sg, sub_b, 2);
+                joint_matrix_fill(sg, sub_c, 3);
+
+                joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
+
+                joint_matrix_store(
+                    sg, sub_c,
+                    accD.template get_multi_ptr<access::decorated::yes>(), N,
+                    layout::row_major);
+              });
+        })
+        .wait();
+
+    defaultQueue.throw_asynchronous();
+  } catch (const sycl::exception &e) {
+    std::cout << "Exception caught: " << e.what() << std::endl;
+  }
+
+  for (auto m = 0; m < M; m++) {
+    for (auto n = 0; n < N; n++) {
+      for (auto k = 0; k < K; k++) {
+        E[m * N + n] += A[m * K + k] * B[k * N + n];
+      }
+    }
+  }
+
+  for (auto i = 0; i < M * N; ++i) {
+    assert(D[i] == E[i] && "Unexpected difference");
+  }
+};
diff --git a/sycl/test-e2e/Matrix/joint_matrix_hip_gfx90a.cpp b/sycl/test-e2e/Matrix/joint_matrix_hip_gfx90a.cpp
new file mode 100644
index 0000000000000..e9dc8659e69ae
--- /dev/null
+++ b/sycl/test-e2e/Matrix/joint_matrix_hip_gfx90a.cpp
@@ -0,0 +1,52 @@
+// RUN: %{build} -fsycl -fsycl-targets=amd_gpu_gfx90a %s -o %t.out
+// RUN: %{run} %t.out
+
+// REQUIRES: gpu-amd-gfx90a
+
+#include "joint_matrix_hip_apply.hpp"
+#include "joint_matrix_hip_copy.hpp"
+#include "joint_matrix_hip_fill.hpp"
+#include "joint_matrix_hip_mfma.hpp"
+
+template <size_t KX> void matrix_mfma() {
+  hip_matrix_mfma<int8_t, int32_t, 32, 32, 8, KX, layout::row_major>();
+  hip_matrix_mfma<int8_t, int32_t, 16, 16, 16, KX, layout::row_major>();
+  hip_matrix_mfma<bfloat16, float, 32, 32, 8, KX, layout::row_major>();
+  hip_matrix_mfma<bfloat16, float, 16, 16, 16, KX, layout::row_major>();
+  hip_matrix_mfma<double, double, 16, 16, 4, KX, layout::row_major>();
+  hip_matrix_mfma<int8_t, int32_t, 32, 32, 8, KX, layout::col_major>();
+  hip_matrix_mfma<int8_t, int32_t, 16, 16, 16, KX, layout::col_major>();
+  hip_matrix_mfma<bfloat16, float, 32, 32, 8, KX, layout::col_major>();
+  hip_matrix_mfma<bfloat16, float, 16, 16, 16, KX, layout::col_major>();
+  hip_matrix_mfma<double, double, 16, 16, 4, KX, layout::col_major>();
+}
+
+int main() {
+  matrix_mfma<1>();
+  matrix_mfma<2>();
+  matrix_mfma<3>();
+  matrix_mfma<4>();
+
+  hip_matrix_copy<int8_t, int32_t, 32, 32, 8, layout::row_major>();
+  hip_matrix_copy<int8_t, int32_t, 16, 16, 16, layout::row_major>();
+  hip_matrix_copy<bfloat16, float, 32, 32, 8, layout::row_major>();
+  hip_matrix_copy<bfloat16, float, 16, 16, 16, layout::row_major>();
+  hip_matrix_copy<double, double, 16, 16, 4, layout::row_major>();
+  hip_matrix_copy<int8_t, int32_t, 32, 32, 8, layout::col_major>();
+  hip_matrix_copy<int8_t, int32_t, 16, 16, 16, layout::col_major>();
+  hip_matrix_copy<bfloat16, float, 32, 32, 8, layout::col_major>();
+  hip_matrix_copy<bfloat16, float, 16, 16, 16, layout::col_major>();
+  hip_matrix_copy<double, double, 16, 16, 4, layout::col_major>();
+
+  hip_matrix_fill<int8_t, int32_t, 32, 32, 8>();
+  hip_matrix_fill<int8_t, int32_t, 16, 16, 16>();
+  hip_matrix_fill<bfloat16, float, 32, 32, 8>();
+  hip_matrix_fill<bfloat16, float, 16, 16, 16>();
+  hip_matrix_fill<double, double, 16, 16, 4>();
+
+  hip_matrix_apply<int8_t, int32_t, 32, 32, 8>();
+  hip_matrix_apply<int8_t, int32_t, 16, 16, 16>();
+  hip_matrix_apply<bfloat16, float, 32, 32, 8>();
+  hip_matrix_apply<bfloat16, float, 16, 16, 16>();
+  hip_matrix_apply<double, double, 16, 16, 4>();
+}
diff --git a/sycl/test-e2e/Matrix/joint_matrix_hip_half_gfx90a.cpp b/sycl/test-e2e/Matrix/joint_matrix_hip_half_gfx90a.cpp
new file mode 100644
index 0000000000000..96aacbac9c280
--- /dev/null
+++ b/sycl/test-e2e/Matrix/joint_matrix_hip_half_gfx90a.cpp
@@ -0,0 +1,35 @@
+// RUN: %{build} -fsycl -fsycl-targets=amd_gpu_gfx90a %s -o %t.out
+// RUN: %{run} %t.out
+
+// REQUIRES: gpu-amd-gfx90a
+// REQUIRES: aspect-fp16
+
+#include "joint_matrix_hip_apply.hpp"
+#include "joint_matrix_hip_copy.hpp"
+#include "joint_matrix_hip_fill.hpp"
+#include "joint_matrix_hip_mfma.hpp"
+
+template <size_t KX> void half_matrix_mfma() {
+  hip_matrix_mfma<sycl::half, float, 32, 32, 8, KX, layout::row_major>();
+  hip_matrix_mfma<sycl::half, float, 16, 16, 16, KX, layout::row_major>();
+  hip_matrix_mfma<sycl::half, float, 32, 32, 8, KX, layout::col_major>();
+  hip_matrix_mfma<sycl::half, float, 16, 16, 16, KX, layout::col_major>();
+}
+
+int main() {
+  half_matrix_mfma<1>();
+  half_matrix_mfma<2>();
+  half_matrix_mfma<3>();
+  half_matrix_mfma<4>();
+
+  hip_matrix_copy<sycl::half, float, 32, 32, 8, layout::row_major>();
+  hip_matrix_copy<sycl::half, float, 16, 16, 16, layout::row_major>();
+  hip_matrix_copy<sycl::half, float, 32, 32, 8, layout::col_major>();
+  hip_matrix_copy<sycl::half, float, 16, 16, 16, layout::col_major>();
+
+  hip_matrix_fill<sycl::half, float, 32, 32, 8>();
+  hip_matrix_fill<sycl::half, float, 16, 16, 16>();
+
+  hip_matrix_apply<sycl::half, float, 32, 32, 8>();
+  hip_matrix_apply<sycl::half, float, 16, 16, 16>();
+}
diff --git a/sycl/test-e2e/Matrix/joint_matrix_hip_mfma.hpp b/sycl/test-e2e/Matrix/joint_matrix_hip_mfma.hpp
new file mode 100644
index 0000000000000..650bcbaa0908b
--- /dev/null
+++ b/sycl/test-e2e/Matrix/joint_matrix_hip_mfma.hpp
@@ -0,0 +1,115 @@
+
+#include <sycl/sycl.hpp>
+
+#include <cstddef>
+#include <cstdint>
+#include <random>
+
+using namespace sycl;
+using namespace sycl::ext::oneapi::experimental::matrix;
+using sycl::ext::oneapi::bfloat16;
+
+template <typename InType, typename OutType, size_t M, size_t N, size_t K,
+          size_t KX, layout OutLayout>
+void hip_matrix_mfma() {
+  InType A[M * K * KX];
+  InType B[K * N * KX];
+  OutType C[M * N];
+  OutType D[M * N];
+  OutType E[M * N];
+
+  std::mt19937 gen(0);
+  std::uniform_real_distribution<float> dist(-10, 10);
+
+  for (auto i = 0; i < M * K * KX; ++i) {
+    A[i] = static_cast<InType>(dist(gen));
+  }
+
+  for (auto i = 0; i < K * N * KX; ++i) {
+    B[i] = static_cast<InType>(dist(gen));
+  }
+
+  for (auto i = 0; i < M * N; ++i) {
+    D[i] = 0;
+    C[i] = static_cast<OutType>(dist(gen));
+    if (OutLayout == layout::row_major)
+      E[i] = C[i];
+    else
+      E[(i % N) * M + int(i / M)] = C[i];
+  }
+
+  try {
+    auto defaultQueue = sycl::queue{};
+
+    auto bufA = sycl::buffer{A, sycl::range{M * K * KX}};
+    auto bufB = sycl::buffer{B, sycl::range{K * N * KX}};
+    auto bufC = sycl::buffer{C, sycl::range{M * N}};
+    auto bufD = sycl::buffer{D, sycl::range{M * N}};
+
+    defaultQueue
+        .submit([&](sycl::handler &cgh) {
+          sycl::accessor accA{bufA, cgh, sycl::read_only};
+          sycl::accessor accB{bufB, cgh, sycl::read_only};
+          sycl::accessor accC{bufC, cgh, sycl::read_only};
+          sycl::accessor accD{bufD, cgh, sycl::write_only};
+
+          cgh.parallel_for(
+              sycl::nd_range<2>{{4, 16}, {4, 16}}, [=](sycl::nd_item<2> idx) {
+                auto sg = idx.get_sub_group();
+                joint_matrix<sub_group, OutType, use::accumulator, M, N> sub_c;
+                joint_matrix<sub_group, InType, use::b, K, N, layout::row_major>
+                    sub_b;
+                joint_matrix<sub_group, InType, use::a, M, K, layout::col_major>
+                    sub_a;
+
+                joint_matrix_load(
+                    sg, sub_c,
+                    accC.template get_multi_ptr<access::decorated::yes>(), N,
+                    layout::row_major);
+
+                for (auto kx = 0; kx < KX; ++kx) {
+                  joint_matrix_load(
+                      sg, sub_a,
+                      accA.template get_multi_ptr<access::decorated::yes>() +
+                          kx * K,
+                      K * KX);
+                  joint_matrix_load(
+                      sg, sub_b,
+                      accB.template get_multi_ptr<access::decorated::yes>() +
+                          kx * K * N,
+                      N);
+                  joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
+                }
+
+                joint_matrix_store(
+                    sg, sub_c,
+                    accD.template get_multi_ptr<access::decorated::yes>(), N,
+                    OutLayout);
+              });
+        })
+        .wait();
+
+    defaultQueue.throw_asynchronous();
+  } catch (const sycl::exception &e) {
+    std::cout << "Exception caught: " << e.what() << std::endl;
+  }
+
+  constexpr int LDA = K * KX;
+
+  for (auto m = 0; m < M; m++) {
+    for (auto n = 0; n < N; n++) {
+      OutType e = 0;
+      for (auto k = 0; k < LDA; k++) {
+        e += A[m * LDA + k] * B[k * N + n];
+      }
+      if (OutLayout == layout::row_major)
+        E[m * N + n] += e;
+      else
+        E[n * M + m] += e;
+    }
+  }
+
+  for (auto i = 0; i < M * N; ++i) {
+    assert(abs(D[i] - E[i]) < 100 && "Unexpected difference");
+  }
+};
diff --git a/sycl/test-e2e/lit.cfg.py b/sycl/test-e2e/lit.cfg.py
index 14c9dc4ba654a..a25323c6e50aa 100644
--- a/sycl/test-e2e/lit.cfg.py
+++ b/sycl/test-e2e/lit.cfg.py
@@ -275,6 +275,8 @@
     devices = set()
     sp = subprocess.check_output(sycl_ls, text=True)
     for line in sp.splitlines():
+        if "gfx90a" in line:
+            config.available_features.add("gpu-amd-gfx90a")
         (backend, device, _) = line[1:].split(':', 2)
         devices.add('{}:{}'.format(backend, device))
     config.sycl_devices = list(devices)
diff --git a/sycl/test/check_device_code/hip/matrix/matrix-hip-bfloat16-float-test.cpp b/sycl/test/check_device_code/hip/matrix/matrix-hip-bfloat16-float-test.cpp
new file mode 100644
index 0000000000000..9f3454d5ef83a
--- /dev/null
+++ b/sycl/test/check_device_code/hip/matrix/matrix-hip-bfloat16-float-test.cpp
@@ -0,0 +1,72 @@
+// REQUIRES: hip
+
+// RUN: %clangxx -fsycl-device-only -fsycl-targets=amd_gpu_gfx90a -S -Xclang -emit-llvm %s -o -| FileCheck %s
+
+#include <sycl/sycl.hpp>
+
+using namespace sycl;
+using namespace sycl::ext::oneapi::experimental::matrix;
+using sycl::ext::oneapi::bfloat16;
+
+int main() {
+
+  buffer<bfloat16, 1> bufA(nullptr, range<1>(1));
+  buffer<bfloat16, 1> bufB(nullptr, range<1>(1));
+  buffer<float, 1> bufC(nullptr, range<1>(1));
+  buffer<float, 1> bufD(nullptr, range<1>(1));
+
+  queue q;
+
+  q.submit([&](handler &cgh) {
+    sycl::accessor<bfloat16, 1, sycl::access::mode::read_write,
+                   sycl::target::device>
+        accA(bufA, cgh);
+    sycl::accessor<bfloat16, 1, sycl::access::mode::read_write,
+                   sycl::target::device>
+        accB(bufB, cgh);
+    sycl::accessor<float, 1, sycl::access::mode::read_write,
+                   sycl::target::device>
+        accC(bufC, cgh);
+    sycl::accessor<float, 1, sycl::access::mode::read_write,
+                   sycl::target::device>
+        accD(bufD, cgh);
+
+    cgh.parallel_for<class row_row_m16n16k16>(
+        nd_range<2>({1, 64}, {1, 64}),
+        [=](nd_item<2> item) [[sycl::reqd_work_group_size(1, 1, 64)]] {
+          sycl::sub_group sg = item.get_sub_group();
+
+          joint_matrix<sub_group, float, use::accumulator, 16, 16> sub_c{};
+          joint_matrix<sub_group, bfloat16, use::a, 16, 16, layout::row_major>
+              sub_a{};
+          joint_matrix<sub_group, bfloat16, use::b, 16, 16, layout::row_major>
+              sub_b{};
+
+          // CHECK: tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16bf16.1k(<4 x i16> %{{.*}}, <4 x i16> %{{.*}} <4 x float> zeroinitializer, i32 0, i32 0, i32 0)
+          sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c);
+          joint_matrix_store(
+              sg, sub_c, accD.template get_multi_ptr<access::decorated::yes>(),
+              16, layout::row_major);
+        });
+
+    cgh.parallel_for<class row_col_m32n32k8>(
+        nd_range<2>({1, 64}, {1, 64}),
+        [=](nd_item<2> item) [[sycl::reqd_work_group_size(1, 1, 64)]] {
+          sycl::sub_group sg = item.get_sub_group();
+
+          joint_matrix<sub_group, float, use::accumulator, 32, 32> sub_c{};
+          joint_matrix<sub_group, bfloat16, use::a, 32, 8, layout::row_major>
+              sub_a{};
+          joint_matrix<sub_group, bfloat16, use::b, 8, 32, layout::col_major>
+              sub_b{};
+
+          // CHECK: tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x8bf16.1k(<4 x i16> {{.*}}, <4 x i16> {{.*}}, <16 x float> zeroinitializer, i32 0, i32 0, i32 0)
+          sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c);
+          joint_matrix_store(
+              sg, sub_c, accD.template get_multi_ptr<access::decorated::yes>(),
+              32, layout::row_major);
+        });
+  });
+
+  return 0;
+};
diff --git a/sycl/test/check_device_code/hip/matrix/matrix-hip-double-double-test.cpp b/sycl/test/check_device_code/hip/matrix/matrix-hip-double-double-test.cpp
new file mode 100644
index 0000000000000..8475afee205b7
--- /dev/null
+++ b/sycl/test/check_device_code/hip/matrix/matrix-hip-double-double-test.cpp
@@ -0,0 +1,53 @@
+// REQUIRES: hip
+
+// RUN: %clangxx -fsycl-device-only -fsycl-targets=amd_gpu_gfx90a -S -Xclang -emit-llvm %s -o -| FileCheck %s
+
+#include <sycl/sycl.hpp>
+
+using namespace sycl;
+using namespace sycl::ext::oneapi::experimental::matrix;
+
+int main() {
+
+  buffer<double, 1> bufA(nullptr, range<1>(1));
+  buffer<double, 1> bufB(nullptr, range<1>(1));
+  buffer<double, 1> bufC(nullptr, range<1>(1));
+  buffer<double, 1> bufD(nullptr, range<1>(1));
+
+  queue q;
+
+  q.submit([&](handler &cgh) {
+    sycl::accessor<double, 1, sycl::access::mode::read_write,
+                   sycl::target::device>
+        accA(bufA, cgh);
+    sycl::accessor<double, 1, sycl::access::mode::read_write,
+                   sycl::target::device>
+        accB(bufB, cgh);
+    sycl::accessor<double, 1, sycl::access::mode::read_write,
+                   sycl::target::device>
+        accC(bufC, cgh);
+    sycl::accessor<double, 1, sycl::access::mode::read_write,
+                   sycl::target::device>
+        accD(bufD, cgh);
+
+    cgh.parallel_for<class row_row_m16n16k4>(
+        nd_range<2>({1, 64}, {1, 64}),
+        [=](nd_item<2> item) [[sycl::reqd_work_group_size(1, 1, 64)]] {
+          sycl::sub_group sg = item.get_sub_group();
+
+          joint_matrix<sub_group, double, use::accumulator, 16, 16> sub_c{};
+          joint_matrix<sub_group, double, use::a, 16, 4, layout::row_major>
+              sub_a{};
+          joint_matrix<sub_group, double, use::b, 4, 16, layout::row_major>
+              sub_b{};
+
+          // CHECK: tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %{{.*}}, double %{{.*}}, <4 x double> zeroinitializer, i32 0, i32 0, i32 0)
+          sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c);
+          joint_matrix_store(
+              sg, sub_c, accD.template get_multi_ptr<access::decorated::yes>(),
+              16, layout::row_major);
+        });
+  });
+
+  return 0;
+};
diff --git a/sycl/test/check_device_code/hip/matrix/matrix-hip-half-float-test.cpp b/sycl/test/check_device_code/hip/matrix/matrix-hip-half-float-test.cpp
new file mode 100644
index 0000000000000..9019233a1fa8b
--- /dev/null
+++ b/sycl/test/check_device_code/hip/matrix/matrix-hip-half-float-test.cpp
@@ -0,0 +1,71 @@
+// REQUIRES: hip
+
+// RUN: %clangxx -fsycl-device-only -fsycl-targets=amd_gpu_gfx90a -S -Xclang -emit-llvm %s -o -| FileCheck %s
+
+#include <sycl/sycl.hpp>
+
+using namespace sycl;
+using namespace sycl::ext::oneapi::experimental::matrix;
+
+int main() {
+
+  buffer<half, 1> bufA(nullptr, range<1>(1));
+  buffer<half, 1> bufB(nullptr, range<1>(1));
+  buffer<float, 1> bufC(nullptr, range<1>(1));
+  buffer<float, 1> bufD(nullptr, range<1>(1));
+
+  queue q;
+
+  q.submit([&](handler &cgh) {
+    sycl::accessor<half, 1, sycl::access::mode::read_write,
+                   sycl::target::device>
+        accA(bufA, cgh);
+    sycl::accessor<half, 1, sycl::access::mode::read_write,
+                   sycl::target::device>
+        accB(bufB, cgh);
+    sycl::accessor<float, 1, sycl::access::mode::read_write,
+                   sycl::target::device>
+        accC(bufC, cgh);
+    sycl::accessor<float, 1, sycl::access::mode::read_write,
+                   sycl::target::device>
+        accD(bufD, cgh);
+
+    cgh.parallel_for<class row_row_m16n16k16>(
+        nd_range<2>({1, 64}, {1, 64}),
+        [=](nd_item<2> item) [[sycl::reqd_work_group_size(1, 1, 64)]] {
+          sycl::sub_group sg = item.get_sub_group();
+
+          joint_matrix<sub_group, float, use::accumulator, 16, 16> sub_c{};
+          joint_matrix<sub_group, half, use::a, 16, 16, layout::row_major>
+              sub_a{};
+          joint_matrix<sub_group, half, use::b, 16, 16, layout::row_major>
+              sub_b{};
+
+          // CHECK: tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %{{.*}}, <4 x half> %{{.*}}, <4 x float> zeroinitializer, i32 0, i32 0, i32 0)
+          sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c);
+          joint_matrix_store(
+              sg, sub_c, accD.template get_multi_ptr<access::decorated::yes>(),
+              16, layout::row_major);
+        });
+
+    cgh.parallel_for<class row_col_m32n32k8>(
+        nd_range<2>({1, 64}, {1, 64}),
+        [=](nd_item<2> item) [[sycl::reqd_work_group_size(1, 1, 64)]] {
+          sycl::sub_group sg = item.get_sub_group();
+
+          joint_matrix<sub_group, float, use::accumulator, 32, 32> sub_c{};
+          joint_matrix<sub_group, half, use::a, 32, 8, layout::row_major>
+              sub_a{};
+          joint_matrix<sub_group, half, use::b, 8, 32, layout::col_major>
+              sub_b{};
+
+          // CHECK: tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x8f16(<4 x half> {{.*}}, <4 x half> {{.*}}, <16 x float> zeroinitializer, i32 0, i32 0, i32 0)
+          sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c);
+          joint_matrix_store(
+              sg, sub_c, accD.template get_multi_ptr<access::decorated::yes>(),
+              32, layout::row_major);
+        });
+  });
+
+  return 0;
+};
diff --git a/sycl/test/check_device_code/hip/matrix/matrix-hip-int8-int32-test.cpp b/sycl/test/check_device_code/hip/matrix/matrix-hip-int8-int32-test.cpp
new file mode 100644
index 0000000000000..f2c7b1ec8c08c
--- /dev/null
+++ b/sycl/test/check_device_code/hip/matrix/matrix-hip-int8-int32-test.cpp
@@ -0,0 +1,71 @@
+// REQUIRES: hip
+
+// RUN: %clangxx -fsycl-device-only -fsycl-targets=amd_gpu_gfx90a -S -Xclang -emit-llvm %s -o -| FileCheck %s
+
+#include <sycl/sycl.hpp>
+
+using namespace sycl;
+using namespace sycl::ext::oneapi::experimental::matrix;
+
+int main() {
+
+  buffer<int8_t, 1> bufA(nullptr, range<1>(1));
+  buffer<int8_t, 1> bufB(nullptr, range<1>(1));
+  buffer<int32_t, 1> bufC(nullptr, range<1>(1));
+  buffer<int32_t, 1> bufD(nullptr, range<1>(1));
+
+  queue q;
+
+  q.submit([&](handler &cgh) {
+    sycl::accessor<int8_t, 1, sycl::access::mode::read_write,
+                   sycl::target::device>
+        accA(bufA, cgh);
+    sycl::accessor<int8_t, 1, sycl::access::mode::read_write,
+                   sycl::target::device>
+        accB(bufB, cgh);
+    sycl::accessor<int32_t, 1, sycl::access::mode::read_write,
+                   sycl::target::device>
+        accC(bufC, cgh);
+    sycl::accessor<int32_t, 1, sycl::access::mode::read_write,
+                   sycl::target::device>
+        accD(bufD, cgh);
+
+    cgh.parallel_for<class row_row_m16n16k16>(
+        nd_range<2>({1, 64}, {1, 64}),
+        [=](nd_item<2> item) [[sycl::reqd_work_group_size(1, 1, 64)]] {
+          sycl::sub_group sg = item.get_sub_group();
+
+          joint_matrix<sub_group, int32_t, use::accumulator, 16, 16> sub_c{};
+          joint_matrix<sub_group, int8_t, use::a, 16, 16, layout::row_major>
+              sub_a{};
+          joint_matrix<sub_group, int8_t, use::b, 16, 16, layout::row_major>
+              sub_b{};
+
+          // CHECK: tail call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x16i8(i32 %{{.*}}, i32 %{{.*}}, <4 x i32> zeroinitializer, i32 0, i32 0, i32 0)
+          sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c);
+          joint_matrix_store(
+              sg, sub_c, accD.template get_multi_ptr<access::decorated::yes>(),
+              16, layout::row_major);
+        });
+
+    cgh.parallel_for<class row_col_m32n32k8>(
+        nd_range<2>({1, 64}, {1, 64}),
+        [=](nd_item<2> item) [[sycl::reqd_work_group_size(1, 1, 64)]] {
+          sycl::sub_group sg = item.get_sub_group();
+
+          joint_matrix<sub_group, int32_t, use::accumulator, 32, 32> sub_c{};
+          joint_matrix<sub_group, int8_t, use::a, 32, 8, layout::row_major>
+              sub_a{};
+          joint_matrix<sub_group, int8_t, use::b, 8, 32, layout::col_major>
+              sub_b{};
+
+          // CHECK: tail call <16 x i32> @llvm.amdgcn.mfma.i32.32x32x8i8(i32 {{.*}}, i32 {{.*}}, <16 x i32> zeroinitializer, i32 0, i32 0, i32 0)
+          sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c);
+          joint_matrix_store(
+              sg, sub_c, accD.template get_multi_ptr<access::decorated::yes>(),
+              32, layout::row_major);
+        });
+  });
+
+  return 0;
+};