From 5b527930655232b3d18e92725be7c8c36134c92d Mon Sep 17 00:00:00 2001 From: Kseniya Tikhomirova Date: Tue, 4 Jun 2024 18:15:06 +0200 Subject: [PATCH 01/55] [SYCL] Fix XPTI FW lib path for test-e2e on Win (#13986) This PR helps to avoid hardcoded path symbols and let OS dependent tools to work with paths. Original version produces `path/....\lib\xptifw.lib` which causes skip of library linkage on windows. `os.path.normpath` is the main change here that helps to handle path separators properly. Signed-off-by: Tikhomirova, Kseniya --- sycl/test-e2e/lit.cfg.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sycl/test-e2e/lit.cfg.py b/sycl/test-e2e/lit.cfg.py index 66496bf201119..effda4949e0a8 100644 --- a/sycl/test-e2e/lit.cfg.py +++ b/sycl/test-e2e/lit.cfg.py @@ -521,10 +521,11 @@ config.available_features.add("xptifw") config.substitutions.append(("%xptifw_dispatcher", xptifw_dispatcher)) if cl_options: + xptifw_lib_name = os.path.normpath(os.path.join(xptifw_lib_dir, "xptifw.lib")) config.substitutions.append( ( "%xptifw_lib", - " {}/xptifw.lib /I{} ".format(xptifw_lib_dir, xptifw_includes), + " {} /I{} ".format(xptifw_lib_name, xptifw_includes), ) ) else: From 18c4fb2c57f3b937451becda4ca25468397128f5 Mon Sep 17 00:00:00 2001 From: Pietro Ghiglio Date: Tue, 4 Jun 2024 18:37:40 +0200 Subject: [PATCH 02/55] [SYCL] [NATIVECPU] Report correct memory order capabilities for Native CPU (#13469) Testing for https://github.com/oneapi-src/unified-runtime/pull/1527 --- sycl/plugins/unified_runtime/CMakeLists.txt | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/sycl/plugins/unified_runtime/CMakeLists.txt b/sycl/plugins/unified_runtime/CMakeLists.txt index d45cd4325e959..2b6d119859bbe 100644 --- a/sycl/plugins/unified_runtime/CMakeLists.txt +++ b/sycl/plugins/unified_runtime/CMakeLists.txt @@ -130,7 +130,13 @@ if(SYCL_PI_UR_USE_FETCH_CONTENT) fetch_adapter_source(native_cpu ${UNIFIED_RUNTIME_REPO} - ${UNIFIED_RUNTIME_TAG} + # commit 
31ee5d536130a6d2ce09661db1bef3bd1cd4d705 + # Merge: ab151e98 53e43466 + # Author: Kenneth Benzie (Benie) + # Date: Tue Jun 4 14:10:39 2024 +0100 + # Merge pull request #1527 from PietroGhg/pietro/report_atomics + # [NATIVECPU] Report correct memory order capabilities for Native CPU + 31ee5d536130a6d2ce09661db1bef3bd1cd4d705 ) if(SYCL_PI_UR_OVERRIDE_FETCH_CONTENT_REPO) From dce651bd69ea12c935c70990ed3290007a00c6c5 Mon Sep 17 00:00:00 2001 From: Joe Todd Date: Wed, 5 Jun 2024 08:46:58 +0100 Subject: [PATCH 03/55] [SYCL][COMPAT] Migrate bug fixes & refactor of get_*version APIs (#14011) This PR migrates changes to `get_version`, `get_major_version` and `get_minor_version`, adds new free function equivalents which accept a `sycl::device` argument, and adds relevant tests and documentation. --------- Signed-off-by: Joe Todd --- sycl/doc/syclcompat/README.md | 7 ++ sycl/include/syclcompat/device.hpp | 79 +++++++++++++--------- sycl/test-e2e/syclcompat/device/device.cpp | 55 ++++++++++++++- 3 files changed, 108 insertions(+), 33 deletions(-) diff --git a/sycl/doc/syclcompat/README.md b/sycl/doc/syclcompat/README.md index 3e57851e768ef..6b776cfb777b3 100644 --- a/sycl/doc/syclcompat/README.md +++ b/sycl/doc/syclcompat/README.md @@ -924,6 +924,13 @@ class device_ext : public sycl::device { } // syclcompat ``` +Free functions are provided for querying major and minor version directly from a `sycl::device`, equivalent to the methods of `device_ext` described above: + +```c++ +static int get_major_version(const sycl::device &dev); +static int get_minor_version(const sycl::device &dev); +``` + #### Multiple devices SYCLcompat allows you to manage multiple devices through diff --git a/sycl/include/syclcompat/device.hpp b/sycl/include/syclcompat/device.hpp index b8b09fbc07d87..399efbd8b8933 100644 --- a/sycl/include/syclcompat/device.hpp +++ b/sycl/include/syclcompat/device.hpp @@ -59,6 +59,38 @@ namespace syclcompat { namespace detail { +static void parse_version_string(const 
std::string &ver, int &major, + int &minor) { + // Version string has the following format: + // a. OpenCL + // b. + // c. e.g gfx1030 + std::string::size_type i = 0; + while (i < ver.size()) { + if (isdigit(ver[i])) + break; + i++; + } + if (i < ver.size()) + major = std::stoi(&(ver[i])); + else + major = 0; + while (i < ver.size()) { + if (ver[i] == '.') + break; + i++; + } + i++; + if (i < ver.size()) + minor = std::stoi(&(ver[i])); + else + minor = 0; +} + +static void get_version(const sycl::device &dev, int &major, int &minor) { + std::string ver = dev.get_info(); + parse_version_string(ver, major, minor); +} /// SYCL default exception handler inline auto exception_handler = [](sycl::exception_list exceptions) { @@ -288,6 +320,18 @@ class device_info { int _image3d_max[3]; }; +static int get_major_version(const sycl::device &dev) { + int major, minor; + detail::get_version(dev, major, minor); + return major; +} + +static int get_minor_version(const sycl::device &dev) { + int major, minor; + detail::get_version(dev, major, minor); + return minor; +} + /// device extension class device_ext : public sycl::device { public: @@ -310,13 +354,9 @@ class device_ext : public sycl::device { } bool is_native_host_atomic_supported() { return false; } - int get_major_version() const { - return get_device_info().get_major_version(); - } + int get_major_version() const { return syclcompat::get_major_version(*this); } - int get_minor_version() const { - return get_device_info().get_minor_version(); - } + int get_minor_version() const { return syclcompat::get_minor_version(*this); } int get_max_compute_units() const { return get_device_info().get_max_compute_units(); @@ -618,32 +658,7 @@ Use 64 bits as memory_bus_width default value." } void get_version(int &major, int &minor) const { - // Version string has the following format: - // a. OpenCL - // b. - // c. 
e.g gfx1030 - std::string ver; - ver = get_info(); - std::string::size_type i = 0; - while (i < ver.size()) { - if (isdigit(ver[i])) - break; - i++; - } - major = std::stoi(&(ver[i])); - while (i < ver.size()) { - if (ver[i] == '.') - break; - i++; - } - if (i < ver.size()) { - // a. and b. - i++; - minor = std::stoi(&(ver[i])); - } else { - // c. - minor = 0; - } + detail::get_version(*this, major, minor); } void add_event(sycl::event event) { std::lock_guard lock(m_mutex); diff --git a/sycl/test-e2e/syclcompat/device/device.cpp b/sycl/test-e2e/syclcompat/device/device.cpp index 28d564d972e5e..9e4c8edcd91c9 100644 --- a/sycl/test-e2e/syclcompat/device/device.cpp +++ b/sycl/test-e2e/syclcompat/device/device.cpp @@ -107,6 +107,47 @@ void test_create_queue_arguments() { assert(!q_out_order.is_in_order()); } +void test_version_parsing_case(const std::string &ver_string, + int expected_major, int expected_minor) { + std::cout << __PRETTY_FUNCTION__ << std::endl; + int major; + int minor; + syclcompat::detail::parse_version_string(ver_string, major, minor); + if (major != expected_major || minor != expected_minor) { + std::cout << "Failed comparing " << ver_string << " major " << major + << " expected_major " << expected_major << " minor " << minor + << " expected_minor " << expected_minor << std::endl; + assert(false); + } + assert(major == expected_major); + assert(minor == expected_minor); +} + +void test_version_parsing() { + test_version_parsing_case("3.0", 3, 0); + test_version_parsing_case("3.0 NEO", 3, 0); + test_version_parsing_case("OpenCL 3.0 NEO", 3, 0); + test_version_parsing_case("OpenCL 3.0 (Build 0)", 3, 0); + test_version_parsing_case("8.6", 8, 6); + test_version_parsing_case("8.0", 8, 0); + test_version_parsing_case("7.5", 7, 5); + test_version_parsing_case("1.3", 1, 3); + test_version_parsing_case("11.4", 11, 4); + test_version_parsing_case("0.1", 0, 1); + test_version_parsing_case("gfx1030", 1030, 0); +} + +// We have *some* constraints on the major 
version that we can check +void test_major_version(sycl::device &dev, int major) { + auto backend = dev.get_backend(); + if (backend == sycl::backend::opencl) { + assert(major == 1 || major == 3); + } else if (backend == sycl::backend::ext_oneapi_level_zero || + backend == sycl::backend::ext_oneapi_cuda) { + assert(major < 99); + } +} + /* Device Extension Tests */ @@ -115,7 +156,8 @@ void test_device_ext_api() { DeviceExtFixt dev_ext; auto &dev_ = dev_ext.get_dev_ext(); dev_.is_native_host_atomic_supported(); - dev_.get_major_version(); + auto major = dev_.get_major_version(); + test_major_version(dev_, major); dev_.get_minor_version(); dev_.get_max_compute_units(); dev_.get_max_clock_frequency(); @@ -134,6 +176,15 @@ void test_device_ext_api() { auto Context = dev_.get_context(); } +void test_device_api() { + std::cout << __PRETTY_FUNCTION__ << std::endl; + DeviceExtFixt dev_ext; + auto &dev_ = dev_ext.get_dev_ext(); + auto major = get_major_version(dev_); + test_major_version(dev_, major); + get_minor_version(dev_); +} + void test_default_saved_queue() { std::cout << __PRETTY_FUNCTION__ << std::endl; DeviceExtFixt dev_ext; @@ -318,10 +369,12 @@ int main() { test_check_default_device(); test_create_queue_arguments(); test_device_ext_api(); + test_device_api(); test_default_saved_queue(); test_saved_queue(); test_reset(); test_device_info_api(); + test_version_parsing(); test_image_max_attrs(); test_max_nd_range(); From 20991b1c2ee906148706aa1e7ae62c1084834799 Mon Sep 17 00:00:00 2001 From: "Kenneth Benzie (Benie)" Date: Wed, 5 Jun 2024 08:48:18 +0100 Subject: [PATCH 04/55] [UR] Bump CUDA tag to 0e38fda0 (#14030) --- sycl/plugins/unified_runtime/CMakeLists.txt | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/sycl/plugins/unified_runtime/CMakeLists.txt b/sycl/plugins/unified_runtime/CMakeLists.txt index 2b6d119859bbe..80eb80062aa9a 100644 --- a/sycl/plugins/unified_runtime/CMakeLists.txt +++ b/sycl/plugins/unified_runtime/CMakeLists.txt @@ 
-120,7 +120,13 @@ if(SYCL_PI_UR_USE_FETCH_CONTENT) fetch_adapter_source(cuda ${UNIFIED_RUNTIME_REPO} - ${UNIFIED_RUNTIME_TAG} + # commit 0e38fda02ca00aaab28018240eb526da7dd08f56 + # Merge: f4968809 b7c8ac18 + # Author: Kenneth Benzie (Benie) + # Date: Tue Jun 4 14:18:15 2024 +0100 + # Merge pull request #1552 from mmoadeli/atomic-access-on-host-malloc + # [CUDA] Remove the support of concurrent atomic access to host allocated pinned memory. + 0e38fda02ca00aaab28018240eb526da7dd08f56 ) fetch_adapter_source(hip From 2838f40382bedddbda0a5f20ebeeba86310044da Mon Sep 17 00:00:00 2001 From: Ewan Crawford Date: Wed, 5 Jun 2024 09:20:03 +0100 Subject: [PATCH 05/55] [SYCL][Graph][L0] Correctly report when device supports update (#13987) Bump UR L0 commit to https://github.com/oneapi-src/unified-runtime/pull/1694 so that the SYCL device aspect for supporting update in graphs is correctly reported for L0 devices. Currently, support can be incorrectly reported. --------- Co-authored-by: Kenneth Benzie (Benie) --- sycl/plugins/unified_runtime/CMakeLists.txt | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/sycl/plugins/unified_runtime/CMakeLists.txt b/sycl/plugins/unified_runtime/CMakeLists.txt index 80eb80062aa9a..2ddd5d13754c6 100644 --- a/sycl/plugins/unified_runtime/CMakeLists.txt +++ b/sycl/plugins/unified_runtime/CMakeLists.txt @@ -110,7 +110,13 @@ if(SYCL_PI_UR_USE_FETCH_CONTENT) fetch_adapter_source(level_zero ${UNIFIED_RUNTIME_REPO} - ${UNIFIED_RUNTIME_TAG} + # commit 8e47ab5057458c5522ce5d86f7996b2882020220 + # Merge: b816fe82 3d3aba65 + # Author: Kenneth Benzie (Benie) + # Date: Tue Jun 4 17:28:43 2024 +0100 + # Merge pull request #1694 from Bensuo/ewan/L0_update_query + # Set minimum L0 device support to support UR kernel update + 8e47ab5057458c5522ce5d86f7996b2882020220 ) fetch_adapter_source(opencl From 493e78be6020ef436634b21d93069467fa6c69e7 Mon Sep 17 00:00:00 2001 From: Ewan Crawford Date: Wed, 5 Jun 2024 09:28:17 +0100 Subject: [PATCH 
06/55] [SYCL][Graph] Fix PI Kernel leak in graph update (#14029) Running the `Graph/Update` E2E tests on Level Zero with `UR_L0_LEAKS_DEBUG=1` shows that we are leaking a PI kernel and module. On investigation this was because we are retaining these objects in `getOrCreateKernel()` but not releasing them. Added release calls similar to how it is done in [enqueueImpCommandBufferKernel](https://github.com/intel/llvm/blob/b49303c7e13ca0a69454eaaaeb8c3d094916218d/sycl/source/detail/scheduler/commands.cpp#L2550) by the scheduler --- sycl/source/detail/graph_impl.cpp | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/sycl/source/detail/graph_impl.cpp b/sycl/source/detail/graph_impl.cpp index 5071c3d982066..329eab2aaf832 100644 --- a/sycl/source/detail/graph_impl.cpp +++ b/sycl/source/detail/graph_impl.cpp @@ -1303,6 +1303,7 @@ void exec_graph_impl::updateImpl(std::shared_ptr Node) { auto NDRDesc = ExecCG.MNDRDesc; pi_kernel PiKernel = nullptr; + pi_program PiProgram = nullptr; auto Kernel = ExecCG.MSyclKernel; auto KernelBundleImplPtr = ExecCG.MKernelBundle; std::shared_ptr SyclKernelImpl = nullptr; @@ -1326,7 +1327,7 @@ void exec_graph_impl::updateImpl(std::shared_ptr Node) { PiKernel = Kernel->getHandleRef(); EliminatedArgMask = Kernel->getKernelArgMask(); } else { - std::tie(PiKernel, std::ignore, EliminatedArgMask, std::ignore) = + std::tie(PiKernel, std::ignore, EliminatedArgMask, PiProgram) = sycl::detail::ProgramManager::getInstance().getOrCreateKernel( ContextImpl, DeviceImpl, ExecCG.MKernelName); } @@ -1450,6 +1451,12 @@ void exec_graph_impl::updateImpl(std::shared_ptr Node) { sycl::detail::PiApiKind::piextCommandBufferUpdateKernelLaunch>( Command, &UpdateDesc); + if (PiProgram) { + // We retained these objects by calling getOrCreateKernel() + Plugin->call(PiKernel); + Plugin->call(PiProgram); + } + if (Res != PI_SUCCESS) { throw sycl::exception(errc::invalid, "Error updating command_graph"); } From 0cec12826baea60a15483081b0feece49013049f 
Mon Sep 17 00:00:00 2001 From: "Kenneth Benzie (Benie)" Date: Wed, 5 Jun 2024 11:20:25 +0100 Subject: [PATCH 07/55] [UR] Bump HIP tag to 399430da (#14037) --- sycl/plugins/unified_runtime/CMakeLists.txt | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/sycl/plugins/unified_runtime/CMakeLists.txt b/sycl/plugins/unified_runtime/CMakeLists.txt index 2ddd5d13754c6..7257b5e1734a9 100644 --- a/sycl/plugins/unified_runtime/CMakeLists.txt +++ b/sycl/plugins/unified_runtime/CMakeLists.txt @@ -137,7 +137,13 @@ if(SYCL_PI_UR_USE_FETCH_CONTENT) fetch_adapter_source(hip ${UNIFIED_RUNTIME_REPO} - ${UNIFIED_RUNTIME_TAG} + # commit 399430da9613269a0dc8ddf605415942e0a37e9c + # Merge: 39cb69ab 10b04891 + # Author: Kenneth Benzie (Benie) + # Date: Tue Jun 4 17:15:33 2024 +0100 + # Merge pull request #1688 from frasercrmck/hip-enqueue-mem-readwritecopy + # [HIP] Various fixes for urEnqueueMemImage(Read|Write|Copy) + 399430da9613269a0dc8ddf605415942e0a37e9c ) fetch_adapter_source(native_cpu From f6659683d9c559973b3bf31d15ebc48c1397b0a3 Mon Sep 17 00:00:00 2001 From: JackAKirk Date: Wed, 5 Jun 2024 17:06:19 +0100 Subject: [PATCH 08/55] [CI][CUDA] Uplift docker to use cuda 12.5 image. (#14049) This upgrades the docker to use the cuda 12.5 image. I've ran the test-e2e locally using cuda 12.5 and all is well. 
cuda 12.5 also fixed an issue introduced by the cuda 12.4 driver: see https://github.com/intel/llvm/pull/13661#issuecomment-2140088722 Signed-off-by: JackAKirk --- devops/containers/ubuntu2204_build.Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/devops/containers/ubuntu2204_build.Dockerfile b/devops/containers/ubuntu2204_build.Dockerfile index 313b455dbc25b..cc511a23823c6 100644 --- a/devops/containers/ubuntu2204_build.Dockerfile +++ b/devops/containers/ubuntu2204_build.Dockerfile @@ -1,4 +1,4 @@ -FROM nvidia/cuda:12.1.0-devel-ubuntu22.04 +FROM nvidia/cuda:12.5.0-devel-ubuntu22.04 ENV DEBIAN_FRONTEND=noninteractive From 643d21f9d2f408bd9c844041c23bd185f14c5f11 Mon Sep 17 00:00:00 2001 From: fineg74 <61437305+fineg74@users.noreply.github.com> Date: Wed, 5 Jun 2024 09:32:56 -0700 Subject: [PATCH 09/55] [ESIMD] Allow full autodeduction for block_load/block_store and slm_block_load/slm_block_store APIs accepting simd_view (#13978) Co-authored-by: Nick Sarnie --- sycl/include/sycl/ext/intel/esimd/memory.hpp | 1095 +++++++++++++++-- .../esimd/memory_properties_load_store.cpp | 128 +- 2 files changed, 1105 insertions(+), 118 deletions(-) diff --git a/sycl/include/sycl/ext/intel/esimd/memory.hpp b/sycl/include/sycl/ext/intel/esimd/memory.hpp index b8f36e8f57255..5eabda41a4df4 100644 --- a/sycl/include/sycl/ext/intel/esimd/memory.hpp +++ b/sycl/include/sycl/ext/intel/esimd/memory.hpp @@ -1971,6 +1971,53 @@ block_load(const T *ptr, simd_mask<1> pred, simd pass_thru, return detail::block_load_impl(ptr, pred, pass_thru); } +/// simd block_load(const T* ptr, simd_mask<1> pred, +/// PassThruSimdViewT pass_thru, props={}); +/// Variation of the API that allows using \c simd_view without specifying +/// \c T and \c N template parameters. +/// This function loads a contiguous memory block from USM pointer \p ptr. If +/// the predicate \p pred is set to 0, then the load is omitted and the vector +/// \p pass_thru is returned. 
+/// +/// This function has temporary restrictions. See details in the 'Restrictions' +/// section below. The restrictions will be relaxed in the future. +/// +/// The parameter \p props specifies the optional compile-time properties +/// of the type esimd::properties and may include esimd::cache_hint_L1, +/// esimd::cache_hint_L2, esimd::alignment. Other properties are ignored. +/// +/// Cache hints: If \p props does not specify any L1 or L2 cache hints, then +/// the cache_hint::none value is assumed by default. +/// +/// Alignment: If \p props does not specify the 'alignment' property, then +/// the default assumed alignment is the minimally required element-size +/// alignment. Note that additional/temporary restrictions are applied +/// (see Restrictions below). +/// +/// Restrictions - cache hint and mask imposed - temporary: +/// R1: The pointer must be at least 4-byte aligned for elements of 4-bytes or +/// smaller and 8-byte aligned for 8-byte elements. +/// R2: The number of elements for 8-byte data: 1, 2, 3, 4, 8, 16, 32, 64; +/// for 4-byte data: 1, 2, 3, 4, 8, 16, 32, 64, +/// or 128(only if alignment is 8-bytes or more); +/// for 2-byte data: 2, 4, 6, 8, 16, 32, 64, 128, +/// or 256(only if alignment is 8-bytes or more); +/// for 1-byte data: 4, 8, 12, 16, 32, 64, 128, 256, +/// or 512(only if alignment is 8-bytes or more). +/// R3: The target device must be DG2, PVC or newer GPU. 
+template < + typename PassThruSimdViewT, typename T, + int N = PassThruSimdViewT::getSizeX() * PassThruSimdViewT::getSizeY(), + typename PropertyListT = ext::oneapi::experimental::empty_properties_t> +__ESIMD_API std::enable_if_t< + detail::is_simd_view_type_v && + ext::oneapi::experimental::is_property_list_v, + simd> +block_load(const T *ptr, simd_mask<1> pred, PassThruSimdViewT pass_thru, + PropertyListT props = {}) { + return block_load(ptr, pred, pass_thru.read(), props); +} + /// simd block_load(const T* ptr, size_t byte_offset, /// simd_mask<1> pred, simd pass_thru, /// props={}); // (usm-bl-6) @@ -2017,6 +2064,55 @@ block_load(const T *ptr, size_t byte_offset, simd_mask<1> pred, return block_load(AdjustedPtr, pred, pass_thru, props); } +/// simd block_load(const T* ptr, size_t byte_offset, +/// simd_mask<1> pred, PassThruSimdViewT pass_thru, +/// props={}); +/// Variation of the API that allows to use \c simd_view without specifying +/// \c T and \c N template parameters. +/// This function loads a contiguous memory block from address referenced +/// by USM pointer \p ptr and the given \p byte_offset. +/// If the predicate \p pred is set to 0, then the load is omitted and the +/// vector \p pass_thru is returned. +/// +/// This function has temporary restrictions. See details in the 'Restrictions' +/// section below. The restrictions will be relaxed in the future. +/// +/// The parameter \p props specifies the optional compile-time properties +/// of the type esimd::properties and may include esimd::cache_hint_L1, +/// esimd::cache_hint_L2, esimd::alignment. Other properties are ignored. +/// +/// Cache hints: If \p props does not specify any L1 or L2 cache hints, then +/// the cache_hint::none value is assumed by default. +/// +/// Alignment: If \p props does not specify the 'alignment' property, then +/// the default assumed alignment is the minimally required element-size +/// alignment. 
Note that additional/temporary restrictions are applied +/// (see Restrictions below). +/// +/// Restrictions - cache hint and mask imposed - temporary: +/// R1: The pointer must be at least 4-byte aligned for elements of 4-bytes or +/// smaller and 8-byte aligned for 8-byte elements. +/// R2: The number of elements for 8-byte data: 1, 2, 3, 4, 8, 16, 32, 64; +/// for 4-byte data: 1, 2, 3, 4, 8, 16, 32, 64, +/// or 128(only if alignment is 8-bytes or more); +/// for 2-byte data: 2, 4, 6, 8, 16, 32, 64, 128, +/// or 256(only if alignment is 8-bytes or more); +/// for 1-byte data: 4, 8, 12, 16, 32, 64, 128, 256, +/// or 512(only if alignment is 8-bytes or more). +/// R3: The target device must be DG2, PVC or newer GPU. +template < + typename PassThruSimdViewT, typename T, + int N = PassThruSimdViewT::getSizeX() * PassThruSimdViewT::getSizeY(), + typename PropertyListT = ext::oneapi::experimental::empty_properties_t> +__ESIMD_API std::enable_if_t< + detail::is_simd_view_type_v && + ext::oneapi::experimental::is_property_list_v, + simd> +block_load(const T *ptr, size_t byte_offset, simd_mask<1> pred, + PassThruSimdViewT pass_thru, PropertyListT props = {}) { + return block_load(ptr, byte_offset, pred, pass_thru.read(), props); +} + /// Loads a contiguous block of memory from the given memory address \p addr /// and returns the loaded data as a vector. /// The generated code depends on the combination {T, N, Flags}. @@ -2294,6 +2390,57 @@ block_load(AccessorT acc, detail::DeviceAccessorOffsetT byte_offset, pass_thru); } +/// simd +/// block_load(AccessorT acc, OffsetT byte_offset, simd_mask<1> pred, +/// PassThruSimdViewT pass_thru, props = {}); +/// Variation of the API that allows to use \c simd_view without specifying +/// \c T and \c N template parameters. +/// This function loads a contiguous memory block referenced +/// by accessor \p acc and the given \p byte_offset. 
+/// If the predicate \p pred is set to 0, then the load is omitted and the +/// \p pass_thru value is returned. +/// +/// The parameter \p props specifies the optional compile-time properties +/// of the type esimd::properties and may include esimd::cache_hint_L1, +/// esimd::cache_hint_L2, esimd::alignment. Other properties are ignored. +/// +/// Cache hints: If \p props does not specify any L1 or L2 cache hints, then +/// the cache_hint::none value is assumed by default. +/// +/// Alignment: If \p props does not specify the 'alignment' property, then +/// the \p byte_offset must be at least 4-byte aligned for elements of 4-bytes +/// or smaller and 8-byte aligned for 8-byte elements. +/// +/// Restrictions - cache hint and predicate imposed - temporary: +/// R1: \p byte_offset must be at least 4-byte aligned for elements of 4-bytes +/// or smaller and 8-byte aligned for 8-byte elements. +/// R2: The number of elements must be: +/// for 8-byte data: 1, 2, 3, 4, 8, 16, 32(max for DG2), 64; +/// for 4-byte data: 1, 2, 3, 4, 8, 16, 32, 64(max for DG2), +/// or 128(only if alignment is 8-bytes or more); +/// for 2-byte data: 2, 4, 6, 8, 16, 32, 64, 128(max for DG2), +/// or 256(only if alignment is 8-bytes or more); +/// for 1-byte data: 4, 8, 12, 16, 32, 64, 128, 256(max for DG2), +/// or 512(only if alignment is 8-bytes or more). +/// R3: The target device must be DG2, PVC or newer GPU. 
+template < + typename PassThruSimdViewT, + typename T = PassThruSimdViewT::value_type::element_type, + int N = PassThruSimdViewT::getSizeX() * PassThruSimdViewT::getSizeY(), + typename AccessorT, + typename PropertyListT = ext::oneapi::experimental::empty_properties_t> +__ESIMD_API std::enable_if_t< + detail::is_simd_view_type_v && + ext::oneapi::experimental::is_property_list_v && + detail::is_device_accessor_with_v, + simd> +block_load(AccessorT acc, detail::DeviceAccessorOffsetT byte_offset, + simd_mask<1> pred, PassThruSimdViewT pass_thru, + PropertyListT props = {}) { + return block_load(acc, byte_offset, pred, pass_thru.read(), props); +} + /// simd /// block_load(AccessorT acc, OffsetT byte_offset, simd_mask<1> pred, /// props = {}); // (acc-bl-4) @@ -2383,6 +2530,53 @@ block_load(AccessorT acc, simd_mask<1> pred, simd pass_thru, return block_load(acc, 0, pred, pass_thru, NewPropertyListT{}); } +/// block_load(AccessorT acc, simd_mask<1> pred, +/// PassThruSimdViewT pass_thru, props = {}); +/// +/// Variation of the API that allows to use \c simd_view without specifying +/// \c T and \c N template parameters. +/// This function loads a contiguous memory block referenced +/// by accessor \p acc and implied offset=0. +/// If the predicate \p pred is set to 0, then the load is omitted and the +/// \p pass_thru value is returned. +/// +/// The parameter \p props specifies the optional compile-time properties +/// of the type esimd::properties and may include esimd::cache_hint_L1, +/// esimd::cache_hint_L2. Other properties are ignored. If \p props +/// specifies the alignment property, then it is ignored because this +/// variant implies zero offset, which means the most favourable 16-byte +/// alignment is used. +/// +/// Cache hints: If \p props does not specify any L1 or L2 cache hints, then +/// the cache_hint::none value is assumed by default. 
+/// +/// Restrictions - cache hint and predicate imposed - temporary: +/// R1: The number of elements must be: +/// for 8-byte data: 1, 2, 3, 4, 8, 16, 32(max for DG2), 64; +/// for 4-byte data: 1, 2, 3, 4, 8, 16, 32, 64(max for DG2), +/// or 128(only if alignment is 8-bytes or more); +/// for 2-byte data: 2, 4, 6, 8, 16, 32, 64, 128(max for DG2), +/// or 256(only if alignment is 8-bytes or more); +/// for 1-byte data: 4, 8, 12, 16, 32, 64, 128, 256(max for DG2), +/// or 512(only if alignment is 8-bytes or more). +/// R2: The target device must be DG2, PVC or newer GPU. +template < + typename PassThruSimdViewT, + typename T = PassThruSimdViewT::value_type::element_type, + int N = PassThruSimdViewT::getSizeX() * PassThruSimdViewT::getSizeY(), + typename AccessorT, + typename PropertyListT = ext::oneapi::experimental::empty_properties_t> +__ESIMD_API std::enable_if_t< + detail::is_simd_view_type_v && + ext::oneapi::experimental::is_property_list_v && + detail::is_device_accessor_with_v, + simd> +block_load(AccessorT acc, simd_mask<1> pred, PassThruSimdViewT pass_thru, + PropertyListT props = {}) { + return block_load(acc, pred, pass_thru.read(), props); +} + /// simd /// block_load(AccessorT acc, simd_mask<1> pred, props = {}); // (acc-bl-6) /// This function loads a contiguous memory block referenced @@ -2638,29 +2832,15 @@ block_store(T *ptr, size_t byte_offset, simd vals, simd_mask<1> pred, block_store(AdjustedPtr, vals, pred, props); } -/// Each of the following block_store functions stores the vector 'vals' to a -/// contiguous memory block at the address referenced by accessor 'acc', or from -/// 'acc + byte_offset', The parameter 'pred' is the one element predicate. If -/// it is set to 1, then all 'N' elements are stored. Otherwise, the block store -/// operation is a NO-OP. 
The parameter 'props' specifies the optional -/// compile-time properties of the type esimd::properties and may include -/// esimd::cache_hint_L1, esimd::cache_hint_L2, esimd::cache_hint_L3, -/// esimd::alignment. - -/// void block_store(AccessorT acc, OffsetT byte_offset, // (acc-bs-1) -/// simd vals, props = {}); - -/// void block_store(AccessorT acc, simd vals, props = {}); // (acc-bs-2) -/// void block_store(AccessorT acc, OffsetT byte_offset, // (acc-bs-3) -/// simd vals, simd_mask<1> pred, props = {}); - -/// void block_store(AccessorT acc, simd vals, // (acc-bs-4) -/// simd_mask<1> pred, props = {}); - -/// void block_store(AccessorT acc, OffsetT byte_offset, // (acc-bs-1) -/// simd vals, props = {}); -/// This function stores a contiguous memory block to -/// accessor \p acc and \p byte_offset with data specified by \p vals. +/// void block_store(T* ptr, ValuesSimdViewT vals, props={}); +/// Variation of the API that allows to use \c simd_view without specifying +/// \c T and \c N template parameters. +/// This function stores a contiguous memory block to USM pointer \p ptr +/// with data specified by \p vals. +/// +/// There may be temporary restrictions depending on L1, L2 cache hints, +/// See details in the 'Restrictions' section below. The restrictions will be +/// relaxed in the future. /// /// The parameter \p props specifies the optional compile-time properties /// of the type esimd::properties and may include esimd::cache_hint_L1, @@ -2670,57 +2850,259 @@ block_store(T *ptr, size_t byte_offset, simd vals, simd_mask<1> pred, /// the cache_hint::none value is assumed by default. /// /// Alignment: If \p props does not specify the 'alignment' property, then -/// the \p byte_offset must be at least 16-byte aligned if (!(b) && (c)) -/// from the below restrictions, and must be at least 4-byte aligned for -/// elements of 4-bytes or smaller and 8-byte aligned for 8-byte elements -/// otherwise. 
If the 'alignment' property is specified as less than 16 bytes, -/// then the target device must be DG2 or PVC (not Gen12). The alignment -/// requirement may be less strict if stateless memory mode is ON, see -/// block_store(usm_ptr, props) (aka usm-bs-01) for details/requirements. -/// -/// Restrictions: there may be some extra restrictions depending on -/// a) stateless memory mode enforcement is ON, -/// b) cache hints are used, -/// c) number of bytes stored is either 16,32,64, or 128. -/// d) the 'alignment' property is specified as less than 16 bytes. -/// -/// If (b) || !(c) || (d), then the target device must be DG2 or PVC (not -/// Gen12). -/// If (a) && !(b), then there is no restriction on the number of -/// elements to be stored and \p byte_offset must be only element-aligned. +/// the default assumed alignment is 16 bytes if \p props does not specify any +/// L1 or L2 cache hints, and the minimally required element-size +/// alignment otherwise. Note that additional/temporary restrictions may apply +/// (see Restrictions below). /// -/// Gen12 requirements: !(b) && (c) && !(d). -/// It can store 16-, 32-, 64-, or 128-bytes only. -/// DG2/PVC requirements: -/// It can store such number of elements depending on the type 'T': -/// for 8-byte data: 1, 2, 3, 4, 8, 16, 32(max for DG2), 64; -/// for 4-byte data: 1, 2, 3, 4, 8, 16, 32, 64(max for DG2), +/// Restrictions - cache hint imposed - temporary: +/// If L1 or L2 cache hint is passed, then: +/// R1: The pointer must be at least 4-byte aligned for elements of 4-bytes or +/// smaller and 8-byte aligned for 8-byte elements. 
+/// R2: The number of elements for 8-byte data: 1, 2, 3, 4, 8, 16, 32, 64; +/// for 4-byte data: 1, 2, 3, 4, 8, 16, 32, 64, /// or 128(only if alignment is 8-bytes or more); -/// for 2-byte data: 2, 4, 6, 8, 16, 32, 64, 128(max for DG2), +/// for 2-byte data: 2, 4, 6, 8, 16, 32, 64, 128, /// or 256(only if alignment is 8-bytes or more); -/// for 1-byte data: 4, 8, 12, 16, 32, 64, 128, 256(max for DG2), +/// for 1-byte data: 4, 8, 12, 16, 32, 64, 128, 256, /// or 512(only if alignment is 8-bytes or more). +/// R3: The target device must be DG2, PVC or newer GPU. template < - typename T, int N, typename AccessorT, + typename ValuesSimdViewT, typename T, + int N = ValuesSimdViewT::getSizeX() * ValuesSimdViewT::getSizeY(), typename PropertyListT = ext::oneapi::experimental::empty_properties_t> -__ESIMD_API std::enable_if_t< - ext::oneapi::experimental::is_property_list_v && - detail::is_device_accessor_with_v> -block_store(AccessorT acc, detail::DeviceAccessorOffsetT byte_offset, - simd vals, PropertyListT props = {}) { -#ifdef __ESIMD_FORCE_STATELESS_MEM - block_store(detail::accessorToPointer(acc, byte_offset), vals, - props); -#else - constexpr int DefaultLSCAlignment = (sizeof(T) <= 4) ? 4 : sizeof(T); - constexpr size_t Alignment = - detail::getPropertyValue( - DefaultLSCAlignment); - constexpr bool AlignmentRequiresLSC = - PropertyListT::template has_property() && Alignment < 16; - using Tx = detail::__raw_t; - constexpr unsigned Sz = sizeof(Tx) * N; +__ESIMD_API std::enable_if_t && + detail::is_property_list_v> +block_store(T *ptr, ValuesSimdViewT vals, PropertyListT props = {}) { + block_store(ptr, vals.read(), props); +} + +/// void block_store(T* ptr, size_t byte_offset, +/// ValuesSimdViewT vals, props={}); +/// Variation of the API that allows to use \c simd_view without specifying +/// \c T and \c N template parameters. 
+/// This function stores a contiguous memory block to USM pointer \p ptr and +/// byte-offset \p byte_offset with data specified by \p vals. +/// +/// There may be temporary restrictions depending on L1, L2 cache hints, +/// See details in the 'Restrictions' section below. The restrictions will be +/// relaxed in the future. +/// +/// The parameter \p props specifies the optional compile-time properties +/// of the type esimd::properties and may include esimd::cache_hint_L1, +/// esimd::cache_hint_L2, esimd::alignment. Other properties are ignored. +/// +/// Cache hints: If \p props does not specify any L1 or L2 cache hints, then +/// the cache_hint::none value is assumed by default. +/// +/// Alignment: If \p props does not specify the 'alignment' property, then +/// the default assumed alignment is 16 bytes if \p props does not specify any +/// L1 or L2 cache hints, and the minimally required element-size +/// alignment otherwise. Note that additional/temporary restrictions may apply +/// (see Restrictions below). +/// +/// Restrictions - cache hint imposed - temporary: +/// If L1 or L2 cache hint is passed, then: +/// R1: The pointer plus byte offset must be at least 4-byte aligned for +/// elements of 4-bytes or smaller and 8-byte aligned for 8-byte elements. +/// R2: The number of elements for 8-byte data: 1, 2, 3, 4, 8, 16, 32, 64; +/// for 4-byte data: 1, 2, 3, 4, 8, 16, 32, 64, +/// or 128(only if alignment is 8-bytes or more); +/// for 2-byte data: 2, 4, 6, 8, 16, 32, 64, 128, +/// or 256(only if alignment is 8-bytes or more); +/// for 1-byte data: 4, 8, 12, 16, 32, 64, 128, 256, +/// or 512(only if alignment is 8-bytes or more). +/// R3: The target device must be DG2, PVC or newer GPU. 
+template < + typename ValuesSimdViewT, typename T, + int N = ValuesSimdViewT::getSizeX() * ValuesSimdViewT::getSizeY(), + typename PropertyListT = ext::oneapi::experimental::empty_properties_t> +__ESIMD_API std::enable_if_t< + detail::is_simd_view_type_v && + ext::oneapi::experimental::is_property_list_v> +block_store(T *ptr, size_t byte_offset, ValuesSimdViewT vals, + PropertyListT props = {}) { + block_store(ptr, byte_offset, vals.read(), props); +} + +/// void block_store(T* ptr, ValuesSimdViewT vals, +/// simd_mask<1> pred, props={}); +/// Variation of the API that allows to use \c simd_view without specifying +/// \c T and \c N template parameters. +/// This function stores a contiguous memory block to USM pointer \p ptr +/// with data specified by \p vals. If the predicate \p pred is set to 0, +/// then the store is omitted. +/// +/// There are temporary restrictions. See details in the 'Restrictions' +/// section below. The restrictions will be relaxed in the future. +/// +/// The parameter \p props specifies the optional compile-time properties +/// of the type esimd::properties and may include esimd::cache_hint_L1, +/// esimd::cache_hint_L2, esimd::alignment. Other properties are ignored. +/// +/// Cache hints: If \p props does not specify any L1 or L2 cache hints, then +/// the cache_hint::none value is assumed by default. +/// +/// Alignment: If \p props does not specify the 'alignment' property, then +/// the default assumed alignment is the minimally required element-size +/// alignment. Note that additional/temporary restrictions apply (see +/// Restrictions below). +/// +/// Restrictions - predicate imposed - temporary: +/// R1: The pointer must be at least 4-byte aligned for elements of 4-bytes or +/// smaller and 8-byte aligned for 8-byte elements. 
+/// R2: The number of elements for 8-byte data: 1, 2, 3, 4, 8, 16, 32, 64; +/// for 4-byte data: 1, 2, 3, 4, 8, 16, 32, 64, +/// or 128(only if alignment is 8-bytes or more); +/// for 2-byte data: 2, 4, 6, 8, 16, 32, 64, 128, +/// or 256(only if alignment is 8-bytes or more); +/// for 1-byte data: 4, 8, 12, 16, 32, 64, 128, 256, +/// or 512(only if alignment is 8-bytes or more). +/// R3: The target device must be DG2, PVC or newer GPU. +template < + typename ValuesSimdViewT, typename T, + int N = ValuesSimdViewT::getSizeX() * ValuesSimdViewT::getSizeY(), + typename PropertyListT = ext::oneapi::experimental::empty_properties_t> +__ESIMD_API std::enable_if_t && + detail::is_property_list_v> +block_store(T *ptr, ValuesSimdViewT vals, simd_mask<1> pred, + PropertyListT props = {}) { + block_store(ptr, vals.read(), pred, props); +} + +/// void block_store(T* ptr, size_t byte_offset, +/// ValuesSimdViewT vals, simd_mask<1> pred, props={}); +/// Variation of the API that allows to use \c simd_view without specifying +/// \c T and \c N template parameters. +/// This function stores a contiguous memory block to USM pointer \p ptr +/// and byte-offset \p byte_offset with data specified by \p vals. +/// If the predicate \p pred is set to 0, then the store is omitted. +/// +/// There may be temporary restrictions depending on L1, L2 cache hints, +/// See details in the 'Restrictions' section below. The restrictions will be +/// relaxed in the future. +/// +/// The parameter \p props specifies the optional compile-time properties +/// of the type esimd::properties and may include esimd::cache_hint_L1, +/// esimd::cache_hint_L2, esimd::alignment. Other properties are ignored. +/// +/// Cache hints: If \p props does not specify any L1 or L2 cache hints, then +/// the cache_hint::none value is assumed by default. 
+///
+/// Alignment: If \p props does not specify the 'alignment' property, then
+/// the default assumed alignment is 16 bytes if \p props does not specify any
+/// L1 or L2 cache hints and \p pred is set to 1, and
+/// the minimally required element-size alignment otherwise.
+/// Note that additional/temporary restrictions may apply
+/// (see Restrictions below).
+///
+/// Restrictions - cache hint or predicate imposed - temporary:
+/// If a predicate, L1 or L2 cache hint is passed, then:
+/// R1: The pointer plus byte offset must be at least 4-byte aligned for
+/// elements of 4-bytes or smaller and 8-byte aligned for 8-byte elements.
+/// R2: The number of elements for 8-byte data: 1, 2, 3, 4, 8, 16, 32, 64;
+///     for 4-byte data: 1, 2, 3, 4, 8, 16, 32, 64,
+///                      or 128(only if alignment is 8-bytes or more);
+///     for 2-byte data: 2, 4, 6, 8, 16, 32, 64, 128,
+///                      or 256(only if alignment is 8-bytes or more);
+///     for 1-byte data: 4, 8, 12, 16, 32, 64, 128, 256,
+///                      or 512(only if alignment is 8-bytes or more).
+/// R3: The target device must be DG2, PVC or newer GPU.
+template <
+    typename ValuesSimdViewT, typename T,
+    int N = ValuesSimdViewT::getSizeX() * ValuesSimdViewT::getSizeY(),
+    typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
+__ESIMD_API std::enable_if_t<
+    detail::is_simd_view_type_v &&
+    ext::oneapi::experimental::is_property_list_v>
+block_store(T *ptr, size_t byte_offset, ValuesSimdViewT vals, simd_mask<1> pred,
+            PropertyListT props = {}) {
+  block_store(ptr, byte_offset, vals.read(), pred, props);
+}
+
+/// Each of the following block_store functions stores the vector 'vals' to a
+/// contiguous memory block at the address referenced by accessor 'acc', or from
+/// 'acc + byte_offset', The parameter 'pred' is the one element predicate. If
+/// it is set to 1, then all 'N' elements are stored. Otherwise, the block store
+/// operation is a NO-OP.
The parameter 'props' specifies the optional +/// compile-time properties of the type esimd::properties and may include +/// esimd::cache_hint_L1, esimd::cache_hint_L2, esimd::cache_hint_L3, +/// esimd::alignment. + +/// void block_store(AccessorT acc, OffsetT byte_offset, // (acc-bs-1) +/// simd vals, props = {}); + +/// void block_store(AccessorT acc, simd vals, props = {}); // (acc-bs-2) +/// void block_store(AccessorT acc, OffsetT byte_offset, // (acc-bs-3) +/// simd vals, simd_mask<1> pred, props = {}); + +/// void block_store(AccessorT acc, simd vals, // (acc-bs-4) +/// simd_mask<1> pred, props = {}); + +/// void block_store(AccessorT acc, OffsetT byte_offset, // (acc-bs-1) +/// simd vals, props = {}); +/// This function stores a contiguous memory block to +/// accessor \p acc and \p byte_offset with data specified by \p vals. +/// +/// The parameter \p props specifies the optional compile-time properties +/// of the type esimd::properties and may include esimd::cache_hint_L1, +/// esimd::cache_hint_L2, esimd::alignment. Other properties are ignored. +/// +/// Cache hints: If \p props does not specify any L1 or L2 cache hints, then +/// the cache_hint::none value is assumed by default. +/// +/// Alignment: If \p props does not specify the 'alignment' property, then +/// the \p byte_offset must be at least 16-byte aligned if (!(b) && (c)) +/// from the below restrictions, and must be at least 4-byte aligned for +/// elements of 4-bytes or smaller and 8-byte aligned for 8-byte elements +/// otherwise. If the 'alignment' property is specified as less than 16 bytes, +/// then the target device must be DG2 or PVC (not Gen12). The alignment +/// requirement may be less strict if stateless memory mode is ON, see +/// block_store(usm_ptr, props) (aka usm-bs-01) for details/requirements. 
+/// +/// Restrictions: there may be some extra restrictions depending on +/// a) stateless memory mode enforcement is ON, +/// b) cache hints are used, +/// c) number of bytes stored is either 16,32,64, or 128. +/// d) the 'alignment' property is specified as less than 16 bytes. +/// +/// If (b) || !(c) || (d), then the target device must be DG2 or PVC (not +/// Gen12). +/// If (a) && !(b), then there is no restriction on the number of +/// elements to be stored and \p byte_offset must be only element-aligned. +/// +/// Gen12 requirements: !(b) && (c) && !(d). +/// It can store 16-, 32-, 64-, or 128-bytes only. +/// DG2/PVC requirements: +/// It can store such number of elements depending on the type 'T': +/// for 8-byte data: 1, 2, 3, 4, 8, 16, 32(max for DG2), 64; +/// for 4-byte data: 1, 2, 3, 4, 8, 16, 32, 64(max for DG2), +/// or 128(only if alignment is 8-bytes or more); +/// for 2-byte data: 2, 4, 6, 8, 16, 32, 64, 128(max for DG2), +/// or 256(only if alignment is 8-bytes or more); +/// for 1-byte data: 4, 8, 12, 16, 32, 64, 128, 256(max for DG2), +/// or 512(only if alignment is 8-bytes or more). +template < + typename T, int N, typename AccessorT, + typename PropertyListT = ext::oneapi::experimental::empty_properties_t> +__ESIMD_API std::enable_if_t< + ext::oneapi::experimental::is_property_list_v && + detail::is_device_accessor_with_v> +block_store(AccessorT acc, detail::DeviceAccessorOffsetT byte_offset, + simd vals, PropertyListT props = {}) { +#ifdef __ESIMD_FORCE_STATELESS_MEM + block_store(detail::accessorToPointer(acc, byte_offset), vals, + props); +#else + constexpr int DefaultLSCAlignment = (sizeof(T) <= 4) ? 
4 : sizeof(T); + constexpr size_t Alignment = + detail::getPropertyValue( + DefaultLSCAlignment); + constexpr bool AlignmentRequiresLSC = + PropertyListT::template has_property() && Alignment < 16; + using Tx = detail::__raw_t; + constexpr unsigned Sz = sizeof(Tx) * N; constexpr bool SzRequiresLSC = Sz < detail::OperandSize::OWORD || Sz % detail::OperandSize::OWORD != 0 || !detail::isPowerOf2(Sz / detail::OperandSize::OWORD) || @@ -2871,6 +3253,204 @@ block_store(AccessorT acc, simd vals, simd_mask<1> pred, block_store(acc, 0, vals, pred, NewPropertyListT{}); } +/// void block_store(AccessorT acc, OffsetT byte_offset, +/// ValuesSimdViewT vals, props = {}); +/// Variation of the API that allows to use \c simd_view without specifying +/// \c T and \c N template parameters. +/// This function stores a contiguous memory block to +/// accessor \p acc and \p byte_offset with data specified by \p vals. +/// +/// The parameter \p props specifies the optional compile-time properties +/// of the type esimd::properties and may include esimd::cache_hint_L1, +/// esimd::cache_hint_L2, esimd::alignment. Other properties are ignored. +/// +/// Cache hints: If \p props does not specify any L1 or L2 cache hints, then +/// the cache_hint::none value is assumed by default. +/// +/// Alignment: If \p props does not specify the 'alignment' property, then +/// the \p byte_offset must be at least 16-byte aligned if (!(b) && (c)) +/// from the below restrictions, and must be at least 4-byte aligned for +/// elements of 4-bytes or smaller and 8-byte aligned for 8-byte elements +/// otherwise. If the 'alignment' property is specified as less than 16 bytes, +/// then the target device must be DG2 or PVC (not Gen12). The alignment +/// requirement may be less strict if stateless memory mode is ON, see +/// block_store(usm_ptr, props) (aka usm-bs-01) for details/requirements. 
+/// +/// Restrictions: there may be some extra restrictions depending on +/// a) stateless memory mode enforcement is ON, +/// b) cache hints are used, +/// c) number of bytes stored is either 16,32,64, or 128. +/// d) the 'alignment' property is specified as less than 16 bytes. +/// +/// If (b) || !(c) || (d), then the target device must be DG2 or PVC (not +/// Gen12). +/// If (a) && !(b), then there is no restriction on the number of +/// elements to be stored and \p byte_offset must be only element-aligned. +/// +/// Gen12 requirements: !(b) && (c) && !(d). +/// It can store 16-, 32-, 64-, or 128-bytes only. +/// DG2/PVC requirements: +/// It can store such number of elements depending on the type 'T': +/// for 8-byte data: 1, 2, 3, 4, 8, 16, 32(max for DG2), 64; +/// for 4-byte data: 1, 2, 3, 4, 8, 16, 32, 64(max for DG2), +/// or 128(only if alignment is 8-bytes or more); +/// for 2-byte data: 2, 4, 6, 8, 16, 32, 64, 128(max for DG2), +/// or 256(only if alignment is 8-bytes or more); +/// for 1-byte data: 4, 8, 12, 16, 32, 64, 128, 256(max for DG2), +/// or 512(only if alignment is 8-bytes or more). +template < + typename ValuesSimdViewT, + typename T = ValuesSimdViewT::value_type::element_type, + int N = ValuesSimdViewT::getSizeX() * ValuesSimdViewT::getSizeY(), + typename AccessorT, + typename PropertyListT = ext::oneapi::experimental::empty_properties_t> +__ESIMD_API std::enable_if_t< + detail::is_simd_view_type_v && + ext::oneapi::experimental::is_property_list_v && + detail::is_device_accessor_with_v> +block_store(AccessorT acc, detail::DeviceAccessorOffsetT byte_offset, + ValuesSimdViewT vals, PropertyListT props = {}) { + block_store(acc, byte_offset, vals.read(), props); +} + +/// void block_store(AccessorT acc, ValuesSimdViewT vals, props = {}); +/// Variation of the API that allows to use \c simd_view without specifying +/// \c T and \c N template parameters. 
+/// This function stores a contiguous memory block to +/// accessor \p acc with data specified by \p vals and implied offset=0. +/// +/// The parameter \p props specifies the optional compile-time properties +/// of the type esimd::properties and may include esimd::cache_hint_L1, +/// esimd::cache_hint_L2. Other properties are ignored. If \p props specifies +/// the alignment property, then it is ignored because this variant implies +/// zero offset, which means the most favourable 16-byte alignment is used. +/// +/// Cache hints: If \p props does not specify any L1 or L2 cache hints, then +/// the cache_hint::none value is assumed by default. +/// +/// Restrictions: there may be some extra restrictions depending on +/// a) stateless memory mode enforcement is ON, +/// b) cache hints are used, +/// c) number of bytes stored is either 16,32,64, or 128. +/// If (b) || !(c), then the target device must be DG2 or PVC (not Gen12). +/// If (a) && !(b), then there is no restriction on the number of elements +/// to be stored. +/// +/// Gen12 requirements: !(b) && (c). +/// It can store 16-, 32-, 64-, or 128-bytes only. +/// DG2/PVC requirements: +/// It can store such number of elements depending on the type 'T': +/// for 8-byte data: 1, 2, 3, 4, 8, 16, 32(max for DG2), 64; +/// for 4-byte data: 1, 2, 3, 4, 8, 16, 32, 64(max for DG2), or 128; +/// for 2-byte data: 2, 4, 6, 8, 16, 32, 64, 128(max for DG2), or 256; +/// for 1-byte data: 4, 8, 12, 16, 32, 64, 128, 256(max for DG2), or 512. 
+template < + typename ValuesSimdViewT, + typename T = ValuesSimdViewT::value_type::element_type, + int N = ValuesSimdViewT::getSizeX() * ValuesSimdViewT::getSizeY(), + typename AccessorT, + typename PropertyListT = ext::oneapi::experimental::empty_properties_t> +__ESIMD_API std::enable_if_t< + detail::is_simd_view_type_v && + ext::oneapi::experimental::is_property_list_v && + detail::is_device_accessor_with_v> +block_store(AccessorT acc, ValuesSimdViewT vals, PropertyListT props = {}) { + block_store(acc, vals.read(), props); +} + +/// void block_store(AccessorT acc, OffsetT byte_offset, +/// ValuesSimdViewT vals, simd_mask<1> pred, props = {}); +/// Variation of the API that allows to use \c simd_view without specifying +/// \c T and \c N template parameters. +/// This function stores a contiguous memory block to +/// accessor \p acc and \p byte_offset with data specified by \p vals. +/// If the predicate \p pred is set to 0, then the store is omitted. +/// +/// The parameter \p props specifies the optional compile-time properties +/// of the type esimd::properties and may include esimd::cache_hint_L1, +/// esimd::cache_hint_L2, esimd::alignment. Other properties are ignored. +/// +/// Cache hints: If \p props does not specify any L1 or L2 cache hints, then +/// the cache_hint::none value is assumed by default. +/// +/// Alignment: If \p props does not specify the 'alignment' property, then +/// the \p byte_offset must be at least 4-byte aligned for elements of 4-bytes +/// or smaller and 8-byte aligned for 8-byte elements. +/// The alignment requirement may be less strict if stateless memory mode is ON, +/// see block_store(usm_ptr, props) (aka usm-bs-01) for details/requirements. +/// +/// Restrictions: +/// R1: The target device must be DG2 or PVC (not Gen12). 
+/// +/// R2: +/// It can store such number of elements depending on the type 'T': +/// for 8-byte data: 1, 2, 3, 4, 8, 16, 32(max for DG2), 64; +/// for 4-byte data: 1, 2, 3, 4, 8, 16, 32, 64(max for DG2), +/// or 128(only if alignment is 8-bytes or more); +/// for 2-byte data: 2, 4, 6, 8, 16, 32, 64, 128(max for DG2), +/// or 256(only if alignment is 8-bytes or more); +/// for 1-byte data: 4, 8, 12, 16, 32, 64, 128, 256(max for DG2), +/// or 512(only if alignment is 8-bytes or more). +template < + typename ValuesSimdViewT, + typename T = ValuesSimdViewT::value_type::element_type, + int N = ValuesSimdViewT::getSizeX() * ValuesSimdViewT::getSizeY(), + typename AccessorT, + typename PropertyListT = ext::oneapi::experimental::empty_properties_t> +__ESIMD_API std::enable_if_t< + detail::is_simd_view_type_v && + ext::oneapi::experimental::is_property_list_v && + detail::is_device_accessor_with_v> +block_store(AccessorT acc, detail::DeviceAccessorOffsetT byte_offset, + ValuesSimdViewT vals, simd_mask<1> pred, PropertyListT props = {}) { + block_store(acc, byte_offset, vals.read(), pred, props); +} + +/// void block_store(AccessorT acc, ValuesSimdViewT vals, +/// simd_mask<1> pred, props = {}); +/// Variation of the API that allows to use \c simd_view without specifying +/// \c T and \c N template parameters. +/// This function stores a contiguous memory block to +/// accessor \p acc with data specified by \p vals and implied offset=0. +/// If the predicate \p pred is set to 0, then the store is omitted. +/// +/// The parameter \p props specifies the optional compile-time properties +/// of the type esimd::properties and may include esimd::cache_hint_L1, +/// esimd::cache_hint_L2. Other properties are ignored. If \p props specifies +/// the alignment property, then it is ignored because this variant implies +/// zero offset, which means the most favourable 16-byte alignment is used. 
+/// +/// Cache hints: If \p props does not specify any L1 or L2 cache hints, then +/// the cache_hint::none value is assumed by default. +/// +/// Restrictions: +/// R1: The target device must be DG2 or PVC (not Gen12). +/// +/// R2: +/// It can store such number of elements depending on the type 'T': +/// for 8-byte data: 1, 2, 3, 4, 8, 16, 32(max for DG2), 64; +/// for 4-byte data: 1, 2, 3, 4, 8, 16, 32, 64(max for DG2), or 128; +/// for 2-byte data: 2, 4, 6, 8, 16, 32, 64, 128(max for DG2), or 256; +/// for 1-byte data: 4, 8, 12, 16, 32, 64, 128, 256(max for DG2), or 512. +template < + typename ValuesSimdViewT, + typename T = ValuesSimdViewT::value_type::element_type, + int N = ValuesSimdViewT::getSizeX() * ValuesSimdViewT::getSizeY(), + typename AccessorT, + typename PropertyListT = ext::oneapi::experimental::empty_properties_t> +__ESIMD_API std::enable_if_t< + detail::is_simd_view_type_v && + ext::oneapi::experimental::is_property_list_v && + detail::is_device_accessor_with_v> +block_store(AccessorT acc, ValuesSimdViewT vals, simd_mask<1> pred, + PropertyListT props = {}) { + block_store(acc, vals.read(), pred, props); +} + /// @} sycl_esimd_memory_block /// @} sycl_esimd_memory @@ -6350,6 +6930,51 @@ slm_block_load(uint32_t offset, simd_mask<1> pred, simd pass_thru, return Result.template bit_cast_view(); } +/// simd slm_block_load(uint32_t byte_offset, +/// simd_mask<1> pred, +/// PassThruSimdViewT pass_thru, props={}); +/// Variation of the API that allows to use \c simd_view without specifying +/// \c T and \c N template parameters. +/// Loads a contiguous memory block from SLM (Shared Local Memory) at the +/// given \p byte_offset. +/// The parameter \p pred is the one-element predicate. If it is set to 1, +/// then all 'N' elements are loaded. Otherwise, the block load operation +/// is a NO-OP. +/// The parameter 'pass_thru' specifies the values being copied to the returned +/// result if 'pred' is set to 0. 
+/// +/// The parameter 'props' specifies the optional compile-time properties +/// list. Only esimd::alignment property is used. Other properties are ignored. +/// +/// Alignment: If \p props does not specify the 'alignment' property, then +/// the default expected alignment is the minimally required (see (R1) below). +/// +/// Restrictions - predicate imposed - temporary: +/// R1: The \p byte_offset must be at least 4-byte aligned for 4-byte or smaller +/// elements and 8-byte aligned for 8-byte elements. +/// R2: The number of elements must be: +/// for 8-byte data: 1, 2, 3, 4, 8, 16, 32(max for DG2), 64; +/// for 4-byte data: 1, 2, 3, 4, 8, 16, 32, 64(max for DG2), +/// or 128(only if alignment is 8-bytes or more); +/// for 2-byte data: 2, 4, 6, 8, 16, 32, 64, 128(max for DG2), +/// or 256(only if alignment is 8-bytes or more); +/// for 1-byte data: 4, 8, 12, 16, 32, 64, 128, 256(max for DG2), +/// or 512(only if alignment is 8-bytes or more). +/// R3: The target device must be DG2, PVC or newer GPU. 
+template < + typename PassThruSimdViewT, + typename T = PassThruSimdViewT::value_type::element_type, + int N = PassThruSimdViewT::getSizeX() * PassThruSimdViewT::getSizeY(), + typename PropertyListT = ext::oneapi::experimental::empty_properties_t> +__ESIMD_API std::enable_if_t< + detail::is_simd_view_type_v && + ext::oneapi::experimental::is_property_list_v, + simd> +slm_block_load(uint32_t offset, simd_mask<1> pred, PassThruSimdViewT pass_thru, + PropertyListT props = {}) { + return slm_block_load(offset, pred, pass_thru.read(), props); +} + /// simd block_load(local_accessor lacc, uint32_t byte_offset, /// props={}); // (lacc-bl-1) /// Loads a contiguous memory block from SLM (Shared Local Memory) associated @@ -6494,13 +7119,56 @@ __ESIMD_API std::enable_if_t< detail::accessor_mode_cap::can_read> && ext::oneapi::experimental::is_property_list_v, simd> -block_load(AccessorT lacc, simd_mask<1> pred, PropertyListT props = {}) { - return slm_block_load(detail::localAccessorToOffset(lacc), pred, props); +block_load(AccessorT lacc, simd_mask<1> pred, PropertyListT props = {}) { + return slm_block_load(detail::localAccessorToOffset(lacc), pred, props); +} + +/// simd block_load(local_accessor lacc, uint32_t byte_offset, +/// simd_mask<1> pred, simd pass_thru, +/// props={}); // (lacc-bl-5) +/// Loads a contiguous memory block from SLM (Shared Local Memory) associated +/// the local accessor \p lacc at the given \p byte_offset. +/// The parameter \p pred is the one-element predicate. If it is set to 1, +/// then all 'N' elements are loaded. Otherwise, the block load operation +/// is a NO-OP, and \p pass_thru value is returned. +/// +/// The parameter 'props' specifies the optional compile-time properties +/// list. Only esimd::alignment property is used. Other properties are ignored. +/// +/// Alignment: If \p props does not specify the 'alignment' property, then +/// the default expected alignment is the minimally required (see (R1) below). 
+/// +/// Restrictions - predicate imposed - temporary: +/// R1: The \p lacc + \p byte_offset must be at least 4-byte aligned for 4-byte +/// or smaller elements and 8-byte aligned for 8-byte elements. +/// R2: The number of elements must be: +/// for 8-byte data: 1, 2, 3, 4, 8, 16, 32(max for DG2), 64; +/// for 4-byte data: 1, 2, 3, 4, 8, 16, 32, 64(max for DG2), +/// or 128(only if alignment is 8-bytes or more); +/// for 2-byte data: 2, 4, 6, 8, 16, 32, 64, 128(max for DG2), +/// or 256(only if alignment is 8-bytes or more); +/// for 1-byte data: 4, 8, 12, 16, 32, 64, 128, 256(max for DG2), +/// or 512(only if alignment is 8-bytes or more). +/// R3: The target device must be DG2, PVC or newer GPU. +template < + typename T, int N, typename AccessorT, + typename PropertyListT = ext::oneapi::experimental::empty_properties_t> +__ESIMD_API std::enable_if_t< + detail::is_local_accessor_with_v && + ext::oneapi::experimental::is_property_list_v, + simd> +block_load(AccessorT lacc, uint32_t byte_offset, simd_mask<1> pred, + simd pass_thru, PropertyListT props = {}) { + byte_offset += __ESIMD_DNS::localAccessorToOffset(lacc); + return slm_block_load(byte_offset, pred, pass_thru, props); } /// simd block_load(local_accessor lacc, uint32_t byte_offset, -/// simd_mask<1> pred, simd pass_thru, -/// props={}); // (lacc-bl-5) +/// simd_mask<1> pred, PassThruSimdViewT pass_thru, +/// props={}); +/// Variation of the API that allows to use \c simd_view without specifying +/// \c T and \c N template parameters. /// Loads a contiguous memory block from SLM (Shared Local Memory) associated /// the local accessor \p lacc at the given \p byte_offset. /// The parameter \p pred is the one-element predicate. If it is set to 1, @@ -6526,17 +7194,20 @@ block_load(AccessorT lacc, simd_mask<1> pred, PropertyListT props = {}) { /// or 512(only if alignment is 8-bytes or more). /// R3: The target device must be DG2, PVC or newer GPU. 
template < - typename T, int N, typename AccessorT, + typename PassThruSimdViewT, + typename T = PassThruSimdViewT::value_type::element_type, + int N = PassThruSimdViewT::getSizeX() * PassThruSimdViewT::getSizeY(), + typename AccessorT, typename PropertyListT = ext::oneapi::experimental::empty_properties_t> __ESIMD_API std::enable_if_t< - detail::is_local_accessor_with_v && + detail::is_simd_view_type_v && + detail::is_local_accessor_with_v && ext::oneapi::experimental::is_property_list_v, simd> block_load(AccessorT lacc, uint32_t byte_offset, simd_mask<1> pred, - simd pass_thru, PropertyListT props = {}) { - byte_offset += __ESIMD_DNS::localAccessorToOffset(lacc); - return slm_block_load(byte_offset, pred, pass_thru, props); + PassThruSimdViewT pass_thru, PropertyListT props = {}) { + return block_load(lacc, byte_offset, pred, pass_thru.read(), props); } /// simd block_load(local_accessor lacc, @@ -6579,6 +7250,51 @@ block_load(AccessorT lacc, simd_mask<1> pred, simd pass_thru, pass_thru, props); } +/// simd block_load(local_accessor lacc, +/// simd_mask<1> pred, PassThruSimdViewT pass_thru, +/// props={}); +/// Variation of the API that allows to use \c simd_view without specifying +/// \c T and \c N template parameters. +/// Loads a contiguous memory block from SLM (Shared Local Memory) associated +/// with the local accessor \p lacc at zero offset. +/// +/// The parameter \p pred is the one-element predicate. If it is set to 1, +/// then all 'N' elements are loaded. Otherwise, the block load operation +/// is a NO-OP, and \p pass_thru value is returned. +/// +/// The parameter 'props' specifies the optional compile-time properties +/// list. Only esimd::alignment property is used. Other properties are ignored. +/// +/// Alignment: If \p props does not specify the 'alignment' property, then +/// the default expected alignment is the minimally required (see (R1) below). 
+/// +/// Restrictions - predicate imposed - temporary: +/// R1: The local accessor \p lacc must point to memory at least 4-byte aligned +/// for elements of 4-bytes or smaller and 8-byte aligned for 8-byte +/// elements. +/// R2: The number of elements must be: +/// for 8-byte data: 1, 2, 3, 4, 8, 16, 32(max for DG2), 64; +/// for 4-byte data: 1, 2, 3, 4, 8, 16, 32, 64(max for DG2), or 128; +/// for 2-byte data: 2, 4, 6, 8, 16, 32, 64, 128(max for DG2), or 256; +/// for 1-byte data: 4, 8, 12, 16, 32, 64, 128, 256(max for DG2), or 512. +/// R3: The target device must be DG2, PVC or newer GPU. +template < + typename PassThruSimdViewT, + typename T = PassThruSimdViewT::value_type::element_type, + int N = PassThruSimdViewT::getSizeX() * PassThruSimdViewT::getSizeY(), + typename AccessorT, + typename PropertyListT = ext::oneapi::experimental::empty_properties_t> +__ESIMD_API std::enable_if_t< + detail::is_simd_view_type_v && + detail::is_local_accessor_with_v && + ext::oneapi::experimental::is_property_list_v, + simd> +block_load(AccessorT lacc, simd_mask<1> pred, PassThruSimdViewT pass_thru, + PropertyListT props = {}) { + return block_load(lacc, pred, pass_thru.read(), props); +} + /// Stores elements of the vector \p vals to a contiguous block of SLM memory /// at the given byte-offset \p offset. /// The generated code depends on the combination {T, N, Flags}. @@ -6745,6 +7461,76 @@ slm_block_store(uint32_t byte_offset, simd vals, sycl::bit_cast<__ESIMD_DNS::vector_type_t>(vals.data())); } +/// void slm_block_store(uint32_t byte_offset, ValuesSimdViewT vals, +/// simd_mask<1> pred, props={}); +/// Variation of the API that allows to use \c simd_view without specifying +/// \c T and \c N template parameters. +/// Stores the vector \p vals to a contiguous memory block in SLM (Shared Local +/// Memory) at the given \p byte_offset. The parameter \p pred is the +/// one-element predicate. If it is set to 1, then all 'N' elements are stored. 
+/// Otherwise, the block store operation is a NO-OP. +/// +/// The parameter 'props' specifies the optional compile-time properties +/// list. Only esimd::alignment property is used. Other properties are ignored. +/// +/// Alignment: If \p props does not specify the 'alignment' property, then +/// the default expected alignment is the minimally required (see (R1) below). +/// +/// Restrictions - predicate imposed - temporary: +/// R1: The \p byte_offset must be at least 4-byte aligned for 4-byte or smaller +/// elements and 8-byte aligned for 8-byte elements. +/// R2: The number of elements must be: +/// for 8-byte data: 1, 2, 3, 4, 8, 16, 32(max for DG2), 64; +/// for 4-byte data: 1, 2, 3, 4, 8, 16, 32, 64(max for DG2), +/// or 128(only if alignment is 8-bytes or more); +/// for 2-byte data: 2, 4, 6, 8, 16, 32, 64, 128(max for DG2), +/// or 256(only if alignment is 8-bytes or more); +/// for 1-byte data: 4, 8, 12, 16, 32, 64, 128, 256(max for DG2), +/// or 512(only if alignment is 8-bytes or more). +/// R3: The target device must be DG2, PVC or newer GPU. +template < + typename ValuesSimdViewT, + typename T = ValuesSimdViewT::value_type::element_type, + int N = ValuesSimdViewT::getSizeX() * ValuesSimdViewT::getSizeY(), + typename PropertyListT = ext::oneapi::experimental::empty_properties_t> +__ESIMD_API std::enable_if_t< + detail::is_simd_view_type_v && + ext::oneapi::experimental::is_property_list_v> +slm_block_store(uint32_t byte_offset, ValuesSimdViewT vals, simd_mask<1> pred, + PropertyListT props = {}) { + slm_block_store(byte_offset, vals.read(), pred, props); +} + +/// void slm_block_store(uint32_t byte_offset, ValuesSimdViewT vals, +/// props = {}); +/// Variation of the API that allows to use \c simd_view without specifying +/// \c T and \c N template parameters. +/// Stores the vector \p vals to a contiguous memory block in SLM +/// (Shared Local Memory) at the given \p byte_offset. 
The parameter 'props' +/// specifies the optional compile-time properties list. Only esimd::alignment +/// property is used. Other properties are ignored. +/// +/// Alignment: If \p props does not specify the 'alignment' property, then +/// the default expected alignment is 16-bytes to generate block_store +/// instruction on all known target devices (Gen12, DG2, PVC, etc). +/// On Gen12 (opposing to DG2 and PVC) the alignment smaller than 8-bytes +/// is valid, but requires JIT compiler generating a slower SCATTER instead +/// of faster BLOCK_STORE. +/// !!! Passing \p byte_offset not aligned by 16-bytes and not specifying +/// the actual alignment in \p props produces incorrect store results on Gen12. +template < + typename ValuesSimdViewT, + typename T = ValuesSimdViewT::value_type::element_type, + int N = ValuesSimdViewT::getSizeX() * ValuesSimdViewT::getSizeY(), + typename PropertyListT = ext::oneapi::experimental::empty_properties_t> +__ESIMD_API std::enable_if_t< + detail::is_simd_view_type_v && + ext::oneapi::experimental::is_property_list_v> +slm_block_store(uint32_t byte_offset, ValuesSimdViewT vals, + PropertyListT props = {}) { + slm_block_store(byte_offset, vals.read(), props); +} + /// void block_store(local_accessor lacc, uint32_t byte_offset, // (lacc-bs-1) /// simd vals, props={}); /// Stores the vector \p vals to a contiguous memory block in SLM (Shared Local @@ -6876,6 +7662,159 @@ block_store(AccessorT lacc, simd vals, simd_mask<1> pred, PropertyListT props = {}) { slm_block_store(detail::localAccessorToOffset(lacc), vals, pred, props); } + +/// void block_store(local_accessor lacc, uint32_t byte_offset, +/// ValuesSimdViewT vals, props={}); +/// Variation of the API that allows to use \c simd_view without specifying +/// \c T and \c N template parameters. +/// Stores the vector \p vals to a contiguous memory block in SLM (Shared Local +/// Memory) associated with the local accessor \p lacc at the given \p +/// byte_offset. 
The parameter 'props' specifies the optional compile-time +/// properties list. Only esimd::alignment property is used. Other properties +/// are ignored. +/// +/// Alignment: If \p props does not specify the 'alignment' property, then +/// the default expected alignment is 16-bytes to generate block_store +/// instruction on all known target devices (Gen12, DG2, PVC, etc). +/// On Gen12 (opposing to DG2 and PVC) the alignment smaller than 8-bytes +/// is valid, but requires JIT compiler generating a slower SCATTER instead +/// of faster BLOCK_STORE. +/// !!! Passing \p byte_offset not aligned by 16-bytes and not specifying +/// the actual alignment in \p props produces incorrect store results on Gen12. +template < + typename ValuesSimdViewT, + typename T = ValuesSimdViewT::value_type::element_type, + int N = ValuesSimdViewT::getSizeX() * ValuesSimdViewT::getSizeY(), + typename AccessorT, + typename PropertyListT = ext::oneapi::experimental::empty_properties_t> +__ESIMD_API std::enable_if_t< + detail::is_simd_view_type_v && + detail::is_local_accessor_with_v && + ext::oneapi::experimental::is_property_list_v> +block_store(AccessorT lacc, uint32_t byte_offset, ValuesSimdViewT vals, + PropertyListT props = {}) { + block_store(lacc, byte_offset, vals.read(), props); +} + +/// void block_store(local_accessor lacc, ValuesSimdViewT vals, +/// props={}); +/// Variation of the API that allows to use \c simd_view without specifying +/// \c T and \c N template parameters. +/// Stores the vector \p vals to a contiguous memory block in SLM +/// (Shared Local Memory) associated with the local accessor \p lacc. The +/// parameter 'props' specifies the optional compile-time properties list. Only +/// esimd::alignment property is used. Other properties are ignored. 
+/// +/// Alignment: If \p props does not specify the 'alignment' property, then +/// the default expected alignment is 16-bytes to generate block_store +/// instruction on all known target devices (Gen12, DG2, PVC, etc). +/// On Gen12 (opposing to DG2 and PVC) the alignment smaller than 8-bytes +/// is valid, but requires JIT compiler generating a slower SCATTER instead +/// of faster BLOCK_STORE. +/// !!! Passing \p byte_offset not aligned by 16-bytes and not specifying +/// the actual alignment in \p props produces incorrect store results on Gen12. +template < + typename ValuesSimdViewT, + typename T = ValuesSimdViewT::value_type::element_type, + int N = ValuesSimdViewT::getSizeX() * ValuesSimdViewT::getSizeY(), + typename AccessorT, + typename PropertyListT = ext::oneapi::experimental::empty_properties_t> +__ESIMD_API std::enable_if_t< + detail::is_simd_view_type_v && + detail::is_local_accessor_with_v && + ext::oneapi::experimental::is_property_list_v> +block_store(AccessorT lacc, ValuesSimdViewT vals, PropertyListT props = {}) { + block_store(lacc, vals.read(), props); +} + +/// void block_store(local_accessor lacc, uint32_t byte_offset, +/// ValuesSimdViewT vals, simd_mask<1> pred, props={}); +/// Variation of the API that allows to use \c simd_view without specifying +/// \c T and \c N template parameters. +/// Stores the vector \p vals to a contiguous memory block in SLM (Shared Local +/// Memory) associated with the local accessor \p lacc at the given \p +/// byte_offset. The parameter \p pred is the one-element predicate. If it is +/// set to 1, then all 'N' elements are stored. Otherwise, the block store +/// operation is a NO-OP. +/// +/// The parameter 'props' specifies the optional compile-time properties +/// list. Only esimd::alignment property is used. Other properties are ignored. +/// +/// Alignment: If \p props does not specify the 'alignment' property, then +/// the default expected alignment is the minimally required (see (R1) below). 
+/// +/// Restrictions - predicate imposed - temporary: +/// R1: The \p byte_offset must be at least 4-byte aligned for 4-byte or smaller +/// elements and 8-byte aligned for 8-byte elements. +/// R2: The number of elements must be: +/// for 8-byte data: 1, 2, 3, 4, 8, 16, 32(max for DG2), 64; +/// for 4-byte data: 1, 2, 3, 4, 8, 16, 32, 64(max for DG2), +/// or 128(only if alignment is 8-bytes or more); +/// for 2-byte data: 2, 4, 6, 8, 16, 32, 64, 128(max for DG2), +/// or 256(only if alignment is 8-bytes or more); +/// for 1-byte data: 4, 8, 12, 16, 32, 64, 128, 256(max for DG2), +/// or 512(only if alignment is 8-bytes or more). +/// R3: The target device must be DG2, PVC or newer GPU. +template < + typename ValuesSimdViewT, + typename T = ValuesSimdViewT::value_type::element_type, + int N = ValuesSimdViewT::getSizeX() * ValuesSimdViewT::getSizeY(), + typename AccessorT, + typename PropertyListT = ext::oneapi::experimental::empty_properties_t> +__ESIMD_API std::enable_if_t< + detail::is_simd_view_type_v && + detail::is_local_accessor_with_v && + ext::oneapi::experimental::is_property_list_v> +block_store(AccessorT lacc, uint32_t byte_offset, ValuesSimdViewT vals, + simd_mask<1> pred, PropertyListT props = {}) { + block_store(lacc, byte_offset, vals.read(), pred, props); +} + +/// void block_store(local_accessor lacc, ValuesSimdViewT vals, +/// simd_mask<1> pred, props={}); +/// Variation of the API that allows to use \c simd_view without specifying +/// \c T and \c N template parameters. +/// Stores the vector \p vals to a contiguous memory block in SLM (Shared Local +/// Memory) associated with the local accessor \p lacc. The parameter \p pred is +/// the one-element predicate. If it is set to 1, then all 'N' elements are +/// stored. Otherwise, the block store operation is a NO-OP. +/// +/// The parameter 'props' specifies the optional compile-time properties +/// list. Only esimd::alignment property is used. Other properties are ignored. 
+/// +/// Alignment: If \p props does not specify the 'alignment' property, then +/// the default expected alignment is the minimally required (see (R1) below). +/// +/// Restrictions - predicate imposed - temporary: +/// R1: The \p byte_offset must be at least 4-byte aligned for 4-byte or smaller +/// elements and 8-byte aligned for 8-byte elements. +/// R2: The number of elements must be: +/// for 8-byte data: 1, 2, 3, 4, 8, 16, 32(max for DG2), 64; +/// for 4-byte data: 1, 2, 3, 4, 8, 16, 32, 64(max for DG2), +/// or 128(only if alignment is 8-bytes or more); +/// for 2-byte data: 2, 4, 6, 8, 16, 32, 64, 128(max for DG2), +/// or 256(only if alignment is 8-bytes or more); +/// for 1-byte data: 4, 8, 12, 16, 32, 64, 128, 256(max for DG2), +/// or 512(only if alignment is 8-bytes or more). +/// R3: The target device must be DG2, PVC or newer GPU. +template < + typename ValuesSimdViewT, + typename T = ValuesSimdViewT::value_type::element_type, + int N = ValuesSimdViewT::getSizeX() * ValuesSimdViewT::getSizeY(), + typename AccessorT, + typename PropertyListT = ext::oneapi::experimental::empty_properties_t> +__ESIMD_API std::enable_if_t< + detail::is_simd_view_type_v && + detail::is_local_accessor_with_v && + ext::oneapi::experimental::is_property_list_v> +block_store(AccessorT lacc, ValuesSimdViewT vals, simd_mask<1> pred, + PropertyListT props = {}) { + block_store(lacc, vals.read(), pred, props); +} namespace detail { // lsc_atomic_update() operations may share atomic_op values for data types diff --git a/sycl/test/esimd/memory_properties_load_store.cpp b/sycl/test/esimd/memory_properties_load_store.cpp index 4691824920018..c1e465536268f 100644 --- a/sycl/test/esimd/memory_properties_load_store.cpp +++ b/sycl/test/esimd/memory_properties_load_store.cpp @@ -85,7 +85,11 @@ test_block_load(AccType &acc, LocalAccType &local_acc, float *ptrf, constexpr int N = 4; simd pass_thru = 1; + auto pass_thru_view = pass_thru.select(); + simd pass_thrui = 1; + auto 
pass_thrui_view = pass_thrui.select(); + const int *ptri = reinterpret_cast(ptrf); const int8_t *ptrb = reinterpret_cast(ptrf); @@ -102,8 +106,10 @@ test_block_load(AccType &acc, LocalAccType &local_acc, float *ptrf, simd_mask<1> mask = 1; auto d4 = block_load(ptrf, mask, props_a); - // CHECK: call <4 x float> @llvm.genx.lsc.load.merge.stateless.v4f32.v1i1.v1i64(<1 x i1> {{[^)]+}}, i8 0, i8 2, i8 1, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i64> {{[^)]+}}, i32 0, <4 x float> {{[^)]+}}) + // CHECK-COUNT-3: call <4 x float> @llvm.genx.lsc.load.merge.stateless.v4f32.v1i1.v1i64(<1 x i1> {{[^)]+}}, i8 0, i8 2, i8 1, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i64> {{[^)]+}}, i32 0, <4 x float> {{[^)]+}}) auto d5 = block_load(ptrf, mask, pass_thru, props_b); + d5 = block_load(ptrf, mask, pass_thru, props_b); + d5 = block_load(ptrf, mask, pass_thru_view, props_b); // CHECK: call <4 x float> @llvm.genx.lsc.load.merge.stateless.v4f32.v1i1.v1i64(<1 x i1> {{[^)]+}}, i8 0, i8 5, i8 2, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i64> {{[^)]+}}, i32 0, <4 x float> {{[^)]+}}) auto d6 = block_load(ptrf, byte_offset32, mask, props_a); @@ -111,8 +117,10 @@ test_block_load(AccType &acc, LocalAccType &local_acc, float *ptrf, // CHECK: call <4 x i32> @llvm.genx.lsc.load.merge.stateless.v4i32.v1i1.v1i64(<1 x i1> {{[^)]+}}, i8 0, i8 2, i8 1, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i64> {{[^)]+}}, i32 0, <4 x i32> {{[^)]+}}) auto d7 = block_load(ptri, byte_offset64, mask, props_b); - // CHECK: call <4 x i32> @llvm.genx.lsc.load.merge.stateless.v4i32.v1i1.v1i64(<1 x i1> {{[^)]+}}, i8 0, i8 5, i8 2, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i64> {{[^)]+}}, i32 0, <4 x i32> {{[^)]+}}) + // CHECK-COUNT-3: call <4 x i32> @llvm.genx.lsc.load.merge.stateless.v4i32.v1i1.v1i64(<1 x i1> {{[^)]+}}, i8 0, i8 5, i8 2, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i64> {{[^)]+}}, i32 0, <4 x i32> {{[^)]+}}) auto d8 = block_load(ptri, byte_offset32, mask, pass_thrui, props_a); + d8 = block_load(ptri, 
byte_offset32, mask, pass_thrui, props_a); + d8 = block_load(ptri, byte_offset32, mask, pass_thrui_view, props_a); // CHECK: call <4 x i32> @llvm.genx.lsc.load.merge.stateless.v4i32.v1i1.v1i64(<1 x i1> {{[^)]+}}, i8 0, i8 2, i8 1, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i64> {{[^)]+}}, i32 0, <4 x i32> {{[^)]+}}) auto d9 = block_load(ptri, byte_offset64, mask, pass_thru, props_b); @@ -149,9 +157,10 @@ test_block_load(AccType &acc, LocalAccType &local_acc, float *ptrf, // CHECK-STATELESS: call <4 x float> @llvm.genx.lsc.load.merge.stateless.v4f32.v1i1.v1i64(<1 x i1> {{[^)]+}}, i8 0, i8 5, i8 2, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i64> {{[^)]+}}, i32 0, <4 x float> {{[^)]+}}) auto a4 = block_load(acc, mask, props_a); - // CHECK-STATEFUL: call <4 x float> @llvm.genx.lsc.load.merge.bti.v4f32.v1i1.v1i32(<1 x i1> {{[^)]+}}, i8 0, i8 2, i8 1, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i32> {{[^)]+}}, i32 {{[^)]+}}, <4 x float> {{[^)]+}}) - // CHECK-STATELESS: call <4 x float> @llvm.genx.lsc.load.merge.stateless.v4f32.v1i1.v1i64(<1 x i1> {{[^)]+}}, i8 0, i8 2, i8 1, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i64> {{[^)]+}}, i32 0, <4 x float> {{[^)]+}}) + // CHECK-STATEFUL-COUNT-2: call <4 x float> @llvm.genx.lsc.load.merge.bti.v4f32.v1i1.v1i32(<1 x i1> {{[^)]+}}, i8 0, i8 2, i8 1, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i32> {{[^)]+}}, i32 {{[^)]+}}, <4 x float> {{[^)]+}}) + // CHECK-STATELESS-COUNT-2: call <4 x float> @llvm.genx.lsc.load.merge.stateless.v4f32.v1i1.v1i64(<1 x i1> {{[^)]+}}, i8 0, i8 2, i8 1, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i64> {{[^)]+}}, i32 0, <4 x float> {{[^)]+}}) auto a5 = block_load(acc, mask, pass_thru, props_b); + a5 = block_load(acc, mask, pass_thru_view, props_b); // CHECK-STATEFUL: call <4 x float> @llvm.genx.lsc.load.merge.bti.v4f32.v1i1.v1i32(<1 x i1> {{[^)]+}}, i8 0, i8 5, i8 2, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i32> {{[^)]+}}, i32 {{[^)]+}}, <4 x float> {{[^)]+}}) // CHECK-STATELESS: call <4 x float> 
@llvm.genx.lsc.load.merge.stateless.v4f32.v1i1.v1i64(<1 x i1> {{[^)]+}}, i8 0, i8 5, i8 2, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i64> {{[^)]+}}, i32 0, <4 x float> {{[^)]+}}) @@ -161,13 +170,15 @@ test_block_load(AccType &acc, LocalAccType &local_acc, float *ptrf, // CHECK-STATELESS: call <4 x i32> @llvm.genx.lsc.load.merge.stateless.v4i32.v1i1.v1i64(<1 x i1> {{[^)]+}}, i8 0, i8 2, i8 1, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i64> {{[^)]+}}, i32 0, <4 x i32> {{[^)]+}}) auto a7 = block_load(acc, byte_offset64, mask, props_b); - // CHECK-STATEFUL: call <4 x i32> @llvm.genx.lsc.load.merge.bti.v4i32.v1i1.v1i32(<1 x i1> {{[^)]+}}, i8 0, i8 5, i8 2, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i32> {{[^)]+}}, i32 {{[^)]+}}, <4 x i32> {{[^)]+}}) - // CHECK-STATELESS: call <4 x i32> @llvm.genx.lsc.load.merge.stateless.v4i32.v1i1.v1i64(<1 x i1> {{[^)]+}}, i8 0, i8 5, i8 2, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i64> {{[^)]+}}, i32 0, <4 x i32> {{[^)]+}}) + // CHECK-STATEFUL-COUNT-2: call <4 x i32> @llvm.genx.lsc.load.merge.bti.v4i32.v1i1.v1i32(<1 x i1> {{[^)]+}}, i8 0, i8 5, i8 2, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i32> {{[^)]+}}, i32 {{[^)]+}}, <4 x i32> {{[^)]+}}) + // CHECK-STATELESS-COUNT-2: call <4 x i32> @llvm.genx.lsc.load.merge.stateless.v4i32.v1i1.v1i64(<1 x i1> {{[^)]+}}, i8 0, i8 5, i8 2, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i64> {{[^)]+}}, i32 0, <4 x i32> {{[^)]+}}) auto a8 = block_load(acc, byte_offset32, mask, pass_thru, props_a); + a8 = block_load(acc, byte_offset32, mask, pass_thru_view, props_a); - // CHECK-STATEFUL: call <4 x i32> @llvm.genx.lsc.load.merge.bti.v4i32.v1i1.v1i32(<1 x i1> {{[^)]+}}, i8 0, i8 2, i8 1, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i32> {{[^)]+}}, i32 {{[^)]+}}, <4 x i32> {{[^)]+}}) - // CHECK-STATELESS: call <4 x i32> @llvm.genx.lsc.load.merge.stateless.v4i32.v1i1.v1i64(<1 x i1> {{[^)]+}}, i8 0, i8 2, i8 1, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i64> {{[^)]+}}, i32 0, <4 x i32> {{[^)]+}}) + // 
CHECK-STATEFUL-COUNT-2: call <4 x i32> @llvm.genx.lsc.load.merge.bti.v4i32.v1i1.v1i32(<1 x i1> {{[^)]+}}, i8 0, i8 2, i8 1, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i32> {{[^)]+}}, i32 {{[^)]+}}, <4 x i32> {{[^)]+}}) + // CHECK-STATELESS-COUNT-2: call <4 x i32> @llvm.genx.lsc.load.merge.stateless.v4i32.v1i1.v1i64(<1 x i1> {{[^)]+}}, i8 0, i8 2, i8 1, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i64> {{[^)]+}}, i32 0, <4 x i32> {{[^)]+}}) auto a9 = block_load(acc, byte_offset64, mask, pass_thrui, props_b); + a9 = block_load(acc, byte_offset64, mask, pass_thrui_view, props_b); // Now try block_load without cache hints and using the mask to verify // svm/legacy code-gen. Also, intentially use vector lengths that are @@ -195,9 +206,11 @@ test_block_load(AccType &acc, LocalAccType &local_acc, float *ptrf, auto slm_bl2 = slm_block_load(byte_offset32, mask, props_c16); simd pass_thrud = 2.0; - // CHECK: call <8 x double> @llvm.genx.lsc.load.merge.slm.v8f64.v1i1.v1i32(<1 x i1> {{[^)]+}}, i8 0, i8 0, i8 0, i16 1, i32 0, i8 4, i8 5, i8 2, i8 0, <1 x i32> {{[^)]+}}, i32 0, <8 x double> {{[^)]+}}) + auto pass_thrud_view = pass_thrud.select<8, 1>(); + // CHECK-COUNT-2: call <8 x double> @llvm.genx.lsc.load.merge.slm.v8f64.v1i1.v1i32(<1 x i1> {{[^)]+}}, i8 0, i8 0, i8 0, i16 1, i32 0, i8 4, i8 5, i8 2, i8 0, <1 x i32> {{[^)]+}}, i32 0, <8 x double> {{[^)]+}}) auto slm_bl3 = slm_block_load(byte_offset32, mask, pass_thrud, props_c16); + slm_bl3 = slm_block_load(byte_offset32, mask, pass_thrud_view, props_c16); // Now try block_load() accepting local accessor. 
@@ -210,19 +223,24 @@ test_block_load(AccType &acc, LocalAccType &local_acc, float *ptrf, // CHECK: call <8 x double> @llvm.genx.lsc.load.slm.v8f64.v1i1.v1i32(<1 x i1> {{[^)]+}}, i8 0, i8 0, i8 0, i16 1, i32 0, i8 4, i8 5, i8 2, i8 0, <1 x i32> {{[^)]+}}, i32 0) auto lacc_bl3 = block_load(local_acc, mask, props_a); - // CHECK: call <16 x double> @llvm.genx.lsc.load.merge.slm.v16f64.v1i1.v1i32(<1 x i1> {{[^)]+}}, i8 0, i8 0, i8 0, i16 1, i32 0, i8 4, i8 6, i8 2, i8 0, <1 x i32> {{[^)]+}}, i32 0, <16 x double> {{[^)]+}}) + // CHECK-COUNT-2: call <16 x double> @llvm.genx.lsc.load.merge.slm.v16f64.v1i1.v1i32(<1 x i1> {{[^)]+}}, i8 0, i8 0, i8 0, i16 1, i32 0, i8 4, i8 6, i8 2, i8 0, <1 x i32> {{[^)]+}}, i32 0, <16 x double> {{[^)]+}}) simd pass_thrud16 = 2.0; + auto pass_thrud16_view = pass_thrud16.select<16, 1>(); auto lacc_bl4 = block_load(local_acc, mask, pass_thrud16, props_b); + lacc_bl4 = block_load(local_acc, mask, pass_thrud16_view, props_b); // CHECK: call <32 x double> @llvm.genx.lsc.load.slm.v32f64.v1i1.v1i32(<1 x i1> {{[^)]+}}, i8 0, i8 0, i8 0, i16 1, i32 0, i8 4, i8 7, i8 2, i8 0, <1 x i32> {{[^)]+}}, i32 0) auto lacc_bl5 = block_load(local_acc, byte_offset32, mask, props_a); - // CHECK: call <4 x double> @llvm.genx.lsc.load.merge.slm.v4f64.v1i1.v1i32(<1 x i1> {{[^)]+}}, i8 0, i8 0, i8 0, i16 1, i32 0, i8 4, i8 4, i8 2, i8 0, <1 x i32> {{[^)]+}}, i32 0, <4 x double> {{[^)]+}}) + // CHECK-COUNT-2: call <4 x double> @llvm.genx.lsc.load.merge.slm.v4f64.v1i1.v1i32(<1 x i1> {{[^)]+}}, i8 0, i8 0, i8 0, i16 1, i32 0, i8 4, i8 4, i8 2, i8 0, <1 x i32> {{[^)]+}}, i32 0, <4 x double> {{[^)]+}}) simd pass_thrud4 = 2.0; + auto pass_thrud4_view = pass_thrud4.select<4, 1>(); auto lacc_bl6 = block_load(local_acc, byte_offset32, mask, pass_thrud4, props_a); + lacc_bl6 = + block_load(local_acc, byte_offset32, mask, pass_thrud4_view, props_a); // Check the default/assumed alignment when the alignment property is // not specified explicitly. 
@@ -255,105 +273,135 @@ test_block_store(AccType &acc, LocalAccType &local_acc, float *ptrf, simd_mask<1> mask = 1; auto view = vals.select(); auto viewi = valsi.select(); - // CHECK: call void @llvm.genx.lsc.store.stateless.v1i1.v1i64.v4f32(<1 x i1> {{[^)]+}}, i8 4, i8 1, i8 1, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i64> {{[^)]+}}, <4 x float> {{[^)]+}}, i32 0) + // CHECK-COUNT-2: call void @llvm.genx.lsc.store.stateless.v1i1.v1i64.v4f32(<1 x i1> {{[^)]+}}, i8 4, i8 1, i8 1, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i64> {{[^)]+}}, <4 x float> {{[^)]+}}, i32 0) block_store(ptrf, vals, store_props_a); + block_store(ptrf, view, store_props_a); - // CHECK: call void @llvm.genx.lsc.store.stateless.v1i1.v1i64.v4i32(<1 x i1> {{[^)]+}}, i8 4, i8 1, i8 1, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 0) + // CHECK-COUNT-2: call void @llvm.genx.lsc.store.stateless.v1i1.v1i64.v4i32(<1 x i1> {{[^)]+}}, i8 4, i8 1, i8 1, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 0) block_store(ptri, byte_offset32, valsi, store_props_a); + block_store(ptri, byte_offset32, viewi, store_props_a); - // CHECK: call void @llvm.genx.lsc.store.stateless.v1i1.v1i64.v4f32(<1 x i1> {{[^)]+}}, i8 4, i8 3, i8 3, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i64> {{[^)]+}}, <4 x float> {{[^)]+}}, i32 0) + // CHECK-COUNT-2: call void @llvm.genx.lsc.store.stateless.v1i1.v1i64.v4f32(<1 x i1> {{[^)]+}}, i8 4, i8 3, i8 3, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i64> {{[^)]+}}, <4 x float> {{[^)]+}}, i32 0) block_store(ptrf, byte_offset64, vals, store_props_c); + block_store(ptrf, byte_offset64, view, store_props_c); - // CHECK: call void @llvm.genx.lsc.store.stateless.v1i1.v1i64.v4f32(<1 x i1> {{[^)]+}}, i8 4, i8 1, i8 1, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i64> {{[^)]+}}, <4 x float> {{[^)]+}}, i32 0) + // CHECK-COUNT-2: call void @llvm.genx.lsc.store.stateless.v1i1.v1i64.v4f32(<1 x i1> {{[^)]+}}, i8 4, i8 1, i8 1, i16 
1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i64> {{[^)]+}}, <4 x float> {{[^)]+}}, i32 0) block_store(ptrf, vals, mask, store_props_a); + block_store(ptrf, view, mask, store_props_a); - // CHECK: call void @llvm.genx.lsc.store.stateless.v1i1.v1i64.v4i32(<1 x i1> {{[^)]+}}, i8 4, i8 3, i8 3, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 0) + // CHECK-COUNT-2: call void @llvm.genx.lsc.store.stateless.v1i1.v1i64.v4i32(<1 x i1> {{[^)]+}}, i8 4, i8 3, i8 3, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 0) block_store(ptri, byte_offset64, valsi, mask, store_props_c); + block_store(ptri, byte_offset64, viewi, mask, store_props_c); // Test SVM/legacy USM block store - // CHECK: store <4 x float> {{[^)]+}}, ptr addrspace(4) {{[^)]+}}, align 16 + // CHECK-COUNT-2: store <4 x float> {{[^)]+}}, ptr addrspace(4) {{[^)]+}}, align 16 block_store(ptrf, vals, store_props_b); + block_store(ptrf, view, store_props_b); - // CHECK: store <4 x float> {{[^)]+}}, ptr addrspace(4) {{[^)]+}}, align 8 + // CHECK-COUNT-2: store <4 x float> {{[^)]+}}, ptr addrspace(4) {{[^)]+}}, align 8 block_store(ptrf, vals, store_props_d); + block_store(ptrf, view, store_props_d); - // CHECK: store <4 x float> {{[^)]+}}, ptr addrspace(4) {{[^)]+}}, align 16 + // CHECK-COUNT-2: store <4 x float> {{[^)]+}}, ptr addrspace(4) {{[^)]+}}, align 16 block_store(ptrf, byte_offset32, vals, store_props_b); + block_store(ptrf, byte_offset32, view, store_props_b); // Test accessor block store - // CHECK-STATEFUL: call void @llvm.genx.lsc.store.bti.v1i1.v1i32.v4f32(<1 x i1> {{[^)]+}}, i8 4, i8 1, i8 1, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i32> {{[^)]+}}, <4 x float> {{[^)]+}}, i32 {{[^)]+}}) - // CHECK-STATELESS: call void @llvm.genx.lsc.store.stateless.v1i1.v1i64.v4f32(<1 x i1> {{[^)]+}}, i8 4, i8 1, i8 1, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i64> {{[^)]+}}, <4 x float> {{[^)]+}}, i32 0) + // CHECK-STATEFUL-COUNT-2: call void 
@llvm.genx.lsc.store.bti.v1i1.v1i32.v4f32(<1 x i1> {{[^)]+}}, i8 4, i8 1, i8 1, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i32> {{[^)]+}}, <4 x float> {{[^)]+}}, i32 {{[^)]+}}) + // CHECK-STATELESS-COUNT-2: call void @llvm.genx.lsc.store.stateless.v1i1.v1i64.v4f32(<1 x i1> {{[^)]+}}, i8 4, i8 1, i8 1, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i64> {{[^)]+}}, <4 x float> {{[^)]+}}, i32 0) block_store(acc, vals, store_props_a); + block_store(acc, view, store_props_a); - // CHECK-STATEFUL: call void @llvm.genx.lsc.store.bti.v1i1.v1i32.v4i32(<1 x i1> {{[^)]+}}, i8 4, i8 1, i8 1, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 {{[^)]+}}) - // CHECK-STATELESS: call void @llvm.genx.lsc.store.stateless.v1i1.v1i64.v4i32(<1 x i1> {{[^)]+}}, i8 4, i8 1, i8 1, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 0) + // CHECK-STATEFUL-COUNT-2: call void @llvm.genx.lsc.store.bti.v1i1.v1i32.v4i32(<1 x i1> {{[^)]+}}, i8 4, i8 1, i8 1, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 {{[^)]+}}) + // CHECK-STATELESS-COUNT-2: call void @llvm.genx.lsc.store.stateless.v1i1.v1i64.v4i32(<1 x i1> {{[^)]+}}, i8 4, i8 1, i8 1, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 0) block_store(acc, byte_offset32, valsi, store_props_a); + block_store(acc, byte_offset32, viewi, store_props_a); - // CHECK-STATEFUL: call void @llvm.genx.lsc.store.bti.v1i1.v1i32.v4f32(<1 x i1> {{[^)]+}}, i8 4, i8 3, i8 3, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i32> {{[^)]+}}, <4 x float> {{[^)]+}}, i32 {{[^)]+}}) - // CHECK-STATELESS: call void @llvm.genx.lsc.store.stateless.v1i1.v1i64.v4f32(<1 x i1> {{[^)]+}}, i8 4, i8 3, i8 3, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i64> {{[^)]+}}, <4 x float> {{[^)]+}}, i32 0) + // CHECK-STATEFUL-COUNT-2: call void @llvm.genx.lsc.store.bti.v1i1.v1i32.v4f32(<1 x i1> {{[^)]+}}, i8 4, i8 3, i8 3, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x 
i32> {{[^)]+}}, <4 x float> {{[^)]+}}, i32 {{[^)]+}}) + // CHECK-STATELESS-COUNT-2: call void @llvm.genx.lsc.store.stateless.v1i1.v1i64.v4f32(<1 x i1> {{[^)]+}}, i8 4, i8 3, i8 3, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i64> {{[^)]+}}, <4 x float> {{[^)]+}}, i32 0) block_store(acc, byte_offset64, vals, store_props_c); + block_store(acc, byte_offset64, view, store_props_c); - // CHECK-STATEFUL: call void @llvm.genx.lsc.store.bti.v1i1.v1i32.v4f32(<1 x i1> {{[^)]+}}, i8 4, i8 1, i8 1, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i32> {{[^)]+}}, <4 x float> {{[^)]+}}, i32 {{[^)]+}}) - // CHECK-STATELESS: call void @llvm.genx.lsc.store.stateless.v1i1.v1i64.v4f32(<1 x i1> {{[^)]+}}, i8 4, i8 1, i8 1, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i64> {{[^)]+}}, <4 x float> {{[^)]+}}, i32 0) + // CHECK-STATEFUL-COUNT-2: call void @llvm.genx.lsc.store.bti.v1i1.v1i32.v4f32(<1 x i1> {{[^)]+}}, i8 4, i8 1, i8 1, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i32> {{[^)]+}}, <4 x float> {{[^)]+}}, i32 {{[^)]+}}) + // CHECK-STATELESS-COUNT-2: call void @llvm.genx.lsc.store.stateless.v1i1.v1i64.v4f32(<1 x i1> {{[^)]+}}, i8 4, i8 1, i8 1, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i64> {{[^)]+}}, <4 x float> {{[^)]+}}, i32 0) block_store(acc, vals, mask, store_props_a); + block_store(acc, view, mask, store_props_a); - // CHECK-STATEFUL: call void @llvm.genx.lsc.store.bti.v1i1.v1i32.v4i32(<1 x i1> {{[^)]+}}, i8 4, i8 3, i8 3, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 {{[^)]+}}) - // CHECK-STATELESS: call void @llvm.genx.lsc.store.stateless.v1i1.v1i64.v4i32(<1 x i1> {{[^)]+}}, i8 4, i8 3, i8 3, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 0) + // CHECK-STATEFUL-COUNT-2: call void @llvm.genx.lsc.store.bti.v1i1.v1i32.v4i32(<1 x i1> {{[^)]+}}, i8 4, i8 3, i8 3, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 {{[^)]+}}) + // CHECK-STATELESS-COUNT-2: call void 
@llvm.genx.lsc.store.stateless.v1i1.v1i64.v4i32(<1 x i1> {{[^)]+}}, i8 4, i8 3, i8 3, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 0) block_store(acc, byte_offset64, valsi, mask, store_props_c); + block_store(acc, byte_offset64, viewi, mask, store_props_c); // Test accessor SVM/legacy block store - // CHECK-STATEFUL: call void @llvm.genx.oword.st.v4f32(i32 {{[^)]+}}, i32 {{[^)]+}}, <4 x float> {{[^)]+}}) - // CHECK-STATELESS: store <4 x float> {{[^)]+}}, ptr addrspace(4) {{[^)]+}}, align 16 + // CHECK-STATEFUL-COUNT-2: call void @llvm.genx.oword.st.v4f32(i32 {{[^)]+}}, i32 {{[^)]+}}, <4 x float> {{[^)]+}}) + // CHECK-STATELESS-COUNT-2: store <4 x float> {{[^)]+}}, ptr addrspace(4) {{[^)]+}}, align 16 block_store(acc, vals, store_props_b); + block_store(acc, view, store_props_b); - // CHECK-STATEFUL: call void @llvm.genx.oword.st.v4i32(i32 {{[^)]+}}, i32 {{[^)]+}}, <4 x i32> {{[^)]+}}) - // CHECK-STATELESS: store <4 x i32> {{[^)]+}}, ptr addrspace(4) {{[^)]+}}, align 16 + // CHECK-STATEFUL-COUNT-2: call void @llvm.genx.oword.st.v4i32(i32 {{[^)]+}}, i32 {{[^)]+}}, <4 x i32> {{[^)]+}}) + // CHECK-STATELESS-COUNT-2: store <4 x i32> {{[^)]+}}, ptr addrspace(4) {{[^)]+}}, align 16 block_store(acc, byte_offset32, valsi, store_props_b); + block_store(acc, byte_offset32, viewi, store_props_b); // Now try SLM block_store() with and without cache hints that are ignored. 
- // CHECK-COUNT-3: store <4 x float> {{[^)]+}}, ptr addrspace(3) {{[^)]+}}, align 16 + // CHECK-COUNT-5: store <4 x float> {{[^)]+}}, ptr addrspace(3) {{[^)]+}}, align 16 slm_block_store(byte_offset32, vals, store_props_b); slm_block_store(byte_offset32, view, store_props_b); slm_block_store(byte_offset32, view.select(), store_props_b); + slm_block_store(byte_offset32, view, store_props_b); + slm_block_store(byte_offset32, view.select(), store_props_b); - // CHECK-COUNT-3: store <4 x float> {{[^)]+}}, ptr addrspace(3) {{[^)]+}}, align 16 + // CHECK-COUNT-5: store <4 x float> {{[^)]+}}, ptr addrspace(3) {{[^)]+}}, align 16 slm_block_store(byte_offset32, vals, store_props_a); slm_block_store(byte_offset32, view, store_props_a); slm_block_store(byte_offset32, view.select(), store_props_a); + slm_block_store(byte_offset32, view, store_props_a); + slm_block_store(byte_offset32, view.select(), store_props_a); // Now try SLM block_store() with a predicate. - // CHECK-COUNT-3: call void @llvm.genx.lsc.store.slm.v1i1.v1i32.v4i32(<1 x i1> {{[^)]+}}, i8 4, i8 0, i8 0, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 0) + // CHECK-COUNT-5: call void @llvm.genx.lsc.store.slm.v1i1.v1i32.v4i32(<1 x i1> {{[^)]+}}, i8 4, i8 0, i8 0, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 0) slm_block_store(byte_offset32, valsi, mask, store_props_b); slm_block_store(byte_offset32, viewi, mask, store_props_b); slm_block_store(byte_offset32, viewi.select(), mask, store_props_b); + slm_block_store(byte_offset32, viewi, mask, store_props_b); + slm_block_store(byte_offset32, viewi.select(), mask, store_props_b); // Now try block_store() accepting local accessor. 
- // CHECK-COUNT-3: store <4 x float> {{[^)]+}}, ptr addrspace(3) {{[^)]+}}, align 8 + // CHECK-COUNT-5: store <4 x float> {{[^)]+}}, ptr addrspace(3) {{[^)]+}}, align 8 block_store(local_acc, vals, store_props_d); block_store(local_acc, view, store_props_d); block_store(local_acc, view.select(), store_props_d); + block_store(local_acc, view, store_props_d); + block_store(local_acc, view.select(), store_props_d); - // CHECK-COUNT-3: store <4 x i32> {{[^)]+}}, ptr addrspace(3) {{[^)]+}}, align 8 + // CHECK-COUNT-5: store <4 x i32> {{[^)]+}}, ptr addrspace(3) {{[^)]+}}, align 8 block_store(local_acc, byte_offset32, valsi, store_props_d); block_store(local_acc, byte_offset32, viewi, store_props_d); block_store(local_acc, byte_offset32, viewi.select(), store_props_d); + block_store(local_acc, byte_offset32, viewi, store_props_d); + block_store(local_acc, byte_offset32, viewi.select(), store_props_d); - // CHECK-COUNT-3: call void @llvm.genx.lsc.store.slm.v1i1.v1i32.v4f32(<1 x i1> {{[^)]+}}, i8 4, i8 0, i8 0, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i32> {{[^)]+}}, <4 x float> {{[^)]+}}, i32 0) + // CHECK-COUNT-5: call void @llvm.genx.lsc.store.slm.v1i1.v1i32.v4f32(<1 x i1> {{[^)]+}}, i8 4, i8 0, i8 0, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i32> {{[^)]+}}, <4 x float> {{[^)]+}}, i32 0) block_store(local_acc, vals, mask, store_props_a); block_store(local_acc, view, mask, store_props_a); block_store(local_acc, view.select(), mask, store_props_a); + block_store(local_acc, view, mask, store_props_a); + block_store(local_acc, view.select(), mask, store_props_a); - // CHECK-COUNT-3: call void @llvm.genx.lsc.store.slm.v1i1.v1i32.v4i32(<1 x i1> {{[^)]+}}, i8 4, i8 0, i8 0, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 0) + // CHECK-COUNT-5: call void @llvm.genx.lsc.store.slm.v1i1.v1i32.v4i32(<1 x i1> {{[^)]+}}, i8 4, i8 0, i8 0, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 0) 
block_store(local_acc, byte_offset32, valsi, mask, store_props_c); block_store(local_acc, byte_offset32, viewi, mask, store_props_c); block_store(local_acc, byte_offset32, viewi.select(), mask, store_props_c); + block_store(local_acc, byte_offset32, viewi, mask, store_props_c); + block_store(local_acc, byte_offset32, viewi.select(), mask, + store_props_c); } \ No newline at end of file From 7ff1a29cfa872227f9326d67ad7c00d7ff25707d Mon Sep 17 00:00:00 2001 From: Nick Sarnie Date: Wed, 5 Jun 2024 12:33:40 -0400 Subject: [PATCH 10/55] [SYCL] Allow specifying -foffload-lto with the new offload driver and build libdevice with thinLTO (#14036) This is the first change in my work on thinLTO for SYCL. --------- Signed-off-by: Sarnie, Nick --- clang/lib/Driver/ToolChains/Clang.cpp | 11 ++++++++--- clang/test/Driver/sycl-lto.cpp | 9 +++++++++ libdevice/cmake/modules/SYCLLibdevice.cmake | 20 ++++++++++---------- 3 files changed, 27 insertions(+), 13 deletions(-) create mode 100644 clang/test/Driver/sycl-lto.cpp diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index 2720ea5c9af6e..77c94ea60d315 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -5843,10 +5843,15 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, CmdArgs.push_back("-emit-llvm-uselists"); if (IsUsingLTO) { + bool IsUsingOffloadNewDriver = + Args.hasFlag(options::OPT_offload_new_driver, + options::OPT_no_offload_new_driver, false); + bool IsSYCLLTOSupported = JA.isDeviceOffloading(Action::OFK_SYCL) && + Triple.isSPIROrSPIRV() && + IsUsingOffloadNewDriver; if (IsDeviceOffloadAction && !JA.isDeviceOffloading(Action::OFK_OpenMP) && - !Args.hasFlag(options::OPT_offload_new_driver, - options::OPT_no_offload_new_driver, false) && - !Triple.isAMDGPU()) { + !IsUsingOffloadNewDriver && !Triple.isAMDGPU() && + !IsSYCLLTOSupported) { D.Diag(diag::err_drv_unsupported_opt_for_target) << 
Args.getLastArg(options::OPT_foffload_lto, options::OPT_foffload_lto_EQ) diff --git a/clang/test/Driver/sycl-lto.cpp b/clang/test/Driver/sycl-lto.cpp new file mode 100644 index 0000000000000..9e466093fee3c --- /dev/null +++ b/clang/test/Driver/sycl-lto.cpp @@ -0,0 +1,9 @@ +// Verify the usage of -foffload-lto with SYCL. + +// Verify we error when using the old offload driver. +// RUN: not %clangxx -fsycl -foffload-lto=thin %s -### 2>&1 | FileCheck -check-prefix=CHECK_ERROR %s +// CHECK_ERROR: unsupported option '-foffload-lto=thin' for target 'spir64-unknown-unknown' + +// Verify there's no error and we see the expected cc1 flags with the new offload driver. +// RUN: %clangxx -fsycl --offload-new-driver -foffload-lto=thin %s -### 2>&1 | FileCheck -check-prefix=CHECK_SUPPORTED %s +// CHECK_SUPPORTED: clang{{.*}} "-cc1" "-triple" "spir64-unknown-unknown" {{.*}} "-flto=thin" "-flto-unit" diff --git a/libdevice/cmake/modules/SYCLLibdevice.cmake b/libdevice/cmake/modules/SYCLLibdevice.cmake index e8c96d0099823..d40c51bffd178 100644 --- a/libdevice/cmake/modules/SYCLLibdevice.cmake +++ b/libdevice/cmake/modules/SYCLLibdevice.cmake @@ -91,7 +91,7 @@ function(add_devicelib_obj obj_filename) set(devicelib-obj-file-new-offload ${obj_new_offload_binary_dir}/${obj_filename}.${new-offload-lib-suffix}) add_custom_command(OUTPUT ${devicelib-obj-file-new-offload} - COMMAND ${clang} -fsycl -c --offload-new-driver + COMMAND ${clang} -fsycl -c --offload-new-driver -foffload-lto=thin ${compile_opts} ${sycl_targets_opt} ${OBJ_EXTRA_ARGS} ${CMAKE_CURRENT_SOURCE_DIR}/${OBJ_SRC} -o ${devicelib-obj-file-new-offload} @@ -270,7 +270,7 @@ add_custom_command(OUTPUT ${obj_binary_dir}/libsycl-fallback-imf.${lib-suffix} VERBATIM) add_custom_command(OUTPUT ${obj_binary_dir}/libsycl-fallback-imf.${new-offload-lib-suffix} - COMMAND ${clang} -fsycl -c --offload-new-driver + COMMAND ${clang} -fsycl -c --offload-new-driver -foffload-lto=thin ${compile_opts} ${sycl_targets_opt} ${imf_fp32_fallback_src} 
-I ${CMAKE_CURRENT_SOURCE_DIR}/imf -o ${obj_binary_dir}/libsycl-fallback-imf.${new-offload-lib-suffix} @@ -286,7 +286,7 @@ add_custom_command(OUTPUT ${obj_binary_dir}/fallback-imf-fp32-host.${lib-suffix} VERBATIM) add_custom_command(OUTPUT ${obj_binary_dir}/fallback-imf-fp32-host.${new-offload-lib-suffix} - COMMAND ${clang} ${imf_host_cxx_flags} --offload-new-driver + COMMAND ${clang} ${imf_host_cxx_flags} --offload-new-driver -foffload-lto=thin -I ${CMAKE_CURRENT_SOURCE_DIR}/imf ${imf_fp32_fallback_src} -o ${obj_binary_dir}/fallback-imf-fp32-host.${new-offload-lib-suffix} @@ -321,7 +321,7 @@ add_custom_command(OUTPUT ${obj_binary_dir}/libsycl-fallback-imf-fp64.${lib-suff add_custom_command(OUTPUT ${obj_binary_dir}/libsycl-fallback-imf-fp64.${new-offload-lib-suffix} COMMAND ${clang} -fsycl -c -I ${CMAKE_CURRENT_SOURCE_DIR}/imf - --offload-new-driver + --offload-new-driver -foffload-lto=thin ${compile_opts} ${sycl_targets_opt} ${imf_fp64_fallback_src} -o ${obj_binary_dir}/libsycl-fallback-imf-fp64.${new-offload-lib-suffix} @@ -337,7 +337,7 @@ add_custom_command(OUTPUT ${obj_binary_dir}/fallback-imf-fp64-host.${lib-suffix} VERBATIM) add_custom_command(OUTPUT ${obj_binary_dir}/fallback-imf-fp64-host.${new-offload-lib-suffix} - COMMAND ${clang} ${imf_host_cxx_flags} --offload-new-driver + COMMAND ${clang} ${imf_host_cxx_flags} --offload-new-driver -foffload-lto=thin -I ${CMAKE_CURRENT_SOURCE_DIR}/imf ${imf_fp64_fallback_src} -o ${obj_binary_dir}/fallback-imf-fp64-host.${new-offload-lib-suffix} @@ -372,7 +372,7 @@ add_custom_command(OUTPUT ${obj_binary_dir}/libsycl-fallback-imf-bf16.${lib-suff add_custom_command(OUTPUT ${obj_binary_dir}/libsycl-fallback-imf-bf16.${new-offload-lib-suffix} COMMAND ${clang} -fsycl -c -I ${CMAKE_CURRENT_SOURCE_DIR}/imf - --offload-new-driver + --offload-new-driver -foffload-lto=thin ${compile_opts} ${sycl_targets_opt} ${imf_bf16_fallback_src} -o ${obj_binary_dir}/libsycl-fallback-imf-bf16.${new-offload-lib-suffix} @@ -388,7 +388,7 @@ 
add_custom_command(OUTPUT ${obj_binary_dir}/fallback-imf-bf16-host.${lib-suffix} VERBATIM) add_custom_command(OUTPUT ${obj_binary_dir}/fallback-imf-bf16-host.${new-offload-lib-suffix} - COMMAND ${clang} ${imf_host_cxx_flags} --offload-new-driver + COMMAND ${clang} ${imf_host_cxx_flags} --offload-new-driver -foffload-lto=thin -I ${CMAKE_CURRENT_SOURCE_DIR}/imf ${imf_bf16_fallback_src} -o ${obj_binary_dir}/fallback-imf-bf16-host.${new-offload-lib-suffix} @@ -437,7 +437,7 @@ add_custom_command(OUTPUT ${obj_binary_dir}/imf-fp32-host.${lib-suffix} VERBATIM) add_custom_command(OUTPUT ${obj_binary_dir}/imf-fp32-host.${new-offload-lib-suffix} - COMMAND ${clang} ${imf_host_cxx_flags} --offload-new-driver + COMMAND ${clang} ${imf_host_cxx_flags} --offload-new-driver -foffload-lto=thin ${CMAKE_CURRENT_SOURCE_DIR}/imf_wrapper.cpp -o ${obj_binary_dir}/imf-fp32-host.${new-offload-lib-suffix} MAIN_DEPENDENCY ${CMAKE_CURRENT_SOURCE_DIR}/imf_wrapper.cpp @@ -453,7 +453,7 @@ add_custom_command(OUTPUT ${obj_binary_dir}/imf-fp64-host.${lib-suffix} VERBATIM) add_custom_command(OUTPUT ${obj_binary_dir}/imf-fp64-host.${new-offload-lib-suffix} - COMMAND ${clang} ${imf_host_cxx_flags} --offload-new-driver + COMMAND ${clang} ${imf_host_cxx_flags} --offload-new-driver -foffload-lto=thin ${CMAKE_CURRENT_SOURCE_DIR}/imf_wrapper_fp64.cpp -o ${obj_binary_dir}/imf-fp64-host.${new-offload-lib-suffix} MAIN_DEPENDENCY ${CMAKE_CURRENT_SOURCE_DIR}/imf_wrapper_fp64.cpp @@ -469,7 +469,7 @@ add_custom_command(OUTPUT ${obj_binary_dir}/imf-bf16-host.${lib-suffix} VERBATIM) add_custom_command(OUTPUT ${obj_binary_dir}/imf-bf16-host.${new-offload-lib-suffix} - COMMAND ${clang} ${imf_host_cxx_flags} --offload-new-driver + COMMAND ${clang} ${imf_host_cxx_flags} --offload-new-driver -foffload-lto=thin ${CMAKE_CURRENT_SOURCE_DIR}/imf_wrapper_bf16.cpp -o ${obj_binary_dir}/imf-bf16-host.${new-offload-lib-suffix} MAIN_DEPENDENCY ${CMAKE_CURRENT_SOURCE_DIR}/imf_wrapper_bf16.cpp From 
fbeb7d475a83d0d9f568ecb7e68cff0d091bf024 Mon Sep 17 00:00:00 2001 From: Buildbot for SYCL Date: Thu, 6 Jun 2024 00:55:37 +0800 Subject: [PATCH 11/55] [GHA] Uplift Linux IGC Dev RT version to igc-dev-8b999ec (#13963) Scheduled igc dev drivers uplift Co-authored-by: GitHub Actions --- devops/dependencies-igc-dev.json | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/devops/dependencies-igc-dev.json b/devops/dependencies-igc-dev.json index 39729a23fec3a..f2c6de2217473 100644 --- a/devops/dependencies-igc-dev.json +++ b/devops/dependencies-igc-dev.json @@ -1,10 +1,10 @@ { "linux": { "igc_dev": { - "github_tag": "igc-dev-4627f1f", - "version": "4627f1f", - "updated_at": "2024-05-26T23:48:05Z", - "url": "https://api.github.com/repos/intel/intel-graphics-compiler/actions/artifacts/1539236241/zip", + "github_tag": "igc-dev-8b999ec", + "version": "8b999ec", + "updated_at": "2024-05-30T02:09:07Z", + "url": "https://api.github.com/repos/intel/intel-graphics-compiler/actions/artifacts/1550749489/zip", "root": "{DEPS_ROOT}/opencl/runtime/linux/oclgpu" } } From aa92b2416d8ab1df3bcc27b3068f702eacc1d23a Mon Sep 17 00:00:00 2001 From: Nick Sarnie Date: Wed, 5 Jun 2024 13:03:27 -0400 Subject: [PATCH 12/55] [SYCL][E2E] Disable flaky profiling_queue.cpp test on CUDA (#14054) See https://github.com/intel/llvm/issues/14053 Signed-off-by: Sarnie, Nick --- sycl/test-e2e/ProfilingTag/profiling_queue.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sycl/test-e2e/ProfilingTag/profiling_queue.cpp b/sycl/test-e2e/ProfilingTag/profiling_queue.cpp index 7ae3235d20ff0..d0da7612d4ea9 100644 --- a/sycl/test-e2e/ProfilingTag/profiling_queue.cpp +++ b/sycl/test-e2e/ProfilingTag/profiling_queue.cpp @@ -19,6 +19,10 @@ // FPGA emulator seems to return unexpected start time for the fallback barrier. 
// UNSUPPORTED: accelerator +// Flaky on CUDA +// https://github.com/intel/llvm/issues/14053 +// UNSUPPORTED: cuda + #include "common.hpp" int main() { From 25f8a7cc7382f29dfdc61e7edb596533258ab241 Mon Sep 17 00:00:00 2001 From: JackAKirk Date: Wed, 5 Jun 2024 18:43:50 +0100 Subject: [PATCH 13/55] [test-e2e][cuda] Fully qualify `sycl::sub_group` to avoid namespace ambiguity (#14018) This change avoids the ambiguity between the deprecated `sycl::ext::oneapi::sub_group` and `sycl::sub_group` when both namespaces are used. This fixes a failure on windows for cuda. --------- Signed-off-by: JackAKirk --- sycl/test-e2e/Matrix/joint_matrix_apply_cuda.hpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/sycl/test-e2e/Matrix/joint_matrix_apply_cuda.hpp b/sycl/test-e2e/Matrix/joint_matrix_apply_cuda.hpp index 6613efa7dfe17..185a410fb3aef 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_apply_cuda.hpp +++ b/sycl/test-e2e/Matrix/joint_matrix_apply_cuda.hpp @@ -50,9 +50,11 @@ void matrix_verify_lambda(queue q, auto sg = spmd_item.get_sub_group(); - joint_matrix sub_a; - joint_matrix sub_b; - joint_matrix sub_c; + joint_matrix + sub_a; + joint_matrix + sub_b; + joint_matrix sub_c; joint_matrix_fill(sg, sub_a, 3); joint_matrix_fill(sg, sub_b, 1); From 2dc80c3a30af210d4e0b2647bcc6c053007533e8 Mon Sep 17 00:00:00 2001 From: fineg74 <61437305+fineg74@users.noreply.github.com> Date: Wed, 5 Jun 2024 13:38:00 -0700 Subject: [PATCH 14/55] [ESIMD] Allow full autodeduction for slm_atomic_update and atomic_update for local_accessors API accepting simd_view (#14024) --- sycl/include/sycl/ext/intel/esimd/memory.hpp | 616 +++++++++++++++++- .../esimd/memory_properties_atomic_update.cpp | 192 +++++- 2 files changed, 786 insertions(+), 22 deletions(-) diff --git a/sycl/include/sycl/ext/intel/esimd/memory.hpp b/sycl/include/sycl/ext/intel/esimd/memory.hpp index 5eabda41a4df4..bf3d113e2df5f 100644 --- a/sycl/include/sycl/ext/intel/esimd/memory.hpp +++ 
b/sycl/include/sycl/ext/intel/esimd/memory.hpp @@ -550,7 +550,7 @@ gather(const T *p, OffsetSimdViewT byte_offsets, simd_mask mask, /// simd byte_offsets, /// simd_mask mask, PassThruSimdViewT pass_thru, /// PropertyListT props = {}); -/// Variation of the API that allows to use \c simd_view without specifying \c T +/// Variation of the API that allows using \c simd_view without specifying \c T /// and \c N template parameters. Loads ("gathers") elements of the type 'T' /// from memory locations addressed by the base pointer \p p and byte offsets \p /// byte_offsets, and returns the loaded elements. Access to any element's @@ -591,7 +591,7 @@ gather(const T *p, simd byte_offsets, simd_mask mask, /// OffsetSimdViewT byte_offsets, /// simd_mask mask, PassThruSimdViewT pass_thru, /// PropertyListT props = {}); -/// Variation of the API that allows to use \c simd_view without specifying \c T +/// Variation of the API that allows using \c simd_view without specifying \c T /// and \c N template parameters. Loads ("gathers") elements of the type 'T' /// from memory locations addressed by the base pointer \p p and byte offsets \p /// byte_offsets, and returns the loaded elements. Access to any element's @@ -637,7 +637,7 @@ gather(const T *p, OffsetSimdViewT byte_offsets, simd_mask mask, /// OffsetSimdViewT byte_offsets, /// simd_mask mask, simd pass_thru, /// PropertyListT props = {}); -/// Variation of the API that allows to use \c simd_view without specifying \c T +/// Variation of the API that allows using \c simd_view without specifying \c T /// and \c N template parameters. Loads ("gathers") elements of the type 'T' /// from memory locations addressed by the base pointer \p p and byte offsets \p /// byte_offsets, and returns the loaded elements. 
Access to any element's @@ -711,7 +711,7 @@ gather(const T *p, OffsetSimdViewT byte_offsets, simd_mask mask, /// simd gather(const T *p, /// OffsetSimdViewT byte_offsets, /// simd_mask mask, PropertyListT props = {}); -/// Variation of the API that allows to use \c simd_view without specifying \c T +/// Variation of the API that allows using \c simd_view without specifying \c T /// and \c N template parameters. Loads ("gathers") elements of the type 'T' /// from memory locations addressed by the base pointer \p p and byte offsets \p /// byte_offsets, and returns the loaded elements. Access to any element's @@ -772,7 +772,7 @@ gather(const T *p, OffsetSimdViewT byte_offsets, PropertyListT props = {}) { /// simd gather(const T *p, /// OffsetSimdViewT byte_offsets, /// PropertyListT props = {}); -/// Variation of the API that allows to use \c simd_view without specifying \c T +/// Variation of the API that allows using \c simd_view without specifying \c T /// and \c N template parameters. Loads ("gathers") elements of the type 'T' /// from memory locations addressed by the base pointer \p p and byte offsets \p /// byte_offsets, and returns the loaded elements. @@ -925,7 +925,7 @@ scatter(T *p, simd byte_offsets, simd vals, /// void scatter(T *p, simd byte_offsets, ValuesSimdViewT vals, /// simd_mask mask, PropertyListT props = {}); /// -/// Variation of the API that allows to use \c simd_view without specifying \c T +/// Variation of the API that allows using \c simd_view without specifying \c T /// and \c N template parameters. /// Writes ("scatters") elements of the input vector to different memory /// locations. 
Each memory location is base address plus an offset - a @@ -993,7 +993,7 @@ scatter(T *p, simd byte_offsets, simd vals, /// void scatter(T *p, OffsetSimdViewT byte_offsets, ValuesSimdViewT vals, /// simd_mask mask, PropertyListT props = {}); /// -/// Variation of the API that allows to use \c simd_view without specifying \c T +/// Variation of the API that allows using \c simd_view without specifying \c T /// and \c N template parameters. /// Writes ("scatters") elements of the input vector to different memory /// locations. Each memory location is base address plus an offset - a @@ -1033,7 +1033,7 @@ scatter(T *p, OffsetSimdViewT byte_offsets, ValuesSimdViewT vals, /// void scatter(T *p, simd byte_offsets, ValuesSimdViewT vals, /// PropertyListT props = {}); /// -/// Variation of the API that allows to use \c simd_view without specifying \c T +/// Variation of the API that allows using \c simd_view without specifying \c T /// and \c N template parameters. /// Writes ("scatters") elements of the input vector to different memory /// locations. Each memory location is base address plus an offset - a @@ -1101,7 +1101,7 @@ scatter(T *p, OffsetSimdViewT byte_offsets, simd vals, /// void scatter(T *p, OffsetSimdViewT byte_offsets, simd vals, /// simd_mask mask, PropertyListT props = {}); /// -/// Variation of the API that allows to use \c simd_view without specifying \c T +/// Variation of the API that allows using \c simd_view without specifying \c T /// and \c N template parameters. /// Writes ("scatters") elements of the input vector to different memory /// locations. 
Each memory location is base address plus an offset - a @@ -1140,7 +1140,7 @@ scatter(T *p, OffsetSimdViewT byte_offsets, simd vals, /// void scatter(T *p, OffsetSimdViewT byte_offsets, simd vals, /// PropertyListT props = {}); /// -/// Variation of the API that allows to use \c simd_view without specifying \c T +/// Variation of the API that allows using \c simd_view without specifying \c T /// and \c N template parameters. /// Writes ("scatters") elements of the input vector to different memory /// locations. Each memory location is base address plus an offset - a @@ -1214,7 +1214,7 @@ scatter(T *p, OffsetSimdViewT byte_offsets, simd vals, /// void scatter(T *p, OffsetSimdViewT byte_offsets, ValuesSimdViewT vals, /// PropertyListT props = {}); /// -/// Variation of the API that allows to use \c simd_view without specifying \c T +/// Variation of the API that allows using \c simd_view without specifying \c T /// and \c N template parameters. /// Writes ("scatters") elements of the input vector to different memory /// locations. Each memory location is base address plus an offset - a @@ -6049,7 +6049,7 @@ slm_gather(OffsetSimdViewT byte_offsets, simd_mask mask, /// OffsetSimdViewT byte_offsets, /// simd_mask mask, simd pass_thru, /// PropertyListT props = {}); -/// Variation of the API that allows to use \c simd_view without specifying \c T +/// Variation of the API that allows using \c simd_view without specifying \c T /// and \c N template parameters. 
/// Loads ("gathers") elements of the type 'T' from Shared Local Memory /// locations addressed by byte offsets \p byte_offsets, and returns the loaded @@ -6094,7 +6094,7 @@ slm_gather(OffsetSimdViewT byte_offsets, simd_mask mask, /// OffsetSimdViewT byte_offsets, /// simd_mask mask, PassThruSimdViewT pass_thru, /// PropertyListT props = {}); -/// Variation of the API that allows to use \c simd_view without specifying \c T +/// Variation of the API that allows using \c simd_view without specifying \c T /// and \c N template parameters. /// Loads ("gathers") elements of the type 'T' from Shared Local Memory /// locations addressed by byte offsets \p byte_offsets, and returns the loaded @@ -6143,7 +6143,7 @@ slm_gather(OffsetSimdViewT byte_offsets, simd_mask mask, /// OffsetSimdViewT byte_offsets, /// simd_mask mask, PassThruSimdViewT pass_thru, /// PropertyListT props = {}); -/// Variation of the API that allows to use \c simd_view without specifying \c T +/// Variation of the API that allows using \c simd_view without specifying \c T /// and \c N template parameters. /// Loads ("gathers") elements of the type 'T' from Shared Local Memory /// locations addressed by byte offsets \p byte_offsets, and returns the loaded @@ -8046,7 +8046,7 @@ atomic_update(AccessorT lacc, simd byte_offset, /// atomic_update(local_accessor lacc, /// simd byte_offset, /// simd src0, -/// simd_mask<1> pred = 1); // (lacc-au1-1) +/// simd_mask mask = 1); // (lacc-au1-1) /// /// Usage of cache hints or non-standard operation width N requires DG2 or PVC. @@ -8097,11 +8097,102 @@ slm_atomic_update(simd byte_offset, simd src0, } } +/// simd +/// slm_atomic_update(simd byte_offset, +/// SrcSimdViewT src0, +/// simd_mask mask = 1) +/// +/// Variation of the API that allows using \c simd_view without specifying +/// \c T and \c N template parameters. 
+/// Atomically updates \c N memory locations in SLM indicated by +/// a vector of offsets, and returns a vector of old +/// values found at the memory locations before update. +/// @tparam Op The atomic operation. +/// @param byte_offset The vector of 32-bit offsets. +/// @param src0 is the first atomic operand. +/// @param mask Operation mask, only locations with non-zero in the +/// corresponding mask element are updated. +/// @return A vector of the old values at the memory locations before the +/// update. +template +__ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args() == 1 && + detail::is_simd_view_type_v, + simd> +slm_atomic_update(simd byte_offset, SrcSimdViewT src0, + simd_mask mask = 1) { + static_assert(N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(), + "Size of src0 parameter must correspond to the size of " + "byte_offset parameter."); + return slm_atomic_update(byte_offset, src0.read(), mask); +} + +/// simd +/// slm_atomic_update(OffsetSimdViewT byte_offset, +/// simd src0, +/// simd_mask mask = 1) +/// +/// Variation of the API that allows using \c simd_view without specifying +/// \c T and \c N template parameters. +/// Atomically updates \c N memory locations in SLM indicated by +/// a vector of offsets, and returns a vector of old +/// values found at the memory locations before update. +/// @tparam Op The atomic operation. +/// @param byte_offset The vector of 32-bit offsets. +/// @param src0 is the first atomic operand. +/// @param mask Operation mask, only locations with non-zero in the +/// corresponding mask element are updated. +/// @return A vector of the old values at the memory locations before the +/// update. 
+template +__ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args() == 1 && + detail::is_simd_view_type_v, + simd> +slm_atomic_update(OffsetSimdViewT byte_offset, simd src0, + simd_mask mask = 1) { + static_assert(N == OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY(), + "Size of src0 parameter must correspond to the size of " + "byte_offset parameter."); + return slm_atomic_update(byte_offset.read(), src0, mask); +} + +/// simd +/// slm_atomic_update(OffsetSimdViewT byte_offset, +/// SrcSimdViewT src0, +/// simd_mask mask = 1) +/// +/// Variation of the API that allows using \c simd_view without specifying +/// \c T and \c N template parameters. +/// Atomically updates \c N memory locations in SLM indicated by +/// a vector of offsets, and returns a vector of old +/// values found at the memory locations before update. +/// @tparam Op The atomic operation. +/// @param byte_offset The vector of 32-bit offsets. +/// @param src0 is the first atomic operand. +/// @param mask Operation mask, only locations with non-zero in the +/// corresponding mask element are updated. +/// @return A vector of the old values at the memory locations before the +/// update. 
+template +__ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args() == 1 && + detail::is_simd_view_type_v && + detail::is_simd_view_type_v, + simd> +slm_atomic_update(OffsetSimdViewT byte_offset, SrcSimdViewT src0, + simd_mask mask = 1) { + static_assert(N == OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY(), + "Size of src0 parameter must correspond to the size of " + "byte_offset parameter."); + return slm_atomic_update(byte_offset.read(), src0.read(), mask); +} + /// simd /// atomic_update(local_accessor lacc, /// simd byte_offset, /// simd src0, -/// simd_mask<1> pred = 1); // (lacc-au1-1) +/// simd_mask<1> mask = 1); // (lacc-au1-1) /// /// Atomically updates \c N memory locations in SLM indicated by /// local accessor \p lacc and a vector of offsets, and returns a vector of old @@ -8125,6 +8216,105 @@ atomic_update(AccessorT lacc, simd byte_offset, simd src0, return slm_atomic_update(byte_offset, src0, mask); } +/// simd +/// atomic_update(local_accessor lacc, +/// OffsetSimdViewT byte_offset, +/// simd src0, +/// simd_mask<1> mask = 1); +/// +/// Variation of the API that allows using \c simd_view without specifying +/// \c T and \c N template parameters. +/// Atomically updates \c N memory locations in SLM indicated by +/// local accessor \p lacc and a vector of offsets, and returns a vector of old +/// values found at the memory locations before update. +/// @tparam Op The atomic operation. +/// @param byte_offset The vector of 32-bit offsets. +/// @param src0 is the first atomic operand. +/// @param mask Operation mask, only locations with non-zero in the +/// corresponding mask element are updated. +/// @return A vector of the old values at the memory locations before the +/// update. 
+template +__ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args() == 1 && + detail::is_simd_view_type_v && + __ESIMD_DNS::is_rw_local_accessor_v, + simd> +atomic_update(AccessorT lacc, OffsetSimdViewT byte_offset, simd src0, + simd_mask mask = 1) { + static_assert(N == OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY(), + "Size of src0 parameter must correspond to the size of " + "byte_offset parameter."); + return atomic_update(lacc, byte_offset.read(), src0, mask); +} + +/// simd +/// atomic_update(local_accessor lacc, +/// simd byte_offset, +/// SrcSimdViewT src0, +/// simd_mask<1> mask = 1); +/// +/// Variation of the API that allows using \c simd_view without specifying +/// \c T and \c N template parameters. +/// Atomically updates \c N memory locations in SLM indicated by +/// local accessor \p lacc and a vector of offsets, and returns a vector of old +/// values found at the memory locations before update. +/// @tparam Op The atomic operation. +/// @param byte_offset The vector of 32-bit offsets. +/// @param src0 is the first atomic operand. +/// @param mask Operation mask, only locations with non-zero in the +/// corresponding mask element are updated. +/// @return A vector of the old values at the memory locations before the +/// update. 
+template +__ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args() == 1 && + detail::is_simd_view_type_v && + __ESIMD_DNS::is_rw_local_accessor_v, + simd> +atomic_update(AccessorT lacc, simd byte_offset, SrcSimdViewT src0, + simd_mask mask = 1) { + static_assert(N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(), + "Size of src0 parameter must correspond to the size of " + "byte_offset parameter."); + return atomic_update(lacc, byte_offset, src0.read(), mask); +} + +/// simd +/// atomic_update(local_accessor lacc, +/// OffsetSimdViewT byte_offset, +/// SrcSimdViewT src0, +/// simd_mask<1> mask = 1); +/// +/// Variation of the API that allows using \c simd_view without specifying +/// \c T and \c N template parameters. +/// Atomically updates \c N memory locations in SLM indicated by +/// local accessor \p lacc and a vector of offsets, and returns a vector of old +/// values found at the memory locations before update. +/// @tparam Op The atomic operation. +/// @param byte_offset The vector of 32-bit offsets. +/// @param src0 is the first atomic operand. +/// @param mask Operation mask, only locations with non-zero in the +/// corresponding mask element are updated. +/// @return A vector of the old values at the memory locations before the +/// update. +template +__ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args() == 1 && + detail::is_simd_view_type_v && + detail::is_simd_view_type_v && + __ESIMD_DNS::is_rw_local_accessor_v, + simd> +atomic_update(AccessorT lacc, OffsetSimdViewT byte_offset, SrcSimdViewT src0, + simd_mask mask = 1) { + static_assert(N == OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY(), + "Size of src0 parameter must correspond to the size of " + "byte_offset parameter."); + return atomic_update(lacc, byte_offset.read(), src0.read(), mask); +} /// Two argument variant of the atomic update operation. 
/// simd @@ -8137,7 +8327,7 @@ atomic_update(AccessorT lacc, simd byte_offset, simd src0, /// simd byte_offset, /// simd src0, /// simd src1, -/// simd_mask<1> pred = 1); // (lacc-au2-1) +/// simd_mask<1> mask = 1); // (lacc-au2-1) /// /// simd @@ -8179,12 +8369,233 @@ slm_atomic_update(simd byte_offset, simd src0, } } +/// simd +/// slm_atomic_update(simd byte_offset, +/// SrcSimdViewT src0, simd src1, +/// simd_mask mask = 1); +/// Variation of the API that allows using \c simd_view without specifying +/// \c T and \c N template parameters. +/// Atomically updates \c N memory locations in SLM indicated by +/// a vector of offsets, and returns a vector of old +/// values found at the memory locations before update. +/// @tparam Op The atomic operation. +/// @param byte_offset The vector of 32-bit offsets. +/// @param src0 is the first atomic operand (new value). +/// @param src1 is the second atomic operand (expected value). +/// @param mask Operation mask, only locations with non-zero in the +/// corresponding mask element are updated. +/// @return A vector of the old values at the memory locations before the +/// update. +template +__ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args() == 2 && + detail::is_simd_view_type_v, + simd> +slm_atomic_update(simd byte_offset, SrcSimdViewT src0, + simd src1, simd_mask mask = 1) { + static_assert(N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(), + "Size of src0 parameter must correspond to the size of " + "byte_offset and src1 parameters."); + return slm_atomic_update(byte_offset, src0.read(), src1, mask); +} + +/// simd +/// slm_atomic_update(simd byte_offset, +/// simd src0, SrcSimdViewT src1, +/// simd_mask mask = 1); +/// Variation of the API that allows using \c simd_view without specifying +/// \c T and \c N template parameters. +/// Atomically updates \c N memory locations in SLM indicated by +/// a vector of offsets, and returns a vector of old +/// values found at the memory locations before update. 
+/// @tparam Op The atomic operation. +/// @param byte_offset The vector of 32-bit offsets. +/// @param src0 is the first atomic operand (new value). +/// @param src1 is the second atomic operand (expected value). +/// @param mask Operation mask, only locations with non-zero in the +/// corresponding mask element are updated. +/// @return A vector of the old values at the memory locations before the +/// update. +template +__ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args() == 2 && + detail::is_simd_view_type_v, + simd> +slm_atomic_update(simd byte_offset, simd src0, + SrcSimdViewT src1, simd_mask mask = 1) { + static_assert(N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(), + "Size of src1 parameter must correspond to the size of " + "byte_offset and src0 parameters."); + return slm_atomic_update(byte_offset, src0, src1.read(), mask); +} + +/// simd +/// slm_atomic_update(simd byte_offset, +/// SrcSimdViewT src0, SrcSimdViewT src1, +/// simd_mask mask = 1); +/// Variation of the API that allows using \c simd_view without specifying +/// \c T and \c N template parameters. +/// Atomically updates \c N memory locations in SLM indicated by +/// a vector of offsets, and returns a vector of old +/// values found at the memory locations before update. +/// @tparam Op The atomic operation. +/// @param byte_offset The vector of 32-bit offsets. +/// @param src0 is the first atomic operand (new value). +/// @param src1 is the second atomic operand (expected value). +/// @param mask Operation mask, only locations with non-zero in the +/// corresponding mask element are updated. +/// @return A vector of the old values at the memory locations before the +/// update. 
+template +__ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args() == 2 && + detail::is_simd_view_type_v, + simd> +slm_atomic_update(simd byte_offset, SrcSimdViewT src0, + SrcSimdViewT src1, simd_mask mask = 1) { + static_assert( + N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(), + "Size of src1 and src0 parameters must correspond to the size of " + "byte_offset parameter."); + return slm_atomic_update(byte_offset, src0.read(), src1.read(), + mask); +} + +/// simd +/// slm_atomic_update(OffsetSimdViewT byte_offset, +/// simd src0, simd src1, +/// simd_mask mask = 1); +/// Variation of the API that allows using \c simd_view without specifying +/// \c T and \c N template parameters. +/// Atomically updates \c N memory locations in SLM indicated by +/// a vector of offsets, and returns a vector of old +/// values found at the memory locations before update. +/// @tparam Op The atomic operation. +/// @param byte_offset The vector of 32-bit offsets. +/// @param src0 is the first atomic operand (new value). +/// @param src1 is the second atomic operand (expected value). +/// @param mask Operation mask, only locations with non-zero in the +/// corresponding mask element are updated. +/// @return A vector of the old values at the memory locations before the +/// update. +template +__ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args() == 2 && + detail::is_simd_view_type_v, + simd> +slm_atomic_update(OffsetSimdViewT byte_offset, simd src0, simd src1, + simd_mask mask = 1) { + static_assert( + N == OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY(), + "Size of src1 and src0 parameters must correspond to the size of " + "byte_offset parameter."); + return slm_atomic_update(byte_offset.read(), src0, src1, mask); +} + +/// simd +/// slm_atomic_update(OffsetSimdViewT byte_offset, +/// SrcSimdViewT src0, simd src1, +/// simd_mask mask = 1); +/// Variation of the API that allows using \c simd_view without specifying +/// \c T and \c N template parameters. 
+/// Atomically updates \c N memory locations in SLM indicated by +/// a vector of offsets, and returns a vector of old +/// values found at the memory locations before update. +/// @tparam Op The atomic operation. +/// @param byte_offset The vector of 32-bit offsets. +/// @param src0 is the first atomic operand (new value). +/// @param src1 is the second atomic operand (expected value). +/// @param mask Operation mask, only locations with non-zero in the +/// corresponding mask element are updated. +/// @return A vector of the old values at the memory locations before the +/// update. +template +__ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args() == 2 && + detail::is_simd_view_type_v && + detail::is_simd_view_type_v, + simd> +slm_atomic_update(OffsetSimdViewT byte_offset, SrcSimdViewT src0, + simd src1, simd_mask mask = 1) { + static_assert(N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY() && + N == OffsetSimdViewT::getSizeX() * + OffsetSimdViewT::getSizeY(), + "Size of src0 parameter must correspond to the size of " + "byte_offset and src1 parameters."); + return slm_atomic_update(byte_offset.read(), src0.read(), src1, + mask); +} + +/// simd +/// slm_atomic_update(OffsetSimdViewT byte_offset, +/// simd src0, SrcSimdViewT src1, +/// simd_mask mask = 1); +/// Variation of the API that allows using \c simd_view without specifying +/// \c T and \c N template parameters. +/// Atomically updates \c N memory locations in SLM indicated by +/// a vector of offsets, and returns a vector of old +/// values found at the memory locations before update. +/// @tparam Op The atomic operation. +/// @param byte_offset The vector of 32-bit offsets. +/// @param src0 is the first atomic operand (new value). +/// @param src1 is the second atomic operand (expected value). +/// @param mask Operation mask, only locations with non-zero in the +/// corresponding mask element are updated. +/// @return A vector of the old values at the memory locations before the +/// update. 
+template +__ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args() == 2 && + detail::is_simd_view_type_v && + detail::is_simd_view_type_v, + simd> +slm_atomic_update(OffsetSimdViewT byte_offset, simd src0, + SrcSimdViewT src1, simd_mask mask = 1) { + static_assert(N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY() && + N == OffsetSimdViewT::getSizeX() * + OffsetSimdViewT::getSizeY(), + "Size of src1 parameter must correspond to the size of " + "byte_offset and src0 parameters."); + return slm_atomic_update(byte_offset.read(), src0, src1.read(), + mask); +} + +/// simd +/// slm_atomic_update(OffsetSimdViewT byte_offset, +/// SrcSimdViewT src0, SrcSimdViewT src1, +/// simd_mask mask = 1); +/// Variation of the API that allows using \c simd_view without specifying +/// \c T and \c N template parameters. +/// Atomically updates \c N memory locations in SLM indicated by +/// a vector of offsets, and returns a vector of old +/// values found at the memory locations before update. +/// @tparam Op The atomic operation. +/// @param byte_offset The vector of 32-bit offsets. +/// @param src0 is the first atomic operand (new value). +/// @param src1 is the second atomic operand (expected value). +/// @param mask Operation mask, only locations with non-zero in the +/// corresponding mask element are updated. +/// @return A vector of the old values at the memory locations before the +/// update. 
+template +__ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args() == 2 && + detail::is_simd_view_type_v && + detail::is_simd_view_type_v, + simd> +slm_atomic_update(OffsetSimdViewT byte_offset, SrcSimdViewT src0, + SrcSimdViewT src1, simd_mask mask = 1) { + static_assert( + N == OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY(), + "Size of src1 and src0 parameters must correspond to the size of " + "byte_offset parameter."); + return slm_atomic_update(byte_offset.read(), src0, src1, mask); +} + /// simd /// atomic_update(local_accessor lacc, /// simd byte_offset, /// simd src0, /// simd src1, -/// simd_mask<1> pred = 1); // (lacc-au2-1) +/// simd_mask mask = 1); // (lacc-au2-1) template __ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args() == 2 && __ESIMD_DNS::is_rw_local_accessor_v, @@ -8195,6 +8606,175 @@ atomic_update(AccessorT lacc, simd byte_offset, simd src0, return slm_atomic_update(byte_offset, src0, src1, mask); } +/// simd +/// atomic_update(local_accessor lacc, +/// simd byte_offset, +/// SrcSimdViewT src0, +/// simd src1, +/// simd_mask mask = 1); +/// Variation of the API that allows using \c simd_view without specifying +/// \c T and \c N template parameters. +template +__ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args() == 2 && + detail::is_simd_view_type_v && + __ESIMD_DNS::is_rw_local_accessor_v, + simd> +atomic_update(AccessorT lacc, simd byte_offset, SrcSimdViewT src0, + simd src1, simd_mask mask = 1) { + static_assert(N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(), + "Size of src0 parameter must correspond to the size of " + "byte_offset and src1 parameters."); + return atomic_update(lacc, byte_offset, src0.read(), src1, mask); +} + +/// simd +/// atomic_update(local_accessor lacc, +/// simd byte_offset, +/// simd src0, +/// SrcSimdViewT src1, +/// simd_mask mask = 1); +/// Variation of the API that allows using \c simd_view without specifying +/// \c T and \c N template parameters. 
+template +__ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args() == 2 && + detail::is_simd_view_type_v && + __ESIMD_DNS::is_rw_local_accessor_v, + simd> +atomic_update(AccessorT lacc, simd byte_offset, simd src0, + SrcSimdViewT src1, simd_mask mask = 1) { + static_assert(N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(), + "Size of src1 parameter must correspond to the size of " + "byte_offset and src0 parameters."); + return atomic_update(lacc, byte_offset, src0, src1.read(), mask); +} + +/// simd +/// atomic_update(local_accessor lacc, +/// simd byte_offset, +/// SrcSimdViewT src0, +/// SrcSimdViewT src1, +/// simd_mask mask = 1); +/// Variation of the API that allows using \c simd_view without specifying +/// \c T and \c N template parameters. +template +__ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args() == 2 && + detail::is_simd_view_type_v && + __ESIMD_DNS::is_rw_local_accessor_v, + simd> +atomic_update(AccessorT lacc, simd byte_offset, SrcSimdViewT src0, + SrcSimdViewT src1, simd_mask mask = 1) { + static_assert( + N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(), + "Size of src1 and src0 parameters must correspond to the size of " + "byte_offset parameter."); + return atomic_update(lacc, byte_offset, src0.read(), src1.read(), + mask); +} + +/// simd +/// atomic_update(local_accessor lacc, +/// OffsetSimdViewT byte_offset, +/// simd src0, +/// simd src1, +/// simd_mask mask = 1); +/// Variation of the API that allows using \c simd_view without specifying +/// \c T and \c N template parameters. 
+template +__ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args() == 2 && + detail::is_simd_view_type_v && + __ESIMD_DNS::is_rw_local_accessor_v, + simd> +atomic_update(AccessorT lacc, OffsetSimdViewT byte_offset, simd src0, + simd src1, simd_mask mask = 1) { + static_assert( + N == OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY(), + "Size of src1 and src0 parameters must correspond to the size of " + "byte_offset parameter."); + return atomic_update(lacc, byte_offset.read(), src0, src1, mask); +} + +/// simd +/// atomic_update(local_accessor lacc, +/// OffsetSimdViewT byte_offset, +/// SrcSimdViewT src0, +/// simd src1, +/// simd_mask mask = 1); +/// Variation of the API that allows using \c simd_view without specifying +/// \c T and \c N template parameters. +template +__ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args() == 2 && + detail::is_simd_view_type_v && + detail::is_simd_view_type_v && + __ESIMD_DNS::is_rw_local_accessor_v, + simd> +atomic_update(AccessorT lacc, OffsetSimdViewT byte_offset, SrcSimdViewT src0, + simd src1, simd_mask mask = 1) { + static_assert(N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(), + "Size of src0 parameter must correspond to the size of " + "byte_offset and src1 parameters."); + return atomic_update(lacc, byte_offset.read(), src0.read(), src1, + mask); +} + +/// simd +/// atomic_update(local_accessor lacc, +/// OffsetSimdViewT byte_offset, +/// simd src0, +/// SrcSimdViewT src1, +/// simd_mask mask = 1); +/// Variation of the API that allows using \c simd_view without specifying +/// \c T and \c N template parameters. 
+template +__ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args() == 2 && + detail::is_simd_view_type_v && + detail::is_simd_view_type_v && + __ESIMD_DNS::is_rw_local_accessor_v, + simd> +atomic_update(AccessorT lacc, OffsetSimdViewT byte_offset, simd src0, + SrcSimdViewT src1, simd_mask mask = 1) { + static_assert(N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY() && + N == OffsetSimdViewT::getSizeX() * + OffsetSimdViewT::getSizeY(), + "Size of src1 parameter must correspond to the size of " + "byte_offset and src0 parameters."); + return atomic_update(lacc, byte_offset.read(), src0, src1.read(), + mask); +} + +/// simd +/// atomic_update(local_accessor lacc, +/// OffsetSimdViewT byte_offset, +/// SrcSimdViewT src0, +/// SrcSimdViewT src1, +/// simd_mask mask = 1); +/// Variation of the API that allows using \c simd_view without specifying +/// \c T and \c N template parameters. +template +__ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args() == 2 && + detail::is_simd_view_type_v && + detail::is_simd_view_type_v && + __ESIMD_DNS::is_rw_local_accessor_v, + simd> +atomic_update(AccessorT lacc, OffsetSimdViewT byte_offset, SrcSimdViewT src0, + SrcSimdViewT src1, simd_mask mask = 1) { + static_assert( + N == OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY(), + "Size of src1 and src0 parameters must correspond to the size of " + "byte_offset parameter."); + return atomic_update(lacc, byte_offset.read(), src0.read(), + src1.read(), mask); +} + /// @} sycl_esimd_memory_slm namespace detail { diff --git a/sycl/test/esimd/memory_properties_atomic_update.cpp b/sycl/test/esimd/memory_properties_atomic_update.cpp index 594f52303e1ff..a71d64ef984d5 100644 --- a/sycl/test/esimd/memory_properties_atomic_update.cpp +++ b/sycl/test/esimd/memory_properties_atomic_update.cpp @@ -666,7 +666,7 @@ test_atomic_update(AccType &acc, LocalAccTypeInt local_acc, float *ptrf, // Test slm_atomic_update with one operand. 
{ - // CHECK-COUNT-14: call <4 x i32> @llvm.genx.dword.atomic.add.v4i32.v4i1.v4i32(<4 x i1> {{[^)]+}}, i32 {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> undef) + // CHECK-COUNT-26: call <4 x i32> @llvm.genx.dword.atomic.add.v4i32.v4i1.v4i32(<4 x i1> {{[^)]+}}, i32 {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> undef) { auto res_slm_atomic_1 = slm_atomic_update(offsets, add, pred); @@ -695,6 +695,28 @@ test_atomic_update(AccType &acc, LocalAccTypeInt local_acc, float *ptrf, offsets_view.select(), add_view.select(), pred); res_slm_atomic_8 = slm_atomic_update( offsets_view.select(), add_view.select()); + res_slm_atomic_3 = + slm_atomic_update(offsets, add_view, pred); + res_slm_atomic_4 = slm_atomic_update(offsets, add_view); + res_slm_atomic_5 = + slm_atomic_update(offsets_view, add, pred); + res_slm_atomic_6 = slm_atomic_update(offsets_view, add); + res_slm_atomic_7 = + slm_atomic_update(offsets_view, add_view, pred); + res_slm_atomic_8 = + slm_atomic_update(offsets_view, add_view); + res_slm_atomic_3 = slm_atomic_update( + offsets, add_view.select(), pred); + res_slm_atomic_4 = + slm_atomic_update(offsets, add_view.select()); + res_slm_atomic_5 = slm_atomic_update( + offsets_view.select(), add, pred); + res_slm_atomic_6 = + slm_atomic_update(offsets_view.select(), add); + res_slm_atomic_7 = slm_atomic_update( + offsets_view.select(), add_view.select(), pred); + res_slm_atomic_8 = slm_atomic_update( + offsets_view.select(), add_view.select()); } // Expect LSC for short. @@ -733,7 +755,7 @@ test_atomic_update(AccType &acc, LocalAccTypeInt local_acc, float *ptrf, // Test slm_atomic_update with two operands. 
{ - // CHECK-COUNT-30: call <4 x i32> @llvm.genx.dword.atomic.cmpxchg.v4i32.v4i1(<4 x i1> {{[^)]+}}, i32 {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> undef) + // CHECK-COUNT-58: call <4 x i32> @llvm.genx.dword.atomic.cmpxchg.v4i32.v4i1(<4 x i1> {{[^)]+}}, i32 {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> undef) auto res_atomic_1 = slm_atomic_update(offsets, swap, compare, pred); auto res_atomic_2 = @@ -810,6 +832,77 @@ test_atomic_update(AccType &acc, LocalAccTypeInt local_acc, float *ptrf, offsets_view.select(), swap_view.select(), compare_view.select()); + res_atomic_3 = slm_atomic_update(offsets, swap, + compare_view, pred); + res_atomic_4 = + slm_atomic_update(offsets, swap, compare_view); + + res_atomic_5 = slm_atomic_update(offsets, swap_view, + compare, pred); + res_atomic_6 = + slm_atomic_update(offsets, swap_view, compare); + + res_atomic_7 = slm_atomic_update(offsets, swap_view, + compare_view, pred); + res_atomic_8 = + slm_atomic_update(offsets, swap_view, compare_view); + + res_atomic_9 = slm_atomic_update(offsets_view, swap, + compare, pred); + res_atomic_10 = + slm_atomic_update(offsets_view, swap, compare); + + res_atomic_11 = slm_atomic_update(offsets_view, swap, + compare_view, pred); + res_atomic_12 = + slm_atomic_update(offsets_view, swap, compare_view); + + res_atomic_13 = slm_atomic_update( + offsets_view, swap_view, compare, pred); + res_atomic_14 = + slm_atomic_update(offsets_view, swap_view, compare); + + res_atomic_15 = slm_atomic_update( + offsets_view, swap_view, compare_view, pred); + res_atomic_16 = slm_atomic_update( + offsets_view, swap_view, compare_view); + res_atomic_3 = slm_atomic_update( + offsets, swap, compare_view.select(), pred); + res_atomic_4 = slm_atomic_update( + offsets, swap, compare_view.select()); + + res_atomic_5 = slm_atomic_update( + offsets, swap_view.select(), compare, pred); + res_atomic_6 = slm_atomic_update( + offsets, 
swap_view.select(), compare); + + res_atomic_7 = slm_atomic_update( + offsets, swap_view.select(), compare_view.select(), pred); + res_atomic_8 = slm_atomic_update( + offsets, swap_view.select(), compare_view.select()); + + res_atomic_9 = slm_atomic_update( + offsets_view.select(), swap, compare, pred); + res_atomic_10 = slm_atomic_update( + offsets_view.select(), swap, compare); + + res_atomic_11 = slm_atomic_update( + offsets_view.select(), swap, compare_view.select(), pred); + res_atomic_12 = slm_atomic_update( + offsets_view.select(), swap, compare_view.select()); + + res_atomic_13 = slm_atomic_update( + offsets_view.select(), swap_view.select(), compare, pred); + res_atomic_14 = slm_atomic_update( + offsets_view.select(), swap_view.select(), compare); + + res_atomic_15 = slm_atomic_update( + offsets_view.select(), swap_view.select(), + compare_view.select(), pred); + res_atomic_16 = slm_atomic_update( + offsets_view.select(), swap_view.select(), + compare_view.select()); + // Expect LSC for short. { constexpr int VL = 16; @@ -878,7 +971,7 @@ test_atomic_update(AccType &acc, LocalAccTypeInt local_acc, float *ptrf, } // One operand atomic. 
{ - // CHECK-COUNT-14: call <4 x i32> @llvm.genx.dword.atomic.add.v4i32.v4i1.v4i32(<4 x i1> {{[^)]+}}, i32 {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> undef) + // CHECK-COUNT-26: call <4 x i32> @llvm.genx.dword.atomic.add.v4i32.v4i1.v4i32(<4 x i1> {{[^)]+}}, i32 {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> undef) auto res_slm_atomic_1 = atomic_update(local_acc, offsets, add, pred); auto res_slm_atomic_2 = @@ -909,6 +1002,32 @@ test_atomic_update(AccType &acc, LocalAccTypeInt local_acc, float *ptrf, res_slm_atomic_8 = atomic_update( local_acc, offsets_view.select(), add_view.select()); + res_slm_atomic_3 = + atomic_update(local_acc, offsets, add_view, pred); + res_slm_atomic_4 = + atomic_update(local_acc, offsets, add_view); + res_slm_atomic_5 = + atomic_update(local_acc, offsets_view, add, pred); + res_slm_atomic_6 = + atomic_update(local_acc, offsets_view, add); + res_slm_atomic_7 = + atomic_update(local_acc, offsets_view, add_view, pred); + res_slm_atomic_8 = + atomic_update(local_acc, offsets_view, add_view); + res_slm_atomic_3 = atomic_update( + local_acc, offsets, add_view.select(), pred); + res_slm_atomic_4 = atomic_update(local_acc, offsets, + add_view.select()); + res_slm_atomic_5 = atomic_update( + local_acc, offsets_view.select(), add, pred); + res_slm_atomic_6 = atomic_update( + local_acc, offsets_view.select(), add); + res_slm_atomic_7 = + atomic_update(local_acc, offsets_view.select(), + add_view.select(), pred); + res_slm_atomic_8 = atomic_update( + local_acc, offsets_view.select(), add_view.select()); + // Expect LSC for short. { using LocalAccType = sycl::local_accessor; @@ -921,7 +1040,7 @@ test_atomic_update(AccType &acc, LocalAccTypeInt local_acc, float *ptrf, } // Two operand atomic. 
{ - // CHECK-COUNT-30: call <4 x i32> @llvm.genx.dword.atomic.cmpxchg.v4i32.v4i1(<4 x i1> {{[^)]+}}, i32 {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> undef) + // CHECK-COUNT-58: call <4 x i32> @llvm.genx.dword.atomic.cmpxchg.v4i32.v4i1(<4 x i1> {{[^)]+}}, i32 {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> undef) auto res_slm_atomic_1 = atomic_update( local_acc, offsets, swap, compare, pred); auto res_slm_atomic_2 = @@ -991,6 +1110,71 @@ test_atomic_update(AccType &acc, LocalAccTypeInt local_acc, float *ptrf, local_acc, offsets_view.select(), swap_view.select(), compare_view.select()); + res_slm_atomic_3 = atomic_update( + local_acc, offsets, swap, compare_view, pred); + res_slm_atomic_4 = atomic_update(local_acc, offsets, + swap, compare_view); + res_slm_atomic_5 = atomic_update( + local_acc, offsets, swap_view, compare, pred); + res_slm_atomic_6 = atomic_update(local_acc, offsets, + swap_view, compare); + res_slm_atomic_7 = atomic_update( + local_acc, offsets, swap_view, compare_view, pred); + res_slm_atomic_8 = atomic_update( + local_acc, offsets, swap_view, compare_view); + res_slm_atomic_9 = atomic_update( + local_acc, offsets_view, swap, compare, pred); + res_slm_atomic_10 = atomic_update( + local_acc, offsets_view, swap, compare); + res_slm_atomic_11 = atomic_update( + local_acc, offsets_view, swap, compare_view, pred); + res_slm_atomic_12 = atomic_update( + local_acc, offsets_view, swap, compare_view); + res_slm_atomic_13 = atomic_update( + local_acc, offsets_view, swap_view, compare, pred); + res_slm_atomic_14 = atomic_update( + local_acc, offsets_view, swap_view, compare); + res_slm_atomic_15 = atomic_update( + local_acc, offsets_view, swap_view, compare_view, pred); + res_slm_atomic_16 = atomic_update( + local_acc, offsets_view, swap_view, compare_view); + res_slm_atomic_3 = atomic_update( + local_acc, offsets, swap, compare_view.select(), pred); + res_slm_atomic_4 = 
atomic_update( + local_acc, offsets, swap, compare_view.select()); + res_slm_atomic_5 = atomic_update( + local_acc, offsets, swap_view.select(), compare, pred); + res_slm_atomic_6 = atomic_update( + local_acc, offsets, swap_view.select(), compare); + res_slm_atomic_7 = atomic_update( + local_acc, offsets, swap_view.select(), + compare_view.select(), pred); + res_slm_atomic_8 = atomic_update( + local_acc, offsets, swap_view.select(), + compare_view.select()); + res_slm_atomic_9 = atomic_update( + local_acc, offsets_view.select(), swap, compare, pred); + res_slm_atomic_10 = atomic_update( + local_acc, offsets_view.select(), swap, compare); + res_slm_atomic_11 = atomic_update( + local_acc, offsets_view.select(), swap, + compare_view.select(), pred); + res_slm_atomic_12 = atomic_update( + local_acc, offsets_view.select(), swap, + compare_view.select()); + res_slm_atomic_13 = atomic_update( + local_acc, offsets_view.select(), swap_view.select(), + compare, pred); + res_slm_atomic_14 = atomic_update( + local_acc, offsets_view.select(), swap_view.select(), + compare); + res_slm_atomic_15 = atomic_update( + local_acc, offsets_view.select(), swap_view.select(), + compare_view.select(), pred); + res_slm_atomic_16 = atomic_update( + local_acc, offsets_view.select(), swap_view.select(), + compare_view.select()); + // Expect LSC for short. 
{ using LocalAccType = sycl::local_accessor; From c7d627fb266f3a096e2acce8bacc0f1ddb7ac3dc Mon Sep 17 00:00:00 2001 From: fineg74 <61437305+fineg74@users.noreply.github.com> Date: Wed, 5 Jun 2024 13:57:55 -0700 Subject: [PATCH 15/55] [ESIMD] Allow full autodeduction for prefetch APIs accepting simd_view (#14000) Co-authored-by: Nick Sarnie --- sycl/include/sycl/ext/intel/esimd/memory.hpp | 136 +++++++++++++----- .../esimd/memory_properties_prefetch_2d.cpp | 26 +++- 2 files changed, 120 insertions(+), 42 deletions(-) diff --git a/sycl/include/sycl/ext/intel/esimd/memory.hpp b/sycl/include/sycl/ext/intel/esimd/memory.hpp index bf3d113e2df5f..904dcfd8d7a45 100644 --- a/sycl/include/sycl/ext/intel/esimd/memory.hpp +++ b/sycl/include/sycl/ext/intel/esimd/memory.hpp @@ -2067,7 +2067,7 @@ block_load(const T *ptr, size_t byte_offset, simd_mask<1> pred, /// simd block_load(const T* ptr, size_t byte_offset, /// simd_mask<1> pred, PassThruSimdViewT pass_thru, /// props={}); -/// Variation of the API that allows to use \c simd_view without specifying +/// Variation of the API that allows using \c simd_view without specifying /// \c T and \c N template parameters. /// This function loads a contiguous memory block from address referenced /// by USM pointer \p ptr and the given \p byte_offset. @@ -2393,7 +2393,7 @@ block_load(AccessorT acc, detail::DeviceAccessorOffsetT byte_offset, /// simd /// block_load(AccessorT acc, OffsetT byte_offset, simd_mask<1> pred, /// PassThruSimdViewT pass_thru, props = {}); -/// Variation of the API that allows to use \c simd_view without specifying +/// Variation of the API that allows using \c simd_view without specifying /// \c T and \c N template parameters. /// This function loads a contiguous memory block referenced /// by accessor \p acc and the given \p byte_offset. 
@@ -2533,7 +2533,7 @@ block_load(AccessorT acc, simd_mask<1> pred, simd pass_thru, /// block_load(AccessorT acc, simd_mask<1> pred, /// PassThruSimdViewT pass_thru, props = {}); /// -/// Variation of the API that allows to use \c simd_view without specifying +/// Variation of the API that allows using \c simd_view without specifying /// \c T and \c N template parameters. /// This function loads a contiguous memory block referenced /// by accessor \p acc and implied offset=0. @@ -2833,7 +2833,7 @@ block_store(T *ptr, size_t byte_offset, simd vals, simd_mask<1> pred, } /// void block_store(T* ptr, ValuesSimdViewT vals, props={}); -/// Variation of the API that allows to use \c simd_view without specifying +/// Variation of the API that allows using \c simd_view without specifying /// \c T and \c N template parameters. /// This function stores a contiguous memory block to USM pointer \p ptr /// with data specified by \p vals. @@ -2879,7 +2879,7 @@ block_store(T *ptr, ValuesSimdViewT vals, PropertyListT props = {}) { /// void block_store(T* ptr, size_t byte_offset, /// ValuesSimdViewT vals, props={}); -/// Variation of the API that allows to use \c simd_view without specifying +/// Variation of the API that allows using \c simd_view without specifying /// \c T and \c N template parameters. /// This function stores a contiguous memory block to USM pointer \p ptr and /// byte-offset \p byte_offset with data specified by \p vals. @@ -2927,7 +2927,7 @@ block_store(T *ptr, size_t byte_offset, ValuesSimdViewT vals, /// void block_store(T* ptr, ValuesSimdViewT vals, /// simd_mask<1> pred, props={}); -/// Variation of the API that allows to use \c simd_view without specifying +/// Variation of the API that allows using \c simd_view without specifying /// \c T and \c N template parameters. /// This function stores a contiguous memory block to USM pointer \p ptr /// with data specified by \p vals. 
If the predicate \p pred is set to 0, @@ -2972,7 +2972,7 @@ block_store(T *ptr, ValuesSimdViewT vals, simd_mask<1> pred, /// void block_store(T* ptr, size_t byte_offset, /// ValuesSimdViewT vals, simd_mask<1> pred, props={}); -/// Variation of the API that allows to use \c simd_view without specifying +/// Variation of the API that allows using \c simd_view without specifying /// \c T and \c N template parameters. /// This function stores a contiguous memory block to USM pointer \p ptr /// and byte-offset \p byte_offset with data specified by \p vals. @@ -3255,7 +3255,7 @@ block_store(AccessorT acc, simd vals, simd_mask<1> pred, /// void block_store(AccessorT acc, OffsetT byte_offset, /// ValuesSimdViewT vals, props = {}); -/// Variation of the API that allows to use \c simd_view without specifying +/// Variation of the API that allows using \c simd_view without specifying /// \c T and \c N template parameters. /// This function stores a contiguous memory block to /// accessor \p acc and \p byte_offset with data specified by \p vals. @@ -3315,7 +3315,7 @@ block_store(AccessorT acc, detail::DeviceAccessorOffsetT byte_offset, } /// void block_store(AccessorT acc, ValuesSimdViewT vals, props = {}); -/// Variation of the API that allows to use \c simd_view without specifying +/// Variation of the API that allows using \c simd_view without specifying /// \c T and \c N template parameters. /// This function stores a contiguous memory block to /// accessor \p acc with data specified by \p vals and implied offset=0. @@ -3362,7 +3362,7 @@ block_store(AccessorT acc, ValuesSimdViewT vals, PropertyListT props = {}) { /// void block_store(AccessorT acc, OffsetT byte_offset, /// ValuesSimdViewT vals, simd_mask<1> pred, props = {}); -/// Variation of the API that allows to use \c simd_view without specifying +/// Variation of the API that allows using \c simd_view without specifying /// \c T and \c N template parameters. 
/// This function stores a contiguous memory block to /// accessor \p acc and \p byte_offset with data specified by \p vals. @@ -3411,7 +3411,7 @@ block_store(AccessorT acc, detail::DeviceAccessorOffsetT byte_offset, /// void block_store(AccessorT acc, ValuesSimdViewT vals, /// simd_mask<1> pred, props = {}); -/// Variation of the API that allows to use \c simd_view without specifying +/// Variation of the API that allows using \c simd_view without specifying /// \c T and \c N template parameters. /// This function stores a contiguous memory block to /// accessor \p acc with data specified by \p vals and implied offset=0. @@ -4907,7 +4907,7 @@ scatter(AccessorTy acc, OffsetSimdViewT byte_offsets, simd vals, /// simd_mask mask, /// PropertyListT props = {}); /// -/// Variation of the API that allows to use \c simd_view without specifying +/// Variation of the API that allows using \c simd_view without specifying /// \c T and \c N template parameters. /// Stores ("scatters") elements of the type 'T' to memory locations addressed /// by the accessor \p acc and byte offsets \p byte_offsets. Access to any @@ -4948,7 +4948,7 @@ scatter(AccessorTy acc, OffsetSimdViewT byte_offsets, simd vals, /// void scatter(AccessorTy acc, OffsetSimdViewT byte_offsets, simd vals, /// PropertyListT props = {}); /// -/// Variation of the API that allows to use \c simd_view without specifying +/// Variation of the API that allows using \c simd_view without specifying /// \c T and \c N template parameters. /// Stores ("scatters") elements of the type 'T' to memory locations addressed /// by the accessor \p acc and byte offsets \p byte_offsets. 
@@ -4989,7 +4989,7 @@ scatter(AccessorTy acc, OffsetSimdViewT byte_offsets, simd vals, /// ValuesSimdViewT vals, simd_mask mask, /// PropertyListT props = {}); /// -/// Variation of the API that allows to use \c simd_view without specifying +/// Variation of the API that allows using \c simd_view without specifying /// \c T and \c N template parameters. /// Stores ("scatters") elements of the type 'T' to memory locations addressed /// by the accessor \p acc and byte offsets \p byte_offsets. Access to any @@ -5037,7 +5037,7 @@ scatter(AccessorTy acc, OffsetSimdViewT byte_offsets, ValuesSimdViewT vals, /// void scatter(AccessorTy acc, OffsetSimdViewT byte_offsets, /// ValuesSimdViewT vals, PropertyListT props = {}); /// -/// Variation of the API that allows to use \c simd_view without specifying +/// Variation of the API that allows using \c simd_view without specifying /// \c T and \c N template parameters. /// Stores ("scatters") elements of the type 'T' to memory locations addressed /// by the accessor \p acc and byte offsets \p byte_offsets. @@ -5082,7 +5082,7 @@ scatter(AccessorTy acc, OffsetSimdViewT byte_offsets, ValuesSimdViewT vals, /// ValuesSimdViewT vals, simd_mask mask, /// PropertyListT props = {}); /// -/// Variation of the API that allows to use \c simd_view without specifying +/// Variation of the API that allows using \c simd_view without specifying /// \c T and \c N template parameters. /// Stores ("scatters") elements of the type 'T' to memory locations addressed /// by the accessor \p acc and byte offsets \p byte_offsets. Access to any @@ -5125,7 +5125,7 @@ scatter(AccessorTy acc, simd byte_offsets, /// void scatter(AccessorTy acc, simd byte_offsets, /// ValuesSimdViewT vals, PropertyListT props = {}); /// -/// Variation of the API that allows to use \c simd_view without specifying +/// Variation of the API that allows using \c simd_view without specifying /// \c T and \c N template parameters. 
/// Stores ("scatters") elements of the type 'T' to memory locations addressed /// by the accessor \p acc and byte offsets \p byte_offsets. @@ -6424,7 +6424,7 @@ slm_scatter(OffsetSimdViewT byte_offsets, simd vals, /// void slm_scatter(OffsetSimdViewT byte_offsets, simd vals, /// simd_mask mask, PropertyListT props = {}); /// -/// Variation of the API that allows to use \c simd_view without specifying +/// Variation of the API that allows using \c simd_view without specifying /// \c T and \c N template parameters. /// Stores ("scatters") elements of the type 'T' to Shared Local Memory /// locations addressed by byte offsets \p byte_offsets. @@ -6456,7 +6456,7 @@ slm_scatter(OffsetSimdViewT byte_offsets, simd vals, /// void slm_scatter(OffsetSimdViewT byte_offsets, simd vals, /// PropertyListT props = {}); /// -/// Variation of the API that allows to use \c simd_view without specifying +/// Variation of the API that allows using \c simd_view without specifying /// \c T and \c N template parameters. /// Stores ("scatters") elements of the type 'T' to Shared Local Memory /// locations addressed by byte offsets \p byte_offsets. @@ -6491,7 +6491,7 @@ slm_scatter(OffsetSimdViewT byte_offsets, simd vals, /// ValuesSimdViewT vals, simd_mask mask, /// PropertyListT props = {}); /// -/// Variation of the API that allows to use \c simd_view without specifying +/// Variation of the API that allows using \c simd_view without specifying /// \c T and \c N template parameters. /// Stores ("scatters") elements of the type 'T' to Shared Local Memory /// locations addressed by byte offsets \p byte_offsets. 
@@ -6528,7 +6528,7 @@ slm_scatter(OffsetSimdViewT byte_offsets, ValuesSimdViewT vals, /// void slm_scatter(OffsetSimdViewT byte_offsets, /// ValuesSimdViewT vals, PropertyListT props = {}); /// -/// Variation of the API that allows to use \c simd_view without specifying +/// Variation of the API that allows using \c simd_view without specifying /// \c T and \c N template parameters. /// Stores ("scatters") elements of the type 'T' to Shared Local Memory /// locations addressed by byte offsets \p byte_offsets. @@ -6566,7 +6566,7 @@ slm_scatter(OffsetSimdViewT byte_offsets, ValuesSimdViewT vals, /// ValuesSimdViewT vals, simd_mask mask, /// PropertyListT props = {}); /// -/// Variation of the API that allows to use \c simd_view without specifying +/// Variation of the API that allows using \c simd_view without specifying /// \c T and \c N template parameters. /// Stores ("scatters") elements of the type 'T' to Shared Local Memory /// locations addressed by byte offsets \p byte_offsets. @@ -6598,7 +6598,7 @@ slm_scatter(simd byte_offsets, ValuesSimdViewT vals, /// void slm_scatter(simd byte_offsets, /// ValuesSimdViewT vals, PropertyListT props = {}); /// -/// Variation of the API that allows to use \c simd_view without specifying +/// Variation of the API that allows using \c simd_view without specifying /// \c T and \c N template parameters. /// Stores ("scatters") elements of the type 'T' to Shared Local Memory /// locations addressed by byte offsets \p byte_offsets. @@ -6933,7 +6933,7 @@ slm_block_load(uint32_t offset, simd_mask<1> pred, simd pass_thru, /// simd slm_block_load(uint32_t byte_offset, /// simd_mask<1> pred, /// PassThruSimdViewT pass_thru, props={}); -/// Variation of the API that allows to use \c simd_view without specifying +/// Variation of the API that allows using \c simd_view without specifying /// \c T and \c N template parameters. /// Loads a contiguous memory block from SLM (Shared Local Memory) at the /// given \p byte_offset. 
@@ -7167,7 +7167,7 @@ block_load(AccessorT lacc, uint32_t byte_offset, simd_mask<1> pred, /// simd block_load(local_accessor lacc, uint32_t byte_offset, /// simd_mask<1> pred, PassThruSimdViewT pass_thru, /// props={}); -/// Variation of the API that allows to use \c simd_view without specifying +/// Variation of the API that allows using \c simd_view without specifying /// \c T and \c N template parameters. /// Loads a contiguous memory block from SLM (Shared Local Memory) associated /// the local accessor \p lacc at the given \p byte_offset. @@ -7253,7 +7253,7 @@ block_load(AccessorT lacc, simd_mask<1> pred, simd pass_thru, /// simd block_load(local_accessor lacc, /// simd_mask<1> pred, PassThruSimdViewT pass_thru, /// props={}); -/// Variation of the API that allows to use \c simd_view without specifying +/// Variation of the API that allows using \c simd_view without specifying /// \c T and \c N template parameters. /// Loads a contiguous memory block from SLM (Shared Local Memory) associated /// with the local accessor \p lacc at zero offset. @@ -7463,7 +7463,7 @@ slm_block_store(uint32_t byte_offset, simd vals, /// void slm_block_store(uint32_t byte_offset, ValuesSimdViewT vals, /// simd_mask<1> pred, props={}); -/// Variation of the API that allows to use \c simd_view without specifying +/// Variation of the API that allows using \c simd_view without specifying /// \c T and \c N template parameters. /// Stores the vector \p vals to a contiguous memory block in SLM (Shared Local /// Memory) at the given \p byte_offset. The parameter \p pred is the @@ -7503,7 +7503,7 @@ slm_block_store(uint32_t byte_offset, ValuesSimdViewT vals, simd_mask<1> pred, /// void slm_block_store(uint32_t byte_offset, ValuesSimdViewT vals, /// props = {}); -/// Variation of the API that allows to use \c simd_view without specifying +/// Variation of the API that allows using \c simd_view without specifying /// \c T and \c N template parameters. 
/// Stores the vector \p vals to a contiguous memory block in SLM /// (Shared Local Memory) at the given \p byte_offset. The parameter 'props' @@ -7665,7 +7665,7 @@ block_store(AccessorT lacc, simd vals, simd_mask<1> pred, /// void block_store(local_accessor lacc, uint32_t byte_offset, /// ValuesSimdViewT vals, props={}); -/// Variation of the API that allows to use \c simd_view without specifying +/// Variation of the API that allows using \c simd_view without specifying /// \c T and \c N template parameters. /// Stores the vector \p vals to a contiguous memory block in SLM (Shared Local /// Memory) associated with the local accessor \p lacc at the given \p @@ -7699,7 +7699,7 @@ block_store(AccessorT lacc, uint32_t byte_offset, ValuesSimdViewT vals, /// void block_store(local_accessor lacc, ValuesSimdViewT vals, /// props={}); -/// Variation of the API that allows to use \c simd_view without specifying +/// Variation of the API that allows using \c simd_view without specifying /// \c T and \c N template parameters. /// Stores the vector \p vals to a contiguous memory block in SLM /// (Shared Local Memory) associated with the local accessor \p lacc. The @@ -7731,7 +7731,7 @@ block_store(AccessorT lacc, ValuesSimdViewT vals, PropertyListT props = {}) { /// void block_store(local_accessor lacc, uint32_t byte_offset, /// ValuesSimdViewT vals, simd_mask<1> pred, props={}); -/// Variation of the API that allows to use \c simd_view without specifying +/// Variation of the API that allows using \c simd_view without specifying /// \c T and \c N template parameters. 
/// Stores the vector \p vals to a contiguous memory block in SLM (Shared Local /// Memory) associated with the local accessor \p lacc at the given \p @@ -7775,7 +7775,7 @@ block_store(AccessorT lacc, uint32_t byte_offset, ValuesSimdViewT vals, /// void block_store(local_accessor lacc, ValuesSimdViewT vals, /// simd_mask<1> pred, props={}); -/// Variation of the API that allows to use \c simd_view without specifying +/// Variation of the API that allows using \c simd_view without specifying /// \c T and \c N template parameters. /// Stores the vector \p vals to a contiguous memory block in SLM (Shared Local /// Memory) associated with the local accessor \p lacc. The parameter \p pred is @@ -11379,7 +11379,7 @@ scatter(AccessorT acc, OffsetSimdViewT byte_offsets, simd vals, /// simd_mask mask, /// PropertyListT props = {}); /// -/// Variation of the API that allows to use \c simd_view without specifying +/// Variation of the API that allows using \c simd_view without specifying /// \c T and \c N template parameters. /// Stores ("scatters") elements of the type 'T' to memory locations addressed /// by the local accessor \p acc and byte offsets \p byte_offsets. Access to any @@ -11420,7 +11420,7 @@ scatter(AccessorTy acc, OffsetSimdViewT byte_offsets, simd vals, /// void scatter(AccessorTy acc, OffsetSimdViewT byte_offsets, simd vals, /// PropertyListT props = {}); /// -/// Variation of the API that allows to use \c simd_view without specifying +/// Variation of the API that allows using \c simd_view without specifying /// \c T and \c N template parameters. /// Stores ("scatters") elements of the type 'T' to memory locations addressed /// by the local accessor \p acc and byte offsets \p byte_offsets. 
@@ -11461,7 +11461,7 @@ scatter(AccessorTy acc, OffsetSimdViewT byte_offsets, simd vals, /// ValuesSimdViewT vals, simd_mask mask, /// PropertyListT props = {}); /// -/// Variation of the API that allows to use \c simd_view without specifying +/// Variation of the API that allows using \c simd_view without specifying /// \c T and \c N template parameters. /// Stores ("scatters") elements of the type 'T' to memory locations addressed /// by the local accessor \p acc and byte offsets \p byte_offsets. Access to any @@ -11509,7 +11509,7 @@ scatter(AccessorTy acc, OffsetSimdViewT byte_offsets, ValuesSimdViewT vals, /// void scatter(AccessorTy acc, OffsetSimdViewT byte_offsets, /// ValuesSimdViewT vals, PropertyListT props = {}); /// -/// Variation of the API that allows to use \c simd_view without specifying +/// Variation of the API that allows using \c simd_view without specifying /// \c T and \c N template parameters. /// Stores ("scatters") elements of the type 'T' to memory locations addressed /// by the local accessor \p acc and byte offsets \p byte_offsets. @@ -11554,7 +11554,7 @@ scatter(AccessorTy acc, OffsetSimdViewT byte_offsets, ValuesSimdViewT vals, /// ValuesSimdViewT vals, simd_mask mask, /// PropertyListT props = {}); /// -/// Variation of the API that allows to use \c simd_view without specifying +/// Variation of the API that allows using \c simd_view without specifying /// \c T and \c N template parameters. /// Stores ("scatters") elements of the type 'T' to memory locations addressed /// by the local accessor \p acc and byte offsets \p byte_offsets. Access to any @@ -11597,7 +11597,7 @@ scatter(AccessorTy acc, simd byte_offsets, /// void scatter(AccessorTy acc, simd byte_offsets, /// ValuesSimdViewT vals, PropertyListT props = {}); /// -/// Variation of the API that allows to use \c simd_view without specifying +/// Variation of the API that allows using \c simd_view without specifying /// \c T and \c N template parameters. 
/// Stores ("scatters") elements of the type 'T' to memory locations addressed /// by the local accessor \p acc and byte offsets \p byte_offsets. @@ -11907,6 +11907,66 @@ prefetch(const T *p, OffsetSimdViewT byte_offsets, PropertyListT props = {}) { prefetch(p, byte_offsets.read(), props); } +/// template +/// void prefetch(const T *p, OffsetSimdViewT byte_offsets, +/// simd_mask mask, PropertyListT props = {}); +/// Supported platforms: DG2, PVC only. +/// Variation of the API that allows using \c simd_view without specifying +/// \c T and \c N template parameters. +/// Prefetches elements of the type 'T' from memory locations +/// addressed by the base pointer \p p and byte offsets \p byte_offsets to the +/// cache. Access to any element's memory location can be disabled via the input +/// vector of predicates \p mask. If mask[i] is unset, then the load from (p + +/// byte_offsets[i]) is skipped. +/// @tparam VS Vector size. It can also be read as the number of reads per +/// each address. The parameter 'N' must be divisible by 'VS'. +/// @param p The base address. +/// @param byte_offsets the vector of 32-bit or 64-bit offsets in bytes. +/// For each i, ((byte*)p + byte_offsets[i]) must be element size aligned. +/// @param mask The access mask. +/// @param props The optional compile-time properties. Only cache hint +/// properties are used. +template < + int VS = 1, typename OffsetSimdViewT, typename T, + int N = OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY() * VS, + typename PropertyListT = ext::oneapi::experimental::empty_properties_t> +__ESIMD_API std::enable_if_t< + detail::is_simd_view_type_v && + ext::oneapi::experimental::is_property_list_v> +prefetch(const T *p, OffsetSimdViewT byte_offsets, simd_mask mask, + PropertyListT props = {}) { + prefetch(p, byte_offsets.read(), mask, props); +} + +/// template +/// void prefetch(const T *p, OffsetSimdViewT byte_offsets, +/// PropertyListT props = {}); +/// Supported platforms: DG2, PVC only. 
+/// Variation of the API that allows using \c simd_view without specifying +/// \c T and \c N template parameters. +/// Prefetches elements of the type 'T' from memory locations +/// addressed by the base pointer \p p and byte offsets \p byte_offsets to the +/// cache. +/// @tparam VS Vector size. It can also be read as the number of reads per +/// each address. The parameter 'N' must be divisible by 'VS'. +/// @param p The base address. +/// @param byte_offsets the vector of 32-bit or 64-bit offsets in bytes. +/// For each i, ((byte*)p + byte_offsets[i]) must be element size aligned. +/// @param props The optional compile-time properties. Only cache hint +/// properties are used. +template < + int VS = 1, typename OffsetSimdViewT, typename T, + int N = OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY() * VS, + typename PropertyListT = ext::oneapi::experimental::empty_properties_t> +__ESIMD_API std::enable_if_t< + detail::is_simd_view_type_v && + ext::oneapi::experimental::is_property_list_v> +prefetch(const T *p, OffsetSimdViewT byte_offsets, PropertyListT props = {}) { + prefetch(p, byte_offsets.read(), props); +} + /// template /// void prefetch(const T *p, OffsetT byte_offset, simd_mask<1> mask, diff --git a/sycl/test/esimd/memory_properties_prefetch_2d.cpp b/sycl/test/esimd/memory_properties_prefetch_2d.cpp index 1d98e9be4a582..c9c5f33854057 100644 --- a/sycl/test/esimd/memory_properties_prefetch_2d.cpp +++ b/sycl/test/esimd/memory_properties_prefetch_2d.cpp @@ -82,26 +82,34 @@ SYCL_ESIMD_FUNCTION SYCL_EXTERNAL void test_prefetch(AccType &acc, float *ptrf, // 1) prefetch(usm, offsets): offsets is simd or simd_view - // CHECK-COUNT-6: call void @llvm.genx.lsc.prefetch.stateless.v32i1.v32i64(<32 x i1> {{[^)]+}}, i8 0, i8 2, i8 1, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <32 x i64> {{[^)]+}}, i32 0) + // CHECK-COUNT-10: call void @llvm.genx.lsc.prefetch.stateless.v32i1.v32i64(<32 x i1> {{[^)]+}}, i8 0, i8 2, i8 1, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <32 x 
i64> {{[^)]+}}, i32 0) prefetch(ptrf, ioffset_n32, props_cache_load); prefetch(ptrf, ioffset_n32_view, props_cache_load); prefetch(ptrf, ioffset_n32_view.select<32, 1>(), props_cache_load); + prefetch(ptrf, ioffset_n32_view, props_cache_load); + prefetch(ptrf, ioffset_n32_view.select<32, 1>(), props_cache_load); prefetch(ptrf, loffset_n32, props_cache_load); prefetch(ptrf, loffset_n32_view, props_cache_load); prefetch(ptrf, loffset_n32_view.select<32, 1>(), props_cache_load); + prefetch(ptrf, loffset_n32_view, props_cache_load); + prefetch(ptrf, loffset_n32_view.select<32, 1>(), props_cache_load); // 2) prefetch(usm, offsets, mask): offsets is simd or simd_view - // CHECK-COUNT-6: call void @llvm.genx.lsc.prefetch.stateless.v32i1.v32i64(<32 x i1> {{[^)]+}}, i8 0, i8 2, i8 1, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <32 x i64> {{[^)]+}}, i32 0) + // CHECK-COUNT-10: call void @llvm.genx.lsc.prefetch.stateless.v32i1.v32i64(<32 x i1> {{[^)]+}}, i8 0, i8 2, i8 1, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <32 x i64> {{[^)]+}}, i32 0) prefetch(ptrf, ioffset_n32, mask_n32, props_cache_load); prefetch(ptrf, ioffset_n32_view, mask_n32, props_cache_load); prefetch(ptrf, ioffset_n32_view.select<32, 1>(), mask_n32, props_cache_load); + prefetch(ptrf, ioffset_n32_view, mask_n32, props_cache_load); + prefetch(ptrf, ioffset_n32_view.select<32, 1>(), mask_n32, props_cache_load); prefetch(ptrf, loffset_n32, mask_n32, props_cache_load); prefetch(ptrf, loffset_n32_view, mask_n32, props_cache_load); prefetch(ptrf, loffset_n32_view.select<32, 1>(), mask_n32, props_cache_load); + prefetch(ptrf, loffset_n32_view, mask_n32, props_cache_load); + prefetch(ptrf, loffset_n32_view.select<32, 1>(), mask_n32, props_cache_load); // 3) prefetch(usm, offset): offset is scalar // CHECK-COUNT-16: call void @llvm.genx.lsc.prefetch.stateless.v1i1.v1i64(<1 x i1> {{[^)]+}}, i8 0, i8 2, i8 1, i16 1, i32 0, i8 3, i8 1, i8 2, i8 0, <1 x i64> {{[^)]+}}, i32 0) @@ -128,27 +136,37 @@ SYCL_ESIMD_FUNCTION SYCL_EXTERNAL 
void test_prefetch(AccType &acc, float *ptrf, props_cache_load_align); // 4) prefetch(usm, ...): same as (1), (2) above, but with VS > 1. - // CHECK-COUNT-6: call void @llvm.genx.lsc.prefetch.stateless.v16i1.v16i64(<16 x i1> {{[^)]+}}, i8 0, i8 2, i8 1, i16 1, i32 0, i8 3, i8 2, i8 1, i8 0, <16 x i64> {{[^)]+}}, i32 0) + // CHECK-COUNT-10: call void @llvm.genx.lsc.prefetch.stateless.v16i1.v16i64(<16 x i1> {{[^)]+}}, i8 0, i8 2, i8 1, i16 1, i32 0, i8 3, i8 2, i8 1, i8 0, <16 x i64> {{[^)]+}}, i32 0) prefetch(ptrf, ioffset_n16, props_cache_load); prefetch(ptrf, ioffset_n16_view, props_cache_load); prefetch(ptrf, ioffset_n16_view.select<16, 1>(), props_cache_load); + prefetch<2>(ptrf, ioffset_n16_view, props_cache_load); + prefetch<2>(ptrf, ioffset_n16_view.select<16, 1>(), props_cache_load); prefetch(ptrf, loffset_n16, props_cache_load); prefetch(ptrf, loffset_n16_view, props_cache_load); prefetch(ptrf, loffset_n16_view.select<16, 1>(), props_cache_load); + prefetch<2>(ptrf, loffset_n16_view, props_cache_load); + prefetch<2>(ptrf, loffset_n16_view.select<16, 1>(), props_cache_load); - // CHECK-COUNT-6: call void @llvm.genx.lsc.prefetch.stateless.v16i1.v16i64(<16 x i1> {{[^)]+}}, i8 0, i8 2, i8 1, i16 1, i32 0, i8 3, i8 2, i8 1, i8 0, <16 x i64> {{[^)]+}}, i32 0) + // CHECK-COUNT-10: call void @llvm.genx.lsc.prefetch.stateless.v16i1.v16i64(<16 x i1> {{[^)]+}}, i8 0, i8 2, i8 1, i16 1, i32 0, i8 3, i8 2, i8 1, i8 0, <16 x i64> {{[^)]+}}, i32 0) prefetch(ptrf, ioffset_n16, mask_n16, props_cache_load); prefetch(ptrf, ioffset_n16_view, mask_n16, props_cache_load); prefetch(ptrf, ioffset_n16_view.select<16, 1>(), mask_n16, props_cache_load); + prefetch<2>(ptrf, ioffset_n16_view, mask_n16, props_cache_load); + prefetch<2>(ptrf, ioffset_n16_view.select<16, 1>(), mask_n16, + props_cache_load); prefetch(ptrf, loffset_n16, mask_n16, props_cache_load); prefetch(ptrf, loffset_n16_view, mask_n16, props_cache_load); prefetch(ptrf, loffset_n16_view.select<16, 1>(), mask_n16, 
props_cache_load); + prefetch<2>(ptrf, loffset_n16_view, mask_n16, props_cache_load); + prefetch<2>(ptrf, loffset_n16_view.select<16, 1>(), mask_n16, + props_cache_load); // CHECK-COUNT-2: call void @llvm.genx.lsc.prefetch.stateless.v1i1.v1i64(<1 x i1> {{[^)]+}}, i8 0, i8 2, i8 1, i16 1, i32 0, i8 3, i8 7, i8 2, i8 0, <1 x i64> {{[^)]+}}, i32 0) __ESIMD_NS::prefetch(ptrf, 0, props_cache_load); From 037c67f182848f2e476427d5ac69fdba99237d3c Mon Sep 17 00:00:00 2001 From: Nick Sarnie Date: Wed, 5 Jun 2024 19:10:14 -0400 Subject: [PATCH 16/55] [SYCL] Disable flaky test EnqueueNoMemObjTwoHostTasks on Windows (#14061) See https://github.com/intel/llvm/issues/14060 --------- Signed-off-by: Sarnie, Nick --- sycl/unittests/scheduler/EnqueueWithDependsOnDeps.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/sycl/unittests/scheduler/EnqueueWithDependsOnDeps.cpp b/sycl/unittests/scheduler/EnqueueWithDependsOnDeps.cpp index fc816d1a4f3af..414f58c6f177c 100644 --- a/sycl/unittests/scheduler/EnqueueWithDependsOnDeps.cpp +++ b/sycl/unittests/scheduler/EnqueueWithDependsOnDeps.cpp @@ -165,7 +165,13 @@ class DependsOnTests : public ::testing::Test { }; }; +#ifdef _WIN32 +// Disabled on Windows due to flaky behavior +// https://github.com/intel/llvm/issues/14060 +TEST_F(DependsOnTests, DISABLED_EnqueueNoMemObjTwoHostTasks) { +#else TEST_F(DependsOnTests, EnqueueNoMemObjTwoHostTasks) { +#endif // Checks enqueue of two dependent host tasks detail::QueueImplPtr QueueHostImpl = MS.getDefaultHostQueue(); std::vector Events; From 89225cef7199df3c31375adcf187d3667c582aa3 Mon Sep 17 00:00:00 2001 From: Artem Radzikhovskyy Date: Wed, 5 Jun 2024 23:32:12 -0700 Subject: [PATCH 17/55] [SYCL][Joint Matrix] Test combinations are queried Part 3 (#13991) Supported matrix dimensions are queried from the device, and inform the tests which tile sizes one can use. This is a subset of all tests that are planned to be modified. 
Test manually tested on PVC and SPR --------- Co-authored-by: Yury Plyakhin --- .../Matrix/SG32/get_coord_int8_matB.cpp | 9 +- .../SG32/joint_matrix_bfloat16_array.cpp | 8 +- .../Matrix/SG32/joint_matrix_transposeC.cpp | 5 +- .../Matrix/XMX8/get_coord_int8_matB.cpp | 22 ---- .../XMX8/joint_matrix_bfloat16_32x64.cpp | 22 ---- .../XMX8/joint_matrix_bfloat16_array.cpp | 17 --- .../XMX8/joint_matrix_opt_kernel_feature.cpp | 10 -- .../Matrix/XMX8/joint_matrix_transposeC.cpp | 18 --- sycl/test-e2e/Matrix/get_coord_int8_matB.cpp | 7 +- .../Matrix/get_coord_int8_matB_impl.hpp | 123 +++++++++++------- .../Matrix/joint_matrix_bfloat16_array.cpp | 5 +- .../joint_matrix_bfloat16_array_impl.hpp | 121 ++++++++++------- .../joint_matrix_opt_kernel_feature.cpp | 2 +- .../Matrix/joint_matrix_transposeC.cpp | 5 +- .../Matrix/joint_matrix_transposeC_impl.hpp | 95 +++++++++----- 15 files changed, 233 insertions(+), 236 deletions(-) delete mode 100644 sycl/test-e2e/Matrix/XMX8/get_coord_int8_matB.cpp delete mode 100644 sycl/test-e2e/Matrix/XMX8/joint_matrix_bfloat16_32x64.cpp delete mode 100644 sycl/test-e2e/Matrix/XMX8/joint_matrix_bfloat16_array.cpp delete mode 100644 sycl/test-e2e/Matrix/XMX8/joint_matrix_opt_kernel_feature.cpp delete mode 100644 sycl/test-e2e/Matrix/XMX8/joint_matrix_transposeC.cpp diff --git a/sycl/test-e2e/Matrix/SG32/get_coord_int8_matB.cpp b/sycl/test-e2e/Matrix/SG32/get_coord_int8_matB.cpp index 5b77ec89fd997..80e0c0c6b845d 100644 --- a/sycl/test-e2e/Matrix/SG32/get_coord_int8_matB.cpp +++ b/sycl/test-e2e/Matrix/SG32/get_coord_int8_matB.cpp @@ -5,7 +5,9 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: matrix +// SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2 +// UNSUPPORTED: gpu-intel-dg2 +// REQUIRES: aspect-ext_intel_matrix // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 // RUN: %{build} -o %t.out @@ -13,12 +15,7 @@ 
// XFAIL: cpu #include "../common.hpp" -#include - -using namespace sycl; -using namespace sycl::ext::oneapi::experimental::matrix; #define SG_SZ 32 -constexpr size_t TN = 16; #include "../get_coord_int8_matB_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_bfloat16_array.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_bfloat16_array.cpp index f9a113af731a5..87fd837446618 100644 --- a/sycl/test-e2e/Matrix/SG32/joint_matrix_bfloat16_array.cpp +++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_bfloat16_array.cpp @@ -5,16 +5,16 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: matrix +// SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2 +// UNSUPPORTED: gpu-intel-dg2 +// REQUIRES: aspect-ext_intel_matrix // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 // RUN: %{build} -o %t.out // RUN: %{run} %t.out #include "../common.hpp" -#include -constexpr std::size_t SG_SZ = 32; -static constexpr int TN = 16; +#define SG_SZ 32 #include "../joint_matrix_bfloat16_array_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_transposeC.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_transposeC.cpp index 214dd10f5158f..6cea5a248e0b2 100644 --- a/sycl/test-e2e/Matrix/SG32/joint_matrix_transposeC.cpp +++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_transposeC.cpp @@ -5,7 +5,9 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: matrix +// SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2 +// UNSUPPORTED: gpu-intel-dg2 +// REQUIRES: aspect-ext_intel_matrix // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 // RUN: %{build} -o %t.out @@ -14,6 +16,5 @@ #include "../common.hpp" #define SG_SZ 32 -constexpr size_t TN = 16; #include "../joint_matrix_transposeC_impl.hpp" diff --git 
a/sycl/test-e2e/Matrix/XMX8/get_coord_int8_matB.cpp b/sycl/test-e2e/Matrix/XMX8/get_coord_int8_matB.cpp deleted file mode 100644 index 4c4d6c6eb5765..0000000000000 --- a/sycl/test-e2e/Matrix/XMX8/get_coord_int8_matB.cpp +++ /dev/null @@ -1,22 +0,0 @@ -//==----------- get_coord_int8_matB.cpp - DPC++ joint_matrix---------==// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// REQUIRES: matrix-xmx8 - -// RUN: %{build} -o %t.out -// RUN: %{run} %t.out -// XFAIL: * - -#include "../common.hpp" -#include - -using namespace sycl; -using namespace sycl::ext::oneapi::experimental::matrix; - -constexpr size_t TN = 8; - -#include "../get_coord_int8_matB_impl.hpp" diff --git a/sycl/test-e2e/Matrix/XMX8/joint_matrix_bfloat16_32x64.cpp b/sycl/test-e2e/Matrix/XMX8/joint_matrix_bfloat16_32x64.cpp deleted file mode 100644 index 5a41f19bc2ac1..0000000000000 --- a/sycl/test-e2e/Matrix/XMX8/joint_matrix_bfloat16_32x64.cpp +++ /dev/null @@ -1,22 +0,0 @@ -//==----- joint_matrix_bfloat16_32x64.cpp - DPC++ joint_matrix-------------==// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// REQUIRES: matrix-xmx8 - -// RUN: %{build} -o %t.out -// RUN: %{run} %t.out - -// XFAIL: * - -#include "../common.hpp" - -using namespace sycl; -using namespace sycl::ext::oneapi::experimental::matrix; - -constexpr size_t TN = 8; - -#include "../joint_matrix_bfloat16_32x64_impl.hpp" diff --git a/sycl/test-e2e/Matrix/XMX8/joint_matrix_bfloat16_array.cpp b/sycl/test-e2e/Matrix/XMX8/joint_matrix_bfloat16_array.cpp deleted file mode 100644 index 09c1a4ae32a92..0000000000000 --- a/sycl/test-e2e/Matrix/XMX8/joint_matrix_bfloat16_array.cpp +++ /dev/null @@ -1,17 +0,0 @@ -//==-------- joint_matrix_bfloat16_array.cpp - DPC++ joint_matrix----------==// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// REQUIRES: matrix-xmx8 - -// RUN: %{build} -o %t.out -// RUN: %{run} %t.out - -#include "../common.hpp" - -static constexpr int TN = 8; - -#include "../joint_matrix_bfloat16_array_impl.hpp" diff --git a/sycl/test-e2e/Matrix/XMX8/joint_matrix_opt_kernel_feature.cpp b/sycl/test-e2e/Matrix/XMX8/joint_matrix_opt_kernel_feature.cpp deleted file mode 100644 index 30b3522ad2442..0000000000000 --- a/sycl/test-e2e/Matrix/XMX8/joint_matrix_opt_kernel_feature.cpp +++ /dev/null @@ -1,10 +0,0 @@ -// REQUIRES: matrix-xmx8 - -// RUN: %{build} -o %t.out -// RUN: %{run} %t.out - -// Test checks that exception will be thrown in case matrix parameters are -// incompatible on the current device - -#include "../common.hpp" -#include "../joint_matrix_opt_kernel_feature_impl.hpp" diff --git a/sycl/test-e2e/Matrix/XMX8/joint_matrix_transposeC.cpp b/sycl/test-e2e/Matrix/XMX8/joint_matrix_transposeC.cpp deleted 
file mode 100644 index a0a98e3f16d0c..0000000000000 --- a/sycl/test-e2e/Matrix/XMX8/joint_matrix_transposeC.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//==----------- joint_matrix_transposeC.cpp - DPC++ joint_matrix-----------==// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// REQUIRES: matrix-xmx8 -// REQUIRES-INTEL-DRIVER: lin: 28267 - -// RUN: %{build} -o %t.out -// RUN: %{run} %t.out - -#include "../common.hpp" - -constexpr size_t TN = 8; - -#include "../joint_matrix_transposeC_impl.hpp" diff --git a/sycl/test-e2e/Matrix/get_coord_int8_matB.cpp b/sycl/test-e2e/Matrix/get_coord_int8_matB.cpp index ad064fd82fc0a..0b7f520888fd1 100644 --- a/sycl/test-e2e/Matrix/get_coord_int8_matB.cpp +++ b/sycl/test-e2e/Matrix/get_coord_int8_matB.cpp @@ -5,14 +5,11 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: matrix +// REQUIRES: aspect-ext_intel_matrix // RUN: %{build} -o %t.out // RUN: %{run} %t.out -// XFAIL: cpu +// XFAIL: cpu, gpu-intel-dg2 #include "common.hpp" - -constexpr size_t TN = 16; - #include "get_coord_int8_matB_impl.hpp" diff --git a/sycl/test-e2e/Matrix/get_coord_int8_matB_impl.hpp b/sycl/test-e2e/Matrix/get_coord_int8_matB_impl.hpp index 480f01ca77ceb..8b63dadc029b3 100644 --- a/sycl/test-e2e/Matrix/get_coord_int8_matB_impl.hpp +++ b/sycl/test-e2e/Matrix/get_coord_int8_matB_impl.hpp @@ -5,22 +5,23 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// +#include #include -constexpr size_t TK = 32; -constexpr size_t VF = 4; +template class add_cols; -template -void sum_cols_ref(host_accessor B, - host_accessor sum_cols) { 
- int sum_cols_ref[N] = {0}; - for (size_t j = 0; j < N; j++) { - for (size_t i = 0; i < K; i++) { +template +void sum_cols_ref( + host_accessor B, + host_accessor sum_cols) { + TResult sum_cols_ref[Cols] = {0}; + for (size_t j = 0; j < Cols; j++) { + for (size_t i = 0; i < Rows; i++) { sum_cols_ref[j] += B[i][j]; } auto diff = sum_cols[j] - sum_cols_ref[j]; - assert(std::fabs(static_cast(diff)) <= - std::numeric_limits::epsilon()); + assert(std::fabs(static_cast(diff)) <= + std::numeric_limits::epsilon()); } } @@ -93,26 +94,27 @@ wi [1,0] --> i=0, [8, 0] // clang-format on -template -void matrix_sum_cols(big_matrix &B, - big_matrix &Bvnni) { - buffer bufB(B.get_data(), range<2>(K, N)); - buffer bufBvnni(Bvnni.get_data(), range<2>(K / VF, N * VF)); +template +void matrix_sum_cols(big_matrix &B, + big_matrix &Bvnni) { + buffer bufB(B.get_data(), range<2>(Rows, Cols)); + buffer bufBvnni(Bvnni.get_data(), range<2>(Rows / VNNI, Cols * VNNI)); - int sum_cols[N] = {0}; - buffer sum_cols_v(sum_cols, N); + TResult sum_cols[Cols] = {0}; + buffer sum_cols_v(sum_cols, Cols); - size_t NDRangeK = K / TK; - size_t NDRangeN = N / TN; + size_t NDRangeK = Rows / TileRows; + size_t NDRangeN = Cols / TileCols; queue q; - size_t sg_size = get_sg_size(q); + size_t sg_size = get_sg_size>(q); nd_range<2> r({NDRangeK, NDRangeN * sg_size}, {1, 1 * sg_size}); q.submit([&](handler &cgh) { - auto accB = bufBvnni.get_access(cgh); - auto v = sum_cols_v.get_access(cgh); + sycl::accessor accB{bufBvnni, cgh, sycl::read_write}; + sycl::accessor v{sum_cols_v, cgh, sycl::read_write}; - cgh.parallel_for( + cgh.parallel_for>( r, [=](nd_item<2> spmd_item) #ifdef SG_SZ [[intel::reqd_sub_group_size(SG_SZ)]] @@ -125,57 +127,88 @@ void matrix_sum_cols(big_matrix &B, sycl::sub_group sg = spmd_item.get_sub_group(); - joint_matrix sub_b; joint_matrix_load( sg, sub_b, accB.template get_multi_ptr() + - (sg_startx * (TK / VF) * N * VF) + - sg_starty / sg_size * TN * VF, - N * VF); + (sg_startx * (TileRows / VNNI) 
* Cols * VNNI) + + sg_starty / sg_size * TileCols * VNNI, + Cols * VNNI); - int32_t sum_local_cols[N] = {0}; + TResult sum_local_cols[Cols] = {0}; ext::intel::experimental::matrix::joint_matrix_apply( - sg, sub_b, [&](int8_t &x, size_t row, size_t col) { - // the coordinates returned are in the logical range [K,N] - // If users want to retrieve the VNNIed coordinates, they can - // be obtained using colVNNI = col/VF rowVNNI = row*VF - size_t global_index = col + global_idy / sg_size * TN; + sg, sub_b, [&](T &x, size_t row, size_t col) { + // the coordinates returned are in the logical range + // [Rows,Cols] If users want to retrieve the VNNIed + // coordinates, they can be obtained using colVNNI = col/VNNI + // rowVNNI = row*VNNI + size_t global_index = col + global_idy / sg_size * TileCols; sum_local_cols[global_index] += x; }); - for (int i = 0; i < N; i++) { + for (int i = 0; i < Cols; i++) { sum_local_cols[i] = reduce_over_group(sg, sum_local_cols[i], sycl::plus<>()); - if (global_idy % sg_size == 0) - atomic_fetch_add(v[i], sum_local_cols[i]); + if (global_idy % sg_size == 0) { + sycl::atomic_ref + aref(v[i]); + aref.fetch_add(sum_local_cols[i]); + } } }); // parallel for }).wait(); - sum_cols_ref(bufB.get_host_access(), sum_cols_v.get_host_access()); + sum_cols_ref(bufB.get_host_access(), + sum_cols_v.get_host_access()); } -int main() { +template +void test() { static constexpr size_t scale = 2; static constexpr size_t MATRIX_K = TK * scale; static constexpr size_t MATRIX_N = TN * scale; - int8_t B[MATRIX_K][MATRIX_N]; - big_matrix MB((int8_t *)&B); + T B[MATRIX_K][MATRIX_N]; + big_matrix MB((T *)&B); - int8_t Bvnni[MATRIX_K / VF][MATRIX_N * VF]; - big_matrix MBvnni((int8_t *)&Bvnni); + T Bvnni[MATRIX_K / VNNI][MATRIX_N * VNNI]; + big_matrix MBvnni((T *)&Bvnni); for (int i = 0; i < MATRIX_K; i++) { for (int j = 0; j < MATRIX_N; j++) { B[i][j] = i + j; } } - matrix_vnni(MATRIX_K, MATRIX_N, *B, *Bvnni, VF); + matrix_vnni(MATRIX_K, MATRIX_N, *B, *Bvnni, VNNI); 
// This test calculates sum of columns in the non VNNI B matrix - matrix_sum_cols(MB, MBvnni); - std::cout << "Passed\n"; + matrix_sum_cols(MB, MBvnni); +} + +int main() { + queue q; + std::vector combinations = + q.get_device() + .get_info(); + + for (unsigned int i = 0; i < combinations.size(); i++) { + if (combinations[i].nsize == 0) { // Intel AMX + test(); + break; + } + + if (combinations[i].nsize == 16) { // architecture::intel_gpu_pvc + test(); + break; + } + + if (combinations[i].nsize == 8) { // architecture::intel_gpu_dg2* + test(); + break; + } + } return 0; } \ No newline at end of file diff --git a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_array.cpp b/sycl/test-e2e/Matrix/joint_matrix_bfloat16_array.cpp index 98ed155b297ad..5cd2a4dc1962f 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_array.cpp +++ b/sycl/test-e2e/Matrix/joint_matrix_bfloat16_array.cpp @@ -5,13 +5,10 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: matrix +// REQUIRES: aspect-ext_intel_matrix // RUN: %{build} -o %t.out // RUN: %{run} %t.out #include "common.hpp" - -static constexpr int TN = 16; - #include "joint_matrix_bfloat16_array_impl.hpp" diff --git a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_array_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_bfloat16_array_impl.hpp index 9aefc370bd0c6..f393eaa5e8436 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_array_impl.hpp +++ b/sycl/test-e2e/Matrix/joint_matrix_bfloat16_array_impl.hpp @@ -6,30 +6,28 @@ // //===-------------------------------------------------------------------------===// -using namespace sycl; -using namespace sycl::ext::oneapi::experimental::matrix; +template class mult; -static constexpr int TM = 8; -static constexpr int TK = 16; static constexpr int JM_ARRAY_SZ = 2; -template -void matrix_multiply(big_matrix &C, big_matrix &A, - big_matrix &B) { +template +void matrix_multiply(big_matrix &C, 
big_matrix &A, + big_matrix &B) { size_t NDRangeM = M / (TM * JM_ARRAY_SZ); size_t NDRangeN = N / TN; - buffer bufA(A.get_data(), range<2>(M, K)); - buffer bufB(B.get_data(), range<2>(K, N)); - buffer bufC((float *)C.get_data(), range<2>(M, N)); + buffer bufA(A.get_data(), range<2>(M, K)); + buffer bufB(B.get_data(), range<2>(K, N)); + buffer bufC((TResult *)C.get_data(), range<2>(M, N)); queue q; - size_t sg_size = get_sg_size(q); + size_t sg_size = get_sg_size>(q); q.submit([&](handler &cgh) { - auto accC = bufC.get_access(cgh); - auto accA = bufA.get_access(cgh); - auto accB = bufB.get_access(cgh); + sycl::accessor accA{bufA, cgh, sycl::read_write}; + sycl::accessor accB{bufB, cgh, sycl::read_write}; + sycl::accessor accC{bufC, cgh, sycl::read_write}; - cgh.parallel_for( + cgh.parallel_for>( nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}), [=](nd_item<2> spmd_item) #ifdef SG_SZ @@ -45,25 +43,25 @@ void matrix_multiply(big_matrix &C, big_matrix &A, const auto sg_starty = global_idy - spmd_item.get_local_id(1); sub_group sg = spmd_item.get_sub_group(); - joint_matrix + joint_matrix sub_a[JM_ARRAY_SZ]; // For B, we assume B has been already VNNIed. 
- joint_matrix + joint_matrix sub_b; - joint_matrix + joint_matrix sub_c[JM_ARRAY_SZ]; for (int i = 0; i < JM_ARRAY_SZ; ++i) - joint_matrix_fill(sg, sub_c[i], 1.0); + joint_matrix_fill(sg, sub_c[i], TResult(1)); for (int k = 0; k < K / TK; ++k) { joint_matrix_load( sg, sub_b, accB.template get_multi_ptr() + - (k * TK / 2) * (N * 2) + sg_starty / sg_size * TN * 2, - N * 2); + (k * TK / VNNI) * (N * VNNI) + + sg_starty / sg_size * TN * VNNI, + N * VNNI); for (int i = 0; i < JM_ARRAY_SZ; ++i) { joint_matrix_load( @@ -86,35 +84,70 @@ void matrix_multiply(big_matrix &C, big_matrix &A, }).wait(); } -int main() { +template +void test() { + std::cout << "Testing: " << TM << " x " << TN << " x " << TK + << " [TM x TN x TK]" << std::endl; static constexpr size_t MATRIX_M = TM * 2; static constexpr size_t MATRIX_N = TN * 2; static constexpr size_t MATRIX_K = TK * 2; - bfloat16 A[MATRIX_M][MATRIX_K]; - bfloat16 B[MATRIX_K / 2][MATRIX_N * 2]; + T A[MATRIX_M][MATRIX_K]; + T B[MATRIX_K / VNNI][MATRIX_N * VNNI]; + + TResult C[MATRIX_M][MATRIX_N]; + TResult D[MATRIX_M][MATRIX_N]; - float C[MATRIX_M][MATRIX_N]; - float D[MATRIX_M][MATRIX_N]; + matrix_fill(MATRIX_M, MATRIX_K, (T *)A, + [](int i, int j) { return TResult(1) * (i + j); }); + matrix_fill(MATRIX_K / VNNI, MATRIX_N * VNNI, (T *)B, + [](int i, int j) { return TResult(2) * i + TResult(3) * j; }); + matrix_fill(MATRIX_M, MATRIX_N, (TResult *)C, TResult(1)); + matrix_fill(MATRIX_M, MATRIX_N, (TResult *)D, TResult(1)); - matrix_fill(MATRIX_M, MATRIX_K, (bfloat16 *)A, - [](int i, int j) { return 1.0f * (i + j); }); - matrix_fill(MATRIX_K / 2, MATRIX_N * 2, (bfloat16 *)B, - [](int i, int j) { return 2.0f * i + 3.0f * j; }); - matrix_fill(MATRIX_M, MATRIX_N, (float *)C, 1.0f); - matrix_fill(MATRIX_M, MATRIX_N, (float *)D, 1.0f); + big_matrix MC((TResult *)&C); + big_matrix MD((TResult *)&D); + big_matrix MA((T *)&A); + big_matrix MB((T *)&B); - big_matrix MC((float *)&C); - big_matrix MD((float *)&D); - big_matrix MA((bfloat16 
*)&A); - big_matrix MB((bfloat16 *)&B); + matrix_multiply( + MC, MA, MB); + matrix_multiply_ref((T *)A, (T *)B, (TResult *)D, + MATRIX_M, MATRIX_N, MATRIX_K / VNNI); - matrix_multiply(MC, MA, MB); - matrix_multiply_ref( - (bfloat16 *)A, (bfloat16 *)B, (float *)D, MATRIX_M, MATRIX_N, - MATRIX_K / 2); + assert(matrix_compare(MATRIX_M, MATRIX_N, (TResult *)C, (TResult *)D)); +} - bool res = matrix_compare(MATRIX_M, MATRIX_N, (float *)C, (float *)D); - std::cout << (res ? "passed" : "failed") << std::endl; - return !res; +int main() { + queue q; + std::vector combinations = + q.get_device() + .get_info(); + + for (unsigned int i = 0; i < combinations.size(); i++) { + if (combinations[i].nsize == 0) { // Intel AMX + test(); + break; + } + + if (combinations[i].nsize == 16) { // architecture::intel_gpu_pvc + test(); +#if (!defined(SG_SZ) || SG_SZ != 32) + // These combination are not currently supported for subgroup size = 32 in + // IGC + test(); + test(); + test(); + break; +#endif + } + + if (combinations[i].nsize == 8) { // architecture::intel_gpu_dg2* + test(); + break; + } + } + return 0; } diff --git a/sycl/test-e2e/Matrix/joint_matrix_opt_kernel_feature.cpp b/sycl/test-e2e/Matrix/joint_matrix_opt_kernel_feature.cpp index 5acc54a412096..a6b72f80a989d 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_opt_kernel_feature.cpp +++ b/sycl/test-e2e/Matrix/joint_matrix_opt_kernel_feature.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -// REQUIRES: matrix +// REQUIRES: aspect-ext_intel_matrix // RUN: %{build} -o %t.out // RUN: %{run} %t.out diff --git a/sycl/test-e2e/Matrix/joint_matrix_transposeC.cpp b/sycl/test-e2e/Matrix/joint_matrix_transposeC.cpp index bd04b157cf667..b81093293cd33 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_transposeC.cpp +++ b/sycl/test-e2e/Matrix/joint_matrix_transposeC.cpp @@ -5,13 +5,10 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // 
//===----------------------------------------------------------------------===// -// REQUIRES: matrix +// REQUIRES: aspect-ext_intel_matrix // RUN: %{build} -o %t.out // RUN: %{run} %t.out #include "common.hpp" - -constexpr size_t TN = 16; - #include "joint_matrix_transposeC_impl.hpp" diff --git a/sycl/test-e2e/Matrix/joint_matrix_transposeC_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_transposeC_impl.hpp index 24ba24a264f0d..278e5da5cf441 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_transposeC_impl.hpp +++ b/sycl/test-e2e/Matrix/joint_matrix_transposeC_impl.hpp @@ -8,10 +8,7 @@ #include -using namespace sycl; -using namespace sycl::ext::oneapi::experimental::matrix; - -template class LS; +template class LS; template void matrix_load_and_store(T1 *input, T1 *out_col_major, T1 *out_row_major, @@ -24,10 +21,10 @@ void matrix_load_and_store(T1 *input, T1 *out_col_major, T1 *out_row_major, size_t NDRangeM = M / TM; size_t NDRangeN = N / TN; - size_t sg_size = get_sg_size>(q); + size_t sg_size = get_sg_size>(q); q.submit([&](handler &cgh) { - cgh.parallel_for>( + cgh.parallel_for>( nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}), [=](nd_item<2> spmd_item) #ifdef SG_SZ @@ -51,7 +48,7 @@ void matrix_load_and_store(T1 *input, T1 *out_col_major, T1 *out_row_major, const auto sg_starty = global_idy - spmd_item.get_local_id(1); sub_group sg = spmd_item.get_sub_group(); - joint_matrix sub_matrix; + joint_matrix sub_matrix; auto row_major_offset = (sg_startx * TM) * N + (sg_starty / sg_size * TN); @@ -72,32 +69,33 @@ void matrix_load_and_store(T1 *input, T1 *out_col_major, T1 *out_row_major, }).wait(); } -template void run_matrix_test() { +template void run_matrix_test() { static constexpr size_t MATRIX_M = TM * 16; static constexpr size_t MATRIX_N = TN * 16; queue q; - float *input = malloc_shared(MATRIX_M * MATRIX_N, q); - float *out_col_major = malloc_shared(MATRIX_M * MATRIX_N, q); - float *out_row_major = malloc_shared(MATRIX_M * MATRIX_N, q); - float 
*ref_col_major = malloc_shared(MATRIX_M * MATRIX_N, q); + T *input = malloc_shared(MATRIX_M * MATRIX_N, q); + T *out_col_major = malloc_shared(MATRIX_M * MATRIX_N, q); + T *out_row_major = malloc_shared(MATRIX_M * MATRIX_N, q); + T *ref_col_major = malloc_shared(MATRIX_M * MATRIX_N, q); // input is column majot matrix so it is of NxM shape - matrix_rand(MATRIX_N, MATRIX_M, input, (float)5.0); - matrix_fill(MATRIX_M, MATRIX_N, out_col_major, (float)0); - matrix_fill(MATRIX_N, MATRIX_M, out_row_major, (float)0); + matrix_rand(MATRIX_N, MATRIX_M, input, (T)5.0); + matrix_fill(MATRIX_M, MATRIX_N, out_col_major, (T)0); + matrix_fill(MATRIX_N, MATRIX_M, out_row_major, (T)0); matrix_transpose(MATRIX_N, MATRIX_M, ref_col_major, input); - matrix_load_and_store(input, out_col_major, - out_row_major, q); + matrix_load_and_store(input, out_col_major, + out_row_major, q); // we use exact comparison as no low precision calculation is used in this // test - std::cout << "compare results for TM " << TM << "\n"; - bool res = matrix_compare(MATRIX_M, MATRIX_N, - out_col_major, ref_col_major) && - matrix_compare(MATRIX_N, MATRIX_M, - out_row_major, input); + std::cout << "compare results for: " << TM << " x " << TN << " [TM x TN]" + << std::endl; + bool res = + matrix_compare(MATRIX_M, MATRIX_N, out_col_major, + ref_col_major) && + matrix_compare(MATRIX_N, MATRIX_M, out_row_major, input); free(input, q); free(out_col_major, q); free(out_row_major, q); @@ -106,15 +104,48 @@ template void run_matrix_test() { } int main() { - run_matrix_test<8>(); - run_matrix_test<7>(); - run_matrix_test<6>(); - run_matrix_test<5>(); - run_matrix_test<4>(); - run_matrix_test<3>(); - run_matrix_test<2>(); - run_matrix_test<1>(); - - std::cout << "Passed\n"; + queue q; + std::vector combinations = + q.get_device() + .get_info(); + + for (unsigned int i = 0; i < combinations.size(); i++) { + if (combinations[i].nsize == 0) { // Intel AMX + run_matrix_test(); + run_matrix_test(); + run_matrix_test(); + 
run_matrix_test(); + run_matrix_test(); + run_matrix_test(); + run_matrix_test(); + run_matrix_test(); + break; + } + + if (combinations[i].nsize == 16) { // architecture::intel_gpu_pvc + run_matrix_test(); + run_matrix_test(); + run_matrix_test(); + run_matrix_test(); + run_matrix_test(); + run_matrix_test(); + run_matrix_test(); + run_matrix_test(); + break; + } + + if (combinations[i].nsize == 8) { // architecture::intel_gpu_dg2* + run_matrix_test(); + run_matrix_test(); + run_matrix_test(); + run_matrix_test(); + run_matrix_test(); + run_matrix_test(); + run_matrix_test(); + run_matrix_test(); + break; + } + } return 0; } From ad84669750280610f6b3331142b0a638a2fefd2f Mon Sep 17 00:00:00 2001 From: Artem Radzikhovskyy Date: Wed, 5 Jun 2024 23:34:01 -0700 Subject: [PATCH 18/55] [SYCL][Joint Matrix] Test combinations are queried Part 4 (#14019) Supported matrix dimensions are queried from the device, and inform the tests which tile sizes one can use. This is a subset of all tests that are planned to be modified. Test manually tested on PVC and SPR - no new regresssions The following tests have been marked as XFAIL on all platforms. I removed them from XMX8 folder. Once they are passing then they can be modified to query the supported matrix dimensions form the device. 
* joint_matrix_colA_rowB_colC.cpp * joint_matrix_out_bounds.cpp * joint_matrix_unaligned_k.cpp --- .../SG32/joint_matrix_bf16_fill_k_cache.cpp | 13 +- .../joint_matrix_bf16_fill_k_cache_init.cpp | 6 +- .../joint_matrix_bf16_fill_k_cache_unroll.cpp | 13 +- ...t_matrix_bf16_fill_k_cache_unroll_init.cpp | 6 +- .../XMX8/joint_matrix_bf16_fill_k_cache.cpp | 20 -- .../joint_matrix_bf16_fill_k_cache_init.cpp | 20 -- .../joint_matrix_bf16_fill_k_cache_unroll.cpp | 22 -- ...t_matrix_bf16_fill_k_cache_unroll_init.cpp | 22 -- .../XMX8/joint_matrix_colA_rowB_colC.cpp | 19 - .../Matrix/XMX8/joint_matrix_out_bounds.cpp | 20 -- .../Matrix/XMX8/joint_matrix_unaligned_k.cpp | 20 -- .../Matrix/joint_matrix_bf16_fill_k_cache.cpp | 13 +- .../joint_matrix_bf16_fill_k_cache_OOB.cpp | 5 - .../joint_matrix_bf16_fill_k_cache_impl.hpp | 325 ++++++++---------- .../joint_matrix_bf16_fill_k_cache_init.cpp | 6 +- .../joint_matrix_bf16_fill_k_cache_unroll.cpp | 13 +- ...t_matrix_bf16_fill_k_cache_unroll_init.cpp | 6 +- 17 files changed, 166 insertions(+), 383 deletions(-) delete mode 100644 sycl/test-e2e/Matrix/XMX8/joint_matrix_bf16_fill_k_cache.cpp delete mode 100644 sycl/test-e2e/Matrix/XMX8/joint_matrix_bf16_fill_k_cache_init.cpp delete mode 100644 sycl/test-e2e/Matrix/XMX8/joint_matrix_bf16_fill_k_cache_unroll.cpp delete mode 100644 sycl/test-e2e/Matrix/XMX8/joint_matrix_bf16_fill_k_cache_unroll_init.cpp delete mode 100644 sycl/test-e2e/Matrix/XMX8/joint_matrix_colA_rowB_colC.cpp delete mode 100644 sycl/test-e2e/Matrix/XMX8/joint_matrix_out_bounds.cpp delete mode 100644 sycl/test-e2e/Matrix/XMX8/joint_matrix_unaligned_k.cpp diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_bf16_fill_k_cache.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_bf16_fill_k_cache.cpp index b93985f8e594e..10334f93afa80 100644 --- a/sycl/test-e2e/Matrix/SG32/joint_matrix_bf16_fill_k_cache.cpp +++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_bf16_fill_k_cache.cpp @@ -5,21 +5,18 @@ // SPDX-License-Identifier: Apache-2.0 
WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: matrix +// SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2 +// UNSUPPORTED: gpu-intel-dg2 +// REQUIRES: aspect-ext_intel_matrix // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 -// RUN: %{build} -o %t_gpu.out -ffp-model=precise -// RUN: %if gpu %{ %{run} %t_gpu.out %} - -// RUN: %{build} -ffp-model=precise -o %t_cpu.out -DtM=16 -DtK=32 -DNCACHE1=32 -DKCACHE1=32 -// RUN: %if cpu %{ %{run} %t_cpu.out %} +// RUN: %{build} -o %t.out -ffp-model=precise +// RUN: %{run} %t.out // -ffp-model=precise is added to not depend on compiler defaults. #include "../common.hpp" -#include #define SG_SZ 32 -constexpr size_t TN = 16; #include "../joint_matrix_bf16_fill_k_cache_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_bf16_fill_k_cache_init.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_bf16_fill_k_cache_init.cpp index 10391f2e7e319..32af965ec431a 100644 --- a/sycl/test-e2e/Matrix/SG32/joint_matrix_bf16_fill_k_cache_init.cpp +++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_bf16_fill_k_cache_init.cpp @@ -5,7 +5,9 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: matrix, gpu +// SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2 +// UNSUPPORTED: gpu-intel-dg2 +// REQUIRES: aspect-ext_intel_matrix, gpu // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 // RUN: %{build} -o %t.out -DINIT_LIST -ffp-model=precise @@ -14,9 +16,7 @@ // -ffp-model=precise is added to not depend on compiler defaults. 
#include "../common.hpp" -#include #define SG_SZ 32 -constexpr size_t TN = 16; #include "../joint_matrix_bf16_fill_k_cache_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_bf16_fill_k_cache_unroll.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_bf16_fill_k_cache_unroll.cpp index 994a2217d681f..1c7533e331e73 100644 --- a/sycl/test-e2e/Matrix/SG32/joint_matrix_bf16_fill_k_cache_unroll.cpp +++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_bf16_fill_k_cache_unroll.cpp @@ -5,23 +5,20 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: matrix +// SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2 +// UNSUPPORTED: gpu-intel-dg2 +// REQUIRES: aspect-ext_intel_matrix // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 -// RUN: %{build} -mllvm -inline-threshold=5000 -ffp-model=precise -o %t_gpu.out -DMANUAL_UNROLL -// RUN: %if gpu %{ %{run} %t_gpu.out %} - -// RUN: %{build} -mllvm -inline-threshold=5000 -ffp-model=precise -o %t_cpu.out -DMANUAL_UNROLL -DtM=16 -DtK=32 -DNCACHE1=32 -DKCACHE1=32 -// RUN: %if cpu %{ %{run} %t_cpu.out %} +// RUN: %{build} -mllvm -inline-threshold=5000 -ffp-model=precise -o %t.out -DMANUAL_UNROLL +// RUN: %{run} %t.out // -mllvm -inline-threshold added as a workaround, // since IGC doesn't support some variants of IR for Joint Matrix currently // -ffp-model=precise is added to not depend on compiler defaults. 
#include "../common.hpp" -#include #define SG_SZ 32 -constexpr size_t TN = 16; #include "../joint_matrix_bf16_fill_k_cache_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_bf16_fill_k_cache_unroll_init.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_bf16_fill_k_cache_unroll_init.cpp index 4f7e3638daaf3..f8d30cdc26756 100644 --- a/sycl/test-e2e/Matrix/SG32/joint_matrix_bf16_fill_k_cache_unroll_init.cpp +++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_bf16_fill_k_cache_unroll_init.cpp @@ -5,7 +5,9 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: matrix, gpu +// SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2 +// UNSUPPORTED: gpu-intel-dg2 +// REQUIRES: aspect-ext_intel_matrix, gpu // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 // RUN: %{build} -mllvm -inline-threshold=5000 -ffp-model=precise -o %t_gpu.out -DINIT_LIST -DMANUAL_UNROLL @@ -16,9 +18,7 @@ // -ffp-model=precise is added to not depend on compiler defaults. #include "../common.hpp" -#include #define SG_SZ 32 -constexpr size_t TN = 16; #include "../joint_matrix_bf16_fill_k_cache_impl.hpp" diff --git a/sycl/test-e2e/Matrix/XMX8/joint_matrix_bf16_fill_k_cache.cpp b/sycl/test-e2e/Matrix/XMX8/joint_matrix_bf16_fill_k_cache.cpp deleted file mode 100644 index fbcd21be62f75..0000000000000 --- a/sycl/test-e2e/Matrix/XMX8/joint_matrix_bf16_fill_k_cache.cpp +++ /dev/null @@ -1,20 +0,0 @@ -//==--- joint_matrix_bf16_fill_k_cache.cpp - DPC++ joint_matrix----------==// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// REQUIRES: matrix-xmx8 - -// RUN: %{build} -o %t.out -ffp-model=precise -// RUN: %{run} %t.out - -// -ffp-model=precise is added to not depend on compiler defaults. - -#include "../common.hpp" -#include - -constexpr size_t TN = 8; - -#include "../joint_matrix_bf16_fill_k_cache_impl.hpp" diff --git a/sycl/test-e2e/Matrix/XMX8/joint_matrix_bf16_fill_k_cache_init.cpp b/sycl/test-e2e/Matrix/XMX8/joint_matrix_bf16_fill_k_cache_init.cpp deleted file mode 100644 index c5e399bc98f48..0000000000000 --- a/sycl/test-e2e/Matrix/XMX8/joint_matrix_bf16_fill_k_cache_init.cpp +++ /dev/null @@ -1,20 +0,0 @@ -//== joint_matrix_bf16_fill_k_cache_init.cpp - DPC++ joint_matrix----------==// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// REQUIRES: matrix-xmx8 - -// RUN: %{build} -o %t.out -DINIT_LIST -ffp-model=precise -// RUN: %{run} %t.out - -// -ffp-model=precise is added to not depend on compiler defaults. - -#include "../common.hpp" -#include - -constexpr size_t TN = 8; - -#include "../joint_matrix_bf16_fill_k_cache_impl.hpp" diff --git a/sycl/test-e2e/Matrix/XMX8/joint_matrix_bf16_fill_k_cache_unroll.cpp b/sycl/test-e2e/Matrix/XMX8/joint_matrix_bf16_fill_k_cache_unroll.cpp deleted file mode 100644 index ba24ea0dfc4b8..0000000000000 --- a/sycl/test-e2e/Matrix/XMX8/joint_matrix_bf16_fill_k_cache_unroll.cpp +++ /dev/null @@ -1,22 +0,0 @@ -//===joint_matrix_bf16_fill_k_cache_unroll.cpp - DPC++ joint_matrix--------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// REQUIRES: matrix-xmx8 - -// RUN: %{build} -mllvm -inline-threshold=2000 -ffp-model=precise -o %t.out -DMANUAL_UNROLL -// RUN: %{run} %t.out - -// -mllvm -inline-threshold=2000 added as a workaround, -// since IGC doesn't support some variants of IR for Joint Matrix currently -// -ffp-model=precise is added to not depend on compiler defaults. - -#include "../common.hpp" -#include - -constexpr size_t TN = 8; - -#include "../joint_matrix_bf16_fill_k_cache_impl.hpp" diff --git a/sycl/test-e2e/Matrix/XMX8/joint_matrix_bf16_fill_k_cache_unroll_init.cpp b/sycl/test-e2e/Matrix/XMX8/joint_matrix_bf16_fill_k_cache_unroll_init.cpp deleted file mode 100644 index 9d88c89c50f41..0000000000000 --- a/sycl/test-e2e/Matrix/XMX8/joint_matrix_bf16_fill_k_cache_unroll_init.cpp +++ /dev/null @@ -1,22 +0,0 @@ -//==joint_matrix_bf16_fill_k_cache_unroll_init.cpp - DPC++ joint_matrix----==// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// REQUIRES: matrix-xmx8 - -// RUN: %{build} -mllvm -inline-threshold=2000 -ffp-model=precise -o %t.out -DINIT_LIST -DMANUAL_UNROLL -// RUN: %{run} %t.out - -// -mllvm -inline-threshold=2000 added as a workaround, -// since IGC doesn't support some variants of IR for Joint Matrix currently -// -ffp-model=precise is added to not depend on compiler defaults. 
- -#include "../common.hpp" -#include - -constexpr size_t TN = 8; - -#include "../joint_matrix_bf16_fill_k_cache_impl.hpp" diff --git a/sycl/test-e2e/Matrix/XMX8/joint_matrix_colA_rowB_colC.cpp b/sycl/test-e2e/Matrix/XMX8/joint_matrix_colA_rowB_colC.cpp deleted file mode 100644 index 7d74bf8055d6b..0000000000000 --- a/sycl/test-e2e/Matrix/XMX8/joint_matrix_colA_rowB_colC.cpp +++ /dev/null @@ -1,19 +0,0 @@ -//==---------- joint_matrix_colA_rowB_colC.cpp - DPC++ joint_matrix---------==// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// REQUIRES: matrix-xmx8 - -// RUN: %{build} -o %t.out -// RUN: %{run} %t.out - -// XFAIL:gpu - -#include "../common.hpp" - -constexpr size_t TN = 8; - -#include "../joint_matrix_colA_rowB_colC_impl.hpp" diff --git a/sycl/test-e2e/Matrix/XMX8/joint_matrix_out_bounds.cpp b/sycl/test-e2e/Matrix/XMX8/joint_matrix_out_bounds.cpp deleted file mode 100644 index 0ba69032465b9..0000000000000 --- a/sycl/test-e2e/Matrix/XMX8/joint_matrix_out_bounds.cpp +++ /dev/null @@ -1,20 +0,0 @@ -//==-------- joint_matrix_out_bounds.cpp - DPC++ joint_matrix--------------==// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// REQUIRES: matrix-xmx8 - -// RUN: %{build} -o %t.out -// RUN: %{run} %t.out - -// XFAIL:* - -#include "../common.hpp" - -constexpr size_t TN = 8; -static constexpr size_t MATRIX_K = 1024 + 24; - -#include "../joint_matrix_out_bounds_impl.hpp" diff --git a/sycl/test-e2e/Matrix/XMX8/joint_matrix_unaligned_k.cpp b/sycl/test-e2e/Matrix/XMX8/joint_matrix_unaligned_k.cpp deleted file mode 100644 index f42f37378514d..0000000000000 --- a/sycl/test-e2e/Matrix/XMX8/joint_matrix_unaligned_k.cpp +++ /dev/null @@ -1,20 +0,0 @@ -//==-------- joint_matrix_unaligned_k.cpp - DPC++ joint_matrix--------------==// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// REQUIRES: matrix-xmx8 - -// RUN: %{build} -o %t.out -// RUN: %{run} %t.out - -// XFAIL:* - -#include "../common.hpp" - -constexpr size_t TN = 8; -constexpr size_t MATRIX_K = 1024 + 14; - -#include "../joint_matrix_out_bounds_impl.hpp" diff --git a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache.cpp b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache.cpp index abee7d7259f28..2be4c14615799 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache.cpp +++ b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache.cpp @@ -5,19 +5,12 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: matrix +// REQUIRES: aspect-ext_intel_matrix -// RUN: %{build} -o %t_gpu.out -ffp-model=precise -// RUN: %if gpu %{ %{run} %t_gpu.out %} - -// RUN: %{build} -ffp-model=precise -o %t_cpu.out -DtM=16 -DtK=32 -DNCACHE1=32 -DKCACHE1=32 -// RUN: %if 
cpu %{ %{run} %t_cpu.out %} +// RUN: %{build} -o %t.out -ffp-model=precise +// RUN: %{run} %t.out // -ffp-model=precise is added to not depend on compiler defaults. #include "common.hpp" -#include - -constexpr size_t TN = 16; - #include "joint_matrix_bf16_fill_k_cache_impl.hpp" diff --git a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_OOB.cpp b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_OOB.cpp index 46d0acd79a1b5..4d84656c3d451 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_OOB.cpp +++ b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_OOB.cpp @@ -16,9 +16,4 @@ // -ffp-model=precise is added to not depend on compiler defaults. #include "common.hpp" -#include - -#define SG_SZ 16 -constexpr size_t TN = 16; - #include "joint_matrix_bf16_fill_k_cache_impl.hpp" diff --git a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_impl.hpp index e389ea7137428..56250cf9fb3e1 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_impl.hpp +++ b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_impl.hpp @@ -7,14 +7,8 @@ //===-------------------------------------------------------------------------===// #include -#include -#include #include -using namespace sycl; -using namespace sycl::ext::oneapi::experimental::matrix; -using bfloat16 = sycl::ext::oneapi::bfloat16; - // number of test iterations constexpr unsigned int testIterations = 100; // start recording time after X iterations @@ -24,36 +18,6 @@ constexpr unsigned int recordThresh = 10; #define MATRIX_SIZE 256 #endif -#ifndef tM -#define tM 8 -#endif -#ifndef tN -#define tN TN -#endif -#ifndef tK -#define tK 16 -#endif - -#ifndef MCACHE1 -#define MCACHE1 32 -#endif -#ifndef NCACHE1 -#define NCACHE1 (TN * 4) -#endif -#ifndef KCACHE1 -#define KCACHE1 16 -#endif - -#ifndef MCACHE2 -#define MCACHE2 256 -#endif -#ifndef NCACHE2 -#define NCACHE2 256 -#endif -#ifndef KCACHE2 -#define KCACHE2 32 -#endif - #ifdef 
MANUAL_UNROLL template static constexpr void loop(std::integer_sequence, F &&f) { @@ -66,25 +30,28 @@ static constexpr void manually_unroll_loop(F &&f) { } #endif -template +template class MatMul; + +template double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) { - size_t sgSize = get_sg_size(q); - range<2> global{rowsA / MCACHE1, (colsB / NCACHE1) * sgSize}; - range<2> cachelocal{MCACHE2 / MCACHE1, NCACHE2 / NCACHE1 * sgSize}; + size_t sgSize = get_sg_size>(q); + range<2> global{rowsA / MCache1, (colsB / NCache1) * sgSize}; + range<2> cachelocal{MCache2 / MCache1, NCache2 / NCache1 * sgSize}; // throw error if padding needed assert(colsA == rowsB); - assert(rowsA % tM == 0); - assert(colsA % tK == 0); - assert(colsB % tN == 0); + assert(rowsA % TM == 0); + assert(colsA % TK == 0); + assert(colsB % TN == 0); // submit main kernel std::chrono::high_resolution_clock::time_point start = std::chrono::high_resolution_clock::now(); q.submit([&](handler &h) { - h.parallel_for( // cache layer#1 + h.parallel_for>( // cache layer#1 nd_range<2>{global, cachelocal}, // loop global // loop localrange @@ -107,33 +74,20 @@ double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) { auto m1 = it.get_local_id(0); auto n1 = it.get_local_id(1) / sgSize; auto sg = it.get_sub_group(); - joint_matrix - tC[MCACHE1 / tM][NCACHE1 / tN] + joint_matrix + tC[MCache1 / TM][NCache1 / TN] #ifdef INIT_LIST - = {joint_matrix(), - joint_matrix(), - joint_matrix(), - joint_matrix(), - joint_matrix(), - joint_matrix(), - joint_matrix(), - joint_matrix(), - joint_matrix(), - joint_matrix(), - joint_matrix(), - joint_matrix(), - joint_matrix(), - joint_matrix(), - joint_matrix(), - joint_matrix()} + = {}; // default initialization of all array elements +#else + ; // no initialization #endif - ; + #ifdef MANUAL_UNROLL - manually_unroll_loop([&](auto m) { - manually_unroll_loop([&](auto n) { + manually_unroll_loop([&](auto m) { + manually_unroll_loop([&](auto n) { 
#else - for (unsigned int m = 0; m < MCACHE1 / tM; m++) { - for (unsigned int n = 0; n < NCACHE1 / tN; n++) { + for (unsigned int m = 0; m < MCache1 / TM; m++) { + for (unsigned int n = 0; n < NCache1 / TN; n++) { #endif joint_matrix_fill(sg, tC[m][n], 0); #ifdef MANUAL_UNROLL @@ -144,75 +98,45 @@ double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) { } #endif - for (unsigned int k2 = 0; k2 < colsA / KCACHE2; k2++) { - joint_matrix - tA[MCACHE1 / tM][KCACHE2 / KCACHE1] + for (unsigned int k2 = 0; k2 < colsA / KCache2; k2++) { + joint_matrix + tA[MCache1 / TM][KCache2 / KCache1] #ifdef INIT_LIST - = {joint_matrix(), - joint_matrix(), - joint_matrix(), - joint_matrix(), - joint_matrix(), - joint_matrix(), - joint_matrix(), - joint_matrix()} + = {}; // default initialization of all array elements +#else + ; // no initialization #endif - ; - joint_matrix - tB[NCACHE1 / tN][KCACHE2 / KCACHE1] + tB[NCache1 / TN][KCache2 / KCache1] #ifdef INIT_LIST - = - { - joint_matrix(), - joint_matrix(), - joint_matrix(), - joint_matrix(), - joint_matrix(), - joint_matrix(), - joint_matrix(), - joint_matrix(), - } + = {}; // default initialization of all array elements +#else + ; // no initialization #endif - ; + #ifdef MANUAL_UNROLL - manually_unroll_loop([&](auto k1) { + manually_unroll_loop([&](auto k1) { #else - for (unsigned int k1 = 0; k1 < KCACHE2 / KCACHE1; k1++) { + for (unsigned int k1 = 0; k1 < KCache2 / KCache1; k1++) { #endif // physical layer - unsigned int k = (k2 * KCACHE2 + k1 * KCACHE1) / tK; + unsigned int k = (k2 * KCache2 + k1 * KCache1) / TK; #ifdef MANUAL_UNROLL - manually_unroll_loop([&](auto m) { + manually_unroll_loop([&](auto m) { #else - for (unsigned int m = 0; m < MCACHE1 / tM; m++) { + for (unsigned int m = 0; m < MCache1 / TM; m++) { #endif #ifdef OOB ext::intel::experimental::matrix::joint_matrix_load_checked( sg, tA[m][k1], pA, colsA, rowsA, colsA, - m2 * MCACHE2 + m1 * MCACHE1 + m * tM, k * tK); + m2 * MCache2 + m1 * MCache1 + m 
* TM, k * TK); #else joint_matrix_load( sg, tA[m][k1], - pA + (m2 * MCACHE2 + m1 * MCACHE1 + m * tM) * colsA + - k * tK, + pA + (m2 * MCache2 + m1 * MCache1 + m * TM) * colsA + + k * TK, colsA); #endif #ifdef MANUAL_UNROLL @@ -221,21 +145,21 @@ double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) { } // m #endif #ifdef MANUAL_UNROLL - manually_unroll_loop([&](auto n) { + manually_unroll_loop([&](auto n) { #else - for (unsigned int n = 0; n < NCACHE1 / tN; n++) { + for (unsigned int n = 0; n < NCache1 / TN; n++) { #endif #ifdef OOB ext::intel::experimental::matrix::joint_matrix_load_checked( - sg, tB[n][k1], pB, colsB * vnniFactor, rowsB / vnniFactor, - colsB * vnniFactor, k * tK / vnniFactor, - (n2 * NCACHE2 + n1 * NCACHE1 + n * tN) * vnniFactor); + sg, tB[n][k1], pB, colsB * VNNI, rowsB / VNNI, colsB * VNNI, + k * TK / VNNI, + (n2 * NCache2 + n1 * NCache1 + n * TN) * VNNI); #else - joint_matrix_load( - sg, tB[n][k1], - pB + (k * tK / vnniFactor) * (colsB * vnniFactor) + - (n2 * NCACHE2 + n1 * NCACHE1 + n * tN) * vnniFactor, - colsB * vnniFactor); + joint_matrix_load(sg, tB[n][k1], + pB + (k * TK / VNNI) * (colsB * VNNI) + + (n2 * NCache2 + n1 * NCache1 + n * TN) * + VNNI, + colsB * VNNI); #endif #ifdef MANUAL_UNROLL }); @@ -243,14 +167,14 @@ double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) { } // n #endif #ifdef MANUAL_UNROLL - manually_unroll_loop([&](auto m) { + manually_unroll_loop([&](auto m) { #else - for (unsigned int m = 0; m < MCACHE1 / tM; m++) { + for (unsigned int m = 0; m < MCache1 / TM; m++) { #endif #ifdef MANUAL_UNROLL - manually_unroll_loop([&](auto n) { + manually_unroll_loop([&](auto n) { #else - for (unsigned int n = 0; n < NCACHE1 / tN; n++) { + for (unsigned int n = 0; n < NCache1 / TN; n++) { #endif joint_matrix_mad(sg, tC[m][n], tA[m][k1], tB[n][k1], @@ -266,25 +190,25 @@ double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) { #endif } // for k2 #ifdef MANUAL_UNROLL - 
manually_unroll_loop([&](auto m) { + manually_unroll_loop([&](auto m) { #else - for (unsigned int m = 0; m < MCACHE1 / tM; m++) { + for (unsigned int m = 0; m < MCache1 / TM; m++) { #endif #ifdef MANUAL_UNROLL - manually_unroll_loop([&](auto n) { + manually_unroll_loop([&](auto n) { #else - for (unsigned int n = 0; n < NCACHE1 / tN; n++) { + for (unsigned int n = 0; n < NCache1 / TN; n++) { #endif #ifdef OOB ext::intel::experimental::matrix::joint_matrix_store_checked( sg, tC[m][n], pC, colsB, layout::row_major, rowsA, colsB, - m2 * MCACHE2 + m1 * MCACHE1 + m * tM, - n2 * NCACHE2 + n1 * NCACHE1 + n * tN); + m2 * MCache2 + m1 * MCache1 + m * TM, + n2 * NCache2 + n1 * NCache1 + n * TN); #else joint_matrix_store( sg, tC[m][n], - pC + (m2 * MCACHE2 + m1 * MCACHE1 + m * tM) * colsB + - (n2 * NCACHE2 + n1 * NCACHE1 + n * tN), + pC + (m2 * MCache2 + m1 * MCache1 + m * TM) * colsB + + (n2 * NCache2 + n1 * NCache1 + n * TN), colsB, layout::row_major); #endif #ifdef MANUAL_UNROLL @@ -305,60 +229,46 @@ double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) { return duration.count(); } -void fill_matrix(bfloat16 *M) { - std::random_device dev; - std::uniform_real_distribution fdistr(-1.0, 1.0); - for (unsigned int i = 0; i < MATRIX_SIZE; i++) { - for (unsigned int j = 0; j < MATRIX_SIZE; j++) { - M[i * MATRIX_SIZE + j] = bfloat16(fdistr(dev)); - } - } -} - -void native_matmul(bfloat16 *A, bfloat16 *B, float *C) { - memset(C, 0, sizeof(float) * MATRIX_SIZE * MATRIX_SIZE); - for (unsigned int i = 0; i < MATRIX_SIZE; i++) { - for (unsigned int k = 0; k < MATRIX_SIZE; k++) { - for (unsigned int j = 0; j < MATRIX_SIZE; j++) { - C[i * MATRIX_SIZE + j] += make_fp32(A[i * MATRIX_SIZE + k]) * - make_fp32(B[k * MATRIX_SIZE + j]); - } - } - } -} - -int main(void) { - assert(MATRIX_SIZE >= tM && MATRIX_SIZE >= tK && MATRIX_SIZE >= tN && +template +void test() { + assert(MATRIX_SIZE >= TM && MATRIX_SIZE >= TK && MATRIX_SIZE >= TN && "invalid matrix size"); - 
assert((MATRIX_SIZE % tM) == 0 && (MATRIX_SIZE % tN) == 0 && - (MATRIX_SIZE % tK) == 0 && - "invalid matrix size detected: not a multiple of "); + assert((MATRIX_SIZE % TM) == 0 && (MATRIX_SIZE % TN) == 0 && + (MATRIX_SIZE % TK) == 0 && + "invalid matrix size detected: not a multiple of "); + + std::cout << "Testing: " << TM << " x " << TN << " x " << TK + << " [TM x TN x TK]" << std::endl; queue q; - bfloat16 *A = malloc_shared(MATRIX_SIZE * MATRIX_SIZE, q); - bfloat16 *B = malloc_shared(MATRIX_SIZE * MATRIX_SIZE, q); - bfloat16 *vnniB = malloc_shared(MATRIX_SIZE * MATRIX_SIZE, q); - float *C = malloc_shared(MATRIX_SIZE * MATRIX_SIZE, q); - float *refC = malloc_shared(MATRIX_SIZE * MATRIX_SIZE, q); - - // Initialize; fill matrices - fill_matrix(A); - fill_matrix(B); - matrix_vnni(MATRIX_SIZE, MATRIX_SIZE, B, vnniB, 2); - native_matmul(A, B, refC); + T *A = malloc_shared(MATRIX_SIZE * MATRIX_SIZE, q); + T *B = malloc_shared(MATRIX_SIZE * MATRIX_SIZE, q); + T *vnniB = malloc_shared(MATRIX_SIZE * MATRIX_SIZE, q); + TResult *C = malloc_shared(MATRIX_SIZE * MATRIX_SIZE, q); + TResult *refC = malloc_shared(MATRIX_SIZE * MATRIX_SIZE, q); + + matrix_rand(MATRIX_SIZE, MATRIX_SIZE, A, T(1)); + matrix_rand(MATRIX_SIZE, MATRIX_SIZE, B, T(1)); + matrix_vnni(MATRIX_SIZE, MATRIX_SIZE, B, vnniB, VNNI); + + matrix_multiply_ref(A, B, refC, MATRIX_SIZE, MATRIX_SIZE, + MATRIX_SIZE); // run testIterations time, aggregate and calculate average run time double totalDuration = 0; for (unsigned int i = 0; i < testIterations; i++) { double duration = - joint_matmul(A, vnniB, C, q, i); + joint_matmul(A, vnniB, C, q, i); if (i >= recordThresh) { totalDuration += duration; } } - bool result = matrix_compare(MATRIX_SIZE, MATRIX_SIZE, C, refC); + assert(matrix_compare(MATRIX_SIZE, MATRIX_SIZE, C, refC)); double msecPerMatrixMul = totalDuration / static_cast(testIterations - recordThresh); @@ -373,6 +283,55 @@ int main(void) { free(vnniB, q); free(C, q); free(refC, q); +} + +int main() { + queue 
q; + std::vector combinations = + q.get_device() + .get_info(); - return !result; + constexpr size_t MCache1 = 32; + constexpr size_t MCache2 = 256; + constexpr size_t NCache2 = 256; + constexpr size_t KCache2 = 32; + + for (unsigned int i = 0; i < combinations.size(); i++) { + if (combinations[i].nsize == 0) { // Intel AMX + constexpr size_t NCache1 = 32; + constexpr size_t KCache1 = 32; + + test(); + break; + } + + if (combinations[i].nsize == 16) { // architecture::intel_gpu_pvc + constexpr size_t NCache1 = 4 * /*TN*/ 16; + constexpr size_t KCache1 = 16; + + test(); +#if (!defined(SG_SZ) || SG_SZ != 32) + // These combination are not currently supported for subgroup size = 32 in + // IGC + test(); + test(); +#endif + break; + } + + if (combinations[i].nsize == 8) { // architecture::intel_gpu_dg2* + constexpr size_t NCache1 = 4 * /*TN*/ 8; + constexpr size_t KCache1 = 16; + + test(); + break; + } + } + return 0; } diff --git a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_init.cpp b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_init.cpp index d839f3db8f481..0770e7881edc7 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_init.cpp +++ b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_init.cpp @@ -5,7 +5,7 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: matrix, gpu +// REQUIRES: aspect-ext_intel_matrix, gpu // RUN: %{build} -o %t.out -DINIT_LIST -ffp-model=precise // RUN: %{run} %t.out @@ -13,8 +13,4 @@ // -ffp-model=precise is added to not depend on compiler defaults. 
#include "common.hpp" -#include - -constexpr size_t TN = 16; - #include "joint_matrix_bf16_fill_k_cache_impl.hpp" diff --git a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_unroll.cpp b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_unroll.cpp index 1800901e24111..4f5616d7e7f4f 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_unroll.cpp +++ b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_unroll.cpp @@ -5,21 +5,14 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: matrix +// REQUIRES: aspect-ext_intel_matrix -// RUN: %{build} -mllvm -inline-threshold=2000 -ffp-model=precise -o %t_gpu.out -DMANUAL_UNROLL -// RUN: %if gpu %{ %{run} %t_gpu.out %} - -// RUN: %{build} -mllvm -inline-threshold=2000 -ffp-model=precise -o %t_cpu.out -DMANUAL_UNROLL -DtM=16 -DtK=32 -DNCACHE1=32 -DKCACHE1=32 -// RUN: %if cpu %{ %{run} %t_cpu.out %} +// RUN: %{build} -mllvm -inline-threshold=2000 -ffp-model=precise -o %t.out -DMANUAL_UNROLL +// RUN: %{run} %t.out // -mllvm -inline-threshold=2000 added as a workaround, // since IGC doesn't support some variants of IR for Joint Matrix currently // -ffp-model=precise is added to not depend on compiler defaults. 
#include "common.hpp" -#include - -constexpr size_t TN = 16; - #include "joint_matrix_bf16_fill_k_cache_impl.hpp" diff --git a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_unroll_init.cpp b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_unroll_init.cpp index 701c17741f576..ff4c29251200d 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_unroll_init.cpp +++ b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_unroll_init.cpp @@ -5,7 +5,7 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: matrix, gpu +// REQUIRES: aspect-ext_intel_matrix, gpu // RUN: %{build} -mllvm -inline-threshold=2000 -ffp-model=precise -o %t_gpu.out -DINIT_LIST -DMANUAL_UNROLL // RUN: %{run} %t_gpu.out @@ -15,8 +15,4 @@ // -ffp-model=precise is added to not depend on compiler defaults. #include "common.hpp" -#include - -constexpr size_t TN = 16; - #include "joint_matrix_bf16_fill_k_cache_impl.hpp" From 09c93842ffe51602e118504e4e3229d41b2a4fb2 Mon Sep 17 00:00:00 2001 From: Guo Yejun Date: Thu, 6 Jun 2024 15:45:23 +0800 Subject: [PATCH 19/55] [SYCL][Graph] Clarify graph enable_profiling property in finalize() (#14067) Co-authored-by: Ewan Crawford --- .../extensions/experimental/sycl_ext_oneapi_graph.asciidoc | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc index 961f87462af6c..77fab2ebe5fb1 100644 --- a/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc +++ b/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc @@ -1047,9 +1047,11 @@ Constraints: Parameters: -* `propList` - Optional parameter for passing properties. The only property - that is valid to pass here is `property::graph::updatable`, to enable the +* `propList` - Optional parameter for passing properties. 
Two properties + are valid to pass here. One is `property::graph::updatable` to enable the returned executable graph to be <>. + The other is <> + to enable profiling events returned from submissions of the executable graph. Returns: A new executable graph object which can be submitted to a queue. From 97ed50d364d7749f209416c01aee4e4e0ecc245e Mon Sep 17 00:00:00 2001 From: Alexey Sachkov Date: Thu, 6 Jun 2024 10:23:48 +0200 Subject: [PATCH 20/55] [SYCL] Force-emit more member functions into device code (#13985) By some reason, we used to only emit unused member functions if they are explicitly annotated with `sycl_device` attribute (through `SYCL_EXTERNAL` macro). This logic was introduced in 3baec1873322778f88446cd22a67b6594cf57968 and there is no clear indication as to why exactly we have a check that the attribute is explicit. SYCL extension for virtual functions introduces an alternative markup for specifying which function and that markup is SYCL compile-time properties that we turn into attributes implicitly under the hood. Essentially, we now have a situation where an implicit `sycl_device` attribute on a member function should be treated as an explicit one, because it could be a result of SYCL compile-time property being applied to that method. Considering our current codebase, it seems like we intend to have member function to be emitted in all cases where `sycl_device` is being implicitly added and therefore this patch removes the requirement for the attribute to be explicit. 
--- clang/lib/AST/ASTContext.cpp | 8 ++-- .../force-emit-device-virtual-funcs.cpp | 47 +++++++++++++++++++ 2 files changed, 51 insertions(+), 4 deletions(-) create mode 100644 clang/test/CodeGenSYCL/force-emit-device-virtual-funcs.cpp diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp index 0777046f145bb..654454387e22c 100644 --- a/clang/lib/AST/ASTContext.cpp +++ b/clang/lib/AST/ASTContext.cpp @@ -12035,10 +12035,10 @@ bool ASTContext::DeclMustBeEmitted(const Decl *D) { // or `indirectly_callable' attribute must be emitted regardless of number // of actual uses if (LangOpts.SYCLIsDevice && isa(D)) { - if (auto *A = D->getAttr()) - return !A->isImplicit(); - if (auto *A = D->getAttr()) - return !A->isImplicit(); + if (D->hasAttr()) + return true; + if (D->hasAttr()) + return true; } GVALinkage Linkage = GetGVALinkageForFunction(FD); diff --git a/clang/test/CodeGenSYCL/force-emit-device-virtual-funcs.cpp b/clang/test/CodeGenSYCL/force-emit-device-virtual-funcs.cpp new file mode 100644 index 0000000000000..fb34cae42d9ae --- /dev/null +++ b/clang/test/CodeGenSYCL/force-emit-device-virtual-funcs.cpp @@ -0,0 +1,47 @@ +// RUN: %clang_cc1 -internal-isystem %S/Inputs -triple spir64-unknown-unknown -fsycl-is-device \ +// RUN: -fsycl-allow-virtual-functions -emit-llvm %s -o %t.ll +// RUN: FileCheck %s --input-file=%t.ll --implicit-check-not _ZN7Derived3baz \ +// RUN: --implicit-check-not _ZN4Base4baz --implicit-check-not _ZN4Base3foo +// +// Some SYCL properties may be turned into 'sycl_device' attribute implicitly +// and we would like to ensure that functions like this (at the moment those +// would be virtual member functions only) are forcefully emitted into device +// code. 
+ +class Base { + virtual void foo() {} + + virtual void baz(); + + [[__sycl_detail__::add_ir_attributes_function("indirectly-callable", "a")]] + virtual void bar(); +}; + +void Base::bar() {} + +void Base::baz() {} + +class Derived : public Base { +public: + [[__sycl_detail__::add_ir_attributes_function("indirectly-callable", "b")]] + void foo() override; + + [[__sycl_detail__::add_ir_attributes_function("indirectly-callable", "c")]] + void bar() override final; + + [[__sycl_detail__::add_ir_attributes_function("not-indirectly-callable", "c")]] + void baz() override final; +}; + +void Derived::foo() {} + +void Derived::bar() {} + +void Derived::baz() {} + +// CHECK: define {{.*}}spir_func void @_ZN4Base3bar{{.*}} #[[#AttrA:]] +// CHECK: define {{.*}}spir_func void @_ZN7Derived3foo{{.*}} #[[#AttrB:]] +// CHECK: define {{.*}}spir_func void @_ZN7Derived3bar{{.*}} #[[#AttrC:]] +// CHECK: attributes #[[#AttrA]] = {{.*}} "indirectly-callable"="a" +// CHECK: attributes #[[#AttrB]] = {{.*}} "indirectly-callable"="b" +// CHECK: attributes #[[#AttrC]] = {{.*}} "indirectly-callable"="c" From 71a5e373ddd865a498eeb5d32adb0b83b679b38a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alberto=20Cabrera=20P=C3=A9rez?= Date: Thu, 6 Jun 2024 10:19:37 +0100 Subject: [PATCH 21/55] [SYCL][COMPAT] Added filter_device and list_devices (#14016) This PR adds functionalities for: * Listing devices in stdout * Filtering devices Tests and docs updated accordingly. 
--------- Signed-off-by: Alberto Cabrera Co-authored-by: Joe Todd --- sycl/doc/syclcompat/README.md | 15 ++++ sycl/include/syclcompat/device.hpp | 70 ++++++++++++++++- sycl/test-e2e/syclcompat/device/device.cpp | 19 +++++ .../syclcompat/device/device_filter.cpp | 78 +++++++++++++++++++ .../syclcompat/device/device_fixt.hpp | 29 +++++++ 5 files changed, 209 insertions(+), 2 deletions(-) create mode 100644 sycl/test-e2e/syclcompat/device/device_filter.cpp diff --git a/sycl/doc/syclcompat/README.md b/sycl/doc/syclcompat/README.md index 6b776cfb777b3..3ffde6f224493 100644 --- a/sycl/doc/syclcompat/README.md +++ b/sycl/doc/syclcompat/README.md @@ -844,6 +844,15 @@ static inline sycl::context get_default_context(); // Util function to get a CPU device. static inline device_ext &cpu_device(); +/// Filter out devices; only keep the device whose name contains one of the +/// subname in \p dev_subnames. +/// May break device id mapping and change current device. It's better to be +/// called before other SYCLcompat or SYCL APIs. +static inline void filter_device(const std::vector &dev_subnames); + +/// Print all the devices (and their IDs) in the dev_mgr +static inline void list_devices(); + // Util function to select a device by its id static inline unsigned int select_device(unsigned int id); @@ -868,6 +877,12 @@ can be queried through `device_ext` as well. throws a `sycl::exception` if the device does not have the specified list of `sycl::aspect`. +Devices can be listed and filtered using `syclcompat::list_devices()` and +`syclcompat::filter_device()`. If `SYCLCOMPAT_VERBOSE` is defined at compile +time, the available SYCL devices are printed to the standard output both at +initialization time, and when the device list is filtered using +`syclcompat::filter_device`. + Users can manage queues through the `syclcompat::set_default_queue(sycl::queue q)` free function, and the `device_ext` `set_saved_queue`, `set_default_queue`, and `get_saved_queue` member functions. 
diff --git a/sycl/include/syclcompat/device.hpp b/sycl/include/syclcompat/device.hpp index 399efbd8b8933..080fac3ef5275 100644 --- a/sycl/include/syclcompat/device.hpp +++ b/sycl/include/syclcompat/device.hpp @@ -726,14 +726,64 @@ class dev_mgr { unsigned int device_count() { return _devs.size(); } unsigned int get_device_id(const sycl::device &dev) { + if (!_devs.size()) { + throw std::runtime_error( + "[SYCLcompat] No SYCL devices found in the device list. Device list " + "may have been filtered by syclcompat::filter_device"); + } unsigned int id = 0; for (auto dev_item : _devs) { if (*dev_item == dev) { - break; + return id; } id++; } - return id; + throw std::runtime_error("[SYCLcompat] The device[" + + dev.get_info() + + "] is filtered out by syclcompat::filter_device " + "in current device list!"); + } + + /// List all the devices with its id in dev_mgr. + void list_devices() const { + for (size_t i = 0; i < _devs.size(); ++i) { + std::cout << "Device " << i << ": " + << _devs[i]->get_info() << std::endl; + } + } + + /// Filter out devices; only keep the device whose name contains one of the + /// subname in \p dev_subnames. + /// May break device id mapping and change current device. It's better to be + /// called before other SYCLcompat/SYCL APIs. 
+ void filter(const std::vector &dev_subnames) { + std::lock_guard lock(m_mutex); + auto iter = _devs.begin(); + while (iter != _devs.end()) { + std::string dev_name = (*iter)->get_info(); + bool matched = false; + for (const auto &name : dev_subnames) { + if (dev_name.find(name) != std::string::npos) { + matched = true; + break; + } + } + if (matched) + ++iter; + else + iter = _devs.erase(iter); + } + _cpu_device = -1; + for (unsigned i = 0; i < _devs.size(); ++i) { + if (_devs[i]->is_cpu()) { + _cpu_device = i; + break; + } + } + _thread2dev_map.clear(); +#ifdef SYCLCOMPAT_VERBOSE + list_devices(); +#endif } /// Select device with a Device Selector @@ -779,6 +829,9 @@ class dev_mgr { _cpu_device = _devs.size() - 1; } } +#ifdef SYCLCOMPAT_VERBOSE + list_devices(); +#endif } void check_id(unsigned int id) const { if (id >= _devs.size()) { @@ -853,6 +906,19 @@ static inline device_ext &cpu_device() { return detail::dev_mgr::instance().cpu_device(); } +/// Filter out devices; only keep the device whose name contains one of the +/// subname in \p dev_subnames. +/// May break device id mapping and change current device. It's better to be +/// called before other SYCLcompat or SYCL APIs. +static inline void filter_device(const std::vector &dev_subnames) { + detail::dev_mgr::instance().filter(dev_subnames); +} + +/// List all the devices with its id in dev_mgr. 
+static inline void list_devices() { + detail::dev_mgr::instance().list_devices(); +} + static inline unsigned int select_device(unsigned int id) { detail::dev_mgr::instance().select_device(id); return id; diff --git a/sycl/test-e2e/syclcompat/device/device.cpp b/sycl/test-e2e/syclcompat/device/device.cpp index 9e4c8edcd91c9..0845859c5d55a 100644 --- a/sycl/test-e2e/syclcompat/device/device.cpp +++ b/sycl/test-e2e/syclcompat/device/device.cpp @@ -359,6 +359,24 @@ void test_max_nd_range() { #endif } +void test_list_devices() { + std::cout << __PRETTY_FUNCTION__ << std::endl; + DeviceTestsFixt dtf; + + // Redirect std::cout to count new lines + CountingStream countingBuf(std::cout.rdbuf()); + std::streambuf *orig_buf = std::cout.rdbuf(); + std::cout.rdbuf(&countingBuf); + + syclcompat::list_devices(); + + // Restore back std::cout + std::cout.rdbuf(orig_buf); + + // Expected one line per device + assert(countingBuf.get_line_count() == dtf.get_n_devices()); +} + int main() { test_at_least_one_device(); test_matches_id(); @@ -377,6 +395,7 @@ int main() { test_version_parsing(); test_image_max_attrs(); test_max_nd_range(); + test_list_devices(); return 0; } diff --git a/sycl/test-e2e/syclcompat/device/device_filter.cpp b/sycl/test-e2e/syclcompat/device/device_filter.cpp new file mode 100644 index 0000000000000..3f03432401b0a --- /dev/null +++ b/sycl/test-e2e/syclcompat/device/device_filter.cpp @@ -0,0 +1,78 @@ +/*************************************************************************** + * + * Copyright (C) Codeplay Software Ltd. + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM + * Exceptions. See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + * + * SYCLcompat API + * + * device_filter.cpp + * + * Description: + * Device filtering tests + **************************************************************************/ + +// RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple} %s -o %t.out +// RUN: %{run} %t.out + +#include + +void test_filtering_existing_device() { + auto &dev = syclcompat::get_current_device(); + std::string dev_name = dev.get_info(); + + syclcompat::filter_device({dev_name}); + try { + syclcompat::get_device_id(dev); + } catch (std::runtime_error const &e) { + std::cout << " Unexpected SYCL exception caught: " << e.what() + << std::endl; + assert(0); + } + + // Checks for a substring of the device as well + std::string dev_substr = dev_name.substr(1, dev_name.find(" ") + 2); + syclcompat::filter_device({dev_substr}); + try { + syclcompat::get_device_id(dev); + } catch (std::runtime_error const &e) { + std::cout << " Unexpected SYCL exception caught: " << e.what() + << std::endl; + assert(0); + } +} + +void test_filter_devices() { + auto &dev = syclcompat::get_current_device(); + + assert(syclcompat::detail::dev_mgr::instance().device_count() > 0); + + syclcompat::filter_device({"NON-EXISTENT DEVICE"}); + assert(syclcompat::detail::dev_mgr::instance().device_count() == 0); + + try { + syclcompat::get_device_id(dev); + assert(0); + } catch (std::runtime_error const &e) { + std::cout << " Expected SYCL exception caught: " << e.what() << std::endl; + } +} + +int main() { + // syclcompat::dev_mgr is a singleton, so any changes to the device list is + // permanent between tests. Test isolated instead of relying on it being the + // last test in a different test suite. 
+ test_filtering_existing_device(); + + test_filter_devices(); + + return 0; +} diff --git a/sycl/test-e2e/syclcompat/device/device_fixt.hpp b/sycl/test-e2e/syclcompat/device/device_fixt.hpp index 3a68eaf2317f1..ac0cc867a08d9 100644 --- a/sycl/test-e2e/syclcompat/device/device_fixt.hpp +++ b/sycl/test-e2e/syclcompat/device/device_fixt.hpp @@ -50,3 +50,32 @@ class DeviceExtFixt { syclcompat::device_ext &get_dev_ext() { return dev_; } }; + +// Helper for counting the output lines of syclcompat::list_devices +// Used to override std::cout +class CountingStream : public std::streambuf { +public: + CountingStream(std::streambuf *buf) : buf(buf), line_count(0) {} + + int overflow(int c) override { + if (c == '\n') { + ++line_count; + } + return buf->sputc(c); + } + + std::streamsize xsputn(const char_type *s, std::streamsize count) override { + for (std::streamsize i = 0; i < count; ++i) { + if (s[i] == '\n') { + ++line_count; + } + } + return buf->sputn(s, count); + } + + int get_line_count() const { return line_count; } + +private: + std::streambuf *buf; + int line_count; +}; From 022433525b3572d66c7e5b0d35757144631a4f1c Mon Sep 17 00:00:00 2001 From: AidanBeltonS <87009434+AidanBeltonS@users.noreply.github.com> Date: Thu, 6 Jun 2024 10:59:28 +0100 Subject: [PATCH 22/55] [SYCL][COMPAT] Add wait_and_free plus rename async_free in syclcompat (#14015) This PR adds a `wait_and_free` func. This makes it safer and less likely to release memory during or before it is used by enqueued commands. 
`async_free` is renamed `enqueue_free`, to make its behaviour clearer This PR updates the comments and tests accordingly --- sycl/doc/syclcompat/README.md | 10 +++--- sycl/include/syclcompat/device.hpp | 5 +-- sycl/include/syclcompat/memory.hpp | 32 +++++++++++++------ .../syclcompat/memory/memory_async.cpp | 5 +-- .../memory/memory_management_test3.cpp | 26 +++++++++++++++ 5 files changed, 61 insertions(+), 17 deletions(-) diff --git a/sycl/doc/syclcompat/README.md b/sycl/doc/syclcompat/README.md index 3ffde6f224493..4bdcb93206f46 100644 --- a/sycl/doc/syclcompat/README.md +++ b/sycl/doc/syclcompat/README.md @@ -489,10 +489,12 @@ sycl::event memset_async(pitched_data pitch, int val, sycl::range<3> size, sycl::queue q = get_default_queue()); // 3D matrix +// Free +void wait_and_free(void *ptr, sycl::queue q = get_default_queue()); void free(void *ptr, sycl::queue q = get_default_queue()); -sycl::event free_async(const std::vector &pointers, - const std::vector &events, - sycl::queue q = get_default_queue()); +sycl::event enqueue_free(const std::vector &pointers, + const std::vector &events, + sycl::queue q = get_default_queue()); // Queries pointer allocation type class pointer_attributes { @@ -870,7 +872,7 @@ independently of what is set in this parameter. Devices are managed through a helper class, `device_ext`. The `device_ext` class associates a vector of `sycl::queues` with its `sycl::device`. The `device_ext` destructor waits on a set of `sycl::event` which can be added to via -`add_event`. This is used, for example, to implement `syclcompat::free_async` to +`add_event`. This is used, for example, to implement `syclcompat::enqueue_free` to schedule release of memory after a kernel or `mempcy`. SYCL device properties can be queried through `device_ext` as well. 
`device_ext` also provides the `has_capability_or_fail` member function, which diff --git a/sycl/include/syclcompat/device.hpp b/sycl/include/syclcompat/device.hpp index 080fac3ef5275..e85785271ad76 100644 --- a/sycl/include/syclcompat/device.hpp +++ b/sycl/include/syclcompat/device.hpp @@ -664,8 +664,9 @@ Use 64 bits as memory_bus_width default value." std::lock_guard lock(m_mutex); _events.push_back(event); } - friend sycl::event free_async(const std::vector &, - const std::vector &, sycl::queue); + friend sycl::event enqueue_free(const std::vector &, + const std::vector &, + sycl::queue); queue_ptr _default_queue; queue_ptr _saved_queue; sycl::context _ctx; diff --git a/sycl/include/syclcompat/memory.hpp b/sycl/include/syclcompat/memory.hpp index 5b578825b02ba..a8f7e89c52ab6 100644 --- a/sycl/include/syclcompat/memory.hpp +++ b/sycl/include/syclcompat/memory.hpp @@ -531,26 +531,40 @@ static inline void *malloc(size_t &pitch, size_t x, size_t y, return detail::malloc(pitch, x, y, 1, q); } -/// free +/// Wait on the queue \p q and free the memory \p ptr. /// \param ptr Point to free. /// \param q Queue to execute the free task. /// \returns no return value. +static inline void wait_and_free(void *ptr, + sycl::queue q = get_default_queue()) { + get_current_device().queues_wait_and_throw(); + q.wait(); + if (ptr) { + sycl::free(ptr, q); + } +} + +/// Free the memory \p ptr on the default queue without synchronizing +/// \param ptr Point to free. +/// \returns no return value. static inline void free(void *ptr, sycl::queue q = get_default_queue()) { if (ptr) { - sycl::free(ptr, q.get_context()); + sycl::free(ptr, q); } } -/// Free the device memory pointed by a batch of pointers in \p pointers which -/// are related to \p q after \p events completed. +/// Enqueues the release of all pointers in /p pointers on the /p q. +/// The command waits on all passed /p events and returns an event that +/// track the commands execution on the queue. 
/// /// \param pointers The pointers point to the device memory requested to be -/// freed. \param events The events to be waited. \param q The sycl::queue the -/// memory relates to. +/// freed. +/// \param events The events to be waited on. +/// \param q The sycl::queue the memory relates to. // Can't be static due to the friend declaration in the memory header. -inline sycl::event free_async(const std::vector &pointers, - const std::vector &events, - sycl::queue q = get_default_queue()) { +inline sycl::event enqueue_free(const std::vector &pointers, + const std::vector &events, + sycl::queue q = get_default_queue()) { auto event = q.submit( [&pointers, &events, ctxt = q.get_context()](sycl::handler &cgh) { cgh.depends_on(events); diff --git a/sycl/test-e2e/syclcompat/memory/memory_async.cpp b/sycl/test-e2e/syclcompat/memory/memory_async.cpp index 3eb4123014497..b2ce1f9c8304f 100644 --- a/sycl/test-e2e/syclcompat/memory/memory_async.cpp +++ b/sycl/test-e2e/syclcompat/memory/memory_async.cpp @@ -43,14 +43,15 @@ #include "memory_fixt.hpp" -// free_async is a host task, so we are really testing the event dependency here +// enqueue_free is just a host task, so we are really testing the event +// dependency here void test_free_async() { std::cout << __PRETTY_FUNCTION__ << std::endl; AsyncTest atest; float *d_D = (float *)syclcompat::malloc(sizeof(float)); sycl::event kernel_ev = atest.launch_kernel(); - sycl::event free_ev = syclcompat::free_async({d_D}, {kernel_ev}); + sycl::event free_ev = syclcompat::enqueue_free({d_D}, {kernel_ev}); atest.check_events(kernel_ev, free_ev); } diff --git a/sycl/test-e2e/syclcompat/memory/memory_management_test3.cpp b/sycl/test-e2e/syclcompat/memory/memory_management_test3.cpp index ee0c5fc146d59..24f56d3105ee5 100644 --- a/sycl/test-e2e/syclcompat/memory/memory_management_test3.cpp +++ b/sycl/test-e2e/syclcompat/memory/memory_management_test3.cpp @@ -65,6 +65,30 @@ void test_free_memory_q() { syclcompat::free(nullptr, q); } +void 
test_wait_and_free_memory() { + std::cout << __PRETTY_FUNCTION__ << std::endl; + + sycl::queue q{{sycl::property::queue::in_order()}}; + float *d_A = (float *)syclcompat::malloc(sizeof(float), q); + syclcompat::wait_and_free((void *)d_A); + + syclcompat::wait_and_free(0); + syclcompat::wait_and_free(NULL); + syclcompat::wait_and_free(nullptr); +} + +void test_wait_and_free_memory_q() { + std::cout << __PRETTY_FUNCTION__ << std::endl; + + sycl::queue q{{sycl::property::queue::in_order()}}; + float *d_A = (float *)syclcompat::malloc(sizeof(float), q); + syclcompat::wait_and_free((void *)d_A, q); + + syclcompat::wait_and_free(0, q); + syclcompat::wait_and_free(NULL, q); + syclcompat::wait_and_free(nullptr, q); +} + void test_memcpy_async() { std::cout << __PRETTY_FUNCTION__ << std::endl; @@ -662,6 +686,8 @@ void test_constant_memcpy_async_q() { int main() { test_free_memory(); test_free_memory_q(); + test_wait_and_free_memory(); + test_wait_and_free_memory_q(); test_memcpy_async(); test_memcpy_async_q(); test_memcpy_async_pitched(); From 2de1435e57db805227c9e2cc5f3bfc53c4e1ef0b Mon Sep 17 00:00:00 2001 From: mmoadeli Date: Thu, 6 Jun 2024 11:05:48 +0100 Subject: [PATCH 23/55] [SYCL][TEST-E2E] Disallow `dep_events.cpp` test built for CUDA backend to run on Windows (#13957) [Windows doesn't support cudaMemPrefetchAsync()](https://github.com/TimDettmers/bitsandbytes/issues/453) which is used in the call to `prefetch` in the test. [urEnqueueUSMPrefetch](https://github.com/oneapi-src/unified-runtime/blob/c0c607c3a88933b4c5c20a0aca4539781c678411/source/adapters/cuda/enqueue.cpp#L1629) is also commented with a note for not having the support for CUDA on Windows. 
--- sycl/test-e2e/USM/dep_events.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/sycl/test-e2e/USM/dep_events.cpp b/sycl/test-e2e/USM/dep_events.cpp index 20ee05a309c7a..01f80564bc144 100644 --- a/sycl/test-e2e/USM/dep_events.cpp +++ b/sycl/test-e2e/USM/dep_events.cpp @@ -6,6 +6,7 @@ // //===----------------------------------------------------------------------===// // REQUIRES: aspect-usm_shared_allocations +// UNSUPPORTED: cuda && windows // RUN: %{build} -o %t1.out // RUN: %{run} %t1.out From f4829ab15d53f9a6a7227591826aab98ce2c19bf Mon Sep 17 00:00:00 2001 From: JackAKirk Date: Thu, 6 Jun 2024 13:05:21 +0100 Subject: [PATCH 24/55] [CI] pre-commit/aws pointed back to old image. (#14074) temp fix for problems from cuda 12.5 uplift that were caused by https://github.com/intel/llvm/pull/14049. Should fix https://github.com/intel/llvm/issues/14071 --------- Signed-off-by: JackAKirk --- .github/workflows/sycl-linux-precommit-aws.yml | 2 +- .github/workflows/sycl-linux-precommit.yml | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/sycl-linux-precommit-aws.yml b/.github/workflows/sycl-linux-precommit-aws.yml index f7fe4cad3ea96..4117b3465a3e3 100644 --- a/.github/workflows/sycl-linux-precommit-aws.yml +++ b/.github/workflows/sycl-linux-precommit-aws.yml @@ -64,7 +64,7 @@ jobs: with: name: CUDA E2E runner: '["aws_cuda-${{ github.event.workflow_run.id }}-${{ github.event.workflow_run.run_attempt }}"]' - image: ghcr.io/intel/llvm/ubuntu2204_build:latest + image: ghcr.io/intel/llvm/ubuntu2204_build:latest-0300ac924620a51f76c4929794637b82790f12ab image_options: -u 1001 --gpus all --cap-add SYS_ADMIN --env NVIDIA_DISABLE_REQUIRE=1 target_devices: ext_oneapi_cuda:gpu # No idea why but that seems to work and be in sync with the main diff --git a/.github/workflows/sycl-linux-precommit.yml b/.github/workflows/sycl-linux-precommit.yml index f6e31541b7188..19d106fa23675 100644 --- a/.github/workflows/sycl-linux-precommit.yml +++ 
b/.github/workflows/sycl-linux-precommit.yml @@ -46,6 +46,7 @@ jobs: build_artifact_suffix: "default" build_cache_suffix: "default" changes: ${{ needs.detect_changes.outputs.filters }} + build_image: "ghcr.io/intel/llvm/ubuntu2204_build:latest-0300ac924620a51f76c4929794637b82790f12ab" determine_arc_tests: name: Decide which Arc tests to run @@ -77,7 +78,7 @@ jobs: include: - name: AMD/HIP runner: '["Linux", "amdgpu"]' - image: ghcr.io/intel/llvm/ubuntu2204_build:latest + image: ghcr.io/intel/llvm/ubuntu2204_build:latest-0300ac924620a51f76c4929794637b82790f12ab image_options: -u 1001 --device=/dev/dri --device=/dev/kfd target_devices: ext_oneapi_hip:gpu - name: Intel From 33ea75ef8e3f18cccc1bbe72fd48ada8ee32d81e Mon Sep 17 00:00:00 2001 From: mmoadeli Date: Thu, 6 Jun 2024 14:22:02 +0100 Subject: [PATCH 25/55] [SYCL][TEST-E2E] Refactor the test to address Windows not printing the exception message (#14055) - C++ thrown exception message not shown when running from Windows terminal. - The patch fixes [cuda-max-local-mem-size.cpp](https://github.com/intel/llvm/blob/sycl/sycl/test-e2e/Plugin/cuda-max-local-mem-size.cpp) test failure. 
--- .../Plugin/cuda-max-local-mem-size.cpp | 26 +++++++++++-------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/sycl/test-e2e/Plugin/cuda-max-local-mem-size.cpp b/sycl/test-e2e/Plugin/cuda-max-local-mem-size.cpp index 5e31a461379ee..aefbbd99a685f 100644 --- a/sycl/test-e2e/Plugin/cuda-max-local-mem-size.cpp +++ b/sycl/test-e2e/Plugin/cuda-max-local-mem-size.cpp @@ -1,8 +1,8 @@ // REQUIRES: cuda // RUN: %{build} -o %t.out -// RUN: not %{run} SYCL_PI_CUDA_MAX_LOCAL_MEM_SIZE=0 %t.out 2>&1 | FileCheck --check-prefixes=CHECK-ZERO %s -// RUN: not %{run} SYCL_PI_CUDA_MAX_LOCAL_MEM_SIZE=100000000 %t.out 2>&1 | FileCheck --check-prefixes=CHECK-OVERALLOCATE %s +// RUN: %{run} SYCL_PI_CUDA_MAX_LOCAL_MEM_SIZE=0 %t.out 2>&1 | FileCheck --check-prefixes=CHECK-ZERO %s +// RUN: %{run} SYCL_PI_CUDA_MAX_LOCAL_MEM_SIZE=100000000 %t.out 2>&1 | FileCheck --check-prefixes=CHECK-OVERALLOCATE %s //==---------------------- cuda-max-local-mem-size.cpp --------------------===// //==--- SYCL test to test SYCL_PI_CUDA_MAX_LOCAL_MEM_SIZE env var----------===// @@ -16,15 +16,19 @@ #include int main() { - sycl::queue Q{}; - auto LocalSize = - Q.get_device().get_info(); - Q.submit([&](sycl::handler &cgh) { - auto LocalAcc = sycl::local_accessor(LocalSize + 1, cgh); - cgh.parallel_for(sycl::nd_range<1>{32, 32}, [=](sycl::nd_item<1> idx) { - LocalAcc[idx.get_global_linear_id()] *= 2; - }); - }).wait(); + try { + sycl::queue Q{}; + auto LocalSize = + Q.get_device().get_info(); + Q.submit([&](sycl::handler &cgh) { + auto LocalAcc = sycl::local_accessor(LocalSize + 1, cgh); + cgh.parallel_for(sycl::nd_range<1>{32, 32}, [=](sycl::nd_item<1> idx) { + LocalAcc[idx.get_global_linear_id()] *= 2; + }); + }).wait(); + } catch (const std::exception &e) { + std::puts(e.what()); + } // CHECK-ZERO: Local memory for kernel exceeds the amount requested using SYCL_PI_CUDA_MAX_LOCAL_MEM_SIZE // CHECK-OVERALLOCATE: Excessive allocation of local memory on the device } From 
a12de3be91a03aea868c1f949d91cff2767d8097 Mon Sep 17 00:00:00 2001 From: mmoadeli Date: Thu, 6 Jun 2024 14:23:34 +0100 Subject: [PATCH 26/55] [SYCL][TEST-E2E] Extend `sycl-ls-gpu-default.cpp` test to cover Intel GPUs through OpenCL. (#14072) Extend the `sycl-ls-gpu-default.cpp` test to cover the support of Intel GPUs through OpenCL. The patch fixes the failure when running the test on a system with Intel and CUDA gpus. --- sycl/test-e2e/Plugin/sycl-ls-gpu-default-any.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sycl/test-e2e/Plugin/sycl-ls-gpu-default-any.cpp b/sycl/test-e2e/Plugin/sycl-ls-gpu-default-any.cpp index 373c95869ad02..298e12236e41e 100644 --- a/sycl/test-e2e/Plugin/sycl-ls-gpu-default-any.cpp +++ b/sycl/test-e2e/Plugin/sycl-ls-gpu-default-any.cpp @@ -4,9 +4,9 @@ // RUN: env --unset=SYCL_DEVICE_FILTER --unset=ONEAPI_DEVICE_SELECTOR sycl-ls --verbose >%t.default.out // RUN: FileCheck %s --check-prefixes=CHECK-GPU-BUILTIN,CHECK-GPU-CUSTOM --input-file %t.default.out -// CHECK-GPU-BUILTIN: gpu_selector(){{.*}}gpu, {{.*}}{{Level-Zero|CUDA}} +// CHECK-GPU-BUILTIN: gpu_selector(){{.*}}gpu, {{.*}}{{Level-Zero|CUDA|OpenCL}} // clang-format off -// CHECK-GPU-CUSTOM: custom_selector(gpu){{.*}}gpu, {{.*}}{{Level-Zero|CUDA}} +// CHECK-GPU-CUSTOM: custom_selector(gpu){{.*}}gpu, {{.*}}{{Level-Zero|CUDA|OpenCL}} // clang-format on //==--------------------- sycl-ls-gpu-default-any.cpp ----------------------==// From 463f00c8475ad23f8c301fdd06eb42b03d63a222 Mon Sep 17 00:00:00 2001 From: aelovikov-intel Date: Thu, 6 Jun 2024 09:29:44 -0700 Subject: [PATCH 27/55] [SYCL][E2E] Refactor/fix bfloat16 test (#14062) Fixes ansi-alias violation and reads from uninitialized buffers. Fixes https://github.com/intel/llvm/issues/13790. 
--- sycl/test-e2e/BFloat16/bfloat16_builtins.hpp | 379 +++++++------------ 1 file changed, 143 insertions(+), 236 deletions(-) diff --git a/sycl/test-e2e/BFloat16/bfloat16_builtins.hpp b/sycl/test-e2e/BFloat16/bfloat16_builtins.hpp index af08e8b5226e4..fa7a45fe2402c 100644 --- a/sycl/test-e2e/BFloat16/bfloat16_builtins.hpp +++ b/sycl/test-e2e/BFloat16/bfloat16_builtins.hpp @@ -16,8 +16,7 @@ constexpr float bf16_eps = 0.00390625; float make_fp32(uint16_t x) { uint32_t y = x; y = y << 16; - auto res = reinterpret_cast(&y); - return *res; + return sycl::bit_cast(y); } bool check(float a, float b) { @@ -26,217 +25,6 @@ bool check(float a, float b) { bool check(bool a, bool b) { return (a != b); } -#define TEST_BUILTIN_1_SCAL_IMPL(NAME) \ - { \ - buffer a_buf(&a[0], N); \ - buffer err_buf(&err, 1); \ - q.submit([&](handler &cgh) { \ - accessor A(a_buf, \ - cgh); \ - accessor ERR(err_buf, cgh); \ - cgh.parallel_for(N, [=](id<1> index) { \ - float ABF16 = float{bfloat16{A[index]}}; \ - if (check(sycl::ext::oneapi::experimental::NAME(bfloat16{A[index]}), \ - sycl::NAME(ABF16))) { \ - ERR[0] = 1; \ - } \ - }); \ - }); \ - } \ - assert(err == 0); - -#define TEST_BUILTIN_1_ARR_IMPL(NAME, SZ, RETTY) \ - { \ - buffer a_buf{range<2>{N / SZ, SZ}}; \ - buffer err_buf(&err, 1); \ - q.submit([&](handler &cgh) { \ - accessor A(a_buf, \ - cgh); \ - accessor ERR(err_buf, cgh); \ - cgh.parallel_for(N / SZ, [=](id<1> index) { \ - marray arg; \ - for (int i = 0; i < SZ; i++) { \ - arg[i] = A[index][i]; \ - } \ - marray res = NAME(arg); \ - for (int i = 0; i < SZ; i++) { \ - float ABF16 = float{bfloat16{A[index][i]}}; \ - if (check(res[i], sycl::NAME(ABF16))) { \ - ERR[0] = 1; \ - } \ - } \ - }); \ - }); \ - } \ - assert(err == 0); - -#define TEST_BUILTIN_1(NAME, RETTY) \ - TEST_BUILTIN_1_SCAL_IMPL(NAME) \ - TEST_BUILTIN_1_ARR_IMPL(NAME, 1, RETTY) \ - TEST_BUILTIN_1_ARR_IMPL(NAME, 2, RETTY) \ - TEST_BUILTIN_1_ARR_IMPL(NAME, 3, RETTY) \ - TEST_BUILTIN_1_ARR_IMPL(NAME, 4, RETTY) \ - 
TEST_BUILTIN_1_ARR_IMPL(NAME, 5, RETTY) - -#define TEST_BUILTIN_2_SCAL_IMPL(NAME) \ - { \ - buffer a_buf(&a[0], N); \ - buffer b_buf(&b[0], N); \ - buffer err_buf(&err, 1); \ - q.submit([&](handler &cgh) { \ - accessor A(a_buf, \ - cgh); \ - accessor B(b_buf, \ - cgh); \ - accessor ERR(err_buf, cgh); \ - cgh.parallel_for(N, [=](id<1> index) { \ - float ABF16 = float{bfloat16{A[index]}}; \ - float BBF16 = float{bfloat16{B[index]}}; \ - if (check(sycl::ext::oneapi::experimental::NAME(bfloat16{A[index]}, \ - bfloat16{B[index]}), \ - sycl::NAME(ABF16, BBF16))) { \ - ERR[0] = 1; \ - } \ - }); \ - }); \ - } \ - assert(err == 0); - -#define TEST_BUILTIN_2_ARR_IMPL(NAME, SZ) \ - { \ - buffer a_buf{range<2>{N / SZ, SZ}}; \ - buffer b_buf{range<2>{N / SZ, SZ}}; \ - buffer err_buf(&err, 1); \ - q.submit([&](handler &cgh) { \ - accessor A(a_buf, \ - cgh); \ - accessor B(b_buf, \ - cgh); \ - accessor ERR(err_buf, cgh); \ - cgh.parallel_for(N / SZ, [=](id<1> index) { \ - marray arg0, arg1; \ - for (int i = 0; i < SZ; i++) { \ - arg0[i] = A[index][i]; \ - arg1[i] = B[index][i]; \ - } \ - marray res = \ - sycl::ext::oneapi::experimental::NAME(arg0, arg1); \ - for (int i = 0; i < SZ; i++) { \ - float ABF16 = float{bfloat16{A[index][i]}}; \ - float BBF16 = float{bfloat16{B[index][i]}}; \ - if (check(res[i], sycl::NAME(ABF16, BBF16))) { \ - ERR[0] = 1; \ - } \ - } \ - }); \ - }); \ - } \ - assert(err == 0); - -#define TEST_BUILTIN_2(NAME) \ - TEST_BUILTIN_2_SCAL_IMPL(NAME) \ - TEST_BUILTIN_2_ARR_IMPL(NAME, 1) \ - TEST_BUILTIN_2_ARR_IMPL(NAME, 2) \ - TEST_BUILTIN_2_ARR_IMPL(NAME, 3) \ - TEST_BUILTIN_2_ARR_IMPL(NAME, 4) \ - TEST_BUILTIN_2_ARR_IMPL(NAME, 5) - -#define TEST_BUILTIN_3_SCAL_IMPL(NAME) \ - { \ - buffer a_buf(&a[0], N); \ - buffer b_buf(&b[0], N); \ - buffer c_buf(&c[0], N); \ - buffer err_buf(&err, 1); \ - q.submit([&](handler &cgh) { \ - accessor A(a_buf, \ - cgh); \ - accessor B(b_buf, \ - cgh); \ - accessor C(c_buf, \ - cgh); \ - accessor ERR(err_buf, cgh); \ - 
cgh.parallel_for(N, [=](id<1> index) { \ - float ABF16 = float{bfloat16{A[index]}}; \ - float BBF16 = float{bfloat16{B[index]}}; \ - float CBF16 = float{bfloat16{C[index]}}; \ - if (check(sycl::ext::oneapi::experimental::NAME(bfloat16{A[index]}, \ - bfloat16{B[index]}, \ - bfloat16{C[index]}), \ - sycl::NAME(ABF16, BBF16, CBF16))) { \ - ERR[0] = 1; \ - } \ - }); \ - }); \ - } \ - assert(err == 0); - -#define TEST_BUILTIN_3_ARR_IMPL(NAME, SZ) \ - { \ - buffer a_buf{range<2>{N / SZ, SZ}}; \ - buffer b_buf{range<2>{N / SZ, SZ}}; \ - buffer c_buf{range<2>{N / SZ, SZ}}; \ - buffer err_buf(&err, 1); \ - q.submit([&](handler &cgh) { \ - accessor A(a_buf, \ - cgh); \ - accessor B(b_buf, \ - cgh); \ - accessor C(c_buf, \ - cgh); \ - accessor ERR(err_buf, cgh); \ - cgh.parallel_for(N / SZ, [=](id<1> index) { \ - marray arg0, arg1, arg2; \ - for (int i = 0; i < SZ; i++) { \ - arg0[i] = A[index][i]; \ - arg1[i] = B[index][i]; \ - arg2[i] = C[index][i]; \ - } \ - marray res = \ - sycl::ext::oneapi::experimental::NAME(arg0, arg1, arg2); \ - for (int i = 0; i < SZ; i++) { \ - float ABF16 = float{bfloat16{A[index][i]}}; \ - float BBF16 = float{bfloat16{B[index][i]}}; \ - float CBF16 = float{bfloat16{C[index][i]}}; \ - if (check(res[i], sycl::NAME(ABF16, BBF16, CBF16))) { \ - ERR[0] = 1; \ - } \ - } \ - }); \ - }); \ - } \ - assert(err == 0); - -#define TEST_BUILTIN_3(NAME) \ - TEST_BUILTIN_3_SCAL_IMPL(NAME) \ - TEST_BUILTIN_3_ARR_IMPL(NAME, 1) \ - TEST_BUILTIN_3_ARR_IMPL(NAME, 2) \ - TEST_BUILTIN_3_ARR_IMPL(NAME, 3) \ - TEST_BUILTIN_3_ARR_IMPL(NAME, 4) \ - TEST_BUILTIN_3_ARR_IMPL(NAME, 5) - -#define TEST_BUILTIN_2_NAN(NAME) \ - { \ - buffer err_buf(&err, 1); \ - buffer nan_buf(&check_nan, 1); \ - q.submit([&](handler &cgh) { \ - accessor ERR(err_buf, cgh); \ - accessor checkNAN( \ - nan_buf, cgh); \ - cgh.single_task([=]() { \ - checkNAN[0] = sycl::ext::oneapi::experimental::NAME(bfloat16{NAN}, \ - bfloat16{NAN}); \ - if ((sycl::ext::oneapi::experimental::NAME(bfloat16{2}, \ - 
bfloat16{NAN}) != 2) || \ - (sycl::ext::oneapi::experimental::NAME(bfloat16{NAN}, \ - bfloat16{2}) != 2)) { \ - ERR[0] = 1; \ - } \ - }); \ - }); \ - } \ - assert(err == 0); \ - assert(std::isnan(check_nan)); - void test() { queue q; @@ -249,18 +37,136 @@ void test() { c[i] = (float)(3 * i); } - TEST_BUILTIN_1(fabs, bfloat16); - TEST_BUILTIN_2(fmin); - TEST_BUILTIN_2(fmax); - TEST_BUILTIN_3(fma); - - float check_nan = 0; - TEST_BUILTIN_2_NAN(fmin); - TEST_BUILTIN_2_NAN(fmax); + auto test = [&](auto ExpFunc, auto RefFunc, auto NumOperands) { + static_assert(NumOperands >= 1 && NumOperands <= 3); + { + buffer a_buf(&a[0], N); + buffer b_buf(&b[0], N); + buffer c_buf(&c[0], N); + buffer err_buf(&err, 1); + q.submit([&](handler &cgh) { + accessor A(a_buf, cgh); + accessor B(b_buf, cgh); + accessor C(c_buf, cgh); + accessor ERR(err_buf, cgh); + cgh.parallel_for(N, [=](id<1> index) { + auto ExpArg = [&](auto acc) { return bfloat16{acc[index]}; }; + auto RefArg = [&](auto acc) { return float{bfloat16{acc[index]}}; }; + + bool failure = false; + if constexpr (NumOperands == 1) { + failure |= check(ExpFunc(ExpArg(A)), RefFunc(RefArg(A))); + } else if constexpr (NumOperands == 2) { + failure |= check(ExpFunc(ExpArg(A), ExpArg(B)), + RefFunc(RefArg(A), RefArg(B))); + } else if constexpr (NumOperands == 3) { + failure |= check(ExpFunc(ExpArg(A), ExpArg(B), ExpArg(C)), + RefFunc(RefArg(A), RefArg(B), RefArg(C))); + } + + if (failure) + ERR[0] = 1; + }); + }); + } + assert(err == 0); + + sycl::detail::loop<5>([&](auto SZ_MINUS_ONE) { + constexpr int SZ = SZ_MINUS_ONE + 1; + { + buffer a_buf{&a[0], range<2>{N / SZ, SZ}}; + buffer b_buf{&b[0], range<2>{N / SZ, SZ}}; + buffer c_buf{&c[0], range<2>{N / SZ, SZ}}; + buffer err_buf(&err, 1); + q.submit([&](handler &cgh) { + accessor A(a_buf, cgh); + accessor B(b_buf, cgh); + accessor C(c_buf, cgh); + accessor ERR(err_buf, cgh); + cgh.parallel_for(N / SZ, [=](id<1> index) { + marray arg0, arg1, arg2; + for (int i = 0; i < SZ; i++) { + 
arg0[i] = A[index][i]; + arg1[i] = B[index][i]; + arg2[i] = C[index][i]; + } + auto res = [&]() { + if constexpr (NumOperands == 1) { + return ExpFunc(arg0); + } else if constexpr (NumOperands == 2) { + return ExpFunc(arg0, arg1); + } else if constexpr (NumOperands == 3) { + return ExpFunc(arg0, arg1, arg2); + } + }(); + + bool failure = false; + for (int i = 0; i < SZ; ++i) { + auto RefArg = [&](auto acc) { + return float{bfloat16{acc[index][i]}}; + }; + if constexpr (NumOperands == 1) { + failure |= check(res[i], RefFunc(RefArg(A))); + } else if constexpr (NumOperands == 2) { + failure |= check(res[i], RefFunc(RefArg(A), RefArg(B))); + } else if constexpr (NumOperands == 3) { + failure |= + check(res[i], RefFunc(RefArg(A), RefArg(B), RefArg(C))); + } + } + if (failure) + ERR[0] = 1; + }); + }); + } + assert(err == 0); + }); + }; + +#define TEST(NAME, NUM_OPERANDS) \ + test( \ + [](auto... args) { \ + return sycl::ext::oneapi::experimental::NAME(args...); \ + }, \ + [](auto... args) { return sycl::NAME(args...); }, \ + std::integral_constant{}) + + TEST(fabs, 1); + + TEST(fmin, 2); + TEST(fmax, 2); + TEST(fma, 3); + + auto test_nan = [&](auto ExpFunc) { + float check_nan = 0; + { + buffer err_buf(&err, 1); + buffer nan_buf(&check_nan, 1); + q.submit([&](handler &cgh) { + accessor ERR(err_buf, cgh); + accessor checkNAN(nan_buf, cgh); + cgh.single_task([=]() { + checkNAN[0] = ExpFunc(bfloat16{NAN}, bfloat16{NAN}); + if ((ExpFunc(bfloat16{2}, bfloat16{NAN}) != 2) || + (ExpFunc(bfloat16{NAN}, bfloat16{2}) != 2)) { + ERR[0] = 1; + } + }); + }); + } + assert(err == 0); + assert(std::isnan(check_nan)); + }; + test_nan([](auto... args) { + return sycl::ext::oneapi::experimental::fmin(args...); + }); + test_nan([](auto... 
args) { + return sycl::ext::oneapi::experimental::fmax(args...); + }); // Insert NAN value in a to test isnan a[0] = a[N - 1] = NAN; - TEST_BUILTIN_1(isnan, bool); + TEST(isnan, 1); // Orignal input 'a[0...N-1]' are in range [-0.5, 0.5), // need to update it for generic math testing. @@ -270,25 +176,26 @@ void test() { if ((i & 0x1) == 0x1) a[i] = -a[i]; } - TEST_BUILTIN_1(cos, sycl::ext::oneapi::bfloat16); - TEST_BUILTIN_1(sin, sycl::ext::oneapi::bfloat16); + TEST(cos, 1); + TEST(sin, 1); // ceil, floor, trunc, exp, exp2, exp10, rint testing - TEST_BUILTIN_1(ceil, sycl::ext::oneapi::bfloat16); - TEST_BUILTIN_1(floor, sycl::ext::oneapi::bfloat16); - TEST_BUILTIN_1(trunc, sycl::ext::oneapi::bfloat16); - TEST_BUILTIN_1(exp, sycl::ext::oneapi::bfloat16); - TEST_BUILTIN_1(exp10, sycl::ext::oneapi::bfloat16); - TEST_BUILTIN_1(exp2, sycl::ext::oneapi::bfloat16); - TEST_BUILTIN_1(rint, sycl::ext::oneapi::bfloat16); + TEST(ceil, 1); + TEST(floor, 1); + TEST(trunc, 1); + TEST(exp, 1); + TEST(exp10, 1); + TEST(exp2, 1); + TEST(rint, 1); // log, log2, log10, sqrt, rsqrt testing, the input // must be positive. for (int i = 0; i < N; ++i) a[i] = a[i] + 8.5; - TEST_BUILTIN_1(sqrt, sycl::ext::oneapi::bfloat16); - TEST_BUILTIN_1(rsqrt, sycl::ext::oneapi::bfloat16); - TEST_BUILTIN_1(log, sycl::ext::oneapi::bfloat16); - TEST_BUILTIN_1(log2, sycl::ext::oneapi::bfloat16); - TEST_BUILTIN_1(log10, sycl::ext::oneapi::bfloat16); + + TEST(sqrt, 1); + TEST(rsqrt, 1); + TEST(log, 1); + TEST(log2, 1); + TEST(log10, 1); } From 0f796bcde4d1815ff990388d0f0112379b9739b0 Mon Sep 17 00:00:00 2001 From: aelovikov-intel Date: Thu, 6 Jun 2024 09:33:52 -0700 Subject: [PATCH 28/55] [SYCL][E2E] Disable NonUniformGroups/ballot_group_algorithms.cpp on CUDA (#14058) Fails in Nightly testing on the self-hosted CUDA runner: https://github.com/intel/llvm/issues/12995. 
--- sycl/test-e2e/NonUniformGroups/ballot_group_algorithms.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sycl/test-e2e/NonUniformGroups/ballot_group_algorithms.cpp b/sycl/test-e2e/NonUniformGroups/ballot_group_algorithms.cpp index 8d1d17df461df..3f7ae71566a2e 100644 --- a/sycl/test-e2e/NonUniformGroups/ballot_group_algorithms.cpp +++ b/sycl/test-e2e/NonUniformGroups/ballot_group_algorithms.cpp @@ -5,6 +5,10 @@ // REQUIRES: sg-32 // REQUIRES: aspect-ext_oneapi_ballot_group +// Fails in Nightly testing on the self-hosted CUDA runner: +// https://github.com/intel/llvm/issues/12995. +// UNSUPPORTED: cuda + #include #include #include From 0cbc9a07a0201109ee8041d69e6ef9833af706eb Mon Sep 17 00:00:00 2001 From: David Date: Thu, 6 Jun 2024 13:08:44 -0600 Subject: [PATCH 29/55] [SYCL][E2E] Remove warnings in Basic e2e tests (#13994) --- sycl/test-e2e/Basic/built-ins/host_math.cpp | 9 +++++--- sycl/test-e2e/Basic/device_event.cpp | 5 ++-- sycl/test-e2e/Basic/event.cpp | 4 +--- sycl/test-e2e/Basic/host-task-dependency.cpp | 23 ++++++++----------- .../Basic/sycl_2020_images/common.hpp | 2 +- 5 files changed, 19 insertions(+), 24 deletions(-) diff --git a/sycl/test-e2e/Basic/built-ins/host_math.cpp b/sycl/test-e2e/Basic/built-ins/host_math.cpp index 6057c9a5f2734..739bf79240e0d 100644 --- a/sycl/test-e2e/Basic/built-ins/host_math.cpp +++ b/sycl/test-e2e/Basic/built-ins/host_math.cpp @@ -44,7 +44,8 @@ void testRemquo() { int quo = 0; float rem = sycl::remquo( 86.0f, 10.0f, - sycl::multi_ptr{&quo}); + sycl::address_space_cast(&quo)); assert(quo == 9); assert(rem == -4); } @@ -53,7 +54,8 @@ void testRemquo() { int quo = 0; float rem = sycl::remquo( -10.0, 3.0, - sycl::multi_ptr{&quo}); + sycl::address_space_cast(&quo)); assert(quo == -3); assert(rem == -1); } @@ -62,7 +64,8 @@ void testRemquo() { int quo = 0; float rem = sycl::remquo( 0.552879f, 0.219282f, - sycl::multi_ptr{&quo}); + sycl::address_space_cast(&quo)); assert(quo == 3); assert(rem == 
-0.10496702790260315f); } diff --git a/sycl/test-e2e/Basic/device_event.cpp b/sycl/test-e2e/Basic/device_event.cpp index b9c5354a74cdf..25631fac20843 100644 --- a/sycl/test-e2e/Basic/device_event.cpp +++ b/sycl/test-e2e/Basic/device_event.cpp @@ -57,8 +57,7 @@ int test_strideN(size_t stride) { nElemsToCopy++; try { - default_selector selector; - queue myQueue(selector, [](exception_list l) { + queue myQueue(default_selector_v, [](exception_list l) { for (auto ep : l) { try { std::rethrow_exception(ep); @@ -88,7 +87,7 @@ int test_strideN(size_t stride) { local_acc.get_multi_ptr(); decorated_global_ptr gptr = out_ptr.get_multi_ptr() + - grp.get_id()[0] * 16; + grp.get_group_id()[0] * 16; // Write the values 700, 701, ..., 763 to global memory. // Why? Well, a) to ensure that something is written into that memory diff --git a/sycl/test-e2e/Basic/event.cpp b/sycl/test-e2e/Basic/event.cpp index aec4dbbedd99d..d5cba1063f074 100644 --- a/sycl/test-e2e/Basic/event.cpp +++ b/sycl/test-e2e/Basic/event.cpp @@ -52,15 +52,13 @@ int main() { } { - struct exception : public sycl::exception {}; - std::cout << "wait_and_throw() check" << std::endl; bool failed = true; auto handler = [&](sycl::exception_list l) { failed = false; }; sycl::queue queue(handler); sycl::event e = queue.submit([&](sycl::handler &cgh) { - cgh.host_task([=]() { throw exception{}; }); + cgh.host_task([=]() { throw sycl::exception{sycl::errc::runtime}; }); }); e.wait_and_throw(); assert(failed == false); diff --git a/sycl/test-e2e/Basic/host-task-dependency.cpp b/sycl/test-e2e/Basic/host-task-dependency.cpp index 367f8def7fe8a..7f4f31320f1e7 100644 --- a/sycl/test-e2e/Basic/host-task-dependency.cpp +++ b/sycl/test-e2e/Basic/host-task-dependency.cpp @@ -32,10 +32,10 @@ struct Context { S::event HostTask_CopyBuf1ToBuf2(Context *Ctx) { S::event Event = Ctx->Queue.submit([&](S::handler &CGH) { - S::accessor - CopierSrcAcc(Ctx->Buf1, CGH); - S::accessor - CopierDstAcc(Ctx->Buf2, CGH); + S::host_accessor 
CopierSrcAcc(Ctx->Buf1, + CGH); + S::host_accessor CopierDstAcc(Ctx->Buf2, + CGH); auto CopierHostTask = [=] { for (size_t Idx = 0; Idx < CopierDstAcc.size(); ++Idx) @@ -59,24 +59,21 @@ S::event HostTask_CopyBuf1ToBuf2(Context *Ctx) { void Thread1Fn(Context *Ctx) { // 0. initialize resulting buffer with apriori wrong result { - S::accessor - Acc(Ctx->Buf1); + S::host_accessor Acc(Ctx->Buf1); for (size_t Idx = 0; Idx < Acc.size(); ++Idx) Acc[Idx] = -1; } { - S::accessor - Acc(Ctx->Buf2); + S::host_accessor Acc(Ctx->Buf2); for (size_t Idx = 0; Idx < Acc.size(); ++Idx) Acc[Idx] = -2; } { - S::accessor - Acc(Ctx->Buf3); + S::host_accessor Acc(Ctx->Buf3); for (size_t Idx = 0; Idx < Acc.size(); ++Idx) Acc[Idx] = -3; @@ -117,8 +114,7 @@ void Thread1Fn(Context *Ctx) { // 4. check data in buffer #3 { - S::accessor - Acc(Ctx->Buf3); + S::host_accessor Acc(Ctx->Buf3); bool Failure = false; @@ -163,8 +159,7 @@ void test() { // 3. check via host accessor that buf 2 contains valid data { - S::accessor - ResultAcc(Ctx.Buf2); + S::host_accessor ResultAcc(Ctx.Buf2); bool Failure = false; for (size_t Idx = 0; Idx < ResultAcc.size(); ++Idx) { diff --git a/sycl/test-e2e/Basic/sycl_2020_images/common.hpp b/sycl/test-e2e/Basic/sycl_2020_images/common.hpp index e03a1664e1f92..b5b97a66d2f2f 100644 --- a/sycl/test-e2e/Basic/sycl_2020_images/common.hpp +++ b/sycl/test-e2e/Basic/sycl_2020_images/common.hpp @@ -269,7 +269,7 @@ template bool AllTrue(const vec &Vec) { template bool ApproxEq(const vec &LHS, const vec &RHS, - T Precision = 0.1) { + T Precision = (T)0.1) { if constexpr (std::is_integral_v) return AllTrue(sycl::abs(LHS - RHS) <= Precision); else From b8693eb8b1c478c505660d1cf92ff17b67510453 Mon Sep 17 00:00:00 2001 From: Udit Agarwal Date: Thu, 6 Jun 2024 13:04:53 -0700 Subject: [PATCH 30/55] [Doc] Add Mar'24 Release Notes (#13879) --- sycl/ReleaseNotes.md | 150 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 150 insertions(+) diff --git a/sycl/ReleaseNotes.md 
b/sycl/ReleaseNotes.md index b80e6640b9d18..bb592c570db92 100644 --- a/sycl/ReleaseNotes.md +++ b/sycl/ReleaseNotes.md @@ -1,3 +1,153 @@ +# Mar'24 release notes +Release notes for commit range [f4e0d3177338](https://github.com/intel/llvm/commit/f4ed132f243ab43816ebe826669d978139964df2).. [d2817d6d317db1](https://github.com/intel/llvm/commit/d2817d6d317db1143bb227168e85c409d5ab7c82) + +## New Features +### SYCL Compiler + +- Added more available CPU for `-march` option in OpenCL AOT compiler. [7911773c] +- Added support for additional AMD GPU targets. [c1ce15944] +- Supported detecting out-of-bound errors on CPU device, static local memory, and device globals via AddressSanitizer. [f331ba2063] [a14cfdd7999] +- Provide a preprocessor macro to locate the CUPTI library when XPTI tracing is enabled during compiler build. [e15ebd08] [acf89a6c90] +- Made `-fsycl-dump-device-code` save PTX files generated for the CUDA backend. [16e06ff] +- When multiple floating point accuracy-related options are specified on the CLI, made the last option take precedence over others. [69e2b91] +- Added a new `-fsycl-dump-device-code` option to dump device code generated during SYCL compilation into a user-specified directory. [96ce6ea] +- Added support for `-fsycl-link` with ahead-of-time (AOT) compilation. [22fab5a] +- Added support for `-O3` on Windows when using `clang-cl`. [0af4ac7] + +### SYCL Library + +- Implemented [ext_oneapi_kernel_compiler](https://github.com/intel/llvm/blob/096676e8d4d87475860723ed8a4d8c256bcd98c2/sycl/doc/extensions/experimental/sycl_ext_oneapi_kernel_compiler.asciidoc) SYCL extension. [096676e8] [e5826540] [67086100] +- Implemented [ext_intel_fp_control](https://github.com/intel/llvm/blob/bf8ea96f/sycl/doc/extensions/experimental/sycl_ext_intel_fp_control.asciidoc) SYCL extension. 
[bf8ea96f] +- Implemented [ext_oneapi_kernel_compiler_opencl](https://github.com/intel/llvm/blob/6344ead19/sycl/doc/extensions/experimental/sycl_ext_oneapi_kernel_compiler_opencl.asciidoc) SYCL extension. [6344ead19] +- Enabled kernel fusion with heterogeneous ND ranges for HIP targets. [e44888873] +- Enabled [ext_oneapi_graph](https://github.com/intel/llvm/blob/5d7524543/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc) SYCL extension for OpenCL and HIP backend. [5d7524543] [897b27076] +- Supported graph partitioning for host task dependencies in [ext_oneapi_graph](https://github.com/intel/llvm/blob/d53f123a/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc) SYCL extension. [d53f123a] +- Added ESIMD APIs for stochastic rounding, property-based gather, masked-gather, and ReaD timestamp counting. [aa4e87801] [3eca2d473] [1261e0518] +- Added out-of-bounds `load`,`store`,`fill` and overloads accepting annotated pointers in [ext_oneapi_matrix](https://github.com/intel/llvm/blob/4c17a7f39/sycl/doc/extensions/experimental/sycl_ext_matrix/sycl_ext_oneapi_matrix.asciidoc) SYCL extension [4c17a7f39] [f3137e99] +- Added support for `queue::mem_advise` on HIP backends. [a669374b7] [ab86d0db] +- Supported `fill` and `memset` nodes in [ext_oneapi_graph](https://github.com/intel/llvm/blob/8ea022954/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc) SYCL extension. [8ea022954] +- Implemented [ext_oneapi_in_order_queue_events](https://github.com/intel/llvm/blob/19072756e/sycl/doc/extensions/experimental/sycl_ext_oneapi_in_order_queue_events.asciidoc) SYCL extension. [19072756e] +- Implemented [ext_oneapi_address_cast](https://github.com/intel/llvm/blob/123705190/sycl/doc/extensions/proposed/sycl_ext_oneapi_address_cast.asciidoc) SYCL extension. 
[123705190] +- Implemented [ext_oneapi_kernel_compiler_spirv](https://github.com/intel/llvm/blob/36e123d3e1/sycl/doc/extensions/experimental/sycl_ext_oneapi_kernel_compiler_spirv.asciidoc) SYCL extension. [36e123d3e1] +- Implemented [ext_oneapi_composite_device](https://github.com/intel/llvm/blob/2db1a4f6a5/sycl/doc/extensions/experimental/sycl_ext_oneapi_composite_device.asciidoc) SYCL extension. [2db1a4f6a5] +- Implemented joint matrix query from [ext_oneapi_matrix](https://github.com/intel/llvm/blob/00eebe1e4/sycl/doc/extensions/experimental/sycl_ext_matrix/sycl_ext_oneapi_matrix.asciidoc) SYCL extension on CUDA and HIP backends. [00eebe1e4] +- Added support for unsampled image arrays in [ext_oneapi_bindless_images](https://github.com/intel/llvm/blob/76ec3f0f7/sycl/doc/extensions/experimental/sycl_ext_oneapi_bindless_images.asciidoc) SYCL extension. [76ec3f0f7] +- Added `__imf_rcp64h` - equivalent to CUDA's `__nv_rcp64h` - and `sqrt` function with selectable rounding modes to Intel math libdevice. [ce70cb521] [6c1dde4243b5] +- Integrated OneAPI construction kit's vectorizer to Native CPU backend. [330ac57d6] +- Added ability to compare device architecture and support for PVC-VG to [ext_oneapi_device_architecture](https://github.com/intel/llvm/blob/68445467/sycl/doc/extensions/experimental/sycl_ext_oneapi_device_architecture.asciidoc) SYCL extension. [68445467] [ac0e142e12] +- Added `sycl::length` wrapper and a helper functions in SYCLCompat library for occupancy calculation in Intel GPUs. [b209b321] [2525570] +- Added support for SYCL barriers on Native CPU. [3c39d132a] +- Added support for `bfloat16` to `sycl::vec`. [bbbe8839] +- Added vectorized binary and unary operations through callable structs in the SYCLCompat library. [5505e03] +- Supported profiling information for default-constructed events when `ext_oneapi_barrier` is submitted to an empty in-order queue. 
[200694b] +- Implemented `ext_oneapi_private_alloca` by adding code generation capabilities for `private_alloca`. [f4e0d31] +- Added support for memory attributes on `non-const` device global variables on FPGA. [3bb5f40] [3fc6708] +- Added `set_default_queue` functionality to SYCLCompat library to enable changing the default queue of the current device. [e72b85c] +- Propagate annotations from `annotated_ptr` to the underlying raw pointers to enable additional optimization opportunities. [8f182cd] + +### Documentation +- Proposed [ext_intel_fp_control](https://github.com/intel/llvm/blob/bf8ea96f4/sycl/doc/extensions/experimental/sycl_ext_intel_fp_control.asciidoc) extension to allow specifying the rounding and denorm mode for floating-point operations in SYCL kernels. [bf8ea96f4] +- Proposed [ext_oneapi_raw_kernel_arg](https://github.com/intel/llvm/blob/4168793978/sycl/doc/extensions/proposed/sycl_ext_oneapi_raw_kernel_arg.asciidoc) SYCL extension to allow opaque types to be passed to SYCL kernels. [4168793978] +- Proposed [ext_oneapi_composite_device](https://github.com/intel/llvm/blob/9a1b9084/sycl/doc/extensions/experimental/sycl_ext_oneapi_composite_device.asciidoc) SYCL extension to allow card-level device access on PVC GPUs. [9a1b9084] +- Proposed [ext_oneapi_in_order_queue_events](https://github.com/intel/llvm/blob/19072756e/sycl/doc/extensions/experimental/sycl_ext_oneapi_in_order_queue_events.asciidoc) SYCL extension to allow getting event from the last submitted command and setting an external event as an implicit dependence on the next command submitted to the queue [19072756e] +- Proposed [ext_oneapi_profiling_tag](https://github.com/intel/llvm/blob/b4ade420/sycl/doc/extensions/proposed/sycl_ext_oneapi_profiling_tag.asciidoc) SYCL extension to time commands submitted to the queue. 
[b4ade420] +- Proposed [ext_oneapi_private_alloca](https://github.com/intel/llvm/blob/aaf7a58863/sycl/doc/extensions/experimental/sycl_ext_oneapi_private_alloca.asciidoc) SYCL extension to have specialization constant-length private memory allocations. [aaf7a58863] +- Added `joint_matrix_prefetch` and overloads of load and store with `annotated_ptr` in [ext_intel_matrix](https://github.com/intel/llvm/blob/04a222f7bb3022f3623ad40c9de70fd97579061a/sycl/doc/extensions/experimental/sycl_ext_matrix/sycl_ext_intel_matrix.asciidoc) and [ext_oneapi_matrix](https://github.com/intel/llvm/blob/04a222f7bb3022f3623ad40c9de70fd97579061a/sycl/doc/extensions/experimental/sycl_ext_matrix/sycl_ext_oneapi_matrix.asciidoc) SYCL extensions. [04a222f] + +### Other changes +- Created an additional version-agnostic copy of the SYCL import library during compiler build. [2d2e418c] + +## Improvements +### SYCL Compiler +- Enabled default selection of general register file (GRF) size on Linux for PVC GPUs. [8083f8a8] +- Disabled passing `-sycl-opt` for NativeCPU to enable the original full LLVM optimization pipeline. [3fe77b9] +- Enabled `-fsycl-esimd-force-stateless-mem` flag by default. [f316273] +- Enable `-emit-only-kernels-as-entry-point` by default on Intel backends for `sycl-post-link` to prevent device code bloating. [70fddbb] + + +### SYCL Library +- Improved error messages for invalid properties specified on non pointer types. [728b132a5] +- Adopted a unified and scalable way to pass alignment and cache flags to all ESIMD functions. [a2208484ab] [960d898c] [5ef8df837d] [a57a96c77] [19cd6144a] [646ab086e5] [0bf2e666c] +- Added default constructor to bindless sampler and image handler in [ext_oneapi_bindless_images](https://github.com/intel/llvm/blob/d65f3aa560/sycl/doc/extensions/experimental/sycl_ext_oneapi_bindless_images.asciidoc) SYCL extension. 
[d65f3aa560] [7bfdcfd4cabf] +- Added `SYCL_CACHE_IN_MEM` environment variable to disable in-memory caching of programs and facilitated automatic program cache cleaning when running out of memory. [9322d14ce] [6cf1ae081ac] +- Improved templated and convertible builtins after clarification in SYCL 2020 revision 8. [92861835] +- Allowed generic_space `multi_ptr` in math builtins. [eda8a587f1] +- Improved error message when writing beyond the bounds of `simd_view` object. [197c33a2b] +- Optimized `ext_oneapi_submit_barrier` from [ext_oneapi_enqueue_barrier](https://github.com/intel/llvm/blob/7e08c15dd/sycl/doc/extensions/supported/sycl_ext_oneapi_enqueue_barrier.asciidoc) into `NOP` for in-order queues with empty waitlist. [7e08c15dd] +- Supported prefetch, memory advise, and automatic management of dependencies for multiple command-buffer submissions in [ext_oneapi_graph](https://github.com/intel/llvm/blob/c6fbac59/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc) SYCL extension. [c6fbac59] [56f8d38c] +- Added support for profiling command buffers. [b04f894dbd06b] +- Implemented ESIMD APIs that accept compile-time properties. [655ab100] [5582ce4db] [d286f4ab1c] [961793913] [0cfe7e35] [656b8be7] +- Removed deprecated esimd_emulators from device filters and deprecated `SYCL_DEVICE_FILTER` in favor of `ONEAPI_DEVICE_SELECTOR`. [9d0888ca3] [8d0fa9875] +- Improved error message when trying to fuse kernels with incompatible ND-Ranges in [ext_codeplay_kernel_fusion](https://github.com/intel/llvm/blob/7d492f87ec97/sycl/doc/extensions/experimental/sycl_ext_codeplay_kernel_fusion.asciidoc). [7d492f87ec97] +- Made user functions always inline in the SYCL kernels to reduce overhead in SYCLCompat library. [e121c8811] +- Made runtime choose device image with inlined specialization constant when `-fsycl-add-default-spec-consts-image` option is used. [73d34739b] +- Made `nd_item` stateless to reduce initialization overhead. 
[7999e27b] +- Improved warning messages and added `-ignore-device-selector` flag to `sycl-ls` to ignore device selection environment variables. [6e3aa218] +- Improved error handling when calling `matrix_combinations` query on platforms unsupported by [ext_oneapi_device_architecture](https://github.com/intel/llvm/blob/c00305b73/sycl/doc/extensions/experimental/sycl_ext_oneapi_device_architecture.asciidoc) SYCL extension. [c00305b73] +- Made default `sycl::queue` context reusable on Windows. [491e6e4ea] +- Changed default cache hints for `prefetch` ESIMD API. [984c88c] +- Limited `bfloat16` ESIMD operations to data types convertible to `float`, as required by the SPEC. [f81b5a2] +- Removed the implicitly passed `-ze-take-global-address` IGC option as it is by default enabled on newer IGC versions. [7e414a9] +- Improved product security by ensuring that `pi_win_proxy_loader.dll` is loaded only from trusted directories. [85b7145] [218d9fe] [9c504a5] +- Aligned `sycl-ls` output with `ONEAPI_DEVICE_SELECTOR` environment variable syntax. [38ce764] [f720291] +- Improved error message when kernel compilation fails. [eba7b7e] + + +### Documentation +- Updated [ext_oneapi_kernel_compiler_opencl](https://github.com/intel/llvm/blob/6344ead19e/sycl/doc/extensions/experimental/sycl_ext_oneapi_kernel_compiler_opencl.asciidoc) SYCL extension to allow querying OpenCL version. [6344ead19e] +- Updated [ext_intel_data_flow_pipes_properties](https://github.com/intel/llvm/blob/2a0911892/sycl/doc/extensions/experimental/sycl_ext_intel_data_flow_pipes_properties.asciidoc) to include AXI streaming as a protocol choice on FPGAs. [2a0911892] +- Updated [KernelFusionJIT](https://github.com/intel/llvm/blob/b9854a12/sycl/doc/design/KernelFusionJIT.md) to include details on local/private memory allocation size, different promotion hints, etc. 
[b9854a12] +- Updated [ext_oneapi_in_order_queue_events](https://github.com/intel/llvm/blob/b0f584c675f9/sycl/doc/extensions/experimental/sycl_ext_oneapi_in_order_queue_events.asciidoc) to make external events wait when queue is waited on. [b0f584c675f9] +- Improved [ext_oneapi_address_cast](https://github.com/intel/llvm/blob/84a92e03/sycl/doc/extensions/proposed/sycl_ext_oneapi_address_cast.asciidoc) SYCL extension to allow casting raw pointers to multi_ptr. [84a92e03] + +## Bug Fixes +### SYCL Compiler +- Made the device binary generated by `-fsycl-link=image` linkable by adding more information into the binary. [219d4ef54] +- Fixed linking error when separately compiling and linking a SYCL program with SYCL libraries. [d6eecfa] +- Fixed `clangd` parsing crash with `-fsycl` flag when using `!nullptr` asserts. [f42bbcc] + +### SYCL Library +- Fixed computation of submit time based on host timestamps. [254756369c] +- Fixed SYCL CTS failures for Unified Runtime's OpenCL adapter. [4c0780e76] +- Fixed strict aliasing violations in `sycl::vec` routines. [a9d0e1b8] +- Fixed logical operations and integer conversions among sycl::vec types. [3d5e41fddf] [ff48612f] [7868596d] +- Fixed compound operators on `annotated_ptr` when the user-defined type only defines a compound operator. [c43a90f2] +- Fixed exponential slowdown in multiple calls to `queue::ext_oneapi_submit_barrier`. [079fc97b] +- Fixed input handling for `ONEAPI_DEVICE_SELECTOR` environment variable. [90b6aee46] +- Fixed in-order dependency filtering for isolated kernels. [8e7995df] +- Fixed double-free bug in kernel-program cache. [04ff5b81] +- Fixed resource leak in `SYCL_FALLBACK_ASSERT`. [b478d2fa] +- Fixed deadlock in in-order queue when submitting a host task and simultaneously accessing stream service events. [3031733] +- Made `sycl::vec` interface consistent with `sycl::marray` and `sycl::buffer` by defining `value_type` alias. [33e5b10] +- Fixed handling of enumeration specialization constants. 
[1f0dc36] +- Fixed `-O0 -fno-inline-functions` ESIMD failures by inlining some non-inline functions due to VC limitations. [89327e0] + +### Documentation +- Clarified [ext_oneapi_graph](https://github.com/intel/llvm/blob/2581123a1/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc) SYCL extension to make it illegal for graph nodes to depend on events from outside the graph. [2581123a1] +- Updated [ext_oneapi_non_uniform_groups](https://github.com/intel/llvm/blob/90a55a5/sycl/doc/extensions/experimental/sycl_ext_oneapi_non_uniform_groups.asciidoc) to invert group numbering for ballot groups. [90a55a5] +- Updated [ext_oneapi_free_function_kernels](https://github.com/intel/llvm/blob/a452e06a0ebcbabbfecbeb2ca05675265bddbf8d/sycl/doc/extensions/proposed/sycl_ext_oneapi_free_function_kernels.asciidoc) to remove `range kernels` from the extension. [a452e06] + +## Known Issues +- On Windows, the Unified Runtime's Level Zero leak check does not work correctly with +the default contexts on Windows. This is because on Windows the release +of the plugin DLLs races against the release of static global variables +(like the default context). +- Intel Graphics Compiler's Vector Compute backend does not support O0 code and often gets miscompiled, produces wrong answers and crashes. This issue directly affects ESIMD code at O0. As a temporary workaround, we have to optimize ESIMD code even in O0 mode. [00749b1e8](https://github.com/intel/llvm/commit/00749b1e8e3085acfdc63108f073a255842533e2) +- `multi_ptr` relational operators assume `nullptr` is the lowest possible pointer value, which might cause issues with the CUDA and AMDGPU backends. This will be fixed in the next release. ([13201](https://github.com/intel/llvm/pull/13201)) +- When `-fsycl-device-code-split=off` is set, having kernels with different `reqd_work_group_size` attributes could lead to runtime errors about local size mismatching the attribute value. 
The issue is also reproducible when there is a kernel with `reqd_work_group_size` attribute, but other kernels don't have that attribute set. This will be fixed in the next release. ([#13523](https://github.com/intel/llvm/pull/13523)) +- Having default-constructed `local_accessor` as unused kernel argument could lead to runtime errors during kernel arguments setting. The issue is reproducible when optimizations are explicitly disabled through `-O0`, or when optimizations failed to remove that unused kernel argument. This will be fixed in the next release. ([#13382](https://github.com/intel/llvm/pull/13382)) +- ONEAPI_DEVICE_SELECTOR incorrectly parses `!` from discard filters. This will be fixed in the next release. ([#13927](https://github.com/intel/llvm/pull/13927)) + +## API/ABI breaking changes +- Renamed and removed some APIs from [ext_oneapi_free_function_queries](https://github.com/intel/llvm/commit/287fd3733#diff-4ab48d4a7f26c356939d42c6aed9c67d4d59aafac11565f3bfe71d7e053a4db4) SYCL extension. [287fd3733] + +## Upcoming API/ABI breakages +The following changes are only in effect if the `-fpreview-breaking-changes` flag is set. +- Changed return type of `abs_diff` to be the same as that of the input. [2a3e1ab82] +- Added a preview of pre-C++11 ABI support for GCC on Linux. This feature allows users to set a GCC compiler flag -D_GLIBCXX_USE_CXX11_ABI=0 to use pre-C++11 ABI. Details about GCC C++11 ABI are available at https://gcc.gnu.org/onlinedocs/libstdc++/manual/using_dual_abi.html. In this release, this feature is enabled under the flag -fpreview-breaking-changes, and the support is incomplete and may not work for some cases. [459e122a] +- Removed some sub-group class APIs that do not appear in SYCL 2020 Spec. 
[2985395] + + # Nov'23 release notes Release notes for commit range f4e0d3177338..f4ed132f243a From d66106cc7d3c62b5f6ceea2bd7140adb0735019d Mon Sep 17 00:00:00 2001 From: fineg74 <61437305+fineg74@users.noreply.github.com> Date: Thu, 6 Jun 2024 13:38:59 -0700 Subject: [PATCH 31/55] [ESIMD] Allow full autodeduction of template parameters for atomic_update ACC API accepting simd_view (#14065) Co-authored-by: Nick Sarnie --- sycl/include/sycl/ext/intel/esimd/memory.hpp | 863 +++++++++++++++++- .../esimd/memory_properties_atomic_update.cpp | 136 ++- 2 files changed, 957 insertions(+), 42 deletions(-) diff --git a/sycl/include/sycl/ext/intel/esimd/memory.hpp b/sycl/include/sycl/ext/intel/esimd/memory.hpp index 904dcfd8d7a45..9619110600899 100644 --- a/sycl/include/sycl/ext/intel/esimd/memory.hpp +++ b/sycl/include/sycl/ext/intel/esimd/memory.hpp @@ -10121,14 +10121,15 @@ atomic_update(AccessorTy acc, simd byte_offset, simd src0, /// simd /// atomic_update(AccessorT acc, simd byte_offset, -/// simd src0, props = {}); // (acc-au1-2) -/// -/// A variation of \c atomic_update API with no mask operand. +/// SrcSimdViewT src0, simd_mask mask, props = {}); /// /// Atomically updates \c N memory locations represented by an accessor and /// a vector of offsets, and returns a vector of old values found at the /// memory locations before update. The update operation has 1 additional /// argument. +/// A variation of \c atomic_update API with \c src0 represented as +/// \c simd_view object and allows the use without +/// specifying \c T and \c N template parameters. /// /// @tparam Op The atomic operation - can be one of the following: /// \c atomic_op::add, \c atomic_op::sub, \c atomic_op::min, \c atomic_op::max, @@ -10136,14 +10137,66 @@ atomic_update(AccessorTy acc, simd byte_offset, simd src0, /// \c atomic_op::bit_xor, \c atomic_op::minsint, \c atomic_op::maxsint, /// \c atomic_op::fmax, \c atomic_op::fmin, \c atomic_op::fadd, \c /// atomic_op::fsub, \c atomic_op::store. 
-/// @tparam T The vector element type. -/// @tparam N The number of memory locations to update. /// @tparam AccessorTy type of the SYCL accessor. /// @param acc The SYCL accessor. /// @param byte_offset The vector of 32-bit or 64-bit offsets in bytes. 64-bit /// offsets are supported only when stateless memory accesses are enforced, i.e. /// accessor based accesses are automatically converted to stateless accesses. /// @param src0 The additional argument. +/// @param mask Operation mask, only locations with non-zero in the +/// corresponding mask element are updated. +/// @param props The parameter 'props' specifies the optional compile-time +/// properties list. Only L1/L2 properties are used. Other properties are +/// ignored. +/// @return A vector of the old values at the memory locations before the +/// update. +/// + +template < + atomic_op Op, typename SrcSimdViewT, typename Toffset, + typename T = SrcSimdViewT::value_type::element_type, int N, + typename AccessorTy, + typename PropertyListT = ext::oneapi::experimental::empty_properties_t> +__ESIMD_API std::enable_if_t< + __ESIMD_DNS::get_num_args() == 1 && + detail::is_simd_view_type_v && + __ESIMD_DNS::is_rw_device_accessor_v && + ext::oneapi::experimental::is_property_list_v, + simd> +atomic_update(AccessorTy acc, simd byte_offset, SrcSimdViewT src0, + simd_mask mask, PropertyListT props = {}) { + static_assert(N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(), + "Size of src0 parameter must correspond to the size of " + "byte_offset parameter."); + return atomic_update(acc, byte_offset, src0.read(), mask, props); +} + +/// simd +/// atomic_update(AccessorT acc, simd byte_offset, +/// simd src0, props = {}); // (acc-au1-2) +/// +/// A variation of \c atomic_update API with no mask operand. +/// +/// Atomically updates \c N memory locations represented by an accessor and +/// a vector of offsets, and returns a vector of old values found at the +/// memory locations before update. 
The update operation has 1 additional +/// argument. +/// +/// @tparam Op The atomic operation - can be one of the following: +/// \c atomic_op::add, \c atomic_op::sub, \c atomic_op::min, \c +/// atomic_op::max, \c atomic_op::xchg, \c atomic_op::bit_and, \c +/// atomic_op::bit_or, \c atomic_op::bit_xor, \c atomic_op::minsint, \c +/// atomic_op::maxsint, \c atomic_op::fmax, \c atomic_op::fmin, \c +/// atomic_op::fadd, \c atomic_op::fsub, \c atomic_op::store. +/// @tparam T The vector element type. +/// @tparam N The number of memory locations to update. +/// @tparam AccessorTy type of the SYCL accessor. +/// @param acc The SYCL accessor. +/// @param byte_offset The vector of 32-bit or 64-bit offsets in bytes. +/// 64-bit offsets are supported only when stateless memory accesses are +/// enforced, i.e. accessor based accesses are automatically converted to +/// stateless accesses. +/// @param src0 The additional argument. /// @param props The parameter 'props' specifies the optional compile-time /// properties list. Only L1/L2 properties are used. Other properties are /// ignored. @@ -10164,6 +10217,57 @@ atomic_update(AccessorTy acc, simd byte_offset, simd src0, return atomic_update(acc, byte_offset, src0, mask, props); } +/// simd +/// atomic_update(AccessorT acc, SrcSimdViewT byte_offset, +/// simd src0, props = {}); +/// +/// A variation of \c atomic_update API with no mask operand and \c src0 +/// represented as \c simd_view object that allows the use without specifying +/// \c T and \c N template parameters. +/// +/// Atomically updates \c N memory locations represented by an accessor and +/// a vector of offsets, and returns a vector of old values found at the +/// memory locations before update. The update operation has 1 additional +/// argument. 
+/// +/// @tparam Op The atomic operation - can be one of the following: +/// \c atomic_op::add, \c atomic_op::sub, \c atomic_op::min, \c +/// atomic_op::max, \c atomic_op::xchg, \c atomic_op::bit_and, \c +/// atomic_op::bit_or, \c atomic_op::bit_xor, \c atomic_op::minsint, \c +/// atomic_op::maxsint, \c atomic_op::fmax, \c atomic_op::fmin, \c +/// atomic_op::fadd, \c atomic_op::fsub, \c atomic_op::store. +/// @tparam AccessorTy type of the SYCL accessor. +/// @param acc The SYCL accessor. +/// @param byte_offset The vector of 32-bit or 64-bit offsets in bytes. +/// 64-bit offsets are supported only when stateless memory accesses are +/// enforced, i.e. accessor based accesses are automatically converted to +/// stateless accesses. +/// @param src0 The additional argument. +/// @param props The parameter 'props' specifies the optional compile-time +/// properties list. Only L1/L2 properties are used. Other properties are +/// ignored. +/// @return A vector of the old values at the memory locations before the +/// update. 
+/// +template < + atomic_op Op, typename SrcSimdViewT, typename Toffset, + typename T = SrcSimdViewT::value_type::element_type, int N, + typename AccessorTy, + typename PropertyListT = ext::oneapi::experimental::empty_properties_t> +__ESIMD_API std::enable_if_t< + __ESIMD_DNS::get_num_args() == 1 && + detail::is_simd_view_type_v && + __ESIMD_DNS::is_rw_device_accessor_v && + ext::oneapi::experimental::is_property_list_v, + simd> +atomic_update(AccessorTy acc, simd byte_offset, SrcSimdViewT src0, + PropertyListT props = {}) { + static_assert(N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(), + "Size of src0 parameter must correspond to the size of " + "byte_offset parameter."); + return atomic_update(acc, byte_offset, src0.read(), props); +} + /// simd /// atomic_update(AccessorT acc, /// OffsetSimdViewT byte_offset, @@ -10211,6 +10315,59 @@ atomic_update(AccessorTy acc, OffsetSimdViewT byte_offset, simd src0, return atomic_update(acc, byte_offset.read(), src0, mask, props); } +/// simd +/// atomic_update(AccessorT acc, +/// OffsetSimdViewT byte_offset, +/// SrcSimdViewT src0, +/// simd_mask mask, props = {}); +/// +/// A variation of \c atomic_update API with \c byte_offset and \c src0 +/// represented as \c simd_view object that allows the use without specifying +/// \c T and \c N template parameters. +/// +/// @tparam Op The atomic operation - can be one of the following: +/// \c atomic_op::add, \c atomic_op::sub, \c atomic_op::min, \c +/// atomic_op::max, \c atomic_op::xchg, \c atomic_op::bit_and, \c +/// atomic_op::bit_or, \c atomic_op::bit_xor, \c atomic_op::minsint, \c +/// atomic_op::maxsint, \c atomic_op::fmax, \c atomic_op::fmin, \c +/// atomic_op::fadd, \c atomic_op::fsub, \c atomic_op::store. +/// @tparam AccessorTy type of the SYCL accessor. +/// @param acc The SYCL accessor. +/// @param byte_offset The simd_view of 32-bit or 64-bit offsets in bytes. +/// 64-bit offsets are supported only when stateless memory accesses are +/// enforced, i.e. 
accessor based accesses are automatically converted to +/// stateless accesses. +/// @param src0 The additional argument. +/// @param mask Operation mask, only locations with non-zero in the +/// corresponding mask element are updated. +/// @param props The parameter 'props' specifies the optional compile-time +/// properties list. Only L1/L2 properties are used. Other properties are +/// ignored. +/// @return A vector of the old values at the memory locations before the +/// update. +/// +template < + atomic_op Op, typename SrcSimdViewT, typename OffsetSimdViewT, + typename T = SrcSimdViewT::value_type::element_type, + int N = SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(), + typename AccessorTy, + typename PropertyListT = ext::oneapi::experimental::empty_properties_t> +__ESIMD_API std::enable_if_t< + __ESIMD_DNS::get_num_args() == 1 && + __ESIMD_DNS::is_rw_device_accessor_v && + ext::oneapi::experimental::is_property_list_v && + detail::is_simd_view_type_v && + detail::is_simd_view_type_v, + simd> +atomic_update(AccessorTy acc, OffsetSimdViewT byte_offset, SrcSimdViewT src0, + simd_mask mask, PropertyListT props = {}) { + static_assert(N == OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY(), + "Size of src0 parameter must correspond to the size of " + "byte_offset parameter."); + return atomic_update(acc, byte_offset.read(), src0.read(), mask, + props); +} + /// simd /// atomic_update(AccessorT acc, /// OffsetSimdViewT byte_offset, @@ -10257,6 +10414,56 @@ atomic_update(AccessorTy acc, OffsetSimdViewT byte_offset, simd src0, return atomic_update(acc, byte_offset.read(), src0, mask, props); } +/// simd +/// atomic_update(AccessorT acc, +/// OffsetSimdViewT byte_offset, +/// SrcSimdViewT src0, +/// props = {}); +/// +/// A variation of \c atomic_update API with \c byte_offset and \c src0 +/// represented as \c simd_view object and no \c mask operand that allows the +/// use without specifying \c T and \c N template parameters. 
+/// +/// @tparam Op The atomic operation - can be one of the following: +/// \c atomic_op::add, \c atomic_op::sub, \c atomic_op::min, \c +/// atomic_op::max, \c atomic_op::xchg, \c atomic_op::bit_and, \c +/// atomic_op::bit_or, \c atomic_op::bit_xor, \c atomic_op::minsint, \c +/// atomic_op::maxsint, \c atomic_op::fmax, \c atomic_op::fmin, \c +/// atomic_op::fadd, \c atomic_op::fsub, \c atomic_op::store. +/// @tparam AccessorTy type of the SYCL accessor. +/// @param acc The SYCL accessor. +/// @param byte_offset The simd_view of 32-bit or 64-bit offsets in bytes. +/// 64-bit offsets are supported only when stateless memory accesses are +/// enforced, i.e. accessor based accesses are automatically converted to +/// stateless accesses. +/// @param src0 The additional argument. +/// @param props The parameter 'props' specifies the optional compile-time +/// properties list. Only L1/L2 properties are used. Other properties are +/// ignored. +/// @return A vector of the old values at the memory locations before the +/// update. 
+/// +template < + atomic_op Op, typename SrcSimdViewT, typename OffsetSimdViewT, + typename T = SrcSimdViewT::value_type::element_type, + int N = SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(), + typename AccessorTy, + typename PropertyListT = ext::oneapi::experimental::empty_properties_t> +__ESIMD_API std::enable_if_t< + __ESIMD_DNS::get_num_args() == 1 && + __ESIMD_DNS::is_rw_device_accessor_v && + ext::oneapi::experimental::is_property_list_v && + detail::is_simd_view_type_v && + detail::is_simd_view_type_v, + simd> +atomic_update(AccessorTy acc, OffsetSimdViewT byte_offset, SrcSimdViewT src0, + PropertyListT props = {}) { + static_assert(N == OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY(), + "Size of src0 parameter must correspond to the size of " + "byte_offset parameter."); + return atomic_update(acc, byte_offset.read(), src0.read(), props); +} + /// A variation of \c atomic_update API with \c offset represented as /// scalar object. /// @@ -10414,19 +10621,30 @@ atomic_update(AccessorTy acc, simd byte_offset, simd src0, /// simd /// atomic_update(AccessorTy acc, simd byte_offset, -/// simd src0, simd src1, -/// props = {}); // (acc-au2-2) +/// SrcSimdViewT src0, simd src1, +// simd_mask mask,props = {}); /// -/// A variation of \c atomic_update API with no mask operand. +/// Atomically updates \c N memory locations represented by an accessor and +/// a vector of offsets and returns a vector of old +/// values found at the memory locations before update. The update operation +/// has 2 additional arguments. +/// +/// A variation of \c atomic_update API with \c src0 represented as +/// \c simd_view object and allows the use without specifying \c T and \c N +/// template parameters. /// /// @tparam Op The atomic operation - can be one of the following: /// \c atomic_op::cmpxchg, \c atomic_op::fcmpxchg. -/// @tparam T The vector element type. -/// @tparam N The number of memory locations to update. 
+/// @tparam AccessorTy type of the SYCL accessor. /// @param acc The SYCL accessor. -/// @param byte_offset The vector of 32-bit or 64-bit offsets in bytes. +/// @param byte_offset The vector of 32-bit or 64-bit offsets in bytes. 64-bit +/// offsets are supported only when stateless memory accesses are enforced, +/// i.e. accessor based accesses are automatically converted to stateless +/// accesses. /// @param src0 The first additional argument (new value). /// @param src1 The second additional argument (expected value). +/// @param mask Operation mask, only locations with non-zero in the +/// corresponding mask element are updated. /// @param props The parameter 'props' specifies the optional compile-time /// properties list. Only L1/L2 properties are used. // Other properties are ignored. @@ -10434,33 +10652,46 @@ atomic_update(AccessorTy acc, simd byte_offset, simd src0, /// update. /// template < - atomic_op Op, typename T, int N, typename Toffset, typename AccessorTy, + atomic_op Op, typename SrcSimdViewT, typename T, int N, typename Toffset, + typename AccessorTy, typename PropertyListT = ext::oneapi::experimental::empty_properties_t> __ESIMD_API std::enable_if_t< - __ESIMD_DNS::get_num_args() == 2 && + __ESIMD_DNS::get_num_args() == 2 && std::is_integral_v && + detail::is_simd_view_type_v && __ESIMD_DNS::is_rw_device_accessor_v && ext::oneapi::experimental::is_property_list_v, simd> -atomic_update(AccessorTy acc, simd byte_offset, simd src0, - simd src1, PropertyListT props = {}) { - simd_mask mask = 1; - return atomic_update(acc, byte_offset, src0, src1, mask, props); +atomic_update(AccessorTy acc, simd byte_offset, SrcSimdViewT src0, + simd src1, simd_mask mask, PropertyListT props = {}) { + static_assert(N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(), + "Size of src0 parameter must correspond to the size of " + "byte_offset parameter."); + return atomic_update(acc, byte_offset, src0.read(), src1, mask, + props); } /// simd -/// 
atomic_update(AccessorTy acc, OffsetSimdViewT -/// byte_offset, simd src0, simd src1, -/// simd_mask mask, props = {}); // (acc-au2-3) +/// atomic_update(AccessorTy acc, simd byte_offset, +/// simd src0, SrcSimdViewT src1, +// simd_mask mask,props = {}); /// -/// A variation of \c atomic_update API with \c byte_offset represented as -/// a \c simd_view object. +/// Atomically updates \c N memory locations represented by an accessor and +/// a vector of offsets and returns a vector of old +/// values found at the memory locations before update. The update operation +/// has 2 additional arguments. +/// +/// A variation of \c atomic_update API with \c src1 represented as +/// \c simd_view object and allows the use without specifying \c T and \c N +/// template parameters. /// /// @tparam Op The atomic operation - can be one of the following: /// \c atomic_op::cmpxchg, \c atomic_op::fcmpxchg. -/// @tparam T The vector element type. -/// @tparam N The number of memory locations to update. +/// @tparam AccessorTy type of the SYCL accessor. /// @param acc The SYCL accessor. -/// @param byte_offset The vector of 32-bit or 64-bit offsets in bytes. +/// @param byte_offset The vector of 32-bit or 64-bit offsets in bytes. 64-bit +/// offsets are supported only when stateless memory accesses are enforced, +/// i.e. accessor based accesses are automatically converted to stateless +/// accesses. /// @param src0 The first additional argument (new value). /// @param src1 The second additional argument (expected value). /// @param mask Operation mask, only locations with non-zero in the @@ -10470,33 +10701,457 @@ atomic_update(AccessorTy acc, simd byte_offset, simd src0, // Other properties are ignored. /// @return A vector of the old values at the memory locations before the /// update. 
+/// template < - atomic_op Op, typename T, int N, typename OffsetSimdViewT, + atomic_op Op, typename SrcSimdViewT, typename T, int N, typename Toffset, typename AccessorTy, typename PropertyListT = ext::oneapi::experimental::empty_properties_t> __ESIMD_API std::enable_if_t< - __ESIMD_DNS::get_num_args() == 2 && + __ESIMD_DNS::get_num_args() == 2 && std::is_integral_v && + detail::is_simd_view_type_v && __ESIMD_DNS::is_rw_device_accessor_v && - ext::oneapi::experimental::is_property_list_v && - detail::is_simd_view_type_v, + ext::oneapi::experimental::is_property_list_v, simd> -atomic_update(AccessorTy acc, OffsetSimdViewT byte_offset, simd src0, - simd src1, simd_mask mask, PropertyListT props = {}) { - return atomic_update(acc, byte_offset.read(), src0, src1, mask, +atomic_update(AccessorTy acc, simd byte_offset, simd src0, + SrcSimdViewT src1, simd_mask mask, PropertyListT props = {}) { + static_assert(N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(), + "Size of src1 parameter must correspond to the size of " + "byte_offset parameter."); + return atomic_update(acc, byte_offset, src0, src1.read(), mask, props); } /// simd -/// atomic_update(AccessorTy acc, -/// OffsetSimdViewT, byte_offset, -/// simd src0, simd src1, props = {}); // (acc-au2-4) +/// atomic_update(AccessorTy acc, simd byte_offset, +/// SrcSimdViewT src0, SrcSimdViewT src1, +// simd_mask mask,props = {}); /// -/// A variation of \c atomic_update API with \c byte_offset represented as -/// a \c simd_view object and no mask operand. +/// Atomically updates \c N memory locations represented by an accessor and +/// a vector of offsets and returns a vector of old +/// values found at the memory locations before update. The update operation +/// has 2 additional arguments. +/// +/// A variation of \c atomic_update API with \c src0 and \c src1 represented as +/// \c simd_view object and allows the use without specifying \c T and \c N +/// template parameters. 
/// /// @tparam Op The atomic operation - can be one of the following: /// \c atomic_op::cmpxchg, \c atomic_op::fcmpxchg. -/// @tparam T The vector element type. +/// @tparam AccessorTy type of the SYCL accessor. +/// @param acc The SYCL accessor. +/// @param byte_offset The vector of 32-bit or 64-bit offsets in bytes. 64-bit +/// offsets are supported only when stateless memory accesses are enforced, +/// i.e. accessor based accesses are automatically converted to stateless +/// accesses. +/// @param src0 The first additional argument (new value). +/// @param src1 The second additional argument (expected value). +/// @param mask Operation mask, only locations with non-zero in the +/// corresponding mask element are updated. +/// @param props The parameter 'props' specifies the optional compile-time +/// properties list. Only L1/L2 properties are used. +// Other properties are ignored. +/// @return A vector of the old values at the memory locations before the +/// update. +/// +template < + atomic_op Op, typename SrcSimdViewT, + typename T = SrcSimdViewT::value_type::element_type, int N, + typename Toffset, typename AccessorTy, + typename PropertyListT = ext::oneapi::experimental::empty_properties_t> +__ESIMD_API std::enable_if_t< + __ESIMD_DNS::get_num_args() == 2 && std::is_integral_v && + detail::is_simd_view_type_v && + __ESIMD_DNS::is_rw_device_accessor_v && + ext::oneapi::experimental::is_property_list_v, + simd> +atomic_update(AccessorTy acc, simd byte_offset, SrcSimdViewT src0, + SrcSimdViewT src1, simd_mask mask, PropertyListT props = {}) { + static_assert( + N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(), + "Size of src0 and src1 parameters must correspond to the size of " + "byte_offset parameter."); + return atomic_update(acc, byte_offset, src0.read(), src1.read(), + mask, props); +} + +/// simd +/// atomic_update(AccessorTy acc, simd byte_offset, +/// simd src0, simd src1, +/// props = {}); // (acc-au2-2) +/// +/// A variation of \c 
atomic_update API with no mask operand. +/// +/// @tparam Op The atomic operation - can be one of the following: +/// \c atomic_op::cmpxchg, \c atomic_op::fcmpxchg. +/// @tparam T The vector element type. +/// @tparam N The number of memory locations to update. +/// @param acc The SYCL accessor. +/// @param byte_offset The vector of 32-bit or 64-bit offsets in bytes. +/// @param src0 The first additional argument (new value). +/// @param src1 The second additional argument (expected value). +/// @param props The parameter 'props' specifies the optional compile-time +/// properties list. Only L1/L2 properties are used. +// Other properties are ignored. +/// @return A vector of the old values at the memory locations before the +/// update. +/// +template < + atomic_op Op, typename T, int N, typename Toffset, typename AccessorTy, + typename PropertyListT = ext::oneapi::experimental::empty_properties_t> +__ESIMD_API std::enable_if_t< + __ESIMD_DNS::get_num_args() == 2 && + __ESIMD_DNS::is_rw_device_accessor_v && + ext::oneapi::experimental::is_property_list_v, + simd> +atomic_update(AccessorTy acc, simd byte_offset, simd src0, + simd src1, PropertyListT props = {}) { + simd_mask mask = 1; + return atomic_update(acc, byte_offset, src0, src1, mask, props); +} + +/// simd +/// atomic_update(AccessorTy acc, simd byte_offset, +/// SrcSimdViewT src0, simd src1, +// props = {}); +/// +/// Atomically updates \c N memory locations represented by an accessor and +/// a vector of offsets and returns a vector of old +/// values found at the memory locations before update. The update operation +/// has 2 additional arguments. +/// +/// A variation of \c atomic_update API with no \c mask operand and with \c src0 +/// represented as \c simd_view object and allows the use without specifying \c +/// T and \c N template parameters. +/// +/// @tparam Op The atomic operation - can be one of the following: +/// \c atomic_op::cmpxchg, \c atomic_op::fcmpxchg. 
+/// @tparam AccessorTy type of the SYCL accessor. +/// @param acc The SYCL accessor. +/// @param byte_offset The vector of 32-bit or 64-bit offsets in bytes. 64-bit +/// offsets are supported only when stateless memory accesses are enforced, +/// i.e. accessor based accesses are automatically converted to stateless +/// accesses. +/// @param src0 The first additional argument (new value). +/// @param src1 The second additional argument (expected value). +/// @param mask Operation mask, only locations with non-zero in the +/// corresponding mask element are updated. +/// @param props The parameter 'props' specifies the optional compile-time +/// properties list. Only L1/L2 properties are used. +// Other properties are ignored. +/// @return A vector of the old values at the memory locations before the +/// update. +/// +template < + atomic_op Op, typename SrcSimdViewT, typename T, int N, typename Toffset, + typename AccessorTy, + typename PropertyListT = ext::oneapi::experimental::empty_properties_t> +__ESIMD_API std::enable_if_t< + __ESIMD_DNS::get_num_args() == 2 && std::is_integral_v && + detail::is_simd_view_type_v && + __ESIMD_DNS::is_rw_device_accessor_v && + ext::oneapi::experimental::is_property_list_v, + simd> +atomic_update(AccessorTy acc, simd byte_offset, SrcSimdViewT src0, + simd src1, PropertyListT props = {}) { + static_assert(N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(), + "Size of src0 parameter must correspond to the size of " + "byte_offset parameter."); + return atomic_update(acc, byte_offset, src0.read(), src1, props); +} + +/// simd +/// atomic_update(AccessorTy acc, simd byte_offset, +/// simd src0, SrcSimdViewT src1, +// props = {}); +/// +/// Atomically updates \c N memory locations represented by an accessor and +/// a vector of offsets and returns a vector of old +/// values found at the memory locations before update. The update operation +/// has 2 additional arguments. 
+/// +/// A variation of \c atomic_update API with no \c mask operand with \c src1 +/// represented as \c simd_view object and allows the use without specifying \c +/// T and \c N template parameters. +/// +/// @tparam Op The atomic operation - can be one of the following: +/// \c atomic_op::cmpxchg, \c atomic_op::fcmpxchg. +/// @tparam AccessorTy type of the SYCL accessor. +/// @param acc The SYCL accessor. +/// @param byte_offset The vector of 32-bit or 64-bit offsets in bytes. 64-bit +/// offsets are supported only when stateless memory accesses are enforced, +/// i.e. accessor based accesses are automatically converted to stateless +/// accesses. +/// @param src0 The first additional argument (new value). +/// @param src1 The second additional argument (expected value). +/// @param mask Operation mask, only locations with non-zero in the +/// corresponding mask element are updated. +/// @param props The parameter 'props' specifies the optional compile-time +/// properties list. Only L1/L2 properties are used. +// Other properties are ignored. +/// @return A vector of the old values at the memory locations before the +/// update. 
+/// +template < + atomic_op Op, typename SrcSimdViewT, typename T, int N, typename Toffset, + typename AccessorTy, + typename PropertyListT = ext::oneapi::experimental::empty_properties_t> +__ESIMD_API std::enable_if_t< + __ESIMD_DNS::get_num_args() == 2 && std::is_integral_v && + detail::is_simd_view_type_v && + __ESIMD_DNS::is_rw_device_accessor_v && + ext::oneapi::experimental::is_property_list_v, + simd> +atomic_update(AccessorTy acc, simd byte_offset, simd src0, + SrcSimdViewT src1, PropertyListT props = {}) { + static_assert(N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(), + "Size of src1 parameter must correspond to the size of " + "byte_offset parameter."); + return atomic_update(acc, byte_offset, src0, src1.read(), props); +} + +/// simd +/// atomic_update(AccessorTy acc, simd byte_offset, +/// SrcSimdViewT src0, SrcSimdViewT src1, +// props = {}); +/// +/// Atomically updates \c N memory locations represented by an accessor and +/// a vector of offsets and returns a vector of old +/// values found at the memory locations before update. The update operation +/// has 2 additional arguments. +/// +/// A variation of \c atomic_update API with no \c mask operand with \c src0 and +/// \c src1 represented as \c simd_view object and allows the use without +/// specifying \c T and \c N template parameters. +/// +/// @tparam Op The atomic operation - can be one of the following: +/// \c atomic_op::cmpxchg, \c atomic_op::fcmpxchg. +/// @tparam AccessorTy type of the SYCL accessor. +/// @param acc The SYCL accessor. +/// @param byte_offset The vector of 32-bit or 64-bit offsets in bytes. 64-bit +/// offsets are supported only when stateless memory accesses are enforced, +/// i.e. accessor based accesses are automatically converted to stateless +/// accesses. +/// @param src0 The first additional argument (new value). +/// @param src1 The second additional argument (expected value). 
+/// @param mask Operation mask, only locations with non-zero in the +/// corresponding mask element are updated. +/// @param props The parameter 'props' specifies the optional compile-time +/// properties list. Only L1/L2 properties are used. +// Other properties are ignored. +/// @return A vector of the old values at the memory locations before the +/// update. +/// +template < + atomic_op Op, typename SrcSimdViewT, + typename T = SrcSimdViewT::value_type::element_type, int N, + typename Toffset, typename AccessorTy, + typename PropertyListT = ext::oneapi::experimental::empty_properties_t> +__ESIMD_API std::enable_if_t< + __ESIMD_DNS::get_num_args() == 2 && std::is_integral_v && + detail::is_simd_view_type_v && + __ESIMD_DNS::is_rw_device_accessor_v && + ext::oneapi::experimental::is_property_list_v, + simd> +atomic_update(AccessorTy acc, simd byte_offset, SrcSimdViewT src0, + SrcSimdViewT src1, PropertyListT props = {}) { + static_assert( + N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(), + "Size of src0 and src1 parameters must correspond to the size of " + "byte_offset parameter."); + return atomic_update(acc, byte_offset, src0.read(), src1.read(), + props); +} + +/// simd +/// atomic_update(AccessorTy acc, OffsetSimdViewT +/// byte_offset, simd src0, simd src1, +/// simd_mask mask, props = {}); // (acc-au2-3) +/// +/// A variation of \c atomic_update API with \c byte_offset represented as +/// a \c simd_view object. +/// +/// @tparam Op The atomic operation - can be one of the following: +/// \c atomic_op::cmpxchg, \c atomic_op::fcmpxchg. +/// @tparam T The vector element type. +/// @tparam N The number of memory locations to update. +/// @param acc The SYCL accessor. +/// @param byte_offset The vector of 32-bit or 64-bit offsets in bytes. +/// @param src0 The first additional argument (new value). +/// @param src1 The second additional argument (expected value). 
+/// @param mask Operation mask, only locations with non-zero in the +/// corresponding mask element are updated. +/// @param props The parameter 'props' specifies the optional compile-time +/// properties list. Only L1/L2 properties are used. +// Other properties are ignored. +/// @return A vector of the old values at the memory locations before the +/// update. +template < + atomic_op Op, typename T, int N, typename OffsetSimdViewT, + typename AccessorTy, + typename PropertyListT = ext::oneapi::experimental::empty_properties_t> +__ESIMD_API std::enable_if_t< + __ESIMD_DNS::get_num_args() == 2 && + __ESIMD_DNS::is_rw_device_accessor_v && + ext::oneapi::experimental::is_property_list_v && + detail::is_simd_view_type_v, + simd> +atomic_update(AccessorTy acc, OffsetSimdViewT byte_offset, simd src0, + simd src1, simd_mask mask, PropertyListT props = {}) { + return atomic_update(acc, byte_offset.read(), src0, src1, mask, + props); +} + +/// simd +/// atomic_update(AccessorTy acc, OffsetSimdViewT +/// byte_offset, SrcSimdViewT src0, simd src1, +/// simd_mask mask, props = {}); +/// +/// A variation of \c atomic_update API with \c byte_offset and \c src0 +/// represented as \c simd_view object and allows the use without specifying \c +/// T and \c N template parameters. +/// +/// @tparam Op The atomic operation - can be one of the following: +/// \c atomic_op::cmpxchg, \c atomic_op::fcmpxchg. +/// @tparam T The vector element type. +/// @tparam N The number of memory locations to update. +/// @param acc The SYCL accessor. +/// @param byte_offset The vector of 32-bit or 64-bit offsets in bytes. +/// @param src0 The first additional argument (new value). +/// @param src1 The second additional argument (expected value). +/// @param mask Operation mask, only locations with non-zero in the +/// corresponding mask element are updated. +/// @param props The parameter 'props' specifies the optional compile-time +/// properties list. Only L1/L2 properties are used. 
+// Other properties are ignored. +/// @return A vector of the old values at the memory locations before the +/// update. +template < + atomic_op Op, typename SrcSimdViewT, typename OffsetSimdViewT, typename T, + int N, typename AccessorTy, + typename PropertyListT = ext::oneapi::experimental::empty_properties_t> +__ESIMD_API std::enable_if_t< + __ESIMD_DNS::get_num_args() == 2 && + __ESIMD_DNS::is_rw_device_accessor_v && + ext::oneapi::experimental::is_property_list_v && + detail::is_simd_view_type_v && + detail::is_simd_view_type_v, + simd> +atomic_update(AccessorTy acc, OffsetSimdViewT byte_offset, SrcSimdViewT src0, + simd src1, simd_mask mask, PropertyListT props = {}) { + static_assert( + N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY() && + N == OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY(), + "Size of src0 and byte_offset parameters must correspond to the size of " + "src1 parameter."); + return atomic_update(acc, byte_offset.read(), src0.read(), src1, + mask, props); +} + +/// simd +/// atomic_update(AccessorTy acc, OffsetSimdViewT +/// byte_offset, simd src0, SrcSimdViewT src1, +/// simd_mask mask, props = {}); +/// +/// A variation of \c atomic_update API with \c byte_offset and \c src1 +/// represented as \c simd_view object and allows the use without specifying \c +/// T and \c N template parameters. +/// +/// @tparam Op The atomic operation - can be one of the following: +/// \c atomic_op::cmpxchg, \c atomic_op::fcmpxchg. +/// @tparam T The vector element type. +/// @tparam N The number of memory locations to update. +/// @param acc The SYCL accessor. +/// @param byte_offset The vector of 32-bit or 64-bit offsets in bytes. +/// @param src0 The first additional argument (new value). +/// @param src1 The second additional argument (expected value). +/// @param mask Operation mask, only locations with non-zero in the +/// corresponding mask element are updated. 
+/// @param props The parameter 'props' specifies the optional compile-time +/// properties list. Only L1/L2 properties are used. +// Other properties are ignored. +/// @return A vector of the old values at the memory locations before the +/// update. +template < + atomic_op Op, typename SrcSimdViewT, typename OffsetSimdViewT, typename T, + int N, typename AccessorTy, + typename PropertyListT = ext::oneapi::experimental::empty_properties_t> +__ESIMD_API std::enable_if_t< + __ESIMD_DNS::get_num_args() == 2 && + __ESIMD_DNS::is_rw_device_accessor_v && + ext::oneapi::experimental::is_property_list_v && + detail::is_simd_view_type_v && + detail::is_simd_view_type_v, + simd> +atomic_update(AccessorTy acc, OffsetSimdViewT byte_offset, simd src0, + SrcSimdViewT src1, simd_mask mask, PropertyListT props = {}) { + static_assert( + N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY() && + N == OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY(), + "Size of src1 and byte_offset parameters must correspond to the size of " + "src0 parameter."); + return atomic_update(acc, byte_offset.read(), src0, src1.read(), + mask, props); +} + +/// simd +/// atomic_update(AccessorTy acc, OffsetSimdViewT +/// byte_offset, SrcSimdViewT src0, SrcSimdViewT src1, +/// simd_mask mask, props = {}); +/// +/// A variation of \c atomic_update API with \c byte_offset, \c src0 and +/// \c src1 represented as \c simd_view object and allows the use without +/// specifying \c T and \c N template parameters. +/// +/// @tparam Op The atomic operation - can be one of the following: +/// \c atomic_op::cmpxchg, \c atomic_op::fcmpxchg. +/// @tparam T The vector element type. +/// @tparam N The number of memory locations to update. +/// @param acc The SYCL accessor. +/// @param byte_offset The vector of 32-bit or 64-bit offsets in bytes. +/// @param src0 The first additional argument (new value). +/// @param src1 The second additional argument (expected value). 
+/// @param mask Operation mask, only locations with non-zero in the +/// corresponding mask element are updated. +/// @param props The parameter 'props' specifies the optional compile-time +/// properties list. Only L1/L2 properties are used. +// Other properties are ignored. +/// @return A vector of the old values at the memory locations before the +/// update. +template < + atomic_op Op, typename SrcSimdViewT, typename OffsetSimdViewT, + typename T = SrcSimdViewT::value_type::element_type, int N, + typename AccessorTy, + typename PropertyListT = ext::oneapi::experimental::empty_properties_t> +__ESIMD_API std::enable_if_t< + __ESIMD_DNS::get_num_args() == 2 && + __ESIMD_DNS::is_rw_device_accessor_v && + ext::oneapi::experimental::is_property_list_v && + detail::is_simd_view_type_v && + detail::is_simd_view_type_v, + simd> +atomic_update(AccessorTy acc, OffsetSimdViewT byte_offset, SrcSimdViewT src0, + SrcSimdViewT src1, simd_mask mask, PropertyListT props = {}) { + static_assert(N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY() && + N == OffsetSimdViewT::getSizeX() * + OffsetSimdViewT::getSizeY(), + "Size of src0, src1 and byte_offset parameters must correspond " + "to the size of " + "mask parameter."); + return atomic_update(acc, byte_offset.read(), src0.read(), + src1.read(), mask, props); +} + +/// simd +/// atomic_update(AccessorTy acc, +/// OffsetSimdViewT, byte_offset, +/// simd src0, simd src1, props = {}); // (acc-au2-4) +/// +/// A variation of \c atomic_update API with \c byte_offset represented as +/// a \c simd_view object and no mask operand. +/// +/// @tparam Op The atomic operation - can be one of the following: +/// \c atomic_op::cmpxchg, \c atomic_op::fcmpxchg. +/// @tparam T The vector element type. /// @tparam N The number of memory locations to update. /// @param acc The SYCL accessor. /// @param byte_offset The vector of 32-bit or 64-bit offsets in bytes. 
@@ -10524,6 +11179,138 @@ atomic_update(AccessorTy acc, OffsetSimdViewT byte_offset, simd src0, props); } +/// simd +/// atomic_update(AccessorTy acc, OffsetSimdViewT +/// byte_offset, SrcSimdViewT src0, simd src1, +/// props = {}); +/// +/// A variation of \c atomic_update API with with no mask operand and \c +/// byte_offset and \c src0 represented as \c simd_view object and allows the +/// use without specifying \c T and \c N template parameters. +/// +/// @tparam Op The atomic operation - can be one of the following: +/// \c atomic_op::cmpxchg, \c atomic_op::fcmpxchg. +/// @tparam T The vector element type. +/// @tparam N The number of memory locations to update. +/// @param acc The SYCL accessor. +/// @param byte_offset The vector of 32-bit or 64-bit offsets in bytes. +/// @param src0 The first additional argument (new value). +/// @param src1 The second additional argument (expected value). +/// @param props The parameter 'props' specifies the optional compile-time +/// properties list. Only L1/L2 properties are used. +// Other properties are ignored. +/// @return A vector of the old values at the memory locations before the +/// update. 
+template < + atomic_op Op, typename SrcSimdViewT, typename OffsetSimdViewT, typename T, + int N, typename AccessorTy, + typename PropertyListT = ext::oneapi::experimental::empty_properties_t> +__ESIMD_API std::enable_if_t< + __ESIMD_DNS::get_num_args() == 2 && + __ESIMD_DNS::is_rw_device_accessor_v && + ext::oneapi::experimental::is_property_list_v && + detail::is_simd_view_type_v && + detail::is_simd_view_type_v, + simd> +atomic_update(AccessorTy acc, OffsetSimdViewT byte_offset, SrcSimdViewT src0, + simd src1, PropertyListT props = {}) { + static_assert( + N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY() && + N == OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY(), + "Size of src0 and byte_offset parameters must correspond to the size of " + "src1 parameter."); + return atomic_update(acc, byte_offset.read(), src0.read(), src1, + props); +} + +/// simd +/// atomic_update(AccessorTy acc, OffsetSimdViewT +/// byte_offset, simd src0, SrcSimdViewT src1, +/// props = {}); +/// +/// A variation of \c atomic_update API with no mask operand and \c byte_offset +/// and \c src1 represented as \c simd_view object and allows the use without +/// specifying \c T and \c N template parameters. +/// +/// @tparam Op The atomic operation - can be one of the following: +/// \c atomic_op::cmpxchg, \c atomic_op::fcmpxchg. +/// @tparam T The vector element type. +/// @tparam N The number of memory locations to update. +/// @param acc The SYCL accessor. +/// @param byte_offset The vector of 32-bit or 64-bit offsets in bytes. +/// @param src0 The first additional argument (new value). +/// @param src1 The second additional argument (expected value). +/// @param props The parameter 'props' specifies the optional compile-time +/// properties list. Only L1/L2 properties are used. +// Other properties are ignored. +/// @return A vector of the old values at the memory locations before the +/// update. 
+template < + atomic_op Op, typename SrcSimdViewT, typename OffsetSimdViewT, typename T, + int N, typename AccessorTy, + typename PropertyListT = ext::oneapi::experimental::empty_properties_t> +__ESIMD_API std::enable_if_t< + __ESIMD_DNS::get_num_args() == 2 && + __ESIMD_DNS::is_rw_device_accessor_v && + ext::oneapi::experimental::is_property_list_v && + detail::is_simd_view_type_v && + detail::is_simd_view_type_v, + simd> +atomic_update(AccessorTy acc, OffsetSimdViewT byte_offset, simd src0, + SrcSimdViewT src1, PropertyListT props = {}) { + static_assert( + N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY() && + N == OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY(), + "Size of src1 and byte_offset parameters must correspond to the size of " + "src0 parameter."); + return atomic_update(acc, byte_offset.read(), src0, src1.read(), + props); +} + +/// simd +/// atomic_update(AccessorTy acc, OffsetSimdViewT +/// byte_offset, SrcSimdViewT src0, SrcSimdViewT src1, +/// props = {}); +/// +/// A variation of \c atomic_update API with no mask operand and \c byte_offset, +/// \c src0 and \c src1 represented as \c simd_view object and allows the use +/// without specifying \c T and \c N template parameters. +/// +/// @tparam Op The atomic operation - can be one of the following: +/// \c atomic_op::cmpxchg, \c atomic_op::fcmpxchg. +/// @tparam T The vector element type. +/// @tparam N The number of memory locations to update. +/// @param acc The SYCL accessor. +/// @param byte_offset The vector of 32-bit or 64-bit offsets in bytes. +/// @param src0 The first additional argument (new value). +/// @param src1 The second additional argument (expected value). +/// @param props The parameter 'props' specifies the optional compile-time +/// properties list. Only L1/L2 properties are used. +// Other properties are ignored. +/// @return A vector of the old values at the memory locations before the +/// update. 
+template < + atomic_op Op, typename SrcSimdViewT, typename OffsetSimdViewT, + typename T = SrcSimdViewT::value_type::element_type, + int N = SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(), + typename AccessorTy, + typename PropertyListT = ext::oneapi::experimental::empty_properties_t> +__ESIMD_API std::enable_if_t< + __ESIMD_DNS::get_num_args() == 2 && + __ESIMD_DNS::is_rw_device_accessor_v && + ext::oneapi::experimental::is_property_list_v && + detail::is_simd_view_type_v && + detail::is_simd_view_type_v, + simd> +atomic_update(AccessorTy acc, OffsetSimdViewT byte_offset, SrcSimdViewT src0, + SrcSimdViewT src1, PropertyListT props = {}) { + static_assert( + N == OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY(), + "Size of src0, src1 and byte_offset parameters must correspond."); + return atomic_update(acc, byte_offset.read(), src0.read(), + src1.read(), props); +} + /// A variation of \c atomic_update API with \c offsets represented as /// scalar. /// diff --git a/sycl/test/esimd/memory_properties_atomic_update.cpp b/sycl/test/esimd/memory_properties_atomic_update.cpp index a71d64ef984d5..aecf71759a3b3 100644 --- a/sycl/test/esimd/memory_properties_atomic_update.cpp +++ b/sycl/test/esimd/memory_properties_atomic_update.cpp @@ -267,8 +267,8 @@ test_atomic_update(AccType &acc, LocalAccTypeInt local_acc, float *ptrf, // Accessors - // CHECK-STATEFUL-COUNT-14: call <4 x i32> @llvm.genx.lsc.xatomic.bti.v4i32.v4i1.v4i32(<4 x i1> {{[^)]+}}, i8 12, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> undef, i32 {{[^)]+}}, <4 x i32> undef) - // CHECK-STATELESS-COUNT-14: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 12, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> undef, i32 0, <4 x i32> undef) + // CHECK-STATEFUL-COUNT-26: call <4 x i32> @llvm.genx.lsc.xatomic.bti.v4i32.v4i1.v4i32(<4 x i1> {{[^)]+}}, i8 12, i8 1, 
i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> undef, i32 {{[^)]+}}, <4 x i32> undef) + // CHECK-STATELESS-COUNT-26: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 12, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> undef, i32 0, <4 x i32> undef) auto res_atomic_9 = atomic_update(acc, offsets, add, pred, props_a); @@ -311,6 +311,42 @@ test_atomic_update(AccType &acc, LocalAccTypeInt local_acc, float *ptrf, res_atomic_16 = atomic_update( acc, offsets_view.select(), add_view.select(), props_a); + res_atomic_11 = + atomic_update(acc, offsets, add_view, pred, props_a); + + res_atomic_11 = atomic_update( + acc, offsets, add_view.select(), pred, props_a); + + res_atomic_12 = + atomic_update(acc, offsets, add_view, props_a); + + res_atomic_12 = atomic_update( + acc, offsets, add_view.select(), props_a); + + res_atomic_13 = + atomic_update(acc, offsets_view, add, pred, props_a); + + res_atomic_13 = atomic_update( + acc, offsets_view.select(), add, pred, props_a); + + res_atomic_14 = + atomic_update(acc, offsets_view, add, props_a); + res_atomic_14 = atomic_update( + acc, offsets_view.select(), add, props_a); + + res_atomic_15 = atomic_update(acc, offsets_view, add_view, + pred, props_a); + + res_atomic_15 = + atomic_update(acc, offsets_view.select(), + add_view.select(), pred, props_a); + + res_atomic_16 = + atomic_update(acc, offsets_view, add_view, props_a); + + res_atomic_16 = atomic_update( + acc, offsets_view.select(), add_view.select(), props_a); + // atomic_update without cache hints: // CHECK-STATEFUL: call <4 x i32> @llvm.genx.dword.atomic.sub.v4i32.v4i1.v4i32(<4 x i1> {{[^)]+}}, i32 {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> undef) // CHECK-STATELESS: call <4 x i32> @llvm.genx.svm.atomic.sub.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> undef) @@ -489,8 +525,8 
@@ test_atomic_update(AccType &acc, LocalAccTypeInt local_acc, float *ptrf, // Accessors - // CHECK-STATEFUL-COUNT-30: call <4 x i32> @llvm.genx.lsc.xatomic.bti.v4i32.v4i1.v4i32(<4 x i1> {{[^)]+}}, i8 18, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 {{[^)]+}}, <4 x i32> undef) - // CHECK-STATELESS-COUNT-30: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 18, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 0, <4 x i32> undef) + // CHECK-STATEFUL-COUNT-58: call <4 x i32> @llvm.genx.lsc.xatomic.bti.v4i32.v4i1.v4i32(<4 x i1> {{[^)]+}}, i8 18, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 {{[^)]+}}, <4 x i32> undef) + // CHECK-STATELESS-COUNT-58: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 18, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 0, <4 x i32> undef) auto res_atomic_17 = atomic_update( acc, offsets, swap, compare, pred, props_a); @@ -589,6 +625,98 @@ test_atomic_update(AccType &acc, LocalAccTypeInt local_acc, float *ptrf, acc, offsets_view.select(), swap_view.select(), compare_view.select(), props_a); + res_atomic_19 = atomic_update( + acc, offsets, swap, compare_view, pred, props_a); + + res_atomic_20 = atomic_update(acc, offsets, swap, + compare_view, props_a); + + res_atomic_21 = atomic_update(acc, offsets, swap_view, + compare, pred, props_a); + + res_atomic_22 = atomic_update(acc, offsets, swap_view, + compare, props_a); + + res_atomic_23 = atomic_update( + acc, offsets, swap_view, compare_view, pred, props_a); + + res_atomic_24 = atomic_update(acc, offsets, swap_view, + compare_view, props_a); + + res_atomic_25 = atomic_update(acc, offsets_view, swap, + compare, pred, props_a); + + res_atomic_26 
= atomic_update(acc, offsets_view, swap, + compare, props_a); + + res_atomic_27 = atomic_update( + acc, offsets_view, swap, compare_view, pred, props_a); + + res_atomic_28 = atomic_update(acc, offsets_view, swap, + compare_view, props_a); + + res_atomic_29 = atomic_update( + acc, offsets_view, swap_view, compare, pred, props_a); + + res_atomic_30 = atomic_update( + acc, offsets_view, swap_view, compare, props_a); + + res_atomic_31 = atomic_update( + acc, offsets_view, swap_view, compare_view, pred, props_a); + + res_atomic_32 = atomic_update( + acc, offsets_view, swap_view, compare_view, props_a); + + res_atomic_19 = atomic_update( + acc, offsets, swap, compare_view.select(), pred, props_a); + + res_atomic_20 = atomic_update( + acc, offsets, swap, compare_view.select(), props_a); + + res_atomic_21 = atomic_update( + acc, offsets, swap_view.select(), compare, pred, props_a); + + res_atomic_22 = atomic_update( + acc, offsets, swap_view.select(), compare, props_a); + + res_atomic_23 = atomic_update( + acc, offsets, swap_view.select(), compare_view.select(), + pred, props_a); + + res_atomic_24 = atomic_update( + acc, offsets, swap_view.select(), compare_view.select(), + props_a); + + res_atomic_25 = atomic_update( + acc, offsets_view.select(), swap, compare, pred, props_a); + + res_atomic_26 = atomic_update( + acc, offsets_view.select(), swap, compare, props_a); + + res_atomic_27 = atomic_update( + acc, offsets_view.select(), swap, compare_view.select(), + pred, props_a); + + res_atomic_28 = atomic_update( + acc, offsets_view.select(), swap, compare_view.select(), + props_a); + + res_atomic_29 = atomic_update( + acc, offsets_view.select(), swap_view.select(), compare, + pred, props_a); + + res_atomic_30 = atomic_update( + acc, offsets_view.select(), swap_view.select(), compare, + props_a); + + res_atomic_31 = atomic_update( + acc, offsets_view.select(), swap_view.select(), + compare_view.select(), pred, props_a); + + res_atomic_32 = atomic_update( + acc, 
offsets_view.select(), swap_view.select(), + compare_view.select(), props_a); + { constexpr int VL = 8; simd offsets = simd(1) * sizeof(int); From 353cc51fe9d8abee4c2868d73dcad7b429a778ce Mon Sep 17 00:00:00 2001 From: fineg74 <61437305+fineg74@users.noreply.github.com> Date: Thu, 6 Jun 2024 13:39:51 -0700 Subject: [PATCH 32/55] [ESIMD] Allow full autodeduction of template parameters for atomic_update USM API accepting simd_view (#14043) --- sycl/include/sycl/ext/intel/esimd/memory.hpp | 727 +++++++++++++++++- .../esimd/memory_properties_atomic_update.cpp | 156 +++- 2 files changed, 824 insertions(+), 59 deletions(-) diff --git a/sycl/include/sycl/ext/intel/esimd/memory.hpp b/sycl/include/sycl/ext/intel/esimd/memory.hpp index 9619110600899..6272d8ce97d10 100644 --- a/sycl/include/sycl/ext/intel/esimd/memory.hpp +++ b/sycl/include/sycl/ext/intel/esimd/memory.hpp @@ -9263,8 +9263,37 @@ __ESIMD_API std::enable_if_t< detail::is_simd_view_type_v, simd> atomic_update(T *p, OffsetSimdViewT byte_offset, PropertyListT props = {}) { - simd_mask mask = 1; - return atomic_update(p, byte_offset.read(), mask, props); + return atomic_update(p, byte_offset.read(), props); +} + +/// simd +/// atomic_update(T *p, OffsetSimdViewT byte_offset, +/// props = {}); +/// +/// A variation of \c atomic_update API with \c offsets represented as +/// \c simd_view object without mask operand and allows the use without +/// specifying \c T and \c N template parameters. +/// +/// @tparam Op The atomic operation - can be \c atomic_op::inc, +/// \c atomic_op::dec, or \c atomic_op::load. +/// @param p The USM pointer. +/// @param byte_offset The simd_view of 32-bit or 64-bit offsets in bytes. +/// @param props The parameter 'props' specifies the optional compile-time +/// properties list. Only L1/L2 properties are used. +/// Other properties are ignored. +/// @return A vector of the old values at the memory locations before the +/// update. 
+template < + atomic_op Op, typename OffsetSimdViewT, typename T, + int N = OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY(), + typename PropertyListT = ext::oneapi::experimental::empty_properties_t> +__ESIMD_API std::enable_if_t< + __ESIMD_DNS::get_num_args() == 0 && + ext::oneapi::experimental::is_property_list_v && + detail::is_simd_view_type_v, + simd> +atomic_update(T *p, OffsetSimdViewT byte_offset, PropertyListT props = {}) { + return atomic_update(p, byte_offset.read(), props); } /// A variation of \c atomic_update API with \c offset represented as @@ -9400,16 +9429,61 @@ atomic_update(T *p, simd byte_offset, simd src0, /// simd /// atomic_update(T *ptr, simd byte_offset, -/// simd src0, props = {}); // (usm-au1-2) - -/// A variation of \c atomic_update API without mask operand. - +/// SrcSimdViewT src0, simd_mask mask, props = {}); +/// +/// Atomically updates \c N memory locations represented by a USM pointer and +/// a vector of offsets relative to the pointer, and returns a vector of old +/// values found at the memory locations before update. The update operation +/// has 1 additional argument. +/// A variation of \c atomic_update API with \c src0 represented as +/// \c simd_view object and allows the use without specifying \c T and \c N +/// template parameters. +/// /// @tparam Op The atomic operation - can be one of the following: /// \c atomic_op::add, \c atomic_op::sub, \c atomic_op::min, \c atomic_op::max, /// \c atomic_op::xchg, \c atomic_op::bit_and, \c atomic_op::bit_or, /// \c atomic_op::bit_xor, \c atomic_op::minsint, \c atomic_op::maxsint, /// \c atomic_op::fmax, \c atomic_op::fmin, \c atomic_op::fadd, \c /// atomic_op::fsub, \c atomic_op::store. +/// @param p The USM pointer. +/// @param byte_offset The vector of 32-bit or 64-bit offsets in bytes. +/// @param src0 The additional argument. +/// @param mask Operation mask, only locations with non-zero in the +/// corresponding mask element are updated. 
+/// @param props The parameter 'props' specifies the optional compile-time +/// properties list. Only L1/L2 properties are used. Other properties are +/// ignored. +/// @return A vector of the old values at the memory locations before the +/// update. +/// +template < + atomic_op Op, typename SrcSimdViewT, typename T, int N, typename Toffset, + typename PropertyListT = ext::oneapi::experimental::empty_properties_t> +__ESIMD_API std::enable_if_t< + __ESIMD_DNS::get_num_args() == 1 && + detail::is_simd_view_type_v && + ext::oneapi::experimental::is_property_list_v, + simd> +atomic_update(T *p, simd byte_offset, SrcSimdViewT src0, + simd_mask mask, PropertyListT props = {}) { + static_assert(N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(), + "Size of src0 parameter must correspond to the size of " + "byte_offset parameter."); + return atomic_update(p, byte_offset, src0.read(), mask, props); +} + +/// simd +/// atomic_update(T *ptr, simd byte_offset, +/// simd src0, props = {}); // (usm-au1-2) + +/// A variation of \c atomic_update API without mask operand. + +/// @tparam Op The atomic operation - can be one of the following: +/// \c atomic_op::add, \c atomic_op::sub, \c atomic_op::min, \c +/// atomic_op::max, \c atomic_op::xchg, \c atomic_op::bit_and, \c +/// atomic_op::bit_or, \c atomic_op::bit_xor, \c atomic_op::minsint, \c +/// atomic_op::maxsint, \c atomic_op::fmax, \c atomic_op::fmin, \c +/// atomic_op::fadd, \c atomic_op::fsub, \c atomic_op::store. /// @tparam T The vector element type. /// @tparam N The number of memory locations to update. /// @param p The USM pointer. 
@@ -9434,6 +9508,47 @@ atomic_update(T *p, simd byte_offset, simd src0, return atomic_update(p, byte_offset, src0, mask, props); } +/// simd +/// atomic_update(T *ptr, simd byte_offset, +/// SrcSimdViewT src0, props = {}); + +/// A variation of \c atomic_update API with \c src0 represented as +/// \c simd_view object and no mask operand and allows the use without +/// specifying \c T and \c N template parameters. + +/// @tparam Op The atomic operation - can be one of the following: +/// \c atomic_op::add, \c atomic_op::sub, \c atomic_op::min, \c +/// atomic_op::max, \c atomic_op::xchg, \c atomic_op::bit_and, \c +/// atomic_op::bit_or, \c atomic_op::bit_xor, \c atomic_op::minsint, \c +/// atomic_op::maxsint, \c atomic_op::fmax, \c atomic_op::fmin, \c +/// atomic_op::fadd, \c atomic_op::fsub, \c atomic_op::store. +/// @tparam T The vector element type. +/// @tparam N The number of memory locations to update. +/// @param p The USM pointer. +/// @param byte_offset The vector of 32-bit or 64-bit offsets in bytes. +/// @param src0 The additional argument. +/// @param props The parameter 'props' specifies the optional compile-time +/// properties list. Only L1/L2 properties are used. Other properties are +/// ignored. +/// @return A vector of the old values at the memory locations before the +/// update. 
+/// +template < + atomic_op Op, typename SrcSimdViewT, typename T, int N, typename Toffset, + typename PropertyListT = ext::oneapi::experimental::empty_properties_t> +__ESIMD_API std::enable_if_t< + __ESIMD_DNS::get_num_args() == 1 && + detail::is_simd_view_type_v && + ext::oneapi::experimental::is_property_list_v, + simd> +atomic_update(T *p, simd byte_offset, SrcSimdViewT src0, + PropertyListT props = {}) { + static_assert(N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(), + "Size of src0 parameter must correspond to the size of " + "byte_offset parameter."); + return atomic_update(p, byte_offset, src0.read(), props); +} + /// simd /// atomic_update(T *p, OffsetSimdViewT byte_offset, /// simd src0, @@ -9474,6 +9589,54 @@ atomic_update(T *p, OffsetSimdViewT offsets, simd src0, simd_mask mask, return atomic_update(p, offsets.read(), src0, mask, props); } +/// simd +/// atomic_update(T *p, OffsetSimdViewT byte_offset, +/// SrcSimdViewT src0, +/// simd_mask mask, props = {}); +/// +/// A variation of \c atomic_update API with \c byte_offset and \c src0 +/// represented as \c simd_view object and allows the use without specifying \c +/// T and \c N template parameters. +/// +/// @tparam Op The atomic operation - can be one of the following: +/// \c atomic_op::add, \c atomic_op::sub, \c atomic_op::min, \c +/// atomic_op::max, \c atomic_op::xchg, \c atomic_op::bit_and, \c +/// atomic_op::bit_or, \c atomic_op::bit_xor, \c atomic_op::minsint, \c +/// atomic_op::maxsint, \c atomic_op::fmax, \c atomic_op::fmin, \c +/// atomic_op::fadd, \c atomic_op::fsub, \c atomic_op::store. +/// @tparam T The vector element type. +/// @tparam N The number of memory locations to update. +/// @param p The USM pointer. +/// @param byte_offset The simd_view of 32-bit or 64-bit offsets in bytes. +/// @param src0 The additional argument. +/// @param mask Operation mask, only locations with non-zero in the +/// corresponding mask element are updated. 
+/// @param props The parameter 'props' specifies the optional compile-time +/// properties list. Only L1/L2 properties are used. Other properties are +/// ignored. +/// @return A vector of the old values at the memory locations before the +/// update. +/// +template < + atomic_op Op, typename OffsetSimdViewT, typename SrcSimdViewT, typename T, + int N, + typename PropertyListT = ext::oneapi::experimental::empty_properties_t> +__ESIMD_API std::enable_if_t< + __ESIMD_DNS::get_num_args() == 1 && + ext::oneapi::experimental::is_property_list_v && + detail::is_simd_view_type_v && + detail::is_simd_view_type_v, + simd> +atomic_update(T *p, OffsetSimdViewT offsets, SrcSimdViewT src0, + simd_mask mask, PropertyListT props = {}) { + static_assert( + N == OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY() && + N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(), + "Size of src0 and offsets parameters must correspond to the size of " + "mask parameter."); + return atomic_update(p, offsets.read(), src0.read(), mask, props); +} + /// simd /// atomic_update(T *p, OffsetSimdViewT byte_offset, /// simd src0, @@ -9513,6 +9676,48 @@ atomic_update(T *p, OffsetSimdViewT offsets, simd src0, return atomic_update(p, offsets.read(), src0, mask, props); } +/// simd +/// atomic_update(T *p, OffsetSimdViewT byte_offset, +/// SrcSimdViewT src0, +/// props = {}); +/// +/// A variation of \c atomic_update API with \c byte_offset represented as +/// \c simd_view object and no mask operand and allows the use without +/// specifying \c T and \c N template parameters. +/// +/// @tparam Op The atomic operation - can be one of the following: +/// \c atomic_op::add, \c atomic_op::sub, \c atomic_op::min, \c +/// atomic_op::max, \c atomic_op::xchg, \c atomic_op::bit_and, \c +/// atomic_op::bit_or, \c atomic_op::bit_xor, \c atomic_op::minsint, \c +/// atomic_op::maxsint, \c atomic_op::fmax, \c atomic_op::fmin, \c +/// atomic_op::fadd, \c atomic_op::fsub, \c atomic_op::store. 
+/// @param p The USM pointer. +/// @param byte_offset The simd_view of 32-bit or 64-bit offsets in bytes. +/// @param src0 The additional argument. +/// @param props The parameter 'props' specifies the optional compile-time +/// properties list. Only L1/L2 properties are used. Other properties are +/// ignored. +/// @return A vector of the old values at the memory locations before the +/// update. +/// +template < + atomic_op Op, typename OffsetSimdViewT, typename SrcSimdViewT, typename T, + int N = SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(), + typename PropertyListT = ext::oneapi::experimental::empty_properties_t> +__ESIMD_API std::enable_if_t< + __ESIMD_DNS::get_num_args() == 1 && + ext::oneapi::experimental::is_property_list_v && + detail::is_simd_view_type_v && + detail::is_simd_view_type_v, + simd> +atomic_update(T *p, OffsetSimdViewT offsets, SrcSimdViewT src0, + PropertyListT props = {}) { + static_assert(N == OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY(), + "Size of src0 parameter must correspond to the size of " + "offsets parameter."); + return atomic_update(p, offsets.read(), src0.read(), props); +} + /// A variation of \c atomic_update API with \c offset represented as /// scalar object. /// @@ -9644,17 +9849,21 @@ atomic_update(T *p, simd byte_offset, simd src0, /// simd /// atomic_update(T *p, simd byte_offset, -/// simd src0, simd src1, -/// props = {}); // (usm-au2-2) -// +/// SrcSimdViewT src0, simd src1, +/// simd_mask mask, props = {}); +/// +/// A variation of \c atomic_update API with \c src0 represented as +/// \c simd_view object and allows the use without specifying \c T and \c N +/// template parameters. + /// @tparam Op The atomic operation - can be one of the following: /// \c atomic_op::cmpxchg, \c atomic_op::fcmpxchg. -/// @tparam T The vector element type. -/// @tparam N The number of memory locations to update. /// @param p The USM pointer. /// @param byte_offset The vector of 32-bit or 64-bit offsets in bytes. 
/// @param src0 The first additional argument (new value). /// @param src1 The second additional argument (expected value). +/// @param mask Operation mask, only locations with non-zero in the +/// corresponding mask element are updated. /// @param props The parameter 'props' specifies the optional compile-time /// properties list. Only L1/L2 properties are used. // Other properties are ignored. @@ -9662,27 +9871,33 @@ atomic_update(T *p, simd byte_offset, simd src0, /// update. /// template < - atomic_op Op, typename T, int N, typename Toffset, + atomic_op Op, typename SrcSimdViewT, typename T, int N, typename Toffset, typename PropertyListT = ext::oneapi::experimental::empty_properties_t> __ESIMD_API std::enable_if_t< __ESIMD_DNS::get_num_args() == 2 && + detail::is_simd_view_type_v && ext::oneapi::experimental::is_property_list_v, simd> -atomic_update(T *p, simd byte_offset, simd src0, - simd src1, PropertyListT props = {}) { - simd_mask mask = 1; - return atomic_update(p, byte_offset, src0, src1, mask, props); +atomic_update(T *p, simd byte_offset, SrcSimdViewT src0, + simd src1, simd_mask mask, PropertyListT props = {}) { + static_assert(N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(), + "Size of src0 parameter must correspond to the size of " + "byte_offset parameter."); + return atomic_update(p, byte_offset, src0.read(), src1, mask, + props); } /// simd -/// atomic_update(T *p, OffsetSimdViewT byte_offset, -/// simd src0, simd src1, -/// simd_mask mask, props = {}) // (usm-au2-3) +/// atomic_update(T *p, simd byte_offset, +/// simd src0, SrcSimdViewT src1, +/// simd_mask mask, props = {}); /// +/// A variation of \c atomic_update API with \c src1 represented as +/// \c simd_view object and allows the use without specifying \c T and \c N +/// template parameters. + /// @tparam Op The atomic operation - can be one of the following: /// \c atomic_op::cmpxchg, \c atomic_op::fcmpxchg. -/// @tparam T The vector element type. 
-/// @tparam N The number of memory locations to update. /// @param p The USM pointer. /// @param byte_offset The vector of 32-bit or 64-bit offsets in bytes. /// @param src0 The first additional argument (new value). @@ -9694,28 +9909,370 @@ atomic_update(T *p, simd byte_offset, simd src0, // Other properties are ignored. /// @return A vector of the old values at the memory locations before the /// update. +/// template < - atomic_op Op, typename T, int N, typename OffsetSimdViewT, + atomic_op Op, typename SrcSimdViewT, typename T, int N, typename Toffset, typename PropertyListT = ext::oneapi::experimental::empty_properties_t> __ESIMD_API std::enable_if_t< __ESIMD_DNS::get_num_args() == 2 && - ext::oneapi::experimental::is_property_list_v && - detail::is_simd_view_type_v, + detail::is_simd_view_type_v && + ext::oneapi::experimental::is_property_list_v, simd> -atomic_update(T *p, OffsetSimdViewT byte_offset, simd src0, - simd src1, simd_mask mask, PropertyListT props = {}) { - return atomic_update(p, byte_offset.read(), src0, src1, mask, +atomic_update(T *p, simd byte_offset, simd src0, + SrcSimdViewT src1, simd_mask mask, PropertyListT props = {}) { + static_assert(N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(), + "Size of src1 parameter must correspond to the size of " + "byte_offset parameter."); + return atomic_update(p, byte_offset, src0, src1.read(), mask, props); } /// simd -/// atomic_update(T *p, OffsetSimdViewT byte_offset, -/// simd src0, simd src1, -/// props = {}) // (usm-au2-4) +/// atomic_update(T *p, simd byte_offset, +/// SrcSimdViewT src0, SrcSimdViewT src1, +/// simd_mask mask, props = {}); /// +/// A variation of \c atomic_update API with \c src0 and \c src1 represented as +/// \c simd_view object and allows the use without specifying \c T and \c N +/// template parameters. + /// @tparam Op The atomic operation - can be one of the following: /// \c atomic_op::cmpxchg, \c atomic_op::fcmpxchg. -/// @tparam T The vector element type. 
+/// @param p The USM pointer. +/// @param byte_offset The vector of 32-bit or 64-bit offsets in bytes. +/// @param src0 The first additional argument (new value). +/// @param src1 The second additional argument (expected value). +/// @param mask Operation mask, only locations with non-zero in the +/// corresponding mask element are updated. +/// @param props The parameter 'props' specifies the optional compile-time +/// properties list. Only L1/L2 properties are used. +// Other properties are ignored. +/// @return A vector of the old values at the memory locations before the +/// update. +/// +template < + atomic_op Op, typename SrcSimdViewT, typename T, int N, typename Toffset, + typename PropertyListT = ext::oneapi::experimental::empty_properties_t> +__ESIMD_API std::enable_if_t< + __ESIMD_DNS::get_num_args() == 2 && + detail::is_simd_view_type_v && + ext::oneapi::experimental::is_property_list_v, + simd> +atomic_update(T *p, simd byte_offset, SrcSimdViewT src0, + SrcSimdViewT src1, simd_mask mask, PropertyListT props = {}) { + static_assert( + N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(), + "Size of src1 and src0 parameters must correspond to the size of " + "byte_offset parameter."); + return atomic_update(p, byte_offset, src0.read(), src1.read(), mask, + props); +} + +/// simd +/// atomic_update(T *p, simd byte_offset, +/// simd src0, simd src1, +/// props = {}); // (usm-au2-2) +// +/// @tparam Op The atomic operation - can be one of the following: +/// \c atomic_op::cmpxchg, \c atomic_op::fcmpxchg. +/// @tparam T The vector element type. +/// @tparam N The number of memory locations to update. +/// @param p The USM pointer. +/// @param byte_offset The vector of 32-bit or 64-bit offsets in bytes. +/// @param src0 The first additional argument (new value). +/// @param src1 The second additional argument (expected value). +/// @param props The parameter 'props' specifies the optional compile-time +/// properties list. Only L1/L2 properties are used. 
+// Other properties are ignored. +/// @return A vector of the old values at the memory locations before the +/// update. +/// +template < + atomic_op Op, typename T, int N, typename Toffset, + typename PropertyListT = ext::oneapi::experimental::empty_properties_t> +__ESIMD_API std::enable_if_t< + __ESIMD_DNS::get_num_args() == 2 && + ext::oneapi::experimental::is_property_list_v, + simd> +atomic_update(T *p, simd byte_offset, simd src0, + simd src1, PropertyListT props = {}) { + simd_mask mask = 1; + return atomic_update(p, byte_offset, src0, src1, mask, props); +} + +/// simd +/// atomic_update(T *p, simd byte_offset, +/// SrcSimdViewT src0, simd src1, +/// props = {}); +/// +/// A variation of \c atomic_update API with \c src0 represented as +/// \c simd_view object without \c mask operand and allows the use without +/// specifying \c T and \c N template parameters. + +/// @tparam Op The atomic operation - can be one of the following: +/// \c atomic_op::cmpxchg, \c atomic_op::fcmpxchg. +/// @param p The USM pointer. +/// @param byte_offset The vector of 32-bit or 64-bit offsets in bytes. +/// @param src0 The first additional argument (new value). +/// @param src1 The second additional argument (expected value). +/// @param props The parameter 'props' specifies the optional compile-time +/// properties list. Only L1/L2 properties are used. +// Other properties are ignored. +/// @return A vector of the old values at the memory locations before the +/// update. 
+/// +template < + atomic_op Op, typename SrcSimdViewT, typename T, int N, typename Toffset, + typename PropertyListT = ext::oneapi::experimental::empty_properties_t> +__ESIMD_API std::enable_if_t< + __ESIMD_DNS::get_num_args() == 2 && + detail::is_simd_view_type_v && + ext::oneapi::experimental::is_property_list_v, + simd> +atomic_update(T *p, simd byte_offset, SrcSimdViewT src0, + simd src1, PropertyListT props = {}) { + static_assert(N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(), + "Size of src0 parameter must correspond to the size of " + "byte_offset parameter."); + return atomic_update(p, byte_offset, src0.read(), src1, props); +} + +/// simd +/// atomic_update(T *p, simd byte_offset, +/// simd src0, SrcSimdViewT src1, +/// props = {}); +/// +/// A variation of \c atomic_update API with \c src1 represented as +/// \c simd_view object without \c mask operand and allows the use without +/// specifying \c T and \c N template parameters. + +/// @tparam Op The atomic operation - can be one of the following: +/// \c atomic_op::cmpxchg, \c atomic_op::fcmpxchg. +/// @param p The USM pointer. +/// @param byte_offset The vector of 32-bit or 64-bit offsets in bytes. +/// @param src0 The first additional argument (new value). +/// @param src1 The second additional argument (expected value). +/// @param props The parameter 'props' specifies the optional compile-time +/// properties list. Only L1/L2 properties are used. +// Other properties are ignored. +/// @return A vector of the old values at the memory locations before the +/// update. 
+/// +template < + atomic_op Op, typename SrcSimdViewT, typename T, int N, typename Toffset, + typename PropertyListT = ext::oneapi::experimental::empty_properties_t> +__ESIMD_API std::enable_if_t< + __ESIMD_DNS::get_num_args() == 2 && + detail::is_simd_view_type_v && + ext::oneapi::experimental::is_property_list_v, + simd> +atomic_update(T *p, simd byte_offset, simd src0, + SrcSimdViewT src1, PropertyListT props = {}) { + static_assert(N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(), + "Size of src1 parameter must correspond to the size of " + "byte_offset parameter."); + return atomic_update(p, byte_offset, src0, src1.read(), props); +} + +/// simd +/// atomic_update(T *p, simd byte_offset, +/// SrcSimdViewT src0, SrcSimdViewT src1, +/// props = {}); +/// +/// A variation of \c atomic_update API with \c src0 and \c src1 represented as +/// \c simd_view object without \c mask operand and allows the use without +/// specifying \c T and \c N template parameters. + +/// @tparam Op The atomic operation - can be one of the following: +/// \c atomic_op::cmpxchg, \c atomic_op::fcmpxchg. +/// @param p The USM pointer. +/// @param byte_offset The vector of 32-bit or 64-bit offsets in bytes. +/// @param src0 The first additional argument (new value). +/// @param src1 The second additional argument (expected value). +/// @param props The parameter 'props' specifies the optional compile-time +/// properties list. Only L1/L2 properties are used. +// Other properties are ignored. +/// @return A vector of the old values at the memory locations before the +/// update. 
+/// +template < + atomic_op Op, typename SrcSimdViewT, typename T, int N, typename Toffset, + typename PropertyListT = ext::oneapi::experimental::empty_properties_t> +__ESIMD_API std::enable_if_t< + __ESIMD_DNS::get_num_args() == 2 && + detail::is_simd_view_type_v && + ext::oneapi::experimental::is_property_list_v, + simd> +atomic_update(T *p, simd byte_offset, SrcSimdViewT src0, + SrcSimdViewT src1, PropertyListT props = {}) { + static_assert( + N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(), + "Size of src1 and src0 parameters must correspond to the size of " + "byte_offset parameter."); + return atomic_update(p, byte_offset, src0.read(), src1.read(), + props); +} + +/// simd +/// atomic_update(T *p, OffsetSimdViewT byte_offset, +/// simd src0, simd src1, +/// simd_mask mask, props = {}) // (usm-au2-3) +/// +/// @tparam Op The atomic operation - can be one of the following: +/// \c atomic_op::cmpxchg, \c atomic_op::fcmpxchg. +/// @tparam T The vector element type. +/// @tparam N The number of memory locations to update. +/// @param p The USM pointer. +/// @param byte_offset The vector of 32-bit or 64-bit offsets in bytes. +/// @param src0 The first additional argument (new value). +/// @param src1 The second additional argument (expected value). +/// @param mask Operation mask, only locations with non-zero in the +/// corresponding mask element are updated. +/// @param props The parameter 'props' specifies the optional compile-time +/// properties list. Only L1/L2 properties are used. +// Other properties are ignored. +/// @return A vector of the old values at the memory locations before the +/// update. 
+template < + atomic_op Op, typename T, int N, typename OffsetSimdViewT, + typename PropertyListT = ext::oneapi::experimental::empty_properties_t> +__ESIMD_API std::enable_if_t< + __ESIMD_DNS::get_num_args() == 2 && + ext::oneapi::experimental::is_property_list_v && + detail::is_simd_view_type_v, + simd> +atomic_update(T *p, OffsetSimdViewT byte_offset, simd src0, + simd src1, simd_mask mask, PropertyListT props = {}) { + return atomic_update(p, byte_offset.read(), src0, src1, mask, + props); +} + +/// simd +/// atomic_update(T *p, OffsetSimdViewT byte_offset, +/// SrcSimdViewT src0, simd src1, +/// simd_mask mask, props = {}) +/// +/// @tparam Op The atomic operation - can be one of the following: +/// \c atomic_op::cmpxchg, \c atomic_op::fcmpxchg. +/// @param p The USM pointer. +/// @param byte_offset The vector of 32-bit or 64-bit offsets in bytes. +/// @param src0 The first additional argument (new value). +/// @param src1 The second additional argument (expected value). +/// @param mask Operation mask, only locations with non-zero in the +/// corresponding mask element are updated. +/// @param props The parameter 'props' specifies the optional compile-time +/// properties list. Only L1/L2 properties are used. +// Other properties are ignored. +/// @return A vector of the old values at the memory locations before the +/// update. 
+template < + atomic_op Op, typename SrcSimdViewT, typename OffsetSimdViewT, typename T, + int N, + typename PropertyListT = ext::oneapi::experimental::empty_properties_t> +__ESIMD_API std::enable_if_t< + __ESIMD_DNS::get_num_args() == 2 && + ext::oneapi::experimental::is_property_list_v && + detail::is_simd_view_type_v && + detail::is_simd_view_type_v, + simd> +atomic_update(T *p, OffsetSimdViewT byte_offset, SrcSimdViewT src0, + simd src1, simd_mask mask, PropertyListT props = {}) { + static_assert( + N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY() && + N == OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY(), + "Size of src0 and byte_offset parameters must correspond to the size of " + "mask parameter."); + return atomic_update(p, byte_offset.read(), src0.read(), src1, mask, + props); +} + +/// simd +/// atomic_update(T *p, OffsetSimdViewT byte_offset, +/// simd src0, SrcSimdViewT src1, +/// simd_mask mask, props = {}) +/// +/// @tparam Op The atomic operation - can be one of the following: +/// \c atomic_op::cmpxchg, \c atomic_op::fcmpxchg. +/// @param p The USM pointer. +/// @param byte_offset The vector of 32-bit or 64-bit offsets in bytes. +/// @param src0 The first additional argument (new value). +/// @param src1 The second additional argument (expected value). +/// @param mask Operation mask, only locations with non-zero in the +/// corresponding mask element are updated. +/// @param props The parameter 'props' specifies the optional compile-time +/// properties list. Only L1/L2 properties are used. +// Other properties are ignored. +/// @return A vector of the old values at the memory locations before the +/// update. 
+template < + atomic_op Op, typename SrcSimdViewT, typename OffsetSimdViewT, typename T, + int N, + typename PropertyListT = ext::oneapi::experimental::empty_properties_t> +__ESIMD_API std::enable_if_t< + __ESIMD_DNS::get_num_args() == 2 && + ext::oneapi::experimental::is_property_list_v && + detail::is_simd_view_type_v && + detail::is_simd_view_type_v, + simd> +atomic_update(T *p, OffsetSimdViewT byte_offset, simd src0, + SrcSimdViewT src1, simd_mask mask, PropertyListT props = {}) { + static_assert( + N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY() && + N == OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY(), + "Size of src1 and byte_offset parameters must correspond to the size of " + "mask parameter."); + return atomic_update(p, byte_offset.read(), src0, src1.read(), mask, + props); +} + +/// simd +/// atomic_update(T *p, OffsetSimdViewT byte_offset, +/// SrcSimdViewT src0, SrcSimdViewT src1, +/// simd_mask mask, props = {}) +/// +/// @tparam Op The atomic operation - can be one of the following: +/// \c atomic_op::cmpxchg, \c atomic_op::fcmpxchg. +/// @param p The USM pointer. +/// @param byte_offset The vector of 32-bit or 64-bit offsets in bytes. +/// @param src0 The first additional argument (new value). +/// @param src1 The second additional argument (expected value). +/// @param mask Operation mask, only locations with non-zero in the +/// corresponding mask element are updated. +/// @param props The parameter 'props' specifies the optional compile-time +/// properties list. Only L1/L2 properties are used. +// Other properties are ignored. +/// @return A vector of the old values at the memory locations before the +/// update. 
+template < + atomic_op Op, typename SrcSimdViewT, typename OffsetSimdViewT, typename T, + int N, + typename PropertyListT = ext::oneapi::experimental::empty_properties_t> +__ESIMD_API std::enable_if_t< + __ESIMD_DNS::get_num_args() == 2 && + ext::oneapi::experimental::is_property_list_v && + detail::is_simd_view_type_v && + detail::is_simd_view_type_v, + simd> +atomic_update(T *p, OffsetSimdViewT byte_offset, SrcSimdViewT src0, + SrcSimdViewT src1, simd_mask mask, PropertyListT props = {}) { + static_assert(N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY() && + N == OffsetSimdViewT::getSizeX() * + OffsetSimdViewT::getSizeY(), + "Size of src0, src1 and byte_offset parameters must correspond " + "to the size of " + "mask parameter."); + return atomic_update(p, byte_offset.read(), src0.read(), + src1.read(), mask, props); +} + +/// simd +/// atomic_update(T *p, OffsetSimdViewT byte_offset, +/// simd src0, simd src1, +/// props = {}) // (usm-au2-4) +/// +/// @tparam Op The atomic operation - can be one of the following: +/// \c atomic_op::cmpxchg, \c atomic_op::fcmpxchg. +/// @tparam T The vector element type. /// @tparam N The number of memory locations to update. /// @param p The USM pointer. /// @param byte_offset The vector of 32-bit or 64-bit offsets in bytes. @@ -9741,6 +10298,114 @@ atomic_update(T *p, OffsetSimdViewT byte_offset, simd src0, props); } +/// simd +/// atomic_update(T *p, OffsetSimdViewT byte_offset, +/// SrcSimdViewT src0, simd src1, +/// props = {}) +/// +/// @tparam Op The atomic operation - can be one of the following: +/// \c atomic_op::cmpxchg, \c atomic_op::fcmpxchg. +/// @param p The USM pointer. +/// @param byte_offset The vector of 32-bit or 64-bit offsets in bytes. +/// @param src0 The first additional argument (new value). +/// @param src1 The second additional argument (expected value). +/// @param props The parameter 'props' specifies the optional compile-time +/// properties list. Only L1/L2 properties are used. 
+// Other properties are ignored. +/// @return A vector of the old values at the memory locations before the +/// update. +template < + atomic_op Op, typename SrcSimdViewT, typename OffsetSimdViewT, typename T, + int N, + typename PropertyListT = ext::oneapi::experimental::empty_properties_t> +__ESIMD_API std::enable_if_t< + __ESIMD_DNS::get_num_args() == 2 && + ext::oneapi::experimental::is_property_list_v && + detail::is_simd_view_type_v && + detail::is_simd_view_type_v, + simd> +atomic_update(T *p, OffsetSimdViewT byte_offset, SrcSimdViewT src0, + simd src1, PropertyListT props = {}) { + static_assert( + N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY() && + N == OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY(), + "Size of src0 and byte_offset parameters must correspond to the size of " + "src1 parameter."); + return atomic_update(p, byte_offset.read(), src0.read(), src1, + props); +} + +/// simd +/// atomic_update(T *p, OffsetSimdViewT byte_offset, +/// simd src0, SrcSimdViewT src1, +/// props = {}) +/// +/// @tparam Op The atomic operation - can be one of the following: +/// \c atomic_op::cmpxchg, \c atomic_op::fcmpxchg. +/// @param p The USM pointer. +/// @param byte_offset The vector of 32-bit or 64-bit offsets in bytes. +/// @param src0 The first additional argument (new value). +/// @param src1 The second additional argument (expected value). +/// @param props The parameter 'props' specifies the optional compile-time +/// properties list. Only L1/L2 properties are used. +// Other properties are ignored. +/// @return A vector of the old values at the memory locations before the +/// update. 
+template < + atomic_op Op, typename SrcSimdViewT, typename OffsetSimdViewT, typename T, + int N, + typename PropertyListT = ext::oneapi::experimental::empty_properties_t> +__ESIMD_API std::enable_if_t< + __ESIMD_DNS::get_num_args() == 2 && + ext::oneapi::experimental::is_property_list_v && + detail::is_simd_view_type_v && + detail::is_simd_view_type_v, + simd> +atomic_update(T *p, OffsetSimdViewT byte_offset, simd src0, + SrcSimdViewT src1, PropertyListT props = {}) { + static_assert( + N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY() && + N == OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY(), + "Size of src1 and byte_offset parameters must correspond to the size of " + "src0 parameter."); + return atomic_update(p, byte_offset.read(), src0, src1.read(), + props); +} + +/// simd +/// atomic_update(T *p, OffsetSimdViewT byte_offset, +/// SrcSimdViewT src0, SrcSimdViewT src1, +/// props = {}) +/// +/// @tparam Op The atomic operation - can be one of the following: +/// \c atomic_op::cmpxchg, \c atomic_op::fcmpxchg. +/// @param p The USM pointer. +/// @param byte_offset The vector of 32-bit or 64-bit offsets in bytes. +/// @param src0 The first additional argument (new value). +/// @param src1 The second additional argument (expected value). +/// @param props The parameter 'props' specifies the optional compile-time +/// properties list. Only L1/L2 properties are used. +// Other properties are ignored. +/// @return A vector of the old values at the memory locations before the +/// update. 
+template < + atomic_op Op, typename SrcSimdViewT, typename OffsetSimdViewT, typename T, + int N = SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(), + typename PropertyListT = ext::oneapi::experimental::empty_properties_t> +__ESIMD_API std::enable_if_t< + __ESIMD_DNS::get_num_args() == 2 && + ext::oneapi::experimental::is_property_list_v && + detail::is_simd_view_type_v && + detail::is_simd_view_type_v, + simd> +atomic_update(T *p, OffsetSimdViewT byte_offset, SrcSimdViewT src0, + SrcSimdViewT src1, PropertyListT props = {}) { + static_assert(N == OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY(), + "Size of src0, src1 and byte_offset parameters must be equal."); + return atomic_update(p, byte_offset.read(), src0.read(), + src1.read(), props); +} + /// A variation of \c atomic_update API with \c byte_offset represented as /// scalar. /// diff --git a/sycl/test/esimd/memory_properties_atomic_update.cpp b/sycl/test/esimd/memory_properties_atomic_update.cpp index aecf71759a3b3..403448e3677f0 100644 --- a/sycl/test/esimd/memory_properties_atomic_update.cpp +++ b/sycl/test/esimd/memory_properties_atomic_update.cpp @@ -87,17 +87,22 @@ test_atomic_update(AccType &acc, LocalAccTypeInt local_acc, float *ptrf, auto res_atomic_2 = atomic_update(ptr, offsets, props_a); - // CHECK: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}} i8 8, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> undef, <4 x i32> undef, i32 0, <4 x i32> undef) + // CHECK-COUNT-2: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}} i8 8, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> undef, <4 x i32> undef, i32 0, <4 x i32> undef) auto res_atomic_3 = atomic_update(ptr, offsets_view, pred, props_a); + res_atomic_3 = + atomic_update(ptr, offsets_view, pred, props_a); - // CHECK: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}} i8 8, 
i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> undef, <4 x i32> undef, i32 0, <4 x i32> undef) + // CHECK-COUNT-2: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}} i8 8, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> undef, <4 x i32> undef, i32 0, <4 x i32> undef) auto res_atomic_4 = atomic_update(ptr, offsets_view, props_a); + res_atomic_4 = atomic_update(ptr, offsets_view, props_a); - // CHECK: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}} i8 8, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> undef, <4 x i32> undef, i32 0, <4 x i32> undef) + // CHECK-COUNT-2: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}} i8 8, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> undef, <4 x i32> undef, i32 0, <4 x i32> undef) auto res_atomic_5 = atomic_update( ptr, offsets_view.select(), props_a); + res_atomic_5 = atomic_update( + ptr, offsets_view.select(), props_a); // atomic_upate without cache hints: // CHECK: call <4 x i32> @llvm.genx.svm.atomic.inc.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, <4 x i64> {{[^)]+}}, <4 x i32> undef) @@ -212,41 +217,59 @@ test_atomic_update(AccType &acc, LocalAccTypeInt local_acc, float *ptrf, auto res_atomic_1 = atomic_update(ptr, offsets, add, props_a); - // CHECK: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 12, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> undef, i32 0, <4 x i32> undef) + // CHECK-COUNT-2: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 12, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> undef, i32 0, <4 x i32> undef) auto res_atomic_2 = atomic_update( ptr, offsets, add_view, pred, props_a); + res_atomic_2 
= + atomic_update(ptr, offsets, add_view, pred, props_a); - // CHECK: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 12, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> undef, i32 0, <4 x i32> undef) + // CHECK-COUNT-2: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 12, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> undef, i32 0, <4 x i32> undef) auto res_atomic_3 = atomic_update(ptr, offsets, add_view, props_a); + res_atomic_3 = + atomic_update(ptr, offsets, add_view, props_a); - // CHECK: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 12, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> undef, i32 0, <4 x i32> undef) + // CHECK-COUNT-2: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 12, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> undef, i32 0, <4 x i32> undef) res_atomic_3 = atomic_update( ptr, offsets, add_view.select(), props_a); + res_atomic_3 = atomic_update( + ptr, offsets, add_view.select(), props_a); - // CHECK: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 12, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> undef, i32 0, <4 x i32> undef) + // CHECK-COUNT-2: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 12, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> undef, i32 0, <4 x i32> undef) auto res_atomic_4 = atomic_update( ptr, offsets_view, add, pred, props_a); + res_atomic_4 = + atomic_update(ptr, offsets_view, add, pred, props_a); - // CHECK: call <4 x i32> 
@llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 12, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> undef, i32 0, <4 x i32> undef) + // CHECK-COUNT-2: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 12, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> undef, i32 0, <4 x i32> undef) auto res_atomic_5 = atomic_update(ptr, offsets_view, add, props_a); + res_atomic_5 = + atomic_update(ptr, offsets_view, add, props_a); - // CHECK: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 12, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> undef, i32 0, <4 x i32> undef) + // CHECK-COUNT-2: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 12, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> undef, i32 0, <4 x i32> undef) res_atomic_5 = atomic_update( ptr, offsets_view.select(), add, props_a); + res_atomic_5 = atomic_update( + ptr, offsets_view.select(), add, props_a); - // CHECK: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 12, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> undef, i32 0, <4 x i32> undef) + // CHECK-COUNT-2: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 12, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> undef, i32 0, <4 x i32> undef) auto res_atomic_6 = atomic_update( ptr, offsets_view, add_view, pred, props_a); + res_atomic_6 = atomic_update(ptr, offsets_view, add_view, + pred, props_a); - // CHECK: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 12, i8 1, i8 3, i16 1, i32 0, i8 3, 
i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> undef, i32 0, <4 x i32> undef) + // CHECK-COUNT-2: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 12, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> undef, i32 0, <4 x i32> undef) auto res_atomic_7 = atomic_update( ptr, offsets_view, add_view, props_a); + res_atomic_7 = + atomic_update(ptr, offsets_view, add_view, props_a); - // CHECK: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 12, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> undef, i32 0, <4 x i32> undef) + // CHECK-COUNT-2: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 12, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> undef, i32 0, <4 x i32> undef) res_atomic_7 = atomic_update( ptr, offsets_view.select(), add_view.select(), props_a); + res_atomic_7 = atomic_update( + ptr, offsets_view.select(), add_view.select(), props_a); // atomic_update without cache hints: // CHECK: call <4 x i32> @llvm.genx.svm.atomic.add.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, <4 x i64> {{[^)]+}}, <4 x i32> undef) @@ -381,67 +404,97 @@ test_atomic_update(AccType &acc, LocalAccTypeInt local_acc, float *ptrf, auto res_atomic_2 = atomic_update( ptr, offsets, swap, compare, props_a); - // CHECK: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 18, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 0, <4 x i32> undef) + // CHECK-COUNT-2: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 18, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 0, <4 x i32> undef) auto res_atomic_3 = 
atomic_update( ptr, offsets, swap, compare_view, pred, props_a); + res_atomic_3 = atomic_update( + ptr, offsets, swap, compare_view, pred, props_a); - // CHECK: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 18, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 0, <4 x i32> undef) + // CHECK-COUNT-2: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 18, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 0, <4 x i32> undef) res_atomic_3 = atomic_update( ptr, offsets, swap, compare_view.select(), pred, props_a); + res_atomic_3 = atomic_update( + ptr, offsets, swap, compare_view.select(), pred, props_a); - // CHECK: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 18, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 0, <4 x i32> undef) + // CHECK-COUNT-2: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 18, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 0, <4 x i32> undef) auto res_atomic_4 = atomic_update( ptr, offsets, swap, compare_view, props_a); + res_atomic_4 = atomic_update(ptr, offsets, swap, + compare_view, props_a); - // CHECK: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 18, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 0, <4 x i32> undef) + // CHECK-COUNT-2: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 18, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 0, <4 x i32> undef) auto res_atomic_5 = atomic_update( ptr, 
offsets, swap_view, compare, pred, props_a); + res_atomic_5 = atomic_update(ptr, offsets, swap_view, + compare, pred, props_a); - // CHECK: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 18, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 0, <4 x i32> undef) + // CHECK-COUNT-2: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 18, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 0, <4 x i32> undef) auto res_atomic_6 = atomic_update( ptr, offsets, swap_view, compare, props_a); + res_atomic_6 = atomic_update(ptr, offsets, swap_view, + compare, props_a); - // CHECK: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 18, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 0, <4 x i32> undef) + // CHECK-COUNT-2: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 18, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 0, <4 x i32> undef) auto res_atomic_7 = atomic_update( ptr, offsets, swap_view, compare_view, pred, props_a); + res_atomic_7 = atomic_update( + ptr, offsets, swap_view, compare_view, pred, props_a); - // CHECK: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 18, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 0, <4 x i32> undef) + // CHECK-COUNT-2: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 18, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 0, <4 x i32> undef) auto res_atomic_8 = atomic_update( ptr, offsets, swap_view, 
compare_view, props_a); + res_atomic_8 = atomic_update(ptr, offsets, swap_view, + compare_view, props_a); - // CHECK: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 18, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 0, <4 x i32> undef) + // CHECK-COUNT-2: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 18, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 0, <4 x i32> undef) auto res_atomic_9 = atomic_update( ptr, offsets_view, swap, compare, pred, props_a); + res_atomic_9 = atomic_update(ptr, offsets_view, swap, + compare, pred, props_a); - // CHECK: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 18, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 0, <4 x i32> undef) + // CHECK-COUNT-2: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 18, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 0, <4 x i32> undef) auto res_atomic_10 = atomic_update( ptr, offsets_view, swap, compare, props_a); + res_atomic_10 = atomic_update(ptr, offsets_view, swap, + compare, props_a); - // CHECK: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 18, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 0, <4 x i32> undef) + // CHECK-COUNT-2: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 18, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 0, <4 x i32> undef) auto res_atomic_11 = atomic_update( ptr, offsets_view, swap, compare_view, pred, props_a); + 
res_atomic_11 = atomic_update( + ptr, offsets_view, swap, compare_view, pred, props_a); - // CHECK: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 18, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 0, <4 x i32> undef) + // CHECK-COUNT-2: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 18, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 0, <4 x i32> undef) auto res_atomic_12 = atomic_update( ptr, offsets_view, swap, compare_view, props_a); + res_atomic_12 = atomic_update(ptr, offsets_view, swap, + compare_view, props_a); - // CHECK: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 18, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 0, <4 x i32> undef) + // CHECK-COUNT-2: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 18, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 0, <4 x i32> undef) auto res_atomic_13 = atomic_update( ptr, offsets_view, swap_view, compare, pred, props_a); + res_atomic_13 = atomic_update( + ptr, offsets_view, swap_view, compare, pred, props_a); - // CHECK: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 18, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 0, <4 x i32> undef) + // CHECK-COUNT-2: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 18, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 0, <4 x i32> undef) auto res_atomic_14 = atomic_update( ptr, offsets_view, swap_view, compare, props_a); + 
res_atomic_14 = atomic_update( + ptr, offsets_view, swap_view, compare, props_a); - // CHECK: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 18, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 0, <4 x i32> undef) + // CHECK-COUNT-2: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 18, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 0, <4 x i32> undef) auto res_atomic_15 = atomic_update( ptr, offsets_view, swap_view, compare_view, pred, props_a); + res_atomic_15 = atomic_update( + ptr, offsets_view, swap_view, compare_view, pred, props_a); - // CHECK: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 18, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 0, <4 x i32> undef) + // CHECK-COUNT-2: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 18, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 0, <4 x i32> undef) auto res_atomic_16 = atomic_update( ptr, offsets_view, swap_view, compare_view, props_a); + res_atomic_16 = atomic_update( + ptr, offsets_view, swap_view, compare_view, props_a); - // CHECK-COUNT-13: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 18, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 0, <4 x i32> undef) + // CHECK-COUNT-26: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 18, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 0, <4 x i32> undef) res_atomic_4 = atomic_update( ptr, offsets, swap, 
compare_view.select(), props_a); @@ -489,6 +542,53 @@ test_atomic_update(AccType &acc, LocalAccTypeInt local_acc, float *ptrf, ptr, offsets_view.select(), swap_view.select(), compare_view.select(), props_a); + res_atomic_4 = atomic_update( + ptr, offsets, swap, compare_view.select(), props_a); + + res_atomic_5 = atomic_update( + ptr, offsets, swap_view.select(), compare, pred, props_a); + + res_atomic_6 = atomic_update( + ptr, offsets, swap_view.select(), compare, props_a); + + res_atomic_7 = atomic_update( + ptr, offsets, swap_view.select(), compare_view.select(), + pred, props_a); + + res_atomic_8 = atomic_update( + ptr, offsets, swap_view.select(), compare_view.select(), + props_a); + + res_atomic_9 = atomic_update( + ptr, offsets_view.select(), swap, compare, pred, props_a); + + res_atomic_10 = atomic_update( + ptr, offsets_view.select(), swap, compare, props_a); + + res_atomic_11 = atomic_update( + ptr, offsets_view.select(), swap, compare_view.select(), + pred, props_a); + + res_atomic_12 = atomic_update( + ptr, offsets_view.select(), swap, compare_view.select(), + props_a); + + res_atomic_13 = atomic_update( + ptr, offsets_view.select(), swap_view.select(), compare, + pred, props_a); + + res_atomic_14 = atomic_update( + ptr, offsets_view.select(), swap_view.select(), compare, + props_a); + + res_atomic_15 = atomic_update( + ptr, offsets_view.select(), swap_view.select(), + compare_view.select(), pred, props_a); + + res_atomic_16 = atomic_update( + ptr, offsets_view.select(), swap_view.select(), + compare_view.select(), props_a); + { constexpr int VL = 8; simd offsets = simd(1) * sizeof(int); From f8552d40912ef2c97b58763abac99ee4d474d3f0 Mon Sep 17 00:00:00 2001 From: Nick Sarnie Date: Thu, 6 Jun 2024 16:50:51 -0400 Subject: [PATCH 33/55] [SYCL][E2E] Disable memory_management_test3.cpp on Gen12 linux (#14087) See https://github.com/intel/llvm/issues/14086 Signed-off-by: Sarnie, Nick --- sycl/test-e2e/syclcompat/memory/memory_management_test3.cpp | 3 ++- 1 
file changed, 2 insertions(+), 1 deletion(-) diff --git a/sycl/test-e2e/syclcompat/memory/memory_management_test3.cpp b/sycl/test-e2e/syclcompat/memory/memory_management_test3.cpp index 24f56d3105ee5..fa0b0c1d1ed26 100644 --- a/sycl/test-e2e/syclcompat/memory/memory_management_test3.cpp +++ b/sycl/test-e2e/syclcompat/memory/memory_management_test3.cpp @@ -29,7 +29,8 @@ // // // ===----------------------------------------------------------------------===// - +// https://github.com/intel/llvm/issues/14086 +// UNSUPPORTED: gpu-intel-gen12 && linux // RUN: %clangxx -std=c++20 -fsycl -fsycl-targets=%{sycl_triple} %s -o %t.out // RUN: %{run} %t.out From d48c3712182acac0153375ddc32a56bed2840abf Mon Sep 17 00:00:00 2001 From: Ian Li <54701975+ianayl@users.noreply.github.com> Date: Thu, 6 Jun 2024 15:15:10 -0700 Subject: [PATCH 34/55] [SYCL][Matrix] Amend CODEOWNERS for check_device_code matrix tests (#14088) CODEOWNERS seems to be missing a line attributing `sycl/test/check_device_code/matrix` tests to intel/sycl-matrix-reviewers (As per [this discussion](https://github.com/intel/llvm/pull/14063#discussion_r1629880980)). This PR remedies this. Although, I noticed the current CODEOWNERS section for the matrix reviewers uses paths; Let me know if I should use `sycl/**/matrix` instead. 
--- .github/CODEOWNERS | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index f23064da26319..c33b839077872 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -126,6 +126,7 @@ sycl/test-e2e/KernelFusion @intel/dpcpp-kernel-fusion-reviewers sycl/include/sycl/ext/oneapi/matrix/ @intel/sycl-matrix-reviewers sycl/test-e2e/Matrix @intel/sycl-matrix-reviewers sycl/test/matrix @intel/sycl-matrix-reviewers +sycl/test/check_device_code/matrix @intel/sycl-matrix-reviewers # Native CPU llvm/**/*SYCLNativeCPU* @intel/dpcpp-nativecpu-pi-reviewers From e51a90ad03d44f27ffe043ada80a176fbe522b03 Mon Sep 17 00:00:00 2001 From: aelovikov-intel Date: Thu, 6 Jun 2024 15:34:35 -0700 Subject: [PATCH 35/55] [CI] Don't run E2E tests on self-hosted CUDA in Nightly (#14041) The runner seems to be broken, don't run the tests until it's fixed. --- .github/workflows/sycl-nightly.yml | 7 ------- 1 file changed, 7 deletions(-) diff --git a/.github/workflows/sycl-nightly.yml b/.github/workflows/sycl-nightly.yml index f5b3453e6db98..fc0b90be7990a 100644 --- a/.github/workflows/sycl-nightly.yml +++ b/.github/workflows/sycl-nightly.yml @@ -74,13 +74,6 @@ jobs: target_devices: opencl:cpu tests_selector: e2e - - name: Self-hosted CUDA - runner: '["Linux", "cuda"]' - image: ghcr.io/intel/llvm/ubuntu2204_build:latest - image_options: -u 1001 --gpus all --cap-add SYS_ADMIN - target_devices: ext_oneapi_cuda:gpu - tests_selector: e2e - - name: SYCL-CTS on OCL CPU runner: '["Linux", "gen12"]' image: ghcr.io/intel/llvm/ubuntu2204_intel_drivers:latest From b925bd8cadd3b87941496cfa2bdc1e76c0e97533 Mon Sep 17 00:00:00 2001 From: Udit Agarwal Date: Fri, 7 Jun 2024 00:15:59 -0700 Subject: [PATCH 36/55] [SYCL] Add `vec` support to math builtins (#14002) This PR 1. Updates extension to add math builtins for `vec` and corresponding swizzles. 2. Implements **unoptimized** support for `vec`/swizzles in math builtins and adds a test case for the same. 3. 
Adds a test to check the device code generated for `vec` math builtins. I will make a follow-up PR to optimize `vec` math builtins. I think we can use elementwise builtins for `ext_vector_type` (https://clang.llvm.org/docs/LanguageExtensions.html#vectors-and-extended-vectors) to optimize `vec` math builtins. The device code test case will help visualizing/reviewing math builtin optimizations. --- ...xt_oneapi_bfloat16_math_functions.asciidoc | 464 ++++++++++++++---- .../ext/oneapi/experimental/bfloat16_math.hpp | 128 +++++ .../BFloat16/bfloat16_vec_builtins.cpp | 278 +++++++++++ .../vector/vector_bf16_builtins.cpp | 377 ++++++++++++++ 4 files changed, 1150 insertions(+), 97 deletions(-) create mode 100644 sycl/test-e2e/BFloat16/bfloat16_vec_builtins.cpp create mode 100644 sycl/test/check_device_code/vector/vector_bf16_builtins.cpp diff --git a/sycl/doc/extensions/experimental/sycl_ext_oneapi_bfloat16_math_functions.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_oneapi_bfloat16_math_functions.asciidoc index 3261a94b17cdf..6359515a67b9d 100644 --- a/sycl/doc/extensions/experimental/sycl_ext_oneapi_bfloat16_math_functions.asciidoc +++ b/sycl/doc/extensions/experimental/sycl_ext_oneapi_bfloat16_math_functions.asciidoc @@ -103,286 +103,556 @@ then it supports the `bfloat16` math functions described in the next section. === Math Functions -The following functions are only available when `T` is `bfloat16` or -`sycl::marray`, where `{N}` means any positive value of -`size_t` type. - ==== isnan ```c++ namespace sycl::ext::oneapi::experimental { -bool isnan(bfloat16 x); +bool isnan(bfloat16 x); (1) -template -sycl::marray isnan(sycl::marray x); +template +/*return type*/ isnan(NonScalar x); (2) } // namespace sycl::ext::oneapi::experimental ``` ===== Description +====== Overload (1) + +Returns `true` if `x` is a NaN value, otherwise returns `false`. 
+ +====== Overload (2) + +*Constraints:* Available only if all of the following conditions are met: + + - `NonScalar` is `marray`, `vec`, or the `[code]#+__swizzled_vec__+#` type; and + - The element type is `bfloat16`. -Returns true if x is NAN value, otherwise returns false. +*Returns:* If `NonScalar` is `marray`, returns `true` for each element of `x` only if `x[i]` has a NaN value. If `NonScalar` is `vec` or the `[code]#+__swizzled_vec__+#` type, returns -1 for each element of `x` if `x[i]` is a NaN value and returns 0 otherwise. + +The return type depends on `NonScalar`. For `marray`, the return type is `marray` and for `vec`, `[code]#+__swizzled_vec__+#` type, the return type is `vec`. ==== fma ```c++ namespace sycl::ext::oneapi::experimental { -template -T fma(T a, T b, T c); +bfloat16 fma(bfloat16 a, bfloat16 b, bfloat16 c); (1) + +template (2) +/*return-type*/ fma(NonScalar1 a, NonScalar2 b, NonScalar3 c) } // namespace sycl::ext::oneapi::experimental ``` ===== Description -Returns the correctly rounded floating-point representation of the +====== Overload (1) + +*Returns:* Returns the correctly rounded floating-point representation of the sum of `c` with the infinitely precise product of `a` and `b`. Rounding of intermediate products shall not occur. The mantissa LSB rounds to the nearest even. Subnormal numbers are supported. +====== Overload (2) + +*Constraints:* Available only if all of the following conditions are met: + +* One of the following conditions must hold for `NonScalar1`, `NonScalar2`, and `NonScalar3`: +** `NonScalar1`, `NonScalar2`, and `NonScalar3` are each `marray`; or +** `NonScalar1`, `NonScalar2`, and `NonScalar3` are any combination of `vec` and the `[code]#+__swizzled_vec__+#` type; +* `NonScalar1`, `NonScalar2`, and `NonScalar3` have the same number of elements; +* `NonScalar1`, `NonScalar2`, and `NonScalar3` have the same element type; and +* The element type of `NonScalar1`, `NonScalar2`, and `NonScalar3` is `bfloat16`. 
+ +*Returns:* For each element of `a`, `b`, and `c`; the correctly rounded floating-point representation of the sum of `c[i]` with the infinitely precise product of `a[i]` and `b[i]`. Rounding of intermediate products shall not occur. Edge case behavior is per the IEEE 754-2008 standard. + +The return type is `NonScalar1` unless `NonScalar1` is the `[code]#+__swizzled_vec__+#` type, in which case the return type is the corresponding `vec`. + ==== fmax ```c++ namespace sycl::ext::oneapi::experimental { -template -T fmax(T x, T y); +bfloat16 fmax(bfloat16 x, bfloat16 y); (1) + +template (2) +/*return-type*/ fmax(NonScalar1 x, NonScalar2 y) + +template (3) +/*return-type*/ fmax(NonScalar x, bfloat16 y) } // namespace sycl::ext::oneapi::experimental ``` ===== Description -Returns `y` if -`x < y`, otherwise it -returns `x`. If one argument is a -NaN, `fmax()` returns the other -argument. If both arguments are -NaNs, `fmax()` returns a NaN. +====== Overload (1) + +Returns `y` if `x < y`, otherwise it returns `x`. If one argument is a NaN, `fmax()` returns the other +argument. If both arguments are NaNs, `fmax()` returns a NaN. + +====== Overload (2) + +*Constraints:* Available only if all of the following conditions are met: + +* One of the following conditions must hold for `NonScalar1` and `NonScalar2`: +** Both `NonScalar1` and `NonScalar2` are `marray`; or +** `NonScalar1` and `NonScalar2` are any combination of `vec` and the `[code]#+__swizzled_vec__+#` type; +* `NonScalar1` and `NonScalar2` have the same number of elements; +* `NonScalar1` and `NonScalar2` have the same element type; and +* The element type of `NonScalar1` and `NonScalar2` is bfloat16. + +*Returns:* For each element of `x` and `y`, the value `y[i]` if `x[i] < y[i]`, otherwise `x[i]`. If one element is a NaN, the result is the other element. If both elements are NaNs, the result is NaN. 
+ +The return type is `NonScalar1` unless `NonScalar1` is the `[code]#+__swizzled_vec__+#` type, in which case the return type is the corresponding `vec`. + +====== Overload (3) + +*Constraints:* Available only if all of the following conditions are met: + +* NonScalar is `marray`, `vec`, or the `[code]#+__swizzled_vec__+#` type; and +* The element type is bfloat16. + +*Returns:* For each element of `x`, the value `y` if `x[i] < y`, otherwise `x[i]`. If one value is a NaN, the result is the other value. If both value are NaNs, the result is a NaN. + +The return type is `NonScalar` unless `NonScalar` is the `[code]#+__swizzled_vec__+#` type, in which case the return type is the corresponding `vec`. ==== fmin ```c++ namespace sycl::ext::oneapi::experimental { -template -T fmin(T x, T y); +bfloat16 fmin(bfloat16 x, bfloat16 y); (1) + +template (2) +/*return-type*/ fmin(NonScalar1 x, NonScalar2 y) + +template (3) +/*return-type*/ fmin(NonScalar x, bfloat16 y) } // namespace sycl::ext::oneapi::experimental ``` ===== Description -Returns `y` if -`y < x`, otherwise it -returns `x`. If one argument is a -NaN, `fmax()` returns the other -argument. If both arguments are -NaNs, `fmax()` returns a NaN. +====== Overload (1) + +Returns `x` if `x < y`, otherwise it returns `y`. If one argument is a +NaN, `fmin()` returns the other argument. If both arguments are NaNs, `fmin()` returns a NaN. + +====== Overload (2) + +*Constraints:* Available only if all of the following conditions are met: + +* One of the following conditions must hold for `NonScalar1` and `NonScalar2`: +** Both `NonScalar1` and `NonScalar2` are `marray`; or +** `NonScalar1` and `NonScalar2` are any combination of `vec` and the `[code]#+__swizzled_vec__+#` type; +* `NonScalar1` and `NonScalar2` have the same number of elements; +* `NonScalar1` and `NonScalar2` have the same element type; and +* The element type of `NonScalar1` and `NonScalar2` is bfloat16. 
+
+*Returns:* For each element of `x` and `y`, the value `x[i]` if `x[i] < y[i]`, otherwise `y[i]`. If one element is a NaN, the result is the other element. If both elements are NaNs, the result is NaN.
+
+The return type is `NonScalar1` unless `NonScalar1` is the `[code]#+__swizzled_vec__+#` type, in which case the return type is the corresponding `vec`.
+
+====== Overload (3)
+
+*Constraints:* Available only if all of the following conditions are met:
+
+* NonScalar is `marray`, `vec`, or the `[code]#+__swizzled_vec__+#` type; and
+* The element type is bfloat16.
+
+*Returns:* For each element of `x`, the value `x[i]` if `x[i] < y`, otherwise `y`. If one value is a NaN, the result is the other value. If both values are NaNs, the result is a NaN.
+
+The return type is `NonScalar` unless `NonScalar` is the `[code]#+__swizzled_vec__+#` type, in which case the return type is the corresponding `vec`.
 
 ==== fabs
 
 ```c++
 namespace sycl::ext::oneapi::experimental {
 
-template <typename T>
-T fabs(T x);
+bfloat16 fabs(bfloat16 x); (1)
+
+template <typename NonScalar> (2)
+/*return-type*/ fabs(NonScalar x)
 
 } // namespace sycl::ext::oneapi::experimental
 ```
 
-===== Description
+===== Overload (1)
+
+Compute the absolute value of a scalar `bfloat16` value.
+
+====== Overload (2)
+
+*Constraints:* Available only if all of the following conditions are met:
+
+* `NonScalar` is `marray`, `vec`, or the `[code]#+__swizzled_vec__+#` type; and
+* The element type is `bfloat16`.
 
-Compute absolute value of a `bfloat16` value or `sycl::marray`.
+*Returns:* For each element of `x`, the absolute value of `x[i]`.
+
+The return type is `NonScalar` unless `NonScalar` is the `[code]#+__swizzled_vec__+#` type, in which case the return type is the corresponding `vec`.
==== ceil ```c++ namespace sycl::ext::oneapi::experimental { -template -T ceil(T x); +bfloat16 ceil(bfloat16 x); (1) + +template (2) +/*return-type*/ ceil(NonScalar x) } // namespace sycl::ext::oneapi::experimental ``` -===== Description +===== Overload (1) + +The value `x` rounded to an integral value using the round to positive infinity rounding mode. + +====== Overload (2) -Returns `x` rounded to an integral value using the round to positive infinity rounding mode +*Constraints:* Available only if all of the following conditions are met: + +* `NonScalar` is `marray`, `vec`, or the `[code]#+__swizzled_vec__+#` type; and +* The element type is `bfloat16`. + +*Returns:* For each element of `x`, the value `x[i]` rounded to an integral value using the round to positive infinity rounding mode. + +The return type is `NonScalar` unless `NonScalar` is the `[code]#+__swizzled_vec__+#` type, in which case the return type is the corresponding `vec`. ==== floor ```c++ namespace sycl::ext::oneapi::experimental { -template -T floor(T x); +bfloat16 floor(bfloat16 x); (1) + +template (2) +/*return-type*/ floor(NonScalar x) } // namespace sycl::ext::oneapi::experimental ``` -===== Description +===== Overload (1) + +The value `x` rounded to an integral value using the round to negative infinity rounding mode. + +====== Overload (2) + +*Constraints:* Available only if all of the following conditions are met: + +* `NonScalar` is `marray`, `vec`, or the `[code]#+__swizzled_vec__+#` type; and +* The element type is `bfloat16`. -Returns `x` rounded to an integral value using the round to negative infinity rounding mode -for a `bfloat16` value or `sycl::marray`. +*Returns:* For each element of `x`, the value `x[i]` rounded to an integral value using the round to negative infinity rounding mode. + +The return type is `NonScalar` unless `NonScalar` is the `[code]#+__swizzled_vec__+#` type, in which case the return type is the corresponding `vec`. 
==== cos ```c++ namespace sycl::ext::oneapi::experimental { -template -T cos(T x); +bfloat16 cos(bfloat16 x); (1) + +template (2) +/*return-type*/ cos(NonScalar x) } // namespace sycl::ext::oneapi::experimental ``` -===== Description +===== Overload (1) + +Returns the cosine of `x`. -Compute cosine of a `bfloat16` value or `sycl::marray`. +====== Overload (2) + +*Constraints:* Available only if all of the following conditions are met: + +* `NonScalar` is `marray`, `vec`, or the `[code]#+__swizzled_vec__+#` type; and +* The element type is `bfloat16`. + +*Returns:* For each element of `x`, the cosine of `x[i]`. + +The return type is `NonScalar` unless `NonScalar` is the `[code]#+__swizzled_vec__+#` type, in which case the return type is the corresponding `vec`. ==== sin ```c++ namespace sycl::ext::oneapi::experimental { -template -T sin(T x); +bfloat16 sin(bfloat16 x); (1) + +template (2) +/*return-type*/ sin(NonScalar x) } // namespace sycl::ext::oneapi::experimental ``` -===== Description +===== Overload (1) + +Returns the sine of `x`. + +====== Overload (2) + +*Constraints:* Available only if all of the following conditions are met: -Compute sine of a `bfloat16` value or `sycl::marray`. +* `NonScalar` is `marray`, `vec`, or the `[code]#+__swizzled_vec__+#` type; and +* The element type is `bfloat16`. +*Returns:* For each element of `x`, the sine of `x[i]`. + +The return type is `NonScalar` unless `NonScalar` is the `[code]#+__swizzled_vec__+#` type, in which case the return type is the corresponding `vec`. ==== exp ```c++ namespace sycl::ext::oneapi::experimental { -template -T exp(T x); +bfloat16 exp(bfloat16 x); (1) + +template (2) +/*return-type*/ exp(NonScalar x) } // namespace sycl::ext::oneapi::experimental ``` -===== Description +===== Overload (1) + +Returns the base-e exponential of `x`. + +====== Overload (2) -Compute the base-e exponential of a `bfloat16` value or `sycl::marray`. 
+*Constraints:* Available only if all of the following conditions are met: + +* `NonScalar` is `marray`, `vec`, or the `[code]#+__swizzled_vec__+#` type; and +* The element type is `bfloat16`. + +*Returns:* For each element of `x`, the base-e exponential of `x[i]`. + +The return type is `NonScalar` unless `NonScalar` is the `[code]#+__swizzled_vec__+#` type, in which case the return type is the corresponding `vec`. ==== exp2 ```c++ namespace sycl::ext::oneapi::experimental { -template -T exp2(T x); +bfloat16 exp2(bfloat16 x); (1) + +template (2) +/*return-type*/ exp2(NonScalar x) } // namespace sycl::ext::oneapi::experimental ``` -===== Description +===== Overload (1) + +Returns the base-2 exponential of `x`. + +====== Overload (2) + +*Constraints:* Available only if all of the following conditions are met: + +* `NonScalar` is `marray`, `vec`, or the `[code]#+__swizzled_vec__+#` type; and +* The element type is `bfloat16`. -Compute the base-2 exponential of a `bfloat16` value or `sycl::marray`. +*Returns:* For each element of `x`, the base-2 exponential of `x[i]`. + +The return type is `NonScalar` unless `NonScalar` is the `[code]#+__swizzled_vec__+#` type, in which case the return type is the corresponding `vec`. ==== exp10 ```c++ namespace sycl::ext::oneapi::experimental { -template -T exp10(T x); +bfloat16 exp10(bfloat16 x); (1) + +template (2) +/*return-type*/ exp10(NonScalar x) } // namespace sycl::ext::oneapi::experimental ``` -===== Description +===== Overload (1) + +Returns the base-10 exponential of `x`. -Compute the base-10 exponential of a `bfloat16` value or `sycl::marray`. +====== Overload (2) + +*Constraints:* Available only if all of the following conditions are met: + +* `NonScalar` is `marray`, `vec`, or the `[code]#+__swizzled_vec__+#` type; and +* The element type is `bfloat16`. + +*Returns:* For each element of `x`, the base-10 exponential of `x[i]`. 
+ +The return type is `NonScalar` unless `NonScalar` is the `[code]#+__swizzled_vec__+#` type, in which case the return type is the corresponding `vec`. ==== log ```c++ namespace sycl::ext::oneapi::experimental { -template -T log(T x); +bfloat16 log(bfloat16 x); (1) + +template (2) +/*return-type*/ log(NonScalar x) } // namespace sycl::ext::oneapi::experimental ``` -===== Description +===== Overload (1) + +Returns the natural logarithm of `x`. + +====== Overload (2) + +*Constraints:* Available only if all of the following conditions are met: -Compute natural logarithm of a `bfloat16` value or `sycl::marray`. +* `NonScalar` is `marray`, `vec`, or the `[code]#+__swizzled_vec__+#` type; and +* The element type is `bfloat16`. + +*Returns:* For each element of `x`, the natural logarithm of `x[i]`. + +The return type is `NonScalar` unless `NonScalar` is the `[code]#+__swizzled_vec__+#` type, in which case the return type is the corresponding `vec`. ==== log2 ```c++ namespace sycl::ext::oneapi::experimental { -template -T log2(T x); +bfloat16 log2(bfloat16 x); (1) + +template (2) +/*return-type*/ log2(NonScalar x) } // namespace sycl::ext::oneapi::experimental ``` -===== Description +===== Overload (1) + +Returns the base-2 logarithm of `x`. + +====== Overload (2) + +*Constraints:* Available only if all of the following conditions are met: + +* `NonScalar` is `marray`, `vec`, or the `[code]#+__swizzled_vec__+#` type; and +* The element type is `bfloat16`. + +*Returns:* For each element of `x`, the base-2 logarithm of `x[i]`. -Compute base-2 logarithm of a `bfloat16` value or `sycl::marray`. +The return type is `NonScalar` unless `NonScalar` is the `[code]#+__swizzled_vec__+#` type, in which case the return type is the corresponding `vec`. 
==== log10 ```c++ namespace sycl::ext::oneapi::experimental { -template -T log10(T x); +bfloat16 log10(bfloat16 x); (1) + +template (2) +/*return-type*/ log10(NonScalar x) } // namespace sycl::ext::oneapi::experimental ``` -===== Description +===== Overload (1) + +Returns the base-10 logarithm of `x`. + +====== Overload (2) + +*Constraints:* Available only if all of the following conditions are met: -Compute base-10 logarithm of a `bfloat16` value or `sycl::marray`. +* `NonScalar` is `marray`, `vec`, or the `[code]#+__swizzled_vec__+#` type; and +* The element type is `bfloat16`. + +*Returns:* For each element of `x`, the base-10 logarithm of `x[i]`. + +The return type is `NonScalar` unless `NonScalar` is the `[code]#+__swizzled_vec__+#` type, in which case the return type is the corresponding `vec`. ==== rint ```c++ namespace sycl::ext::oneapi::experimental { -template -T rint(T x); +bfloat16 rint(bfloat16 x); (1) + +template (2) +/*return-type*/ rint(NonScalar x) } // namespace sycl::ext::oneapi::experimental ``` -===== Description +===== Overload (1) -Returns `x` rounded to an integral value using the round to nearest even rounding mode -for a `bfloat16` value or `sycl::marray`. +Returns the value `x` rounded to an integral value (using round to nearest even rounding mode) in floating-point format. Refer to section 7.1 of the OpenCL 1.2 specification document: https://registry.khronos.org/SYCL/specs/sycl-2020/html/sycl-2020.html#opencl12 for a description of the rounding modes. + +====== Overload (2) + +*Constraints:* Available only if all of the following conditions are met: + +* `NonScalar` is `marray`, `vec`, or the `[code]#+__swizzled_vec__+#` type; and +* The element type is `bfloat16`. + +*Returns:* For each element of `x`, the value `x[i]` rounded to an integral value (using round to nearest even rounding mode) in floating-point format. 
+ +The return type is `NonScalar` unless `NonScalar` is the `[code]#+__swizzled_vec__+#` type, in which case the return type is the corresponding `vec`. ==== sqrt ```c++ namespace sycl::ext::oneapi::experimental { -template -T sqrt(T x); +bfloat16 sqrt(bfloat16 x); (1) + +template (2) +/*return-type*/ sqrt(NonScalar x) } // namespace sycl::ext::oneapi::experimental ``` -===== Description +===== Overload (1) + +Returns the square root of `x`. + +====== Overload (2) -Compute square root of a `bfloat16` value or `sycl::marray`. +*Constraints:* Available only if all of the following conditions are met: + +* `NonScalar` is `marray`, `vec`, or the `[code]#+__swizzled_vec__+#` type; and +* The element type is `bfloat16`. + +*Returns:* For each element of `x`, the square root of `x[i]`. + +The return type is `NonScalar` unless `NonScalar` is the `[code]#+__swizzled_vec__+#` type, in which case the return type is the corresponding `vec`. ==== rsqrt ```c++ namespace sycl::ext::oneapi::experimental { -template -T rsqrt(T x); +bfloat16 rsqrt(bfloat16 x); (1) + +template (2) +/*return-type*/ rsqrt(NonScalar x) } // namespace sycl::ext::oneapi::experimental ``` -===== Description +===== Overload (1) + +Returns the inverse square root of `x`. + +====== Overload (2) + +*Constraints:* Available only if all of the following conditions are met: + +* `NonScalar` is `marray`, `vec`, or the `[code]#+__swizzled_vec__+#` type; and +* The element type is `bfloat16`. -Compute inverse square root of a `bfloat16` value or `sycl::marray`. +*Returns:* For each element of `x`, the inverse square root of `x[i]`. + +The return type is `NonScalar` unless `NonScalar` is the `[code]#+__swizzled_vec__+#` type, in which case the return type is the corresponding `vec`. 
==== trunc ```c++ namespace sycl::ext::oneapi::experimental { -template -T trunc(T x); +bfloat16 trunc(bfloat16 x); (1) + +template (2) +/*return-type*/ trunc(NonScalar x) } // namespace sycl::ext::oneapi::experimental ``` -===== Description +===== Overload (1) -Returns `x` rounded to an integral value using the round to zero rounding mode -for a `bfloat16` value or `sycl::marray`. +Returns the value `x` rounded to an integral value using the round to zero rounding mode. -== Issues +====== Overload (2) + +*Constraints:* Available only if all of the following conditions are met: -1. The CUDA backend does not have a use case that would necessitate support -of the `vec` class in bfloat16 math functions, and `marray` would always be -preferred over `vec` if `vec` support were to be added in the CUDA backend. -For portability reasons, support for the `vec` class can be easily added if -other backends require it. +* `NonScalar` is `marray`, `vec`, or the `[code]#+__swizzled_vec__+#` type; and +* The element type is `bfloat16`. + +*Returns:* For each element of `x`, the value `x[i]` rounded to an integral value using the round to zero rounding mode. + +The return type is `NonScalar` unless `NonScalar` is the `[code]#+__swizzled_vec__+#` type, in which case the return type is the corresponding `vec`. + +== Issues -2. We should decide on a roadmap to extend support of `bfloat16` to other +1. We should decide on a roadmap to extend support of `bfloat16` to other SYCL 2020 math functions. 
diff --git a/sycl/include/sycl/ext/oneapi/experimental/bfloat16_math.hpp b/sycl/include/sycl/ext/oneapi/experimental/bfloat16_math.hpp index 2b611f46ddadd..fb4b49a44d4d3 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/bfloat16_math.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/bfloat16_math.hpp @@ -9,6 +9,7 @@ #pragma once #include // for ceil, cos, exp, exp10, exp2 +#include // For simplify_if_swizzle, is_swizzle #include // sycl::detail::memcpy #include // for bfloat16, bfloat16ToBits #include // for marray @@ -30,6 +31,17 @@ uint32_t to_uint32_t(sycl::marray x, size_t start) { } } // namespace detail +// Trait to check if the type is a vector or swizzle of bfloat16. +template +constexpr bool is_vec_or_swizzle_bf16_v = + sycl::detail::is_vec_or_swizzle_v && + sycl::detail::is_valid_elem_type_v; + +template +constexpr int num_elements_v = sycl::detail::num_elements::value; + +/******************* isnan ********************/ + // According to bfloat16 format, NAN value's exponent field is 0xFF and // significand has non-zero bits. template @@ -46,6 +58,21 @@ template sycl::marray isnan(sycl::marray x) { return res; } +// Overload for BF16 vec and swizzles. +template > +std::enable_if_t, sycl::vec> +isnan(T x) { + sycl::vec res; + for (size_t i = 0; i < N; i++) { + // The result of isnan is 0 or 1 but SPEC requires + // isnan() of vec/swizzle to return -1 or 0. + res[i] = isnan(x[i]) ? -1 : 0; + } + return res; +} + +/******************* fabs ********************/ + template std::enable_if_t, T> fabs(T x) { #if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) && \ @@ -89,6 +116,19 @@ sycl::marray fabs(sycl::marray x) { return res; } +// Overload for BF16 vec and swizzles. 
+template > +std::enable_if_t, sycl::vec> +fabs(T x) { + sycl::vec res; + for (size_t i = 0; i < N; i++) { + res[i] = fabs(x[i]); + } + return res; +} + +/******************* fmin ********************/ + template std::enable_if_t, T> fmin(T x, T y) { #if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) && \ @@ -146,6 +186,22 @@ sycl::marray fmin(sycl::marray x, return res; } +// Overload for different combination of BF16 vec and swizzles. +template , + int N2 = num_elements_v> +std::enable_if_t && is_vec_or_swizzle_bf16_v && + N1 == N2, + sycl::vec> +fmin(T1 x, T2 y) { + sycl::vec res; + for (size_t i = 0; i < N1; i++) { + res[i] = fmin(x[i], y[i]); + } + return res; +} + +/******************* fmax ********************/ + template std::enable_if_t, T> fmax(T x, T y) { #if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) && \ @@ -202,6 +258,22 @@ sycl::marray fmax(sycl::marray x, return res; } +// Overload for different combination of BF16 vec and swizzles. +template , + int N2 = num_elements_v> +std::enable_if_t && is_vec_or_swizzle_bf16_v && + N1 == N2, + sycl::vec> +fmax(T1 x, T2 y) { + sycl::vec res; + for (size_t i = 0; i < N1; i++) { + res[i] = fmax(x[i], y[i]); + } + return res; +} + +/******************* fma *********************/ + template std::enable_if_t, T> fma(T x, T y, T z) { #if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) && \ @@ -248,6 +320,22 @@ sycl::marray fma(sycl::marray x, return res; } +// Overload for different combination of BF16 vec and swizzles. 
+template , + int N2 = num_elements_v, int N3 = num_elements_v> +std::enable_if_t && is_vec_or_swizzle_bf16_v && + is_vec_or_swizzle_bf16_v && N1 == N2 && N2 == N3, + sycl::vec> +fma(T1 x, T2 y, T3 z) { + sycl::vec res; + for (size_t i = 0; i < N1; i++) { + res[i] = fma(x[i], y[i], z[i]); + } + return res; +} + +/******************* unary math operations ********************/ + #define BFLOAT16_MATH_FP32_WRAPPERS(op) \ template \ std::enable_if_t::value, T> op(T x) { \ @@ -264,37 +352,77 @@ sycl::marray fma(sycl::marray x, return res; \ } +#define BFLOAT16_MATH_FP32_WRAPPERS_VEC(op) \ + /* Overload for BF16 vec and swizzles. */ \ + template > \ + std::enable_if_t, sycl::vec> op( \ + T x) { \ + sycl::vec res; \ + for (size_t i = 0; i < N; i++) { \ + res[i] = op(x[i]); \ + } \ + return res; \ + } + BFLOAT16_MATH_FP32_WRAPPERS(ceil) BFLOAT16_MATH_FP32_WRAPPERS_MARRAY(ceil) +BFLOAT16_MATH_FP32_WRAPPERS_VEC(ceil) + BFLOAT16_MATH_FP32_WRAPPERS(cos) BFLOAT16_MATH_FP32_WRAPPERS_MARRAY(cos) +BFLOAT16_MATH_FP32_WRAPPERS_VEC(cos) + BFLOAT16_MATH_FP32_WRAPPERS(exp) BFLOAT16_MATH_FP32_WRAPPERS_MARRAY(exp) +BFLOAT16_MATH_FP32_WRAPPERS_VEC(exp) + BFLOAT16_MATH_FP32_WRAPPERS(exp10) BFLOAT16_MATH_FP32_WRAPPERS_MARRAY(exp10) +BFLOAT16_MATH_FP32_WRAPPERS_VEC(exp10) + BFLOAT16_MATH_FP32_WRAPPERS(exp2) BFLOAT16_MATH_FP32_WRAPPERS_MARRAY(exp2) +BFLOAT16_MATH_FP32_WRAPPERS_VEC(exp2) + BFLOAT16_MATH_FP32_WRAPPERS(floor) BFLOAT16_MATH_FP32_WRAPPERS_MARRAY(floor) +BFLOAT16_MATH_FP32_WRAPPERS_VEC(floor) + BFLOAT16_MATH_FP32_WRAPPERS(log) BFLOAT16_MATH_FP32_WRAPPERS_MARRAY(log) +BFLOAT16_MATH_FP32_WRAPPERS_VEC(log) + BFLOAT16_MATH_FP32_WRAPPERS(log2) BFLOAT16_MATH_FP32_WRAPPERS_MARRAY(log2) +BFLOAT16_MATH_FP32_WRAPPERS_VEC(log2) + BFLOAT16_MATH_FP32_WRAPPERS(log10) BFLOAT16_MATH_FP32_WRAPPERS_MARRAY(log10) +BFLOAT16_MATH_FP32_WRAPPERS_VEC(log10) + BFLOAT16_MATH_FP32_WRAPPERS(rint) BFLOAT16_MATH_FP32_WRAPPERS_MARRAY(rint) +BFLOAT16_MATH_FP32_WRAPPERS_VEC(rint) + 
BFLOAT16_MATH_FP32_WRAPPERS(rsqrt) BFLOAT16_MATH_FP32_WRAPPERS_MARRAY(rsqrt) +BFLOAT16_MATH_FP32_WRAPPERS_VEC(rsqrt) + BFLOAT16_MATH_FP32_WRAPPERS(sin) BFLOAT16_MATH_FP32_WRAPPERS_MARRAY(sin) +BFLOAT16_MATH_FP32_WRAPPERS_VEC(sin) + BFLOAT16_MATH_FP32_WRAPPERS(sqrt) BFLOAT16_MATH_FP32_WRAPPERS_MARRAY(sqrt) +BFLOAT16_MATH_FP32_WRAPPERS_VEC(sqrt) + BFLOAT16_MATH_FP32_WRAPPERS(trunc) BFLOAT16_MATH_FP32_WRAPPERS_MARRAY(trunc) +BFLOAT16_MATH_FP32_WRAPPERS_VEC(trunc) #undef BFLOAT16_MATH_FP32_WRAPPERS #undef BFLOAT16_MATH_FP32_WRAPPERS_MARRAY +#undef BFLOAT16_MATH_FP32_WRAPPERS_VEC } // namespace ext::oneapi::experimental } // namespace _V1 } // namespace sycl diff --git a/sycl/test-e2e/BFloat16/bfloat16_vec_builtins.cpp b/sycl/test-e2e/BFloat16/bfloat16_vec_builtins.cpp new file mode 100644 index 0000000000000..481aa35e3cedf --- /dev/null +++ b/sycl/test-e2e/BFloat16/bfloat16_vec_builtins.cpp @@ -0,0 +1,278 @@ +// RUN: %{build} -fno-fast-math -o %t.out +// RUN: %{run} %t.out + +// Test new, ABI-breaking for all platforms. +// RUN: %if preview-breaking-changes-supported %{ %{build} -fpreview-breaking-changes -o %t-pfrev.out %} +// RUN: %if preview-breaking-changes-supported %{ %{run} %t-pfrev.out %} + +#include +#include + +#include +#include +#include + +using namespace sycl; +using namespace sycl::ext::oneapi; +using namespace sycl::ext::oneapi::experimental; + +constexpr float bf16_eps = 0.00390625; + +bool check(float a, float b) { + return sycl::fabs(2 * (a - b) / (a + b)) > bf16_eps * 2; +} + +bool check(bool a, bool b) { return (a != b); } + +#define TEST_UNARY_OP(NAME, SZ, RETTY, INPVAL) \ + { \ + vec arg; \ + /* Initialize the vector with INPVAL */ \ + for (int i = 0; i < SZ; i++) { \ + arg[i] = INPVAL; \ + } \ + /* Perform the operation. */ \ + vec \ + res = sycl::ext::oneapi::experimental::NAME(arg); \ + vec res2 = \ + sycl::ext::oneapi::experimental::NAME(arg.template swizzle<0, 0>()); \ + /* Check the result. 
*/ \ + if (res2[0] != res[0] || res2[1] != res[0]) { \ + ERR[0] += 1; \ + } \ + for (int i = 0; i < SZ; i++) { \ + if (check(res[i], sycl::NAME(INPVAL))) { \ + ERR[0] += 1; \ + } \ + } \ + } + +#define TEST_BINARY_OP(NAME, SZ, RETTY, INPVAL) \ + { \ + vec arg, arg2; \ + bfloat16 inpVal2 = 1.0f; \ + /* Initialize the vector with INPVAL */ \ + for (int i = 0; i < SZ; i++) { \ + arg[i] = INPVAL; \ + arg2[i] = inpVal2; \ + } \ + /* Perform the operation. */ \ + vec \ + res = sycl::ext::oneapi::experimental::NAME(arg, arg2); \ + /* Swizzle and vec different combination. */ \ + vec res2 = sycl::ext::oneapi::experimental::NAME( \ + arg.template swizzle<0, 0>(), arg2.template swizzle<0, 0>()); \ + vec res3 = sycl::ext::oneapi::experimental::NAME( \ + vec(arg[0], arg[0]), arg2.template swizzle<0, 0>()); \ + vec res4 = sycl::ext::oneapi::experimental::NAME( \ + arg.template swizzle<0, 0>(), vec(arg2[0], arg2[0])); \ + /* Check the result. */ \ + if (res2[0] != res[0] || res2[1] != res[0] || res3[0] != res[0] || \ + res3[1] != res[0] || res4[0] != res[0] || res4[1] != res[0]) { \ + ERR[0] += 1; \ + } \ + for (int i = 0; i < SZ; i++) { \ + if (check(res[i], sycl::NAME(INPVAL, inpVal2))) { \ + ERR[0] += 1; \ + } \ + } \ + } + +#define TEST_BUILTIN_VEC(NAME, SZ, RETTY, INPVAL, OPTEST) \ + { /* On Device */ \ + buffer err_buf(&err, 1); \ + q.submit([&](handler &cgh) { \ + accessor ERR(err_buf, \ + cgh); \ + cgh.single_task([=]() { OPTEST(NAME, SZ, RETTY, INPVAL) }); \ + }).wait(); \ + } \ + assert(err == 0); \ + { /* On Host */ \ + int ERR[1] = {0}; \ + OPTEST(NAME, SZ, RETTY, INPVAL) \ + assert(ERR[0] == 0); \ + } + +#define TEST_BUILTIN_UNARY(NAME, RETTY, INPVAL) \ + TEST_BUILTIN_VEC(NAME, 1, RETTY, INPVAL, TEST_UNARY_OP) \ + TEST_BUILTIN_VEC(NAME, 2, RETTY, INPVAL, TEST_UNARY_OP) \ + TEST_BUILTIN_VEC(NAME, 3, RETTY, INPVAL, TEST_UNARY_OP) \ + TEST_BUILTIN_VEC(NAME, 4, RETTY, INPVAL, TEST_UNARY_OP) \ + TEST_BUILTIN_VEC(NAME, 8, RETTY, INPVAL, TEST_UNARY_OP) \ + 
TEST_BUILTIN_VEC(NAME, 16, RETTY, INPVAL, TEST_UNARY_OP) + +#define TEST_BUILTIN_BINARY(NAME, RETTY, INPVAL) \ + TEST_BUILTIN_VEC(NAME, 1, RETTY, INPVAL, TEST_BINARY_OP) \ + TEST_BUILTIN_VEC(NAME, 2, RETTY, INPVAL, TEST_BINARY_OP) \ + TEST_BUILTIN_VEC(NAME, 3, RETTY, INPVAL, TEST_BINARY_OP) \ + TEST_BUILTIN_VEC(NAME, 4, RETTY, INPVAL, TEST_BINARY_OP) \ + TEST_BUILTIN_VEC(NAME, 8, RETTY, INPVAL, TEST_BINARY_OP) \ + TEST_BUILTIN_VEC(NAME, 16, RETTY, INPVAL, TEST_BINARY_OP) + +void test() { + queue q; + int err = 0; + float nan = std::nanf(""); + + // Test isnan on host + { + vec arg{1.0f, nan, 2.0f}; + vec res = sycl::ext::oneapi::experimental::isnan(arg); + assert((res[0] == 0 && res[1] == -1 && res[2] == 0) && + "isnan() failed on host for vec"); + + // Test for swizzles + vec res2 = sycl::ext::oneapi::experimental::isnan(arg.lo()); + assert((res2[0] == 0 && res2[1] == -1) && + "isnan() failed on host for vec swizzles"); + } + + // Tets isnan on device. + { + buffer err_buf(&err, 1); + q.submit([&](handler &cgh) { + accessor ERR(err_buf, cgh); + cgh.single_task([=]() { + vec arg{1.0f, nan, 2.0f}; + vec res = sycl::ext::oneapi::experimental::isnan(arg); + if (res[0] != 0 || res[1] != -1 || res[2] != 0) { + ERR[0] += 1; + } + }); + }).wait(); + assert(err == 0 && "isnan failed on device for vec"); + } + + // Unary math builtins. 
+ TEST_BUILTIN_UNARY(fabs, bfloat16, -1.0f); + TEST_BUILTIN_UNARY(fabs, bfloat16, 1.0f); + + TEST_BUILTIN_UNARY(cos, bfloat16, 0.1f); + TEST_BUILTIN_UNARY(sin, bfloat16, 0.2f); + + TEST_BUILTIN_UNARY(ceil, bfloat16, 0.9f); + TEST_BUILTIN_UNARY(floor, bfloat16, 0.9f); + TEST_BUILTIN_UNARY(trunc, bfloat16, 0.9f); + TEST_BUILTIN_UNARY(exp, bfloat16, 0.9f); + TEST_BUILTIN_UNARY(exp10, bfloat16, 0.9f); + TEST_BUILTIN_UNARY(exp2, bfloat16, 0.9f); + TEST_BUILTIN_UNARY(rint, bfloat16, 0.9f); + + TEST_BUILTIN_UNARY(sqrt, bfloat16, 0.9f); + TEST_BUILTIN_UNARY(rsqrt, bfloat16, 0.9f); + TEST_BUILTIN_UNARY(log, bfloat16, 20.0f); + TEST_BUILTIN_UNARY(log2, bfloat16, 2.0f); + TEST_BUILTIN_UNARY(log10, bfloat16, 2.0f); + + TEST_BUILTIN_BINARY(fmin, bfloat16, 0.9f); + TEST_BUILTIN_BINARY(fmax, bfloat16, 0.9f); + TEST_BUILTIN_BINARY(fmin, bfloat16, nan); + TEST_BUILTIN_BINARY(fmax, bfloat16, nan); + + // Test fma operation on host. + { + vec arg1, arg2, arg3; + bfloat16 inpVal1 = 1.0f; + bfloat16 inpVal2 = 2.0f; + bfloat16 inpVal3 = 3.0f; + /* Initialize the vector with INPVAL */ + for (int i = 0; i < 3; i++) { + arg1[i] = inpVal1; + arg2[i] = inpVal2; + arg3[i] = inpVal3; + } + /* Perform the operation. */ + auto res = sycl::ext::oneapi::experimental::fma(arg1, arg2, arg3); + + // Test different combination of vec an swizzle. + auto res1 = sycl::ext::oneapi::experimental::fma( + arg1.template swizzle<0, 0>(), arg2.template swizzle<0, 0>(), + arg3.template swizzle<0, 0>()); + + auto res2 = sycl::ext::oneapi::experimental::fma( + vec(arg1[0], arg1[0]), arg2.template swizzle<0, 0>(), + arg3.template swizzle<0, 0>()); + + auto res3 = sycl::ext::oneapi::experimental::fma( + arg1.template swizzle<0, 0>(), vec(arg2[0], arg2[0]), + arg3.template swizzle<0, 0>()); + + auto res4 = sycl::ext::oneapi::experimental::fma( + arg1.template swizzle<0, 0>(), arg2.template swizzle<0, 0>(), + vec(arg3[0], arg3[0])); + + /* Check the result. 
*/ + if (res1[0] != res[0] || res1[1] != res[0] || res2[0] != res[0] || + res2[1] != res[0] || res3[0] != res[0] || res3[1] != res[0] || + res4[0] != res[0] || res4[1] != res[0]) { + err += 1; + } + for (int i = 0; i < 3; i++) { + if (check(res[i], sycl::ext::oneapi::experimental::fma(inpVal1, inpVal2, + inpVal3))) { + err += 1; + } + } + assert(err == 0); + } + + // Test fma on device. + { + buffer err_buf(&err, 1); + q.submit([&](handler &cgh) { + accessor ERR(err_buf, cgh); + cgh.single_task([=]() { + vec arg1, arg2, arg3; + bfloat16 inpVal1 = 1.0f; + bfloat16 inpVal2 = 2.0f; + bfloat16 inpVal3 = 3.0f; + /* Initialize the vector with INPVAL */ + for (int i = 0; i < 3; i++) { + arg1[i] = inpVal1; + arg2[i] = inpVal2; + arg3[i] = inpVal3; + } + /* Perform the operation. */ + auto res = sycl::ext::oneapi::experimental::fma(arg1, arg2, arg3); + + // Test different combination of vec an swizzle. + auto res1 = sycl::ext::oneapi::experimental::fma( + arg1.template swizzle<0, 0>(), arg2.template swizzle<0, 0>(), + arg3.template swizzle<0, 0>()); + + auto res2 = sycl::ext::oneapi::experimental::fma( + vec(arg1[0], arg1[0]), arg2.template swizzle<0, 0>(), + arg3.template swizzle<0, 0>()); + + auto res3 = sycl::ext::oneapi::experimental::fma( + arg1.template swizzle<0, 0>(), vec(arg2[0], arg2[0]), + arg3.template swizzle<0, 0>()); + + auto res4 = sycl::ext::oneapi::experimental::fma( + arg1.template swizzle<0, 0>(), arg2.template swizzle<0, 0>(), + vec(arg3[0], arg3[0])); + + /* Check the result. 
*/ + if (res1[0] != res[0] || res1[1] != res[0] || res2[0] != res[0] || + res2[1] != res[0] || res3[0] != res[0] || res3[1] != res[0] || + res4[0] != res[0] || res4[1] != res[0]) { + ERR[0] += 1; + } + for (int i = 0; i < 3; i++) { + if (check(res[i], sycl::ext::oneapi::experimental::fma( + inpVal1, inpVal2, inpVal3))) { + ERR[0] += 1; + } + } + }); + }).wait(); + assert(err == 0); + } +} + +int main() { + + test(); + return 0; +} diff --git a/sycl/test/check_device_code/vector/vector_bf16_builtins.cpp b/sycl/test/check_device_code/vector/vector_bf16_builtins.cpp new file mode 100644 index 0000000000000..6aea590b6155c --- /dev/null +++ b/sycl/test/check_device_code/vector/vector_bf16_builtins.cpp @@ -0,0 +1,377 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// NOTE: ..., followed by some manual cleanup. + +// Had to increase inline threashold for this test to force inline of the vec<> +// math builtins. +// RUN: %clangxx -I %sycl_include -mllvm -inline-threshold=400 -fno-discard-value-names -S -emit-llvm -fno-sycl-instrument-device-code -Xclang -disable-lifetime-markers -O3 -fsycl-device-only %s -o - | FileCheck %s + +// This test checks the device code generated for vec math builtins. 
+#include +#include + +using namespace sycl; +using namespace sycl::ext::oneapi; +using namespace sycl::ext::oneapi::experimental; + +// CHECK-LABEL: define dso_local spir_func void @_Z8TestFMinN4sycl3_V13vecINS0_3ext6oneapi8bfloat16ELi2EEES5_( +// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias writable sret(%"class.sycl::_V1::vec") align 4 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec") align 4 [[A:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec") align 4 [[B:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] !srcloc [[META5:![0-9]+]] !sycl_fixed_targets [[META6:![0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[AGG_TMP111_I:%.*]] = alloca %"class.sycl::_V1::ext::oneapi::bfloat16", align 8 +// CHECK-NEXT: [[AGG_TMP10_I:%.*]] = alloca %"class.sycl::_V1::ext::oneapi::bfloat16", align 8 +// CHECK-NEXT: [[AGG_TMP13:%.*]] = alloca %"class.sycl::_V1::vec", align 8 +// CHECK-NEXT: [[AGG_TMP2:%.*]] = alloca %"class.sycl::_V1::vec", align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A]], align 4, !tbaa [[TBAA7:![0-9]+]] +// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[B]], align 4, !tbaa [[TBAA7]] +// CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr nonnull [[AGG_TMP2]]) +// CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr nonnull [[AGG_TMP13]]) +// CHECK-NEXT: store i32 [[TMP1]], ptr [[AGG_TMP13]], align 1 +// CHECK-NEXT: store i32 [[TMP0]], ptr [[AGG_TMP2]], align 1 +// CHECK-NEXT: [[X_ASCAST_I:%.*]] = addrspacecast ptr [[AGG_TMP2]] to ptr addrspace(4) +// CHECK-NEXT: [[Y_ASCAST_I:%.*]] = addrspacecast ptr [[AGG_TMP13]] to ptr addrspace(4) +// CHECK-NEXT: [[X_ASCAST_I_I:%.*]] = addrspacecast ptr [[AGG_TMP10_I]] to ptr addrspace(4) +// CHECK-NEXT: [[Y_ASCAST_I_I:%.*]] = addrspacecast ptr [[AGG_TMP111_I]] to ptr addrspace(4) +// CHECK-NEXT: br label [[FOR_COND_I:%.*]] +// CHECK: for.cond.i: +// CHECK-NEXT: [[I_0_I:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INC_I:%.*]], 
[[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL4FMININS2_8BFLOAT16EEENST9ENABLE_IFIXSR3STDE9IS_SAME_VIT_S5_EES7_E4TYPEES7_S7__EXIT_I:%.*]] ] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp ult i64 [[I_0_I]], 2 +// CHECK-NEXT: br i1 [[CMP_I]], label [[FOR_BODY_I:%.*]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL4FMININS0_3VECINS2_8BFLOAT16ELI2EEES7_LI2ELI2EEENST9ENABLE_IFIXAAAA24IS_VEC_OR_SWIZZLE_BF16_VIT_E24IS_VEC_OR_SWIZZLE_BF16_VIT0_EEQT1_T2_ENS5_IS6_XT1_EEEE4TYPEES9_SA__EXIT:%.*]] +// CHECK: for.body.i: +// CHECK-NEXT: [[CONV_I:%.*]] = trunc nuw nsw i64 [[I_0_I]] to i32 +// CHECK-NEXT: [[CALL_I:%.*]] = call spir_func noundef align 2 dereferenceable(2) ptr addrspace(4) @_ZN4sycl3_V13vecINS0_3ext6oneapi8bfloat16ELi2EEixIS4_EENSt9enable_ifIXsr3stdE9is_same_vIT_S4_EERS4_E4typeEi(ptr addrspace(4) noundef align 4 dereferenceable_or_null(4) [[X_ASCAST_I]], i32 noundef [[CONV_I]]) #[[ATTR7:[0-9]+]] +// CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr addrspace(4) [[CALL_I]], align 2, !tbaa [[TBAA10:![0-9]+]] +// CHECK-NEXT: [[CALL3_I:%.*]] = call spir_func noundef align 2 dereferenceable(2) ptr addrspace(4) @_ZN4sycl3_V13vecINS0_3ext6oneapi8bfloat16ELi2EEixIS4_EENSt9enable_ifIXsr3stdE9is_same_vIT_S4_EERS4_E4typeEi(ptr addrspace(4) noundef align 4 dereferenceable_or_null(4) [[Y_ASCAST_I]], i32 noundef [[CONV_I]]) #[[ATTR7]] +// CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr addrspace(4) [[CALL3_I]], align 2, !tbaa [[TBAA10]] +// CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 2, ptr nonnull [[AGG_TMP10_I]]), !noalias [[META12:![0-9]+]] +// CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 2, ptr nonnull [[AGG_TMP111_I]]), !noalias [[META12]] +// CHECK-NEXT: store i16 [[TMP3]], ptr [[AGG_TMP111_I]], align 1, !noalias [[META12]] +// CHECK-NEXT: store i16 [[TMP2]], ptr [[AGG_TMP10_I]], align 1, !noalias [[META12]] +// CHECK-NEXT: [[CONV_I_I_I:%.*]] = zext i16 [[TMP2]] to i32 +// CHECK-NEXT: [[AND_I_I_I:%.*]] = and i32 [[CONV_I_I_I]], 32640 +// CHECK-NEXT: [[CMP_I_I_I:%.*]] = icmp eq i32 [[AND_I_I_I]], 
32640 +// CHECK-NEXT: [[AND2_I_I_I:%.*]] = and i32 [[CONV_I_I_I]], 127 +// CHECK-NEXT: [[TOBOOL_I_I_I:%.*]] = icmp ne i32 [[AND2_I_I_I]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = and i1 [[CMP_I_I_I]], [[TOBOOL_I_I_I]] +// CHECK-NEXT: br i1 [[TMP4]], label [[LAND_LHS_TRUE_I_I:%.*]], label [[IF_END6_I_I:%.*]] +// CHECK: land.lhs.true.i.i: +// CHECK-NEXT: [[CONV_I25_I_I:%.*]] = zext i16 [[TMP3]] to i32 +// CHECK-NEXT: [[AND_I26_I_I:%.*]] = and i32 [[CONV_I25_I_I]], 32640 +// CHECK-NEXT: [[CMP_I27_I_I:%.*]] = icmp eq i32 [[AND_I26_I_I]], 32640 +// CHECK-NEXT: [[AND2_I28_I_I:%.*]] = and i32 [[CONV_I25_I_I]], 127 +// CHECK-NEXT: [[TOBOOL_I29_I_I:%.*]] = icmp ne i32 [[AND2_I28_I_I]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = and i1 [[CMP_I27_I_I]], [[TOBOOL_I29_I_I]] +// CHECK-NEXT: [[SPEC_SELECT_I:%.*]] = select i1 [[TMP5]], i16 32704, i16 [[TMP3]] +// CHECK-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL4FMININS2_8BFLOAT16EEENST9ENABLE_IFIXSR3STDE9IS_SAME_VIT_S5_EES7_E4TYPEES7_S7__EXIT_I]] +// CHECK: if.end6.i.i: +// CHECK-NEXT: [[CONV_I39_I_I:%.*]] = zext i16 [[TMP3]] to i32 +// CHECK-NEXT: [[AND_I40_I_I:%.*]] = and i32 [[CONV_I39_I_I]], 32640 +// CHECK-NEXT: [[CMP_I41_I_I:%.*]] = icmp eq i32 [[AND_I40_I_I]], 32640 +// CHECK-NEXT: [[AND2_I42_I_I:%.*]] = and i32 [[CONV_I39_I_I]], 127 +// CHECK-NEXT: [[TOBOOL_I43_I_I:%.*]] = icmp ne i32 [[AND2_I42_I_I]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = and i1 [[CMP_I41_I_I]], [[TOBOOL_I43_I_I]] +// CHECK-NEXT: br i1 [[TMP6]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL4FMININS2_8BFLOAT16EEENST9ENABLE_IFIXSR3STDE9IS_SAME_VIT_S5_EES7_E4TYPEES7_S7__EXIT_I]], label [[IF_END10_I_I:%.*]] +// CHECK: if.end10.i.i: +// CHECK-NEXT: [[OR_I_I:%.*]] = or i32 [[CONV_I_I_I]], [[CONV_I39_I_I]] +// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp eq i32 [[OR_I_I]], 32768 +// CHECK-NEXT: [[AND_I_I:%.*]] = and i32 [[CONV_I_I_I]], [[CONV_I39_I_I]] +// CHECK-NEXT: [[TOBOOL_NOT_I_I:%.*]] = icmp eq i32 [[AND_I_I]], 0 +// CHECK-NEXT: [[OR_COND_I_I:%.*]] = and i1 [[CMP_I_I]], 
[[TOBOOL_NOT_I_I]] +// CHECK-NEXT: br i1 [[OR_COND_I_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL4FMININS2_8BFLOAT16EEENST9ENABLE_IFIXSR3STDE9IS_SAME_VIT_S5_EES7_E4TYPEES7_S7__EXIT_I]], label [[IF_END18_I_I:%.*]] +// CHECK: if.end18.i.i: +// CHECK-NEXT: [[CALL_I_I_I_I_I:%.*]] = call spir_func noundef float @__devicelib_ConvertBF16ToFINTEL(ptr addrspace(4) noundef align 2 dereferenceable(2) [[X_ASCAST_I_I]]) #[[ATTR8:[0-9]+]], !noalias [[META15:![0-9]+]] +// CHECK-NEXT: [[CALL_I_I2_I_I_I:%.*]] = call spir_func noundef float @__devicelib_ConvertBF16ToFINTEL(ptr addrspace(4) noundef align 2 dereferenceable(2) [[Y_ASCAST_I_I]]) #[[ATTR8]], !noalias [[META15]] +// CHECK-NEXT: [[CMP_I44_I_I:%.*]] = fcmp olt float [[CALL_I_I_I_I_I]], [[CALL_I_I2_I_I_I]] +// CHECK-NEXT: [[X_ASCAST_VAL_I_I:%.*]] = load i16, ptr [[AGG_TMP10_I]], align 2, !noalias [[META18:![0-9]+]] +// CHECK-NEXT: [[Y_ASCAST_VAL_I_I:%.*]] = load i16, ptr [[AGG_TMP111_I]], align 2, !noalias [[META18]] +// CHECK-NEXT: [[TMP7:%.*]] = select i1 [[CMP_I44_I_I]], i16 [[X_ASCAST_VAL_I_I]], i16 [[Y_ASCAST_VAL_I_I]] +// CHECK-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL4FMININS2_8BFLOAT16EEENST9ENABLE_IFIXSR3STDE9IS_SAME_VIT_S5_EES7_E4TYPEES7_S7__EXIT_I]] +// CHECK: _ZN4sycl3_V13ext6oneapi12experimental4fminINS2_8bfloat16EEENSt9enable_ifIXsr3stdE9is_same_vIT_S5_EES7_E4typeES7_S7_.exit.i: +// CHECK-NEXT: [[REF_TMP_SROA_0_0_I:%.*]] = phi i16 [ [[TMP7]], [[IF_END18_I_I]] ], [ [[TMP2]], [[IF_END6_I_I]] ], [ -32768, [[IF_END10_I_I]] ], [ [[SPEC_SELECT_I]], [[LAND_LHS_TRUE_I_I]] ] +// CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 2, ptr nonnull [[AGG_TMP10_I]]), !noalias [[META12]] +// CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 2, ptr nonnull [[AGG_TMP111_I]]), !noalias [[META12]] +// CHECK-NEXT: [[CALL5_I:%.*]] = call spir_func noundef align 2 dereferenceable(2) ptr addrspace(4) @_ZN4sycl3_V13vecINS0_3ext6oneapi8bfloat16ELi2EEixIS4_EENSt9enable_ifIXsr3stdE9is_same_vIT_S4_EERS4_E4typeEi(ptr 
addrspace(4) noundef align 4 dereferenceable_or_null(4) [[AGG_RESULT]], i32 noundef [[CONV_I]]) #[[ATTR7]] +// CHECK-NEXT: store i16 [[REF_TMP_SROA_0_0_I]], ptr addrspace(4) [[CALL5_I]], align 2, !tbaa [[TBAA10]] +// CHECK-NEXT: [[INC_I]] = add nuw nsw i64 [[I_0_I]], 1 +// CHECK-NEXT: br label [[FOR_COND_I]], !llvm.loop [[LOOP19:![0-9]+]] +// CHECK: _ZN4sycl3_V13ext6oneapi12experimental4fminINS0_3vecINS2_8bfloat16ELi2EEES7_Li2ELi2EEENSt9enable_ifIXaaaa24is_vec_or_swizzle_bf16_vIT_E24is_vec_or_swizzle_bf16_vIT0_EeqT1_T2_ENS5_IS6_XT1_EEEE4typeES9_SA_.exit: +// CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr nonnull [[AGG_TMP2]]) +// CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr nonnull [[AGG_TMP13]]) +// CHECK-NEXT: ret void +// +SYCL_EXTERNAL auto TestFMin(vec a, vec b) { + return experimental::fmin(a, b); +} + +// CHECK-LABEL: define dso_local spir_func void @_Z8TestFMaxN4sycl3_V13vecINS0_3ext6oneapi8bfloat16ELi3EEES5_( +// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias writable sret(%"class.sycl::_V1::vec.0") align 8 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.0") align 8 [[A:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.0") align 8 [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META22:![0-9]+]] !sycl_fixed_targets [[META6]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[AGG_TMP111_I:%.*]] = alloca %"class.sycl::_V1::ext::oneapi::bfloat16", align 8 +// CHECK-NEXT: [[AGG_TMP10_I:%.*]] = alloca %"class.sycl::_V1::ext::oneapi::bfloat16", align 8 +// CHECK-NEXT: [[AGG_TMP13:%.*]] = alloca %"class.sycl::_V1::vec.0", align 8 +// CHECK-NEXT: [[AGG_TMP2:%.*]] = alloca %"class.sycl::_V1::vec.0", align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A]], align 8, !tbaa [[TBAA7]] +// CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr [[B]], align 8, !tbaa [[TBAA7]] +// CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr nonnull [[AGG_TMP2]]) +// CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr 
nonnull [[AGG_TMP13]]) +// CHECK-NEXT: store i64 [[TMP1]], ptr [[AGG_TMP13]], align 1 +// CHECK-NEXT: store i64 [[TMP0]], ptr [[AGG_TMP2]], align 1 +// CHECK-NEXT: [[X_ASCAST_I:%.*]] = addrspacecast ptr [[AGG_TMP2]] to ptr addrspace(4) +// CHECK-NEXT: [[Y_ASCAST_I:%.*]] = addrspacecast ptr [[AGG_TMP13]] to ptr addrspace(4) +// CHECK-NEXT: [[X_ASCAST_I_I:%.*]] = addrspacecast ptr [[AGG_TMP10_I]] to ptr addrspace(4) +// CHECK-NEXT: [[Y_ASCAST_I_I:%.*]] = addrspacecast ptr [[AGG_TMP111_I]] to ptr addrspace(4) +// CHECK-NEXT: br label [[FOR_COND_I:%.*]] +// CHECK: for.cond.i: +// CHECK-NEXT: [[I_0_I:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INC_I:%.*]], [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL4FMAXINS2_8BFLOAT16EEENST9ENABLE_IFIXSR3STDE9IS_SAME_VIT_S5_EES7_E4TYPEES7_S7__EXIT_I:%.*]] ] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp ult i64 [[I_0_I]], 3 +// CHECK-NEXT: br i1 [[CMP_I]], label [[FOR_BODY_I:%.*]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL4FMAXINS0_3VECINS2_8BFLOAT16ELI3EEES7_LI3ELI3EEENST9ENABLE_IFIXAAAA24IS_VEC_OR_SWIZZLE_BF16_VIT_E24IS_VEC_OR_SWIZZLE_BF16_VIT0_EEQT1_T2_ENS5_IS6_XT1_EEEE4TYPEES9_SA__EXIT:%.*]] +// CHECK: for.body.i: +// CHECK-NEXT: [[CONV_I:%.*]] = trunc nuw nsw i64 [[I_0_I]] to i32 +// CHECK-NEXT: [[CALL_I:%.*]] = call spir_func noundef align 2 dereferenceable(2) ptr addrspace(4) @_ZN4sycl3_V13vecINS0_3ext6oneapi8bfloat16ELi3EEixIS4_EENSt9enable_ifIXsr3stdE9is_same_vIT_S4_EERS4_E4typeEi(ptr addrspace(4) noundef align 8 dereferenceable_or_null(8) [[X_ASCAST_I]], i32 noundef [[CONV_I]]) #[[ATTR7]] +// CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr addrspace(4) [[CALL_I]], align 2, !tbaa [[TBAA10]] +// CHECK-NEXT: [[CALL3_I:%.*]] = call spir_func noundef align 2 dereferenceable(2) ptr addrspace(4) @_ZN4sycl3_V13vecINS0_3ext6oneapi8bfloat16ELi3EEixIS4_EENSt9enable_ifIXsr3stdE9is_same_vIT_S4_EERS4_E4typeEi(ptr addrspace(4) noundef align 8 dereferenceable_or_null(8) [[Y_ASCAST_I]], i32 noundef [[CONV_I]]) #[[ATTR7]] +// CHECK-NEXT: [[TMP3:%.*]] = load i16, 
ptr addrspace(4) [[CALL3_I]], align 2, !tbaa [[TBAA10]] +// CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 2, ptr nonnull [[AGG_TMP10_I]]), !noalias [[META23:![0-9]+]] +// CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 2, ptr nonnull [[AGG_TMP111_I]]), !noalias [[META23]] +// CHECK-NEXT: store i16 [[TMP3]], ptr [[AGG_TMP111_I]], align 1, !noalias [[META23]] +// CHECK-NEXT: store i16 [[TMP2]], ptr [[AGG_TMP10_I]], align 1, !noalias [[META23]] +// CHECK-NEXT: [[CONV_I_I_I:%.*]] = zext i16 [[TMP2]] to i32 +// CHECK-NEXT: [[AND_I_I_I:%.*]] = and i32 [[CONV_I_I_I]], 32640 +// CHECK-NEXT: [[CMP_I_I_I:%.*]] = icmp eq i32 [[AND_I_I_I]], 32640 +// CHECK-NEXT: [[AND2_I_I_I:%.*]] = and i32 [[CONV_I_I_I]], 127 +// CHECK-NEXT: [[TOBOOL_I_I_I:%.*]] = icmp ne i32 [[AND2_I_I_I]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = and i1 [[CMP_I_I_I]], [[TOBOOL_I_I_I]] +// CHECK-NEXT: br i1 [[TMP4]], label [[LAND_LHS_TRUE_I_I:%.*]], label [[IF_END6_I_I:%.*]] +// CHECK: land.lhs.true.i.i: +// CHECK-NEXT: [[CONV_I25_I_I:%.*]] = zext i16 [[TMP3]] to i32 +// CHECK-NEXT: [[AND_I26_I_I:%.*]] = and i32 [[CONV_I25_I_I]], 32640 +// CHECK-NEXT: [[CMP_I27_I_I:%.*]] = icmp eq i32 [[AND_I26_I_I]], 32640 +// CHECK-NEXT: [[AND2_I28_I_I:%.*]] = and i32 [[CONV_I25_I_I]], 127 +// CHECK-NEXT: [[TOBOOL_I29_I_I:%.*]] = icmp ne i32 [[AND2_I28_I_I]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = and i1 [[CMP_I27_I_I]], [[TOBOOL_I29_I_I]] +// CHECK-NEXT: [[SPEC_SELECT_I:%.*]] = select i1 [[TMP5]], i16 32704, i16 [[TMP3]] +// CHECK-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL4FMAXINS2_8BFLOAT16EEENST9ENABLE_IFIXSR3STDE9IS_SAME_VIT_S5_EES7_E4TYPEES7_S7__EXIT_I]] +// CHECK: if.end6.i.i: +// CHECK-NEXT: [[CONV_I39_I_I:%.*]] = zext i16 [[TMP3]] to i32 +// CHECK-NEXT: [[AND_I40_I_I:%.*]] = and i32 [[CONV_I39_I_I]], 32640 +// CHECK-NEXT: [[CMP_I41_I_I:%.*]] = icmp eq i32 [[AND_I40_I_I]], 32640 +// CHECK-NEXT: [[AND2_I42_I_I:%.*]] = and i32 [[CONV_I39_I_I]], 127 +// CHECK-NEXT: [[TOBOOL_I43_I_I:%.*]] = icmp ne i32 
[[AND2_I42_I_I]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = and i1 [[CMP_I41_I_I]], [[TOBOOL_I43_I_I]] +// CHECK-NEXT: br i1 [[TMP6]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL4FMAXINS2_8BFLOAT16EEENST9ENABLE_IFIXSR3STDE9IS_SAME_VIT_S5_EES7_E4TYPEES7_S7__EXIT_I]], label [[IF_END10_I_I:%.*]] +// CHECK: if.end10.i.i: +// CHECK-NEXT: [[OR_I_I:%.*]] = or i32 [[CONV_I_I_I]], [[CONV_I39_I_I]] +// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp eq i32 [[OR_I_I]], 32768 +// CHECK-NEXT: [[AND_I_I:%.*]] = and i32 [[CONV_I_I_I]], [[CONV_I39_I_I]] +// CHECK-NEXT: [[TOBOOL_NOT_I_I:%.*]] = icmp eq i32 [[AND_I_I]], 0 +// CHECK-NEXT: [[OR_COND_I_I:%.*]] = and i1 [[CMP_I_I]], [[TOBOOL_NOT_I_I]] +// CHECK-NEXT: br i1 [[OR_COND_I_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL4FMAXINS2_8BFLOAT16EEENST9ENABLE_IFIXSR3STDE9IS_SAME_VIT_S5_EES7_E4TYPEES7_S7__EXIT_I]], label [[IF_END18_I_I:%.*]] +// CHECK: if.end18.i.i: +// CHECK-NEXT: [[CALL_I_I_I_I_I:%.*]] = call spir_func noundef float @__devicelib_ConvertBF16ToFINTEL(ptr addrspace(4) noundef align 2 dereferenceable(2) [[X_ASCAST_I_I]]) #[[ATTR8]], !noalias [[META26:![0-9]+]] +// CHECK-NEXT: [[CALL_I_I2_I_I_I:%.*]] = call spir_func noundef float @__devicelib_ConvertBF16ToFINTEL(ptr addrspace(4) noundef align 2 dereferenceable(2) [[Y_ASCAST_I_I]]) #[[ATTR8]], !noalias [[META26]] +// CHECK-NEXT: [[CMP_I44_I_I:%.*]] = fcmp ogt float [[CALL_I_I_I_I_I]], [[CALL_I_I2_I_I_I]] +// CHECK-NEXT: [[X_ASCAST_VAL_I_I:%.*]] = load i16, ptr [[AGG_TMP10_I]], align 2, !noalias [[META29:![0-9]+]] +// CHECK-NEXT: [[Y_ASCAST_VAL_I_I:%.*]] = load i16, ptr [[AGG_TMP111_I]], align 2, !noalias [[META29]] +// CHECK-NEXT: [[TMP7:%.*]] = select i1 [[CMP_I44_I_I]], i16 [[X_ASCAST_VAL_I_I]], i16 [[Y_ASCAST_VAL_I_I]] +// CHECK-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL4FMAXINS2_8BFLOAT16EEENST9ENABLE_IFIXSR3STDE9IS_SAME_VIT_S5_EES7_E4TYPEES7_S7__EXIT_I]] +// CHECK: 
_ZN4sycl3_V13ext6oneapi12experimental4fmaxINS2_8bfloat16EEENSt9enable_ifIXsr3stdE9is_same_vIT_S5_EES7_E4typeES7_S7_.exit.i: +// CHECK-NEXT: [[REF_TMP_SROA_0_0_I:%.*]] = phi i16 [ [[TMP7]], [[IF_END18_I_I]] ], [ [[TMP2]], [[IF_END6_I_I]] ], [ 0, [[IF_END10_I_I]] ], [ [[SPEC_SELECT_I]], [[LAND_LHS_TRUE_I_I]] ] +// CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 2, ptr nonnull [[AGG_TMP10_I]]), !noalias [[META23]] +// CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 2, ptr nonnull [[AGG_TMP111_I]]), !noalias [[META23]] +// CHECK-NEXT: [[CALL5_I:%.*]] = call spir_func noundef align 2 dereferenceable(2) ptr addrspace(4) @_ZN4sycl3_V13vecINS0_3ext6oneapi8bfloat16ELi3EEixIS4_EENSt9enable_ifIXsr3stdE9is_same_vIT_S4_EERS4_E4typeEi(ptr addrspace(4) noundef align 8 dereferenceable_or_null(8) [[AGG_RESULT]], i32 noundef [[CONV_I]]) #[[ATTR7]] +// CHECK-NEXT: store i16 [[REF_TMP_SROA_0_0_I]], ptr addrspace(4) [[CALL5_I]], align 2, !tbaa [[TBAA10]] +// CHECK-NEXT: [[INC_I]] = add nuw nsw i64 [[I_0_I]], 1 +// CHECK-NEXT: br label [[FOR_COND_I]], !llvm.loop [[LOOP30:![0-9]+]] +// CHECK: _ZN4sycl3_V13ext6oneapi12experimental4fmaxINS0_3vecINS2_8bfloat16ELi3EEES7_Li3ELi3EEENSt9enable_ifIXaaaa24is_vec_or_swizzle_bf16_vIT_E24is_vec_or_swizzle_bf16_vIT0_EeqT1_T2_ENS5_IS6_XT1_EEEE4typeES9_SA_.exit: +// CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr nonnull [[AGG_TMP2]]) +// CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr nonnull [[AGG_TMP13]]) +// CHECK-NEXT: ret void +// +SYCL_EXTERNAL auto TestFMax(vec a, vec b) { + return experimental::fmax(a, b); +} + +// CHECK-LABEL: define dso_local spir_func void @_Z9TestIsNanN4sycl3_V13vecINS0_3ext6oneapi8bfloat16ELi4EEE( +// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.1") align 8 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.2") align 8 [[A:%.*]]) local_unnamed_addr #[[ATTR3:[0-9]+]] !srcloc [[META31:![0-9]+]] !sycl_fixed_targets 
[[META6]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[AGG_TMP1:%.*]] = alloca %"class.sycl::_V1::vec.2", align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A]], align 8, !tbaa [[TBAA7]] +// CHECK-NEXT: tail call void @llvm.experimental.noalias.scope.decl(metadata [[META32:![0-9]+]]) +// CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr nonnull [[AGG_TMP1]]) +// CHECK-NEXT: store i64 [[TMP0]], ptr [[AGG_TMP1]], align 1 +// CHECK-NEXT: [[X_ASCAST_I:%.*]] = addrspacecast ptr [[AGG_TMP1]] to ptr addrspace(4) +// CHECK-NEXT: br label [[FOR_COND_I:%.*]] +// CHECK: for.cond.i: +// CHECK-NEXT: [[I_0_I:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INC_I:%.*]], [[FOR_BODY_I:%.*]] ] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp ult i64 [[I_0_I]], 4 +// CHECK-NEXT: br i1 [[CMP_I]], label [[FOR_BODY_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL5ISNANINS0_3VECINS2_8BFLOAT16ELI4EEELI4EEENST9ENABLE_IFIX24IS_VEC_OR_SWIZZLE_BF16_VIT_EENS5_ISXT0_EEEE4TYPEES9__EXIT:%.*]] +// CHECK: for.body.i: +// CHECK-NEXT: [[CONV_I:%.*]] = trunc nuw nsw i64 [[I_0_I]] to i32 +// CHECK-NEXT: [[CALL_I:%.*]] = call spir_func noundef align 2 dereferenceable(2) ptr addrspace(4) @_ZN4sycl3_V13vecINS0_3ext6oneapi8bfloat16ELi4EEixIS4_EENSt9enable_ifIXsr3stdE9is_same_vIT_S4_EERS4_E4typeEi(ptr addrspace(4) noundef align 8 dereferenceable_or_null(8) [[X_ASCAST_I]], i32 noundef [[CONV_I]]) #[[ATTR7]], !noalias [[META32]] +// CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr addrspace(4) [[CALL_I]], align 2, !tbaa [[TBAA10]], !noalias [[META32]] +// CHECK-NEXT: [[CONV_I_I:%.*]] = zext i16 [[TMP1]] to i32 +// CHECK-NEXT: [[AND_I_I:%.*]] = and i32 [[CONV_I_I]], 32640 +// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp eq i32 [[AND_I_I]], 32640 +// CHECK-NEXT: [[AND2_I_I:%.*]] = and i32 [[CONV_I_I]], 127 +// CHECK-NEXT: [[TOBOOL_I_I:%.*]] = icmp ne i32 [[AND2_I_I]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = and i1 [[CMP_I_I]], [[TOBOOL_I_I]] +// CHECK-NEXT: [[CONV2_I:%.*]] = sext i1 [[TMP2]] to i16 +// CHECK-NEXT: [[ARRAYIDX_I_I:%.*]] = 
getelementptr inbounds i16, ptr addrspace(4) [[AGG_RESULT]], i64 [[I_0_I]] +// CHECK-NEXT: store i16 [[CONV2_I]], ptr addrspace(4) [[ARRAYIDX_I_I]], align 2, !tbaa [[TBAA10]], !alias.scope [[META32]] +// CHECK-NEXT: [[INC_I]] = add nuw nsw i64 [[I_0_I]], 1 +// CHECK-NEXT: br label [[FOR_COND_I]], !llvm.loop [[LOOP35:![0-9]+]] +// CHECK: _ZN4sycl3_V13ext6oneapi12experimental5isnanINS0_3vecINS2_8bfloat16ELi4EEELi4EEENSt9enable_ifIX24is_vec_or_swizzle_bf16_vIT_EENS5_IsXT0_EEEE4typeES9_.exit: +// CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr nonnull [[AGG_TMP1]]) +// CHECK-NEXT: ret void +// +SYCL_EXTERNAL auto TestIsNan(vec a) { + return experimental::isnan(a); +} + +// CHECK-LABEL: define dso_local spir_func void @_Z8TestFabsN4sycl3_V13vecINS0_3ext6oneapi8bfloat16ELi8EEE( +// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias writable sret(%"class.sycl::_V1::vec.3") align 16 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.3") align 16 [[A:%.*]]) local_unnamed_addr #[[ATTR3]] !srcloc [[META36:![0-9]+]] !sycl_fixed_targets [[META6]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[AGG_TMP1:%.*]] = alloca %"class.sycl::_V1::vec.3", align 16 +// CHECK-NEXT: [[AGG_TMP_SROA_0_0_COPYLOAD:%.*]] = load <8 x i16>, ptr [[A]], align 16, !tbaa [[TBAA7]] +// CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 16, ptr nonnull [[AGG_TMP1]]) +// CHECK-NEXT: store <8 x i16> [[AGG_TMP_SROA_0_0_COPYLOAD]], ptr [[AGG_TMP1]], align 1 +// CHECK-NEXT: [[X_ASCAST_I:%.*]] = addrspacecast ptr [[AGG_TMP1]] to ptr addrspace(4) +// CHECK-NEXT: br label [[FOR_COND_I:%.*]] +// CHECK: for.cond.i: +// CHECK-NEXT: [[I_0_I:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INC_I:%.*]], [[FOR_BODY_I:%.*]] ] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp ult i64 [[I_0_I]], 8 +// CHECK-NEXT: br i1 [[CMP_I]], label [[FOR_BODY_I]], label 
[[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL4FABSINS0_3VECINS2_8BFLOAT16ELI8EEELI8EEENST9ENABLE_IFIX24IS_VEC_OR_SWIZZLE_BF16_VIT_EENS5_IS6_XT0_EEEE4TYPEES9__EXIT:%.*]] +// CHECK: for.body.i: +// CHECK-NEXT: [[CONV_I:%.*]] = trunc nuw nsw i64 [[I_0_I]] to i32 +// CHECK-NEXT: [[CALL_I:%.*]] = call spir_func noundef align 2 dereferenceable(2) ptr addrspace(4) @_ZN4sycl3_V13vecINS0_3ext6oneapi8bfloat16ELi8EEixIS4_EENSt9enable_ifIXsr3stdE9is_same_vIT_S4_EERS4_E4typeEi(ptr addrspace(4) noundef align 16 dereferenceable_or_null(16) [[X_ASCAST_I]], i32 noundef [[CONV_I]]) #[[ATTR7]] +// CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr addrspace(4) [[CALL_I]], align 2, !tbaa [[TBAA10]] +// CHECK-NEXT: [[CONV_I_I_I:%.*]] = zext i16 [[TMP0]] to i32 +// CHECK-NEXT: [[AND_I_I_I:%.*]] = and i32 [[CONV_I_I_I]], 32640 +// CHECK-NEXT: [[CMP_I_I_I:%.*]] = icmp eq i32 [[AND_I_I_I]], 32640 +// CHECK-NEXT: [[AND2_I_I_I:%.*]] = and i32 [[CONV_I_I_I]], 127 +// CHECK-NEXT: [[TOBOOL_I_I_I:%.*]] = icmp ne i32 [[AND2_I_I_I]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = and i1 [[CMP_I_I_I]], [[TOBOOL_I_I_I]] +// CHECK-NEXT: [[TMP2:%.*]] = and i16 [[TMP0]], 32767 +// CHECK-NEXT: [[SPEC_SELECT_I_I:%.*]] = select i1 [[TMP1]], i16 [[TMP0]], i16 [[TMP2]] +// CHECK-NEXT: [[CALL2_I:%.*]] = call spir_func noundef align 2 dereferenceable(2) ptr addrspace(4) @_ZN4sycl3_V13vecINS0_3ext6oneapi8bfloat16ELi8EEixIS4_EENSt9enable_ifIXsr3stdE9is_same_vIT_S4_EERS4_E4typeEi(ptr addrspace(4) noundef align 16 dereferenceable_or_null(16) [[AGG_RESULT]], i32 noundef [[CONV_I]]) #[[ATTR7]] +// CHECK-NEXT: store i16 [[SPEC_SELECT_I_I]], ptr addrspace(4) [[CALL2_I]], align 2, !tbaa [[TBAA10]] +// CHECK-NEXT: [[INC_I]] = add nuw nsw i64 [[I_0_I]], 1 +// CHECK-NEXT: br label [[FOR_COND_I]], !llvm.loop [[LOOP37:![0-9]+]] +// CHECK: _ZN4sycl3_V13ext6oneapi12experimental4fabsINS0_3vecINS2_8bfloat16ELi8EEELi8EEENSt9enable_ifIX24is_vec_or_swizzle_bf16_vIT_EENS5_IS6_XT0_EEEE4typeES9_.exit: +// CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 16, ptr 
nonnull [[AGG_TMP1]]) +// CHECK-NEXT: ret void +// +SYCL_EXTERNAL auto TestFabs(vec a) { + return experimental::fabs(a); +} + +// CHECK-LABEL: define dso_local spir_func void @_Z8TestCeilN4sycl3_V13vecINS0_3ext6oneapi8bfloat16ELi8EEE( +// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias writable sret(%"class.sycl::_V1::vec.3") align 16 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.3") align 16 [[A:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META38:![0-9]+]] !sycl_fixed_targets [[META6]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[REF_TMP_I_I:%.*]] = alloca float, align 4 +// CHECK-NEXT: [[AGG_TMP6_I:%.*]] = alloca %"class.sycl::_V1::ext::oneapi::bfloat16", align 8 +// CHECK-NEXT: [[AGG_TMP1:%.*]] = alloca %"class.sycl::_V1::vec.3", align 16 +// CHECK-NEXT: [[AGG_TMP_SROA_0_0_COPYLOAD:%.*]] = load <8 x i16>, ptr [[A]], align 16, !tbaa [[TBAA7]] +// CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 16, ptr nonnull [[AGG_TMP1]]) +// CHECK-NEXT: store <8 x i16> [[AGG_TMP_SROA_0_0_COPYLOAD]], ptr [[AGG_TMP1]], align 1 +// CHECK-NEXT: [[X_ASCAST_I:%.*]] = addrspacecast ptr [[AGG_TMP1]] to ptr addrspace(4) +// CHECK-NEXT: [[REF_TMP_ASCAST_I_I:%.*]] = addrspacecast ptr [[REF_TMP_I_I]] to ptr addrspace(4) +// CHECK-NEXT: [[X_ASCAST_I_I:%.*]] = addrspacecast ptr [[AGG_TMP6_I]] to ptr addrspace(4) +// CHECK-NEXT: br label [[FOR_COND_I:%.*]] +// CHECK: for.cond.i: +// CHECK-NEXT: [[I_0_I:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INC_I:%.*]], [[FOR_BODY_I:%.*]] ] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp ult i64 [[I_0_I]], 8 +// CHECK-NEXT: br i1 [[CMP_I]], label [[FOR_BODY_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL4CEILINS0_3VECINS2_8BFLOAT16ELI8EEELI8EEENST9ENABLE_IFIX24IS_VEC_OR_SWIZZLE_BF16_VIT_EENS5_IS6_XT0_EEEE4TYPEES9__EXIT:%.*]] +// CHECK: for.body.i: +// CHECK-NEXT: [[CONV_I:%.*]] = trunc nuw nsw i64 [[I_0_I]] to i32 +// CHECK-NEXT: [[CALL_I:%.*]] = call spir_func noundef align 2 dereferenceable(2) ptr addrspace(4) 
@_ZN4sycl3_V13vecINS0_3ext6oneapi8bfloat16ELi8EEixIS4_EENSt9enable_ifIXsr3stdE9is_same_vIT_S4_EERS4_E4typeEi(ptr addrspace(4) noundef align 16 dereferenceable_or_null(16) [[X_ASCAST_I]], i32 noundef [[CONV_I]]) #[[ATTR7]] +// CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr addrspace(4) [[CALL_I]], align 2, !tbaa [[TBAA10]] +// CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 2, ptr nonnull [[AGG_TMP6_I]]), !noalias [[META39:![0-9]+]] +// CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr nonnull [[REF_TMP_I_I]]), !noalias [[META39]] +// CHECK-NEXT: store i16 [[TMP0]], ptr [[AGG_TMP6_I]], align 1, !noalias [[META39]] +// CHECK-NEXT: [[CALL_I_I_I_I:%.*]] = call spir_func noundef float @__devicelib_ConvertBF16ToFINTEL(ptr addrspace(4) noundef align 2 dereferenceable(2) [[X_ASCAST_I_I]]) #[[ATTR8]], !noalias [[META42:![0-9]+]] +// CHECK-NEXT: [[CALL_I_I_I:%.*]] = call spir_func noundef float @_Z16__spirv_ocl_ceilf(float noundef [[CALL_I_I_I_I]]) #[[ATTR9:[0-9]+]] +// CHECK-NEXT: store float [[CALL_I_I_I]], ptr [[REF_TMP_I_I]], align 4, !tbaa [[TBAA45:![0-9]+]], !noalias [[META47:![0-9]+]] +// CHECK-NEXT: [[CALL_I_I2_I_I:%.*]] = call spir_func noundef zeroext i16 @__devicelib_ConvertFToBF16INTEL(ptr addrspace(4) noundef align 4 dereferenceable(4) [[REF_TMP_ASCAST_I_I]]) #[[ATTR8]], !noalias [[META42]] +// CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 2, ptr nonnull [[AGG_TMP6_I]]), !noalias [[META39]] +// CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr nonnull [[REF_TMP_I_I]]), !noalias [[META39]] +// CHECK-NEXT: [[CALL2_I:%.*]] = call spir_func noundef align 2 dereferenceable(2) ptr addrspace(4) @_ZN4sycl3_V13vecINS0_3ext6oneapi8bfloat16ELi8EEixIS4_EENSt9enable_ifIXsr3stdE9is_same_vIT_S4_EERS4_E4typeEi(ptr addrspace(4) noundef align 16 dereferenceable_or_null(16) [[AGG_RESULT]], i32 noundef [[CONV_I]]) #[[ATTR7]] +// CHECK-NEXT: store i16 [[CALL_I_I2_I_I]], ptr addrspace(4) [[CALL2_I]], align 2, !tbaa [[TBAA10]] +// CHECK-NEXT: [[INC_I]] = add nuw nsw i64 
[[I_0_I]], 1 +// CHECK-NEXT: br label [[FOR_COND_I]], !llvm.loop [[LOOP48:![0-9]+]] +// CHECK: _ZN4sycl3_V13ext6oneapi12experimental4ceilINS0_3vecINS2_8bfloat16ELi8EEELi8EEENSt9enable_ifIX24is_vec_or_swizzle_bf16_vIT_EENS5_IS6_XT0_EEEE4typeES9_.exit: +// CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 16, ptr nonnull [[AGG_TMP1]]) +// CHECK-NEXT: ret void +// +SYCL_EXTERNAL auto TestCeil(vec a) { + return experimental::ceil(a); +} + +// CHECK-LABEL: define dso_local spir_func void @_Z7TestFMAN4sycl3_V13vecINS0_3ext6oneapi8bfloat16ELi16EEES5_S5_( +// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias writable sret(%"class.sycl::_V1::vec.4") align 32 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.4") align 32 [[A:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.4") align 32 [[B:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.4") align 32 [[C:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META49:![0-9]+]] !sycl_fixed_targets [[META6]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[REF_TMP_I_I:%.*]] = alloca float, align 4 +// CHECK-NEXT: [[AGG_TMP416_I:%.*]] = alloca %"class.sycl::_V1::ext::oneapi::bfloat16", align 8 +// CHECK-NEXT: [[AGG_TMP115_I:%.*]] = alloca %"class.sycl::_V1::ext::oneapi::bfloat16", align 8 +// CHECK-NEXT: [[AGG_TMP14_I:%.*]] = alloca %"class.sycl::_V1::ext::oneapi::bfloat16", align 8 +// CHECK-NEXT: [[AGG_TMP25:%.*]] = alloca %"class.sycl::_V1::vec.4", align 32 +// CHECK-NEXT: [[AGG_TMP14:%.*]] = alloca %"class.sycl::_V1::vec.4", align 32 +// CHECK-NEXT: [[AGG_TMP3:%.*]] = alloca %"class.sycl::_V1::vec.4", align 32 +// CHECK-NEXT: [[AGG_TMP_SROA_0_0_COPYLOAD:%.*]] = load <16 x i16>, ptr [[A]], align 32, !tbaa [[TBAA7]] +// CHECK-NEXT: [[AGG_TMP1_SROA_0_0_COPYLOAD:%.*]] = load <16 x i16>, ptr [[B]], align 32, !tbaa [[TBAA7]] +// CHECK-NEXT: [[AGG_TMP2_SROA_0_0_COPYLOAD:%.*]] = load <16 x i16>, ptr [[C]], align 32, !tbaa [[TBAA7]] +// CHECK-NEXT: call void 
@llvm.lifetime.start.p0(i64 32, ptr nonnull [[AGG_TMP3]]) +// CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 32, ptr nonnull [[AGG_TMP14]]) +// CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 32, ptr nonnull [[AGG_TMP25]]) +// CHECK-NEXT: store <16 x i16> [[AGG_TMP2_SROA_0_0_COPYLOAD]], ptr [[AGG_TMP25]], align 1 +// CHECK-NEXT: store <16 x i16> [[AGG_TMP1_SROA_0_0_COPYLOAD]], ptr [[AGG_TMP14]], align 1 +// CHECK-NEXT: store <16 x i16> [[AGG_TMP_SROA_0_0_COPYLOAD]], ptr [[AGG_TMP3]], align 1 +// CHECK-NEXT: [[X_ASCAST_I:%.*]] = addrspacecast ptr [[AGG_TMP3]] to ptr addrspace(4) +// CHECK-NEXT: [[Y_ASCAST_I:%.*]] = addrspacecast ptr [[AGG_TMP14]] to ptr addrspace(4) +// CHECK-NEXT: [[Z_ASCAST_I:%.*]] = addrspacecast ptr [[AGG_TMP25]] to ptr addrspace(4) +// CHECK-NEXT: [[REF_TMP_ASCAST_I_I:%.*]] = addrspacecast ptr [[REF_TMP_I_I]] to ptr addrspace(4) +// CHECK-NEXT: [[X_ASCAST_I_I:%.*]] = addrspacecast ptr [[AGG_TMP14_I]] to ptr addrspace(4) +// CHECK-NEXT: [[Y_ASCAST_I_I:%.*]] = addrspacecast ptr [[AGG_TMP115_I]] to ptr addrspace(4) +// CHECK-NEXT: [[Z_ASCAST_I_I:%.*]] = addrspacecast ptr [[AGG_TMP416_I]] to ptr addrspace(4) +// CHECK-NEXT: br label [[FOR_COND_I:%.*]] +// CHECK: for.cond.i: +// CHECK-NEXT: [[I_0_I:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INC_I:%.*]], [[FOR_BODY_I:%.*]] ] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp ult i64 [[I_0_I]], 16 +// CHECK-NEXT: br i1 [[CMP_I]], label [[FOR_BODY_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL3FMAINS0_3VECINS2_8BFLOAT16ELI16EEES7_S7_LI16ELI16ELI16EEENST9ENABLE_IFIXAAAAAAAA24IS_VEC_OR_SWIZZLE_BF16_VIT_E24IS_VEC_OR_SWIZZLE_BF16_VIT0_E24IS_VEC_OR_SWIZZLE_BF16_VIT1_EEQT2_T3_EQT3_T4_ENS5_IS6_XT2_EEEE4TYPEES9_SA_SB__EXIT:%.*]] +// CHECK: for.body.i: +// CHECK-NEXT: [[CONV_I:%.*]] = trunc nuw nsw i64 [[I_0_I]] to i32 +// CHECK-NEXT: [[CALL_I:%.*]] = call spir_func noundef align 2 dereferenceable(2) ptr addrspace(4) 
@_ZN4sycl3_V13vecINS0_3ext6oneapi8bfloat16ELi16EEixIS4_EENSt9enable_ifIXsr3stdE9is_same_vIT_S4_EERS4_E4typeEi(ptr addrspace(4) noundef align 32 dereferenceable_or_null(32) [[X_ASCAST_I]], i32 noundef [[CONV_I]]) #[[ATTR7]] +// CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr addrspace(4) [[CALL_I]], align 2, !tbaa [[TBAA10]] +// CHECK-NEXT: [[CALL3_I:%.*]] = call spir_func noundef align 2 dereferenceable(2) ptr addrspace(4) @_ZN4sycl3_V13vecINS0_3ext6oneapi8bfloat16ELi16EEixIS4_EENSt9enable_ifIXsr3stdE9is_same_vIT_S4_EERS4_E4typeEi(ptr addrspace(4) noundef align 32 dereferenceable_or_null(32) [[Y_ASCAST_I]], i32 noundef [[CONV_I]]) #[[ATTR7]] +// CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr addrspace(4) [[CALL3_I]], align 2, !tbaa [[TBAA10]] +// CHECK-NEXT: [[CALL6_I:%.*]] = call spir_func noundef align 2 dereferenceable(2) ptr addrspace(4) @_ZN4sycl3_V13vecINS0_3ext6oneapi8bfloat16ELi16EEixIS4_EENSt9enable_ifIXsr3stdE9is_same_vIT_S4_EERS4_E4typeEi(ptr addrspace(4) noundef align 32 dereferenceable_or_null(32) [[Z_ASCAST_I]], i32 noundef [[CONV_I]]) #[[ATTR7]] +// CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr addrspace(4) [[CALL6_I]], align 2, !tbaa [[TBAA10]] +// CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 2, ptr nonnull [[AGG_TMP14_I]]), !noalias [[META50:![0-9]+]] +// CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 2, ptr nonnull [[AGG_TMP115_I]]), !noalias [[META50]] +// CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 2, ptr nonnull [[AGG_TMP416_I]]), !noalias [[META50]] +// CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr nonnull [[REF_TMP_I_I]]), !noalias [[META50]] +// CHECK-NEXT: store i16 [[TMP2]], ptr [[AGG_TMP416_I]], align 1, !noalias [[META50]] +// CHECK-NEXT: store i16 [[TMP1]], ptr [[AGG_TMP115_I]], align 1, !noalias [[META50]] +// CHECK-NEXT: store i16 [[TMP0]], ptr [[AGG_TMP14_I]], align 1, !noalias [[META50]] +// CHECK-NEXT: [[CALL_I_I_I_I:%.*]] = call spir_func noundef float @__devicelib_ConvertBF16ToFINTEL(ptr addrspace(4) noundef align 2 
dereferenceable(2) [[X_ASCAST_I_I]]) #[[ATTR8]], !noalias [[META53:![0-9]+]] +// CHECK-NEXT: [[CALL_I_I4_I_I:%.*]] = call spir_func noundef float @__devicelib_ConvertBF16ToFINTEL(ptr addrspace(4) noundef align 2 dereferenceable(2) [[Y_ASCAST_I_I]]) #[[ATTR8]], !noalias [[META53]] +// CHECK-NEXT: [[CALL_I_I5_I_I:%.*]] = call spir_func noundef float @__devicelib_ConvertBF16ToFINTEL(ptr addrspace(4) noundef align 2 dereferenceable(2) [[Z_ASCAST_I_I]]) #[[ATTR8]], !noalias [[META53]] +// CHECK-NEXT: [[CALL_I_I_I:%.*]] = call spir_func noundef float @_Z15__spirv_ocl_fmafff(float noundef [[CALL_I_I_I_I]], float noundef [[CALL_I_I4_I_I]], float noundef [[CALL_I_I5_I_I]]) #[[ATTR9]] +// CHECK-NEXT: store float [[CALL_I_I_I]], ptr [[REF_TMP_I_I]], align 4, !tbaa [[TBAA45]], !noalias [[META56:![0-9]+]] +// CHECK-NEXT: [[CALL_I_I6_I_I:%.*]] = call spir_func noundef zeroext i16 @__devicelib_ConvertFToBF16INTEL(ptr addrspace(4) noundef align 4 dereferenceable(4) [[REF_TMP_ASCAST_I_I]]) #[[ATTR8]], !noalias [[META53]] +// CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 2, ptr nonnull [[AGG_TMP14_I]]), !noalias [[META50]] +// CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 2, ptr nonnull [[AGG_TMP115_I]]), !noalias [[META50]] +// CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 2, ptr nonnull [[AGG_TMP416_I]]), !noalias [[META50]] +// CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr nonnull [[REF_TMP_I_I]]), !noalias [[META50]] +// CHECK-NEXT: [[CALL8_I:%.*]] = call spir_func noundef align 2 dereferenceable(2) ptr addrspace(4) @_ZN4sycl3_V13vecINS0_3ext6oneapi8bfloat16ELi16EEixIS4_EENSt9enable_ifIXsr3stdE9is_same_vIT_S4_EERS4_E4typeEi(ptr addrspace(4) noundef align 32 dereferenceable_or_null(32) [[AGG_RESULT]], i32 noundef [[CONV_I]]) #[[ATTR7]] +// CHECK-NEXT: store i16 [[CALL_I_I6_I_I]], ptr addrspace(4) [[CALL8_I]], align 2, !tbaa [[TBAA10]] +// CHECK-NEXT: [[INC_I]] = add nuw nsw i64 [[I_0_I]], 1 +// CHECK-NEXT: br label [[FOR_COND_I]], !llvm.loop [[LOOP57:![0-9]+]] +// 
CHECK: _ZN4sycl3_V13ext6oneapi12experimental3fmaINS0_3vecINS2_8bfloat16ELi16EEES7_S7_Li16ELi16ELi16EEENSt9enable_ifIXaaaaaaaa24is_vec_or_swizzle_bf16_vIT_E24is_vec_or_swizzle_bf16_vIT0_E24is_vec_or_swizzle_bf16_vIT1_EeqT2_T3_eqT3_T4_ENS5_IS6_XT2_EEEE4typeES9_SA_SB_.exit: +// CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 32, ptr nonnull [[AGG_TMP3]]) +// CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 32, ptr nonnull [[AGG_TMP14]]) +// CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 32, ptr nonnull [[AGG_TMP25]]) +// CHECK-NEXT: ret void +// +SYCL_EXTERNAL auto TestFMA(vec a, vec b, + vec c) { + return experimental::fma(a, b, c); +} From 3d9ded6e518e42991abffc9397341b9c3dc8e741 Mon Sep 17 00:00:00 2001 From: Ian Li <54701975+ianayl@users.noreply.github.com> Date: Fri, 7 Jun 2024 03:09:04 -0700 Subject: [PATCH 37/55] [SYCL] Change check_device_code CUDA tests to use SYCL_EXTERNAL (#13943) Changed CUDA sycl/test/check_device_code lit test cases to use SYCL_EXTERNAL functions instead of submitting kernels to the queue everytime. 
--- sycl/test/check_device_code/cuda/ldg.cpp | 482 ++++++++---------- .../matrix/matrix-nvptx-bfloat16-test.cpp | 439 ++++++++-------- .../cuda/matrix/matrix-nvptx-double-test.cpp | 168 +++--- .../matrix/matrix-nvptx-half-float-test.cpp | 439 ++++++++-------- .../matrix/matrix-nvptx-half-half-test.cpp | 440 ++++++++-------- .../cuda/matrix/matrix-nvptx-int8-test.cpp | 460 +++++++++-------- .../cuda/matrix/matrix-nvptx-tf32-test.cpp | 203 ++++---- .../cuda/matrix/matrix-nvptx-uint8-test.cpp | 440 ++++++++-------- .../math-builtins/native-math-cuda.cpp | 103 ++-- 9 files changed, 1584 insertions(+), 1590 deletions(-) diff --git a/sycl/test/check_device_code/cuda/ldg.cpp b/sycl/test/check_device_code/cuda/ldg.cpp index f0fd4ac9deef8..e9ed4ba8a51ca 100644 --- a/sycl/test/check_device_code/cuda/ldg.cpp +++ b/sycl/test/check_device_code/cuda/ldg.cpp @@ -1,3 +1,4 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --functions "ldg" --include-generated-funcs --version 4 // REQUIRES: cuda // RUN: %clangxx -fsycl-device-only -fsycl-targets=nvptx64-nvidia-cuda -Xclang -fnative-half-type -S -Xclang -emit-llvm %s -o -| FileCheck %s --check-prefixes=CHECK-OPAQUE @@ -9,279 +10,208 @@ using namespace sycl; using namespace sycl::ext::oneapi::experimental::cuda; using namespace sycl::ext::oneapi::experimental; -int main() { - - sycl::queue q; - - auto *in_c = sycl::malloc_device(1, q); - auto *in_sc = sycl::malloc_device(1, q); - auto *in_s = sycl::malloc_device(1, q); - auto *in_i = sycl::malloc_device(1, q); - auto *in_l = sycl::malloc_device(1, q); - auto *in_ll = sycl::malloc_device(1, q); - - auto *in_uc = sycl::malloc_device(1, q); - auto *in_us = sycl::malloc_device(1, q); - auto *in_ui = sycl::malloc_device(1, q); - auto *in_ul = sycl::malloc_device(1, q); - auto *in_ull = sycl::malloc_device(1, q); - - auto *in_c2 = sycl::malloc_device>(1, q); - auto *in_c3 = sycl::malloc_device>(1, q); - auto *in_sc2 = sycl::malloc_device>(1, q); - 
auto *in_sc3 = sycl::malloc_device>(1, q); - auto *in_s2 = sycl::malloc_device>(1, q); - auto *in_s3 = sycl::malloc_device>(1, q); - auto *in_i2 = sycl::malloc_device>(1, q); - auto *in_i3 = sycl::malloc_device>(1, q); - auto *in_l2 = sycl::malloc_device>(1, q); - auto *in_l3 = sycl::malloc_device>(1, q); - auto *in_ll2 = sycl::malloc_device>(1, q); - auto *in_ll3 = sycl::malloc_device>(1, q); - auto *in_l4 = sycl::malloc_device>(1, q); - auto *in_ll4 = sycl::malloc_device>(1, q); - - auto *in_c4 = sycl::malloc_device>(1, q); - auto *in_sc4 = sycl::malloc_device>(1, q); - auto *in_s4 = sycl::malloc_device>(1, q); - auto *in_i4 = sycl::malloc_device>(1, q); - - auto *in_uc2 = sycl::malloc_device>(1, q); - auto *in_uc3 = sycl::malloc_device>(1, q); - auto *in_us2 = sycl::malloc_device>(1, q); - auto *in_us3 = sycl::malloc_device>(1, q); - auto *in_ui2 = sycl::malloc_device>(1, q); - auto *in_ui3 = sycl::malloc_device>(1, q); - auto *in_ul2 = sycl::malloc_device>(1, q); - auto *in_ul3 = sycl::malloc_device>(1, q); - auto *in_ull2 = sycl::malloc_device>(1, q); - auto *in_ull3 = sycl::malloc_device>(1, q); - auto *in_ul4 = sycl::malloc_device>(1, q); - auto *in_ull4 = sycl::malloc_device>(1, q); - - auto *in_uc4 = sycl::malloc_device>(1, q); - auto *in_us4 = sycl::malloc_device>(1, q); - auto *in_ui4 = sycl::malloc_device>(1, q); - - auto *in_h = sycl::malloc_device(1, q); - auto *in_f = sycl::malloc_device(1, q); - auto *in_d = sycl::malloc_device(1, q); - - auto *in_h2 = sycl::malloc_device>(1, q); - auto *in_h3 = sycl::malloc_device>(1, q); - auto *in_h4 = sycl::malloc_device>(1, q); - auto *in_f2 = sycl::malloc_device>(1, q); - auto *in_f3 = sycl::malloc_device>(1, q); - auto *in_f4 = sycl::malloc_device>(1, q); - auto *in_d2 = sycl::malloc_device>(1, q); - auto *in_d3 = sycl::malloc_device>(1, q); - auto *in_d4 = sycl::malloc_device>(1, q); - - q.wait(); - - q.submit([=](sycl::handler &h) { - h.single_task([=] { - //CHECK-OPAQUE: tail call half 
@llvm.nvvm.ldg.global.f.f16.p0(ptr %{{.*}}, i32 2) - auto cached_h = ldg(&in_h[0]); - //CHECK-OPAQUE: tail call noundef float @llvm.nvvm.ldg.global.f.f32.p0(ptr %{{.*}}, i32 4) - auto cached_f = ldg(&in_f[0]); - //CHECK-OPAQUE: tail call noundef double @llvm.nvvm.ldg.global.f.f64.p0(ptr %{{.*}}, i32 8) - auto cached_d = ldg(&in_d[0]); - - //CHECK-OPAQUE: tail call <2 x half> @llvm.nvvm.ldg.global.f.v2f16.p0(ptr %{{.*}}, i32 4) - auto cached_h2 = ldg(&in_h2[0]); - //CHECK-OPAQUE: tail call <2 x half> @llvm.nvvm.ldg.global.f.v2f16.p0(ptr %{{.*}}, i32 4) - //CHECK-OPAQUE: tail call half @llvm.nvvm.ldg.global.f.f16.p0(ptr nonnull %{{.*}}, i32 2) - auto cached_h3 = ldg(&in_h3[0]); - //CHECK-OPAQUE: tail call <2 x half> @llvm.nvvm.ldg.global.f.v2f16.p0(ptr %{{.*}}, i32 4) - //CHECK-OPAQUE: tail call <2 x half> @llvm.nvvm.ldg.global.f.v2f16.p0(ptr nonnull %{{.*}}, i32 4) - auto cached_h4 = ldg(&in_h4[0]); - //CHECK-OPAQUE: tail call <2 x float> @llvm.nvvm.ldg.global.f.v2f32.p0(ptr %{{.*}}, i32 8) - auto cached_f2 = ldg(&in_f2[0]); - //CHECK-OPAQUE: tail call <2 x float> @llvm.nvvm.ldg.global.f.v2f32.p0(ptr %{{.*}}, i32 8) - //CHECK-OPAQUE: tail call float @llvm.nvvm.ldg.global.f.f32.p0(ptr nonnull %{{.*}}, i32 4) - auto cached_f3 = ldg(&in_f3[0]); - //CHECK-OPAQUE: tail call <2 x double> @llvm.nvvm.ldg.global.f.v2f64.p0(ptr %{{.*}}, i32 16) - auto cached_d2 = ldg(&in_d2[0]); - //CHECK-OPAQUE: tail call <2 x double> @llvm.nvvm.ldg.global.f.v2f64.p0(ptr %{{.*}}, i32 16) - //CHECK-OPAQUE: tail call double @llvm.nvvm.ldg.global.f.f64.p0(ptr nonnull %{{.*}}, i32 8) - auto cached_d3 = ldg(&in_d3[0]); - //CHECK-OPAQUE: tail call <2 x double> @llvm.nvvm.ldg.global.f.v2f64.p0(ptr %{{.*}}, i32 16) - //CHECK-OPAQUE: tail call <2 x double> @llvm.nvvm.ldg.global.f.v2f64.p0(ptr nonnull %{{.*}}, i32 16) - auto cached_d4 = ldg(&in_d4[0]); - //CHECK-OPAQUE: tail call <4 x float> @llvm.nvvm.ldg.global.f.v4f32.p0(ptr %{{.*}}, i32 16) - auto cached_f4 = ldg(&in_f4[0]); - - // Unsigned 
variants are identical to signed variants, but this leads to - // correct behavior. - - //CHECK-OPAQUE: tail call noundef i8 @llvm.nvvm.ldg.global.i.i8.p0(ptr %{{.*}}, i32 1) - auto cached_c = ldg(&in_c[0]); - //CHECK-OPAQUE: tail call noundef i8 @llvm.nvvm.ldg.global.i.i8.p0(ptr %{{.*}}, i32 1) - auto cached_sc = ldg(&in_sc[0]); - //CHECK-OPAQUE: tail call noundef i16 @llvm.nvvm.ldg.global.i.i16.p0(ptr %{{.*}}, i32 2) - auto cached_s = ldg(&in_s[0]); - //CHECK-OPAQUE: tail call noundef i32 @llvm.nvvm.ldg.global.i.i32.p0(ptr %{{.*}}, i32 4) - auto cached_i = ldg(&in_i[0]); - //CHECK-OPAQUE: tail call noundef i64 @llvm.nvvm.ldg.global.i.i64.p0(ptr %{{.*}}, i32 8) - auto cached_l = ldg(&in_l[0]); - //CHECK-OPAQUE: tail call noundef i64 @llvm.nvvm.ldg.global.i.i64.p0(ptr %{{.*}}, i32 8) - auto cached_ll = ldg(&in_ll[0]); - //CHECK-OPAQUE: tail call noundef i8 @llvm.nvvm.ldg.global.i.i8.p0(ptr %{{.*}}, i32 1) - auto cached_uc = ldg(&in_uc[0]); - //CHECK-OPAQUE: tail call noundef i16 @llvm.nvvm.ldg.global.i.i16.p0(ptr %{{.*}}, i32 2) - auto cached_us = ldg(&in_us[0]); - //CHECK-OPAQUE: tail call noundef i32 @llvm.nvvm.ldg.global.i.i32.p0(ptr %{{.*}}, i32 4) - auto cached_ui = ldg(&in_ui[0]); - //CHECK-OPAQUE: tail call noundef i64 @llvm.nvvm.ldg.global.i.i64.p0(ptr %{{.*}}, i32 8) - auto cached_ul = ldg(&in_ul[0]); - //CHECK-OPAQUE: tail call noundef i64 @llvm.nvvm.ldg.global.i.i64.p0(ptr %{{.*}}, i32 8) - auto cached_ull = ldg(&in_ull[0]); - - //CHECK-OPAQUE: tail call <2 x i8> @llvm.nvvm.ldg.global.i.v2i8.p0(ptr %{{.*}}, i32 2) - auto cached_c2 = ldg(&in_c2[0]); - //CHECK-OPAQUE: tail call <2 x i8> @llvm.nvvm.ldg.global.i.v2i8.p0(ptr %{{.*}}, i32 2) - //CHECK-OPAQUE: tail call i8 @llvm.nvvm.ldg.global.i.i8.p0(ptr nonnull %{{.*}}, i32 1) - auto cached_c3 = ldg(&in_c3[0]); - //CHECK-OPAQUE: tail call <2 x i8> @llvm.nvvm.ldg.global.i.v2i8.p0(ptr %{{.*}}, i32 2) - auto cached_sc2 = ldg(&in_sc2[0]); - //CHECK-OPAQUE: tail call <2 x i8> @llvm.nvvm.ldg.global.i.v2i8.p0(ptr 
%{{.*}}, i32 2) - //CHECK-OPAQUE: tail call i8 @llvm.nvvm.ldg.global.i.i8.p0(ptr nonnull %{{.*}}, i32 1) - auto cached_sc3 = ldg(&in_sc3[0]); - //CHECK-OPAQUE: tail call <2 x i16> @llvm.nvvm.ldg.global.i.v2i16.p0(ptr %{{.*}}, i32 4) - auto cached_s2 = ldg(&in_s2[0]); - //CHECK-OPAQUE: tail call <2 x i16> @llvm.nvvm.ldg.global.i.v2i16.p0(ptr %{{.*}}, i32 4) - //CHECK-OPAQUE: tail call i16 @llvm.nvvm.ldg.global.i.i16.p0(ptr nonnull %{{.*}}, i32 2) - auto cached_s3 = ldg(&in_s3[0]); - //CHECK-OPAQUE: tail call <2 x i32> @llvm.nvvm.ldg.global.i.v2i32.p0(ptr %{{.*}}, i32 8) - auto cached_i2 = ldg(&in_i2[0]); - //CHECK-OPAQUE: tail call <2 x i32> @llvm.nvvm.ldg.global.i.v2i32.p0(ptr %{{.*}}, i32 8) - //CHECK-OPAQUE: tail call i32 @llvm.nvvm.ldg.global.i.i32.p0(ptr nonnull %{{.*}}, i32 4) - auto cached_i3 = ldg(&in_i3[0]); - //CHECK-OPAQUE: tail call <2 x i{{32|64}}> @llvm.nvvm.ldg.global.i.v2i{{32|64}}.p0(ptr %{{.*}}, i32 {{8|16}}) - auto cached_l2 = ldg(&in_l2[0]); - //CHECK-OPAQUE: tail call <2 x i{{32|64}}> @llvm.nvvm.ldg.global.i.v2i{{32|64}}.p0(ptr %{{.*}}, i32 {{8|16}}) - //CHECK-OPAQUE: tail call i64 @llvm.nvvm.ldg.global.i.i64.p0(ptr nonnull %{{.*}}, i32 8) - auto cached_l3 = ldg(&in_l3[0]); - //CHECK-OPAQUE: tail call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0(ptr %{{.*}}, i32 16) - auto cached_ll2 = ldg(&in_ll2[0]); - //CHECK-OPAQUE: tail call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0(ptr %{{.*}}, i32 16) - //CHECK-OPAQUE: tail call i64 @llvm.nvvm.ldg.global.i.i64.p0(ptr nonnull %{{.*}}, i32 8) - auto cached_ll3 = ldg(&in_ll3[0]); - //CHECK-OPAQUE: tail call <2 x i{{32|64}}> @llvm.nvvm.ldg.global.i.v2i{{32|64}}.p0(ptr %{{.*}}, i32 {{8|16}}) - //CHECK-OPAQUE: tail call <2 x i{{32|64}}> @llvm.nvvm.ldg.global.i.v2i{{32|64}}.p0(ptr nonnull %{{.*}}, i32 {{8|16}}) - auto cached_l4 = ldg(&in_l4[0]); - //CHECK-OPAQUE: tail call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0(ptr %{{.*}}, i32 16) - //CHECK-OPAQUE: tail call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0(ptr 
nonnull %{{.*}}, i32 16) - auto cached_ll4 = ldg(&in_ll4[0]); - //CHECK-OPAQUE: tail call <2 x i8> @llvm.nvvm.ldg.global.i.v2i8.p0(ptr %{{.*}}, i32 2) - auto cached_uc2 = ldg(&in_uc2[0]); - //CHECK-OPAQUE: tail call <2 x i8> @llvm.nvvm.ldg.global.i.v2i8.p0(ptr %{{.*}}, i32 2) - //CHECK-OPAQUE: tail call i8 @llvm.nvvm.ldg.global.i.i8.p0(ptr nonnull %{{.*}}, i32 1) - auto cached_uc3 = ldg(&in_uc3[0]); - //CHECK-OPAQUE: tail call <2 x i16> @llvm.nvvm.ldg.global.i.v2i16.p0(ptr %{{.*}}, i32 4) - auto cached_us2 = ldg(&in_us2[0]); - //CHECK-OPAQUE: tail call <2 x i16> @llvm.nvvm.ldg.global.i.v2i16.p0(ptr %{{.*}}, i32 4) - //CHECK-OPAQUE: tail call i16 @llvm.nvvm.ldg.global.i.i16.p0(ptr nonnull %{{.*}}, i32 2) - auto cached_us3 = ldg(&in_us3[0]); - //CHECK-OPAQUE: tail call <2 x i32> @llvm.nvvm.ldg.global.i.v2i32.p0(ptr %{{.*}}, i32 8) - auto cached_ui2 = ldg(&in_ui2[0]); - //CHECK-OPAQUE: tail call <2 x i32> @llvm.nvvm.ldg.global.i.v2i32.p0(ptr %{{.*}}, i32 8) - //CHECK-OPAQUE: tail call i32 @llvm.nvvm.ldg.global.i.i32.p0(ptr nonnull %{{.*}}, i32 4) - auto cached_ui3 = ldg(&in_ui3[0]); - //CHECK-OPAQUE: tail call <2 x i{{64|32}}> @llvm.nvvm.ldg.global.i.v2i{{64|32}}.p0(ptr %{{.*}}, i32 {{8|16}}) - auto cached_ul2 = ldg(&in_ul2[0]); - //CHECK-OPAQUE: tail call <2 x i{{64|32}}> @llvm.nvvm.ldg.global.i.v2i{{64|32}}.p0(ptr %{{.*}}, i32 {{8|16}}) - //CHECK-OPAQUE: tail call i64 @llvm.nvvm.ldg.global.i.i64.p0(ptr nonnull %{{.*}}, i32 8) - auto cached_ul3 = ldg(&in_ul3[0]); - //CHECK-OPAQUE: tail call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0(ptr %{{.*}}, i32 16) - auto cached_ull2 = ldg(&in_ull2[0]); - //CHECK-OPAQUE: tail call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0(ptr %{{.*}}, i32 16) - //CHECK-OPAQUE: tail call i64 @llvm.nvvm.ldg.global.i.i64.p0(ptr nonnull %{{.*}}, i32 8) - auto cached_ull3 = ldg(&in_ull3[0]); - //CHECK-OPAQUE: tail call <2 x i{{64|32}}> @llvm.nvvm.ldg.global.i.v2i{{64|32}}.p0(ptr %{{.*}}, i32 {{8|16}}) - //CHECK-OPAQUE: tail call <2 x i{{64|32}}> 
@llvm.nvvm.ldg.global.i.v2i{{64|32}}.p0(ptr nonnull %{{.*}}, i32 {{8|16}}) - auto cached_ul4 = ldg(&in_ul4[0]); - //CHECK-OPAQUE: tail call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0(ptr %{{.*}}, i32 16) - //CHECK-OPAQUE: tail call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0(ptr nonnull %{{.*}}, i32 16) - auto cached_ull4 = ldg(&in_ull4[0]); - - //CHECK-OPAQUE: tail call <4 x i8> @llvm.nvvm.ldg.global.i.v4i8.p0(ptr %{{.*}}, i32 4) - auto cached_c4 = ldg(&in_c4[0]); - //CHECK-OPAQUE: tail call <4 x i8> @llvm.nvvm.ldg.global.i.v4i8.p0(ptr %{{.*}}, i32 4) - auto cached_sc4 = ldg(&in_sc4[0]); - //CHECK-OPAQUE: tail call <4 x i16> @llvm.nvvm.ldg.global.i.v4i16.p0(ptr %{{.*}}, i32 8) - auto cached_s4 = ldg(&in_s4[0]); - //CHECK-OPAQUE: tail call <4 x i32> @llvm.nvvm.ldg.global.i.v4i32.p0(ptr %{{.*}}, i32 16) - auto cached_i4 = ldg(&in_i4[0]); - - //CHECK-OPAQUE: tail call <4 x i8> @llvm.nvvm.ldg.global.i.v4i8.p0(ptr %{{.*}}, i32 4) - auto cached_uc4 = ldg(&in_uc4[0]); - //CHECK-OPAQUE: tail call <4 x i16> @llvm.nvvm.ldg.global.i.v4i16.p0(ptr %{{.*}}, i32 8) - auto cached_us4 = ldg(&in_us4[0]); - //CHECK-OPAQUE: tail call <4 x i32> @llvm.nvvm.ldg.global.i.v4i32.p0(ptr %{{.*}}, i32 16) - auto cached_ui4 = ldg(&in_ui4[0]); - }); - }); - - q.wait(); - - free(in_h, q); - free(in_f, q); - free(in_d, q); - free(in_h2, q); - free(in_h3, q); - free(in_h4, q); - free(in_f2, q); - free(in_f3, q); - free(in_f4, q); - free(in_d2, q); - free(in_d3, q); - free(in_d4, q); - free(in_c, q); - free(in_sc, q); - free(in_s, q); - free(in_i, q); - free(in_l, q); - free(in_ll, q); - free(in_uc, q); - free(in_us, q); - free(in_ui, q); - free(in_ul, q); - free(in_ull, q); - free(in_c2, q); - free(in_c3, q); - free(in_sc2, q); - free(in_sc3, q); - free(in_s2, q); - free(in_s3, q); - free(in_i2, q); - free(in_i3, q); - free(in_l2, q); - free(in_l3, q); - free(in_ll2, q); - free(in_ll3, q); - free(in_l4, q); - free(in_ll4, q); - free(in_uc2, q); - free(in_uc3, q); - free(in_us2, q); - free(in_us3, 
q); - free(in_ui2, q); - free(in_ui3, q); - free(in_ul2, q); - free(in_ul3, q); - free(in_ull2, q); - free(in_ull3, q); - free(in_ul4, q); - free(in_ull4, q); - free(in_c4, q); - free(in_sc4, q); - free(in_s4, q); - free(in_i4, q); - free(in_uc4, q); - free(in_us4, q); - free(in_ui4, q); - - return 0; -}; +// CHECK-OPAQUE: tail call half @llvm.nvvm.ldg.global.f.f16.p0(ptr %{{.*}}, i32 2) +template SYCL_EXTERNAL half +sycl::ext::oneapi::experimental::cuda::ldg(const half *); +// CHECK-OPAQUE: tail call float @llvm.nvvm.ldg.global.f.f32.p0(ptr %{{.*}}, i32 4) +template SYCL_EXTERNAL float +sycl::ext::oneapi::experimental::cuda::ldg(const float *); +// CHECK-OPAQUE: tail call double @llvm.nvvm.ldg.global.f.f64.p0(ptr %{{.*}}, i32 8) +template SYCL_EXTERNAL double +sycl::ext::oneapi::experimental::cuda::ldg(const double *); + +// CHECK-OPAQUE: tail call <2 x half> @llvm.nvvm.ldg.global.f.v2f16.p0(ptr %{{.*}}, i32 4) +template SYCL_EXTERNAL sycl::vec +sycl::ext::oneapi::experimental::cuda::ldg(const sycl::vec *); +// CHECK-OPAQUE: tail call <2 x half> @llvm.nvvm.ldg.global.f.v2f16.p0(ptr %{{.*}}, i32 4) +// CHECK-OPAQUE: tail call half @llvm.nvvm.ldg.global.f.f16.p0(ptr nonnull %{{.*}}, i32 2) +template SYCL_EXTERNAL sycl::vec +sycl::ext::oneapi::experimental::cuda::ldg(const sycl::vec *); +// CHECK-OPAQUE: tail call <2 x half> @llvm.nvvm.ldg.global.f.v2f16.p0(ptr %{{.*}}, i32 4) +// CHECK-OPAQUE: tail call <2 x half> @llvm.nvvm.ldg.global.f.v2f16.p0(ptr nonnull %{{.*}}, i32 4) +template SYCL_EXTERNAL sycl::vec +sycl::ext::oneapi::experimental::cuda::ldg(const sycl::vec *); +// CHECK-OPAQUE: tail call <2 x float> @llvm.nvvm.ldg.global.f.v2f32.p0(ptr %{{.*}}, i32 8) +template SYCL_EXTERNAL sycl::vec +sycl::ext::oneapi::experimental::cuda::ldg(const sycl::vec *); +// CHECK-OPAQUE: tail call <2 x float> @llvm.nvvm.ldg.global.f.v2f32.p0(ptr %{{.*}}, i32 8) +// CHECK-OPAQUE: tail call float @llvm.nvvm.ldg.global.f.f32.p0(ptr nonnull %{{.*}}, i32 4) +template SYCL_EXTERNAL 
sycl::vec +sycl::ext::oneapi::experimental::cuda::ldg(const sycl::vec *); +// CHECK-OPAQUE: tail call <4 x float> @llvm.nvvm.ldg.global.f.v4f32.p0(ptr %{{.*}}, i32 16) +template SYCL_EXTERNAL sycl::vec +sycl::ext::oneapi::experimental::cuda::ldg(const sycl::vec *); +// CHECK-OPAQUE: tail call <2 x double> @llvm.nvvm.ldg.global.f.v2f64.p0(ptr %{{.*}}, i32 16) +template SYCL_EXTERNAL sycl::vec +sycl::ext::oneapi::experimental::cuda::ldg(const sycl::vec *); +// CHECK-OPAQUE: tail call <2 x double> @llvm.nvvm.ldg.global.f.v2f64.p0(ptr %{{.*}}, i32 16) +// CHECK-OPAQUE: tail call double @llvm.nvvm.ldg.global.f.f64.p0(ptr nonnull %{{.*}}, i32 8) +template SYCL_EXTERNAL sycl::vec +sycl::ext::oneapi::experimental::cuda::ldg(const sycl::vec *); +// CHECK-OPAQUE: tail call <2 x double> @llvm.nvvm.ldg.global.f.v2f64.p0(ptr %{{.*}}, i32 16) +// CHECK-OPAQUE: tail call <2 x double> @llvm.nvvm.ldg.global.f.v2f64.p0(ptr nonnull %{{.*}}, i32 16) +template SYCL_EXTERNAL sycl::vec +sycl::ext::oneapi::experimental::cuda::ldg(const sycl::vec *); + +// Unsigned variants are identical to signed variants, but this leads to +// correct behavior. 
+ +// CHECK-OPAQUE: tail call i8 @llvm.nvvm.ldg.global.i.i8.p0(ptr %{{.*}}, i32 1) +template SYCL_EXTERNAL char +sycl::ext::oneapi::experimental::cuda::ldg(const char *); +// CHECK-OPAQUE: tail call i8 @llvm.nvvm.ldg.global.i.i8.p0(ptr %{{.*}}, i32 1) +template SYCL_EXTERNAL signed char +sycl::ext::oneapi::experimental::cuda::ldg(const signed char *); +// CHECK-OPAQUE: tail call i16 @llvm.nvvm.ldg.global.i.i16.p0(ptr %{{.*}}, i32 2) +template SYCL_EXTERNAL short +sycl::ext::oneapi::experimental::cuda::ldg(const short *); +// CHECK-OPAQUE: tail call i32 @llvm.nvvm.ldg.global.i.i32.p0(ptr %{{.*}}, i32 4) +template SYCL_EXTERNAL int +sycl::ext::oneapi::experimental::cuda::ldg(const int *); +// CHECK-OPAQUE: tail call i64 @llvm.nvvm.ldg.global.i.i64.p0(ptr %{{.*}}, i32 8) +template SYCL_EXTERNAL long +sycl::ext::oneapi::experimental::cuda::ldg(const long *); +// CHECK-OPAQUE: tail call i64 @llvm.nvvm.ldg.global.i.i64.p0(ptr %{{.*}}, i32 8) +template SYCL_EXTERNAL long long +sycl::ext::oneapi::experimental::cuda::ldg(const long long *); + +// CHECK-OPAQUE: tail call i8 @llvm.nvvm.ldg.global.i.i8.p0(ptr %{{.*}}, i32 1) +template SYCL_EXTERNAL unsigned char +sycl::ext::oneapi::experimental::cuda::ldg(const unsigned char *); +// CHECK-OPAQUE: tail call i16 @llvm.nvvm.ldg.global.i.i16.p0(ptr %{{.*}}, i32 2) +template SYCL_EXTERNAL unsigned short +sycl::ext::oneapi::experimental::cuda::ldg(const unsigned short *); +// CHECK-OPAQUE: tail call i32 @llvm.nvvm.ldg.global.i.i32.p0(ptr %{{.*}}, i32 4) +template SYCL_EXTERNAL unsigned int +sycl::ext::oneapi::experimental::cuda::ldg(const unsigned int *); +// CHECK-OPAQUE: tail call i64 @llvm.nvvm.ldg.global.i.i64.p0(ptr %{{.*}}, i32 8) +template SYCL_EXTERNAL unsigned long +sycl::ext::oneapi::experimental::cuda::ldg(const unsigned long *); +// CHECK-OPAQUE: tail call i64 @llvm.nvvm.ldg.global.i.i64.p0(ptr %{{.*}}, i32 8) +template SYCL_EXTERNAL unsigned long long +sycl::ext::oneapi::experimental::cuda::ldg(const unsigned long long 
*); + +// CHECK-OPAQUE: tail call <2 x i8> @llvm.nvvm.ldg.global.i.v2i8.p0(ptr %{{.*}}, i32 2) +template SYCL_EXTERNAL sycl::vec +sycl::ext::oneapi::experimental::cuda::ldg(const sycl::vec *); +// CHECK-OPAQUE: tail call <2 x i8> @llvm.nvvm.ldg.global.i.v2i8.p0(ptr %{{.*}}, i32 2) +// CHECK-OPAQUE: tail call i8 @llvm.nvvm.ldg.global.i.i8.p0(ptr nonnull %{{.*}}, i32 1) +template SYCL_EXTERNAL sycl::vec +sycl::ext::oneapi::experimental::cuda::ldg(const sycl::vec *); +// CHECK-OPAQUE: tail call <2 x i8> @llvm.nvvm.ldg.global.i.v2i8.p0(ptr %{{.*}}, i32 2) +template SYCL_EXTERNAL sycl::vec +sycl::ext::oneapi::experimental::cuda::ldg(const sycl::vec *); +// CHECK-OPAQUE: tail call <2 x i8> @llvm.nvvm.ldg.global.i.v2i8.p0(ptr %{{.*}}, i32 2) +// CHECK-OPAQUE: tail call i8 @llvm.nvvm.ldg.global.i.i8.p0(ptr nonnull %{{.*}}, i32 1) +template SYCL_EXTERNAL sycl::vec +sycl::ext::oneapi::experimental::cuda::ldg(const sycl::vec *); +// CHECK-OPAQUE: tail call <2 x i16> @llvm.nvvm.ldg.global.i.v2i16.p0(ptr %{{.*}}, i32 4) +template SYCL_EXTERNAL sycl::vec +sycl::ext::oneapi::experimental::cuda::ldg(const sycl::vec *); +// CHECK-OPAQUE: tail call <2 x i16> @llvm.nvvm.ldg.global.i.v2i16.p0(ptr %{{.*}}, i32 4) +// CHECK-OPAQUE: tail call i16 @llvm.nvvm.ldg.global.i.i16.p0(ptr nonnull %{{.*}}, i32 2) +template SYCL_EXTERNAL sycl::vec +sycl::ext::oneapi::experimental::cuda::ldg(const sycl::vec *); +// CHECK-OPAQUE: tail call <2 x i32> @llvm.nvvm.ldg.global.i.v2i32.p0(ptr %{{.*}}, i32 8) +template SYCL_EXTERNAL sycl::vec +sycl::ext::oneapi::experimental::cuda::ldg(const sycl::vec *); +// CHECK-OPAQUE: tail call <2 x i32> @llvm.nvvm.ldg.global.i.v2i32.p0(ptr %{{.*}}, i32 8) +// CHECK-OPAQUE: tail call i32 @llvm.nvvm.ldg.global.i.i32.p0(ptr nonnull %{{.*}}, i32 4) +template SYCL_EXTERNAL sycl::vec +sycl::ext::oneapi::experimental::cuda::ldg(const sycl::vec *); +// CHECK-OPAQUE: tail call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0(ptr %{{.*}}, i32 16) +template SYCL_EXTERNAL sycl::vec 
+sycl::ext::oneapi::experimental::cuda::ldg(const sycl::vec *); +// CHECK-OPAQUE: tail call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0(ptr %{{.*}}, i32 16) +// CHECK-OPAQUE: tail call i64 @llvm.nvvm.ldg.global.i.i64.p0(ptr nonnull %{{.*}}, i32 8) +template SYCL_EXTERNAL sycl::vec +sycl::ext::oneapi::experimental::cuda::ldg(const sycl::vec *); +// CHECK-OPAQUE: tail call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0(ptr %{{.*}}, i32 16) +template SYCL_EXTERNAL sycl::vec +sycl::ext::oneapi::experimental::cuda::ldg(const sycl::vec *); +// CHECK-OPAQUE: tail call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0(ptr %{{.*}}, i32 16) +// CHECK-OPAQUE: tail call i64 @llvm.nvvm.ldg.global.i.i64.p0(ptr nonnull %{{.*}}, i32 8) +template SYCL_EXTERNAL sycl::vec +sycl::ext::oneapi::experimental::cuda::ldg(const sycl::vec *); +// CHECK-OPAQUE: tail call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0(ptr %{{.*}}, i32 16) +// CHECK-OPAQUE: tail call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0(ptr nonnull %{{.*}}, i32 16) +template SYCL_EXTERNAL sycl::vec +sycl::ext::oneapi::experimental::cuda::ldg(const sycl::vec *); +// CHECK-OPAQUE: tail call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0(ptr %{{.*}}, i32 16) +// CHECK-OPAQUE: tail call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0(ptr nonnull %{{.*}}, i32 16) +template SYCL_EXTERNAL sycl::vec +sycl::ext::oneapi::experimental::cuda::ldg(const sycl::vec *); + +// CHECK-OPAQUE: tail call <2 x i8> @llvm.nvvm.ldg.global.i.v2i8.p0(ptr %{{.*}}, i32 2) +template SYCL_EXTERNAL sycl::vec +sycl::ext::oneapi::experimental::cuda::ldg(const sycl::vec *); +// CHECK-OPAQUE: tail call <2 x i8> @llvm.nvvm.ldg.global.i.v2i8.p0(ptr %{{.*}}, i32 2) +// CHECK-OPAQUE: tail call i8 @llvm.nvvm.ldg.global.i.i8.p0(ptr nonnull %{{.*}}, i32 1) +template SYCL_EXTERNAL sycl::vec +sycl::ext::oneapi::experimental::cuda::ldg(const sycl::vec *); +// CHECK-OPAQUE: tail call <2 x i16> @llvm.nvvm.ldg.global.i.v2i16.p0(ptr %{{.*}}, i32 4) +template SYCL_EXTERNAL sycl::vec 
+sycl::ext::oneapi::experimental::cuda::ldg( + const sycl::vec *); +// CHECK-OPAQUE: tail call <2 x i16> @llvm.nvvm.ldg.global.i.v2i16.p0(ptr %{{.*}}, i32 4) +// CHECK-OPAQUE: tail call i16 @llvm.nvvm.ldg.global.i.i16.p0(ptr nonnull %{{.*}}, i32 2) +template SYCL_EXTERNAL sycl::vec +sycl::ext::oneapi::experimental::cuda::ldg( + const sycl::vec *); +// CHECK-OPAQUE: tail call <2 x i32> @llvm.nvvm.ldg.global.i.v2i32.p0(ptr %{{.*}}, i32 8) +template SYCL_EXTERNAL sycl::vec +sycl::ext::oneapi::experimental::cuda::ldg(const sycl::vec *); + +// CHECK-OPAQUE: tail call <2 x i32> @llvm.nvvm.ldg.global.i.v2i32.p0(ptr %{{.*}}, i32 8) +// CHECK-OPAQUE: tail call i32 @llvm.nvvm.ldg.global.i.i32.p0(ptr nonnull %{{.*}}, i32 4) +template SYCL_EXTERNAL sycl::vec +sycl::ext::oneapi::experimental::cuda::ldg(const sycl::vec *); +// CHECK-OPAQUE: tail call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0(ptr %{{.*}}, i32 16) +template SYCL_EXTERNAL sycl::vec +sycl::ext::oneapi::experimental::cuda::ldg(const sycl::vec *); +// CHECK-OPAQUE: tail call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0(ptr %{{.*}}, i32 16) +// CHECK-OPAQUE: tail call i64 @llvm.nvvm.ldg.global.i.i64.p0(ptr nonnull %{{.*}}, i32 8) +template SYCL_EXTERNAL sycl::vec +sycl::ext::oneapi::experimental::cuda::ldg(const sycl::vec *); +// CHECK-OPAQUE: tail call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0(ptr %{{.*}}, i32 16) +template SYCL_EXTERNAL sycl::vec +sycl::ext::oneapi::experimental::cuda::ldg( + const sycl::vec *); +// CHECK-OPAQUE: tail call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0(ptr %{{.*}}, i32 16) +// CHECK-OPAQUE: tail call i64 @llvm.nvvm.ldg.global.i.i64.p0(ptr nonnull %{{.*}}, i32 8) +template SYCL_EXTERNAL sycl::vec +sycl::ext::oneapi::experimental::cuda::ldg( + const sycl::vec *); +// CHECK-OPAQUE: tail call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0(ptr %{{.*}}, i32 16) +// CHECK-OPAQUE: tail call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0(ptr nonnull %{{.*}}, i32 16) +template SYCL_EXTERNAL sycl::vec 
+sycl::ext::oneapi::experimental::cuda::ldg(const sycl::vec *); +// CHECK-OPAQUE: tail call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0(ptr %{{.*}}, i32 16) +// CHECK-OPAQUE: tail call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0(ptr nonnull %{{.*}}, i32 16) +template SYCL_EXTERNAL sycl::vec +sycl::ext::oneapi::experimental::cuda::ldg( + const sycl::vec *); + +// CHECK-OPAQUE: tail call <4 x i8> @llvm.nvvm.ldg.global.i.v4i8.p0(ptr %{{.*}}, i32 4) +template SYCL_EXTERNAL sycl::vec +sycl::ext::oneapi::experimental::cuda::ldg(const sycl::vec *); +// CHECK-OPAQUE: tail call <4 x i8> @llvm.nvvm.ldg.global.i.v4i8.p0(ptr %{{.*}}, i32 4) +template SYCL_EXTERNAL sycl::vec +sycl::ext::oneapi::experimental::cuda::ldg(const sycl::vec *); +// CHECK-OPAQUE: tail call <4 x i16> @llvm.nvvm.ldg.global.i.v4i16.p0(ptr %{{.*}}, i32 8) +template SYCL_EXTERNAL sycl::vec +sycl::ext::oneapi::experimental::cuda::ldg(const sycl::vec *); +// CHECK-OPAQUE: tail call <4 x i32> @llvm.nvvm.ldg.global.i.v4i32.p0(ptr %{{.*}}, i32 16) +template SYCL_EXTERNAL sycl::vec +sycl::ext::oneapi::experimental::cuda::ldg(const sycl::vec *); + +// CHECK-OPAQUE: tail call <4 x i8> @llvm.nvvm.ldg.global.i.v4i8.p0(ptr %{{.*}}, i32 4) +template SYCL_EXTERNAL sycl::vec +sycl::ext::oneapi::experimental::cuda::ldg(const sycl::vec *); +// CHECK-OPAQUE: tail call <4 x i16> @llvm.nvvm.ldg.global.i.v4i16.p0(ptr %{{.*}}, i32 8) +template SYCL_EXTERNAL sycl::vec +sycl::ext::oneapi::experimental::cuda::ldg( + const sycl::vec *); +// CHECK-OPAQUE: tail call <4 x i32> @llvm.nvvm.ldg.global.i.v4i32.p0(ptr %{{.*}}, i32 16) +template SYCL_EXTERNAL sycl::vec +sycl::ext::oneapi::experimental::cuda::ldg(const sycl::vec *); diff --git a/sycl/test/check_device_code/cuda/matrix/matrix-nvptx-bfloat16-test.cpp b/sycl/test/check_device_code/cuda/matrix/matrix-nvptx-bfloat16-test.cpp index 9f99cb6ea9457..dc6cd06270433 100644 --- a/sycl/test/check_device_code/cuda/matrix/matrix-nvptx-bfloat16-test.cpp +++ 
b/sycl/test/check_device_code/cuda/matrix/matrix-nvptx-bfloat16-test.cpp @@ -10,215 +10,230 @@ using sycl::ext::oneapi::bfloat16; constexpr int stride = 16; -int main() { - - buffer bufA(nullptr, range<1>(1)); - buffer bufB(nullptr, range<1>(1)); - buffer bufC(nullptr, range<1>(1)); - buffer bufD(nullptr, range<1>(1)); - - queue q; - - q.submit([&](handler &cgh) { - sycl::accessor - accA(bufA, cgh); - sycl::accessor - accB(bufB, cgh); - sycl::accessor - accC(bufC, cgh); - sycl::accessor - accD(bufD, cgh); - - cgh.parallel_for( - nd_range<2>({1, 32}, {1, 32}), - [=](nd_item<2> item) [[sycl::reqd_work_group_size(1, 1, 32)]] { - sycl::sub_group sg = item.get_sub_group(); - - joint_matrix sub_c{}; - joint_matrix - sub_a{}; - joint_matrix - sub_b{}; - - // CHECK-OPAQUE: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m16n16k16.load.c.row.stride.f32.p1(ptr addrspace(1) %{{.*}}, i32 16) - joint_matrix_load( - sg, sub_c, accC.template get_multi_ptr(), - stride, layout::row_major); - // CHECK-OPAQUE: tail call { i32, i32, i32, i32 } @llvm.nvvm.wmma.m16n16k16.load.a.row.stride.bf16.p0(ptr %{{.*}}, i32 16) - joint_matrix_load( - sg, sub_a, accA.template get_multi_ptr(), - stride); - // CHECK-OPAQUE: tail call { i32, i32, i32, i32 } @llvm.nvvm.wmma.m16n16k16.load.b.row.stride.bf16.p0(ptr %{{.*}}, i32 16) - joint_matrix_load( - sg, sub_b, accB.template get_multi_ptr(), - stride); - // CHECK-OPAQUE: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m16n16k16.mma.row.row.bf16(i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}) - joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); - // CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m16n16k16.store.d.row.stride.f32.p1(ptr addrspace(1) %{{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, 
float {{.*}}, float {{.*}}, float {{.*}}, i32 16) - joint_matrix_store( - sg, sub_c, accD.template get_multi_ptr(), - stride, layout::row_major); - }); - - cgh.parallel_for( - nd_range<2>({1, 32}, {1, 32}), - [=](nd_item<2> item) [[sycl::reqd_work_group_size(1, 1, 32)]] { - sycl::sub_group sg = item.get_sub_group(); - - joint_matrix sub_c{}; - joint_matrix - sub_a{}; - joint_matrix - sub_b{}; - - // CHECK-OPAQUE: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m16n16k16.load.c.col.stride.f32.p1(ptr addrspace(1) %{{.*}}, i32 16) - joint_matrix_load( - sg, sub_c, accC.template get_multi_ptr(), - stride, layout::col_major); - // CHECK-OPAQUE: tail call { i32, i32, i32, i32 } @llvm.nvvm.wmma.m16n16k16.load.a.col.stride.bf16.p0(ptr %{{.*}}, i32 16) - joint_matrix_load( - sg, sub_a, accA.template get_multi_ptr(), - stride); - // CHECK-OPAQUE: tail call { i32, i32, i32, i32 } @llvm.nvvm.wmma.m16n16k16.load.b.col.stride.bf16.p0(ptr %{{.*}}, i32 16) - joint_matrix_load( - sg, sub_b, accB.template get_multi_ptr(), - stride); - // CHECK-OPAQUE: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m16n16k16.mma.col.col.bf16(i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}) - joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); - // CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m16n16k16.store.d.col.stride.f32.p1(ptr addrspace(1) %{{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, i32 16) - joint_matrix_store( - sg, sub_c, accD.template get_multi_ptr(), - stride, layout::col_major); - }); - - cgh.parallel_for( - nd_range<2>({1, 32}, {1, 32}), - [=](nd_item<2> item) [[sycl::reqd_work_group_size(1, 1, 32)]] { - sycl::sub_group sg = item.get_sub_group(); - - joint_matrix sub_c{}; - joint_matrix 
- sub_a{}; - joint_matrix - sub_b{}; - - // CHECK-OPAQUE: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m32n8k16.load.c.row.stride.f32.p1(ptr addrspace(1) %{{.*}}, i32 16) - joint_matrix_load( - sg, sub_c, accC.template get_multi_ptr(), - stride, layout::row_major); - // CHECK-OPAQUE: tail call { i32, i32, i32, i32, i32, i32, i32, i32 } @llvm.nvvm.wmma.m32n8k16.load.a.row.stride.bf16.p0(ptr %{{.*}}, i32 16) - joint_matrix_load( - sg, sub_a, accA.template get_multi_ptr(), - stride); - // CHECK-OPAQUE: tail call { i32, i32 } @llvm.nvvm.wmma.m32n8k16.load.b.row.stride.bf16.p0(ptr %{{.*}}, i32 16) - joint_matrix_load( - sg, sub_b, accB.template get_multi_ptr(), - stride); - // CHECK-OPAQUE: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m32n8k16.mma.row.row.bf16(i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}) - joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); - // CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m32n8k16.store.d.row.stride.f32.p1(ptr addrspace(1) %{{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, i32 16) - joint_matrix_store( - sg, sub_c, accD.template get_multi_ptr(), - stride, layout::row_major); - }); - - cgh.parallel_for( - nd_range<2>({1, 32}, {1, 32}), - [=](nd_item<2> item) [[sycl::reqd_work_group_size(1, 1, 32)]] { - sycl::sub_group sg = item.get_sub_group(); - - joint_matrix sub_c{}; - joint_matrix - sub_a{}; - joint_matrix - sub_b{}; - - // CHECK-OPAQUE: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m32n8k16.load.c.col.stride.f32.p1(ptr addrspace(1) %{{.*}}, i32 16) - joint_matrix_load( - sg, sub_c, accC.template get_multi_ptr(), - stride, layout::col_major); - // CHECK-OPAQUE: tail 
call { i32, i32, i32, i32, i32, i32, i32, i32 } @llvm.nvvm.wmma.m32n8k16.load.a.col.stride.bf16.p0(ptr %{{.*}}, i32 16) - joint_matrix_load( - sg, sub_a, accA.template get_multi_ptr(), - stride); - // CHECK-OPAQUE: tail call { i32, i32 } @llvm.nvvm.wmma.m32n8k16.load.b.col.stride.bf16.p0(ptr %{{.*}}, i32 16) - joint_matrix_load( - sg, sub_b, accB.template get_multi_ptr(), - stride); - // CHECK-OPAQUE: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m32n8k16.mma.col.col.bf16(i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}) - joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); - // CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m32n8k16.store.d.col.stride.f32.p1(ptr addrspace(1) %{{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, i32 16) - joint_matrix_store( - sg, sub_c, accD.template get_multi_ptr(), - stride, layout::col_major); - }); - - cgh.parallel_for( - nd_range<2>({1, 32}, {1, 32}), - [=](nd_item<2> item) [[sycl::reqd_work_group_size(1, 1, 32)]] { - sycl::sub_group sg = item.get_sub_group(); - - joint_matrix sub_c{}; - joint_matrix - sub_a{}; - joint_matrix - sub_b{}; - - // CHECK-OPAQUE: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m8n32k16.load.c.row.stride.f32.p1(ptr addrspace(1) %{{.*}}, i32 16) - joint_matrix_load( - sg, sub_c, accC.template get_multi_ptr(), - stride, layout::row_major); - // CHECK-OPAQUE: tail call { i32, i32 } @llvm.nvvm.wmma.m8n32k16.load.a.row.stride.bf16.p0(ptr %{{.*}}, i32 16) - joint_matrix_load( - sg, sub_a, accA.template get_multi_ptr(), - stride); - // CHECK-OPAQUE: tail call { i32, i32, i32, i32, i32, i32, i32, i32 } @llvm.nvvm.wmma.m8n32k16.load.b.row.stride.bf16.p0(ptr %{{.*}}, i32 16) - 
joint_matrix_load( - sg, sub_b, accB.template get_multi_ptr(), - stride); - // CHECK-OPAQUE: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m8n32k16.mma.row.row.bf16(i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}) - joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); - // CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m8n32k16.store.d.row.stride.f32.p1(ptr addrspace(1) %{{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, i32 16) - joint_matrix_store( - sg, sub_c, accD.template get_multi_ptr(), - stride, layout::row_major); - }); - - cgh.parallel_for( - nd_range<2>({1, 32}, {1, 32}), - [=](nd_item<2> item) [[sycl::reqd_work_group_size(1, 1, 32)]] { - sycl::sub_group sg = item.get_sub_group(); - - joint_matrix sub_c{}; - joint_matrix - sub_a{}; - joint_matrix - sub_b{}; - - // CHECK-OPAQUE: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m8n32k16.load.c.col.stride.f32.p1(ptr addrspace(1) %{{.*}}, i32 16) - joint_matrix_load( - sg, sub_c, accC.template get_multi_ptr(), - stride, layout::col_major); - // CHECK-OPAQUE: tail call { i32, i32 } @llvm.nvvm.wmma.m8n32k16.load.a.col.stride.bf16.p0(ptr %{{.*}}, i32 16) - joint_matrix_load( - sg, sub_a, accA.template get_multi_ptr(), - stride); - // CHECK-OPAQUE: tail call { i32, i32, i32, i32, i32, i32, i32, i32 } @llvm.nvvm.wmma.m8n32k16.load.b.col.stride.bf16.p0(ptr %{{.*}}, i32 16) - joint_matrix_load( - sg, sub_b, accB.template get_multi_ptr(), - stride); - // CHECK-OPAQUE: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m8n32k16.mma.col.col.bf16(i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, float 
{{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}) - joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); - // CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m8n32k16.store.d.col.stride.f32.p1(ptr addrspace(1) %{{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, i32 16) - joint_matrix_store( - sg, sub_c, accD.template get_multi_ptr(), - stride, layout::col_major); - }); - }); - - return 0; -}; +SYCL_EXTERNAL [[sycl::reqd_work_group_size(1, 1, 32)]] void +row_row_m16n16k16(sycl::accessor + accA, + sycl::accessor + accB, + sycl::accessor + accC, + sycl::accessor + accD, + nd_item<2> item) { + sycl::sub_group sg = item.get_sub_group(); + + joint_matrix sub_c{}; + joint_matrix sub_a{}; + joint_matrix sub_b{}; + + // CHECK-OPAQUE: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m16n16k16.load.c.row.stride.f32.p1(ptr addrspace(1) %{{.*}}, i32 16) + joint_matrix_load(sg, sub_c, + accC.template get_multi_ptr(), + stride, layout::row_major); + // CHECK-OPAQUE: tail call { i32, i32, i32, i32 } @llvm.nvvm.wmma.m16n16k16.load.a.row.stride.bf16.p0(ptr %{{.*}}, i32 16) + joint_matrix_load( + sg, sub_a, accA.template get_multi_ptr(), stride); + // CHECK-OPAQUE: tail call { i32, i32, i32, i32 } @llvm.nvvm.wmma.m16n16k16.load.b.row.stride.bf16.p0(ptr %{{.*}}, i32 16) + joint_matrix_load( + sg, sub_b, accB.template get_multi_ptr(), stride); + // CHECK-OPAQUE: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m16n16k16.mma.row.row.bf16(i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}) + joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); + // CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m16n16k16.store.d.row.stride.f32.p1(ptr addrspace(1) 
%{{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, i32 16) + joint_matrix_store(sg, sub_c, + accD.template get_multi_ptr(), + stride, layout::row_major); +} + +SYCL_EXTERNAL [[sycl::reqd_work_group_size(1, 1, 32)]] void +col_col_m16n16k16(sycl::accessor + accA, + sycl::accessor + accB, + sycl::accessor + accC, + sycl::accessor + accD, + nd_item<2> item) { + sycl::sub_group sg = item.get_sub_group(); + + joint_matrix sub_c{}; + joint_matrix sub_a{}; + joint_matrix sub_b{}; + + // CHECK-OPAQUE: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m16n16k16.load.c.col.stride.f32.p1(ptr addrspace(1) %{{.*}}, i32 16) + joint_matrix_load(sg, sub_c, + accC.template get_multi_ptr(), + stride, layout::col_major); + // CHECK-OPAQUE: tail call { i32, i32, i32, i32 } @llvm.nvvm.wmma.m16n16k16.load.a.col.stride.bf16.p0(ptr %{{.*}}, i32 16) + joint_matrix_load( + sg, sub_a, accA.template get_multi_ptr(), stride); + // CHECK-OPAQUE: tail call { i32, i32, i32, i32 } @llvm.nvvm.wmma.m16n16k16.load.b.col.stride.bf16.p0(ptr %{{.*}}, i32 16) + joint_matrix_load( + sg, sub_b, accB.template get_multi_ptr(), stride); + // CHECK-OPAQUE: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m16n16k16.mma.col.col.bf16(i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}) + joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); + // CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m16n16k16.store.d.col.stride.f32.p1(ptr addrspace(1) %{{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, i32 16) + joint_matrix_store(sg, sub_c, + accD.template get_multi_ptr(), + stride, layout::col_major); +} + +SYCL_EXTERNAL [[sycl::reqd_work_group_size(1, 1, 32)]] void 
+row_row_m32n8k16(sycl::accessor + accA, + sycl::accessor + accB, + sycl::accessor + accC, + sycl::accessor + accD, + nd_item<2> item) { + sycl::sub_group sg = item.get_sub_group(); + + joint_matrix sub_c{}; + joint_matrix sub_a{}; + joint_matrix sub_b{}; + + // CHECK-OPAQUE: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m32n8k16.load.c.row.stride.f32.p1(ptr addrspace(1) %{{.*}}, i32 16) + joint_matrix_load(sg, sub_c, + accC.template get_multi_ptr(), + stride, layout::row_major); + // CHECK-OPAQUE: tail call { i32, i32, i32, i32, i32, i32, i32, i32 } @llvm.nvvm.wmma.m32n8k16.load.a.row.stride.bf16.p0(ptr %{{.*}}, i32 16) + joint_matrix_load( + sg, sub_a, accA.template get_multi_ptr(), stride); + // CHECK-OPAQUE: tail call { i32, i32 } @llvm.nvvm.wmma.m32n8k16.load.b.row.stride.bf16.p0(ptr %{{.*}}, i32 16) + joint_matrix_load( + sg, sub_b, accB.template get_multi_ptr(), stride); + // CHECK-OPAQUE: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m32n8k16.mma.row.row.bf16(i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}) + joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); + // CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m32n8k16.store.d.row.stride.f32.p1(ptr addrspace(1) %{{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, i32 16) + joint_matrix_store(sg, sub_c, + accD.template get_multi_ptr(), + stride, layout::row_major); +} + +SYCL_EXTERNAL [[sycl::reqd_work_group_size(1, 1, 32)]] void +col_col_m32n8k16(sycl::accessor + accA, + sycl::accessor + accB, + sycl::accessor + accC, + sycl::accessor + accD, + nd_item<2> item) { + sycl::sub_group sg = item.get_sub_group(); + + joint_matrix sub_c{}; + joint_matrix sub_a{}; + joint_matrix sub_b{}; + + // 
CHECK-OPAQUE: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m32n8k16.load.c.col.stride.f32.p1(ptr addrspace(1) %{{.*}}, i32 16) + joint_matrix_load(sg, sub_c, + accC.template get_multi_ptr(), + stride, layout::col_major); + // CHECK-OPAQUE: tail call { i32, i32, i32, i32, i32, i32, i32, i32 } @llvm.nvvm.wmma.m32n8k16.load.a.col.stride.bf16.p0(ptr %{{.*}}, i32 16) + joint_matrix_load( + sg, sub_a, accA.template get_multi_ptr(), stride); + // CHECK-OPAQUE: tail call { i32, i32 } @llvm.nvvm.wmma.m32n8k16.load.b.col.stride.bf16.p0(ptr %{{.*}}, i32 16) + joint_matrix_load( + sg, sub_b, accB.template get_multi_ptr(), stride); + // CHECK-OPAQUE: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m32n8k16.mma.col.col.bf16(i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}) + joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); + // CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m32n8k16.store.d.col.stride.f32.p1(ptr addrspace(1) %{{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, i32 16) + joint_matrix_store(sg, sub_c, + accD.template get_multi_ptr(), + stride, layout::col_major); +} + +SYCL_EXTERNAL [[sycl::reqd_work_group_size(1, 1, 32)]] void +row_row_m8n32k16(sycl::accessor + accA, + sycl::accessor + accB, + sycl::accessor + accC, + sycl::accessor + accD, + nd_item<2> item) { + sycl::sub_group sg = item.get_sub_group(); + + joint_matrix sub_c{}; + joint_matrix sub_a{}; + joint_matrix sub_b{}; + + // CHECK-OPAQUE: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m8n32k16.load.c.row.stride.f32.p1(ptr addrspace(1) %{{.*}}, i32 16) + joint_matrix_load(sg, sub_c, + accC.template get_multi_ptr(), + stride, layout::row_major); + 
// CHECK-OPAQUE: tail call { i32, i32 } @llvm.nvvm.wmma.m8n32k16.load.a.row.stride.bf16.p0(ptr %{{.*}}, i32 16) + joint_matrix_load( + sg, sub_a, accA.template get_multi_ptr(), stride); + // CHECK-OPAQUE: tail call { i32, i32, i32, i32, i32, i32, i32, i32 } @llvm.nvvm.wmma.m8n32k16.load.b.row.stride.bf16.p0(ptr %{{.*}}, i32 16) + joint_matrix_load( + sg, sub_b, accB.template get_multi_ptr(), stride); + // CHECK-OPAQUE: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m8n32k16.mma.row.row.bf16(i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}) + joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); + // CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m8n32k16.store.d.row.stride.f32.p1(ptr addrspace(1) %{{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, i32 16) + joint_matrix_store(sg, sub_c, + accD.template get_multi_ptr(), + stride, layout::row_major); +} + +SYCL_EXTERNAL [[sycl::reqd_work_group_size(1, 1, 32)]] void +col_col_m8n32k16(sycl::accessor + accA, + sycl::accessor + accB, + sycl::accessor + accC, + sycl::accessor + accD, + nd_item<2> item) { + sycl::sub_group sg = item.get_sub_group(); + + joint_matrix sub_c{}; + joint_matrix sub_a{}; + joint_matrix sub_b{}; + + // CHECK-OPAQUE: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m8n32k16.load.c.col.stride.f32.p1(ptr addrspace(1) %{{.*}}, i32 16) + joint_matrix_load(sg, sub_c, + accC.template get_multi_ptr(), + stride, layout::col_major); + // CHECK-OPAQUE: tail call { i32, i32 } @llvm.nvvm.wmma.m8n32k16.load.a.col.stride.bf16.p0(ptr %{{.*}}, i32 16) + joint_matrix_load( + sg, sub_a, accA.template get_multi_ptr(), stride); + // CHECK-OPAQUE: tail call { i32, i32, i32, i32, i32, i32, i32, i32 } 
@llvm.nvvm.wmma.m8n32k16.load.b.col.stride.bf16.p0(ptr %{{.*}}, i32 16) + joint_matrix_load( + sg, sub_b, accB.template get_multi_ptr(), stride); + // CHECK-OPAQUE: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m8n32k16.mma.col.col.bf16(i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}) + joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); + // CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m8n32k16.store.d.col.stride.f32.p1(ptr addrspace(1) %{{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, i32 16) + joint_matrix_store(sg, sub_c, + accD.template get_multi_ptr(), + stride, layout::col_major); +} diff --git a/sycl/test/check_device_code/cuda/matrix/matrix-nvptx-double-test.cpp b/sycl/test/check_device_code/cuda/matrix/matrix-nvptx-double-test.cpp index f4a79d2756937..750632ca80243 100644 --- a/sycl/test/check_device_code/cuda/matrix/matrix-nvptx-double-test.cpp +++ b/sycl/test/check_device_code/cuda/matrix/matrix-nvptx-double-test.cpp @@ -15,96 +15,78 @@ constexpr int N = 8; // number of cols of accumulator, // number of rows of a. constexpr int K = 4; // number of cols of a/number of rows of b. 
-double A[M * K]; -double B[K * N]; -double C[M * N]; -double D[M * N]; - -int main() { - - buffer bufA(A, range<1>(M * K)); - buffer bufB(B, range<1>(K * N)); - buffer bufC(C, range<1>(M * N)); - buffer bufD(D, range<1>(M * N)); - - queue q; - - q.submit([&](handler &cgh) { - sycl::accessor - accA(bufA, cgh); - sycl::accessor - accB(bufB, cgh); - sycl::accessor - accC(bufC, cgh); - sycl::accessor - accD(bufD, cgh); - - cgh.parallel_for( - nd_range<2>({1, 32}, {1, 32}), - [=](nd_item<2> item) [[sycl::reqd_work_group_size(1, 1, 32)]] { - sycl::sub_group sg = item.get_sub_group(); - - joint_matrix sub_c{}; - joint_matrix - sub_a{}; - joint_matrix - sub_b{}; - - //CHECK-OPAQUE: tail call { double, double } @llvm.nvvm.wmma.m8n8k4.load.c.row.stride.f64.p1(ptr addrspace(1) %{{.*}}, i32 8) - joint_matrix_load( - sg, sub_c, accC.template get_multi_ptr(), - N, layout::row_major); - //CHECK-OPAQUE: tail call double @llvm.nvvm.wmma.m8n8k4.load.a.row.stride.f64.p1(ptr addrspace(1) %{{.*}}, i32 4) - joint_matrix_load( - sg, sub_a, accA.template get_multi_ptr(), - K); - //CHECK-OPAQUE: tail call double @llvm.nvvm.wmma.m8n8k4.load.b.row.stride.f64.p1(ptr addrspace(1) %{{.*}}, i32 8) - joint_matrix_load( - sg, sub_b, accB.template get_multi_ptr(), - N); - //CHECK-OPAQUE: tail call { double, double } @llvm.nvvm.wmma.m8n8k4.mma.row.row.f64(double {{.*}}, double {{.*}}, double {{.*}}, double {{.*}}) - joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); - //CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m8n8k4.store.d.row.stride.f64.p1(ptr addrspace(1) %{{.*}}, double {{.*}}, double {{.*}}, i32 8) - joint_matrix_store( - sg, sub_c, accD.template get_multi_ptr(), - N, layout::row_major); - }); - - cgh.parallel_for( - nd_range<2>({1, 32}, {1, 32}), - [=](nd_item<2> item) [[sycl::reqd_work_group_size(1, 1, 32)]] { - sycl::sub_group sg = item.get_sub_group(); - - joint_matrix sub_c{}; - joint_matrix - sub_a{}; - joint_matrix - sub_b{}; - - //CHECK-OPAQUE: tail call { double, double } 
@llvm.nvvm.wmma.m8n8k4.load.c.col.stride.f64.p1(ptr addrspace(1) %{{.*}}, i32 8) - joint_matrix_load( - sg, sub_c, accC.template get_multi_ptr(), - M, layout::col_major); - //CHECK-OPAQUE: tail call double @llvm.nvvm.wmma.m8n8k4.load.a.col.stride.f64.p1(ptr addrspace(1) %{{.*}}, i32 8) - joint_matrix_load( - sg, sub_a, accA.template get_multi_ptr(), - M); - //CHECK-OPAQUE: tail call double @llvm.nvvm.wmma.m8n8k4.load.b.col.stride.f64.p1(ptr addrspace(1) %{{.*}}, i32 4) - joint_matrix_load( - sg, sub_b, accB.template get_multi_ptr(), - K); - //CHECK-OPAQUE: tail call { double, double } @llvm.nvvm.wmma.m8n8k4.mma.col.col.f64(double {{.*}}, double {{.*}}, double {{.*}}, double {{.*}}) - joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); - //CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m8n8k4.store.d.col.stride.f64.p1(ptr addrspace(1) %{{.*}}, double {{.*}}, double {{.*}}, i32 8) - joint_matrix_store( - sg, sub_c, accD.template get_multi_ptr(), - M, layout::col_major); - }); - }); - - return 0; -}; +SYCL_EXTERNAL [[sycl::reqd_work_group_size(1, 1, 32)]] void +row_row_m8n8k4(sycl::accessor + accA, + sycl::accessor + accB, + sycl::accessor + accC, + sycl::accessor + accD, + nd_item<2> item) { + sycl::sub_group sg = item.get_sub_group(); + + joint_matrix sub_c{}; + joint_matrix sub_a{}; + joint_matrix sub_b{}; + + //CHECK-OPAQUE: tail call { double, double } @llvm.nvvm.wmma.m8n8k4.load.c.row.stride.f64.p1(ptr addrspace(1) %{{.*}}, i32 8) + joint_matrix_load(sg, sub_c, + accC.template get_multi_ptr(), N, + layout::row_major); + //CHECK-OPAQUE: tail call double @llvm.nvvm.wmma.m8n8k4.load.a.row.stride.f64.p1(ptr addrspace(1) %{{.*}}, i32 4) + joint_matrix_load(sg, sub_a, + accA.template get_multi_ptr(), K); + //CHECK-OPAQUE: tail call double @llvm.nvvm.wmma.m8n8k4.load.b.row.stride.f64.p1(ptr addrspace(1) %{{.*}}, i32 8) + joint_matrix_load(sg, sub_b, + accB.template get_multi_ptr(), N); + //CHECK-OPAQUE: tail call { double, double } 
@llvm.nvvm.wmma.m8n8k4.mma.row.row.f64(double {{.*}}, double {{.*}}, double {{.*}}, double {{.*}}) + joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); + //CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m8n8k4.store.d.row.stride.f64.p1(ptr addrspace(1) %{{.*}}, double {{.*}}, double {{.*}}, i32 8) + joint_matrix_store(sg, sub_c, + accD.template get_multi_ptr(), N, + layout::row_major); +} + +SYCL_EXTERNAL [[sycl::reqd_work_group_size(1, 1, 32)]] void +col_col_m8n8k4(sycl::accessor + accA, + sycl::accessor + accB, + sycl::accessor + accC, + sycl::accessor + accD, + nd_item<2> item) { + sycl::sub_group sg = item.get_sub_group(); + + joint_matrix sub_c{}; + joint_matrix sub_a{}; + joint_matrix sub_b{}; + + //CHECK-OPAQUE: tail call { double, double } @llvm.nvvm.wmma.m8n8k4.load.c.col.stride.f64.p1(ptr addrspace(1) %{{.*}}, i32 8) + joint_matrix_load(sg, sub_c, + accC.template get_multi_ptr(), M, + layout::col_major); + //CHECK-OPAQUE: tail call double @llvm.nvvm.wmma.m8n8k4.load.a.col.stride.f64.p1(ptr addrspace(1) %{{.*}}, i32 8) + joint_matrix_load(sg, sub_a, + accA.template get_multi_ptr(), M); + //CHECK-OPAQUE: tail call double @llvm.nvvm.wmma.m8n8k4.load.b.col.stride.f64.p1(ptr addrspace(1) %{{.*}}, i32 4) + joint_matrix_load(sg, sub_b, + accB.template get_multi_ptr(), K); + //CHECK-OPAQUE: tail call { double, double } @llvm.nvvm.wmma.m8n8k4.mma.col.col.f64(double {{.*}}, double {{.*}}, double {{.*}}, double {{.*}}) + joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); + //CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m8n8k4.store.d.col.stride.f64.p1(ptr addrspace(1) %{{.*}}, double {{.*}}, double {{.*}}, i32 8) + joint_matrix_store(sg, sub_c, + accD.template get_multi_ptr(), M, + layout::col_major); +} diff --git a/sycl/test/check_device_code/cuda/matrix/matrix-nvptx-half-float-test.cpp b/sycl/test/check_device_code/cuda/matrix/matrix-nvptx-half-float-test.cpp index cb5b3da54b794..a3e7e61a94b20 100644 --- 
a/sycl/test/check_device_code/cuda/matrix/matrix-nvptx-half-float-test.cpp +++ b/sycl/test/check_device_code/cuda/matrix/matrix-nvptx-half-float-test.cpp @@ -9,215 +9,230 @@ using namespace sycl::ext::oneapi::experimental::matrix; constexpr int stride = 16; -int main() { - - buffer bufA(nullptr, range<1>(1)); - buffer bufB(nullptr, range<1>(1)); - buffer bufC(nullptr, range<1>(1)); - buffer bufD(nullptr, range<1>(1)); - - queue q; - - q.submit([&](handler &cgh) { - sycl::accessor - accA(bufA, cgh); - sycl::accessor - accB(bufB, cgh); - sycl::accessor - accC(bufC, cgh); - sycl::accessor - accD(bufD, cgh); - - cgh.parallel_for( - nd_range<2>({1, 32}, {1, 32}), - [=](nd_item<2> item) [[sycl::reqd_work_group_size(1, 1, 32)]] { - sycl::sub_group sg = item.get_sub_group(); - - joint_matrix sub_c{}; - joint_matrix - sub_a{}; - joint_matrix - sub_b{}; - - // CHECK-OPAQUE: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m16n16k16.load.c.row.stride.f32.p1(ptr addrspace(1) %{{.*}}, i32 16) - joint_matrix_load( - sg, sub_c, accC.template get_multi_ptr(), - stride, layout::row_major); - // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m16n16k16.load.a.row.stride.f16.p0(ptr %{{.*}}, i32 16) - joint_matrix_load( - sg, sub_a, accA.template get_multi_ptr(), - stride); - // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m16n16k16.load.b.row.stride.f16.p0(ptr %{{.*}}, i32 16) - joint_matrix_load( - sg, sub_b, accB.template get_multi_ptr(), - stride); - // CHECK-OPAQUE: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m16n16k16.mma.row.row.f32.f32(<2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> 
{{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}) - joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); - // CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m16n16k16.store.d.row.stride.f32.p1(ptr addrspace(1) %{{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, i32 16) - joint_matrix_store( - sg, sub_c, accD.template get_multi_ptr(), - stride, layout::row_major); - }); - - cgh.parallel_for( - nd_range<2>({1, 32}, {1, 32}), - [=](nd_item<2> item) [[sycl::reqd_work_group_size(1, 1, 32)]] { - sycl::sub_group sg = item.get_sub_group(); - - joint_matrix sub_c{}; - joint_matrix - sub_a{}; - joint_matrix - sub_b{}; - - // CHECK-OPAQUE: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m16n16k16.load.c.col.stride.f32.p1(ptr addrspace(1) %{{.*}}, i32 16) - joint_matrix_load( - sg, sub_c, accC.template get_multi_ptr(), - stride, layout::col_major); - // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m16n16k16.load.a.col.stride.f16.p0(ptr %{{.*}}, i32 16) - joint_matrix_load( - sg, sub_a, accA.template get_multi_ptr(), - stride); - // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m16n16k16.load.b.col.stride.f16.p0(ptr %{{.*}}, i32 16) - joint_matrix_load( - sg, sub_b, accB.template get_multi_ptr(), - stride); - // CHECK-OPAQUE: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m16n16k16.mma.col.col.f32.f32(<2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x 
half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}) - joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); - // CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m16n16k16.store.d.col.stride.f32.p1(ptr addrspace(1) %{{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, i32 16) - joint_matrix_store( - sg, sub_c, accD.template get_multi_ptr(), - stride, layout::col_major); - }); - - cgh.parallel_for( - nd_range<2>({1, 32}, {1, 32}), - [=](nd_item<2> item) [[sycl::reqd_work_group_size(1, 1, 32)]] { - sycl::sub_group sg = item.get_sub_group(); - - joint_matrix sub_c{}; - joint_matrix - sub_a{}; - joint_matrix - sub_b{}; - - // CHECK-OPAQUE: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m32n8k16.load.c.row.stride.f32.p1(ptr addrspace(1) %{{.*}}, i32 16) - joint_matrix_load( - sg, sub_c, accC.template get_multi_ptr(), - stride, layout::row_major); - // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m32n8k16.load.a.row.stride.f16.p0(ptr %{{.*}}, i32 16) - joint_matrix_load( - sg, sub_a, accA.template get_multi_ptr(), - stride); - // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m32n8k16.load.b.row.stride.f16.p0(ptr %{{.*}}, i32 16) - joint_matrix_load( - sg, sub_b, accB.template get_multi_ptr(), - stride); - // CHECK-OPAQUE: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m32n8k16.mma.row.row.f32.f32(<2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x 
half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}) - joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); - // CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m32n8k16.store.d.row.stride.f32.p1(ptr addrspace(1) %{{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, i32 16) - joint_matrix_store( - sg, sub_c, accD.template get_multi_ptr(), - stride, layout::row_major); - }); - - cgh.parallel_for( - nd_range<2>({1, 32}, {1, 32}), - [=](nd_item<2> item) [[sycl::reqd_work_group_size(1, 1, 32)]] { - sycl::sub_group sg = item.get_sub_group(); - - joint_matrix sub_c{}; - joint_matrix - sub_a{}; - joint_matrix - sub_b{}; - - // CHECK-OPAQUE: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m32n8k16.load.c.col.stride.f32.p1(ptr addrspace(1) %{{.*}}, i32 16) - joint_matrix_load( - sg, sub_c, accC.template get_multi_ptr(), - stride, layout::col_major); - // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m32n8k16.load.a.col.stride.f16.p0(ptr %{{.*}}, i32 16) - joint_matrix_load( - sg, sub_a, accA.template get_multi_ptr(), - stride); - // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m32n8k16.load.b.col.stride.f16.p0(ptr %{{.*}}, i32 16) - joint_matrix_load( - sg, sub_b, accB.template get_multi_ptr(), - stride); - // CHECK-OPAQUE: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m32n8k16.mma.col.col.f32.f32(<2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x 
half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}) - joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); - // CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m32n8k16.store.d.col.stride.f32.p1(ptr addrspace(1) %{{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, i32 16) - joint_matrix_store( - sg, sub_c, accD.template get_multi_ptr(), - stride, layout::col_major); - }); - - cgh.parallel_for( - nd_range<2>({1, 32}, {1, 32}), - [=](nd_item<2> item) [[sycl::reqd_work_group_size(1, 1, 32)]] { - sycl::sub_group sg = item.get_sub_group(); - - joint_matrix sub_c{}; - joint_matrix - sub_a{}; - joint_matrix - sub_b{}; - - // CHECK-OPAQUE: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m8n32k16.load.c.row.stride.f32.p1(ptr addrspace(1) %{{.*}}, i32 16) - joint_matrix_load( - sg, sub_c, accC.template get_multi_ptr(), - stride, layout::row_major); - // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m8n32k16.load.a.row.stride.f16.p0(ptr %{{.*}}, i32 16) - joint_matrix_load( - sg, sub_a, accA.template get_multi_ptr(), - stride); - // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m8n32k16.load.b.row.stride.f16.p0(ptr %{{.*}}, i32 16) - joint_matrix_load( - sg, sub_b, accB.template get_multi_ptr(), - stride); - // CHECK-OPAQUE: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m8n32k16.mma.row.row.f32.f32(<2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x 
half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}) - joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); - // CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m8n32k16.store.d.row.stride.f32.p1(ptr addrspace(1) %{{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, i32 16) - joint_matrix_store( - sg, sub_c, accD.template get_multi_ptr(), - stride, layout::row_major); - }); - - cgh.parallel_for( - nd_range<2>({1, 32}, {1, 32}), - [=](nd_item<2> item) [[sycl::reqd_work_group_size(1, 1, 32)]] { - sycl::sub_group sg = item.get_sub_group(); - - joint_matrix sub_c{}; - joint_matrix - sub_a{}; - joint_matrix - sub_b{}; - - // CHECK-OPAQUE: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m8n32k16.load.c.col.stride.f32.p1(ptr addrspace(1) %{{.*}}, i32 16) - joint_matrix_load( - sg, sub_c, accC.template get_multi_ptr(), - stride, layout::col_major); - // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m8n32k16.load.a.col.stride.f16.p0(ptr %{{.*}}, i32 16) - joint_matrix_load( - sg, sub_a, accA.template get_multi_ptr(), - stride); - // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m8n32k16.load.b.col.stride.f16.p0(ptr %{{.*}}, i32 16) - joint_matrix_load( - sg, sub_b, accB.template get_multi_ptr(), - stride); - // CHECK-OPAQUE: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m8n32k16.mma.col.col.f32.f32(<2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x 
half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}) - joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); - // CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m8n32k16.store.d.col.stride.f32.p1(ptr addrspace(1) %{{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, i32 16) - joint_matrix_store( - sg, sub_c, accD.template get_multi_ptr(), - stride, layout::col_major); - }); - }); - - return 0; -}; +SYCL_EXTERNAL [[sycl::reqd_work_group_size(1, 1, 32)]] void +row_row_m16n16k16(sycl::accessor + accA, + sycl::accessor + accB, + sycl::accessor + accC, + sycl::accessor + accD, + nd_item<2> item) { + sycl::sub_group sg = item.get_sub_group(); + + joint_matrix sub_c{}; + joint_matrix sub_a{}; + joint_matrix sub_b{}; + + // CHECK-OPAQUE: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m16n16k16.load.c.row.stride.f32.p1(ptr addrspace(1) %{{.*}}, i32 16) + joint_matrix_load(sg, sub_c, + accC.template get_multi_ptr(), + stride, layout::row_major); + // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m16n16k16.load.a.row.stride.f16.p0(ptr %{{.*}}, i32 16) + joint_matrix_load( + sg, sub_a, accA.template get_multi_ptr(), stride); + // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m16n16k16.load.b.row.stride.f16.p0(ptr %{{.*}}, i32 16) + joint_matrix_load( + sg, sub_b, accB.template get_multi_ptr(), stride); + // CHECK-OPAQUE: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m16n16k16.mma.row.row.f32.f32(<2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x 
half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}) + joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); + // CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m16n16k16.store.d.row.stride.f32.p1(ptr addrspace(1) %{{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, i32 16) + joint_matrix_store(sg, sub_c, + accD.template get_multi_ptr(), + stride, layout::row_major); +} + +SYCL_EXTERNAL [[sycl::reqd_work_group_size(1, 1, 32)]] void +col_col_m16n16k16(sycl::accessor + accA, + sycl::accessor + accB, + sycl::accessor + accC, + sycl::accessor + accD, + nd_item<2> item) { + sycl::sub_group sg = item.get_sub_group(); + + joint_matrix sub_c{}; + joint_matrix sub_a{}; + joint_matrix sub_b{}; + + // CHECK-OPAQUE: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m16n16k16.load.c.col.stride.f32.p1(ptr addrspace(1) %{{.*}}, i32 16) + joint_matrix_load(sg, sub_c, + accC.template get_multi_ptr(), + stride, layout::col_major); + // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m16n16k16.load.a.col.stride.f16.p0(ptr %{{.*}}, i32 16) + joint_matrix_load( + sg, sub_a, accA.template get_multi_ptr(), stride); + // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m16n16k16.load.b.col.stride.f16.p0(ptr %{{.*}}, i32 16) + joint_matrix_load( + sg, sub_b, accB.template get_multi_ptr(), stride); + // CHECK-OPAQUE: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m16n16k16.mma.col.col.f32.f32(<2 x half> 
{{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}) + joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); + // CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m16n16k16.store.d.col.stride.f32.p1(ptr addrspace(1) %{{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, i32 16) + joint_matrix_store(sg, sub_c, + accD.template get_multi_ptr(), + stride, layout::col_major); +} + +SYCL_EXTERNAL [[sycl::reqd_work_group_size(1, 1, 32)]] void +row_row_m32n8k16(sycl::accessor + accA, + sycl::accessor + accB, + sycl::accessor + accC, + sycl::accessor + accD, + nd_item<2> item) { + sycl::sub_group sg = item.get_sub_group(); + + joint_matrix sub_c{}; + joint_matrix sub_a{}; + joint_matrix sub_b{}; + + // CHECK-OPAQUE: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m32n8k16.load.c.row.stride.f32.p1(ptr addrspace(1) %{{.*}}, i32 16) + joint_matrix_load(sg, sub_c, + accC.template get_multi_ptr(), + stride, layout::row_major); + // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m32n8k16.load.a.row.stride.f16.p0(ptr %{{.*}}, i32 16) + joint_matrix_load( + sg, sub_a, accA.template get_multi_ptr(), stride); + // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m32n8k16.load.b.row.stride.f16.p0(ptr %{{.*}}, i32 16) + joint_matrix_load( + sg, sub_b, accB.template get_multi_ptr(), stride); + // CHECK-OPAQUE: tail call { float, float, float, float, float, float, float, 
float } @llvm.nvvm.wmma.m32n8k16.mma.row.row.f32.f32(<2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}) + joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); + // CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m32n8k16.store.d.row.stride.f32.p1(ptr addrspace(1) %{{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, i32 16) + joint_matrix_store(sg, sub_c, + accD.template get_multi_ptr(), + stride, layout::row_major); +} + +SYCL_EXTERNAL [[sycl::reqd_work_group_size(1, 1, 32)]] void +col_col_m32n8k16(sycl::accessor + accA, + sycl::accessor + accB, + sycl::accessor + accC, + sycl::accessor + accD, + nd_item<2> item) { + sycl::sub_group sg = item.get_sub_group(); + + joint_matrix sub_c{}; + joint_matrix sub_a{}; + joint_matrix sub_b{}; + + // CHECK-OPAQUE: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m32n8k16.load.c.col.stride.f32.p1(ptr addrspace(1) %{{.*}}, i32 16) + joint_matrix_load(sg, sub_c, + accC.template get_multi_ptr(), + stride, layout::col_major); + // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m32n8k16.load.a.col.stride.f16.p0(ptr %{{.*}}, i32 16) + joint_matrix_load( + sg, sub_a, accA.template get_multi_ptr(), stride); + // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m32n8k16.load.b.col.stride.f16.p0(ptr %{{.*}}, i32 16) + joint_matrix_load( + sg, sub_b, accB.template get_multi_ptr(), stride); + // 
CHECK-OPAQUE: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m32n8k16.mma.col.col.f32.f32(<2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}) + joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); + // CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m32n8k16.store.d.col.stride.f32.p1(ptr addrspace(1) %{{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, i32 16) + joint_matrix_store(sg, sub_c, + accD.template get_multi_ptr(), + stride, layout::col_major); +} + +SYCL_EXTERNAL [[sycl::reqd_work_group_size(1, 1, 32)]] void +row_row_m8n32k16(sycl::accessor + accA, + sycl::accessor + accB, + sycl::accessor + accC, + sycl::accessor + accD, + nd_item<2> item) { + sycl::sub_group sg = item.get_sub_group(); + + joint_matrix sub_c{}; + joint_matrix sub_a{}; + joint_matrix sub_b{}; + + // CHECK-OPAQUE: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m8n32k16.load.c.row.stride.f32.p1(ptr addrspace(1) %{{.*}}, i32 16) + joint_matrix_load(sg, sub_c, + accC.template get_multi_ptr(), + stride, layout::row_major); + // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m8n32k16.load.a.row.stride.f16.p0(ptr %{{.*}}, i32 16) + joint_matrix_load( + sg, sub_a, accA.template get_multi_ptr(), stride); + // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m8n32k16.load.b.row.stride.f16.p0(ptr %{{.*}}, i32 16) + 
joint_matrix_load( + sg, sub_b, accB.template get_multi_ptr(), stride); + // CHECK-OPAQUE: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m8n32k16.mma.row.row.f32.f32(<2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}) + joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); + // CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m8n32k16.store.d.row.stride.f32.p1(ptr addrspace(1) %{{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, i32 16) + joint_matrix_store(sg, sub_c, + accD.template get_multi_ptr(), + stride, layout::row_major); +} + +SYCL_EXTERNAL [[sycl::reqd_work_group_size(1, 1, 32)]] void +row_row_m8n8k4(sycl::accessor + accA, + sycl::accessor + accB, + sycl::accessor + accC, + sycl::accessor + accD, + nd_item<2> item) { + sycl::sub_group sg = item.get_sub_group(); + + joint_matrix sub_c{}; + joint_matrix sub_a{}; + joint_matrix sub_b{}; + + // CHECK-OPAQUE: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m8n32k16.load.c.col.stride.f32.p1(ptr addrspace(1) %{{.*}}, i32 16) + joint_matrix_load(sg, sub_c, + accC.template get_multi_ptr(), + stride, layout::col_major); + // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m8n32k16.load.a.col.stride.f16.p0(ptr %{{.*}}, i32 16) + joint_matrix_load( + sg, sub_a, accA.template get_multi_ptr(), stride); + // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half> } 
@llvm.nvvm.wmma.m8n32k16.load.b.col.stride.f16.p0(ptr %{{.*}}, i32 16) + joint_matrix_load( + sg, sub_b, accB.template get_multi_ptr(), stride); + // CHECK-OPAQUE: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m8n32k16.mma.col.col.f32.f32(<2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}) + joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); + // CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m8n32k16.store.d.col.stride.f32.p1(ptr addrspace(1) %{{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, i32 16) + joint_matrix_store(sg, sub_c, + accD.template get_multi_ptr(), + stride, layout::col_major); +} diff --git a/sycl/test/check_device_code/cuda/matrix/matrix-nvptx-half-half-test.cpp b/sycl/test/check_device_code/cuda/matrix/matrix-nvptx-half-half-test.cpp index feea65a79848b..602fff0a038ba 100644 --- a/sycl/test/check_device_code/cuda/matrix/matrix-nvptx-half-half-test.cpp +++ b/sycl/test/check_device_code/cuda/matrix/matrix-nvptx-half-half-test.cpp @@ -9,215 +9,231 @@ using namespace sycl::ext::oneapi::experimental::matrix; constexpr int stride = 16; -int main() { - - buffer bufA(nullptr, range<1>(1)); - buffer bufB(nullptr, range<1>(1)); - buffer bufC(nullptr, range<1>(1)); - buffer bufD(nullptr, range<1>(1)); - - queue q; - - q.submit([&](handler &cgh) { - sycl::accessor - accA(bufA, cgh); - sycl::accessor - accB(bufB, cgh); - sycl::accessor - accC(bufC, cgh); - sycl::accessor - accD(bufD, cgh); - - cgh.parallel_for( - nd_range<2>({1, 32}, {1, 32}), - [=](nd_item<2> item) [[sycl::reqd_work_group_size(1, 
1, 32)]] { - sycl::sub_group sg = item.get_sub_group(); - - joint_matrix sub_c{}; - joint_matrix - sub_a{}; - joint_matrix - sub_b{}; - - // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m16n16k16.load.c.row.stride.f16.p0(ptr %{{.*}}, i32 16) - joint_matrix_load( - sg, sub_c, accC.template get_multi_ptr(), - stride, layout::row_major); - // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m16n16k16.load.a.row.stride.f16.p0(ptr %{{.*}}, i32 16) - joint_matrix_load( - sg, sub_a, accA.template get_multi_ptr(), - stride); - // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m16n16k16.load.b.row.stride.f16.p0(ptr %{{.*}}, i32 16) - joint_matrix_load( - sg, sub_b, accB.template get_multi_ptr(), - stride); - // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m16n16k16.mma.row.row.f16.f16(<2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}) - joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); - // CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m16n16k16.store.d.row.stride.f16.p0(ptr %{{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, i32 16) - joint_matrix_store( - sg, sub_c, accD.template get_multi_ptr(), - stride, layout::row_major); - }); - - cgh.parallel_for( - nd_range<2>({1, 32}, {1, 32}), - [=](nd_item<2> item) [[sycl::reqd_work_group_size(1, 1, 32)]] { - sycl::sub_group sg = item.get_sub_group(); - - joint_matrix sub_c{}; - joint_matrix - sub_a{}; - 
joint_matrix - sub_b{}; - - // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m16n16k16.load.c.col.stride.f16.p0(ptr %{{.*}}, i32 16) - joint_matrix_load( - sg, sub_c, accC.template get_multi_ptr(), - stride, layout::col_major); - // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m16n16k16.load.a.col.stride.f16.p0(ptr %{{.*}}, i32 16) - joint_matrix_load( - sg, sub_a, accA.template get_multi_ptr(), - stride); - // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m16n16k16.load.b.col.stride.f16.p0(ptr %{{.*}}, i32 16) - joint_matrix_load( - sg, sub_b, accB.template get_multi_ptr(), - stride); - // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m16n16k16.mma.col.col.f16.f16(<2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}) - joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); - // CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m16n16k16.store.d.col.stride.f16.p0(ptr %{{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, i32 16) - joint_matrix_store( - sg, sub_c, accD.template get_multi_ptr(), - stride, layout::col_major); - }); - - cgh.parallel_for( - nd_range<2>({1, 32}, {1, 32}), - [=](nd_item<2> item) [[sycl::reqd_work_group_size(1, 1, 32)]] { - sycl::sub_group sg = item.get_sub_group(); - - joint_matrix sub_c{}; - joint_matrix - sub_a{}; - joint_matrix - sub_b{}; - - // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half> } 
@llvm.nvvm.wmma.m32n8k16.load.c.row.stride.f16.p0(ptr %{{.*}}, i32 16) - joint_matrix_load( - sg, sub_c, accC.template get_multi_ptr(), - stride, layout::row_major); - // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m32n8k16.load.a.row.stride.f16.p0(ptr %{{.*}}, i32 16) - joint_matrix_load( - sg, sub_a, accA.template get_multi_ptr(), - stride); - // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m32n8k16.load.b.row.stride.f16.p0(ptr %{{.*}}, i32 16) - joint_matrix_load( - sg, sub_b, accB.template get_multi_ptr(), - stride); - // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m32n8k16.mma.row.row.f16.f16(<2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}) - joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); - // CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m32n8k16.store.d.row.stride.f16.p0(ptr %{{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, i32 16) - joint_matrix_store( - sg, sub_c, accD.template get_multi_ptr(), - stride, layout::row_major); - }); - - cgh.parallel_for( - nd_range<2>({1, 32}, {1, 32}), - [=](nd_item<2> item) [[sycl::reqd_work_group_size(1, 1, 32)]] { - sycl::sub_group sg = item.get_sub_group(); - - joint_matrix sub_c{}; - joint_matrix - sub_a{}; - joint_matrix - sub_b{}; - - // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m32n8k16.load.c.col.stride.f16.p0(ptr %{{.*}}, i32 16) - joint_matrix_load( - sg, sub_c, 
accC.template get_multi_ptr(), - stride, layout::col_major); - // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m32n8k16.load.a.col.stride.f16.p0(ptr %{{.*}}, i32 16) - joint_matrix_load( - sg, sub_a, accA.template get_multi_ptr(), - stride); - // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m32n8k16.load.b.col.stride.f16.p0(ptr %{{.*}}, i32 16) - joint_matrix_load( - sg, sub_b, accB.template get_multi_ptr(), - stride); - // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m32n8k16.mma.col.col.f16.f16(<2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}) - joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); - // CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m32n8k16.store.d.col.stride.f16.p0(ptr %{{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, i32 16) - joint_matrix_store( - sg, sub_c, accD.template get_multi_ptr(), - stride, layout::col_major); - }); - - cgh.parallel_for( - nd_range<2>({1, 32}, {1, 32}), - [=](nd_item<2> item) [[sycl::reqd_work_group_size(1, 1, 32)]] { - sycl::sub_group sg = item.get_sub_group(); - - joint_matrix sub_c{}; - joint_matrix - sub_a{}; - joint_matrix - sub_b{}; - - // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m8n32k16.load.c.row.stride.f16.p0(ptr %{{.*}}, i32 16) - joint_matrix_load( - sg, sub_c, accC.template get_multi_ptr(), - stride, layout::row_major); - // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, 
<2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m8n32k16.load.a.row.stride.f16.p0(ptr %{{.*}}, i32 16) - joint_matrix_load( - sg, sub_a, accA.template get_multi_ptr(), - stride); - // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m8n32k16.load.b.row.stride.f16.p0(ptr %{{.*}}, i32 16) - joint_matrix_load( - sg, sub_b, accB.template get_multi_ptr(), - stride); - // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m8n32k16.mma.row.row.f16.f16(<2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}) - joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); - // CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m8n32k16.store.d.row.stride.f16.p0(ptr %{{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, i32 16) - joint_matrix_store( - sg, sub_c, accD.template get_multi_ptr(), - stride, layout::row_major); - }); - - cgh.parallel_for( - nd_range<2>({1, 32}, {1, 32}), - [=](nd_item<2> item) [[sycl::reqd_work_group_size(1, 1, 32)]] { - sycl::sub_group sg = item.get_sub_group(); - - joint_matrix sub_c{}; - joint_matrix - sub_a{}; - joint_matrix - sub_b{}; - - // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m8n32k16.load.c.col.stride.f16.p0(ptr %{{.*}}, i32 16) - joint_matrix_load( - sg, sub_c, accC.template get_multi_ptr(), - stride, layout::col_major); - // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half> } 
@llvm.nvvm.wmma.m8n32k16.load.a.col.stride.f16.p0(ptr %{{.*}}, i32 16) - joint_matrix_load( - sg, sub_a, accA.template get_multi_ptr(), - stride); - // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m8n32k16.load.b.col.stride.f16.p0(ptr %{{.*}}, i32 16) - joint_matrix_load( - sg, sub_b, accB.template get_multi_ptr(), - stride); - // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m8n32k16.mma.col.col.f16.f16(<2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}) - joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); - // CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m8n32k16.store.d.col.stride.f16.p0(ptr %{{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, i32 16) - joint_matrix_store( - sg, sub_c, accD.template get_multi_ptr(), - stride, layout::col_major); - }); - }); - - return 0; -}; +SYCL_EXTERNAL [[sycl::reqd_work_group_size(1, 1, 32)]] void +row_row_m16n16k16(sycl::accessor + accA, + sycl::accessor + accB, + sycl::accessor + accC, + sycl::accessor + accD, + nd_item<2> item) { + sycl::sub_group sg = item.get_sub_group(); + + joint_matrix sub_c{}; + joint_matrix sub_a{}; + joint_matrix sub_b{}; + + // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m16n16k16.load.c.row.stride.f16.p0(ptr %{{.*}}, i32 16) + joint_matrix_load(sg, sub_c, + accC.template get_multi_ptr(), + stride, layout::row_major); + // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half> } 
@llvm.nvvm.wmma.m16n16k16.load.a.row.stride.f16.p0(ptr %{{.*}}, i32 16) + joint_matrix_load( + sg, sub_a, accA.template get_multi_ptr(), stride); + // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m16n16k16.load.b.row.stride.f16.p0(ptr %{{.*}}, i32 16) + joint_matrix_load( + sg, sub_b, accB.template get_multi_ptr(), stride); + // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m16n16k16.mma.row.row.f16.f16(<2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}) + joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); + // CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m16n16k16.store.d.row.stride.f16.p0(ptr %{{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, i32 16) + joint_matrix_store(sg, sub_c, + accD.template get_multi_ptr(), + stride, layout::row_major); +} + +SYCL_EXTERNAL [[sycl::reqd_work_group_size(1, 1, 32)]] void +col_col_m16n16k16(sycl::accessor + accA, + sycl::accessor + accB, + sycl::accessor + accC, + sycl::accessor + accD, + nd_item<2> item) { + sycl::sub_group sg = item.get_sub_group(); + + joint_matrix sub_c{}; + joint_matrix sub_a{}; + joint_matrix sub_b{}; + + // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m16n16k16.load.c.col.stride.f16.p0(ptr %{{.*}}, i32 16) + joint_matrix_load(sg, sub_c, + accC.template get_multi_ptr(), + stride, layout::col_major); + // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half> } 
@llvm.nvvm.wmma.m16n16k16.load.a.col.stride.f16.p0(ptr %{{.*}}, i32 16) + joint_matrix_load( + sg, sub_a, accA.template get_multi_ptr(), stride); + // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m16n16k16.load.b.col.stride.f16.p0(ptr %{{.*}}, i32 16) + joint_matrix_load( + sg, sub_b, accB.template get_multi_ptr(), stride); + // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m16n16k16.mma.col.col.f16.f16(<2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}) + joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); + // CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m16n16k16.store.d.col.stride.f16.p0(ptr %{{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, i32 16) + joint_matrix_store(sg, sub_c, + accD.template get_multi_ptr(), + stride, layout::col_major); +} + +SYCL_EXTERNAL [[sycl::reqd_work_group_size(1, 1, 32)]] void +row_row_m32n8k16(sycl::accessor + accA, + sycl::accessor + accB, + sycl::accessor + accC, + sycl::accessor + accD, + nd_item<2> item) { + + sycl::sub_group sg = item.get_sub_group(); + + joint_matrix sub_c{}; + joint_matrix sub_a{}; + joint_matrix sub_b{}; + + // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m32n8k16.load.c.row.stride.f16.p0(ptr %{{.*}}, i32 16) + joint_matrix_load(sg, sub_c, + accC.template get_multi_ptr(), + stride, layout::row_major); + // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half> } 
@llvm.nvvm.wmma.m32n8k16.load.a.row.stride.f16.p0(ptr %{{.*}}, i32 16) + joint_matrix_load( + sg, sub_a, accA.template get_multi_ptr(), stride); + // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m32n8k16.load.b.row.stride.f16.p0(ptr %{{.*}}, i32 16) + joint_matrix_load( + sg, sub_b, accB.template get_multi_ptr(), stride); + // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m32n8k16.mma.row.row.f16.f16(<2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}) + joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); + // CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m32n8k16.store.d.row.stride.f16.p0(ptr %{{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, i32 16) + joint_matrix_store(sg, sub_c, + accD.template get_multi_ptr(), + stride, layout::row_major); +} + +SYCL_EXTERNAL [[sycl::reqd_work_group_size(1, 1, 32)]] void +col_col_m32n8k16(sycl::accessor + accA, + sycl::accessor + accB, + sycl::accessor + accC, + sycl::accessor + accD, + nd_item<2> item) { + sycl::sub_group sg = item.get_sub_group(); + + joint_matrix sub_c{}; + joint_matrix sub_a{}; + joint_matrix sub_b{}; + + // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m32n8k16.load.c.col.stride.f16.p0(ptr %{{.*}}, i32 16) + joint_matrix_load(sg, sub_c, + accC.template get_multi_ptr(), + stride, layout::col_major); + // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half> } 
@llvm.nvvm.wmma.m32n8k16.load.a.col.stride.f16.p0(ptr %{{.*}}, i32 16) + joint_matrix_load( + sg, sub_a, accA.template get_multi_ptr(), stride); + // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m32n8k16.load.b.col.stride.f16.p0(ptr %{{.*}}, i32 16) + joint_matrix_load( + sg, sub_b, accB.template get_multi_ptr(), stride); + // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m32n8k16.mma.col.col.f16.f16(<2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}) + joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); + // CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m32n8k16.store.d.col.stride.f16.p0(ptr %{{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, i32 16) + joint_matrix_store(sg, sub_c, + accD.template get_multi_ptr(), + stride, layout::col_major); +} + +SYCL_EXTERNAL [[sycl::reqd_work_group_size(1, 1, 32)]] void +row_row_m8n32k16(sycl::accessor + accA, + sycl::accessor + accB, + sycl::accessor + accC, + sycl::accessor + accD, + nd_item<2> item) { + sycl::sub_group sg = item.get_sub_group(); + + joint_matrix sub_c{}; + joint_matrix sub_a{}; + joint_matrix sub_b{}; + + // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m8n32k16.load.c.row.stride.f16.p0(ptr %{{.*}}, i32 16) + joint_matrix_load(sg, sub_c, + accC.template get_multi_ptr(), + stride, layout::row_major); + // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half> } 
@llvm.nvvm.wmma.m8n32k16.load.a.row.stride.f16.p0(ptr %{{.*}}, i32 16) + joint_matrix_load( + sg, sub_a, accA.template get_multi_ptr(), stride); + // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m8n32k16.load.b.row.stride.f16.p0(ptr %{{.*}}, i32 16) + joint_matrix_load( + sg, sub_b, accB.template get_multi_ptr(), stride); + // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m8n32k16.mma.row.row.f16.f16(<2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}) + joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); + // CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m8n32k16.store.d.row.stride.f16.p0(ptr %{{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, i32 16) + joint_matrix_store(sg, sub_c, + accD.template get_multi_ptr(), + stride, layout::row_major); +} + +SYCL_EXTERNAL [[sycl::reqd_work_group_size(1, 1, 32)]] void +col_col_m8n32k16(sycl::accessor + accA, + sycl::accessor + accB, + sycl::accessor + accC, + sycl::accessor + accD, + nd_item<2> item) { + sycl::sub_group sg = item.get_sub_group(); + + joint_matrix sub_c{}; + joint_matrix sub_a{}; + joint_matrix sub_b{}; + + // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m8n32k16.load.c.col.stride.f16.p0(ptr %{{.*}}, i32 16) + joint_matrix_load(sg, sub_c, + accC.template get_multi_ptr(), + stride, layout::col_major); + // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half> } 
@llvm.nvvm.wmma.m8n32k16.load.a.col.stride.f16.p0(ptr %{{.*}}, i32 16) + joint_matrix_load( + sg, sub_a, accA.template get_multi_ptr(), stride); + // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m8n32k16.load.b.col.stride.f16.p0(ptr %{{.*}}, i32 16) + joint_matrix_load( + sg, sub_b, accB.template get_multi_ptr(), stride); + // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m8n32k16.mma.col.col.f16.f16(<2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}) + joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); + // CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m8n32k16.store.d.col.stride.f16.p0(ptr %{{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, i32 16) + joint_matrix_store(sg, sub_c, + accD.template get_multi_ptr(), + stride, layout::col_major); +} diff --git a/sycl/test/check_device_code/cuda/matrix/matrix-nvptx-int8-test.cpp b/sycl/test/check_device_code/cuda/matrix/matrix-nvptx-int8-test.cpp index 492313dbaf71d..ed1d8b0c62221 100644 --- a/sycl/test/check_device_code/cuda/matrix/matrix-nvptx-int8-test.cpp +++ b/sycl/test/check_device_code/cuda/matrix/matrix-nvptx-int8-test.cpp @@ -9,215 +9,251 @@ using namespace sycl::ext::oneapi::experimental::matrix; constexpr int stride = 16; -int main() { - - buffer bufA(nullptr, range<1>(1)); - buffer bufB(nullptr, range<1>(1)); - buffer bufC(nullptr, range<1>(1)); - buffer bufD(nullptr, range<1>(1)); - - queue q; - - q.submit([&](handler &cgh) { - sycl::accessor - accA(bufA, cgh); - sycl::accessor - accB(bufB, cgh); - sycl::accessor - 
accC(bufC, cgh); - sycl::accessor - accD(bufD, cgh); - - cgh.parallel_for( - nd_range<2>({1, 32}, {1, 32}), - [=](nd_item<2> item) [[sycl::reqd_work_group_size(1, 1, 32)]] { - sycl::sub_group sg = item.get_sub_group(); - - joint_matrix sub_c{}; - joint_matrix - sub_a{}; - joint_matrix - sub_b{}; - - // CHECK-OPAQUE: tail call { i32, i32, i32, i32, i32, i32, i32, i32 } @llvm.nvvm.wmma.m16n16k16.load.c.row.stride.s32.p1(ptr addrspace(1) %{{.*}}, i32 16) - joint_matrix_load( - sg, sub_c, accC.template get_multi_ptr(), - stride, layout::row_major); - // CHECK-OPAQUE: tail call { i32, i32 } @llvm.nvvm.wmma.m16n16k16.load.a.row.stride.s8.p0(ptr %{{.*}}, i32 16) - joint_matrix_load( - sg, sub_a, accA.template get_multi_ptr(), - stride); - // CHECK-OPAQUE: tail call { i32, i32 } @llvm.nvvm.wmma.m16n16k16.load.b.row.stride.s8.p0(ptr %{{.*}}, i32 16) - joint_matrix_load( - sg, sub_b, accB.template get_multi_ptr(), - stride); - // CHECK-OPAQUE: tail call { i32, i32, i32, i32, i32, i32, i32, i32 } @llvm.nvvm.wmma.m16n16k16.mma.row.row.s8(i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}) - joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); - // CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m16n16k16.store.d.row.stride.s32.p1(ptr addrspace(1) %{{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 16) - joint_matrix_store( - sg, sub_c, accD.template get_multi_ptr(), - stride, layout::row_major); - }); - - cgh.parallel_for( - nd_range<2>({1, 32}, {1, 32}), - [=](nd_item<2> item) [[sycl::reqd_work_group_size(1, 1, 32)]] { - sycl::sub_group sg = item.get_sub_group(); - - joint_matrix sub_c{}; - joint_matrix - sub_a{}; - joint_matrix - sub_b{}; - - // CHECK-OPAQUE: tail call { i32, i32, i32, i32, i32, i32, i32, i32 } @llvm.nvvm.wmma.m16n16k16.load.c.col.stride.s32.p1(ptr addrspace(1) %{{.*}}, i32 16) - joint_matrix_load( - sg, sub_c, 
accC.template get_multi_ptr(), - stride, layout::col_major); - // CHECK-OPAQUE: tail call { i32, i32 } @llvm.nvvm.wmma.m16n16k16.load.a.col.stride.s8.p0(ptr %{{.*}}, i32 16) - joint_matrix_load( - sg, sub_a, accA.template get_multi_ptr(), - stride); - // CHECK-OPAQUE: tail call { i32, i32 } @llvm.nvvm.wmma.m16n16k16.load.b.col.stride.s8.p0(ptr %{{.*}}, i32 16) - joint_matrix_load( - sg, sub_b, accB.template get_multi_ptr(), - stride); - // CHECK-OPAQUE: tail call { i32, i32, i32, i32, i32, i32, i32, i32 } @llvm.nvvm.wmma.m16n16k16.mma.col.col.s8(i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}) - joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); - // CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m16n16k16.store.d.col.stride.s32.p1(ptr addrspace(1) %{{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 16) - joint_matrix_store( - sg, sub_c, accD.template get_multi_ptr(), - stride, layout::col_major); - }); - - cgh.parallel_for( - nd_range<2>({1, 32}, {1, 32}), - [=](nd_item<2> item) [[sycl::reqd_work_group_size(1, 1, 32)]] { - sycl::sub_group sg = item.get_sub_group(); - - joint_matrix sub_c{}; - joint_matrix - sub_a{}; - joint_matrix - sub_b{}; - - // CHECK-OPAQUE: tail call { i32, i32, i32, i32, i32, i32, i32, i32 } @llvm.nvvm.wmma.m32n8k16.load.c.row.stride.s32.p1(ptr addrspace(1) %{{.*}}, i32 16) - joint_matrix_load( - sg, sub_c, accC.template get_multi_ptr(), - stride, layout::row_major); - // CHECK-OPAQUE: tail call { i32, i32, i32, i32 } @llvm.nvvm.wmma.m32n8k16.load.a.row.stride.s8.p0(ptr %{{.*}}, i32 16) - joint_matrix_load( - sg, sub_a, accA.template get_multi_ptr(), - stride); - // CHECK-OPAQUE: tail call i32 @llvm.nvvm.wmma.m32n8k16.load.b.row.stride.s8.p0(ptr %{{.*}}, i32 16) - joint_matrix_load( - sg, sub_b, accB.template get_multi_ptr(), - stride); - // CHECK-OPAQUE: tail call { i32, i32, i32, i32, i32, 
i32, i32, i32 } @llvm.nvvm.wmma.m32n8k16.mma.row.row.s8(i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}) - joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); - // CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m32n8k16.store.d.row.stride.s32.p1(ptr addrspace(1) %{{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 16) - joint_matrix_store( - sg, sub_c, accD.template get_multi_ptr(), - stride, layout::row_major); - }); - - cgh.parallel_for( - nd_range<2>({1, 32}, {1, 32}), - [=](nd_item<2> item) [[sycl::reqd_work_group_size(1, 1, 32)]] { - sycl::sub_group sg = item.get_sub_group(); - - joint_matrix sub_c{}; - joint_matrix - sub_a{}; - joint_matrix - sub_b{}; - - // CHECK-OPAQUE: tail call { i32, i32, i32, i32, i32, i32, i32, i32 } @llvm.nvvm.wmma.m32n8k16.load.c.col.stride.s32.p1(ptr addrspace(1) %{{.*}}, i32 16) - joint_matrix_load( - sg, sub_c, accC.template get_multi_ptr(), - stride, layout::col_major); - // CHECK-OPAQUE: tail call { i32, i32, i32, i32 } @llvm.nvvm.wmma.m32n8k16.load.a.col.stride.s8.p0(ptr %{{.*}}, i32 16) - joint_matrix_load( - sg, sub_a, accA.template get_multi_ptr(), - stride); - // CHECK-OPAQUE: tail call i32 @llvm.nvvm.wmma.m32n8k16.load.b.col.stride.s8.p0(ptr %{{.*}}, i32 16) - joint_matrix_load( - sg, sub_b, accB.template get_multi_ptr(), - stride); - // CHECK-OPAQUE: tail call { i32, i32, i32, i32, i32, i32, i32, i32 } @llvm.nvvm.wmma.m32n8k16.mma.col.col.s8(i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}) - joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); - // CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m32n8k16.store.d.col.stride.s32.p1(ptr addrspace(1) %{{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 16) - 
joint_matrix_store( - sg, sub_c, accD.template get_multi_ptr(), - stride, layout::col_major); - }); - - cgh.parallel_for( - nd_range<2>({1, 32}, {1, 32}), - [=](nd_item<2> item) [[sycl::reqd_work_group_size(1, 1, 32)]] { - sycl::sub_group sg = item.get_sub_group(); - - joint_matrix sub_c{}; - joint_matrix - sub_a{}; - joint_matrix - sub_b{}; - - // CHECK-OPAQUE: tail call { i32, i32, i32, i32, i32, i32, i32, i32 } @llvm.nvvm.wmma.m8n32k16.load.c.row.stride.s32.p1(ptr addrspace(1) %{{.*}}, i32 16) - joint_matrix_load( - sg, sub_c, accC.template get_multi_ptr(), - stride, layout::row_major); - // CHECK-OPAQUE: tail call i32 @llvm.nvvm.wmma.m8n32k16.load.a.row.stride.s8.p0(ptr %{{.*}}, i32 16) - joint_matrix_load( - sg, sub_a, accA.template get_multi_ptr(), - stride); - // CHECK-OPAQUE: tail call { i32, i32, i32, i32 } @llvm.nvvm.wmma.m8n32k16.load.b.row.stride.s8.p0(ptr %{{.*}}, i32 16) - joint_matrix_load( - sg, sub_b, accB.template get_multi_ptr(), - stride); - // CHECK-OPAQUE: tail call { i32, i32, i32, i32, i32, i32, i32, i32 } @llvm.nvvm.wmma.m8n32k16.mma.row.row.s8(i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}) - joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); - // CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m8n32k16.store.d.row.stride.s32.p1(ptr addrspace(1) %{{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 16) - joint_matrix_store( - sg, sub_c, accD.template get_multi_ptr(), - stride, layout::row_major); - }); - - cgh.parallel_for( - nd_range<2>({1, 32}, {1, 32}), - [=](nd_item<2> item) [[sycl::reqd_work_group_size(1, 1, 32)]] { - sycl::sub_group sg = item.get_sub_group(); - - joint_matrix sub_c{}; - joint_matrix - sub_a{}; - joint_matrix - sub_b{}; - - // CHECK-OPAQUE: tail call { i32, i32, i32, i32, i32, i32, i32, i32 } @llvm.nvvm.wmma.m8n32k16.load.c.col.stride.s32.p1(ptr addrspace(1) 
%{{.*}}, i32 16) - joint_matrix_load( - sg, sub_c, accC.template get_multi_ptr(), - stride, layout::col_major); - // CHECK-OPAQUE: tail call i32 @llvm.nvvm.wmma.m8n32k16.load.a.col.stride.s8.p0(ptr %{{.*}}, i32 16) - joint_matrix_load( - sg, sub_a, accA.template get_multi_ptr(), - stride); - // CHECK-OPAQUE: tail call { i32, i32, i32, i32 } @llvm.nvvm.wmma.m8n32k16.load.b.col.stride.s8.p0(ptr %{{.*}}, i32 16) - joint_matrix_load( - sg, sub_b, accB.template get_multi_ptr(), - stride); - // CHECK-OPAQUE: tail call { i32, i32, i32, i32, i32, i32, i32, i32 } @llvm.nvvm.wmma.m8n32k16.mma.col.col.s8(i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}) - joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); - // CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m8n32k16.store.d.col.stride.s32.p1(ptr addrspace(1) %{{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 16) - joint_matrix_store( - sg, sub_c, accD.template get_multi_ptr(), - stride, layout::col_major); - }); - }); - - return 0; -}; +// The following SYCL_EXTERNAL functions (e.g. row_row_m16n16k16) test perform +// matrix multiplication in various different ways. They were originally written +// in the following manner: +// +// ... 
+// q.submit([&] (handler &cgh) { +// sycl::accessor accA(bufA, cgh); sycl::accessor accB(bufB, cgh); +// sycl::accessor accC(bufC, cgh); sycl::accessor accD(bufD, cgh); + +// cgh.parallel_for(nd_range<2>({1, 32}, {1, 32}), +// [=](nd_item<2> item) [[sycl::reqd_work_group_size(1, 1, 32)]] { +// row_row_m16n16k16(accA, accB, accC, accD, item); +// }); +// }); +// + +SYCL_EXTERNAL [[sycl::reqd_work_group_size(1, 1, 32)]] void +row_row_m16n16k16(sycl::accessor + accA, + sycl::accessor + accB, + sycl::accessor + accC, + sycl::accessor + accD, + nd_item<2> item) { + sycl::sub_group sg = item.get_sub_group(); + + joint_matrix sub_c{}; + joint_matrix sub_a{}; + joint_matrix sub_b{}; + + // CHECK-OPAQUE: tail call { i32, i32, i32, i32, i32, i32, i32, i32 } @llvm.nvvm.wmma.m16n16k16.load.c.row.stride.s32.p1(ptr addrspace(1) %{{.*}}, i32 16) + joint_matrix_load(sg, sub_c, + accC.template get_multi_ptr(), + stride, layout::row_major); + // CHECK-OPAQUE: tail call { i32, i32 } @llvm.nvvm.wmma.m16n16k16.load.a.row.stride.s8.p0(ptr %{{.*}}, i32 16) + joint_matrix_load( + sg, sub_a, accA.template get_multi_ptr(), stride); + // CHECK-OPAQUE: tail call { i32, i32 } @llvm.nvvm.wmma.m16n16k16.load.b.row.stride.s8.p0(ptr %{{.*}}, i32 16) + joint_matrix_load( + sg, sub_b, accB.template get_multi_ptr(), stride); + // CHECK-OPAQUE: tail call { i32, i32, i32, i32, i32, i32, i32, i32 } @llvm.nvvm.wmma.m16n16k16.mma.row.row.s8(i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}) + joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); + // CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m16n16k16.store.d.row.stride.s32.p1(ptr addrspace(1) %{{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 16) + joint_matrix_store(sg, sub_c, + accD.template get_multi_ptr(), + stride, layout::row_major); +} + +SYCL_EXTERNAL [[sycl::reqd_work_group_size(1, 1, 32)]] void 
+col_col_m16n16k16(sycl::accessor + accA, + sycl::accessor + accB, + sycl::accessor + accC, + sycl::accessor + accD, + nd_item<2> item) { + sycl::sub_group sg = item.get_sub_group(); + + joint_matrix sub_c{}; + joint_matrix sub_a{}; + joint_matrix sub_b{}; + + // CHECK-OPAQUE: tail call { i32, i32, i32, i32, i32, i32, i32, i32 } @llvm.nvvm.wmma.m16n16k16.load.c.col.stride.s32.p1(ptr addrspace(1) %{{.*}}, i32 16) + joint_matrix_load(sg, sub_c, + accC.template get_multi_ptr(), + stride, layout::col_major); + // CHECK-OPAQUE: tail call { i32, i32 } @llvm.nvvm.wmma.m16n16k16.load.a.col.stride.s8.p0(ptr %{{.*}}, i32 16) + joint_matrix_load( + sg, sub_a, accA.template get_multi_ptr(), stride); + // CHECK-OPAQUE: tail call { i32, i32 } @llvm.nvvm.wmma.m16n16k16.load.b.col.stride.s8.p0(ptr %{{.*}}, i32 16) + joint_matrix_load( + sg, sub_b, accB.template get_multi_ptr(), stride); + // CHECK-OPAQUE: tail call { i32, i32, i32, i32, i32, i32, i32, i32 } @llvm.nvvm.wmma.m16n16k16.mma.col.col.s8(i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}) + joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); + // CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m16n16k16.store.d.col.stride.s32.p1(ptr addrspace(1) %{{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 16) + joint_matrix_store(sg, sub_c, + accD.template get_multi_ptr(), + stride, layout::col_major); +} + +SYCL_EXTERNAL [[sycl::reqd_work_group_size(1, 1, 32)]] void +row_row_m32n8k16(sycl::accessor + accA, + sycl::accessor + accB, + sycl::accessor + accC, + sycl::accessor + accD, + nd_item<2> item) { + + sycl::sub_group sg = item.get_sub_group(); + + joint_matrix sub_c{}; + joint_matrix sub_a{}; + joint_matrix sub_b{}; + + // CHECK-OPAQUE: tail call { i32, i32, i32, i32, i32, i32, i32, i32 } @llvm.nvvm.wmma.m32n8k16.load.c.row.stride.s32.p1(ptr addrspace(1) %{{.*}}, i32 16) + 
joint_matrix_load(sg, sub_c, + accC.template get_multi_ptr(), + stride, layout::row_major); + // CHECK-OPAQUE: tail call { i32, i32, i32, i32 } @llvm.nvvm.wmma.m32n8k16.load.a.row.stride.s8.p0(ptr %{{.*}}, i32 16) + joint_matrix_load( + sg, sub_a, accA.template get_multi_ptr(), stride); + // CHECK-OPAQUE: tail call i32 @llvm.nvvm.wmma.m32n8k16.load.b.row.stride.s8.p0(ptr %{{.*}}, i32 16) + joint_matrix_load( + sg, sub_b, accB.template get_multi_ptr(), stride); + // CHECK-OPAQUE: tail call { i32, i32, i32, i32, i32, i32, i32, i32 } @llvm.nvvm.wmma.m32n8k16.mma.row.row.s8(i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}) + joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); + // CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m32n8k16.store.d.row.stride.s32.p1(ptr addrspace(1) %{{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 16) + joint_matrix_store(sg, sub_c, + accD.template get_multi_ptr(), + stride, layout::row_major); +} + +SYCL_EXTERNAL [[sycl::reqd_work_group_size(1, 1, 32)]] void +col_col_m32n8k16(sycl::accessor + accA, + sycl::accessor + accB, + sycl::accessor + accC, + sycl::accessor + accD, + nd_item<2> item) { + sycl::sub_group sg = item.get_sub_group(); + + joint_matrix sub_c{}; + joint_matrix sub_a{}; + joint_matrix sub_b{}; + + // CHECK-OPAQUE: tail call { i32, i32, i32, i32, i32, i32, i32, i32 } @llvm.nvvm.wmma.m32n8k16.load.c.col.stride.s32.p1(ptr addrspace(1) %{{.*}}, i32 16) + joint_matrix_load(sg, sub_c, + accC.template get_multi_ptr(), + stride, layout::col_major); + // CHECK-OPAQUE: tail call { i32, i32, i32, i32 } @llvm.nvvm.wmma.m32n8k16.load.a.col.stride.s8.p0(ptr %{{.*}}, i32 16) + joint_matrix_load( + sg, sub_a, accA.template get_multi_ptr(), stride); + // CHECK-OPAQUE: tail call i32 @llvm.nvvm.wmma.m32n8k16.load.b.col.stride.s8.p0(ptr %{{.*}}, i32 16) + joint_matrix_load( + sg, 
sub_b, accB.template get_multi_ptr(), stride); + // CHECK-OPAQUE: tail call { i32, i32, i32, i32, i32, i32, i32, i32 } @llvm.nvvm.wmma.m32n8k16.mma.col.col.s8(i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}) + joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); + // CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m32n8k16.store.d.col.stride.s32.p1(ptr addrspace(1) %{{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 16) + joint_matrix_store(sg, sub_c, + accD.template get_multi_ptr(), + stride, layout::col_major); +} + +SYCL_EXTERNAL [[sycl::reqd_work_group_size(1, 1, 32)]] void +row_row_m8n32k16(sycl::accessor + accA, + sycl::accessor + accB, + sycl::accessor + accC, + sycl::accessor + accD, + nd_item<2> item) { + sycl::sub_group sg = item.get_sub_group(); + + joint_matrix sub_c{}; + joint_matrix sub_a{}; + joint_matrix sub_b{}; + + // CHECK-OPAQUE: tail call { i32, i32, i32, i32, i32, i32, i32, i32 } @llvm.nvvm.wmma.m8n32k16.load.c.row.stride.s32.p1(ptr addrspace(1) %{{.*}}, i32 16) + joint_matrix_load(sg, sub_c, + accC.template get_multi_ptr(), + stride, layout::row_major); + // CHECK-OPAQUE: tail call i32 @llvm.nvvm.wmma.m8n32k16.load.a.row.stride.s8.p0(ptr %{{.*}}, i32 16) + joint_matrix_load( + sg, sub_a, accA.template get_multi_ptr(), stride); + // CHECK-OPAQUE: tail call { i32, i32, i32, i32 } @llvm.nvvm.wmma.m8n32k16.load.b.row.stride.s8.p0(ptr %{{.*}}, i32 16) + joint_matrix_load( + sg, sub_b, accB.template get_multi_ptr(), stride); + // CHECK-OPAQUE: tail call { i32, i32, i32, i32, i32, i32, i32, i32 } @llvm.nvvm.wmma.m8n32k16.mma.row.row.s8(i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}) + joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); + // CHECK-OPAQUE: tail call void 
@llvm.nvvm.wmma.m8n32k16.store.d.row.stride.s32.p1(ptr addrspace(1) %{{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 16) + joint_matrix_store(sg, sub_c, + accD.template get_multi_ptr(), + stride, layout::row_major); +} + +SYCL_EXTERNAL [[sycl::reqd_work_group_size(1, 1, 32)]] void +col_col_m8n32k16(sycl::accessor + accA, + sycl::accessor + accB, + sycl::accessor + accC, + sycl::accessor + accD, + nd_item<2> item) { + sycl::sub_group sg = item.get_sub_group(); + + joint_matrix sub_c{}; + joint_matrix sub_a{}; + joint_matrix sub_b{}; + + // CHECK-OPAQUE: tail call { i32, i32, i32, i32, i32, i32, i32, i32 } @llvm.nvvm.wmma.m8n32k16.load.c.col.stride.s32.p1(ptr addrspace(1) %{{.*}}, i32 16) + joint_matrix_load(sg, sub_c, + accC.template get_multi_ptr(), + stride, layout::col_major); + // CHECK-OPAQUE: tail call i32 @llvm.nvvm.wmma.m8n32k16.load.a.col.stride.s8.p0(ptr %{{.*}}, i32 16) + joint_matrix_load( + sg, sub_a, accA.template get_multi_ptr(), stride); + // CHECK-OPAQUE: tail call { i32, i32, i32, i32 } @llvm.nvvm.wmma.m8n32k16.load.b.col.stride.s8.p0(ptr %{{.*}}, i32 16) + joint_matrix_load( + sg, sub_b, accB.template get_multi_ptr(), stride); + // CHECK-OPAQUE: tail call { i32, i32, i32, i32, i32, i32, i32, i32 } @llvm.nvvm.wmma.m8n32k16.mma.col.col.s8(i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}) + joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); + // CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m8n32k16.store.d.col.stride.s32.p1(ptr addrspace(1) %{{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 16) + joint_matrix_store(sg, sub_c, + accD.template get_multi_ptr(), + stride, layout::col_major); +} diff --git a/sycl/test/check_device_code/cuda/matrix/matrix-nvptx-tf32-test.cpp 
b/sycl/test/check_device_code/cuda/matrix/matrix-nvptx-tf32-test.cpp index e9200d930de46..d1298c6c3f862 100644 --- a/sycl/test/check_device_code/cuda/matrix/matrix-nvptx-tf32-test.cpp +++ b/sycl/test/check_device_code/cuda/matrix/matrix-nvptx-tf32-test.cpp @@ -32,104 +32,105 @@ constexpr int N = 16; // number of cols of accumulator, // number of rows of a. constexpr int K = 8; // number of cols of a/number of rows of b. -// float is used in this test as the storage type for tf32 -float A[M * K]; -float B[K * N]; -float C[M * N]; -float D[M * N]; - -int main() { - - buffer bufA(A, range<1>(M * K)); // will be used as tf32 - buffer bufB(B, range<1>(K * N)); // will be used as tf32 - buffer bufC(C, range<1>(M * N)); - buffer bufD(D, range<1>(M * N)); - - queue q; - - q.submit([&](handler &cgh) { - auto accA = bufA.get_access(cgh); - auto accB = bufB.get_access(cgh); - auto accC = bufC.get_access(cgh); - auto accD = bufD.get_access(cgh); - - cgh.parallel_for( - nd_range<2>({1, 32}, {1, 32}), - [=](nd_item<2> item) [[sycl::reqd_work_group_size(1, 1, 32)]] { - sycl::sub_group sg = item.get_sub_group(); - - joint_matrix - sub_a{}; - joint_matrix - sub_b{}; - joint_matrix sub_c{}; - - //CHECK-OPAQUE: tail call { i32, i32, i32, i32 } @llvm.nvvm.wmma.m16n16k8.load.a.row.stride.tf32.p0(ptr %{{.*}}, i32 8) - joint_matrix_load( - sg, sub_a, accA.template get_multi_ptr(), - K); - //CHECK-OPAQUE: tail call { i32, i32, i32, i32 } @llvm.nvvm.wmma.m16n16k8.load.b.row.stride.tf32.p0(ptr %{{.*}}, i32 16) - joint_matrix_load( - sg, sub_b, accB.template get_multi_ptr(), - N); - //CHECK-OPAQUE: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m16n16k16.load.c.row.stride.f32.p1(ptr addrspace(1) %{{.*}}, i32 16) - joint_matrix_load( - sg, sub_c, accC.template get_multi_ptr(), - N, layout::row_major); - - auto round_lambda = [](auto &x) { x = round_to_tf32(x); }; - //CHECK-OPAQUE: tail call i32 @llvm.nvvm.f2tf32.rna(float %{{.*}}) - joint_matrix_apply(sg, 
sub_a, round_lambda); - - joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); - //CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m16n16k16.store.d.row.stride.f32.p1(ptr addrspace(1) {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, i32 {{.*}} - joint_matrix_store( - sg, sub_c, accD.template get_multi_ptr(), - N, layout::row_major); - }); - }); - - q.submit([&](handler &cgh) { - auto accA = bufA.get_access(cgh); - auto accB = bufB.get_access(cgh); - auto accC = bufC.get_access(cgh); - auto accD = bufD.get_access(cgh); - - cgh.parallel_for( - nd_range<2>({1, 32}, {1, 32}), - [=](nd_item<2> item) [[sycl::reqd_work_group_size(1, 1, 32)]] { - sycl::sub_group sg = item.get_sub_group(); - - joint_matrix - sub_a{}; - joint_matrix - sub_b{}; - joint_matrix sub_c{}; - - //CHECK-OPAQUE: tail call { i32, i32, i32, i32 } @llvm.nvvm.wmma.m16n16k8.load.a.col.stride.tf32.p0(ptr %{{.*}}, i32 8) - joint_matrix_load( - sg, sub_a, accA.template get_multi_ptr(), - K); - //CHECK-OPAQUE: tail call { i32, i32, i32, i32 } @llvm.nvvm.wmma.m16n16k8.load.b.col.stride.tf32.p0(ptr %{{.*}}, i32 16) - joint_matrix_load( - sg, sub_b, accB.template get_multi_ptr(), - N); - //CHECK-OPAQUE: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m16n16k16.load.c.col.stride.f32.p1(ptr addrspace(1) {{.*}}, i32 {{.*}}) - joint_matrix_load( - sg, sub_c, accC.template get_multi_ptr(), - N, layout::col_major); - - joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); - //CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m16n16k16.store.d.col.stride.f32.p1(ptr addrspace(1) {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, i32 16) - joint_matrix_store( - sg, sub_c, accD.template get_multi_ptr(), - N, layout::col_major); - }); - }); - - return 0; -}; +// Float is used in this test as the storage type for tf32: +// +// float A[M * K]; +// float B[K * N]; 
+// float C[M * N]; +// float D[M * N]; +// +// Accessors would have been made, like so: +// +// buffer bufA(A, range<1>(M * K)); // will be used as tf32 +// buffer bufB(B, range<1>(K * N)); // will be used as tf32 +// buffer bufC(C, range<1>(M * N)); +// buffer bufD(D, range<1>(M * N)); +// ... +// auto accA = bufA.get_access(handler); +// auto accB = bufB.get_access(handler); +// auto accC = bufC.get_access(handler); +// auto accD = bufD.get_access(handler); + +SYCL_EXTERNAL [[sycl::reqd_work_group_size(1, 1, 32)]] void +row_row(sycl::accessor + accA, + sycl::accessor + accB, + sycl::accessor + accC, + sycl::accessor + accD, + nd_item<2> item) { + sycl::sub_group sg = item.get_sub_group(); + + joint_matrix + sub_a{}; + joint_matrix + sub_b{}; + joint_matrix sub_c{}; + + //CHECK-OPAQUE: tail call { i32, i32, i32, i32 } @llvm.nvvm.wmma.m16n16k8.load.a.row.stride.tf32.p0(ptr %{{.*}}, i32 8) + joint_matrix_load(sg, sub_a, + accA.template get_multi_ptr(), K); + //CHECK-OPAQUE: tail call { i32, i32, i32, i32 } @llvm.nvvm.wmma.m16n16k8.load.b.row.stride.tf32.p0(ptr %{{.*}}, i32 16) + joint_matrix_load(sg, sub_b, + accB.template get_multi_ptr(), N); + //CHECK-OPAQUE: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m16n16k16.load.c.row.stride.f32.p1(ptr addrspace(1) %{{.*}}, i32 16) + joint_matrix_load(sg, sub_c, + accC.template get_multi_ptr(), N, + layout::row_major); + + auto round_lambda = [](auto &x) { x = round_to_tf32(x); }; + //CHECK-OPAQUE: tail call i32 @llvm.nvvm.f2tf32.rna(float %{{.*}}) + joint_matrix_apply(sg, sub_a, round_lambda); + + joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); + //CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m16n16k16.store.d.row.stride.f32.p1(ptr addrspace(1) {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, i32 {{.*}} + joint_matrix_store(sg, sub_c, + accD.template get_multi_ptr(), N, + layout::row_major); +} + +SYCL_EXTERNAL 
[[sycl::reqd_work_group_size(1, 1, 32)]] void +col_col(sycl::accessor + accA, + sycl::accessor + accB, + sycl::accessor + accC, + sycl::accessor + accD, + nd_item<2> item) { + sycl::sub_group sg = item.get_sub_group(); + + joint_matrix + sub_a{}; + joint_matrix + sub_b{}; + joint_matrix sub_c{}; + + //CHECK-OPAQUE: tail call { i32, i32, i32, i32 } @llvm.nvvm.wmma.m16n16k8.load.a.col.stride.tf32.p0(ptr %{{.*}}, i32 8) + joint_matrix_load(sg, sub_a, + accA.template get_multi_ptr(), K); + //CHECK-OPAQUE: tail call { i32, i32, i32, i32 } @llvm.nvvm.wmma.m16n16k8.load.b.col.stride.tf32.p0(ptr %{{.*}}, i32 16) + joint_matrix_load(sg, sub_b, + accB.template get_multi_ptr(), N); + //CHECK-OPAQUE: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m16n16k16.load.c.col.stride.f32.p1(ptr addrspace(1) {{.*}}, i32 {{.*}}) + joint_matrix_load(sg, sub_c, + accC.template get_multi_ptr(), N, + layout::col_major); + + joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); + //CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m16n16k16.store.d.col.stride.f32.p1(ptr addrspace(1) {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, i32 16) + joint_matrix_store(sg, sub_c, + accD.template get_multi_ptr(), N, + layout::col_major); +} diff --git a/sycl/test/check_device_code/cuda/matrix/matrix-nvptx-uint8-test.cpp b/sycl/test/check_device_code/cuda/matrix/matrix-nvptx-uint8-test.cpp index 67d0dd5ea4728..2a6dfd700fd0a 100644 --- a/sycl/test/check_device_code/cuda/matrix/matrix-nvptx-uint8-test.cpp +++ b/sycl/test/check_device_code/cuda/matrix/matrix-nvptx-uint8-test.cpp @@ -9,215 +9,231 @@ using namespace sycl::ext::oneapi::experimental::matrix; constexpr int stride = 16; -int main() { - - buffer bufA(nullptr, range<1>(1)); - buffer bufB(nullptr, range<1>(1)); - buffer bufC(nullptr, range<1>(1)); - buffer bufD(nullptr, range<1>(1)); - - queue q; - - q.submit([&](handler &cgh) { - sycl::accessor - 
accA(bufA, cgh); - sycl::accessor - accB(bufB, cgh); - sycl::accessor - accC(bufC, cgh); - sycl::accessor - accD(bufD, cgh); - - cgh.parallel_for( - nd_range<2>({1, 32}, {1, 32}), - [=](nd_item<2> item) [[sycl::reqd_work_group_size(1, 1, 32)]] { - sycl::sub_group sg = item.get_sub_group(); - - joint_matrix sub_c{}; - joint_matrix - sub_a{}; - joint_matrix - sub_b{}; - - // CHECK-OPAQUE: tail call { i32, i32, i32, i32, i32, i32, i32, i32 } @llvm.nvvm.wmma.m16n16k16.load.c.row.stride.s32.p1(ptr addrspace(1) %{{.*}}, i32 16) - joint_matrix_load( - sg, sub_c, accC.template get_multi_ptr(), - stride, layout::row_major); - // CHECK-OPAQUE: tail call { i32, i32 } @llvm.nvvm.wmma.m16n16k16.load.a.row.stride.u8.p0(ptr %{{.*}}, i32 16) - joint_matrix_load( - sg, sub_a, accA.template get_multi_ptr(), - stride); - // CHECK-OPAQUE: tail call { i32, i32 } @llvm.nvvm.wmma.m16n16k16.load.b.row.stride.u8.p0(ptr %{{.*}}, i32 16) - joint_matrix_load( - sg, sub_b, accB.template get_multi_ptr(), - stride); - // CHECK-OPAQUE: tail call { i32, i32, i32, i32, i32, i32, i32, i32 } @llvm.nvvm.wmma.m16n16k16.mma.row.row.u8(i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}) - joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); - // CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m16n16k16.store.d.row.stride.s32.p1(ptr addrspace(1) %{{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 16) - joint_matrix_store( - sg, sub_c, accD.template get_multi_ptr(), - stride, layout::row_major); - }); - - cgh.parallel_for( - nd_range<2>({1, 32}, {1, 32}), - [=](nd_item<2> item) [[sycl::reqd_work_group_size(1, 1, 32)]] { - sycl::sub_group sg = item.get_sub_group(); - - joint_matrix sub_c{}; - joint_matrix - sub_a{}; - joint_matrix - sub_b{}; - - // CHECK-OPAQUE: tail call { i32, i32, i32, i32, i32, i32, i32, i32 } 
@llvm.nvvm.wmma.m16n16k16.load.c.col.stride.s32.p1(ptr addrspace(1) %{{.*}}, i32 16) - joint_matrix_load( - sg, sub_c, accC.template get_multi_ptr(), - stride, layout::col_major); - // CHECK-OPAQUE: tail call { i32, i32 } @llvm.nvvm.wmma.m16n16k16.load.a.col.stride.u8.p0(ptr %{{.*}}, i32 16) - joint_matrix_load( - sg, sub_a, accA.template get_multi_ptr(), - stride); - // CHECK-OPAQUE: tail call { i32, i32 } @llvm.nvvm.wmma.m16n16k16.load.b.col.stride.u8.p0(ptr %{{.*}}, i32 16) - joint_matrix_load( - sg, sub_b, accB.template get_multi_ptr(), - stride); - // CHECK-OPAQUE: tail call { i32, i32, i32, i32, i32, i32, i32, i32 } @llvm.nvvm.wmma.m16n16k16.mma.col.col.u8(i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}) - joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); - // CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m16n16k16.store.d.col.stride.s32.p1(ptr addrspace(1) %{{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 16) - joint_matrix_store( - sg, sub_c, accD.template get_multi_ptr(), - stride, layout::col_major); - }); - - cgh.parallel_for( - nd_range<2>({1, 32}, {1, 32}), - [=](nd_item<2> item) [[sycl::reqd_work_group_size(1, 1, 32)]] { - sycl::sub_group sg = item.get_sub_group(); - - joint_matrix sub_c{}; - joint_matrix - sub_a{}; - joint_matrix - sub_b{}; - - // CHECK-OPAQUE: tail call { i32, i32, i32, i32, i32, i32, i32, i32 } @llvm.nvvm.wmma.m32n8k16.load.c.row.stride.s32.p1(ptr addrspace(1) %{{.*}}, i32 16) - joint_matrix_load( - sg, sub_c, accC.template get_multi_ptr(), - stride, layout::row_major); - // CHECK-OPAQUE: tail call { i32, i32, i32, i32 } @llvm.nvvm.wmma.m32n8k16.load.a.row.stride.u8.p0(ptr %{{.*}}, i32 16) - joint_matrix_load( - sg, sub_a, accA.template get_multi_ptr(), - stride); - // CHECK-OPAQUE: tail call i32 @llvm.nvvm.wmma.m32n8k16.load.b.row.stride.u8.p0(ptr %{{.*}}, i32 16) - 
joint_matrix_load( - sg, sub_b, accB.template get_multi_ptr(), - stride); - // CHECK-OPAQUE: tail call { i32, i32, i32, i32, i32, i32, i32, i32 } @llvm.nvvm.wmma.m32n8k16.mma.row.row.u8(i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}) - joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); - // CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m32n8k16.store.d.row.stride.s32.p1(ptr addrspace(1) %{{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 16) - joint_matrix_store( - sg, sub_c, accD.template get_multi_ptr(), - stride, layout::row_major); - }); - - cgh.parallel_for( - nd_range<2>({1, 32}, {1, 32}), - [=](nd_item<2> item) [[sycl::reqd_work_group_size(1, 1, 32)]] { - sycl::sub_group sg = item.get_sub_group(); - - joint_matrix sub_c{}; - joint_matrix - sub_a{}; - joint_matrix - sub_b{}; - - // CHECK-OPAQUE: tail call { i32, i32, i32, i32, i32, i32, i32, i32 } @llvm.nvvm.wmma.m32n8k16.load.c.col.stride.s32.p1(ptr addrspace(1) %{{.*}}, i32 16) - joint_matrix_load( - sg, sub_c, accC.template get_multi_ptr(), - stride, layout::col_major); - // CHECK-OPAQUE: tail call { i32, i32, i32, i32 } @llvm.nvvm.wmma.m32n8k16.load.a.col.stride.u8.p0(ptr %{{.*}}, i32 16) - joint_matrix_load( - sg, sub_a, accA.template get_multi_ptr(), - stride); - // CHECK-OPAQUE: tail call i32 @llvm.nvvm.wmma.m32n8k16.load.b.col.stride.u8.p0(ptr %{{.*}}, i32 16) - joint_matrix_load( - sg, sub_b, accB.template get_multi_ptr(), - stride); - // CHECK-OPAQUE: tail call { i32, i32, i32, i32, i32, i32, i32, i32 } @llvm.nvvm.wmma.m32n8k16.mma.col.col.u8(i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}) - joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); - // CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m32n8k16.store.d.col.stride.s32.p1(ptr 
addrspace(1) %{{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 16) - joint_matrix_store( - sg, sub_c, accD.template get_multi_ptr(), - stride, layout::col_major); - }); - - cgh.parallel_for( - nd_range<2>({1, 32}, {1, 32}), - [=](nd_item<2> item) [[sycl::reqd_work_group_size(1, 1, 32)]] { - sycl::sub_group sg = item.get_sub_group(); - - joint_matrix sub_c{}; - joint_matrix - sub_a{}; - joint_matrix - sub_b{}; - - // CHECK-OPAQUE: tail call { i32, i32, i32, i32, i32, i32, i32, i32 } @llvm.nvvm.wmma.m8n32k16.load.c.row.stride.s32.p1(ptr addrspace(1) %{{.*}}, i32 16) - joint_matrix_load( - sg, sub_c, accC.template get_multi_ptr(), - stride, layout::row_major); - // CHECK-OPAQUE: tail call i32 @llvm.nvvm.wmma.m8n32k16.load.a.row.stride.u8.p0(ptr %{{.*}}, i32 16) - joint_matrix_load( - sg, sub_a, accA.template get_multi_ptr(), - stride); - // CHECK-OPAQUE: tail call { i32, i32, i32, i32 } @llvm.nvvm.wmma.m8n32k16.load.b.row.stride.u8.p0(ptr %{{.*}}, i32 16) - joint_matrix_load( - sg, sub_b, accB.template get_multi_ptr(), - stride); - // CHECK-OPAQUE: tail call { i32, i32, i32, i32, i32, i32, i32, i32 } @llvm.nvvm.wmma.m8n32k16.mma.row.row.u8(i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}) - joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); - // CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m8n32k16.store.d.row.stride.s32.p1(ptr addrspace(1) %{{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 16) - joint_matrix_store( - sg, sub_c, accD.template get_multi_ptr(), - stride, layout::row_major); - }); - - cgh.parallel_for( - nd_range<2>({1, 32}, {1, 32}), - [=](nd_item<2> item) [[sycl::reqd_work_group_size(1, 1, 32)]] { - sycl::sub_group sg = item.get_sub_group(); - - joint_matrix sub_c{}; - joint_matrix - sub_a{}; - joint_matrix - sub_b{}; - - // 
CHECK-OPAQUE: tail call { i32, i32, i32, i32, i32, i32, i32, i32 } @llvm.nvvm.wmma.m8n32k16.load.c.col.stride.s32.p1(ptr addrspace(1) %{{.*}}, i32 16) - joint_matrix_load( - sg, sub_c, accC.template get_multi_ptr(), - stride, layout::col_major); - // CHECK-OPAQUE: tail call i32 @llvm.nvvm.wmma.m8n32k16.load.a.col.stride.u8.p0(ptr %{{.*}}, i32 16) - joint_matrix_load( - sg, sub_a, accA.template get_multi_ptr(), - stride); - // CHECK-OPAQUE: tail call { i32, i32, i32, i32 } @llvm.nvvm.wmma.m8n32k16.load.b.col.stride.u8.p0(ptr %{{.*}}, i32 16) - joint_matrix_load( - sg, sub_b, accB.template get_multi_ptr(), - stride); - // CHECK-OPAQUE: tail call { i32, i32, i32, i32, i32, i32, i32, i32 } @llvm.nvvm.wmma.m8n32k16.mma.col.col.u8(i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}) - joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); - // CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m8n32k16.store.d.col.stride.s32.p1(ptr addrspace(1) %{{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 16) - joint_matrix_store( - sg, sub_c, accD.template get_multi_ptr(), - stride, layout::col_major); - }); - }); - - return 0; -}; +SYCL_EXTERNAL [[sycl::reqd_work_group_size(1, 1, 32)]] void +row_row_m16n16k16(sycl::accessor + accA, + sycl::accessor + accB, + sycl::accessor + accC, + sycl::accessor + accD, + nd_item<2> item) { + sycl::sub_group sg = item.get_sub_group(); + + joint_matrix sub_c{}; + joint_matrix sub_a{}; + joint_matrix sub_b{}; + + // CHECK-OPAQUE: tail call { i32, i32, i32, i32, i32, i32, i32, i32 } @llvm.nvvm.wmma.m16n16k16.load.c.row.stride.s32.p1(ptr addrspace(1) %{{.*}}, i32 16) + joint_matrix_load(sg, sub_c, + accC.template get_multi_ptr(), + stride, layout::row_major); + // CHECK-OPAQUE: tail call { i32, i32 } @llvm.nvvm.wmma.m16n16k16.load.a.row.stride.u8.p0(ptr %{{.*}}, i32 16) + joint_matrix_load( + sg, 
sub_a, accA.template get_multi_ptr(), stride); + // CHECK-OPAQUE: tail call { i32, i32 } @llvm.nvvm.wmma.m16n16k16.load.b.row.stride.u8.p0(ptr %{{.*}}, i32 16) + joint_matrix_load( + sg, sub_b, accB.template get_multi_ptr(), stride); + // CHECK-OPAQUE: tail call { i32, i32, i32, i32, i32, i32, i32, i32 } @llvm.nvvm.wmma.m16n16k16.mma.row.row.u8(i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}) + joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); + // CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m16n16k16.store.d.row.stride.s32.p1(ptr addrspace(1) %{{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 16) + joint_matrix_store(sg, sub_c, + accD.template get_multi_ptr(), + stride, layout::row_major); +} + +SYCL_EXTERNAL [[sycl::reqd_work_group_size(1, 1, 32)]] void +col_col_m16n16k16(sycl::accessor + accA, + sycl::accessor + accB, + sycl::accessor + accC, + sycl::accessor + accD, + nd_item<2> item) { + sycl::sub_group sg = item.get_sub_group(); + + joint_matrix sub_c{}; + joint_matrix sub_a{}; + joint_matrix sub_b{}; + + // CHECK-OPAQUE: tail call { i32, i32, i32, i32, i32, i32, i32, i32 } @llvm.nvvm.wmma.m16n16k16.load.c.col.stride.s32.p1(ptr addrspace(1) %{{.*}}, i32 16) + joint_matrix_load(sg, sub_c, + accC.template get_multi_ptr(), + stride, layout::col_major); + // CHECK-OPAQUE: tail call { i32, i32 } @llvm.nvvm.wmma.m16n16k16.load.a.col.stride.u8.p0(ptr %{{.*}}, i32 16) + joint_matrix_load( + sg, sub_a, accA.template get_multi_ptr(), stride); + // CHECK-OPAQUE: tail call { i32, i32 } @llvm.nvvm.wmma.m16n16k16.load.b.col.stride.u8.p0(ptr %{{.*}}, i32 16) + joint_matrix_load( + sg, sub_b, accB.template get_multi_ptr(), stride); + // CHECK-OPAQUE: tail call { i32, i32, i32, i32, i32, i32, i32, i32 } @llvm.nvvm.wmma.m16n16k16.mma.col.col.u8(i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 
{{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}) + joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); + // CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m16n16k16.store.d.col.stride.s32.p1(ptr addrspace(1) %{{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 16) + joint_matrix_store(sg, sub_c, + accD.template get_multi_ptr(), + stride, layout::col_major); +} + +SYCL_EXTERNAL [[sycl::reqd_work_group_size(1, 1, 32)]] void +row_row_m32n8k16(sycl::accessor + accA, + sycl::accessor + accB, + sycl::accessor + accC, + sycl::accessor + accD, + nd_item<2> item) { + sycl::sub_group sg = item.get_sub_group(); + + joint_matrix sub_c{}; + joint_matrix sub_a{}; + joint_matrix sub_b{}; + + // CHECK-OPAQUE: tail call { i32, i32, i32, i32, i32, i32, i32, i32 } @llvm.nvvm.wmma.m32n8k16.load.c.row.stride.s32.p1(ptr addrspace(1) %{{.*}}, i32 16) + joint_matrix_load(sg, sub_c, + accC.template get_multi_ptr(), + stride, layout::row_major); + // CHECK-OPAQUE: tail call { i32, i32, i32, i32 } @llvm.nvvm.wmma.m32n8k16.load.a.row.stride.u8.p0(ptr %{{.*}}, i32 16) + joint_matrix_load( + sg, sub_a, accA.template get_multi_ptr(), stride); + // CHECK-OPAQUE: tail call i32 @llvm.nvvm.wmma.m32n8k16.load.b.row.stride.u8.p0(ptr %{{.*}}, i32 16) + joint_matrix_load( + sg, sub_b, accB.template get_multi_ptr(), stride); + // CHECK-OPAQUE: tail call { i32, i32, i32, i32, i32, i32, i32, i32 } @llvm.nvvm.wmma.m32n8k16.mma.row.row.u8(i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}) + joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); + // CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m32n8k16.store.d.row.stride.s32.p1(ptr addrspace(1) %{{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 16) + joint_matrix_store(sg, sub_c, + accD.template get_multi_ptr(), + stride, 
layout::row_major); +} + +SYCL_EXTERNAL [[sycl::reqd_work_group_size(1, 1, 32)]] void +col_col_m32n8k16(sycl::accessor + accA, + sycl::accessor + accB, + sycl::accessor + accC, + sycl::accessor + accD, + nd_item<2> item) { + + sycl::sub_group sg = item.get_sub_group(); + + joint_matrix sub_c{}; + joint_matrix sub_a{}; + joint_matrix sub_b{}; + + // CHECK-OPAQUE: tail call { i32, i32, i32, i32, i32, i32, i32, i32 } @llvm.nvvm.wmma.m32n8k16.load.c.col.stride.s32.p1(ptr addrspace(1) %{{.*}}, i32 16) + joint_matrix_load(sg, sub_c, + accC.template get_multi_ptr(), + stride, layout::col_major); + // CHECK-OPAQUE: tail call { i32, i32, i32, i32 } @llvm.nvvm.wmma.m32n8k16.load.a.col.stride.u8.p0(ptr %{{.*}}, i32 16) + joint_matrix_load( + sg, sub_a, accA.template get_multi_ptr(), stride); + // CHECK-OPAQUE: tail call i32 @llvm.nvvm.wmma.m32n8k16.load.b.col.stride.u8.p0(ptr %{{.*}}, i32 16) + joint_matrix_load( + sg, sub_b, accB.template get_multi_ptr(), stride); + // CHECK-OPAQUE: tail call { i32, i32, i32, i32, i32, i32, i32, i32 } @llvm.nvvm.wmma.m32n8k16.mma.col.col.u8(i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}) + joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); + // CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m32n8k16.store.d.col.stride.s32.p1(ptr addrspace(1) %{{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 16) + joint_matrix_store(sg, sub_c, + accD.template get_multi_ptr(), + stride, layout::col_major); +} + +SYCL_EXTERNAL [[sycl::reqd_work_group_size(1, 1, 32)]] void +row_row_m8n32k16(sycl::accessor + accA, + sycl::accessor + accB, + sycl::accessor + accC, + sycl::accessor + accD, + nd_item<2> item) { + sycl::sub_group sg = item.get_sub_group(); + + joint_matrix sub_c{}; + joint_matrix sub_a{}; + joint_matrix sub_b{}; + + // CHECK-OPAQUE: tail call { i32, i32, i32, i32, i32, i32, i32, i32 } 
@llvm.nvvm.wmma.m8n32k16.load.c.row.stride.s32.p1(ptr addrspace(1) %{{.*}}, i32 16) + joint_matrix_load(sg, sub_c, + accC.template get_multi_ptr(), + stride, layout::row_major); + // CHECK-OPAQUE: tail call i32 @llvm.nvvm.wmma.m8n32k16.load.a.row.stride.u8.p0(ptr %{{.*}}, i32 16) + joint_matrix_load( + sg, sub_a, accA.template get_multi_ptr(), stride); + // CHECK-OPAQUE: tail call { i32, i32, i32, i32 } @llvm.nvvm.wmma.m8n32k16.load.b.row.stride.u8.p0(ptr %{{.*}}, i32 16) + joint_matrix_load( + sg, sub_b, accB.template get_multi_ptr(), stride); + // CHECK-OPAQUE: tail call { i32, i32, i32, i32, i32, i32, i32, i32 } @llvm.nvvm.wmma.m8n32k16.mma.row.row.u8(i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}) + joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); + // CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m8n32k16.store.d.row.stride.s32.p1(ptr addrspace(1) %{{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 16) + joint_matrix_store(sg, sub_c, + accD.template get_multi_ptr(), + stride, layout::row_major); +} + +SYCL_EXTERNAL [[sycl::reqd_work_group_size(1, 1, 32)]] void +col_col_m8n32k16(sycl::accessor + accA, + sycl::accessor + accB, + sycl::accessor + accC, + sycl::accessor + accD, + nd_item<2> item) { + sycl::sub_group sg = item.get_sub_group(); + + joint_matrix sub_c{}; + joint_matrix sub_a{}; + joint_matrix sub_b{}; + + // CHECK-OPAQUE: tail call { i32, i32, i32, i32, i32, i32, i32, i32 } @llvm.nvvm.wmma.m8n32k16.load.c.col.stride.s32.p1(ptr addrspace(1) %{{.*}}, i32 16) + joint_matrix_load(sg, sub_c, + accC.template get_multi_ptr(), + stride, layout::col_major); + // CHECK-OPAQUE: tail call i32 @llvm.nvvm.wmma.m8n32k16.load.a.col.stride.u8.p0(ptr %{{.*}}, i32 16) + joint_matrix_load( + sg, sub_a, accA.template get_multi_ptr(), stride); + // CHECK-OPAQUE: tail call { i32, i32, i32, i32 } 
@llvm.nvvm.wmma.m8n32k16.load.b.col.stride.u8.p0(ptr %{{.*}}, i32 16) + joint_matrix_load( + sg, sub_b, accB.template get_multi_ptr(), stride); + // CHECK-OPAQUE: tail call { i32, i32, i32, i32, i32, i32, i32, i32 } @llvm.nvvm.wmma.m8n32k16.mma.col.col.u8(i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}) + joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); + // CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m8n32k16.store.d.col.stride.s32.p1(ptr addrspace(1) %{{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 16) + joint_matrix_store(sg, sub_c, + accD.template get_multi_ptr(), + stride, layout::col_major); +} diff --git a/sycl/test/check_device_code/math-builtins/native-math-cuda.cpp b/sycl/test/check_device_code/math-builtins/native-math-cuda.cpp index 37253e48e6554..cb6f09e201d92 100644 --- a/sycl/test/check_device_code/math-builtins/native-math-cuda.cpp +++ b/sycl/test/check_device_code/math-builtins/native-math-cuda.cpp @@ -6,64 +6,47 @@ using namespace sycl; -int main() { - - queue q; - - float input[2]; - float res[13]; - { - buffer input_buff(&input[0], range<1>(2)); - buffer res_buff(&res[0], range<1>(13)); - q.submit([&](handler &cgh) { - accessor res_acc(res_buff, - cgh); - accessor input_acc( - input_buff, cgh); - cgh.single_task([=]() { - // CHECK: tail call noundef float @llvm.nvvm.cos.approx.f - res_acc[0] = sycl::native::cos(input_acc[0]); - // CHECK: tail call noundef float @llvm.nvvm.sin.approx.f - res_acc[1] = sycl::native::sin(input_acc[0]); - // CHECK: tail call noundef float @llvm.nvvm.ex2.approx.f - res_acc[2] = sycl::native::exp2(input_acc[0]); - // CHECK: tail call noundef float @llvm.nvvm.lg2.approx.f - res_acc[3] = sycl::native::log2(input_acc[0]); - // CHECK: tail call noundef float @llvm.nvvm.rsqrt.approx.f - res_acc[4] = sycl::native::rsqrt(input_acc[0]); - // CHECK: tail call 
noundef float @llvm.nvvm.sqrt.approx.f - res_acc[5] = sycl::native::sqrt(input_acc[0]); - // CHECK: tail call noundef float @llvm.nvvm.rcp.approx.f - res_acc[6] = sycl::native::recip(input_acc[0]); - // CHECK: tail call noundef float @llvm.nvvm.div.approx.f - res_acc[7] = sycl::native::divide(input_acc[0], input_acc[1]); - - // Functions that use the above builtins: - - // CHECK: tail call float @llvm.nvvm.sin.approx.f - // CHECK: tail call float @llvm.nvvm.cos.approx.f - // CHECK: tail call noundef float @llvm.nvvm.div.approx.f - res_acc[8] = sycl::native::tan(input_acc[0]); - // CHECK: fmul float {{.*}}, 0x3FF7154760000000 - // CHECK: tail call noundef float @llvm.nvvm.ex2.approx.f - res_acc[9] = sycl::native::exp(input_acc[0]); - // CHECK: fmul float {{.*}}, 0x400A934F00000000 - // CHECK: tail call noundef float @llvm.nvvm.ex2.approx.f - res_acc[10] = sycl::native::exp10(input_acc[0]); - // CHECK: tail call float @llvm.nvvm.lg2.approx.f - // CHECK: fmul float {{.*}}, 0x3FE62E4300000000 - res_acc[11] = sycl::native::log(input_acc[0]); - // CHECK: tail call float @llvm.nvvm.lg2.approx.f - // CHECK: fmul float {{.*}}, 0x3FD3441360000000 - res_acc[12] = sycl::native::log10(input_acc[0]); - - // CHECK: tail call float @llvm.nvvm.lg2.approx.f - // CHECK: fmul float {{.*}}, {{.*}} - // CHECK: tail call noundef float @llvm.nvvm.ex2.approx.f - res_acc[13] = sycl::native::powr(input_acc[0], input_acc[1]); - }); - }); - } - - return 0; +SYCL_EXTERNAL void native_math_cuda( + accessor res_acc, + accessor input_acc) { + // CHECK: tail call noundef float @llvm.nvvm.cos.approx.f + res_acc[0] = sycl::native::cos(input_acc[0]); + // CHECK: tail call noundef float @llvm.nvvm.sin.approx.f + res_acc[1] = sycl::native::sin(input_acc[0]); + // CHECK: tail call noundef float @llvm.nvvm.ex2.approx.f + res_acc[2] = sycl::native::exp2(input_acc[0]); + // CHECK: tail call noundef float @llvm.nvvm.lg2.approx.f + res_acc[3] = sycl::native::log2(input_acc[0]); + // CHECK: tail call noundef 
float @llvm.nvvm.rsqrt.approx.f + res_acc[4] = sycl::native::rsqrt(input_acc[0]); + // CHECK: tail call noundef float @llvm.nvvm.sqrt.approx.f + res_acc[5] = sycl::native::sqrt(input_acc[0]); + // CHECK: tail call noundef float @llvm.nvvm.rcp.approx.f + res_acc[6] = sycl::native::recip(input_acc[0]); + // CHECK: tail call noundef float @llvm.nvvm.div.approx.f + res_acc[7] = sycl::native::divide(input_acc[0], input_acc[1]); + + // Functions that use the above builtins: + + // CHECK: tail call float @llvm.nvvm.sin.approx.f + // CHECK: tail call float @llvm.nvvm.cos.approx.f + // CHECK: tail call noundef float @llvm.nvvm.div.approx.f + res_acc[8] = sycl::native::tan(input_acc[0]); + // CHECK: fmul float {{.*}}, 0x3FF7154760000000 + // CHECK: tail call noundef float @llvm.nvvm.ex2.approx.f + res_acc[9] = sycl::native::exp(input_acc[0]); + // CHECK: fmul float {{.*}}, 0x400A934F00000000 + // CHECK: tail call noundef float @llvm.nvvm.ex2.approx.f + res_acc[10] = sycl::native::exp10(input_acc[0]); + // CHECK: tail call float @llvm.nvvm.lg2.approx.f + // CHECK: fmul float {{.*}}, 0x3FE62E4300000000 + res_acc[11] = sycl::native::log(input_acc[0]); + // CHECK: tail call float @llvm.nvvm.lg2.approx.f + // CHECK: fmul float {{.*}}, 0x3FD3441360000000 + res_acc[12] = sycl::native::log10(input_acc[0]); + + // CHECK: tail call float @llvm.nvvm.lg2.approx.f + // CHECK: fmul float {{.*}}, {{.*}} + // CHECK: tail call noundef float @llvm.nvvm.ex2.approx.f + res_acc[13] = sycl::native::powr(input_acc[0], input_acc[1]); }; From 4c2cbc5ac7fd941665ddd97203ad998bdf04f9dd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20=C5=BDu=C5=BEek?= Date: Fri, 7 Jun 2024 12:09:39 +0200 Subject: [PATCH 38/55] [SYCL][Bindless] Enable non-Vulkan tests on Windows (#14045) Bindless Images should now properly work on Windows, with the exception of Vulkan interop, which requires extra work. Required a few fixes to non-conformant C++ code. 
--- sycl/test-e2e/bindless_images/bindless_helpers.hpp | 4 +++- sycl/test-e2e/bindless_images/cubemap/cubemap_sampled.cpp | 2 +- sycl/test-e2e/bindless_images/cubemap/cubemap_unsampled.cpp | 2 +- sycl/test-e2e/bindless_images/device_to_device_copy.cpp | 1 - sycl/test-e2e/bindless_images/image_get_info.cpp | 1 - sycl/test-e2e/bindless_images/mipmap/mipmap_read_1D.cpp | 5 ++--- sycl/test-e2e/bindless_images/mipmap/mipmap_read_2D.cpp | 5 ++--- sycl/test-e2e/bindless_images/mipmap/mipmap_read_3D.cpp | 5 ++--- sycl/test-e2e/bindless_images/read_1D.cpp | 1 - sycl/test-e2e/bindless_images/read_2D.cpp | 1 - sycl/test-e2e/bindless_images/read_2D_dynamic.cpp | 1 - sycl/test-e2e/bindless_images/read_3D.cpp | 1 - sycl/test-e2e/bindless_images/read_norm_types.cpp | 1 - sycl/test-e2e/bindless_images/read_sampled.cpp | 1 - sycl/test-e2e/bindless_images/read_write_1D.cpp | 1 - sycl/test-e2e/bindless_images/read_write_1D_subregion.cpp | 1 - sycl/test-e2e/bindless_images/read_write_2D.cpp | 1 - sycl/test-e2e/bindless_images/read_write_2D_subregion.cpp | 1 - sycl/test-e2e/bindless_images/read_write_3D.cpp | 1 - sycl/test-e2e/bindless_images/read_write_3D_subregion.cpp | 1 - sycl/test-e2e/bindless_images/read_write_unsampled.cpp | 1 - sycl/test-e2e/bindless_images/sampled_fetch/fetch_1D_USM.cpp | 1 - sycl/test-e2e/bindless_images/sampled_fetch/fetch_2D.cpp | 1 - sycl/test-e2e/bindless_images/sampled_fetch/fetch_2D_USM.cpp | 1 - sycl/test-e2e/bindless_images/sampled_fetch/fetch_3D.cpp | 1 - sycl/test-e2e/bindless_images/sampling_1D.cpp | 1 - sycl/test-e2e/bindless_images/sampling_2D.cpp | 1 - sycl/test-e2e/bindless_images/sampling_2D_USM_shared.cpp | 1 - sycl/test-e2e/bindless_images/sampling_2D_half.cpp | 1 - sycl/test-e2e/bindless_images/sampling_3D.cpp | 1 - sycl/test-e2e/bindless_images/sampling_unique_addr_modes.cpp | 1 - .../bindless_images/user_types/mipmap_read_user_type_2D.cpp | 1 - .../bindless_images/user_types/read_write_user_type.cpp | 1 - 33 files changed, 11 
insertions(+), 39 deletions(-) diff --git a/sycl/test-e2e/bindless_images/bindless_helpers.hpp b/sycl/test-e2e/bindless_images/bindless_helpers.hpp index f8668d054e0ae..618c29fb06745 100644 --- a/sycl/test-e2e/bindless_images/bindless_helpers.hpp +++ b/sycl/test-e2e/bindless_images/bindless_helpers.hpp @@ -53,6 +53,8 @@ static void fill_rand(std::vector> &v, return std::uniform_real_distribution(0.0, 100.0); } else if constexpr (std::is_floating_point_v) { return std::uniform_real_distribution(0.0, 100.0); + } else if constexpr (sizeof(DType) == 1) { + return std::uniform_int_distribution(0, 100); } else { return std::uniform_int_distribution(0, 100); } @@ -61,7 +63,7 @@ static void fill_rand(std::vector> &v, sycl::vec temp; for (int j = 0; j < NChannels; j++) { - temp[j] = distribution(generator); + temp[j] = static_cast(distribution(generator)); } v[i] = temp; diff --git a/sycl/test-e2e/bindless_images/cubemap/cubemap_sampled.cpp b/sycl/test-e2e/bindless_images/cubemap/cubemap_sampled.cpp index 63d213586e4f5..71f9253b239a2 100644 --- a/sycl/test-e2e/bindless_images/cubemap/cubemap_sampled.cpp +++ b/sycl/test-e2e/bindless_images/cubemap/cubemap_sampled.cpp @@ -1,4 +1,4 @@ -// REQUIRES: linux,cuda,aspect-ext_oneapi_cubemap +// REQUIRES: cuda,aspect-ext_oneapi_cubemap // REQUIRES: aspect-ext_oneapi_cubemap_seamless_filtering // RUN: %{build} -o %t.out diff --git a/sycl/test-e2e/bindless_images/cubemap/cubemap_unsampled.cpp b/sycl/test-e2e/bindless_images/cubemap/cubemap_unsampled.cpp index 383440d7835a2..413045190e54c 100644 --- a/sycl/test-e2e/bindless_images/cubemap/cubemap_unsampled.cpp +++ b/sycl/test-e2e/bindless_images/cubemap/cubemap_unsampled.cpp @@ -1,4 +1,4 @@ -// REQUIRES: linux,cuda,aspect-ext_oneapi_cubemap +// REQUIRES: cuda,aspect-ext_oneapi_cubemap // RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple} %s -o %t.out // RUN: %t.out diff --git a/sycl/test-e2e/bindless_images/device_to_device_copy.cpp 
b/sycl/test-e2e/bindless_images/device_to_device_copy.cpp index 3ca37772e4f5e..4a9263e44a13e 100644 --- a/sycl/test-e2e/bindless_images/device_to_device_copy.cpp +++ b/sycl/test-e2e/bindless_images/device_to_device_copy.cpp @@ -1,4 +1,3 @@ -// REQUIRES: linux // REQUIRES: cuda // RUN: %{build} -o %t.out diff --git a/sycl/test-e2e/bindless_images/image_get_info.cpp b/sycl/test-e2e/bindless_images/image_get_info.cpp index b9dc8d19fc1c9..e30eded427da1 100644 --- a/sycl/test-e2e/bindless_images/image_get_info.cpp +++ b/sycl/test-e2e/bindless_images/image_get_info.cpp @@ -1,4 +1,3 @@ -// REQUIRES: linux // REQUIRES: cuda // RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple} %s -o %t.out diff --git a/sycl/test-e2e/bindless_images/mipmap/mipmap_read_1D.cpp b/sycl/test-e2e/bindless_images/mipmap/mipmap_read_1D.cpp index 003c48a7aac07..91f725dad01e3 100644 --- a/sycl/test-e2e/bindless_images/mipmap/mipmap_read_1D.cpp +++ b/sycl/test-e2e/bindless_images/mipmap/mipmap_read_1D.cpp @@ -1,4 +1,3 @@ -// REQUIRES: linux // REQUIRES: cuda // RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple} %s -o %t.out @@ -160,13 +159,13 @@ int main() { failed += runTest(); - failed += runTest(); + failed += runTest(); failed += runTest(); failed += runTest(); - failed += runTest(); + failed += runTest(); failed += runTest(); diff --git a/sycl/test-e2e/bindless_images/mipmap/mipmap_read_2D.cpp b/sycl/test-e2e/bindless_images/mipmap/mipmap_read_2D.cpp index 472f31487eded..afedf976077cf 100644 --- a/sycl/test-e2e/bindless_images/mipmap/mipmap_read_2D.cpp +++ b/sycl/test-e2e/bindless_images/mipmap/mipmap_read_2D.cpp @@ -1,4 +1,3 @@ -// REQUIRES: linux // REQUIRES: cuda // RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple} %s -o %t.out @@ -167,13 +166,13 @@ int main() { failed += runTest(); - failed += runTest(); + failed += runTest(); failed += runTest(); failed += runTest(); - failed += runTest(); + failed += runTest(); failed += runTest(); diff --git 
a/sycl/test-e2e/bindless_images/mipmap/mipmap_read_3D.cpp b/sycl/test-e2e/bindless_images/mipmap/mipmap_read_3D.cpp index e3d11bcb7c567..b2c5f7ae42b71 100644 --- a/sycl/test-e2e/bindless_images/mipmap/mipmap_read_3D.cpp +++ b/sycl/test-e2e/bindless_images/mipmap/mipmap_read_3D.cpp @@ -1,4 +1,3 @@ -// REQUIRES: linux // REQUIRES: cuda // RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple} %s -o %t.out @@ -159,13 +158,13 @@ int main() { failed += runTest(); - failed += runTest(); + failed += runTest(); failed += runTest(); failed += runTest(); - failed += runTest(); + failed += runTest(); failed += runTest(); diff --git a/sycl/test-e2e/bindless_images/read_1D.cpp b/sycl/test-e2e/bindless_images/read_1D.cpp index 624e3d69aa8a0..c85157cf7f8b2 100644 --- a/sycl/test-e2e/bindless_images/read_1D.cpp +++ b/sycl/test-e2e/bindless_images/read_1D.cpp @@ -1,4 +1,3 @@ -// REQUIRES: linux // REQUIRES: cuda // RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple} %s -o %t.out diff --git a/sycl/test-e2e/bindless_images/read_2D.cpp b/sycl/test-e2e/bindless_images/read_2D.cpp index 3b3ff18f8421d..5c5dbb5fc59f9 100644 --- a/sycl/test-e2e/bindless_images/read_2D.cpp +++ b/sycl/test-e2e/bindless_images/read_2D.cpp @@ -1,4 +1,3 @@ -// REQUIRES: linux // REQUIRES: cuda // RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple} %s -o %t.out diff --git a/sycl/test-e2e/bindless_images/read_2D_dynamic.cpp b/sycl/test-e2e/bindless_images/read_2D_dynamic.cpp index e5de53c8c5fa3..377e0103b56c9 100644 --- a/sycl/test-e2e/bindless_images/read_2D_dynamic.cpp +++ b/sycl/test-e2e/bindless_images/read_2D_dynamic.cpp @@ -1,4 +1,3 @@ -// REQUIRES: linux // REQUIRES: cuda // RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple} %s -o %t.out diff --git a/sycl/test-e2e/bindless_images/read_3D.cpp b/sycl/test-e2e/bindless_images/read_3D.cpp index 556a37e555e7d..b77bfaa298752 100644 --- a/sycl/test-e2e/bindless_images/read_3D.cpp +++ b/sycl/test-e2e/bindless_images/read_3D.cpp @@ -1,4 +1,3 @@ -// REQUIRES: linux // 
REQUIRES: cuda // RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple} %s -o %t.out diff --git a/sycl/test-e2e/bindless_images/read_norm_types.cpp b/sycl/test-e2e/bindless_images/read_norm_types.cpp index 8cb3b2117c175..aa2daa29b1bad 100644 --- a/sycl/test-e2e/bindless_images/read_norm_types.cpp +++ b/sycl/test-e2e/bindless_images/read_norm_types.cpp @@ -1,4 +1,3 @@ -// REQUIRES: linux // REQUIRES: cuda // RUN: %{build} -o %t.out diff --git a/sycl/test-e2e/bindless_images/read_sampled.cpp b/sycl/test-e2e/bindless_images/read_sampled.cpp index 7fe41261f2f5d..0972c3401b7b9 100644 --- a/sycl/test-e2e/bindless_images/read_sampled.cpp +++ b/sycl/test-e2e/bindless_images/read_sampled.cpp @@ -1,4 +1,3 @@ -// REQUIRES: linux // REQUIRES: cuda // RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple} %s -o %t.out diff --git a/sycl/test-e2e/bindless_images/read_write_1D.cpp b/sycl/test-e2e/bindless_images/read_write_1D.cpp index d6b985b203dd9..7f366ae682039 100644 --- a/sycl/test-e2e/bindless_images/read_write_1D.cpp +++ b/sycl/test-e2e/bindless_images/read_write_1D.cpp @@ -1,4 +1,3 @@ -// REQUIRES: linux // REQUIRES: cuda // RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple} %s -o %t.out diff --git a/sycl/test-e2e/bindless_images/read_write_1D_subregion.cpp b/sycl/test-e2e/bindless_images/read_write_1D_subregion.cpp index f866af948a52c..3572150cbd12f 100644 --- a/sycl/test-e2e/bindless_images/read_write_1D_subregion.cpp +++ b/sycl/test-e2e/bindless_images/read_write_1D_subregion.cpp @@ -1,4 +1,3 @@ -// REQUIRES: linux // REQUIRES: cuda // RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple} %s -o %t.out diff --git a/sycl/test-e2e/bindless_images/read_write_2D.cpp b/sycl/test-e2e/bindless_images/read_write_2D.cpp index 28246fb6211a7..6fa09ea4a1eea 100644 --- a/sycl/test-e2e/bindless_images/read_write_2D.cpp +++ b/sycl/test-e2e/bindless_images/read_write_2D.cpp @@ -1,4 +1,3 @@ -// REQUIRES: linux // REQUIRES: cuda // RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple} %s -o %t.out 
diff --git a/sycl/test-e2e/bindless_images/read_write_2D_subregion.cpp b/sycl/test-e2e/bindless_images/read_write_2D_subregion.cpp index dddb9100ab85a..c227734d3a00a 100644 --- a/sycl/test-e2e/bindless_images/read_write_2D_subregion.cpp +++ b/sycl/test-e2e/bindless_images/read_write_2D_subregion.cpp @@ -1,4 +1,3 @@ -// REQUIRES: linux // REQUIRES: cuda // RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple} %s -o %t.out diff --git a/sycl/test-e2e/bindless_images/read_write_3D.cpp b/sycl/test-e2e/bindless_images/read_write_3D.cpp index 1bcbed6a9fe4d..efb87a8ff9f4a 100644 --- a/sycl/test-e2e/bindless_images/read_write_3D.cpp +++ b/sycl/test-e2e/bindless_images/read_write_3D.cpp @@ -1,4 +1,3 @@ -// REQUIRES: linux // REQUIRES: cuda // RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple} %s -o %t.out diff --git a/sycl/test-e2e/bindless_images/read_write_3D_subregion.cpp b/sycl/test-e2e/bindless_images/read_write_3D_subregion.cpp index adbee77c2873f..6631260dc15f1 100644 --- a/sycl/test-e2e/bindless_images/read_write_3D_subregion.cpp +++ b/sycl/test-e2e/bindless_images/read_write_3D_subregion.cpp @@ -1,4 +1,3 @@ -// REQUIRES: linux // REQUIRES: cuda // RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple} %s -o %t.out diff --git a/sycl/test-e2e/bindless_images/read_write_unsampled.cpp b/sycl/test-e2e/bindless_images/read_write_unsampled.cpp index 1ac11302224d2..cae8cbc32b3d2 100644 --- a/sycl/test-e2e/bindless_images/read_write_unsampled.cpp +++ b/sycl/test-e2e/bindless_images/read_write_unsampled.cpp @@ -1,4 +1,3 @@ -// REQUIRES: linux // REQUIRES: cuda // RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple} %s -o %t.out diff --git a/sycl/test-e2e/bindless_images/sampled_fetch/fetch_1D_USM.cpp b/sycl/test-e2e/bindless_images/sampled_fetch/fetch_1D_USM.cpp index d8d7e6e50fb30..965dc9f00c1c4 100644 --- a/sycl/test-e2e/bindless_images/sampled_fetch/fetch_1D_USM.cpp +++ b/sycl/test-e2e/bindless_images/sampled_fetch/fetch_1D_USM.cpp @@ -1,4 +1,3 @@ -// REQUIRES: linux // REQUIRES: 
cuda // REQUIRES: aspect-ext_oneapi_bindless_sampled_image_fetch_1d_usm diff --git a/sycl/test-e2e/bindless_images/sampled_fetch/fetch_2D.cpp b/sycl/test-e2e/bindless_images/sampled_fetch/fetch_2D.cpp index c85852dba64fd..0a6da2d97f136 100644 --- a/sycl/test-e2e/bindless_images/sampled_fetch/fetch_2D.cpp +++ b/sycl/test-e2e/bindless_images/sampled_fetch/fetch_2D.cpp @@ -1,4 +1,3 @@ -// REQUIRES: linux // REQUIRES: cuda // REQUIRES: aspect-ext_oneapi_bindless_sampled_image_fetch_2d diff --git a/sycl/test-e2e/bindless_images/sampled_fetch/fetch_2D_USM.cpp b/sycl/test-e2e/bindless_images/sampled_fetch/fetch_2D_USM.cpp index 2258e4f098494..834ec5b6e8c79 100644 --- a/sycl/test-e2e/bindless_images/sampled_fetch/fetch_2D_USM.cpp +++ b/sycl/test-e2e/bindless_images/sampled_fetch/fetch_2D_USM.cpp @@ -1,4 +1,3 @@ -// REQUIRES: linux // REQUIRES: cuda // REQUIRES: aspect-ext_oneapi_bindless_sampled_image_fetch_2d_usm diff --git a/sycl/test-e2e/bindless_images/sampled_fetch/fetch_3D.cpp b/sycl/test-e2e/bindless_images/sampled_fetch/fetch_3D.cpp index 3c534e7329f96..ccb096dbfbdc5 100644 --- a/sycl/test-e2e/bindless_images/sampled_fetch/fetch_3D.cpp +++ b/sycl/test-e2e/bindless_images/sampled_fetch/fetch_3D.cpp @@ -1,4 +1,3 @@ -// REQUIRES: linux // REQUIRES: cuda // REQUIRES: aspect-ext_oneapi_bindless_sampled_image_fetch_3d diff --git a/sycl/test-e2e/bindless_images/sampling_1D.cpp b/sycl/test-e2e/bindless_images/sampling_1D.cpp index b80a640aa370b..ef184c112568c 100644 --- a/sycl/test-e2e/bindless_images/sampling_1D.cpp +++ b/sycl/test-e2e/bindless_images/sampling_1D.cpp @@ -1,4 +1,3 @@ -// REQUIRES: linux // REQUIRES: cuda // RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple} %s -o %t.out diff --git a/sycl/test-e2e/bindless_images/sampling_2D.cpp b/sycl/test-e2e/bindless_images/sampling_2D.cpp index 33dac190a4b68..92a26df5afc38 100644 --- a/sycl/test-e2e/bindless_images/sampling_2D.cpp +++ b/sycl/test-e2e/bindless_images/sampling_2D.cpp @@ -1,4 +1,3 @@ -// REQUIRES: linux // 
REQUIRES: cuda // RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple} %s -o %t.out diff --git a/sycl/test-e2e/bindless_images/sampling_2D_USM_shared.cpp b/sycl/test-e2e/bindless_images/sampling_2D_USM_shared.cpp index aefd953b6f46d..52775a8b0806c 100644 --- a/sycl/test-e2e/bindless_images/sampling_2D_USM_shared.cpp +++ b/sycl/test-e2e/bindless_images/sampling_2D_USM_shared.cpp @@ -1,4 +1,3 @@ -// REQUIRES: linux // REQUIRES: cuda // REQUIRES: aspect-ext_oneapi_bindless_images_shared_usm diff --git a/sycl/test-e2e/bindless_images/sampling_2D_half.cpp b/sycl/test-e2e/bindless_images/sampling_2D_half.cpp index fc34dfcb13c50..aeb57976df5aa 100644 --- a/sycl/test-e2e/bindless_images/sampling_2D_half.cpp +++ b/sycl/test-e2e/bindless_images/sampling_2D_half.cpp @@ -1,4 +1,3 @@ -// REQUIRES: linux // REQUIRES: cuda // REQUIRES: aspect-fp16 diff --git a/sycl/test-e2e/bindless_images/sampling_3D.cpp b/sycl/test-e2e/bindless_images/sampling_3D.cpp index 518b0e873d583..47d98aaf0be97 100644 --- a/sycl/test-e2e/bindless_images/sampling_3D.cpp +++ b/sycl/test-e2e/bindless_images/sampling_3D.cpp @@ -1,4 +1,3 @@ -// REQUIRES: linux // REQUIRES: cuda // RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple} %s -o %t.out diff --git a/sycl/test-e2e/bindless_images/sampling_unique_addr_modes.cpp b/sycl/test-e2e/bindless_images/sampling_unique_addr_modes.cpp index eea7f53c5c0e3..895f7082adce6 100644 --- a/sycl/test-e2e/bindless_images/sampling_unique_addr_modes.cpp +++ b/sycl/test-e2e/bindless_images/sampling_unique_addr_modes.cpp @@ -1,4 +1,3 @@ -// REQUIRES: linux // REQUIRES: cuda // RUN: %{build} -o %t.out diff --git a/sycl/test-e2e/bindless_images/user_types/mipmap_read_user_type_2D.cpp b/sycl/test-e2e/bindless_images/user_types/mipmap_read_user_type_2D.cpp index 1e55ac489c2a3..77913a2836565 100644 --- a/sycl/test-e2e/bindless_images/user_types/mipmap_read_user_type_2D.cpp +++ b/sycl/test-e2e/bindless_images/user_types/mipmap_read_user_type_2D.cpp @@ -1,4 +1,3 @@ -// REQUIRES: linux // 
REQUIRES: cuda // RUN: %{build} -o %t.out diff --git a/sycl/test-e2e/bindless_images/user_types/read_write_user_type.cpp b/sycl/test-e2e/bindless_images/user_types/read_write_user_type.cpp index 7a55e31e40fba..db9347f9895e6 100644 --- a/sycl/test-e2e/bindless_images/user_types/read_write_user_type.cpp +++ b/sycl/test-e2e/bindless_images/user_types/read_write_user_type.cpp @@ -1,4 +1,3 @@ -// REQUIRES: linux // REQUIRES: cuda // RUN: %{build} -o %t.out From 27bd7ae5acedb17478a776a585c58ba76fad1d51 Mon Sep 17 00:00:00 2001 From: Nicolas Miller Date: Fri, 7 Jun 2024 11:20:23 +0100 Subject: [PATCH 39/55] [SYCL][HIP] Remove unsupported from O0 tests on AMD (#13967) This was tracked down to a bug in ROCm that seems to be fixed with newer versions, and the CI is now on ROCm 6+ so these should be fine. ROCm ticket: https://github.com/ROCm/clr/pull/13 The reduce over group test works on W6800 and MI210, but it seems for gfx1031 it reports not supporting shared USM, note that HIP for gfx1031 isn't officially supported by AMD. --------- Co-authored-by: Steffen Larsen --- .../GroupAlgorithm/SYCL2020/reduce_over_group_size.cpp | 4 +--- sycl/test-e2e/HierPar/hier_par_wgscope_O0.cpp | 4 ---- sycl/test-e2e/Regression/unoptimized_stream.cpp | 3 --- 3 files changed, 1 insertion(+), 10 deletions(-) diff --git a/sycl/test-e2e/GroupAlgorithm/SYCL2020/reduce_over_group_size.cpp b/sycl/test-e2e/GroupAlgorithm/SYCL2020/reduce_over_group_size.cpp index cf9bb8cf5c850..ed705979ac4fe 100644 --- a/sycl/test-e2e/GroupAlgorithm/SYCL2020/reduce_over_group_size.cpp +++ b/sycl/test-e2e/GroupAlgorithm/SYCL2020/reduce_over_group_size.cpp @@ -1,8 +1,6 @@ -// Test hangs on AMD with https://github.com/intel/llvm/pull/8412 -// UNSUPPORTED: hip_amd - // Windows doesn't yet have full shutdown(). 
// UNSUPPORTED: ze_debug && windows +// REQUIRES: aspect-usm_shared_allocations // RUN: %{build} -o %t.out // RUN: %{run} %t.out diff --git a/sycl/test-e2e/HierPar/hier_par_wgscope_O0.cpp b/sycl/test-e2e/HierPar/hier_par_wgscope_O0.cpp index d9677e1b93f91..6f19d9f3cdf2a 100644 --- a/sycl/test-e2e/HierPar/hier_par_wgscope_O0.cpp +++ b/sycl/test-e2e/HierPar/hier_par_wgscope_O0.cpp @@ -5,10 +5,6 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// - -// Test hangs on AMD with https://github.com/intel/llvm/pull/8412 -// UNSUPPORTED: hip_amd - // RUN: %{build} -O0 -o %t.out // RUN: %{run} %t.out diff --git a/sycl/test-e2e/Regression/unoptimized_stream.cpp b/sycl/test-e2e/Regression/unoptimized_stream.cpp index 82b2b1ad0d789..12c1eea03fd03 100644 --- a/sycl/test-e2e/Regression/unoptimized_stream.cpp +++ b/sycl/test-e2e/Regression/unoptimized_stream.cpp @@ -1,6 +1,3 @@ -// Test hangs on AMD with https://github.com/intel/llvm/pull/8412 -// UNSUPPORTED: hip_amd - // RUN: %{build} -O0 -o %t.out // RUN: %{run} %t.out From cad941f3c0ad883b7ce381fcb16a15b429b6fd02 Mon Sep 17 00:00:00 2001 From: Maksim Sabianin Date: Fri, 7 Jun 2024 14:34:26 +0200 Subject: [PATCH 40/55] [NFCI][SYCL] Move SYCL Module Splitting to library. Part 2 (#13282) Added SYCL Module Splitting as a library. ESIMD splitting is not present in this patch and will be added in an upcoming patch. Added particular testing tool sycl-module-split that invokes added functionality. Not all `device-code-split` tests were updated in this patch because the rest of them (mostly) don't test module splitting itself. They will be migrated in an upcoming patch. 
--- .../include/llvm/SYCLLowerIR/ModuleSplitter.h | 30 ++++ llvm/lib/SYCLLowerIR/CMakeLists.txt | 1 + llvm/lib/SYCLLowerIR/ModuleSplitter.cpp | 70 ++++++++++ llvm/test/CMakeLists.txt | 1 + llvm/test/lit.cfg.py | 1 + .../device-code-split/auto-module-split-1.ll | 7 + .../device-code-split/auto-module-split-2.ll | 6 + .../device-code-split/auto-module-split-3.ll | 12 ++ .../auto-module-split-func-ptr.ll | 6 + .../device-code-split/basic-module-split.ll | 6 + .../complex-indirect-call-chain.ll | 33 +++++ .../one-kernel-per-module.ll | 9 ++ .../device-code-split/per-aspect-split-1.ll | 42 ++++++ .../device-code-split/per-aspect-split-2.ll | 14 ++ .../device-code-split/per-aspect-split-3.ll | 16 +++ .../device-code-split/per-joint-matrix-1.ll | 34 +++++ .../device-code-split/per-joint-matrix-2.ll | 34 +++++ .../per-joint-matrix-mad-1.ll | 34 +++++ .../per-joint-matrix-mad-2.ll | 34 +++++ .../per-joint-matrix-mad-4.ll | 34 +++++ .../per-joint-matrix-mad-5.ll | 90 ++++++++++++ .../per-reqd-sub-group-size-split-1.ll | 42 ++++++ .../per-reqd-sub-group-size-split-2.ll | 15 ++ .../per-reqd-wg-size-split-1.ll | 42 ++++++ .../per-reqd-wg-size-split-2.ll | 14 ++ .../split-with-kernel-declarations.ll | 21 ++- .../device-code-split/vtable.ll | 3 + llvm/tools/sycl-module-split/CMakeLists.txt | 10 ++ .../sycl-module-split/sycl-module-split.cpp | 130 ++++++++++++++++++ 29 files changed, 786 insertions(+), 5 deletions(-) create mode 100644 llvm/tools/sycl-module-split/CMakeLists.txt create mode 100644 llvm/tools/sycl-module-split/sycl-module-split.cpp diff --git a/llvm/include/llvm/SYCLLowerIR/ModuleSplitter.h b/llvm/include/llvm/SYCLLowerIR/ModuleSplitter.h index 9ae433cedc668..085e424249d5c 100644 --- a/llvm/include/llvm/SYCLLowerIR/ModuleSplitter.h +++ b/llvm/include/llvm/SYCLLowerIR/ModuleSplitter.h @@ -19,6 +19,7 @@ #include "llvm/ADT/StringRef.h" #include "llvm/IR/Function.h" #include "llvm/Support/Error.h" +#include "llvm/Support/PropertySetIO.h" #include #include @@ -196,6 
+197,8 @@ class ModuleDesc { ModuleDesc clone() const; + std::string makeSymbolTable() const; + const SYCLDeviceRequirements &getOrComputeDeviceRequirements() const { if (!Reqs.has_value()) Reqs = computeDeviceRequirements(*this); @@ -270,6 +273,33 @@ void dumpEntryPoints(const Module &M, bool OnlyKernelsAreEntryPoints = false, const char *msg = "", int Tab = 0); #endif // NDEBUG +struct SplitModule { + std::string ModuleFilePath; + util::PropertySetRegistry Properties; + std::string Symbols; + + SplitModule() = default; + SplitModule(const SplitModule &) = default; + SplitModule &operator=(const SplitModule &) = default; + SplitModule(SplitModule &&) = default; + SplitModule &operator=(SplitModule &&) = default; + + SplitModule(std::string_view File, util::PropertySetRegistry Properties, + std::string Symbols) + : ModuleFilePath(File), Properties(std::move(Properties)), + Symbols(std::move(Symbols)) {} +}; + +struct ModuleSplitterSettings { + IRSplitMode Mode; + bool OutputAssembly = false; // Bitcode or LLVM IR. + StringRef OutputPrefix; +}; + +/// Splits the given module \p M according to the given \p Settings. 
+Expected> +splitSYCLModule(std::unique_ptr M, ModuleSplitterSettings Settings); + } // namespace module_split } // namespace llvm diff --git a/llvm/lib/SYCLLowerIR/CMakeLists.txt b/llvm/lib/SYCLLowerIR/CMakeLists.txt index 7f2edaae323a9..49b9e802e75d3 100644 --- a/llvm/lib/SYCLLowerIR/CMakeLists.txt +++ b/llvm/lib/SYCLLowerIR/CMakeLists.txt @@ -92,6 +92,7 @@ add_llvm_component_library(LLVMSYCLLowerIR LINK_COMPONENTS Analysis Core + Passes Support ipo ) diff --git a/llvm/lib/SYCLLowerIR/ModuleSplitter.cpp b/llvm/lib/SYCLLowerIR/ModuleSplitter.cpp index cf41aee46df28..fa6e12f0a07d2 100644 --- a/llvm/lib/SYCLLowerIR/ModuleSplitter.cpp +++ b/llvm/lib/SYCLLowerIR/ModuleSplitter.cpp @@ -12,16 +12,20 @@ #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/StringExtras.h" +#include "llvm/Bitcode/BitcodeWriterPass.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Function.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/LegacyPassManager.h" #include "llvm/IR/Module.h" +#include "llvm/IRPrinter/IRPrintingPasses.h" +#include "llvm/Passes/PassBuilder.h" #include "llvm/SYCLLowerIR/DeviceGlobals.h" #include "llvm/SYCLLowerIR/LowerInvokeSimd.h" #include "llvm/SYCLLowerIR/SYCLUtils.h" #include "llvm/Support/Error.h" +#include "llvm/Support/FileSystem.h" #include "llvm/Transforms/IPO.h" #include "llvm/Transforms/IPO/GlobalDCE.h" #include "llvm/Transforms/IPO/StripDeadPrototypes.h" @@ -733,6 +737,14 @@ void EntryPointGroup::rebuild(const Module &M) { Functions.insert(const_cast(&F)); } +std::string ModuleDesc::makeSymbolTable() const { + std::string ST; + for (const Function *F : EntryPoints.Functions) + ST += (Twine(F->getName()) + "\n").str(); + + return ST; +} + namespace { // This is a helper class, which allows to group/categorize function based on // provided rules. 
It is intended to be used in device code split @@ -1143,5 +1155,63 @@ SmallVector splitByESIMD(ModuleDesc &&MD, return Result; } +static Error saveModuleIRInFile(Module &M, StringRef FilePath, + bool OutputAssembly) { + int FD = -1; + if (std::error_code EC = sys::fs::openFileForWrite(FilePath, FD)) + return errorCodeToError(EC); + + raw_fd_ostream OS(FD, true); + ModulePassManager MPM; + ModuleAnalysisManager MAM; + PassBuilder PB; + PB.registerModuleAnalyses(MAM); + if (OutputAssembly) + MPM.addPass(PrintModulePass(OS)); + else + MPM.addPass(BitcodeWriterPass(OS)); + + MPM.run(M, MAM); + return Error::success(); +} + +static Expected saveModuleDesc(ModuleDesc &MD, std::string Prefix, + bool OutputAssembly) { + SplitModule SM; + Prefix += OutputAssembly ? ".ll" : ".bc"; + Error E = saveModuleIRInFile(MD.getModule(), Prefix, OutputAssembly); + if (E) + return E; + + SM.ModuleFilePath = Prefix; + SM.Symbols = MD.makeSymbolTable(); + return SM; +} + +Expected> +splitSYCLModule(std::unique_ptr M, ModuleSplitterSettings Settings) { + ModuleDesc MD = std::move(M); // makeModuleDesc() ? + // FIXME: false arguments are temporary for now. 
+ auto Splitter = + getDeviceCodeSplitter(std::move(MD), Settings.Mode, false, false); + size_t ID = 0; + std::vector OutputImages; + while (Splitter->hasMoreSplits()) { + ModuleDesc MD2 = Splitter->nextSplit(); + MD2.fixupLinkageOfDirectInvokeSimdTargets(); + + std::string OutIRFileName = (Settings.OutputPrefix + "_" + Twine(ID)).str(); + auto SplittedImageOrErr = + saveModuleDesc(MD2, OutIRFileName, Settings.OutputAssembly); + if (!SplittedImageOrErr) + return SplittedImageOrErr.takeError(); + + OutputImages.emplace_back(std::move(*SplittedImageOrErr)); + ++ID; + } + + return OutputImages; +} + } // namespace module_split } // namespace llvm diff --git a/llvm/test/CMakeLists.txt b/llvm/test/CMakeLists.txt index e73de4c64b271..7a425f5c5d038 100644 --- a/llvm/test/CMakeLists.txt +++ b/llvm/test/CMakeLists.txt @@ -146,6 +146,7 @@ set(LLVM_TEST_DEPENDS sanstats spirv-to-ir-wrapper sycl-post-link + sycl-module-split split-file verify-uselistorder yaml-bench diff --git a/llvm/test/lit.cfg.py b/llvm/test/lit.cfg.py index ae578d55777fe..f44fcdfda93b1 100644 --- a/llvm/test/lit.cfg.py +++ b/llvm/test/lit.cfg.py @@ -244,6 +244,7 @@ def get_asan_rtlib(): "sanstats", "llvm-remarkutil", "spirv-to-ir-wrapper", + "sycl-module-split", ] ) diff --git a/llvm/test/tools/sycl-post-link/device-code-split/auto-module-split-1.ll b/llvm/test/tools/sycl-post-link/device-code-split/auto-module-split-1.ll index 09261f7f61088..0583cfde3af23 100644 --- a/llvm/test/tools/sycl-post-link/device-code-split/auto-module-split-1.ll +++ b/llvm/test/tools/sycl-post-link/device-code-split/auto-module-split-1.ll @@ -5,6 +5,13 @@ ; RUN: FileCheck %s -input-file=%t_0.sym --check-prefixes CHECK-TU0-TXT ; RUN: FileCheck %s -input-file=%t_1.sym --check-prefixes CHECK-TU1-TXT +; RUN: sycl-module-split -split=auto -S < %s -o %t2 +; By default auto mode is equal to source mode +; RUN: FileCheck %s -input-file=%t2_0.ll --check-prefixes CHECK-TU0,CHECK +; RUN: FileCheck %s -input-file=%t2_1.ll --check-prefixes 
CHECK-TU1,CHECK +; RUN: FileCheck %s -input-file=%t2_0.sym --check-prefixes CHECK-TU0-TXT +; RUN: FileCheck %s -input-file=%t2_1.sym --check-prefixes CHECK-TU1-TXT + target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024" target triple = "spir64-unknown-linux" diff --git a/llvm/test/tools/sycl-post-link/device-code-split/auto-module-split-2.ll b/llvm/test/tools/sycl-post-link/device-code-split/auto-module-split-2.ll index e911800bf429a..4ff2095f42bbb 100644 --- a/llvm/test/tools/sycl-post-link/device-code-split/auto-module-split-2.ll +++ b/llvm/test/tools/sycl-post-link/device-code-split/auto-module-split-2.ll @@ -10,6 +10,12 @@ ; RUN: FileCheck %s -input-file=%t_0.sym --check-prefixes CHECK-TU0-TXT ; RUN: FileCheck %s -input-file=%t_1.sym --check-prefixes CHECK-TU1-TXT +; RUN: sycl-module-split -split=auto -S < %s -o %t2 +; RUN: FileCheck %s -input-file=%t2_0.ll --check-prefixes CHECK-TU0,CHECK +; RUN: FileCheck %s -input-file=%t2_1.ll --check-prefixes CHECK-TU1,CHECK +; RUN: FileCheck %s -input-file=%t2_0.sym --check-prefixes CHECK-TU0-TXT +; RUN: FileCheck %s -input-file=%t2_1.sym --check-prefixes CHECK-TU1-TXT + target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024" target triple = "spir64-unknown-linux" diff --git a/llvm/test/tools/sycl-post-link/device-code-split/auto-module-split-3.ll b/llvm/test/tools/sycl-post-link/device-code-split/auto-module-split-3.ll index f5915c7ac57b6..a5c62a5912338 100644 --- a/llvm/test/tools/sycl-post-link/device-code-split/auto-module-split-3.ll +++ b/llvm/test/tools/sycl-post-link/device-code-split/auto-module-split-3.ll @@ -14,6 +14,18 @@ ; RUN: FileCheck %s -input-file=%t_0.sym --check-prefixes CHECK-TU0-SYM ; RUN: FileCheck %s -input-file=%t_1.sym --check-prefixes CHECK-TU1-SYM ; +; +; RUN: sycl-module-split -split=auto -S < %s -o %t2 +; +; RUN: FileCheck %s -input-file=%t2_0.ll --check-prefixes CHECK-TU0-IR \ +; RUN: 
--implicit-check-not TU0_kernel --implicit-check-not _Z3foov \ +; RUN: --implicit-check-not _Z4foo3v +; RUN: FileCheck %s -input-file=%t2_1.ll --check-prefixes CHECK-TU1-IR \ +; RUN: --implicit-check-not TU1_kernel --implicit-check-not _Z4foo2v \ +; RUN: --implicit-check-not _Z4foo1v +; RUN: FileCheck %s -input-file=%t2_0.sym --check-prefixes CHECK-TU0-SYM +; RUN: FileCheck %s -input-file=%t2_1.sym --check-prefixes CHECK-TU1-SYM + ; CHECK-TU0-SYM: _ZTSZ4mainE11TU1_kernel0 ; CHECK-TU0-SYM: _ZTSZ4mainE11TU1_kernel1 ; diff --git a/llvm/test/tools/sycl-post-link/device-code-split/auto-module-split-func-ptr.ll b/llvm/test/tools/sycl-post-link/device-code-split/auto-module-split-func-ptr.ll index 458485bf53aa6..730d9a5cd8efc 100644 --- a/llvm/test/tools/sycl-post-link/device-code-split/auto-module-split-func-ptr.ll +++ b/llvm/test/tools/sycl-post-link/device-code-split/auto-module-split-func-ptr.ll @@ -4,6 +4,12 @@ ; RUN: FileCheck %s -input-file=%t_0.ll --check-prefix=CHECK-IR0 ; RUN: FileCheck %s -input-file=%t_1.ll --check-prefix=CHECK-IR1 +; RUN: sycl-module-split -split=auto -S < %s -o %t2 +; RUN: FileCheck %s -input-file=%t2_0.sym --check-prefix=CHECK-SYM0 +; RUN: FileCheck %s -input-file=%t2_1.sym --check-prefix=CHECK-SYM1 +; RUN: FileCheck %s -input-file=%t2_0.ll --check-prefix=CHECK-IR0 +; RUN: FileCheck %s -input-file=%t2_1.ll --check-prefix=CHECK-IR1 + ; This test checkes that we can properly perform device code split by tracking ; all uses of functions (not only direct calls) diff --git a/llvm/test/tools/sycl-post-link/device-code-split/basic-module-split.ll b/llvm/test/tools/sycl-post-link/device-code-split/basic-module-split.ll index 2a86625eeb27e..48d58248d0095 100644 --- a/llvm/test/tools/sycl-post-link/device-code-split/basic-module-split.ll +++ b/llvm/test/tools/sycl-post-link/device-code-split/basic-module-split.ll @@ -3,6 +3,12 @@ ; RUN: FileCheck %s -input-file=%t_1.ll --check-prefixes CHECK-TU1,CHECK ; RUN: FileCheck %s -input-file=%t_0.sym 
--check-prefixes CHECK-TU0-TXT ; RUN: FileCheck %s -input-file=%t_1.sym --check-prefixes CHECK-TU1-TXT + +; RUN: sycl-module-split -split=source -S < %s -o %t2 +; RUN: FileCheck %s -input-file=%t2_0.ll --check-prefixes CHECK-TU0,CHECK +; RUN: FileCheck %s -input-file=%t2_1.ll --check-prefixes CHECK-TU1,CHECK +; RUN: FileCheck %s -input-file=%t2_0.sym --check-prefixes CHECK-TU0-TXT +; RUN: FileCheck %s -input-file=%t2_1.sym --check-prefixes CHECK-TU1-TXT ; ModuleID = 'basic-module-split.ll' source_filename = "basic-module-split.ll" target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024" diff --git a/llvm/test/tools/sycl-post-link/device-code-split/complex-indirect-call-chain.ll b/llvm/test/tools/sycl-post-link/device-code-split/complex-indirect-call-chain.ll index d26f97f9d70a0..064471405a58d 100644 --- a/llvm/test/tools/sycl-post-link/device-code-split/complex-indirect-call-chain.ll +++ b/llvm/test/tools/sycl-post-link/device-code-split/complex-indirect-call-chain.ll @@ -12,6 +12,17 @@ ; RUN: --implicit-check-not @BAZ --implicit-check-not @kernel_B \ ; RUN: --implicit-check-not @kernel_C ; +; RUN: sycl-module-split -split=auto -S < %s -o %t2 +; RUN: FileCheck %s -input-file=%t2_0.ll --check-prefix CHECK0 \ +; RUN: --implicit-check-not @foo --implicit-check-not @kernel_A \ +; RUN: --implicit-check-not @kernel_B --implicit-check-not @baz +; RUN: FileCheck %s -input-file=%t2_1.ll --check-prefix CHECK1 \ +; RUN: --implicit-check-not @kernel_A --implicit-check-not @kernel_C +; RUN: FileCheck %s -input-file=%t2_2.ll --check-prefix CHECK2 \ +; RUN: --implicit-check-not @foo --implicit-check-not @bar \ +; RUN: --implicit-check-not @BAZ --implicit-check-not @kernel_B \ +; RUN: --implicit-check-not @kernel_C +; ; RUN: sycl-post-link -split=source -S < %s -o %t.table ; RUN: FileCheck %s -input-file=%t_0.ll --check-prefix CHECK0 \ ; RUN: --implicit-check-not @foo --implicit-check-not @kernel_A \ @@ -23,6 +34,17 @@ ; RUN: 
--implicit-check-not @BAZ --implicit-check-not @kernel_B \ ; RUN: --implicit-check-not @kernel_C ; +; RUN: sycl-module-split -split=source -S < %s -o %t2 +; RUN: FileCheck %s -input-file=%t2_0.ll --check-prefix CHECK0 \ +; RUN: --implicit-check-not @foo --implicit-check-not @kernel_A \ +; RUN: --implicit-check-not @kernel_B --implicit-check-not @baz +; RUN: FileCheck %s -input-file=%t2_1.ll --check-prefix CHECK1 \ +; RUN: --implicit-check-not @kernel_A --implicit-check-not @kernel_C +; RUN: FileCheck %s -input-file=%t2_2.ll --check-prefix CHECK2 \ +; RUN: --implicit-check-not @foo --implicit-check-not @bar \ +; RUN: --implicit-check-not @BAZ --implicit-check-not @kernel_B \ +; RUN: --implicit-check-not @kernel_C +; ; RUN: sycl-post-link -split=kernel -S < %s -o %t.table ; RUN: FileCheck %s -input-file=%t_0.ll --check-prefix CHECK0 \ ; RUN: --implicit-check-not @foo --implicit-check-not @kernel_A \ @@ -33,6 +55,17 @@ ; RUN: --implicit-check-not @foo --implicit-check-not @bar \ ; RUN: --implicit-check-not @BAZ --implicit-check-not @kernel_B \ ; RUN: --implicit-check-not @kernel_C +; +; RUN: sycl-module-split -split=kernel -S < %s -o %t2 +; RUN: FileCheck %s -input-file=%t2_0.ll --check-prefix CHECK0 \ +; RUN: --implicit-check-not @foo --implicit-check-not @kernel_A \ +; RUN: --implicit-check-not @kernel_B --implicit-check-not @baz +; RUN: FileCheck %s -input-file=%t2_1.ll --check-prefix CHECK1 \ +; RUN: --implicit-check-not @kernel_A --implicit-check-not @kernel_C +; RUN: FileCheck %s -input-file=%t2_2.ll --check-prefix CHECK2 \ +; RUN: --implicit-check-not @foo --implicit-check-not @bar \ +; RUN: --implicit-check-not @BAZ --implicit-check-not @kernel_B \ +; RUN: --implicit-check-not @kernel_C ; CHECK0-DAG: define spir_kernel void @kernel_C ; CHECK0-DAG: define spir_func i32 @bar diff --git a/llvm/test/tools/sycl-post-link/device-code-split/one-kernel-per-module.ll b/llvm/test/tools/sycl-post-link/device-code-split/one-kernel-per-module.ll index 
715929861b356..0197a2edd4a1b 100644 --- a/llvm/test/tools/sycl-post-link/device-code-split/one-kernel-per-module.ll +++ b/llvm/test/tools/sycl-post-link/device-code-split/one-kernel-per-module.ll @@ -5,6 +5,15 @@ ; RUN: FileCheck %s -input-file=%t.files_1.sym --check-prefixes CHECK-MODULE1-TXT ; RUN: FileCheck %s -input-file=%t.files_2.ll --check-prefixes CHECK-MODULE2,CHECK ; RUN: FileCheck %s -input-file=%t.files_2.sym --check-prefixes CHECK-MODULE2-TXT +; +; RUN: sycl-module-split -split=kernel -S < %s -o %t2.files +; RUN: FileCheck %s -input-file=%t2.files_0.ll --check-prefixes CHECK-MODULE0,CHECK +; RUN: FileCheck %s -input-file=%t2.files_0.sym --check-prefixes CHECK-MODULE0-TXT +; RUN: FileCheck %s -input-file=%t2.files_1.ll --check-prefixes CHECK-MODULE1,CHECK +; RUN: FileCheck %s -input-file=%t2.files_1.sym --check-prefixes CHECK-MODULE1-TXT +; RUN: FileCheck %s -input-file=%t2.files_2.ll --check-prefixes CHECK-MODULE2,CHECK +; RUN: FileCheck %s -input-file=%t2.files_2.sym --check-prefixes CHECK-MODULE2-TXT + ; ModuleID = 'one-kernel-per-module.ll' source_filename = "one-kernel-per-module.ll" target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024" diff --git a/llvm/test/tools/sycl-post-link/device-code-split/per-aspect-split-1.ll b/llvm/test/tools/sycl-post-link/device-code-split/per-aspect-split-1.ll index faec71a602ffd..51a2895f4d326 100644 --- a/llvm/test/tools/sycl-post-link/device-code-split/per-aspect-split-1.ll +++ b/llvm/test/tools/sycl-post-link/device-code-split/per-aspect-split-1.ll @@ -21,6 +21,20 @@ ; RUN: FileCheck %s -input-file=%t_2.sym --check-prefixes CHECK-M2-SYMS \ ; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 +; RUN: sycl-module-split -split=auto -S < %s -o %t2 +; RUN: FileCheck %s -input-file=%t2_0.ll --check-prefixes CHECK-M0-IR \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 +; RUN: FileCheck %s -input-file=%t2_1.ll --check-prefixes CHECK-M1-IR \ 
+; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 +; RUN: FileCheck %s -input-file=%t2_2.ll --check-prefixes CHECK-M2-IR \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 +; RUN: FileCheck %s -input-file=%t2_0.sym --check-prefixes CHECK-M0-SYMS \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 +; RUN: FileCheck %s -input-file=%t2_1.sym --check-prefixes CHECK-M1-SYMS \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 +; RUN: FileCheck %s -input-file=%t2_2.sym --check-prefixes CHECK-M2-SYMS \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 + ; RUN: sycl-post-link -split=source -symbols -S < %s -o %t.table ; RUN: FileCheck %s -input-file=%t_0.ll --check-prefixes CHECK-M0-IR \ ; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 @@ -35,6 +49,20 @@ ; RUN: FileCheck %s -input-file=%t_2.sym --check-prefixes CHECK-M2-SYMS \ ; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 +; RUN: sycl-module-split -split=source -S < %s -o %t2 +; RUN: FileCheck %s -input-file=%t2_0.ll --check-prefixes CHECK-M0-IR \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 +; RUN: FileCheck %s -input-file=%t2_1.ll --check-prefixes CHECK-M1-IR \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 +; RUN: FileCheck %s -input-file=%t2_2.ll --check-prefixes CHECK-M2-IR \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 +; RUN: FileCheck %s -input-file=%t2_0.sym --check-prefixes CHECK-M0-SYMS \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 +; RUN: FileCheck %s -input-file=%t2_1.sym --check-prefixes CHECK-M1-SYMS \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 +; RUN: FileCheck %s -input-file=%t2_2.sym --check-prefixes CHECK-M2-SYMS \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 + ; RUN: sycl-post-link -split=kernel -symbols -S < %s -o %t.table ; RUN: FileCheck %s -input-file=%t_0.ll 
--check-prefixes CHECK-M0-IR \ ; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 @@ -49,6 +77,20 @@ ; RUN: FileCheck %s -input-file=%t_2.sym --check-prefixes CHECK-M2-SYMS \ ; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 +; RUN: sycl-module-split -split=kernel -S < %s -o %t2 +; RUN: FileCheck %s -input-file=%t2_0.ll --check-prefixes CHECK-M0-IR \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 +; RUN: FileCheck %s -input-file=%t2_1.ll --check-prefixes CHECK-M1-IR \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 +; RUN: FileCheck %s -input-file=%t2_2.ll --check-prefixes CHECK-M2-IR \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 +; RUN: FileCheck %s -input-file=%t2_0.sym --check-prefixes CHECK-M0-SYMS \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 +; RUN: FileCheck %s -input-file=%t2_1.sym --check-prefixes CHECK-M1-SYMS \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 +; RUN: FileCheck %s -input-file=%t2_2.sym --check-prefixes CHECK-M2-SYMS \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 + ; Regardless of device code split mode, each kernel should go into a separate ; device image diff --git a/llvm/test/tools/sycl-post-link/device-code-split/per-aspect-split-2.ll b/llvm/test/tools/sycl-post-link/device-code-split/per-aspect-split-2.ll index 773424fa91fcb..f4d66822b261c 100644 --- a/llvm/test/tools/sycl-post-link/device-code-split/per-aspect-split-2.ll +++ b/llvm/test/tools/sycl-post-link/device-code-split/per-aspect-split-2.ll @@ -15,6 +15,20 @@ ; RUN: FileCheck %s -input-file=%t_2.sym --check-prefix CHECK-M2-SYMS \ ; RUN: --implicit-check-not kernel0 --implicit-check-not kernel3 +; RUN: sycl-module-split -split=auto -S < %s -o %t2 +; RUN: FileCheck %s -input-file=%t2.table --check-prefix CHECK-TABLE +; +; RUN: FileCheck %s -input-file=%t2_0.sym --check-prefix CHECK-M0-SYMS \ +; RUN: --implicit-check-not kernel3 
--implicit-check-not kernel1 \ +; RUN: --implicit-check-not kernel2 +; +; RUN: FileCheck %s -input-file=%t2_1.sym --check-prefix CHECK-M1-SYMS \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 \ +; RUN: --implicit-check-not kernel2 +; +; RUN: FileCheck %s -input-file=%t2_2.sym --check-prefix CHECK-M2-SYMS \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel3 + ; CHECK-TABLE: Code ; CHECK-TABLE-NEXT: _0.sym ; CHECK-TABLE-NEXT: _1.sym diff --git a/llvm/test/tools/sycl-post-link/device-code-split/per-aspect-split-3.ll b/llvm/test/tools/sycl-post-link/device-code-split/per-aspect-split-3.ll index 5c1f743997816..523477a07573b 100644 --- a/llvm/test/tools/sycl-post-link/device-code-split/per-aspect-split-3.ll +++ b/llvm/test/tools/sycl-post-link/device-code-split/per-aspect-split-3.ll @@ -17,6 +17,22 @@ ; RUN: FileCheck %s -input-file=%t_1.ll --check-prefix CHECK-M1-IR \ ; RUN: --implicit-check-not kernel0 --implicit-check-not bar +; RUN: sycl-module-split -split=auto -S < %s -o %t2 +; RUN: FileCheck %s -input-file=%t2.table --check-prefix CHECK-TABLE +; +; RUN: FileCheck %s -input-file=%t2_0.sym --check-prefix CHECK-M0-SYMS \ +; RUN: --implicit-check-not foo --implicit-check-not kernel1 +; +; RUN: FileCheck %s -input-file=%t2_1.sym --check-prefix CHECK-M1-SYMS \ +; RUN: --implicit-check-not foo --implicit-check-not kernel0 +; +; RUN: FileCheck %s -input-file=%t2_2.sym --check-prefix CHECK-M2-SYMS \ +; RUN: --implicit-check-not kernel0 --implicit-check-not foo \ +; RUN: --implicit-check-not bar +; +; RUN: FileCheck %s -input-file=%t2_1.ll --check-prefix CHECK-M1-IR \ +; RUN: --implicit-check-not kernel0 --implicit-check-not bar + ; We expect to see 3 modules generated: ; ; CHECK-TABLE: Code diff --git a/llvm/test/tools/sycl-post-link/device-code-split/per-joint-matrix-1.ll b/llvm/test/tools/sycl-post-link/device-code-split/per-joint-matrix-1.ll index 282a0dd0dc79e..543a892415fa4 100644 --- 
a/llvm/test/tools/sycl-post-link/device-code-split/per-joint-matrix-1.ll +++ b/llvm/test/tools/sycl-post-link/device-code-split/per-joint-matrix-1.ll @@ -15,6 +15,16 @@ ; RUN: FileCheck %s -input-file=%t_1.sym --check-prefixes CHECK-SYMS-K1,CHECK-SYMS-K2 \ ; RUN: --implicit-check-not Kernel3 +; RUN: sycl-module-split -split=auto -S %s -o %t2 +; RUN: FileCheck %s -input-file=%t2_0.ll --check-prefixes CHECK-IR-K3 \ +; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 +; RUN: FileCheck %s -input-file=%t2_1.ll --check-prefixes CHECK-IR-K1,CHECK-IR-K2 \ +; RUN: --implicit-check-not Kernel3 +; RUN: FileCheck %s -input-file=%t2_0.sym --check-prefixes CHECK-SYMS-K3 \ +; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 +; RUN: FileCheck %s -input-file=%t2_1.sym --check-prefixes CHECK-SYMS-K1,CHECK-SYMS-K2 \ +; RUN: --implicit-check-not Kernel3 + ; RUN: sycl-post-link -split=source -symbols -S %s -o %t.table ; RUN: FileCheck %s -input-file=%t_0.ll --check-prefixes CHECK-IR-K3 \ ; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 @@ -25,6 +35,16 @@ ; RUN: FileCheck %s -input-file=%t_1.sym --check-prefixes CHECK-SYMS-K1,CHECK-SYMS-K2 \ ; RUN: --implicit-check-not Kernel3 +; RUN: sycl-module-split -split=source -S %s -o %t2 +; RUN: FileCheck %s -input-file=%t2_0.ll --check-prefixes CHECK-IR-K3 \ +; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 +; RUN: FileCheck %s -input-file=%t2_1.ll --check-prefixes CHECK-IR-K1,CHECK-IR-K2 \ +; RUN: --implicit-check-not Kernel3 +; RUN: FileCheck %s -input-file=%t2_0.sym --check-prefixes CHECK-SYMS-K3 \ +; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 +; RUN: FileCheck %s -input-file=%t2_1.sym --check-prefixes CHECK-SYMS-K1,CHECK-SYMS-K2 \ +; RUN: --implicit-check-not Kernel3 + ; RUN: sycl-post-link -split=kernel -symbols -S %s -o %t.table ; RUN: FileCheck %s -input-file=%t_0.ll --check-prefixes CHECK-IR-K3 \ ; RUN: --implicit-check-not Kernel1 --implicit-check-not 
Kernel2 @@ -39,6 +59,20 @@ ; RUN: FileCheck %s -input-file=%t_2.sym --check-prefixes CHECK-SYMS-K1 \ ; RUN: --implicit-check-not Kernel3 --implicit-check-not Kernel2 +; RUN: sycl-module-split -split=kernel -S %s -o %t2 +; RUN: FileCheck %s -input-file=%t2_0.ll --check-prefixes CHECK-IR-K3 \ +; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 +; RUN: FileCheck %s -input-file=%t2_1.ll --check-prefixes CHECK-IR-K2 \ +; RUN: --implicit-check-not Kernel3 --implicit-check-not Kernel1 +; RUN: FileCheck %s -input-file=%t2_2.ll --check-prefixes CHECK-IR-K1 \ +; RUN: --implicit-check-not Kernel3 --implicit-check-not Kernel2 +; RUN: FileCheck %s -input-file=%t2_0.sym --check-prefixes CHECK-SYMS-K3 \ +; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 +; RUN: FileCheck %s -input-file=%t2_1.sym --check-prefixes CHECK-SYMS-K2 \ +; RUN: --implicit-check-not Kernel3 --implicit-check-not Kernel1 +; RUN: FileCheck %s -input-file=%t2_2.sym --check-prefixes CHECK-SYMS-K1 \ +; RUN: --implicit-check-not Kernel3 --implicit-check-not Kernel2 + ; CHECK-IR-K1: define {{.*}} @Kernel1 ; CHECK-IR-K2: define {{.*}} @Kernel2 ; CHECK-IR-K3: define {{.*}} @Kernel3 diff --git a/llvm/test/tools/sycl-post-link/device-code-split/per-joint-matrix-2.ll b/llvm/test/tools/sycl-post-link/device-code-split/per-joint-matrix-2.ll index 5472093bda677..6c054fc579659 100644 --- a/llvm/test/tools/sycl-post-link/device-code-split/per-joint-matrix-2.ll +++ b/llvm/test/tools/sycl-post-link/device-code-split/per-joint-matrix-2.ll @@ -16,6 +16,16 @@ ; RUN: FileCheck %s -input-file=%t_1.sym --check-prefixes CHECK-SYMS-K1,CHECK-SYMS-K3 \ ; RUN: --implicit-check-not Kernel2 +; RUN: sycl-module-split -split=auto -S %s -o %t2 +; RUN: FileCheck %s -input-file=%t2_0.ll --check-prefixes CHECK-IR-K2 \ +; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel3 +; RUN: FileCheck %s -input-file=%t2_1.ll --check-prefixes CHECK-IR-K1,CHECK-IR-K3 \ +; RUN: --implicit-check-not Kernel2 +; RUN: 
FileCheck %s -input-file=%t2_0.sym --check-prefixes CHECK-SYMS-K2 \ +; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel3 +; RUN: FileCheck %s -input-file=%t2_1.sym --check-prefixes CHECK-SYMS-K1,CHECK-SYMS-K3 \ +; RUN: --implicit-check-not Kernel2 + ; RUN: sycl-post-link -split=source -symbols -S %s -o %t.table ; RUN: FileCheck %s -input-file=%t_0.ll --check-prefixes CHECK-IR-K2 \ ; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel3 @@ -26,6 +36,16 @@ ; RUN: FileCheck %s -input-file=%t_1.sym --check-prefixes CHECK-SYMS-K1,CHECK-SYMS-K3 \ ; RUN: --implicit-check-not Kernel2 +; RUN: sycl-module-split -split=source -S %s -o %t2 +; RUN: FileCheck %s -input-file=%t2_0.ll --check-prefixes CHECK-IR-K2 \ +; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel3 +; RUN: FileCheck %s -input-file=%t2_1.ll --check-prefixes CHECK-IR-K1,CHECK-IR-K3 \ +; RUN: --implicit-check-not Kernel2 +; RUN: FileCheck %s -input-file=%t2_0.sym --check-prefixes CHECK-SYMS-K2 \ +; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel3 +; RUN: FileCheck %s -input-file=%t2_1.sym --check-prefixes CHECK-SYMS-K1,CHECK-SYMS-K3 \ +; RUN: --implicit-check-not Kernel2 + ; RUN: sycl-post-link -split=kernel -symbols -S %s -o %t.table ; RUN: FileCheck %s -input-file=%t_0.ll --check-prefixes CHECK-IR-K3 \ ; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 @@ -40,6 +60,20 @@ ; RUN: FileCheck %s -input-file=%t_2.sym --check-prefixes CHECK-SYMS-K1 \ ; RUN: --implicit-check-not Kernel2 --implicit-check-not Kernel3 +; RUN: sycl-module-split -split=kernel -S %s -o %t2 +; RUN: FileCheck %s -input-file=%t2_0.ll --check-prefixes CHECK-IR-K3 \ +; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 +; RUN: FileCheck %s -input-file=%t2_1.ll --check-prefixes CHECK-IR-K2 \ +; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel3 +; RUN: FileCheck %s -input-file=%t2_2.ll --check-prefixes CHECK-IR-K1 \ +; RUN: --implicit-check-not Kernel2 
--implicit-check-not Kernel3 +; RUN: FileCheck %s -input-file=%t2_0.sym --check-prefixes CHECK-SYMS-K3 \ +; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 +; RUN: FileCheck %s -input-file=%t2_1.sym --check-prefixes CHECK-SYMS-K2 \ +; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel3 +; RUN: FileCheck %s -input-file=%t2_2.sym --check-prefixes CHECK-SYMS-K1 \ +; RUN: --implicit-check-not Kernel2 --implicit-check-not Kernel3 + ; CHECK-IR-K1: define {{.*}} @Kernel1 ; CHECK-IR-K2: define {{.*}} @Kernel2 ; CHECK-IR-K3: define {{.*}} @Kernel3 diff --git a/llvm/test/tools/sycl-post-link/device-code-split/per-joint-matrix-mad-1.ll b/llvm/test/tools/sycl-post-link/device-code-split/per-joint-matrix-mad-1.ll index c85f636459fa2..fd64b234b2c6f 100644 --- a/llvm/test/tools/sycl-post-link/device-code-split/per-joint-matrix-mad-1.ll +++ b/llvm/test/tools/sycl-post-link/device-code-split/per-joint-matrix-mad-1.ll @@ -15,6 +15,16 @@ ; RUN: FileCheck %s -input-file=%t_1.sym --check-prefixes CHECK-SYMS-K1,CHECK-SYMS-K2 \ ; RUN: --implicit-check-not Kernel3 +; RUN: sycl-module-split -split=auto -S %s -o %t2 +; RUN: FileCheck %s -input-file=%t2_0.ll --check-prefixes CHECK-IR-K3 \ +; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 +; RUN: FileCheck %s -input-file=%t2_1.ll --check-prefixes CHECK-IR-K1,CHECK-IR-K2 \ +; RUN: --implicit-check-not Kernel3 +; RUN: FileCheck %s -input-file=%t2_0.sym --check-prefixes CHECK-SYMS-K3 \ +; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 +; RUN: FileCheck %s -input-file=%t2_1.sym --check-prefixes CHECK-SYMS-K1,CHECK-SYMS-K2 \ +; RUN: --implicit-check-not Kernel3 + ; RUN: sycl-post-link -split=source -symbols -S %s -o %t.table ; RUN: FileCheck %s -input-file=%t_0.ll --check-prefixes CHECK-IR-K3 \ ; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 @@ -25,6 +35,16 @@ ; RUN: FileCheck %s -input-file=%t_1.sym --check-prefixes CHECK-SYMS-K1,CHECK-SYMS-K2 \ ; RUN: --implicit-check-not 
Kernel3 +; RUN: sycl-module-split -split=source -S %s -o %t2 +; RUN: FileCheck %s -input-file=%t2_0.ll --check-prefixes CHECK-IR-K3 \ +; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 +; RUN: FileCheck %s -input-file=%t2_1.ll --check-prefixes CHECK-IR-K1,CHECK-IR-K2 \ +; RUN: --implicit-check-not Kernel3 +; RUN: FileCheck %s -input-file=%t2_0.sym --check-prefixes CHECK-SYMS-K3 \ +; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 +; RUN: FileCheck %s -input-file=%t2_1.sym --check-prefixes CHECK-SYMS-K1,CHECK-SYMS-K2 \ +; RUN: --implicit-check-not Kernel3 + ; RUN: sycl-post-link -split=kernel -symbols -S %s -o %t.table ; RUN: FileCheck %s -input-file=%t_0.ll --check-prefixes CHECK-IR-K3 \ ; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 @@ -39,6 +59,20 @@ ; RUN: FileCheck %s -input-file=%t_2.sym --check-prefixes CHECK-SYMS-K1 \ ; RUN: --implicit-check-not Kernel3 --implicit-check-not Kernel2 +; RUN: sycl-module-split -split=kernel -S %s -o %t2 +; RUN: FileCheck %s -input-file=%t2_0.ll --check-prefixes CHECK-IR-K3 \ +; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 +; RUN: FileCheck %s -input-file=%t2_1.ll --check-prefixes CHECK-IR-K2 \ +; RUN: --implicit-check-not Kernel3 --implicit-check-not Kernel1 +; RUN: FileCheck %s -input-file=%t2_2.ll --check-prefixes CHECK-IR-K1 \ +; RUN: --implicit-check-not Kernel3 --implicit-check-not Kernel2 +; RUN: FileCheck %s -input-file=%t2_0.sym --check-prefixes CHECK-SYMS-K3 \ +; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 +; RUN: FileCheck %s -input-file=%t2_1.sym --check-prefixes CHECK-SYMS-K2 \ +; RUN: --implicit-check-not Kernel3 --implicit-check-not Kernel1 +; RUN: FileCheck %s -input-file=%t2_2.sym --check-prefixes CHECK-SYMS-K1 \ +; RUN: --implicit-check-not Kernel3 --implicit-check-not Kernel2 + ; CHECK-IR-K1: define {{.*}} @Kernel1 ; CHECK-IR-K2: define {{.*}} @Kernel2 ; CHECK-IR-K3: define {{.*}} @Kernel3 diff --git 
a/llvm/test/tools/sycl-post-link/device-code-split/per-joint-matrix-mad-2.ll b/llvm/test/tools/sycl-post-link/device-code-split/per-joint-matrix-mad-2.ll index f13f9caf01ed7..4c4a4bc8a1a6e 100644 --- a/llvm/test/tools/sycl-post-link/device-code-split/per-joint-matrix-mad-2.ll +++ b/llvm/test/tools/sycl-post-link/device-code-split/per-joint-matrix-mad-2.ll @@ -16,6 +16,16 @@ ; RUN: FileCheck %s -input-file=%t_1.sym --check-prefixes CHECK-SYMS-K1,CHECK-SYMS-K3 \ ; RUN: --implicit-check-not Kernel2 +; RUN: sycl-module-split -split=auto -S %s -o %t2 +; RUN: FileCheck %s -input-file=%t2_0.ll --check-prefixes CHECK-IR-K2 \ +; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel3 +; RUN: FileCheck %s -input-file=%t2_1.ll --check-prefixes CHECK-IR-K1,CHECK-IR-K3 \ +; RUN: --implicit-check-not Kernel2 +; RUN: FileCheck %s -input-file=%t2_0.sym --check-prefixes CHECK-SYMS-K2 \ +; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel3 +; RUN: FileCheck %s -input-file=%t2_1.sym --check-prefixes CHECK-SYMS-K1,CHECK-SYMS-K3 \ +; RUN: --implicit-check-not Kernel2 + ; RUN: sycl-post-link -split=source -symbols -S %s -o %t.table ; RUN: FileCheck %s -input-file=%t_0.ll --check-prefixes CHECK-IR-K2 \ ; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel3 @@ -26,6 +36,16 @@ ; RUN: FileCheck %s -input-file=%t_1.sym --check-prefixes CHECK-SYMS-K1,CHECK-SYMS-K3 \ ; RUN: --implicit-check-not Kernel2 +; RUN: sycl-module-split -split=source -S %s -o %t2 +; RUN: FileCheck %s -input-file=%t2_0.ll --check-prefixes CHECK-IR-K2 \ +; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel3 +; RUN: FileCheck %s -input-file=%t2_1.ll --check-prefixes CHECK-IR-K1,CHECK-IR-K3 \ +; RUN: --implicit-check-not Kernel2 +; RUN: FileCheck %s -input-file=%t2_0.sym --check-prefixes CHECK-SYMS-K2 \ +; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel3 +; RUN: FileCheck %s -input-file=%t2_1.sym --check-prefixes CHECK-SYMS-K1,CHECK-SYMS-K3 \ +; RUN: 
--implicit-check-not Kernel2 + ; RUN: sycl-post-link -split=kernel -symbols -S %s -o %t.table ; RUN: FileCheck %s -input-file=%t_0.ll --check-prefixes CHECK-IR-K3 \ ; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 @@ -40,6 +60,20 @@ ; RUN: FileCheck %s -input-file=%t_2.sym --check-prefixes CHECK-SYMS-K1 \ ; RUN: --implicit-check-not Kernel2 --implicit-check-not Kernel3 +; RUN: sycl-module-split -split=kernel -S %s -o %t2 +; RUN: FileCheck %s -input-file=%t2_0.ll --check-prefixes CHECK-IR-K3 \ +; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 +; RUN: FileCheck %s -input-file=%t2_1.ll --check-prefixes CHECK-IR-K2 \ +; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel3 +; RUN: FileCheck %s -input-file=%t2_2.ll --check-prefixes CHECK-IR-K1 \ +; RUN: --implicit-check-not Kernel2 --implicit-check-not Kernel3 +; RUN: FileCheck %s -input-file=%t2_0.sym --check-prefixes CHECK-SYMS-K3 \ +; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 +; RUN: FileCheck %s -input-file=%t2_1.sym --check-prefixes CHECK-SYMS-K2 \ +; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel3 +; RUN: FileCheck %s -input-file=%t2_2.sym --check-prefixes CHECK-SYMS-K1 \ +; RUN: --implicit-check-not Kernel2 --implicit-check-not Kernel3 + ; CHECK-IR-K1: define {{.*}} @Kernel1 ; CHECK-IR-K2: define {{.*}} @Kernel2 ; CHECK-IR-K3: define {{.*}} @Kernel3 diff --git a/llvm/test/tools/sycl-post-link/device-code-split/per-joint-matrix-mad-4.ll b/llvm/test/tools/sycl-post-link/device-code-split/per-joint-matrix-mad-4.ll index f4a312ded5c7e..fe995542deba1 100644 --- a/llvm/test/tools/sycl-post-link/device-code-split/per-joint-matrix-mad-4.ll +++ b/llvm/test/tools/sycl-post-link/device-code-split/per-joint-matrix-mad-4.ll @@ -16,6 +16,16 @@ ; RUN: FileCheck %s -input-file=%t_1.sym --check-prefixes CHECK-SYMS-K1,CHECK-SYMS-K2 \ ; RUN: --implicit-check-not Kernel3 +; RUN: sycl-module-split -split=auto -S %s -o %t2 +; RUN: FileCheck %s 
-input-file=%t2_0.ll --check-prefixes CHECK-IR-K3 \ +; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 +; RUN: FileCheck %s -input-file=%t2_1.ll --check-prefixes CHECK-IR-K1,CHECK-IR-K2 \ +; RUN: --implicit-check-not Kernel3 +; RUN: FileCheck %s -input-file=%t2_0.sym --check-prefixes CHECK-SYMS-K3 \ +; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 +; RUN: FileCheck %s -input-file=%t2_1.sym --check-prefixes CHECK-SYMS-K1,CHECK-SYMS-K2 \ +; RUN: --implicit-check-not Kernel3 + ; RUN: sycl-post-link -split=source -symbols -S %s -o %t.table ; RUN: FileCheck %s -input-file=%t_0.ll --check-prefixes CHECK-IR-K3 \ ; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 @@ -26,6 +36,16 @@ ; RUN: FileCheck %s -input-file=%t_1.sym --check-prefixes CHECK-SYMS-K1,CHECK-SYMS-K2 \ ; RUN: --implicit-check-not Kernel3 +; RUN: sycl-module-split -split=source -S %s -o %t2 +; RUN: FileCheck %s -input-file=%t2_0.ll --check-prefixes CHECK-IR-K3 \ +; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 +; RUN: FileCheck %s -input-file=%t2_1.ll --check-prefixes CHECK-IR-K1,CHECK-IR-K2 \ +; RUN: --implicit-check-not Kernel3 +; RUN: FileCheck %s -input-file=%t2_0.sym --check-prefixes CHECK-SYMS-K3 \ +; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 +; RUN: FileCheck %s -input-file=%t2_1.sym --check-prefixes CHECK-SYMS-K1,CHECK-SYMS-K2 \ +; RUN: --implicit-check-not Kernel3 + ; RUN: sycl-post-link -split=kernel -symbols -S %s -o %t.table ; RUN: FileCheck %s -input-file=%t_0.ll --check-prefixes CHECK-IR-K3 \ ; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 @@ -40,6 +60,20 @@ ; RUN: FileCheck %s -input-file=%t_2.sym --check-prefixes CHECK-SYMS-K1 \ ; RUN: --implicit-check-not Kernel3 --implicit-check-not Kernel2 +; RUN: sycl-module-split -split=kernel -S %s -o %t2 +; RUN: FileCheck %s -input-file=%t2_0.ll --check-prefixes CHECK-IR-K3 \ +; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 +; RUN: 
FileCheck %s -input-file=%t2_1.ll --check-prefixes CHECK-IR-K2 \ +; RUN: --implicit-check-not Kernel3 --implicit-check-not Kernel1 +; RUN: FileCheck %s -input-file=%t2_2.ll --check-prefixes CHECK-IR-K1 \ +; RUN: --implicit-check-not Kernel3 --implicit-check-not Kernel2 +; RUN: FileCheck %s -input-file=%t2_0.sym --check-prefixes CHECK-SYMS-K3 \ +; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 +; RUN: FileCheck %s -input-file=%t2_1.sym --check-prefixes CHECK-SYMS-K2 \ +; RUN: --implicit-check-not Kernel3 --implicit-check-not Kernel1 +; RUN: FileCheck %s -input-file=%t2_2.sym --check-prefixes CHECK-SYMS-K1 \ +; RUN: --implicit-check-not Kernel3 --implicit-check-not Kernel2 + ; CHECK-IR-K1: define {{.*}} @Kernel1 ; CHECK-IR-K2: define {{.*}} @Kernel2 ; CHECK-IR-K3: define {{.*}} @Kernel3 diff --git a/llvm/test/tools/sycl-post-link/device-code-split/per-joint-matrix-mad-5.ll b/llvm/test/tools/sycl-post-link/device-code-split/per-joint-matrix-mad-5.ll index b33aba9a2ad06..25fd2e26f3ca4 100644 --- a/llvm/test/tools/sycl-post-link/device-code-split/per-joint-matrix-mad-5.ll +++ b/llvm/test/tools/sycl-post-link/device-code-split/per-joint-matrix-mad-5.ll @@ -33,6 +33,32 @@ ; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 \ ; RUN: --implicit-check-not Kernel3 --implicit-check-not Kernel5 --implicit-check-not Kernel6 +; RUN: sycl-module-split -split=auto -S %s -o %t2 +; RUN: FileCheck %s -input-file=%t2_0.ll --check-prefixes CHECK-IR-K3,CHECK-IR-K5 \ +; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 \ +; RUN: --implicit-check-not Kernel4 --implicit-check-not Kernel6 +; RUN: FileCheck %s -input-file=%t2_1.ll --check-prefixes CHECK-IR-K6 \ +; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 \ +; RUN: --implicit-check-not Kernel3 --implicit-check-not Kernel4 --implicit-check-not Kernel5 +; RUN: FileCheck %s -input-file=%t2_2.ll --check-prefixes CHECK-IR-K1,CHECK-IR-K2 \ +; RUN: --implicit-check-not Kernel3 
--implicit-check-not Kernel4 \ +; RUN: --implicit-check-not Kernel5 --implicit-check-not Kernel6 +; RUN: FileCheck %s -input-file=%t2_3.ll --check-prefixes CHECK-IR-K4 \ +; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 \ +; RUN: --implicit-check-not Kernel3 --implicit-check-not Kernel5 --implicit-check-not Kernel6 +; RUN: FileCheck %s -input-file=%t2_0.sym --check-prefixes CHECK-SYMS-K3,CHECK-SYMS-K5 \ +; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 \ +; RUN: --implicit-check-not Kernel4 --implicit-check-not Kernel6 +; RUN: FileCheck %s -input-file=%t2_1.sym --check-prefixes CHECK-SYMS-K6 \ +; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 \ +; RUN: --implicit-check-not Kernel3 --implicit-check-not Kernel4 --implicit-check-not Kernel5 +; RUN: FileCheck %s -input-file=%t2_2.sym --check-prefixes CHECK-SYMS-K1,CHECK-SYMS-K2 \ +; RUN: --implicit-check-not Kernel3 --implicit-check-not Kernel4 \ +; RUN: --implicit-check-not Kernel5 --implicit-check-not Kernel6 +; RUN: FileCheck %s -input-file=%t2_3.sym --check-prefixes CHECK-SYMS-K4 \ +; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 \ +; RUN: --implicit-check-not Kernel3 --implicit-check-not Kernel5 --implicit-check-not Kernel6 + ; RUN: sycl-post-link -split=source -symbols -S %s -o %t.table ; RUN: FileCheck %s -input-file=%t_0.ll --check-prefixes CHECK-IR-K3,CHECK-IR-K5 \ ; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 \ @@ -59,6 +85,32 @@ ; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 \ ; RUN: --implicit-check-not Kernel3 --implicit-check-not Kernel5 --implicit-check-not Kernel6 +; RUN: sycl-module-split -split=source -S %s -o %t2 +; RUN: FileCheck %s -input-file=%t2_0.ll --check-prefixes CHECK-IR-K3,CHECK-IR-K5 \ +; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 \ +; RUN: --implicit-check-not Kernel4 --implicit-check-not Kernel6 +; RUN: FileCheck %s -input-file=%t2_1.ll --check-prefixes CHECK-IR-K6 \ 
+; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 \ +; RUN: --implicit-check-not Kernel3 --implicit-check-not Kernel4 --implicit-check-not Kernel5 +; RUN: FileCheck %s -input-file=%t2_2.ll --check-prefixes CHECK-IR-K1,CHECK-IR-K2 \ +; RUN: --implicit-check-not Kernel3 --implicit-check-not Kernel4 \ +; RUN: --implicit-check-not Kernel5 --implicit-check-not Kernel6 +; RUN: FileCheck %s -input-file=%t2_3.ll --check-prefixes CHECK-IR-K4 \ +; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 \ +; RUN: --implicit-check-not Kernel3 --implicit-check-not Kernel5 --implicit-check-not Kernel6 +; RUN: FileCheck %s -input-file=%t2_0.sym --check-prefixes CHECK-SYMS-K3,CHECK-SYMS-K5 \ +; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 \ +; RUN: --implicit-check-not Kernel4 --implicit-check-not Kernel6 +; RUN: FileCheck %s -input-file=%t2_1.sym --check-prefixes CHECK-SYMS-K6 \ +; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 \ +; RUN: --implicit-check-not Kernel3 --implicit-check-not Kernel4 --implicit-check-not Kernel5 +; RUN: FileCheck %s -input-file=%t2_2.sym --check-prefixes CHECK-SYMS-K1,CHECK-SYMS-K2 \ +; RUN: --implicit-check-not Kernel3 --implicit-check-not Kernel4 \ +; RUN: --implicit-check-not Kernel5 --implicit-check-not Kernel6 +; RUN: FileCheck %s -input-file=%t2_3.sym --check-prefixes CHECK-SYMS-K4 \ +; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 \ +; RUN: --implicit-check-not Kernel3 --implicit-check-not Kernel5 --implicit-check-not Kernel6 + ; RUN: sycl-post-link -split=kernel -symbols -S %s -o %t.table ; RUN: FileCheck %s -input-file=%t_0.ll --check-prefixes CHECK-IR-K6 \ ; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 \ @@ -97,6 +149,44 @@ ; RUN: --implicit-check-not Kernel12 --implicit-check-not Kernel3 \ ; RUN: --implicit-check-not Kernel4 --implicit-check-not Kernel5 --implicit-check-not Kernel6 +; RUN: sycl-module-split -split=kernel -S %s -o %t2 +; RUN: FileCheck 
%s -input-file=%t2_0.ll --check-prefixes CHECK-IR-K6 \ +; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 \ +; RUN: --implicit-check-not Kernel3 --implicit-check-not Kernel4 --implicit-check-not Kernel5 +; RUN: FileCheck %s -input-file=%t2_1.ll --check-prefixes CHECK-IR-K5 \ +; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 \ +; RUN: --implicit-check-not Kernel3 --implicit-check-not Kernel4 --implicit-check-not Kernel6 +; RUN: FileCheck %s -input-file=%t2_2.ll --check-prefixes CHECK-IR-K4 \ +; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 \ +; RUN: --implicit-check-not Kernel3 --implicit-check-not Kernel5 --implicit-check-not Kernel6 +; RUN: FileCheck %s -input-file=%t2_3.ll --check-prefixes CHECK-IR-K3 \ +; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 \ +; RUN: --implicit-check-not Kernel4 --implicit-check-not Kernel5 --implicit-check-not Kernel6 +; RUN: FileCheck %s -input-file=%t2_4.ll --check-prefixes CHECK-IR-K2 \ +; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel3 \ +; RUN: --implicit-check-not Kernel4 --implicit-check-not Kernel5 --implicit-check-not Kernel6 +; RUN: FileCheck %s -input-file=%t2_5.ll --check-prefixes CHECK-IR-K1 \ +; RUN: --implicit-check-not Kernel2 --implicit-check-not Kernel3 \ +; RUN: --implicit-check-not Kernel4 --implicit-check-not Kernel5 --implicit-check-not Kernel6 +; RUN: FileCheck %s -input-file=%t2_0.sym --check-prefixes CHECK-SYMS-K6 \ +; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 \ +; RUN: --implicit-check-not Kernel3 --implicit-check-not Kernel4 --implicit-check-not Kernel5 +; RUN: FileCheck %s -input-file=%t2_1.sym --check-prefixes CHECK-SYMS-K5 \ +; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 \ +; RUN: --implicit-check-not Kernel3 --implicit-check-not Kernel4 --implicit-check-not Kernel6 +; RUN: FileCheck %s -input-file=%t2_2.sym --check-prefixes CHECK-SYMS-K4 \ +; RUN: --implicit-check-not Kernel1
--implicit-check-not Kernel2 \ +; RUN: --implicit-check-not Kernel3 --implicit-check-not Kernel5 --implicit-check-not Kernel6 +; RUN: FileCheck %s -input-file=%t2_3.sym --check-prefixes CHECK-SYMS-K3 \ +; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 \ +; RUN: --implicit-check-not Kernel4 --implicit-check-not Kernel5 --implicit-check-not Kernel6 +; RUN: FileCheck %s -input-file=%t2_4.sym --check-prefixes CHECK-SYMS-K2 \ +; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel3 \ +; RUN: --implicit-check-not Kernel4 --implicit-check-not Kernel5 --implicit-check-not Kernel6 +; RUN: FileCheck %s -input-file=%t2_5.sym --check-prefixes CHECK-SYMS-K1 \ +; RUN: --implicit-check-not Kernel2 --implicit-check-not Kernel3 \ +; RUN: --implicit-check-not Kernel4 --implicit-check-not Kernel5 --implicit-check-not Kernel6 + ; CHECK-IR-K1: define {{.*}} @Kernel1 ; CHECK-IR-K2: define {{.*}} @Kernel2 ; CHECK-IR-K3: define {{.*}} @Kernel3 diff --git a/llvm/test/tools/sycl-post-link/device-code-split/per-reqd-sub-group-size-split-1.ll b/llvm/test/tools/sycl-post-link/device-code-split/per-reqd-sub-group-size-split-1.ll index cd890e158c734..393943b63db43 100644 --- a/llvm/test/tools/sycl-post-link/device-code-split/per-reqd-sub-group-size-split-1.ll +++ b/llvm/test/tools/sycl-post-link/device-code-split/per-reqd-sub-group-size-split-1.ll @@ -21,6 +21,20 @@ ; RUN: FileCheck %s -input-file=%t_2.sym --check-prefixes CHECK-M2-SYMS \ ; RUN: --implicit-check-not kernel1 --implicit-check-not kernel2 +; RUN: sycl-module-split -split=auto -S %s -o %t2 +; RUN: FileCheck %s -input-file=%t2_0.ll --check-prefixes CHECK-M0-IR \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 +; RUN: FileCheck %s -input-file=%t2_1.ll --check-prefixes CHECK-M1-IR \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel2 +; RUN: FileCheck %s -input-file=%t2_2.ll --check-prefixes CHECK-M2-IR \ +; RUN: --implicit-check-not kernel1 --implicit-check-not kernel2 +; RUN:
FileCheck %s -input-file=%t2_0.sym --check-prefixes CHECK-M0-SYMS \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 +; RUN: FileCheck %s -input-file=%t2_1.sym --check-prefixes CHECK-M1-SYMS \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel2 +; RUN: FileCheck %s -input-file=%t2_2.sym --check-prefixes CHECK-M2-SYMS \ +; RUN: --implicit-check-not kernel1 --implicit-check-not kernel2 + ; RUN: sycl-post-link -split=kernel -symbols -S %s -o %t.table ; RUN: FileCheck %s -input-file=%t_0.ll --check-prefixes CHECK-M0-IR \ ; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 @@ -35,6 +49,20 @@ ; RUN: FileCheck %s -input-file=%t_2.sym --check-prefixes CHECK-M2-SYMS \ ; RUN: --implicit-check-not kernel1 --implicit-check-not kernel2 +; RUN: sycl-module-split -split=kernel -S %s -o %t2 +; RUN: FileCheck %s -input-file=%t2_0.ll --check-prefixes CHECK-M0-IR \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 +; RUN: FileCheck %s -input-file=%t2_1.ll --check-prefixes CHECK-M1-IR \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel2 +; RUN: FileCheck %s -input-file=%t2_2.ll --check-prefixes CHECK-M2-IR \ +; RUN: --implicit-check-not kernel1 --implicit-check-not kernel2 +; RUN: FileCheck %s -input-file=%t2_0.sym --check-prefixes CHECK-M0-SYMS \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 +; RUN: FileCheck %s -input-file=%t2_1.sym --check-prefixes CHECK-M1-SYMS \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel2 +; RUN: FileCheck %s -input-file=%t2_2.sym --check-prefixes CHECK-M2-SYMS \ +; RUN: --implicit-check-not kernel1 --implicit-check-not kernel2 + ; RUN: sycl-post-link -split=source -symbols -S %s -o %t.table ; RUN: FileCheck %s -input-file=%t_0.ll --check-prefixes CHECK-M0-IR \ ; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 @@ -49,6 +77,20 @@ ; RUN: FileCheck %s -input-file=%t_2.sym --check-prefixes CHECK-M2-SYMS \ ; RUN: --implicit-check-not 
kernel1 --implicit-check-not kernel2 +; RUN: sycl-module-split -split=source -S %s -o %t2 +; RUN: FileCheck %s -input-file=%t2_0.ll --check-prefixes CHECK-M0-IR \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 +; RUN: FileCheck %s -input-file=%t2_1.ll --check-prefixes CHECK-M1-IR \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel2 +; RUN: FileCheck %s -input-file=%t2_2.ll --check-prefixes CHECK-M2-IR \ +; RUN: --implicit-check-not kernel1 --implicit-check-not kernel2 +; RUN: FileCheck %s -input-file=%t2_0.sym --check-prefixes CHECK-M0-SYMS \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 +; RUN: FileCheck %s -input-file=%t2_1.sym --check-prefixes CHECK-M1-SYMS \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel2 +; RUN: FileCheck %s -input-file=%t2_2.sym --check-prefixes CHECK-M2-SYMS \ +; RUN: --implicit-check-not kernel1 --implicit-check-not kernel2 + ; Regardless of device code split mode, each kernel should go into a separate ; device image diff --git a/llvm/test/tools/sycl-post-link/device-code-split/per-reqd-sub-group-size-split-2.ll b/llvm/test/tools/sycl-post-link/device-code-split/per-reqd-sub-group-size-split-2.ll index 155b843c390a5..1efeb364cb2e3 100644 --- a/llvm/test/tools/sycl-post-link/device-code-split/per-reqd-sub-group-size-split-2.ll +++ b/llvm/test/tools/sycl-post-link/device-code-split/per-reqd-sub-group-size-split-2.ll @@ -16,6 +16,21 @@ ; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 \ ; RUN: --implicit-check-not kernel2 +; RUN: sycl-module-split -split=auto -S %s -o %t2 +; RUN: FileCheck %s -input-file=%t2.table --check-prefix CHECK-TABLE +; +; RUN: FileCheck %s -input-file=%t2_0.sym --check-prefix CHECK-M0-SYMS \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel3 +; +; RUN: FileCheck %s -input-file=%t2_1.sym --check-prefix CHECK-M1-SYMS \ +; RUN: --implicit-check-not kernel1 --implicit-check-not kernel2 \ +; RUN: --implicit-check-not 
kernel3 + +; +; RUN: FileCheck %s -input-file=%t2_2.sym --check-prefix CHECK-M2-SYMS \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 \ +; RUN: --implicit-check-not kernel2 + ; CHECK-TABLE: Code ; CHECK-TABLE-NEXT: _0.sym ; CHECK-TABLE-NEXT: _1.sym diff --git a/llvm/test/tools/sycl-post-link/device-code-split/per-reqd-wg-size-split-1.ll b/llvm/test/tools/sycl-post-link/device-code-split/per-reqd-wg-size-split-1.ll index fa5ffe782a7db..b156d71b1e3f6 100644 --- a/llvm/test/tools/sycl-post-link/device-code-split/per-reqd-wg-size-split-1.ll +++ b/llvm/test/tools/sycl-post-link/device-code-split/per-reqd-wg-size-split-1.ll @@ -21,6 +21,20 @@ ; RUN: FileCheck %s -input-file=%t_2.sym --check-prefixes CHECK-M2-SYMS \ ; RUN: --implicit-check-not kernel1 --implicit-check-not kernel2 +; RUN: sycl-module-split -split=auto -S < %s -o %t2 +; RUN: FileCheck %s -input-file=%t2_0.ll --check-prefixes CHECK-M0-IR \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 +; RUN: FileCheck %s -input-file=%t2_1.ll --check-prefixes CHECK-M1-IR \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel2 +; RUN: FileCheck %s -input-file=%t2_2.ll --check-prefixes CHECK-M2-IR \ +; RUN: --implicit-check-not kernel1 --implicit-check-not kernel2 +; RUN: FileCheck %s -input-file=%t2_0.sym --check-prefixes CHECK-M0-SYMS \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 +; RUN: FileCheck %s -input-file=%t2_1.sym --check-prefixes CHECK-M1-SYMS \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel2 +; RUN: FileCheck %s -input-file=%t2_2.sym --check-prefixes CHECK-M2-SYMS \ +; RUN: --implicit-check-not kernel1 --implicit-check-not kernel2 + ; RUN: sycl-post-link -split=source -symbols -S < %s -o %t.table ; RUN: FileCheck %s -input-file=%t_0.ll --check-prefixes CHECK-M0-IR \ ; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 @@ -35,6 +49,20 @@ ; RUN: FileCheck %s -input-file=%t_2.sym --check-prefixes CHECK-M2-SYMS \ ;
RUN: --implicit-check-not kernel1 --implicit-check-not kernel2 +; RUN: sycl-module-split -split=source -S < %s -o %t2 +; RUN: FileCheck %s -input-file=%t2_0.ll --check-prefixes CHECK-M0-IR \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 +; RUN: FileCheck %s -input-file=%t2_1.ll --check-prefixes CHECK-M1-IR \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel2 +; RUN: FileCheck %s -input-file=%t2_2.ll --check-prefixes CHECK-M2-IR \ +; RUN: --implicit-check-not kernel1 --implicit-check-not kernel2 +; RUN: FileCheck %s -input-file=%t2_0.sym --check-prefixes CHECK-M0-SYMS \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 +; RUN: FileCheck %s -input-file=%t2_1.sym --check-prefixes CHECK-M1-SYMS \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel2 +; RUN: FileCheck %s -input-file=%t2_2.sym --check-prefixes CHECK-M2-SYMS \ +; RUN: --implicit-check-not kernel1 --implicit-check-not kernel2 + ; RUN: sycl-post-link -split=kernel -symbols -S < %s -o %t.table ; RUN: FileCheck %s -input-file=%t_0.ll --check-prefixes CHECK-M0-IR \ ; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 @@ -49,6 +77,20 @@ ; RUN: FileCheck %s -input-file=%t_2.sym --check-prefixes CHECK-M2-SYMS \ ; RUN: --implicit-check-not kernel1 --implicit-check-not kernel2 +; RUN: sycl-module-split -split=kernel -S < %s -o %t2 +; RUN: FileCheck %s -input-file=%t2_0.ll --check-prefixes CHECK-M0-IR \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 +; RUN: FileCheck %s -input-file=%t2_1.ll --check-prefixes CHECK-M1-IR \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel2 +; RUN: FileCheck %s -input-file=%t2_2.ll --check-prefixes CHECK-M2-IR \ +; RUN: --implicit-check-not kernel1 --implicit-check-not kernel2 +; RUN: FileCheck %s -input-file=%t2_0.sym --check-prefixes CHECK-M0-SYMS \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 +; RUN: FileCheck %s -input-file=%t2_1.sym --check-prefixes 
CHECK-M1-SYMS \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel2 +; RUN: FileCheck %s -input-file=%t2_2.sym --check-prefixes CHECK-M2-SYMS \ +; RUN: --implicit-check-not kernel1 --implicit-check-not kernel2 + ; Regardless of device code split mode, each kernel should go into a separate ; device image diff --git a/llvm/test/tools/sycl-post-link/device-code-split/per-reqd-wg-size-split-2.ll b/llvm/test/tools/sycl-post-link/device-code-split/per-reqd-wg-size-split-2.ll index cb38a596a7ba9..c92ae8dbc9c03 100644 --- a/llvm/test/tools/sycl-post-link/device-code-split/per-reqd-wg-size-split-2.ll +++ b/llvm/test/tools/sycl-post-link/device-code-split/per-reqd-wg-size-split-2.ll @@ -15,6 +15,20 @@ ; RUN: --implicit-check-not kernel1 --implicit-check-not kernel2 \ ; RUN: --implicit-check-not kernel0 +; RUN: sycl-module-split -split=auto -S < %s -o %t2 +; RUN: FileCheck %s -input-file=%t2.table --check-prefix CHECK-TABLE +; +; RUN: FileCheck %s -input-file=%t2_0.sym --check-prefix CHECK-M0-SYMS \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel2 +; +; RUN: FileCheck %s -input-file=%t2_1.sym --check-prefix CHECK-M1-SYMS \ +; RUN: --implicit-check-not kernel1 --implicit-check-not kernel3 \ +; RUN: --implicit-check-not kernel2 +; +; RUN: FileCheck %s -input-file=%t2_2.sym --check-prefix CHECK-M2-SYMS \ +; RUN: --implicit-check-not kernel1 --implicit-check-not kernel2 \ +; RUN: --implicit-check-not kernel0 + ; CHECK-TABLE: Code ; CHECK-TABLE-NEXT: _0.sym ; CHECK-TABLE-NEXT: _1.sym diff --git a/llvm/test/tools/sycl-post-link/device-code-split/split-with-kernel-declarations.ll b/llvm/test/tools/sycl-post-link/device-code-split/split-with-kernel-declarations.ll index 595427a786e7b..82213e4b3beeb 100644 --- a/llvm/test/tools/sycl-post-link/device-code-split/split-with-kernel-declarations.ll +++ b/llvm/test/tools/sycl-post-link/device-code-split/split-with-kernel-declarations.ll @@ -6,11 +6,22 @@ ; RUN: FileCheck %s -input-file=%t_0.sym --check-prefix 
CHECK-PER-SOURCE-SYM0 ; RUN: FileCheck %s -input-file=%t_1.sym --check-prefix CHECK-PER-SOURCE-SYM1 ; -; RUN: sycl-post-link -split=kernel -symbols -S < %s -o %t1.table -; RUN: FileCheck %s -input-file=%t1.table --check-prefix CHECK-PER-KERNEL-TABLE -; RUN: FileCheck %s -input-file=%t1_0.sym --check-prefix CHECK-PER-KERNEL-SYM1 -; RUN: FileCheck %s -input-file=%t1_1.sym --check-prefix CHECK-PER-KERNEL-SYM2 -; RUN: FileCheck %s -input-file=%t1_2.sym --check-prefix CHECK-PER-KERNEL-SYM0 +; RUN: sycl-module-split -split=source -S < %s -o %t1 +; RUN: FileCheck %s -input-file=%t1.table --check-prefix CHECK-PER-SOURCE-TABLE +; RUN: FileCheck %s -input-file=%t1_0.sym --check-prefix CHECK-PER-SOURCE-SYM0 +; RUN: FileCheck %s -input-file=%t1_1.sym --check-prefix CHECK-PER-SOURCE-SYM1 +; +; RUN: sycl-post-link -split=kernel -symbols -S < %s -o %t2.table +; RUN: FileCheck %s -input-file=%t2.table --check-prefix CHECK-PER-KERNEL-TABLE +; RUN: FileCheck %s -input-file=%t2_0.sym --check-prefix CHECK-PER-KERNEL-SYM1 +; RUN: FileCheck %s -input-file=%t2_1.sym --check-prefix CHECK-PER-KERNEL-SYM2 +; RUN: FileCheck %s -input-file=%t2_2.sym --check-prefix CHECK-PER-KERNEL-SYM0 +; +; RUN: sycl-module-split -split=kernel -S < %s -o %t3 +; RUN: FileCheck %s -input-file=%t3.table --check-prefix CHECK-PER-KERNEL-TABLE +; RUN: FileCheck %s -input-file=%t3_0.sym --check-prefix CHECK-PER-KERNEL-SYM1 +; RUN: FileCheck %s -input-file=%t3_1.sym --check-prefix CHECK-PER-KERNEL-SYM2 +; RUN: FileCheck %s -input-file=%t3_2.sym --check-prefix CHECK-PER-KERNEL-SYM0 ; With per-source split, there should be two device images ; CHECK-PER-SOURCE-TABLE: [Code|Properties|Symbols] diff --git a/llvm/test/tools/sycl-post-link/device-code-split/vtable.ll b/llvm/test/tools/sycl-post-link/device-code-split/vtable.ll index 02d289fa772e0..cb9fd1f77cf78 100644 --- a/llvm/test/tools/sycl-post-link/device-code-split/vtable.ll +++ b/llvm/test/tools/sycl-post-link/device-code-split/vtable.ll @@ -42,6 +42,9 @@ ; RUN: 
sycl-post-link -split=auto -S < %s -o %t.table ; RUN: FileCheck %s -input-file=%t_0.ll ; +; RUN: sycl-module-split -split=auto -S < %s -o %t2 +; RUN: FileCheck %s -input-file=%t2_0.ll +; ; CHECK-DAG: @_ZTV8Derived1 = {{.*}} @_ZN8Derived17displayEv ; CHECK-DAG: @_ZTV8Derived2 = {{.*}} @_ZN8Derived27displayEv ; diff --git a/llvm/tools/sycl-module-split/CMakeLists.txt b/llvm/tools/sycl-module-split/CMakeLists.txt new file mode 100644 index 0000000000000..0c29be481e538 --- /dev/null +++ b/llvm/tools/sycl-module-split/CMakeLists.txt @@ -0,0 +1,10 @@ +set(LLVM_LINK_COMPONENTS + Core + IRReader + Support + SYCLLowerIR + ) + +add_llvm_tool(sycl-module-split + sycl-module-split.cpp + ) diff --git a/llvm/tools/sycl-module-split/sycl-module-split.cpp b/llvm/tools/sycl-module-split/sycl-module-split.cpp new file mode 100644 index 0000000000000..89d8b9e10b2b7 --- /dev/null +++ b/llvm/tools/sycl-module-split/sycl-module-split.cpp @@ -0,0 +1,130 @@ +//==-- sycl-module-split: command line tool for testing SYCL Module Splitting // +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This program can be used only to test the SYCL Module Splitting. 
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IRReader/IRReader.h"
+#include "llvm/SYCLLowerIR/ModuleSplitter.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/FormatVariadic.h"
+#include "llvm/Support/PropertySetIO.h"
+#include "llvm/Support/SimpleTable.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/raw_ostream.h"
+
+#include <string>
+#include <vector>
+
+using namespace llvm;
+using namespace llvm::util;
+using namespace module_split;
+
+static cl::OptionCategory SplitCategory("Split options");
+
+static cl::opt<std::string> InputFilename(cl::Positional,
+                                          cl::desc("<input file>"),
+                                          cl::init("-"),
+                                          cl::value_desc("filename"));
+
+static cl::opt<std::string>
+    OutputFilenamePrefix("o", cl::desc("output filename prefix"),
+                         cl::value_desc("filename prefix"), cl::init("output"),
+                         cl::cat(SplitCategory));
+
+cl::opt<bool> OutputAssembly{"S", cl::desc("Write output as LLVM assembly"),
+                             cl::cat(SplitCategory)};
+
+cl::opt<IRSplitMode> SplitMode(
+    "split", cl::desc("split input module"), cl::Optional, cl::init(SPLIT_NONE),
+    cl::values(clEnumValN(module_split::SPLIT_PER_TU, "source",
+                          "1 output module per source (translation unit)"),
+               clEnumValN(module_split::SPLIT_PER_KERNEL, "kernel",
+                          "1 output module per kernel"),
+               clEnumValN(module_split::SPLIT_AUTO, "auto",
+                          "Choose split mode automatically")),
+    cl::cat(SplitCategory));
+
+void writeStringToFile(const std::string &Content, StringRef Path) {
+  std::error_code EC;
+  raw_fd_ostream OS(Path, EC);
+  if (EC) {
+    errs() << formatv("error opening file: {0}\n", Path);
+    exit(1);
+  }
+
+  OS << Content << "\n";
+}
+
+void writePropertiesToFile(const PropertySetRegistry &Properties,
+                           StringRef Path) {
+  std::error_code EC;
+  raw_fd_ostream OS(Path, EC);
+  if (EC) {
+    errs() << formatv("error opening file: {0}\n", Path);
+    exit(1);
+  }
+
+  Properties.write(OS);
+}
+
+void dumpModulesAsTable(const std::vector<SplitModule> &SplitModules,
+                        StringRef 
Path) {
+  std::vector<StringRef> Columns = {"Code", "Properties", "Symbols"};
+  auto TableOrErr = SimpleTable::create(Columns);
+  if (!TableOrErr) {
+    errs() << "can't create a table\n";
+    exit(1);
+  }
+
+  std::unique_ptr<SimpleTable> Table = std::move(*TableOrErr);
+  for (const auto &[I, SM] : enumerate(SplitModules)) {
+    std::string SymbolsFile = (Twine(Path) + "_" + Twine(I) + ".sym").str();
+    std::string PropertiesFile = (Twine(Path) + "_" + Twine(I) + ".prop").str();
+    writePropertiesToFile(SM.Properties, PropertiesFile);
+    writeStringToFile(SM.Symbols, SymbolsFile);
+    SmallVector<StringRef, 3> Row = {SM.ModuleFilePath, PropertiesFile,
+                                     SymbolsFile};
+    Table->addRow(Row);
+  }
+
+  std::error_code EC;
+  raw_fd_ostream OS((Path + ".table").str(), EC);
+  if (EC) {
+    errs() << formatv("error opening file: {0}\n", Path);
+    exit(1);
+  }
+
+  Table->write(OS);
+}
+
+int main(int argc, char *argv[]) {
+  LLVMContext C;
+  SMDiagnostic Err;
+  cl::ParseCommandLineOptions(argc, argv, "SYCL Module Splitter\n");
+
+  std::unique_ptr<Module> M = parseIRFile(InputFilename, Err, C);
+  if (!M) {
+    Err.print(argv[0], errs());
+    return 1;
+  }
+
+  ModuleSplitterSettings Settings;
+  Settings.Mode = SplitMode;
+  Settings.OutputAssembly = OutputAssembly;
+  Settings.OutputPrefix = OutputFilenamePrefix;
+  auto SplitModulesOrErr = splitSYCLModule(std::move(M), Settings);
+  if (!SplitModulesOrErr) {
+    Err.print(argv[0], errs());
+    return 1;
+  }
+
+  dumpModulesAsTable(*SplitModulesOrErr, OutputFilenamePrefix);
+}

From 572aa5c190a6fdeedd02188b1f90082c7b4f5365 Mon Sep 17 00:00:00 2001
From: Ouadie EL FAROUKI
Date: Fri, 7 Jun 2024 14:47:08 +0100
Subject: [PATCH 41/55] [SYCL][COMPAT] Add math `extend_v*2` to SYCLCompat
 (#13953)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This PR adds math `extend_v*2` operators _(18 in total)_ along with
unit-tests for signed and unsigned `int32` cases.
--------- Co-authored-by: Alberto Cabrera Pérez Co-authored-by: Joe Todd --- sycl/doc/syclcompat/README.md | 240 ++++++++++- sycl/include/syclcompat/math.hpp | 354 +++++++++++++++- .../syclcompat/math/math_extend_v.cpp | 395 ++++++++++++++++++ 3 files changed, 977 insertions(+), 12 deletions(-) create mode 100644 sycl/test-e2e/syclcompat/math/math_extend_v.cpp diff --git a/sycl/doc/syclcompat/README.md b/sycl/doc/syclcompat/README.md index 4bdcb93206f46..1d37700659028 100644 --- a/sycl/doc/syclcompat/README.md +++ b/sycl/doc/syclcompat/README.md @@ -1771,7 +1771,7 @@ struct sub_sat { } // namespace syclcompat ``` -Finally, the math header provides a set of functions to extend 32-bit operations +The math header provides a set of functions to extend 32-bit operations to 33 bit, and handle sign extension internally. There is support for `add`, `sub`, `absdiff`, `min` and `max` operations. Each operation provides overloads to include a second, separate, `BinaryOperation` after the first, and include @@ -1855,6 +1855,244 @@ inline constexpr RetT extend_max_sat(AT a, BT b, CT c, BinaryOperation second_op); ``` +Another set of vectorized extend 32-bit operations is provided in the math +header.These APIs treat each of the 32-bit operands as 2-elements vector +(16-bits each) while handling sign extension to 17-bits internally. There is +support for `add`, `sub`, `absdiff`, `min`, `max` and `avg` binary operations. +Each operation provides has a `_sat` variat which determines if the returning +value is saturated or not, and a `_add` variant that computes the binary sum +of the the initial operation outputs and a third operand. + +```cpp +/// Compute vectorized addition of \p a and \p b, with each value treated as a +/// 2 elements vector type and extend each element to 17 bit. 
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer +/// \tparam [in] AT The type of the first value, can only be 32 bit integer +/// \tparam [in] BT The type of the second value, can only be 32 bit integer +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] c The third value +/// \returns The extend vectorized addition of the two values +template +inline constexpr RetT extend_vadd2(AT a, BT b, RetT c); + +/// Compute vectorized addition of \p a and \p b, with each value treated as a 2 +/// elements vector type and extend each element to 17 bit. Then add each half +/// of the result and add with \p c. +/// \tparam [in] RetT The type of the return value, can only be 32 bit integer +/// \tparam [in] AT The type of the first value, can only be 32 bit integer +/// \tparam [in] BT The type of the second value, can only be 32 bit integer +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] c The third value +/// \returns The addition of each half of extend vectorized addition of the two +/// values and the third value +template +inline constexpr RetT extend_vadd2_add(AT a, BT b, RetT c); + +/// Compute vectorized addition of \p a and \p b with saturation, with each +/// value treated as a 2 elements vector type and extend each element to 17 bit. +/// \tparam [in] RetT The type of the return value, can only be 32 bit integer +/// \tparam [in] AT The type of the first value, can only be 32 bit integer +/// \tparam [in] BT The type of the second value, can only be 32 bit integer +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] c The third value +/// \returns The extend vectorized addition of the two values with saturation +template +inline constexpr RetT extend_vadd2_sat(AT a, BT b, RetT c); + +/// Compute vectorized subtraction of \p a and \p b, with each value treated as +/// a 2 elements vector type and extend each element to 17 bit. 
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer +/// \tparam [in] AT The type of the first value, can only be 32 bit integer +/// \tparam [in] BT The type of the second value, can only be 32 bit integer +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] c The third value +/// \returns The extend vectorized subtraction of the two values +template +inline constexpr RetT extend_vsub2(AT a, BT b, RetT c); + +/// Compute vectorized subtraction of \p a and \p b, with each value treated as +/// a 2 elements vector type and extend each element to 17 bit. Then add each +/// half of the result and add with \p c. +/// \tparam [in] RetT The type of the return value, can only be 32 bit integer +/// \tparam [in] AT The type of the first value, can only be 32 bit integer +/// \tparam [in] BT The type of the second value, can only be 32 bit integer +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] c The third value +/// \returns The addition of each half of extend vectorized subtraction of the +/// two values and the third value +template +inline constexpr RetT extend_vsub2_add(AT a, BT b, RetT c); + +/// Compute vectorized subtraction of \p a and \p b with saturation, with each +/// value treated as a 2 elements vector type and extend each element to 17 bit. 
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer +/// \tparam [in] AT The type of the first value, can only be 32 bit integer +/// \tparam [in] BT The type of the second value, can only be 32 bit integer +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] c The third value +/// \returns The extend vectorized subtraction of the two values with saturation +template +inline constexpr RetT extend_vsub2_sat(AT a, BT b, RetT c); + +/// Compute vectorized abs_diff of \p a and \p b, with each value treated as a 2 +/// elements vector type and extend each element to 17 bit. +/// \tparam [in] RetT The type of the return value, can only be 32 bit integer +/// \tparam [in] AT The type of the first value, can only be 32 bit integer +/// \tparam [in] BT The type of the second value, can only be 32 bit integer +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] c The third value +/// \returns The extend vectorized abs_diff of the two values +template +inline constexpr RetT extend_vabsdiff2(AT a, BT b, RetT c); + +/// Compute vectorized abs_diff of \p a and \p b, with each value treated as a 2 +/// elements vector type and extend each element to 17 bit. Then add each half +/// of the result and add with \p c. 
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer +/// \tparam [in] AT The type of the first value, can only be 32 bit integer +/// \tparam [in] BT The type of the second value, can only be 32 bit integer +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] c The third value +/// \returns The addition of each half of extend vectorized abs_diff of the +/// two values and the third value +template +inline constexpr RetT extend_vabsdiff2_add(AT a, BT b, RetT c); + +/// Compute vectorized abs_diff of \p a and \p b with saturation, with each +/// value treated as a 2 elements vector type and extend each element to 17 bit. +/// \tparam [in] RetT The type of the return value, can only be 32 bit integer +/// \tparam [in] AT The type of the first value, can only be 32 bit integer +/// \tparam [in] BT The type of the second value, can only be 32 bit integer +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] c The third value +/// \returns The extend vectorized abs_diff of the two values with saturation +template +inline constexpr RetT extend_vabsdiff2_sat(AT a, BT b, RetT c); + +/// Compute vectorized minimum of \p a and \p b, with each value treated as a 2 +/// elements vector type and extend each element to 17 bit. +/// \tparam [in] RetT The type of the return value, can only be 32 bit integer +/// \tparam [in] AT The type of the first value, can only be 32 bit integer +/// \tparam [in] BT The type of the second value, can only be 32 bit integer +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] c The third value +/// \returns The extend vectorized minimum of the two values +template +inline constexpr RetT extend_vmin2(AT a, BT b, RetT c); + +/// Compute vectorized minimum of \p a and \p b, with each value treated as a 2 +/// elements vector type and extend each element to 17 bit. Then add each half +/// of the result and add with \p c. 
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer +/// \tparam [in] AT The type of the first value, can only be 32 bit integer +/// \tparam [in] BT The type of the second value, can only be 32 bit integer +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] c The third value +/// \returns The addition of each half of extend vectorized minimum of the +/// two values and the third value +template +inline constexpr RetT extend_vmin2_add(AT a, BT b, RetT c); + +/// Compute vectorized minimum of \p a and \p b with saturation, with each value +/// treated as a 2 elements vector type and extend each element to 17 bit. +/// \tparam [in] RetT The type of the return value, can only be 32 bit integer +/// \tparam [in] AT The type of the first value, can only be 32 bit integer +/// \tparam [in] BT The type of the second value, can only be 32 bit integer +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] c The third value +/// \returns The extend vectorized minimum of the two values with saturation +template +inline constexpr RetT extend_vmin2_sat(AT a, BT b, RetT c); + +/// Compute vectorized maximum of \p a and \p b, with each value treated as a 2 +/// elements vector type and extend each element to 17 bit. +/// \tparam [in] RetT The type of the return value, can only be 32 bit integer +/// \tparam [in] AT The type of the first value, can only be 32 bit integer +/// \tparam [in] BT The type of the second value, can only be 32 bit integer +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] c The third value +/// \returns The extend vectorized maximum of the two values +template +inline constexpr RetT extend_vmax2(AT a, BT b, RetT c); + +/// Compute vectorized maximum of \p a and \p b, with each value treated as a 2 +/// elements vector type and extend each element to 17 bit. Then add each half +/// of the result and add with \p c. 
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer +/// \tparam [in] AT The type of the first value, can only be 32 bit integer +/// \tparam [in] BT The type of the second value, can only be 32 bit integer +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] c The third value +/// \returns The addition of each half of extend vectorized maximum of the +/// two values and the third value +template +inline constexpr RetT extend_vmax2_add(AT a, BT b, RetT c); + +/// Compute vectorized maximum of \p a and \p b with saturation, with each value +/// treated as a 2 elements vector type and extend each element to 17 bit. +/// \tparam [in] RetT The type of the return value, can only be 32 bit integer +/// \tparam [in] AT The type of the first value, can only be 32 bit integer +/// \tparam [in] BT The type of the second value, can only be 32 bit integer +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] c The third value +/// \returns The extend vectorized maximum of the two values with saturation +template +inline constexpr RetT extend_vmax2_sat(AT a, BT b, RetT c); + +/// Compute vectorized average of \p a and \p b, with each value treated as a 2 +/// elements vector type and extend each element to 17 bit. +/// \tparam [in] RetT The type of the return value, can only be 32 bit integer +/// \tparam [in] AT The type of the first value, can only be 32 bit integer +/// \tparam [in] BT The type of the second value, can only be 32 bit integer +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] c The third value +/// \returns The extend vectorized average of the two values +template +inline constexpr RetT extend_vavrg2(AT a, BT b, RetT c); + +/// Compute vectorized average of \p a and \p b, with each value treated as a 2 +/// elements vector type and extend each element to 17 bit. Then add each half +/// of the result and add with \p c. 
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer +/// \tparam [in] AT The type of the first value, can only be 32 bit integer +/// \tparam [in] BT The type of the second value, can only be 32 bit integer +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] c The third value +/// \returns The addition of each half of extend average maximum of the +/// two values and the third value +template +inline constexpr RetT extend_vavrg2_add(AT a, BT b, RetT c); + +/// Compute vectorized average of \p a and \p b with saturation, with each value +/// treated as a 2 elements vector type and extend each element to 17 bit. +/// \tparam [in] RetT The type of the return value, can only be 32 bit integer +/// \tparam [in] AT The type of the first value, can only be 32 bit integer +/// \tparam [in] BT The type of the second value, can only be 32 bit integer +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] c The third value +/// \returns The extend vectorized average of the two values with saturation +template +inline constexpr RetT extend_vavrg2_sat(AT a, BT b, RetT c); +``` + ## Sample Code Below is a simple linear algebra sample, which computes `y = mx + b` implemented diff --git a/sycl/include/syclcompat/math.hpp b/sycl/include/syclcompat/math.hpp index 5b742573b8db0..c35cb2db214b0 100644 --- a/sycl/include/syclcompat/math.hpp +++ b/sycl/include/syclcompat/math.hpp @@ -79,20 +79,43 @@ class vectorized_binary { }; /// Extend the 'val' to 'bit' size, zero extend for unsigned int and signed -/// extend for signed int. +/// extend for signed int. Returns a signed integer type. 
template -inline int64_t zero_or_signed_extent(ValueT val, unsigned bit) { - if constexpr (std::is_signed_v) { - return int64_t(val) << (64 - bit) >> (64 - bit); +inline auto zero_or_signed_extend(ValueT val, unsigned bit) { + static_assert(std::is_integral_v); + if constexpr (sizeof(ValueT) == 4) { + assert(bit < 64 && + "When extending int32 value, bit must be smaller than 64."); + if constexpr (std::is_signed_v) + return int64_t(val) << (64 - bit) >> (64 - bit); + else + return int64_t(val); + } else if constexpr (sizeof(ValueT) == 2) { + assert(bit < 32 && + "When extending int16 value, bit must be smaller than 32."); + if constexpr (std::is_signed_v) + return int32_t(val) << (32 - bit) >> (32 - bit); + else + return int32_t(val); + } else if constexpr (sizeof(ValueT) == 1) { + assert(bit < 16 && + "When extending int8 value, bit must be smaller than 16."); + if constexpr (std::is_signed_v) + return int16_t(val) << (16 - bit) >> (16 - bit); + else + return int16_t(val); + } else { + static_assert(sizeof(ValueT) == 8); + assert(bit < 64 && "Cannot extend int64 value."); + return static_cast(val); } - return val; } template inline constexpr RetT extend_binary(AT a, BT b, BinaryOperation binary_op) { - const int64_t extend_a = zero_or_signed_extent(a, 33); - const int64_t extend_b = zero_or_signed_extent(b, 33); + const int64_t extend_a = zero_or_signed_extend(a, 33); + const int64_t extend_b = zero_or_signed_extend(b, 33); const int64_t ret = binary_op(extend_a, extend_b); if constexpr (needSat) return detail::clamp(ret, std::numeric_limits::min(), @@ -105,18 +128,55 @@ template (extend_temp, std::numeric_limits::min(), std::numeric_limits::max()); - const int64_t extend_c = zero_or_signed_extent(c, 33); + const int64_t extend_c = zero_or_signed_extend(c, 33); return second_op(extend_temp, extend_c); } +template sycl::vec extractAndExtend2(T a) { + sycl::vec ret; + sycl::vec va{a}; + using Tint = + typename std::conditional, int16_t, uint16_t>::type; + auto v = 
va.template as>(); + ret[0] = zero_or_signed_extend(v[0], 17); + ret[1] = zero_or_signed_extend(v[1], 17); + return ret; +} + +template +inline constexpr RetT extend_vbinary2(AT a, BT b, RetT c, + BinaryOperation binary_op) { + static_assert(std::is_integral_v && std::is_integral_v && + std::is_integral_v && sizeof(AT) == 4 && + sizeof(BT) == 4 && sizeof(RetT) == 4); + sycl::vec extend_a = extractAndExtend2(a); + sycl::vec extend_b = extractAndExtend2(b); + sycl::vec temp{binary_op(extend_a[0], extend_b[0]), + binary_op(extend_a[1], extend_b[1])}; + using Tint = typename std::conditional, int16_t, + uint16_t>::type; + + if constexpr (NeedSat) { + int32_t min_val = 0, max_val = 0; + min_val = std::numeric_limits::min(); + max_val = std::numeric_limits::max(); + temp = detail::clamp(temp, {min_val, min_val}, {max_val, max_val}); + } + if constexpr (NeedAdd) { + return temp[0] + temp[1] + c; + } + return sycl::vec{temp[0], temp[1]}.template as>(); +} + template inline bool isnan(const ValueT a) { return sycl::isnan(a); } @@ -712,6 +772,13 @@ struct shift_right { return x >> offset; } }; + +struct average { + template auto operator()(const T x, const T y) const { + return (x + y + (x + y >= 0)) >> 1; + } +}; + } // namespace detail /// Compute vectorized binary operation value for two values, with each value @@ -1222,4 +1289,269 @@ inline constexpr RetT extend_shr_sat_wrap(T a, uint32_t b, uint32_t c, detail::shift_right(), second_op); } +/// Compute vectorized addition of \p a and \p b, with each value treated as a +/// 2 elements vector type and extend each element to 17 bit. 
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer +/// \tparam [in] AT The type of the first value, can only be 32 bit integer +/// \tparam [in] BT The type of the second value, can only be 32 bit integer +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] c The third value +/// \returns The extend vectorized addition of the two values +template +inline constexpr RetT extend_vadd2(AT a, BT b, RetT c) { + return detail::extend_vbinary2(a, b, c, std::plus()); +} + +/// Compute vectorized addition of \p a and \p b, with each value treated as a 2 +/// elements vector type and extend each element to 17 bit. Then add each half +/// of the result and add with \p c. +/// \tparam [in] RetT The type of the return value, can only be 32 bit integer +/// \tparam [in] AT The type of the first value, can only be 32 bit integer +/// \tparam [in] BT The type of the second value, can only be 32 bit integer +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] c The third value +/// \returns The addition of each half of extend vectorized addition of the two +/// values and the third value +template +inline constexpr RetT extend_vadd2_add(AT a, BT b, RetT c) { + return detail::extend_vbinary2(a, b, c, std::plus()); +} + +/// Compute vectorized addition of \p a and \p b with saturation, with each +/// value treated as a 2 elements vector type and extend each element to 17 bit. 
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer +/// \tparam [in] AT The type of the first value, can only be 32 bit integer +/// \tparam [in] BT The type of the second value, can only be 32 bit integer +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] c The third value +/// \returns The extend vectorized addition of the two values with saturation +template +inline constexpr RetT extend_vadd2_sat(AT a, BT b, RetT c) { + return detail::extend_vbinary2(a, b, c, std::plus()); +} + +/// Compute vectorized subtraction of \p a and \p b, with each value treated as +/// a 2 elements vector type and extend each element to 17 bit. +/// \tparam [in] RetT The type of the return value, can only be 32 bit integer +/// \tparam [in] AT The type of the first value, can only be 32 bit integer +/// \tparam [in] BT The type of the second value, can only be 32 bit integer +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] c The third value +/// \returns The extend vectorized subtraction of the two values +template +inline constexpr RetT extend_vsub2(AT a, BT b, RetT c) { + return detail::extend_vbinary2(a, b, c, std::minus()); +} + +/// Compute vectorized subtraction of \p a and \p b, with each value treated as +/// a 2 elements vector type and extend each element to 17 bit. Then add each +/// half of the result and add with \p c. 
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer +/// \tparam [in] AT The type of the first value, can only be 32 bit integer +/// \tparam [in] BT The type of the second value, can only be 32 bit integer +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] c The third value +/// \returns The addition of each half of extend vectorized subtraction of the +/// two values and the third value +template +inline constexpr RetT extend_vsub2_add(AT a, BT b, RetT c) { + return detail::extend_vbinary2(a, b, c, std::minus()); +} + +/// Compute vectorized subtraction of \p a and \p b with saturation, with each +/// value treated as a 2 elements vector type and extend each element to 17 bit. +/// \tparam [in] RetT The type of the return value, can only be 32 bit integer +/// \tparam [in] AT The type of the first value, can only be 32 bit integer +/// \tparam [in] BT The type of the second value, can only be 32 bit integer +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] c The third value +/// \returns The extend vectorized subtraction of the two values with saturation +template +inline constexpr RetT extend_vsub2_sat(AT a, BT b, RetT c) { + return detail::extend_vbinary2(a, b, c, std::minus()); +} + +/// Compute vectorized abs_diff of \p a and \p b, with each value treated as a 2 +/// elements vector type and extend each element to 17 bit. 
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer +/// \tparam [in] AT The type of the first value, can only be 32 bit integer +/// \tparam [in] BT The type of the second value, can only be 32 bit integer +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] c The third value +/// \returns The extend vectorized abs_diff of the two values +template +inline constexpr RetT extend_vabsdiff2(AT a, BT b, RetT c) { + return detail::extend_vbinary2(a, b, c, abs_diff()); +} + +/// Compute vectorized abs_diff of \p a and \p b, with each value treated as a 2 +/// elements vector type and extend each element to 17 bit. Then add each half +/// of the result and add with \p c. +/// \tparam [in] RetT The type of the return value, can only be 32 bit integer +/// \tparam [in] AT The type of the first value, can only be 32 bit integer +/// \tparam [in] BT The type of the second value, can only be 32 bit integer +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] c The third value +/// \returns The addition of each half of extend vectorized abs_diff of the +/// two values and the third value +template +inline constexpr RetT extend_vabsdiff2_add(AT a, BT b, RetT c) { + return detail::extend_vbinary2(a, b, c, abs_diff()); +} + +/// Compute vectorized abs_diff of \p a and \p b with saturation, with each +/// value treated as a 2 elements vector type and extend each element to 17 bit. 
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer +/// \tparam [in] AT The type of the first value, can only be 32 bit integer +/// \tparam [in] BT The type of the second value, can only be 32 bit integer +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] c The third value +/// \returns The extend vectorized abs_diff of the two values with saturation +template +inline constexpr RetT extend_vabsdiff2_sat(AT a, BT b, RetT c) { + return detail::extend_vbinary2(a, b, c, abs_diff()); +} + +/// Compute vectorized minimum of \p a and \p b, with each value treated as a 2 +/// elements vector type and extend each element to 17 bit. +/// \tparam [in] RetT The type of the return value, can only be 32 bit integer +/// \tparam [in] AT The type of the first value, can only be 32 bit integer +/// \tparam [in] BT The type of the second value, can only be 32 bit integer +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] c The third value +/// \returns The extend vectorized minimum of the two values +template +inline constexpr RetT extend_vmin2(AT a, BT b, RetT c) { + return detail::extend_vbinary2(a, b, c, minimum()); +} + +/// Compute vectorized minimum of \p a and \p b, with each value treated as a 2 +/// elements vector type and extend each element to 17 bit. Then add each half +/// of the result and add with \p c. 
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer +/// \tparam [in] AT The type of the first value, can only be 32 bit integer +/// \tparam [in] BT The type of the second value, can only be 32 bit integer +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] c The third value +/// \returns The addition of each half of extend vectorized minimum of the +/// two values and the third value +template +inline constexpr RetT extend_vmin2_add(AT a, BT b, RetT c) { + return detail::extend_vbinary2(a, b, c, minimum()); +} + +/// Compute vectorized minimum of \p a and \p b with saturation, with each value +/// treated as a 2 elements vector type and extend each element to 17 bit. +/// \tparam [in] RetT The type of the return value, can only be 32 bit integer +/// \tparam [in] AT The type of the first value, can only be 32 bit integer +/// \tparam [in] BT The type of the second value, can only be 32 bit integer +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] c The third value +/// \returns The extend vectorized minimum of the two values with saturation +template +inline constexpr RetT extend_vmin2_sat(AT a, BT b, RetT c) { + return detail::extend_vbinary2(a, b, c, minimum()); +} + +/// Compute vectorized maximum of \p a and \p b, with each value treated as a 2 +/// elements vector type and extend each element to 17 bit. 
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer +/// \tparam [in] AT The type of the first value, can only be 32 bit integer +/// \tparam [in] BT The type of the second value, can only be 32 bit integer +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] c The third value +/// \returns The extend vectorized maximum of the two values +template +inline constexpr RetT extend_vmax2(AT a, BT b, RetT c) { + return detail::extend_vbinary2(a, b, c, maximum()); +} + +/// Compute vectorized maximum of \p a and \p b, with each value treated as a 2 +/// elements vector type and extend each element to 17 bit. Then add each half +/// of the result and add with \p c. +/// \tparam [in] RetT The type of the return value, can only be 32 bit integer +/// \tparam [in] AT The type of the first value, can only be 32 bit integer +/// \tparam [in] BT The type of the second value, can only be 32 bit integer +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] c The third value +/// \returns The addition of each half of extend vectorized maximum of the +/// two values and the third value +template +inline constexpr RetT extend_vmax2_add(AT a, BT b, RetT c) { + return detail::extend_vbinary2(a, b, c, maximum()); +} + +/// Compute vectorized maximum of \p a and \p b with saturation, with each value +/// treated as a 2 elements vector type and extend each element to 17 bit. 
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer +/// \tparam [in] AT The type of the first value, can only be 32 bit integer +/// \tparam [in] BT The type of the second value, can only be 32 bit integer +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] c The third value +/// \returns The extend vectorized maximum of the two values with saturation +template +inline constexpr RetT extend_vmax2_sat(AT a, BT b, RetT c) { + return detail::extend_vbinary2(a, b, c, maximum()); +} + +/// Compute vectorized average of \p a and \p b, with each value treated as a 2 +/// elements vector type and extend each element to 17 bit. +/// \tparam [in] RetT The type of the return value, can only be 32 bit integer +/// \tparam [in] AT The type of the first value, can only be 32 bit integer +/// \tparam [in] BT The type of the second value, can only be 32 bit integer +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] c The third value +/// \returns The extend vectorized average of the two values +template +inline constexpr RetT extend_vavrg2(AT a, BT b, RetT c) { + return detail::extend_vbinary2(a, b, c, + detail::average()); +} + +/// Compute vectorized average of \p a and \p b, with each value treated as a 2 +/// elements vector type and extend each element to 17 bit. Then add each half +/// of the result and add with \p c. 
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer +/// \tparam [in] AT The type of the first value, can only be 32 bit integer +/// \tparam [in] BT The type of the second value, can only be 32 bit integer +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] c The third value +/// \returns The addition of each half of extend average maximum of the +/// two values and the third value +template +inline constexpr RetT extend_vavrg2_add(AT a, BT b, RetT c) { + return detail::extend_vbinary2(a, b, c, detail::average()); +} + +/// Compute vectorized average of \p a and \p b with saturation, with each value +/// treated as a 2 elements vector type and extend each element to 17 bit. +/// \tparam [in] RetT The type of the return value, can only be 32 bit integer +/// \tparam [in] AT The type of the first value, can only be 32 bit integer +/// \tparam [in] BT The type of the second value, can only be 32 bit integer +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] c The third value +/// \returns The extend vectorized average of the two values with saturation +template +inline constexpr RetT extend_vavrg2_sat(AT a, BT b, RetT c) { + return detail::extend_vbinary2(a, b, c, detail::average()); +} + } // namespace syclcompat diff --git a/sycl/test-e2e/syclcompat/math/math_extend_v.cpp b/sycl/test-e2e/syclcompat/math/math_extend_v.cpp new file mode 100644 index 0000000000000..6b079422a6156 --- /dev/null +++ b/sycl/test-e2e/syclcompat/math/math_extend_v.cpp @@ -0,0 +1,395 @@ +/*************************************************************************** + * + * Copyright (C) Codeplay Software Ltd. + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM + * Exceptions. See https://llvm.org/LICENSE.txt for license information. 
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * SYCLcompat API + * + * math_extend_v.cpp + * + * Description: + * math extend-vectorized helpers tests + **************************************************************************/ + +// ===----------- math_extend_vfunc[2/4].cpp ---------- -*- C++ -* +// --------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// +// ===---------------------------------------------------------------------===// + +// RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple} %s -o %t.out +// RUN: %{run} %t.out + +#include +#include +#include +#include +#include + +#include +#include +#include + +#define CHECK(S, REF) \ + { \ + auto ret = S; \ + if (ret != REF) { \ + return {#S, REF}; \ + } \ + } + +std::pair vadd2() { + CHECK(syclcompat::extend_vadd2(0x0001FFFF, 0x00010005, 0), + 0x00020004); + CHECK(syclcompat::extend_vadd2(0x7FFF7FFF, 0x00010001, 0), + 0x80008000); + CHECK(syclcompat::extend_vadd2_sat(0x7FFF7FFF, 0x00010001, 0), + 0x7FFF7FFF); + + CHECK(syclcompat::extend_vadd2(0x00010002, 0x00020003, 0), + 0x00030005); + CHECK(syclcompat::extend_vadd2(0xFFFEFFFF, 0x00030003, 0), + 0x00010002); + CHECK(syclcompat::extend_vadd2_sat((uint32_t)0xFFFEFFFF, + (uint32_t)0x00030003, 0), + 0xFFFFFFFF); + return {nullptr, 0}; +} + +std::pair vsub2() { + + CHECK(syclcompat::extend_vsub2(0x0001FFFF, 0xFFFF0001, 0), + 0x0002FFFE); + // Testing API & Saturated API with mixed types + CHECK(syclcompat::extend_vsub2((int32_t)0x7FFFFFFD, + 
(int32_t)0xFFFA7FFF, 0), + 0x80057FFE); + CHECK(syclcompat::extend_vsub2((uint32_t)0x7FFFFFFD, + (uint32_t)0xFFFA7FFF, 0), + 0x80057FFE); + CHECK(syclcompat::extend_vsub2((uint32_t)0x7FFFFFFD, + (int32_t)0xFFFA7FFF, 0), + 0x80057FFE); + CHECK(syclcompat::extend_vsub2((int32_t)0x7FFFFFFD, + (uint32_t)0xFFFA7FFF, 0), + 0x80057FFE); + CHECK(syclcompat::extend_vsub2_sat((int32_t)0x7FFFFFFD, + (int32_t)0xFFFA7FFF, 0), + 0x7FFF8000); + CHECK(syclcompat::extend_vsub2_sat((uint32_t)0x7FFFFFFD, + (uint32_t)0xFFFA7FFF, 0), + 0x80057FFE); + CHECK(syclcompat::extend_vsub2_sat((int32_t)0x7FFFFFFD, + (uint32_t)0xFFFA7FFF, 0), + 0x80058000); + CHECK(syclcompat::extend_vsub2_sat((uint32_t)0x7FFFFFFD, + (int32_t)0xFFFA7FFF, 0), + 0x7FFF7FFE); + + CHECK(syclcompat::extend_vsub2(0x0002000B, 0x0001000A, 0), + 0x00010001); + CHECK(syclcompat::extend_vsub2((uint32_t)0x00010001, + (uint32_t)0x0002FFFF, 0), + 0xFFFF0002); + CHECK(syclcompat::extend_vsub2((int32_t)0x00010001, + (int32_t)0x0002FFFF, 0), + 0xFFFF0002); + CHECK(syclcompat::extend_vsub2_sat((uint32_t)0x00010001, + (uint32_t)0x0002FFFF, 0), + 0x00000000); + CHECK(syclcompat::extend_vsub2_sat((int32_t)0x00010001, + (int32_t)0x0002FFFF, 0), + 0x00000002); + + return {nullptr, 0}; +} + +std::pair vadd2_add() { + + CHECK(syclcompat::extend_vadd2_add(0x00010002, 0x00030004, 1), + 0x0000000B); + CHECK(syclcompat::extend_vadd2_add(0x0001FFFF, 0x0002FFFE, -1), + 0xFFFFFFFF); + CHECK(syclcompat::extend_vadd2_add(0x00017FFF, 0x00017FFF, 1), + 0x00010001); + + CHECK(syclcompat::extend_vadd2_add(0x00010002, 0x00030004, 1), + 0x0000000B); + CHECK(syclcompat::extend_vadd2_add((uint32_t)0x0001FFFF, + (uint32_t)0x0002FFFF, 1), + 0x00020002); + CHECK(syclcompat::extend_vadd2_add(0x0001FFFF, 0x0002FFFF, 1), + 0x00000002); + + return {nullptr, 0}; +} + +std::pair vsub2_add() { + + // Testing API with mixed types + CHECK(syclcompat::extend_vsub2_add((int32_t)0x0001FFFF, + (int32_t)0xFFFF0001, 1), + 1); + 
CHECK(syclcompat::extend_vsub2_add((uint32_t)0x7FFFFFFD, + (uint32_t)0xFFFA7FFF, -1), + 0x00000002); + CHECK(syclcompat::extend_vsub2_add((int32_t)0x7FFFFFFD, + (int32_t)0xFFFA7FFF, -1), + 0x00000002); + CHECK(syclcompat::extend_vsub2_add((int32_t)0x7FFFFFFD, + (uint32_t)0xFFFA7FFF, -1), + 0xFFFF0002); + CHECK(syclcompat::extend_vsub2_add((uint32_t)0x7FFFFFFD, + (int32_t)0xFFFA7FFF, -1), + 0x00010002); + + CHECK(syclcompat::extend_vsub2_add(0x0002000B, 0x0001000A, 1), + 0x00000003); + CHECK(syclcompat::extend_vsub2_add(0x00010001, 0x0002FFFF, 3), + 0x00000004); + + return {nullptr, 0}; +} + +std::pair vabsdiff2() { + + CHECK(syclcompat::extend_vabsdiff2((int32_t)0xFFFF0001, + (int32_t)0x0003FFFF, 0), + 0x00040002); + CHECK(syclcompat::extend_vabsdiff2((int32_t)0x80000002, + (int32_t)0x00010001, 0), + 0x80010001); + CHECK(syclcompat::extend_vabsdiff2_sat((int32_t)0x80000002, + (int32_t)0x00010001, 0), + 0x7FFF0001); + + CHECK(syclcompat::extend_vabsdiff2(0x00010004, 0x00030002, 0), + 0x00020002); + CHECK(syclcompat::extend_vabsdiff2((uint32_t)0xFFFF0001, + (int32_t)0xFFFE0003, 0), + 0x00010002); + CHECK(syclcompat::extend_vabsdiff2_sat((uint32_t)0xFFFF0001, + (int32_t)0xFFFE0003, 0), + 0xFFFF0002); + + return {nullptr, 0}; +} + +std::pair vabsdiff2_add() { + + CHECK(syclcompat::extend_vabsdiff2_add((int32_t)0xFFFF0001, + (int32_t)0x0003FFFF, -2), + 0x00000004); + + CHECK(syclcompat::extend_vabsdiff2_add(0x000A000C, 0x000B000A, 1), + 0x00000004); + + return {nullptr, 0}; +} + +std::pair vmin2() { + + CHECK(syclcompat::extend_vmin2((int32_t)0xFFFF0002, 0x00010001, 0), + (int32_t)0xFFFF0001); + CHECK(syclcompat::extend_vmin2_sat(0x0002FFF1, 0x0001FFF2, 0), + 0x0001FFF1); + + CHECK(syclcompat::extend_vmin2(0x000A000D, 0x000B000C, 0), + 0x000A000C); + CHECK(syclcompat::extend_vmin2_sat(0x0002FFF1, 0x0001FFF2, 0), + 0x00010000); + + return {nullptr, 0}; +} + +std::pair vmax2() { + + CHECK(syclcompat::extend_vmax2((int32_t)0xFFFF0002, 0x00010001, 0), + 0x00010002); + 
CHECK(syclcompat::extend_vmax2_sat(0x80008000, 0x00018001, 0), + 0x7FFF7FFF); + + CHECK(syclcompat::extend_vmax2(0x000A000D, 0x000B000C, 0), + 0x000B000D); + CHECK(syclcompat::extend_vmax2_sat(0x0002FFF1, 0x0001FFF2, 0), + 0x00020000); + + return {nullptr, 0}; +} + +std::pair vmin2_vmax2_add() { + + CHECK( + syclcompat::extend_vmin2_add((int32_t)0xFFFF0002, 0x00010001, 2), + 0x00000002); + CHECK(syclcompat::extend_vmin2_add(0x000A000D, 0x000B000C, 2), + 0x00000018); + + CHECK(syclcompat::extend_vmax2_add((int32_t)0xFFFF0002, 0x00010001, + -2), + 0x00000001); + CHECK(syclcompat::extend_vmax2_add(0x000A000D, 0x000B000C, 2), + 0x0000001A); + + return {nullptr, 0}; +} + +std::pair vavrg2() { + + CHECK(syclcompat::extend_vavrg2((int32_t)0xFFFFFFF6, 0x0005FFFA, 0), + 0x0002FFF8); + CHECK(syclcompat::extend_vavrg2_sat((int32_t)0xFFFFFFF6, 0x0005FFFA, + 0), + 0x0002FFF8); + + CHECK(syclcompat::extend_vavrg2(0x00010006, 0x00030001, 0), + 0x00020004); + CHECK(syclcompat::extend_vavrg2_sat(0x00010006, 0x00030001, 0), + 0x00020004); + + return {nullptr, 0}; +} + +std::pair vavrg2_add() { + + CHECK(syclcompat::extend_vavrg2_add((int32_t)0xFFFFFFF6, 0x0005FFFA, + -2), + 0xFFFFFFF8); + + CHECK(syclcompat::extend_vavrg2_add(0x00010006, 0x00030002, 2), + 0x00000008); + + return {nullptr, 0}; +} + +void test(const sycl::stream &s, int *ec) { + { + auto res = vadd2(); + if (res.first) { + s << res.first << " = " << res.second << " check failed!\n"; + *ec = 1; + return; + } + s << "vadd2 check passed!\n"; + } + { + auto res = vsub2(); + if (res.first) { + s << res.first << " = " << res.second << " check failed!\n"; + *ec = 2; + return; + } + s << "vsub2 check passed!\n"; + } + { + auto res = vadd2_add(); + if (res.first) { + s << res.first << " = " << res.second << " check failed!\n"; + *ec = 3; + return; + } + s << "vadd2_add check passed!\n"; + } + { + auto res = vsub2_add(); + if (res.first) { + s << res.first << " = " << res.second << " check failed!\n"; + *ec = 4; + return; + } + 
s << "vsub2_add check passed!\n"; + } + { + auto res = vabsdiff2(); + if (res.first) { + s << res.first << " = " << res.second << " check failed!\n"; + *ec = 5; + return; + } + s << "vabsdiff2 check passed!\n"; + } + { + auto res = vmin2(); + if (res.first) { + s << res.first << " = " << res.second << " check failed!\n"; + *ec = 6; + return; + } + s << "vmin2 check passed!\n"; + } + { + auto res = vmax2(); + if (res.first) { + s << res.first << " = " << res.second << " check failed!\n"; + *ec = 7; + return; + } + s << "vmax2 check passed!\n"; + } + { + auto res = vmin2_vmax2_add(); + if (res.first) { + s << res.first << " = " << res.second << " check failed!\n"; + *ec = 8; + return; + } + s << "vmin2_add/vmax2_add check passed!\n"; + } + { + auto res = vavrg2(); + if (res.first) { + s << res.first << " = " << res.second << " check failed!\n"; + *ec = 9; + return; + } + s << "vavrg2 check passed!\n"; + } + { + auto res = vavrg2_add(); + if (res.first) { + s << res.first << " = " << res.second << " check failed!\n"; + *ec = 10; + return; + } + s << "vavrg2_add check passed!\n"; + } + { + auto res = vabsdiff2_add(); + if (res.first) { + s << res.first << " = " << res.second << " check failed!\n"; + *ec = 11; + return; + } + s << "vabsdiff2_add check passed!\n"; + } + *ec = 0; +} + +int main() { + sycl::queue q = syclcompat::get_default_queue(); + int *ec = syclcompat::malloc(1); + syclcompat::fill(ec, 0, 1); + q.submit([&](sycl::handler &cgh) { + sycl::stream out(1024, 256, cgh); + cgh.parallel_for(1, [=](sycl::item<1> it) { test(out, ec); }); + }); + q.wait_and_throw(); + + int ec_h; + syclcompat::memcpy(&ec_h, ec, 1); + + return ec_h; +} From 990b1d1ba053d60a803ae5e750803ae6583119f9 Mon Sep 17 00:00:00 2001 From: fineg74 <61437305+fineg74@users.noreply.github.com> Date: Fri, 7 Jun 2024 07:03:31 -0700 Subject: [PATCH 42/55] [ESIMD]Replace use of vc intrinsic with spirv extension for rdtsc API (#13536) --- clang/lib/Driver/ToolChains/Clang.cpp | 1 + 
clang/test/Driver/sycl-spirv-ext.c | 2 +- sycl/include/CL/__spirv/spirv_ops.hpp | 1 + sycl/include/sycl/ext/intel/esimd/detail/math_intrin.hpp | 2 -- sycl/include/sycl/ext/intel/esimd/math.hpp | 7 +++++-- 5 files changed, 8 insertions(+), 5 deletions(-) diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index 77c94ea60d315..5338b28b854b4 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -10428,6 +10428,7 @@ static void getOtherSPIRVTransOpts(Compilation &C, ",+SPV_INTEL_fpga_invocation_pipelining_attributes" ",+SPV_INTEL_fpga_latency_control" ",+SPV_INTEL_task_sequence" + ",+SPV_KHR_shader_clock" ",+SPV_INTEL_bindless_images"; ExtArg = ExtArg + DefaultExtArg + INTELExtArg; if (C.getDriver().IsFPGAHWMode()) diff --git a/clang/test/Driver/sycl-spirv-ext.c b/clang/test/Driver/sycl-spirv-ext.c index a306b9eb1ea4d..eb4d24197b1af 100644 --- a/clang/test/Driver/sycl-spirv-ext.c +++ b/clang/test/Driver/sycl-spirv-ext.c @@ -48,6 +48,7 @@ // CHECK-DEFAULT-SAME:,+SPV_INTEL_fpga_invocation_pipelining_attributes // CHECK-DEFAULT-SAME:,+SPV_INTEL_fpga_latency_control // CHECK-DEFAULT-SAME:,+SPV_INTEL_task_sequence +// CHECK-DEFAULT-SAME:,+SPV_KHR_shader_clock // CHECK-DEFAULT-SAME:,+SPV_INTEL_bindless_images // CHECK-DEFAULT-SAME:,+SPV_INTEL_token_type // CHECK-DEFAULT-SAME:,+SPV_INTEL_bfloat16_conversion @@ -125,4 +126,3 @@ // CHECK-CPU-SAME:,+SPV_KHR_non_semantic_info // CHECK-CPU-SAME:,+SPV_KHR_cooperative_matrix // CHECK-CPU-SAME:,+SPV_INTEL_fp_max_error" - diff --git a/sycl/include/CL/__spirv/spirv_ops.hpp b/sycl/include/CL/__spirv/spirv_ops.hpp index 6fd4b9ebf63db..18a22c0fb04a3 100644 --- a/sycl/include/CL/__spirv/spirv_ops.hpp +++ b/sycl/include/CL/__spirv/spirv_ops.hpp @@ -1280,6 +1280,7 @@ __CLC_BF16_SCAL_VEC(uint32_t) extern __DPCPP_SYCL_EXTERNAL int32_t __spirv_BuiltInGlobalHWThreadIDINTEL(); extern __DPCPP_SYCL_EXTERNAL int32_t __spirv_BuiltInSubDeviceIDINTEL(); +extern 
__DPCPP_SYCL_EXTERNAL uint64_t __spirv_ReadClockKHR(int); template extern __DPCPP_SYCL_EXTERNAL diff --git a/sycl/include/sycl/ext/intel/esimd/detail/math_intrin.hpp b/sycl/include/sycl/ext/intel/esimd/detail/math_intrin.hpp index 2f6584a4bd640..d8022f48a9a1d 100644 --- a/sycl/include/sycl/ext/intel/esimd/detail/math_intrin.hpp +++ b/sycl/include/sycl/ext/intel/esimd/detail/math_intrin.hpp @@ -244,8 +244,6 @@ __ESIMD_INTRIN __ESIMD_raw_vec_t(T1, N) __esimd_ssdp4a_sat(__ESIMD_raw_vec_t(T2, N) src0, __ESIMD_raw_vec_t(T3, N) src1, __ESIMD_raw_vec_t(T4, N) src2) __ESIMD_INTRIN_END; -__ESIMD_INTRIN __ESIMD_raw_vec_t(uint32_t, 4) - __esimd_timestamp() __ESIMD_INTRIN_END; template __ESIMD_INTRIN __ESIMD_raw_vec_t(T0, SZ) diff --git a/sycl/include/sycl/ext/intel/esimd/math.hpp b/sycl/include/sycl/ext/intel/esimd/math.hpp index 096c33a2fda93..67bcaace80673 100644 --- a/sycl/include/sycl/ext/intel/esimd/math.hpp +++ b/sycl/include/sycl/ext/intel/esimd/math.hpp @@ -1844,8 +1844,11 @@ __ESIMD_API uint32_t subb(uint32_t &borrow, uint32_t src0, uint32_t src1) { /// rdtsc - get the value of timestamp counter. /// @return the current value of timestamp counter __ESIMD_API uint64_t rdtsc() { - __ESIMD_NS::simd retv = __esimd_timestamp(); - return retv.template bit_cast_view()[0]; +#ifdef __SYCL_DEVICE_ONLY__ + return __spirv_ReadClockKHR(0); +#else + __ESIMD_UNSUPPORTED_ON_HOST; +#endif } /// @} sycl_esimd_math From 141d7233de98926461c1626c6c024caa0a978eb9 Mon Sep 17 00:00:00 2001 From: Steffen Larsen Date: Fri, 7 Jun 2024 16:07:03 +0200 Subject: [PATCH 43/55] [SYCL][Docs] Move sycl_ext_oneapi_enqueue_functions to experimental (#14017) https://github.com/intel/llvm/pull/13512 implemented the sycl_ext_oneapi_enqueue_functions extension. Following this, the corresponding extension document is moved to experimental and the feature test macro is defined. 
--------- Signed-off-by: Larsen, Steffen --- .../sycl_ext_oneapi_enqueue_functions.asciidoc | 17 +++++++++-------- sycl/source/feature_test.hpp.in | 1 + 2 files changed, 10 insertions(+), 8 deletions(-) rename sycl/doc/extensions/{proposed => experimental}/sycl_ext_oneapi_enqueue_functions.asciidoc (97%) diff --git a/sycl/doc/extensions/proposed/sycl_ext_oneapi_enqueue_functions.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_oneapi_enqueue_functions.asciidoc similarity index 97% rename from sycl/doc/extensions/proposed/sycl_ext_oneapi_enqueue_functions.asciidoc rename to sycl/doc/extensions/experimental/sycl_ext_oneapi_enqueue_functions.asciidoc index ed85566c99fbc..70898ecf61a10 100644 --- a/sycl/doc/extensions/proposed/sycl_ext_oneapi_enqueue_functions.asciidoc +++ b/sycl/doc/extensions/experimental/sycl_ext_oneapi_enqueue_functions.asciidoc @@ -44,11 +44,12 @@ SYCL specification refer to that revision. == Status -This is a proposed extension specification, intended to gather community -feedback. Interfaces defined in this specification may not be implemented yet -or may be in a preliminary state. The specification itself may also change in -incompatible ways before it is finalized. *Shipping software products should -not rely on APIs defined in this specification.* +This is an experimental extension specification, intended to provide early +access to features and gather community feedback. Interfaces defined in this +specification are implemented in {dpcpp}, but they are not finalized and may +change incompatibly in future versions of {dpcpp} without prior notice. +*Shipping software products should not rely on APIs defined in this +specification.* == Overview @@ -79,7 +80,7 @@ This extension makes SYCL simpler and easier to document. It is also expected to improve the performance of many SYCL applications, where `event` objects are not required to describe application behavior. 
-All functions proposed in this extension accept as their first argument an +All functions in this extension accept as their first argument an object that represents where a command should be submitted, allowing the new functions to be used either at command-group scope or as a replacement for existing queue shortcuts. A future version of this extension may adjust this @@ -89,7 +90,7 @@ by accepting a scheduler and returning a sender). === Usage example -The example below demonstrates that the syntax proposed here requires only +The example below demonstrates that the syntax here requires only minor changes to existing applications, while retaining their structure. @@ -117,7 +118,7 @@ sycl::free(output, q); ---- -==== Proposed syntax +==== Syntax [source,c++] ---- diff --git a/sycl/source/feature_test.hpp.in b/sycl/source/feature_test.hpp.in index bf3e556a2f3bf..ce88520fe50dd 100644 --- a/sycl/source/feature_test.hpp.in +++ b/sycl/source/feature_test.hpp.in @@ -107,6 +107,7 @@ inline namespace _V1 { #define SYCL_EXT_ONEAPI_FORWARD_PROGRESS 1 #define SYCL_EXT_ONEAPI_FREE_FUNCTION_KERNELS 1 #define SYCL_EXT_ONEAPI_PROD 1 +#define SYCL_EXT_ONEAPI_ENQUEUE_FUNCTIONS 1 #ifndef __has_include #define __has_include(x) 0 From cadd8003485fac3e7920bfc7df362a8a57b34318 Mon Sep 17 00:00:00 2001 From: Nick Sarnie Date: Fri, 7 Jun 2024 10:11:16 -0400 Subject: [PATCH 44/55] [SYCL][ESIMD][E2E] Fix bit shift vector test to not use c++20 (#14081) We don't actually need cpp20, this was left over from a previous iteration of the test. 
Signed-off-by: Sarnie, Nick --- .../ESIMD/regression/bit_shift_vector_compilation_test.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sycl/test-e2e/ESIMD/regression/bit_shift_vector_compilation_test.cpp b/sycl/test-e2e/ESIMD/regression/bit_shift_vector_compilation_test.cpp index 8230f34400c6f..634ef40e40ebb 100644 --- a/sycl/test-e2e/ESIMD/regression/bit_shift_vector_compilation_test.cpp +++ b/sycl/test-e2e/ESIMD/regression/bit_shift_vector_compilation_test.cpp @@ -6,13 +6,12 @@ // //===---------------------------------------===// -// RUN: %{build} -fsycl-device-code-split=per_kernel -std=c++20 -o %t.out +// RUN: %{build} -fsycl-device-code-split=per_kernel -o %t.out // RUN: %{run} %t.out // This is a basic test to validate the vector bit shifting functions. #include "../esimd_test_utils.hpp" -#include using namespace sycl; using namespace sycl::ext::intel::esimd; From 6d591f19dc2ffb1321f790f46776280032b3c9e6 Mon Sep 17 00:00:00 2001 From: Jason Li Date: Fri, 7 Jun 2024 10:22:37 -0400 Subject: [PATCH 45/55] [SYCL][ESIMD] Instruction count performance test (#14033) (fixed issues with post commit testing) --------- Co-authored-by: jason1.li --- .../ESIMD/PerformanceTests/BitonicSortK.cpp | 3 +-- .../ESIMD/PerformanceTests/BitonicSortKv2.cpp | 15 +++++++++++++++ sycl/test-e2e/ESIMD/PerformanceTests/Stencil.cpp | 15 +++++++++++++++ .../ESIMD/PerformanceTests/instruction_count.py | 7 ++++++- .../ESIMD/PerformanceTests/matrix_transpose.cpp | 15 +++++++++++++++ sycl/test-e2e/ESIMD/PerformanceTests/stencil2.cpp | 15 +++++++++++++++ 6 files changed, 67 insertions(+), 3 deletions(-) create mode 100644 sycl/test-e2e/ESIMD/PerformanceTests/BitonicSortKv2.cpp create mode 100644 sycl/test-e2e/ESIMD/PerformanceTests/Stencil.cpp create mode 100644 sycl/test-e2e/ESIMD/PerformanceTests/matrix_transpose.cpp create mode 100644 sycl/test-e2e/ESIMD/PerformanceTests/stencil2.cpp diff --git a/sycl/test-e2e/ESIMD/PerformanceTests/BitonicSortK.cpp 
b/sycl/test-e2e/ESIMD/PerformanceTests/BitonicSortK.cpp index 3f0dd433f6ca5..d929d86b01b6f 100644 --- a/sycl/test-e2e/ESIMD/PerformanceTests/BitonicSortK.cpp +++ b/sycl/test-e2e/ESIMD/PerformanceTests/BitonicSortK.cpp @@ -5,12 +5,11 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// UNSUPPORTED: gpu // REQUIRES: gpu-intel-dg2 && level_zero // RUN: mkdir -p %t.dir && %{build} -o %t.dir/exec.out // RUN: env IGC_DumpToCustomDir=%t.dir IGC_ShaderDumpEnable=1 %{run} %t.dir/exec.out -// RUN: python3 %S/instruction_count.py %t.dir 3452 VC_asmfc04983569d0d4c9__ZTSZZN11BitonicSort5SolveEPjS0_jENKUlRN4sycl3_V17handlerEE0_clES4_E5Merge.asm +// RUN: python3 %S/instruction_count.py %t.dir 3452 ZTSZZN11BitonicSort5SolveEPjS0_jENKUlRN4sycl3_V17handlerEE0_clES4_E5Merge.asm // RUN: echo "Baseline from driver version 1.3.29138" #include "../BitonicSortK.cpp" \ No newline at end of file diff --git a/sycl/test-e2e/ESIMD/PerformanceTests/BitonicSortKv2.cpp b/sycl/test-e2e/ESIMD/PerformanceTests/BitonicSortKv2.cpp new file mode 100644 index 0000000000000..f2dee3992d0cc --- /dev/null +++ b/sycl/test-e2e/ESIMD/PerformanceTests/BitonicSortKv2.cpp @@ -0,0 +1,15 @@ +//==---------------- BitonicSortKv2.cpp - DPC++ ESIMD on-device test ------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: gpu-intel-dg2 && level_zero + +// RUN: mkdir -p %t.dir && %{build} -o %t.dir/exec.out +// RUN: env IGC_DumpToCustomDir=%t.dir IGC_ShaderDumpEnable=1 %{run} %t.dir/exec.out +// RUN: python3 %S/instruction_count.py %t.dir 3456 ZTSZZN11BitonicSort5SolveEPjS0_jENKUlRN4sycl3_V17handlerEE0_clES4_E5Merge.asm +// RUN: echo "Baseline from driver version 1.3.29138" + +#include "../BitonicSortKv2.cpp" diff --git a/sycl/test-e2e/ESIMD/PerformanceTests/Stencil.cpp b/sycl/test-e2e/ESIMD/PerformanceTests/Stencil.cpp new file mode 100644 index 0000000000000..60e0a5c5ba26c --- /dev/null +++ b/sycl/test-e2e/ESIMD/PerformanceTests/Stencil.cpp @@ -0,0 +1,15 @@ +//==---------------- Stencil.cpp - DPC++ ESIMD on-device test ------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: gpu-intel-dg2 && level_zero + +// RUN: mkdir -p %t.dir && %{build} -o %t.dir/exec.out +// RUN: env IGC_DumpToCustomDir=%t.dir IGC_ShaderDumpEnable=1 %{run} %t.dir/exec.out +// RUN: python3 %S/instruction_count.py %t.dir 1699 ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E14Stencil_kernel.asm +// RUN: echo "Baseline from driver version 1.3.29138" + +#include "../Stencil.cpp" \ No newline at end of file diff --git a/sycl/test-e2e/ESIMD/PerformanceTests/instruction_count.py b/sycl/test-e2e/ESIMD/PerformanceTests/instruction_count.py index 5cf25ef38a198..44c7a69a5eea1 100644 --- a/sycl/test-e2e/ESIMD/PerformanceTests/instruction_count.py +++ b/sycl/test-e2e/ESIMD/PerformanceTests/instruction_count.py @@ -7,11 +7,16 @@ def main(directory, max_count, target_file): total_count = 0 pattern = re.compile(r"//\.instCount (\d+)") + if not os.path.isdir(directory): + print(f"Directory {directory} does not exist.") + sys.exit(1) + try: target_found = False for root, dirs, files in os.walk(directory): for file in files: - if file.endswith(".asm") and file == target_file: + print("File: ", file) + if file.endswith(".asm") and re.search(target_file + "$", file): target_found = True print("Checking file: ", file) try: diff --git a/sycl/test-e2e/ESIMD/PerformanceTests/matrix_transpose.cpp b/sycl/test-e2e/ESIMD/PerformanceTests/matrix_transpose.cpp new file mode 100644 index 0000000000000..2923b73f3a241 --- /dev/null +++ b/sycl/test-e2e/ESIMD/PerformanceTests/matrix_transpose.cpp @@ -0,0 +1,15 @@ +//==---------------- matrix_transpose.cpp - DPC++ ESIMD on-device test ----==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: gpu-intel-dg2 && level_zero + +// RUN: mkdir -p %t.dir && %{build} -o %t.dir/exec.out +// RUN: env IGC_DumpToCustomDir=%t.dir IGC_ShaderDumpEnable=1 %{run} %t.dir/exec.out +// RUN: python3 %S/instruction_count.py %t.dir 1280 ZTSZZ7runTestjjjRdS_ENKUlRN4sycl3_V17handlerEE_clES3_E3K16.asm +// RUN: echo "Baseline from driver version 1.3.29138" + +#include "../matrix_transpose.cpp" \ No newline at end of file diff --git a/sycl/test-e2e/ESIMD/PerformanceTests/stencil2.cpp b/sycl/test-e2e/ESIMD/PerformanceTests/stencil2.cpp new file mode 100644 index 0000000000000..1528ff7518819 --- /dev/null +++ b/sycl/test-e2e/ESIMD/PerformanceTests/stencil2.cpp @@ -0,0 +1,15 @@ +//==---------------- stencil2.cpp - DPC++ ESIMD on-device test ------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: gpu-intel-dg2 && level_zero + +// RUN: mkdir -p %t.dir && %{build} -o %t.dir/exec.out +// RUN: env IGC_DumpToCustomDir=%t.dir IGC_ShaderDumpEnable=1 %{run} %t.dir/exec.out +// RUN: python3 %S/instruction_count.py %t.dir 1699 ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E14Stencil_kernel.asm +// RUN: echo "Baseline from driver version 1.3.29138" + +#include "../stencil2.cpp" \ No newline at end of file From 4e36825beabb4b4a7435470ac633768dcbd7b376 Mon Sep 17 00:00:00 2001 From: Justin Cai Date: Fri, 7 Jun 2024 07:55:42 -0700 Subject: [PATCH 46/55] [SYCL] Record aspect names when computing device requirements (#13974) After #13486, aspect name information is visible in `sycl-post-link` without the use of `!sycl_aspects`, so this PR updates `sycl-post-link` to use the aspect names that are now available within the `!sycl_used_aspects` metadata instead of `!sycl_aspects`. 
Additionally, this PR also adds E2E tests related to optional kernel features for AOT enabled by these changes --- .../llvm/SYCLLowerIR/SYCLDeviceRequirements.h | 14 ++- .../SYCLLowerIR/SYCLDeviceRequirements.cpp | 24 +++-- .../multiple-filtered-outputs.ll | 102 ++---------------- llvm/tools/sycl-post-link/sycl-post-link.cpp | 39 +------ sycl/test-e2e/AOT/double.cpp | 26 +++++ sycl/test-e2e/AOT/reqd-sg-size.cpp | 74 +++++++++++++ 6 files changed, 143 insertions(+), 136 deletions(-) create mode 100644 sycl/test-e2e/AOT/double.cpp create mode 100644 sycl/test-e2e/AOT/reqd-sg-size.cpp diff --git a/llvm/include/llvm/SYCLLowerIR/SYCLDeviceRequirements.h b/llvm/include/llvm/SYCLLowerIR/SYCLDeviceRequirements.h index abb78b51af154..8891f7f550c5f 100644 --- a/llvm/include/llvm/SYCLLowerIR/SYCLDeviceRequirements.h +++ b/llvm/include/llvm/SYCLLowerIR/SYCLDeviceRequirements.h @@ -30,7 +30,19 @@ class PropertyValue; } struct SYCLDeviceRequirements { - std::set Aspects; + struct AspectNameValuePair { + llvm::SmallString<64> Name; + uint32_t Value; + AspectNameValuePair(StringRef Name, uint32_t Value) + : Name(Name), Value(Value) {} + bool operator<(const AspectNameValuePair &rhs) const { + return Value < rhs.Value; + } + bool operator==(const AspectNameValuePair &rhs) const { + return Value == rhs.Value; + } + }; + std::set Aspects; std::set FixedTarget; std::optional> ReqdWorkGroupSize; std::optional WorkGroupNumDim; diff --git a/llvm/lib/SYCLLowerIR/SYCLDeviceRequirements.cpp b/llvm/lib/SYCLLowerIR/SYCLDeviceRequirements.cpp index 8ebec7f54013d..60424c04027fa 100644 --- a/llvm/lib/SYCLLowerIR/SYCLDeviceRequirements.cpp +++ b/llvm/lib/SYCLLowerIR/SYCLDeviceRequirements.cpp @@ -43,19 +43,20 @@ llvm::computeDeviceRequirements(const module_split::ModuleDesc &MD) { // Process all functions in the module for (const Function &F : MD.getModule()) { if (auto *MDN = F.getMetadata("sycl_used_aspects")) { - for (auto &MDOp : MDN->operands()) { - int64_t Val; - if (auto Pair = dyn_cast(MDOp)) { 
+ for (size_t I = 0, E = MDN->getNumOperands(); I < E; ++I) { + StringRef AspectName = ""; + int64_t AspectValue; + if (auto Pair = dyn_cast(MDN->getOperand(I))) { assert(Pair->getNumOperands() == 2); - Val = mdconst::extract(Pair->getOperand(1)) - ->getZExtValue(); + AspectName = ExtractStringFromMDNodeOperand(Pair, 0); + AspectValue = ExtractSignedIntegerFromMDNodeOperand(Pair, 1); } else { - Val = mdconst::extract(MDOp)->getZExtValue(); + AspectValue = ExtractSignedIntegerFromMDNodeOperand(MDN, I); } // Don't put internal aspects (with negative integer value) into the // requirements, they are used only for device image splitting. - if (Val >= 0) - Reqs.Aspects.insert(Val); + if (AspectValue >= 0) + Reqs.Aspects.insert({AspectName, uint32_t(AspectValue)}); } } @@ -133,8 +134,11 @@ std::map SYCLDeviceRequirements::asMap() const { // For all properties except for "aspects", we'll only add the // value to the map if the corresponding value from // SYCLDeviceRequirements has a value/is non-empty. 
- Requirements["aspects"] = - std::vector(Aspects.begin(), Aspects.end()); + std::vector AspectValues; + AspectValues.reserve(Aspects.size()); + for (auto Aspect : Aspects) + AspectValues.push_back(Aspect.Value); + Requirements["aspects"] = std::move(AspectValues); if (!FixedTarget.empty()) Requirements["fixed_target"] = diff --git a/llvm/test/tools/sycl-post-link/multiple-filtered-outputs.ll b/llvm/test/tools/sycl-post-link/multiple-filtered-outputs.ll index 1f014410d0a1c..7c2ab6e91b925 100644 --- a/llvm/test/tools/sycl-post-link/multiple-filtered-outputs.ll +++ b/llvm/test/tools/sycl-post-link/multiple-filtered-outputs.ll @@ -65,136 +65,56 @@ target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64" target triple = "spir64-unknown-unknown" -; Function Attrs: mustprogress norecurse nounwind -define weak_odr dso_local spir_kernel void @double_kernel(ptr addrspace(1) noundef align 8 %_arg_out) local_unnamed_addr #0 !srcloc !65 !kernel_arg_buffer_location !66 !sycl_used_aspects !67 !sycl_fixed_targets !68 !sycl_kernel_omit_args !69 { +define spir_kernel void @double_kernel(ptr addrspace(1) noundef align 8 %_arg_out) #0 !sycl_used_aspects !67 { entry: - %0 = load double, ptr addrspace(1) %_arg_out, align 8, !tbaa !70 + %0 = load double, ptr addrspace(1) %_arg_out, align 8 %mul.i = fmul double %0, 2.000000e-01 - store double %mul.i, ptr addrspace(1) %_arg_out, align 8, !tbaa !70 + store double %mul.i, ptr addrspace(1) %_arg_out, align 8 ret void } -; Function Attrs: mustprogress norecurse nounwind -define weak_odr dso_local spir_kernel void @float_kernel(ptr addrspace(1) noundef align 4 %_arg_out) local_unnamed_addr #0 !srcloc !74 !kernel_arg_buffer_location !66 !sycl_fixed_targets !68 !sycl_kernel_omit_args !69 { +define spir_kernel void @float_kernel(ptr addrspace(1) noundef align 4 %_arg_out) #0 { entry: - %0 = load float, ptr addrspace(1) %_arg_out, align 4, !tbaa !75 + %0 = load float, ptr addrspace(1) 
%_arg_out, align 4 %mul.i = fmul float %0, 0x3FC99999A0000000 - store float %mul.i, ptr addrspace(1) %_arg_out, align 4, !tbaa !75 + store float %mul.i, ptr addrspace(1) %_arg_out, align 4 ret void } -; Function Attrs: mustprogress norecurse nounwind -define weak_odr dso_local spir_kernel void @reqd_sub_group_size_kernel_8() local_unnamed_addr #0 !srcloc !77 !kernel_arg_buffer_location !68 !intel_reqd_sub_group_size !78 !sycl_fixed_targets !68 !sycl_kernel_omit_args !68 { +define spir_kernel void @reqd_sub_group_size_kernel_8() #0 !intel_reqd_sub_group_size !78 { entry: ret void } -; Function Attrs: mustprogress norecurse nounwind -define weak_odr dso_local spir_kernel void @reqd_sub_group_size_kernel_16() local_unnamed_addr #0 !srcloc !77 !kernel_arg_buffer_location !68 !intel_reqd_sub_group_size !79 !sycl_fixed_targets !68 !sycl_kernel_omit_args !68 { +define spir_kernel void @reqd_sub_group_size_kernel_16() #0 !intel_reqd_sub_group_size !79 { entry: ret void } -; Function Attrs: mustprogress norecurse nounwind -define weak_odr dso_local spir_kernel void @reqd_sub_group_size_kernel_32() local_unnamed_addr #0 !srcloc !77 !kernel_arg_buffer_location !68 !intel_reqd_sub_group_size !80 !sycl_fixed_targets !68 !sycl_kernel_omit_args !68 { +define spir_kernel void @reqd_sub_group_size_kernel_32() #0 !intel_reqd_sub_group_size !80 { entry: ret void } -; Function Attrs: mustprogress norecurse nounwind -define weak_odr dso_local spir_kernel void @reqd_sub_group_size_kernel_64() local_unnamed_addr #0 !srcloc !77 !kernel_arg_buffer_location !68 !intel_reqd_sub_group_size !81 !sycl_fixed_targets !68 !sycl_kernel_omit_args !68 { +define spir_kernel void @reqd_sub_group_size_kernel_64() #0 !intel_reqd_sub_group_size !81 { entry: ret void } -declare dso_local spir_func i32 @_Z18__spirv_ocl_printfPU3AS2Kcz(ptr addrspace(2), ...) 
- attributes #0 = { mustprogress norecurse nounwind "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "sycl-module-id"="double.cpp" "sycl-optlevel"="3" "uniform-work-group-size"="true" } !llvm.module.flags = !{!0, !1} !opencl.spir.version = !{!2} !spirv.Source = !{!3} -!sycl_aspects = !{!4, !5, !6, !7, !8, !9, !10, !11, !12, !13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27, !28, !29, !30, !31, !32, !33, !34, !35, !36, !37, !38, !39, !40, !41, !42, !43, !44, !45, !46, !47, !48, !49, !50, !51, !52, !53, !54, !55, !56, !57, !58, !59, !60, !61, !62, !63} !llvm.ident = !{!64} !0 = !{i32 1, !"wchar_size", i32 4} !1 = !{i32 7, !"frame-pointer", i32 2} !2 = !{i32 1, i32 2} !3 = !{i32 4, i32 100000} -!4 = !{!"cpu", i32 1} -!5 = !{!"gpu", i32 2} -!6 = !{!"accelerator", i32 3} -!7 = !{!"custom", i32 4} -!8 = !{!"fp16", i32 5} !9 = !{!"fp64", i32 6} -!10 = !{!"image", i32 9} -!11 = !{!"online_compiler", i32 10} -!12 = !{!"online_linker", i32 11} -!13 = !{!"queue_profiling", i32 12} -!14 = !{!"usm_device_allocations", i32 13} -!15 = !{!"usm_host_allocations", i32 14} -!16 = !{!"usm_shared_allocations", i32 15} -!17 = !{!"usm_system_allocations", i32 17} -!18 = !{!"ext_intel_pci_address", i32 18} -!19 = !{!"ext_intel_gpu_eu_count", i32 19} -!20 = !{!"ext_intel_gpu_eu_simd_width", i32 20} -!21 = !{!"ext_intel_gpu_slices", i32 21} -!22 = !{!"ext_intel_gpu_subslices_per_slice", i32 22} -!23 = !{!"ext_intel_gpu_eu_count_per_subslice", i32 23} -!24 = !{!"ext_intel_max_mem_bandwidth", i32 24} -!25 = !{!"ext_intel_mem_channel", i32 25} -!26 = !{!"usm_atomic_host_allocations", i32 26} -!27 = !{!"usm_atomic_shared_allocations", i32 27} -!28 = !{!"atomic64", i32 28} -!29 = !{!"ext_intel_device_info_uuid", i32 29} -!30 = !{!"ext_oneapi_srgb", i32 30} -!31 = !{!"ext_oneapi_native_assert", i32 31} -!32 = !{!"host_debuggable", i32 32} -!33 = !{!"ext_intel_gpu_hw_threads_per_eu", i32 33} -!34 = !{!"ext_oneapi_cuda_async_barrier", i32 34} 
-!35 = !{!"ext_oneapi_bfloat16_math_functions", i32 35} -!36 = !{!"ext_intel_free_memory", i32 36} -!37 = !{!"ext_intel_device_id", i32 37} -!38 = !{!"ext_intel_memory_clock_rate", i32 38} -!39 = !{!"ext_intel_memory_bus_width", i32 39} -!40 = !{!"emulated", i32 40} -!41 = !{!"ext_intel_legacy_image", i32 41} -!42 = !{!"ext_oneapi_bindless_images", i32 42} -!43 = !{!"ext_oneapi_bindless_images_shared_usm", i32 43} -!44 = !{!"ext_oneapi_bindless_images_1d_usm", i32 44} -!45 = !{!"ext_oneapi_bindless_images_2d_usm", i32 45} -!46 = !{!"ext_oneapi_interop_memory_import", i32 46} -!47 = !{!"ext_oneapi_interop_memory_export", i32 47} -!48 = !{!"ext_oneapi_interop_semaphore_import", i32 48} -!49 = !{!"ext_oneapi_interop_semaphore_export", i32 49} -!50 = !{!"ext_oneapi_mipmap", i32 50} -!51 = !{!"ext_oneapi_mipmap_anisotropy", i32 51} -!52 = !{!"ext_oneapi_mipmap_level_reference", i32 52} -!53 = !{!"ext_intel_esimd", i32 53} -!54 = !{!"ext_oneapi_ballot_group", i32 54} -!55 = !{!"ext_oneapi_fixed_size_group", i32 55} -!56 = !{!"ext_oneapi_opportunistic_group", i32 56} -!57 = !{!"ext_oneapi_tangle_group", i32 57} -!58 = !{!"ext_intel_matrix", i32 58} -!59 = !{!"int64_base_atomics", i32 7} -!60 = !{!"int64_extended_atomics", i32 8} -!61 = !{!"usm_system_allocator", i32 17} -!62 = !{!"usm_restricted_shared_allocations", i32 16} -!63 = !{!"host", i32 0} !64 = !{!"clang version 19.0.0git (/ws/llvm/clang a7f3a637bdd6299831f903bbed9e8d069fea5c86)"} -!65 = !{i32 233} -!66 = !{i32 -1} -!67 = !{i32 6} -!68 = !{} -!69 = !{i1 false} -!70 = !{!71, !71, i64 0} -!71 = !{!"double", !72, i64 0} -!72 = !{!"omnipotent char", !73, i64 0} -!73 = !{!"Simple C++ TBAA"} -!74 = !{i32 364} -!75 = !{!76, !76, i64 0} -!76 = !{!"float", !72, i64 0} -!77 = !{i32 529} +!67 = !{!9} !78 = !{i32 8} !79 = !{i32 16} !80 = !{i32 32} diff --git a/llvm/tools/sycl-post-link/sycl-post-link.cpp b/llvm/tools/sycl-post-link/sycl-post-link.cpp index 6c6db956c383a..0d060e0c9aaf9 100644 --- 
a/llvm/tools/sycl-post-link/sycl-post-link.cpp +++ b/llvm/tools/sycl-post-link/sycl-post-link.cpp @@ -1014,41 +1014,12 @@ bool isTargetCompatibleWithModule(const std::optional &Target, DeviceConfigFile::TargetTable[*Target]; const SYCLDeviceRequirements &ModuleReqs = IrMD.getOrComputeDeviceRequirements(); - // The device config file data stores the target's supported - // aspects as a vector of the strings, so we need to translate - // the values to a common format. - const NamedMDNode *Node = IrMD.getModule().getNamedMetadata("sycl_aspects"); - if (Node) { - SmallMapVector AspectNameToValue; - for (const MDNode *N : Node->operands()) { - assert(N->getNumOperands() == 2 && - "Each operand of sycl_aspects must be a pair."); - - // The aspect's name is the first operand. - const auto *AspectName = cast(N->getOperand(0)); - - // The aspect's integral value is the second operand. - const auto *AspectCAM = cast(N->getOperand(1)); - const Constant *AspectC = AspectCAM->getValue(); - - AspectNameToValue[AspectName->getString()] = - cast(AspectC)->getSExtValue(); - } - - // Make the set of aspects values the target supports. - SmallSet TargetAspectValueSet; - for (const auto &Aspect : TargetInfo.aspects) { - auto It = AspectNameToValue.find(Aspect); - assert(It != AspectNameToValue.end() && "Aspect value mapping unknown!"); - TargetAspectValueSet.insert(It->second); - } - // Now check to see if all the requirements of the input module - // are compatbile with the target. - for (const auto &Aspect : ModuleReqs.Aspects) { - if (!TargetAspectValueSet.contains(Aspect)) - return false; - } + // Check to see if all the requirements of the input module + // are compatible with the target. + for (const auto &Aspect : ModuleReqs.Aspects) { + if (!is_contained(TargetInfo.aspects, Aspect.Name)) + return false; } // Check if module sub group size is compatible with the target. 
diff --git a/sycl/test-e2e/AOT/double.cpp b/sycl/test-e2e/AOT/double.cpp new file mode 100644 index 0000000000000..813fb194e017b --- /dev/null +++ b/sycl/test-e2e/AOT/double.cpp @@ -0,0 +1,26 @@ +// This test ensures that a program that has a kernel +// using fp64 can be compiled AOT. + +// REQUIRES: ocloc +// RUN: %clangxx -fsycl -fsycl-targets=intel_gpu_tgllp -o %t.tgllp.out %s +// RUN: %clangxx -fsycl -fsycl-targets=intel_gpu_pvc -o %t.pvc.out %s +// RUN: %clangxx -fsycl -fsycl-targets=intel_gpu_cfl -o %t.cfl.out %s + +#include + +using namespace sycl; + +int main() { + queue q; + if (q.get_device().has(aspect::fp64)) { + double d = 2.5; + { + buffer buf(&d, 1); + q.submit([&](handler &cgh) { + accessor acc{buf, cgh}; + cgh.single_task([=] { acc[0] *= 2; }); + }); + } + std::cout << d << "\n"; + } +} diff --git a/sycl/test-e2e/AOT/reqd-sg-size.cpp b/sycl/test-e2e/AOT/reqd-sg-size.cpp new file mode 100644 index 0000000000000..5272f25e83017 --- /dev/null +++ b/sycl/test-e2e/AOT/reqd-sg-size.cpp @@ -0,0 +1,74 @@ +// This test ensures that a program that has a kernel +// using various required sub-group sizes can be compiled AOT. 
+ +// REQUIRES: ocloc +// RUN: %clangxx -fsycl -fsycl-targets=intel_gpu_tgllp -o %t.tgllp.out %s +// RUN: %clangxx -fsycl -fsycl-targets=intel_gpu_pvc -o %t.pvc.out %s +// RUN: %clangxx -fsycl -fsycl-targets=intel_gpu_cfl -o %t.cfl.out %s + +#include +#include + +#include + +using namespace sycl; + +template class kernel_name; + +template struct SubgroupDispatcher { + std::vector> fails; + SubgroupDispatcher(queue &q) : q(q) {} + + void operator()(const std::vector &v) { + for (auto i : v) + (*this)(i); + } + + void operator()(size_t n) { (dispatch(n), ...); } + +private: + queue &q; + + template void dispatch(size_t n) { + if (n == size) { + size_t res = 0; + { + buffer buf(&res, 1); + q.submit([&](handler &cgh) { + accessor acc{buf, cgh}; + cgh.parallel_for>( + nd_range<1>(1, 1), + [=](auto item) [[intel::reqd_sub_group_size(size)]] { + acc[0] = item.get_sub_group().get_max_local_range()[0]; + }); + }); + } + if (res != size) + fails.push_back({res, size}); + } + } +}; + +int main() { + queue q; + auto ctx = q.get_context(); + auto dev = q.get_device(); + auto sizes = dev.get_info(); + std::cout << " sub-group sizes supported by the device: " << sizes[0]; + for (int i = 1; i < sizes.size(); ++i) { + std::cout << ", " << sizes[i]; + } + std::cout << '\n'; + + using dispatcher_t = SubgroupDispatcher<4, 8, 16, 32, 64, 128>; + dispatcher_t dispatcher(q); + dispatcher(sizes); + if (dispatcher.fails.size() > 0) { + for (auto [actual, expected] : dispatcher.fails) { + std::cout << "actual: " << actual << "\n" + << "expected: " << expected << "\n"; + } + } else { + std::cout << "pass\n"; + } +} From 25bfb0bcb19106ae61e81e01c7a469d9b7e801fc Mon Sep 17 00:00:00 2001 From: Greg Lueck Date: Fri, 7 Jun 2024 12:48:31 -0400 Subject: [PATCH 47/55] [SYCL][Doc] Extension spec for "work_group_memory" (#13725) Add a proposed extension specification for `work_group_memory`, a lighter weight API to allocate device local memory for an nd-range kernel. 
Also related, add a list of restrictions that, when followed, provide a guarantee that a kernel written in the free-function kernel syntax can be launched directly via Level Zero or OpenCL. --- ..._ext_oneapi_free_function_kernels.asciidoc | 56 ++ ...sycl_ext_oneapi_work_group_memory.asciidoc | 553 ++++++++++++++++++ 2 files changed, 609 insertions(+) create mode 100644 sycl/doc/extensions/proposed/sycl_ext_oneapi_work_group_memory.asciidoc diff --git a/sycl/doc/extensions/proposed/sycl_ext_oneapi_free_function_kernels.asciidoc b/sycl/doc/extensions/proposed/sycl_ext_oneapi_free_function_kernels.asciidoc index f2832edc31156..7a471b7fa36c6 100644 --- a/sycl/doc/extensions/proposed/sycl_ext_oneapi_free_function_kernels.asciidoc +++ b/sycl/doc/extensions/proposed/sycl_ext_oneapi_free_function_kernels.asciidoc @@ -773,6 +773,62 @@ int main() { ``` +== {dpcpp} guaranteed compatibility with Level Zero and OpenCL backends + +The contents of this section are non-normative and apply only to the {dpcpp} +implementation. +Kernels written using the free function kernel syntax can be submitted to a +device by using the Level Zero or OpenCL backends, without going through the +SYCL host runtime APIs. +This works only when the kernel is AOT compiled to native device code using the +`-fsycl-targets` compiler option. + +The interface to the kernel in the native device code module is only guaranteed +when the kernel adheres to the following restrictions: + +* The kernel is written in the free function kernel syntax; +* The kernel function is declared as `extern "C"`; +* Each formal argument to the kernel is either a {cpp} trivially copyable type + or the `work_group_memory` type (see + link:../proposed/sycl_ext_oneapi_work_group_memory.asciidoc[ + sycl_ext_oneapi_work_group_memory]); and +* The translation unit containing the kernel is compiled with the + `-fno-sycl-dead-args-optimization` option. + +Both Level Zero and OpenCL identify a kernel via a _name_ string. 
+(See `zeKernelCreate` and `clCreateKernel` in their respective specifications.) +When a kernel is defined according to the restrictions above, the _name_ is +guaranteed to be the same as the name of the kernel's function in the {cpp} +source code but with "++__sycl_kernel_++" prefixed. +For example, if the function name is "foo", the kernel's name in the native +device code module is "++__sycl_kernel_foo++". + +Both Level Zero and OpenCL set kernel argument values using three pieces of +information: + +* The index of the argument; +* The size (in bytes) of the value; and +* A pointer to the start of the value. + +(See `zeKernelSetArgumentValue` and `clSetKernelArg` in their respective +specifications.) + +When a kernel is defined according to the restrictions above, the argument +indices are the same as the positions of the formal kernel arguments in the +{cpp} source code. +The first argument has index 0, the next has index 1, etc. + +If an argument has a trivially copyable type, the size must be the size of that +type, and the pointer must point to a memory region that has the same size and +representation as that trivially copyable type. + +If an argument has the type `work_group_memory`, the size must be the size (in +bytes) of the device local memory that is represented by the +`work_group_memory` argument. +The pointer passed to `zeKernelSetArgumentValue` or `clSetKernelArg` must be +NULL in this case. + + == Implementation notes === Compiler diagnostics diff --git a/sycl/doc/extensions/proposed/sycl_ext_oneapi_work_group_memory.asciidoc b/sycl/doc/extensions/proposed/sycl_ext_oneapi_work_group_memory.asciidoc new file mode 100644 index 0000000000000..9a7875c6987ab --- /dev/null +++ b/sycl/doc/extensions/proposed/sycl_ext_oneapi_work_group_memory.asciidoc @@ -0,0 +1,553 @@ += sycl_ext_oneapi_work_group_memory + +:source-highlighter: coderay +:coderay-linenums-mode: table + +// This section needs to be after the document title. 
+:doctype: book +:toc2: +:toc: left +:encoding: utf-8 +:lang: en +:dpcpp: pass:[DPC++] +:endnote: —{nbsp}end{nbsp}note + +// Set the default source code type in this document to C++, +// for syntax highlighting purposes. This is needed because +// docbook uses c++ and html5 uses cpp. +:language: {basebackend@docbook:c++:cpp} + + +== Notice + +[%hardbreaks] +Copyright (C) 2024 Intel Corporation. All rights reserved. + +Khronos(R) is a registered trademark and SYCL(TM) and SPIR(TM) are trademarks +of The Khronos Group Inc. OpenCL(TM) is a trademark of Apple Inc. used by +permission by Khronos. + + +== Contact + +To report problems with this extension, please open a new issue at: + +https://github.com/intel/llvm/issues + + +== Dependencies + +This extension is written against the SYCL 2020 revision 8 specification. +All references below to the "core SYCL specification" or to section numbers in +the SYCL specification refer to that revision. + +This extension also depends on the following other SYCL extensions: + +* link:../experimental/sycl_ext_oneapi_properties.asciidoc[ + sycl_ext_oneapi_properties] + + +== Status + +This is a proposed extension specification, intended to gather community +feedback. +Interfaces defined in this specification may not be implemented yet or may be +in a preliminary state. +The specification itself may also change in incompatible ways before it is +finalized. +*Shipping software products should not rely on APIs defined in this +specification.* + + +== Overview + +This extension adds a lower overhead way to allocate device local memory, +memory which is shared by all work-items in a work-group. +The `local_accessor` class in the core SYCL specification provides a mechanism +to do this also, but `local_accessor` has higher overhead because it +encapsulates both a pointer to the memory and the size of that memory. +When a `local_accessor` has multiple dimensions, it contains the size in +each dimension. 
+By comparison, the `work_group_memory` class in this extension encapsulates +only a pointer to the memory without any size information. +The functionality of `work_group_memory` is, of course, less than +`local_accessor`, but many applications do not need the extra features. + + +== Specification + +=== Feature test macro + +This extension provides a feature-test macro as described in the core SYCL +specification. +An implementation supporting this extension must predefine the macro +`SYCL_EXT_ONEAPI_WORK_GROUP_MEMORY` to one of the values defined in the table +below. +Applications can test for the existence of this macro to determine if the +implementation supports this feature, or applications can test the macro's +value to determine which of the extension's features the implementation +supports. + +[%header,cols="1,5"] +|=== +|Value +|Description + +|1 +|The APIs of this experimental extension are not versioned, so the + feature-test macro always has this value. +|=== + +=== New `work_group_memory` class + +This extension adds the following new class: + +[source,c++] +---- +namespace sycl::ext::oneapi::experimental { + +template +class work_group_memory { + public: + using value_type = std::remove_all_extents_t; + + work_group_memory(); + work_group_memory(const work_group_memory& rhs); + work_group_memory(handler& cgh); + work_group_memory(size_t num, handler& cgh); + work_group_memory& operator=(const work_group_memory& rhs); + + operator DataT&() const; + const work_group_memory& operator=(const DataT& value) const; + DataT* operator&() const; + + template + multi_ptr get_multi_ptr() const; +}; + +} // namespace sycl::ext::oneapi::experimental +---- + +The `work_group_memory` class allocates device local memory and provides access +to this memory from within a SYCL kernel function. +The local memory that is allocated is shared between all work-items of a +work-group. 
+If multiple work-groups execute simultaneously, each of those work-group +receives its own independent copy of the allocated local memory. + +The `work_group_memory` type is a legal kernel parameter type as defined in +section 4.12.4 "Rules for parameter passing to kernels" of the core SYCL +specification. +Applications typically construct an object of type `work_group_memory` in +command group scope, pass the object as a kernel parameter, and then reference +the object inside the kernel in order to access the device local memory that it +contains. + +The `work_group_memory` class may only be used in an nd-range kernel. +If an application passes a `work_group_memory` object as an argument to a +single-task kernel or to a simple "range" kernel, the implementation must throw +a synchronous `exception` with the `errc::kernel_argument` error code when the +kernel is enqueued. + +The `DataT` template parameter identifies the type of the objects created in +device local memory, and this type must be one of the types that is supported +in device code. +In order to create an array of objects, `DataT` should be an array type. +For example, `work_group_memory` creates an array of 10 `float` +objects in device local memory. +In order to create an array of objects where the number of elements is +determined at runtime, specify an unbounded array type such as +`work_group_memory` and use the constructor overload that takes a +`num` parameter. + +If `DataT` is an implicit-lifetime type as defined in the {cpp} core language, +`work_group_memory` implicitly creates objects of that type with indeterminate +values. +For other types, `work_group_memory` merely allocates uninitialized memory, and +the application is responsible for constructing objects in that memory (e.g. by +calling placement-new). + +The `PropertyListT` template parameter currently has no meaning and must have +its default value of `empty_properties_t`. 
+This template parameter may be used in the future to associate compile-time +properties with the `work_group_memory`. + +==== Type aliases + +[frame=all,grid=none,separator="@"] +!==== +a@ +[source,c++] +---- +using value_type = std::remove_all_extents_t; +---- +!==== + +This type alias provides the data type of the device local memory with all +array extents removed. + +==== Constructors and copy assignment + +[frame=all,grid=none,separator="@"] +!==== +a@ +[source,c++] +---- +work_group_memory(); +---- +!==== + +_Effects:_ Constructs a "dummy" `work_group_memory` object that does not +represent any device local memory. +The only valid operation for a dummy object is the copy-assignment operator, +which overwrites the object with the right-hand-side of the assignment. +Passing a dummy object as a kernel argument or calling any of its other +member functions or operators produces undefined behavior. + +[_Note:_ This constructor may be called in either host code or device code. +_{endnote}_] + +''' + +[frame=all,grid=none,separator="@"] +!==== +a@ +[source,c++] +---- +work_group_memory(const work_group_memory& rhs); +---- +!==== + +_Effects:_ Constructs a `work_group_memory` object which is a copy of the +`rhs` object. +The new object represents the same underlying device local memory as `rhs`. + +[_Note:_ This constructor may be called in either host code or device code. +_{endnote}_] + +[_Note:_ The copied object does not always represent the same underlying device +local memory when the copy constructor is called in host code. +See the open issues. +_{endnote}_] + +''' + +[frame=all,grid=none,separator="@"] +!==== +a@ +[source,c++] +---- +work_group_memory(handler& cgh); (1) +work_group_memory(size_t num, handler& cgh); (2) +---- +!==== + +_Preconditions:_ These constructors must be called from host code. + +_Constraints (1):_ Available only when `DataT` is not an unbounded array. + +_Constraints (2):_ Available only when `DataT` is an unbounded array. 
+ +_Effects:_ Constructs a `work_group_memory` object which represents device +local memory of type `DataT` in the kernel that is enqueued via the `cgh` +handler. +Overload (2) uses `num` to determine the number of elements in the unbounded +array `DataT`. + +_Remarks:_ Attempting to pass the `work_group_memory` object as an argument +to a kernel that is _not_ launched via the `cgh` handler produces undefined +behavior. + +''' + +[frame=all,grid=none,separator="@"] +!==== +a@ +[source,c++] +---- +work_group_memory& operator=(const work_group_memory& rhs); +---- +!==== + +_Effects:_ Replaces the `work_group_memory` object with a copy of the `rhs` object. +The replaced object represents the same underlying device local memory as `rhs`. + +_Returns:_ A reference to the `work_group_memory` object. + +[_Note:_ This operator may be called in either host code or device code. +_{endnote}_] + +[_Note:_ The replaced object does not always represent the same underlying +device local memory when the assignment operator is called in host code. +See the open issues. +_{endnote}_] + +==== Member functions and operators + +[frame=all,grid=none,separator="@"] +!==== +a@ +[source,c++] +---- +operator DataT&() const; +---- +!==== + +_Preconditions:_ This operator must be called from device code. + +_Effects:_ Implicit conversion to the underlying `DataT`. + +''' + +[frame=all,grid=none,separator="@"] +!==== +a@ +[source,c++] +---- +const work_group_memory& operator=(const DataT& value) const; +---- +!==== + +_Preconditions:_ This operator must be called from device code. + +_Constraints:_ Available only when `DataT` is not an array. + +_Effects:_ Assigns the value `value` to the underlying device local memory +object. + +_Returns:_ A reference to the `work_group_memory` object. + +''' + +[frame=all,grid=none,separator="@"] +!==== +a@ +[source,c++] +---- +DataT* operator&() const; +---- +!==== + +_Preconditions:_ This operator must be called from device code. 
+ +_Returns:_ A pointer to the underlying device local memory object. + +''' + +[frame=all,grid=none,separator="@"] +!==== +a@ +[source,c++] +---- +template +multi_ptr get_multi_ptr() const; +---- +!==== + +_Preconditions:_ This function must be called from device code. + +_Returns:_ A `multi_ptr` to the underlying device local memory object. + + +== Examples + +=== Basic usage + +The following example illustrates a typical use of the `work_group_memory` +class. + +[source,c++] +---- +#include +namespace syclexp = sycl::ext::oneapi::experimental; + +constexpr size_t SIZE = 4096; +constexpr size_t WGSIZE = 256; + +int main() { + sycl::queue q; + + q.submit([&](sycl::handler &cgh) { + // Allocate one element for each work-item in the work-group. + syclexp::work_group_memory mem{cgh}; + + sycl::nd_range ndr{{SIZE}, {WGSIZE}}; + cgh.parallel_for(ndr, [=](sycl::nd_item<> it) { + size_t id = it.get_local_linear_id(); + + // Each work-item has its own dedicated element of the array. + mem[id] = /*...*/; + }); + }).wait(); +} +---- + +=== Operations on types + +The following example illustrates various operations that can be done with the +`work_group_memory` class when it is templated with different `DataT` types. 
+
+[source,c++]
+----
+#include <sycl/sycl.hpp>
+namespace syclexp = sycl::ext::oneapi::experimental;
+
+constexpr size_t SIZE = 4096;
+constexpr size_t WGSIZE = 256;
+
+struct point {
+  int x;
+  int y;
+};
+
+int main() {
+  sycl::queue q;
+
+  q.submit([&](sycl::handler &cgh) {
+    syclexp::work_group_memory<int> mem1{cgh};          // scalar
+    syclexp::work_group_memory<int[10]> mem2{cgh};      // bounded array
+    syclexp::work_group_memory<int[]> mem3{5, cgh};     // unbounded array
+    syclexp::work_group_memory<int[][10]> mem4{2, cgh}; // multi-dimensional array
+    syclexp::work_group_memory<point[10]> mem5{cgh};    // array of struct
+
+    sycl::nd_range ndr{{SIZE}, {WGSIZE}};
+    cgh.parallel_for(ndr, [=](sycl::nd_item<> it) {
+      if (it.get_group().leader()) {
+        // A "work_group_memory" templated on a scalar type acts much like the
+        // enclosed scalar type.
+        ++mem1;
+        mem1++;
+        mem1 += 1;
+        mem1 = mem1 + 1;
+        int *p1 = &mem1;
+
+        // A "work_group_memory" templated on an array type (either bounded or
+        // unbounded) acts like an array.
+        ++mem2[4];
+        mem2[4]++;
+        mem2[4] = mem2[4] + 1;
+        int *p2 = &mem2[4];
+
+        // A multi-dimensional array works as expected.
+        mem4[1][5] = mem4[1][5] + 1;
+        mem4[1][7] = mem4[1][7] + 1;
+
+        // An array of structs works as expected too.
+        mem5[1].x++;
+        mem5[1].y = mem5[1].y + 1;
+      }
+    });
+  }).wait();
+}
+----
+
+=== Usage with a free function kernel
+
+The following example illustrates usage of `work_group_memory` in a free
+function kernel.
+
+[source,c++]
+----
+#include <sycl/sycl.hpp>
+namespace syclexp = sycl::ext::oneapi::experimental;
+namespace syclext = sycl::ext::oneapi;
+
+constexpr size_t SIZE = 4096;
+constexpr size_t WGSIZE = 256;
+
+SYCL_EXT_ONEAPI_FUNCTION_PROPERTY((syclexp::nd_range_kernel<1>))
+void mykernel(syclexp::work_group_memory<int[WGSIZE]> mem) {
+  size_t id = syclext::this_work_item::get_nd_item<1>().get_local_linear_id();
+
+  // Each work-item has its own dedicated element of the device local memory
+  // array.
+  mem[id] = /*...*/;
+}
+
+int main() {
+  sycl::queue q;
+  sycl::context ctxt = q.get_context();
+
+  // Get the kernel object for the "mykernel" kernel.
+  auto exe_bndl =
+      syclexp::get_kernel_bundle<mykernel, sycl::bundle_state::executable>(ctxt);
+  sycl::kernel k_mykernel = exe_bndl.ext_oneapi_get_kernel<mykernel>();
+
+  q.submit([&](sycl::handler &cgh) {
+    // Allocate an array of device local memory with one element for each
+    // work-item in the work-group.
+    syclexp::work_group_memory<int[WGSIZE]> mem{cgh};
+    cgh.set_args(mem);
+
+    sycl::nd_range ndr{{SIZE}, {WGSIZE}};
+    cgh.parallel_for(ndr, k_mykernel);
+  }).wait();
+}
+----
+
+
+== Issues
+
+* We have not agreed on the way in which `work_group_memory` should be created
+  when there is a property list.
+  One option is to add a new constructor that takes a `PropertyListT` parameter
+  and use CTAD to deduce the class template parameters.
+  However, we need some way to deduce `DataT` because CTAD does not work unless
+  it deduces all of the template parameters.
+  This leads to a constructor that requires a tag-type parameter like:
++
+[source,c++]
+----
+template <typename DataT>
+struct type_tag {};
+
+template <typename DataT>
+inline constexpr type_tag<DataT> type;
+
+template <typename DataT, typename PropertyListT = empty_properties_t>
+class work_group_memory {
+  work_group_memory(const type_tag<DataT>&, handler& cgh,
+                    const PropertyListT& props = {});
+};
+
+// Deduction guide for the constructor that takes "type_tag".
+template <typename DataT, typename PropertyListT>
+work_group_memory(const type_tag<DataT>&, handler&, const PropertyListT&) ->
+    work_group_memory<DataT, PropertyListT>;
+----
++
+Usage would be like:
++
+[source,c++]
+----
+syclexp::work_group_memory mem{syclexp::type<int[WGSIZE]>, cgh, props};
+----
++
+Another option is to add a factory function like:
++
+[source,c++]
+----
+template <typename DataT, typename PropertyListT = empty_properties_t>
+work_group_memory<DataT, PropertyListT>
+make_work_group_memory(handler& cgh, const PropertyListT& props = {});
+----
++
+In which case, usage would be like:
++
+[source,c++]
+----
+auto mem = syclexp::make_work_group_memory<int[WGSIZE]>(cgh, props);
+----
++
+We decided to defer this decision for now because we don't have any properties
+defined for this class yet anyways.
+ +* The copy constructor and copy assignment operator say that the copied object + "represents the same underlying device local memory as ``rhs``". + This is not currently the case in {dpcpp} when the copy happens in host code. + If you pass two `work_group_memory` objects as kernel parameters, each object + creates a unique device local memory region, even if one `work_group_memory` + object is a copy of the other. + The `local_accessor` class behaves the same way. + See https://github.com/KhronosGroup/SYCL-Docs/issues/552[this issue] against + the SYCL specification. From 4fcc7449fcfd331541e35ed21d2b4dece3913120 Mon Sep 17 00:00:00 2001 From: David Garcia Orozco Date: Fri, 7 Jun 2024 13:29:50 -0600 Subject: [PATCH 48/55] [SYCL][E2E] Remove uses of OpenCL primitives in Basic/image e2e tests (#14084) --- sycl/test-e2e/Basic/image/image.cpp | 2 +- .../image/image_accessor_readsampler.cpp | 144 +++++++-------- .../Basic/image/image_accessor_readwrite.cpp | 170 +++++++++--------- .../image/image_accessor_readwrite_half.cpp | 62 ++++--- 4 files changed, 180 insertions(+), 198 deletions(-) diff --git a/sycl/test-e2e/Basic/image/image.cpp b/sycl/test-e2e/Basic/image/image.cpp index 9d7165150194a..3fdc767420ca7 100644 --- a/sycl/test-e2e/Basic/image/image.cpp +++ b/sycl/test-e2e/Basic/image/image.cpp @@ -91,7 +91,7 @@ int main() { constexpr int dims = 1; - using data_img = sycl::cl_float4; + using data_img = sycl::float4; constexpr auto mode_img = sycl::access::mode::read; constexpr auto target_img = sycl::target::image; const auto range_img = sycl::range(3); diff --git a/sycl/test-e2e/Basic/image/image_accessor_readsampler.cpp b/sycl/test-e2e/Basic/image/image_accessor_readsampler.cpp index 78cdc0bd1e41f..20c7ecb4fec8a 100644 --- a/sycl/test-e2e/Basic/image/image_accessor_readsampler.cpp +++ b/sycl/test-e2e/Basic/image/image_accessor_readsampler.cpp @@ -26,14 +26,13 @@ namespace s = sycl; template class kernel_class; -void validateReadData(s::cl_float4 ReadData, 
s::cl_float4 ExpectedColor, - s::cl_int precision = 1) { +void validateReadData(s::float4 ReadData, s::float4 ExpectedColor, + int precision = 1) { // Maximum difference of 1.5 ULP is allowed when precision = 1. - s::cl_int4 PixelDataInt = ReadData.template as(); - s::cl_int4 ExpectedDataInt = ExpectedColor.template as(); - s::cl_int4 Diff = ExpectedDataInt - PixelDataInt; - s::cl_int DataIsCorrect = - s::all((Diff <= precision) && (Diff >= (-precision))); + s::int4 PixelDataInt = ReadData.template as(); + s::int4 ExpectedDataInt = ExpectedColor.template as(); + s::int4 Diff = ExpectedDataInt - PixelDataInt; + int DataIsCorrect = s::all((Diff <= precision) && (Diff >= (-precision))); #if DEBUG_OUTPUT { if (DataIsCorrect) { @@ -49,28 +48,30 @@ void validateReadData(s::cl_float4 ReadData, s::cl_float4 ExpectedColor, Diff.dump(); } #else - { assert(DataIsCorrect); } + { + assert(DataIsCorrect); + } #endif } template -void checkReadSampler(char *host_ptr, s::sampler Sampler, s::cl_float4 Coord, - s::cl_float4 ExpectedColor, s::cl_int precision = 1) { +void checkReadSampler(char *host_ptr, s::sampler Sampler, s::float4 Coord, + s::float4 ExpectedColor, int precision = 1) { - s::cl_float4 ReadData; + s::float4 ReadData; { // image with dim = 3 s::image<3> Img(host_ptr, s::image_channel_order::rgba, s::image_channel_type::snorm_int8, s::range<3>{2, 3, 4}); s::queue myQueue; - s::buffer ReadDataBuf(&ReadData, s::range<1>(1)); + s::buffer ReadDataBuf(&ReadData, s::range<1>(1)); myQueue.submit([&](s::handler &cgh) { - auto ReadAcc = Img.get_access(cgh); - s::accessor ReadDataBufAcc( + auto ReadAcc = Img.get_access(cgh); + s::accessor ReadDataBufAcc( ReadDataBuf, cgh); cgh.single_task>([=]() { - s::cl_float4 RetColor = ReadAcc.read(Coord, Sampler); + s::float4 RetColor = ReadAcc.read(Coord, Sampler); ReadDataBufAcc[0] = RetColor; }); }); @@ -90,9 +91,8 @@ void checkSamplerNearest() { // addressing_mode::mirrored_repeat { // Out-of-range mirrored_repeat mode - s::cl_float4 
Coord(0.0f, 1.5f, 2.5f, 0.0f); - s::cl_float4 ExpectedValue = - s::cl_float4(56.0f, 57.0f, 58.0f, 59.0f) / 127.0f; + s::float4 Coord(0.0f, 1.5f, 2.5f, 0.0f); + s::float4 ExpectedValue = s::float4(56.0f, 57.0f, 58.0f, 59.0f) / 127.0f; auto Sampler = s::sampler(s::coordinate_normalization_mode::normalized, s::addressing_mode::mirrored_repeat, s::filtering_mode::nearest); @@ -102,9 +102,8 @@ void checkSamplerNearest() { // addressing_mode::repeat { // Out-of-range repeat mode - s::cl_float4 Coord(0.0f, 1.5f, 2.5f, 0.0f); - s::cl_float4 ExpectedValue = - s::cl_float4(56.0f, 57.0f, 58.0f, 59.0f) / 127.0f; + s::float4 Coord(0.0f, 1.5f, 2.5f, 0.0f); + s::float4 ExpectedValue = s::float4(56.0f, 57.0f, 58.0f, 59.0f) / 127.0f; auto Sampler = s::sampler(s::coordinate_normalization_mode::normalized, s::addressing_mode::repeat, s::filtering_mode::nearest); @@ -114,9 +113,8 @@ void checkSamplerNearest() { // addressing_mode::clamp_to_edge { // Out-of-range Edge Color - s::cl_float4 Coord(0.0f, 1.5f, 2.5f, 0.0f); - s::cl_float4 ExpectedValue = - s::cl_float4(88.0f, 89.0f, 90.0f, 91.0f) / 127.0f; + s::float4 Coord(0.0f, 1.5f, 2.5f, 0.0f); + s::float4 ExpectedValue = s::float4(88.0f, 89.0f, 90.0f, 91.0f) / 127.0f; auto Sampler = s::sampler(s::coordinate_normalization_mode::normalized, s::addressing_mode::clamp_to_edge, s::filtering_mode::nearest); @@ -126,8 +124,8 @@ void checkSamplerNearest() { // addressing_mode::clamp { // Out-of-range Border Color - s::cl_float4 Coord(0.0f, 1.5f, 2.5f, 0.0f); - s::cl_float4 ExpectedValue = s::cl_float4(0.0f, 0.0f, 0.0f, 0.0f); + s::float4 Coord(0.0f, 1.5f, 2.5f, 0.0f); + s::float4 ExpectedValue = s::float4(0.0f, 0.0f, 0.0f, 0.0f); auto Sampler = s::sampler(s::coordinate_normalization_mode::normalized, s::addressing_mode::clamp, s::filtering_mode::nearest); @@ -137,9 +135,8 @@ void checkSamplerNearest() { // addressing_mode::none { // In-range for consistent return value. 
- s::cl_float4 Coord(0.0f, 0.5f, 0.75f, 0.0f); - s::cl_float4 ExpectedValue = - s::cl_float4(80.0f, 81.0f, 82.0f, 83.0f) / 127.0f; + s::float4 Coord(0.0f, 0.5f, 0.75f, 0.0f); + s::float4 ExpectedValue = s::float4(80.0f, 81.0f, 82.0f, 83.0f) / 127.0f; auto Sampler = s::sampler(s::coordinate_normalization_mode::normalized, s::addressing_mode::none, s::filtering_mode::nearest); @@ -149,9 +146,8 @@ void checkSamplerNearest() { // B. coordinate_normalization_mode::unnormalized // addressing_mode::clamp_to_edge { - s::cl_float4 Coord(0.0f, 1.5f, 2.5f, 0.0f); - s::cl_float4 ExpectedValue = - s::cl_float4(56.0f, 57.0f, 58.0f, 59.0f) / 127.0f; + s::float4 Coord(0.0f, 1.5f, 2.5f, 0.0f); + s::float4 ExpectedValue = s::float4(56.0f, 57.0f, 58.0f, 59.0f) / 127.0f; auto Sampler = s::sampler(s::coordinate_normalization_mode::unnormalized, s::addressing_mode::clamp_to_edge, s::filtering_mode::nearest); @@ -160,9 +156,8 @@ void checkSamplerNearest() { // addressing_mode::clamp { - s::cl_float4 Coord(0.0f, 1.5f, 2.5f, 0.0f); - s::cl_float4 ExpectedValue = - s::cl_float4(56.0f, 57.0f, 58.0f, 59.0f) / 127.0f; + s::float4 Coord(0.0f, 1.5f, 2.5f, 0.0f); + s::float4 ExpectedValue = s::float4(56.0f, 57.0f, 58.0f, 59.0f) / 127.0f; auto Sampler = s::sampler(s::coordinate_normalization_mode::unnormalized, s::addressing_mode::clamp, s::filtering_mode::nearest); @@ -172,9 +167,8 @@ void checkSamplerNearest() { // addressing_mode::none { // In-range for consistent return value. - s::cl_float4 Coord(0.0f, 1.0f, 2.0f, 0.0f); - s::cl_float4 ExpectedValue = - s::cl_float4(56.0f, 57.0f, 58.0f, 59.0f) / 127.0f; + s::float4 Coord(0.0f, 1.0f, 2.0f, 0.0f); + s::float4 ExpectedValue = s::float4(56.0f, 57.0f, 58.0f, 59.0f) / 127.0f; auto Sampler = s::sampler(s::coordinate_normalization_mode::unnormalized, s::addressing_mode::none, s::filtering_mode::nearest); @@ -190,7 +184,7 @@ void checkSamplerNearest() { // value of 15000 ULP is used. 
void checkSamplerLinear() { - const s::cl_int PrecisionInULP = 15000; + const int PrecisionInULP = 15000; // create image: char host_ptr[100]; for (int i = 0; i < 100; i++) @@ -201,9 +195,8 @@ void checkSamplerLinear() { // addressing_mode::mirrored_repeat { // Out-of-range mirrored_repeat mode - s::cl_float4 Coord(0.0f, 1.5f, 2.5f, 0.0f); - s::cl_float4 ExpectedValue = - s::cl_float4(44.0f, 45.0f, 46.0f, 47.0f) / 127.0f; + s::float4 Coord(0.0f, 1.5f, 2.5f, 0.0f); + s::float4 ExpectedValue = s::float4(44.0f, 45.0f, 46.0f, 47.0f) / 127.0f; auto Sampler = s::sampler(s::coordinate_normalization_mode::normalized, s::addressing_mode::mirrored_repeat, s::filtering_mode::linear); @@ -212,9 +205,8 @@ void checkSamplerLinear() { } { // In-range mirrored_repeat mode - s::cl_float4 Coord(0.0f, 0.25f, 0.55f, 0.0f); - s::cl_float4 ExpectedValue = - s::cl_float4(42.8f, 43.8f, 44.8f, 45.8f) / 127.0f; + s::float4 Coord(0.0f, 0.25f, 0.55f, 0.0f); + s::float4 ExpectedValue = s::float4(42.8f, 43.8f, 44.8f, 45.8f) / 127.0f; auto Sampler = s::sampler(s::coordinate_normalization_mode::normalized, s::addressing_mode::mirrored_repeat, s::filtering_mode::linear); @@ -225,9 +217,8 @@ void checkSamplerLinear() { // addressing_mode::repeat { // Out-of-range repeat mode - s::cl_float4 Coord(0.0f, 1.5f, 2.5f, 0.0f); - s::cl_float4 ExpectedValue = - s::cl_float4(46.0f, 47.0f, 48.0f, 49.0f) / 127.0f; + s::float4 Coord(0.0f, 1.5f, 2.5f, 0.0f); + s::float4 ExpectedValue = s::float4(46.0f, 47.0f, 48.0f, 49.0f) / 127.0f; auto Sampler = s::sampler(s::coordinate_normalization_mode::normalized, s::addressing_mode::repeat, s::filtering_mode::linear); @@ -236,9 +227,8 @@ void checkSamplerLinear() { } { // In-range repeat mode - s::cl_float4 Coord(0.0f, 0.25f, 0.55f, 0.0f); - s::cl_float4 ExpectedValue = - s::cl_float4(44.8f, 45.8f, 46.8f, 47.8f) / 127.0f; + s::float4 Coord(0.0f, 0.25f, 0.55f, 0.0f); + s::float4 ExpectedValue = s::float4(44.8f, 45.8f, 46.8f, 47.8f) / 127.0f; auto Sampler = 
s::sampler(s::coordinate_normalization_mode::normalized, s::addressing_mode::repeat, s::filtering_mode::linear); @@ -249,9 +239,8 @@ void checkSamplerLinear() { // addressing_mode::clamp_to_edge { // Out-of-range Edge Color - s::cl_float4 Coord(0.0f, 1.5f, 2.5f, 0.0f); - s::cl_float4 ExpectedValue = - s::cl_float4(88.0f, 89.0f, 90.0f, 91.0f) / 127.0f; + s::float4 Coord(0.0f, 1.5f, 2.5f, 0.0f); + s::float4 ExpectedValue = s::float4(88.0f, 89.0f, 90.0f, 91.0f) / 127.0f; auto Sampler = s::sampler(s::coordinate_normalization_mode::normalized, s::addressing_mode::clamp_to_edge, s::filtering_mode::linear); @@ -259,9 +248,8 @@ void checkSamplerLinear() { PrecisionInULP); } { - s::cl_float4 Coord(0.0f, 0.2f, 0.5f, 0.0f); // In-range - s::cl_float4 ExpectedValue = - s::cl_float4(36.8f, 37.8f, 38.8f, 39.8f) / 127.0f; + s::float4 Coord(0.0f, 0.2f, 0.5f, 0.0f); // In-range + s::float4 ExpectedValue = s::float4(36.8f, 37.8f, 38.8f, 39.8f) / 127.0f; auto Sampler = s::sampler(s::coordinate_normalization_mode::normalized, s::addressing_mode::clamp_to_edge, s::filtering_mode::linear); @@ -272,8 +260,8 @@ void checkSamplerLinear() { // addressing_mode::clamp { // Out-of-range - s::cl_float4 Coord(0.0f, 1.5f, 2.5f, 0.0f); - s::cl_float4 ExpectedValue = s::cl_float4(0.0f, 0.0f, 0.0f, 0.0f); + s::float4 Coord(0.0f, 1.5f, 2.5f, 0.0f); + s::float4 ExpectedValue = s::float4(0.0f, 0.0f, 0.0f, 0.0f); auto Sampler = s::sampler(s::coordinate_normalization_mode::normalized, s::addressing_mode::clamp, s::filtering_mode::linear); @@ -282,9 +270,8 @@ void checkSamplerLinear() { } { // In-range - s::cl_float4 Coord(0.0f, 0.2f, 0.5f, 0.0f); - s::cl_float4 ExpectedValue = - s::cl_float4(18.4f, 18.9f, 19.4f, 19.9f) / 127.0f; + s::float4 Coord(0.0f, 0.2f, 0.5f, 0.0f); + s::float4 ExpectedValue = s::float4(18.4f, 18.9f, 19.4f, 19.9f) / 127.0f; auto Sampler = s::sampler(s::coordinate_normalization_mode::normalized, s::addressing_mode::clamp, s::filtering_mode::linear); @@ -295,9 +282,8 @@ void 
checkSamplerLinear() { // addressing_mode::none { // In-range for consistent return value. - s::cl_float4 Coord(0.5f, 0.5f, 0.5f, 0.0f); - s::cl_float4 ExpectedValue = - s::cl_float4(46.0f, 47.0f, 48.0f, 49.0f) / 127.0f; + s::float4 Coord(0.5f, 0.5f, 0.5f, 0.0f); + s::float4 ExpectedValue = s::float4(46.0f, 47.0f, 48.0f, 49.0f) / 127.0f; auto Sampler = s::sampler(s::coordinate_normalization_mode::normalized, s::addressing_mode::none, s::filtering_mode::linear); @@ -309,9 +295,8 @@ void checkSamplerLinear() { // addressing_mode::clamp_to_edge { // Out-of-range - s::cl_float4 Coord(0.0f, 1.5f, 2.5f, 0.0f); - s::cl_float4 ExpectedValue = - s::cl_float4(56.0f, 57.0f, 58.0f, 59.0f) / 127.0f; + s::float4 Coord(0.0f, 1.5f, 2.5f, 0.0f); + s::float4 ExpectedValue = s::float4(56.0f, 57.0f, 58.0f, 59.0f) / 127.0f; auto Sampler = s::sampler(s::coordinate_normalization_mode::unnormalized, s::addressing_mode::clamp_to_edge, s::filtering_mode::linear); @@ -320,8 +305,8 @@ void checkSamplerLinear() { } { // In-range - s::cl_float4 Coord(0.0f, 0.2f, 0.5f, 0.0f); - s::cl_float4 ExpectedValue = s::cl_float4(0.0f, 1.0f, 2.0f, 3.0f) / 127.0f; + s::float4 Coord(0.0f, 0.2f, 0.5f, 0.0f); + s::float4 ExpectedValue = s::float4(0.0f, 1.0f, 2.0f, 3.0f) / 127.0f; auto Sampler = s::sampler(s::coordinate_normalization_mode::unnormalized, s::addressing_mode::clamp_to_edge, s::filtering_mode::linear); @@ -332,9 +317,8 @@ void checkSamplerLinear() { // addressing_mode::clamp { // Out-of-range - s::cl_float4 Coord(0.0f, 1.5f, 1.5f, 0.0f); - s::cl_float4 ExpectedValue = - s::cl_float4(16.0f, 16.5f, 17.0f, 17.5f) / 127.0f; + s::float4 Coord(0.0f, 1.5f, 1.5f, 0.0f); + s::float4 ExpectedValue = s::float4(16.0f, 16.5f, 17.0f, 17.5f) / 127.0f; auto Sampler = s::sampler(s::coordinate_normalization_mode::unnormalized, s::addressing_mode::clamp, s::filtering_mode::linear); @@ -343,9 +327,8 @@ void checkSamplerLinear() { } { // In-range - s::cl_float4 Coord(0.0f, 0.2f, 0.5f, 0.0f); - s::cl_float4 
ExpectedValue = - s::cl_float4(0.0f, 0.35f, 0.7f, 1.05f) / 127.0f; + s::float4 Coord(0.0f, 0.2f, 0.5f, 0.0f); + s::float4 ExpectedValue = s::float4(0.0f, 0.35f, 0.7f, 1.05f) / 127.0f; auto Sampler = s::sampler(s::coordinate_normalization_mode::unnormalized, s::addressing_mode::clamp, s::filtering_mode::linear); @@ -356,9 +339,8 @@ void checkSamplerLinear() { // addressing_mode::none { // In-range for consistent return value. - s::cl_float4 Coord(1.0f, 2.0f, 3.0f, 0.0f); - s::cl_float4 ExpectedValue = - s::cl_float4(74.0f, 75.0f, 76.0f, 77.0f) / 127.0f; + s::float4 Coord(1.0f, 2.0f, 3.0f, 0.0f); + s::float4 ExpectedValue = s::float4(74.0f, 75.0f, 76.0f, 77.0f) / 127.0f; auto Sampler = s::sampler(s::coordinate_normalization_mode::unnormalized, s::addressing_mode::none, s::filtering_mode::linear); @@ -369,8 +351,8 @@ void checkSamplerLinear() { int main() { - // Note: Currently these functions only check for cl_float4 return datatype, - // the test case can be extended to test all return datatypes. + // Note: Currently these functions only check for vec return + // datatype, the test case can be extended to test all return datatypes. 
checkSamplerNearest(); checkSamplerLinear(); } diff --git a/sycl/test-e2e/Basic/image/image_accessor_readwrite.cpp b/sycl/test-e2e/Basic/image/image_accessor_readwrite.cpp index 0508e2f2e5f2b..1313bc8e96223 100644 --- a/sycl/test-e2e/Basic/image/image_accessor_readwrite.cpp +++ b/sycl/test-e2e/Basic/image/image_accessor_readwrite.cpp @@ -28,8 +28,8 @@ template class kernel_class; template ) && - !(std::is_same_v))>::type> + (!(std::is_same_v) && + !(std::is_same_v))>::type> void check_read_data(ReadDataT ReadData, ReadDataT ExpectedColor) { using ReadDataType = typename ReadDataT::element_type; bool CorrectData = false; @@ -59,11 +59,11 @@ void check_read_data(ReadDataT ReadData, ReadDataT ExpectedColor) { #endif } -void check_read_data(s::cl_float4 ReadData, s::cl_float4 ExpectedColor) { +void check_read_data(s::float4 ReadData, s::float4 ExpectedColor) { // Maximum difference of 1.5 ULP is allowed. - s::cl_int4 PixelDataInt = ReadData.template as(); - s::cl_int4 ExpectedDataInt = ExpectedColor.template as(); - s::cl_int4 Diff = ExpectedDataInt - PixelDataInt; + s::int4 PixelDataInt = ReadData.template as(); + s::int4 ExpectedDataInt = ExpectedColor.template as(); + s::int4 Diff = ExpectedDataInt - PixelDataInt; bool CorrectData = false; if ((Diff.x() <= 1 && Diff.x() >= -1) && (Diff.y() <= 1 && Diff.y() >= -1) && (Diff.z() <= 1 && Diff.z() >= -1) && (Diff.w() <= 1 && Diff.w() >= -1)) @@ -89,10 +89,10 @@ void check_read_data(s::cl_float4 ReadData, s::cl_float4 ExpectedColor) { #endif } -void check_read_data(s::cl_half4 ReadData, s::cl_half4 ExpectedColor) { +void check_read_data(s::half4 ReadData, s::half4 ExpectedColor) { // Maximum difference of 1.5 ULP is allowed. 
- s::cl_float4 ReadDatafloat = ReadData.template convert(); - s::cl_float4 ExpectedColorfloat = ExpectedColor.template convert(); + s::float4 ReadDatafloat = ReadData.template convert(); + s::float4 ExpectedColorfloat = ExpectedColor.template convert(); check_read_data(ReadDatafloat, ExpectedColorfloat); } @@ -142,102 +142,102 @@ void check_read_type_order(char *HostPtr, const s::image_channel_order ImgOrder, template void check(char *); -template <> void check(char *HostPtr) { +template <> void check(char *HostPtr) { // valid channel types: // s::image_channel_type::signed_int8, - write_type_order( + write_type_order( HostPtr, s::image_channel_order::rgba, - s::cl_int4(std::numeric_limits::max(), - std::numeric_limits::min(), 123, 0)); - check_read_type_order( + s::int4(std::numeric_limits::max(), std::numeric_limits::min(), + 123, 0)); + check_read_type_order( HostPtr, s::image_channel_order::rgba, - s::cl_int4(std::numeric_limits::max(), - std::numeric_limits::min(), 123, 0)); + s::int4(std::numeric_limits::max(), + std::numeric_limits::min(), 123, 0)); // s::image_channel_type::signed_int16, - write_type_order( + write_type_order( HostPtr, s::image_channel_order::rgba, - s::cl_int4(std::numeric_limits::max(), - std::numeric_limits::min(), 123, 0)); - check_read_type_order( + s::int4(std::numeric_limits::max(), std::numeric_limits::min(), + 123, 0)); + check_read_type_order( HostPtr, s::image_channel_order::rgba, - s::cl_int4(std::numeric_limits::max(), - std::numeric_limits::min(), 123, 0)); + s::int4(std::numeric_limits::max(), + std::numeric_limits::min(), 123, 0)); // s::image_channel_type::signed_int32. 
- write_type_order( + write_type_order( HostPtr, s::image_channel_order::rgba, - s::cl_int4(std::numeric_limits::max(), - std::numeric_limits::min(), 123, 0)); - check_read_type_order( + s::int4(std::numeric_limits::max(), std::numeric_limits::min(), + 123, 0)); + check_read_type_order( HostPtr, s::image_channel_order::rgba, - s::cl_int4(std::numeric_limits::max(), - std::numeric_limits::min(), 123, 0)); + s::int4(std::numeric_limits::max(), std::numeric_limits::min(), + 123, 0)); }; -template <> void check(char *HostPtr) { - // Calling only valid channel types with s::cl_uint4. +template <> void check(char *HostPtr) { + // Calling only valid channel types with s::uint4. // s::image_channel_type::signed_int8 - write_type_order( + write_type_order( HostPtr, s::image_channel_order::rgba, - s::cl_uint4(std::numeric_limits::max(), - std::numeric_limits::min(), 123, 0)); - check_read_type_order( + s::uint4(std::numeric_limits::max(), + std::numeric_limits::min(), 123, 0)); + check_read_type_order( HostPtr, s::image_channel_order::rgba, - s::cl_uint4(std::numeric_limits::max(), - std::numeric_limits::min(), 123, 0)); + s::uint4(std::numeric_limits::max(), + std::numeric_limits::min(), 123, 0)); // s::image_channel_type::signed_int16 - write_type_order( + write_type_order( HostPtr, s::image_channel_order::rgba, - s::cl_uint4(std::numeric_limits::max(), - std::numeric_limits::min(), 123, 0)); - check_read_type_order( + s::uint4(std::numeric_limits::max(), + std::numeric_limits::min(), 123, 0)); + check_read_type_order( HostPtr, s::image_channel_order::rgba, - s::cl_uint4(std::numeric_limits::max(), - std::numeric_limits::min(), 123, 0)); + s::uint4(std::numeric_limits::max(), + std::numeric_limits::min(), 123, 0)); // s::image_channel_type::signed_int32 - write_type_order( + write_type_order( HostPtr, s::image_channel_order::rgba, - s::cl_uint4(std::numeric_limits::max(), - std::numeric_limits::min(), 123, 0)); - check_read_type_order( + 
s::uint4(std::numeric_limits::max(), + std::numeric_limits::min(), 123, 0)); + check_read_type_order( HostPtr, s::image_channel_order::rgba, - s::cl_uint4(std::numeric_limits::max(), - std::numeric_limits::min(), 123, 0)); + s::uint4(std::numeric_limits::max(), + std::numeric_limits::min(), 123, 0)); }; -template <> void check(char *HostPtr) { - // Calling only valid channel types with s::cl_float4. +template <> void check(char *HostPtr) { + // Calling only valid channel types with s::float4. // TODO: Correct the values below. // s::image_channel_type::snorm_int8, - write_type_order( - HostPtr, s::image_channel_order::rgba, s::cl_float4(2, -2, 0.375f, 0)); - check_read_type_order( + write_type_order( + HostPtr, s::image_channel_order::rgba, s::float4(2, -2, 0.375f, 0)); + check_read_type_order( HostPtr, s::image_channel_order::rgba, - s::cl_float4(1, -1, ((float)48 / 127) /*0.3779527544975280762f*/, 0)); + s::float4(1, -1, ((float)48 / 127) /*0.3779527544975280762f*/, 0)); // s::image_channel_type::snorm_int16, - write_type_order( - HostPtr, s::image_channel_order::rgba, s::cl_float4(2, -2, 0.375f, 0)); - check_read_type_order( + write_type_order( + HostPtr, s::image_channel_order::rgba, s::float4(2, -2, 0.375f, 0)); + check_read_type_order( HostPtr, s::image_channel_order::rgba, - s::cl_float4(1, -1, ((float)12288 / 32767) /*0.375011444091796875f*/, 0)); + s::float4(1, -1, ((float)12288 / 32767) /*0.375011444091796875f*/, 0)); // s::image_channel_type::unorm_int8, - write_type_order( - HostPtr, s::image_channel_order::rgba, s::cl_float4(2, -2, 0.375f, 0)); - check_read_type_order( + write_type_order( + HostPtr, s::image_channel_order::rgba, s::float4(2, -2, 0.375f, 0)); + check_read_type_order( HostPtr, s::image_channel_order::rgba, - s::cl_float4(1, 0, ((float)96 / 255) /*0.3764705955982208252f*/, 0)); + s::float4(1, 0, ((float)96 / 255) /*0.3764705955982208252f*/, 0)); // s::image_channel_type::unorm_int16 - write_type_order( - HostPtr, 
s::image_channel_order::rgba, s::cl_float4(2, -2, 0.375f, 0)); - check_read_type_order( + write_type_order( + HostPtr, s::image_channel_order::rgba, s::float4(2, -2, 0.375f, 0)); + check_read_type_order( HostPtr, s::image_channel_order::rgba, - s::cl_float4(1, 0, ((float)24576 / 65535) /*0.3750057220458984375f*/, 0)); + s::float4(1, 0, ((float)24576 / 65535) /*0.3750057220458984375f*/, 0)); // s::image_channel_type::unorm_short_565, order::rgbx // Currently unsupported since OpenCL has no information on this. @@ -247,37 +247,39 @@ template <> void check(char *HostPtr) { // (CL_IMAGE_FORMAT_NOT_SUPPORTED) s::image_channel_type::unorm_short_555, // order::rgbx /* - write_type_order( - HostPtr, s::image_channel_order::rgbx, s::cl_float4(2, -2, 0.375f, 0)); + write_type_order( + HostPtr, s::image_channel_order::rgbx, s::float4(2, -2, 0.375f, + 0)); // s::image_channel_type::unorm_int_101010, order::rgbx - write_type_order( - HostPtr, s::image_channel_order::rgbx, s::cl_float4(2, -2, 0.375f, 0)); + write_type_order( + HostPtr, s::image_channel_order::rgbx, s::float4(2, -2, 0.375f, + 0)); */ // s::image_channel_type::fp16 - write_type_order( - HostPtr, s::image_channel_order::rgba, s::cl_float4(2, -2, 0.375f, 0)); - check_read_type_order( - HostPtr, s::image_channel_order::rgba, s::cl_float4(2, -2, 0.375f, 0)); + write_type_order( + HostPtr, s::image_channel_order::rgba, s::float4(2, -2, 0.375f, 0)); + check_read_type_order( + HostPtr, s::image_channel_order::rgba, s::float4(2, -2, 0.375f, 0)); // s::image_channel_type::fp32 - write_type_order( - HostPtr, s::image_channel_order::rgba, s::cl_float4(2, -2, 0.375f, 0)); - check_read_type_order( - HostPtr, s::image_channel_order::rgba, s::cl_float4(2, -2, 0.375f, 0)); + write_type_order( + HostPtr, s::image_channel_order::rgba, s::float4(2, -2, 0.375f, 0)); + check_read_type_order( + HostPtr, s::image_channel_order::rgba, s::float4(2, -2, 0.375f, 0)); }; int main() { // Checking only for dimension=1. 
- // 4 datatypes possible: s::cl_uint4, s::cl_int4, s::cl_float4, s::cl_half4. - // half4 datatype is checked in a different test case. - // create image: + // 4 datatypes possible: s::uint4, s::int4, s::float4, + // s::half4. s::half4 datatype is checked in a different test case. create + // image: char HostPtr[100]; for (int i = 0; i < 100; i++) HostPtr[i] = i; - check(HostPtr); - check(HostPtr); - check(HostPtr); + check(HostPtr); + check(HostPtr); + check(HostPtr); } diff --git a/sycl/test-e2e/Basic/image/image_accessor_readwrite_half.cpp b/sycl/test-e2e/Basic/image/image_accessor_readwrite_half.cpp index 7fcd17a87302f..88cc8825f2b92 100644 --- a/sycl/test-e2e/Basic/image/image_accessor_readwrite_half.cpp +++ b/sycl/test-e2e/Basic/image/image_accessor_readwrite_half.cpp @@ -26,16 +26,14 @@ namespace s = sycl; template class kernel_class; -void check_read_data(s::cl_float4 ReadData, s::cl_float4 ExpectedColor) { +void check_read_data(s::float4 ReadData, s::float4 ExpectedColor) { // Maximum difference of 1.5 ULP is allowed. 
- s::cl_int4 PixelDataInt = ReadData.template as(); - s::cl_int4 ExpectedDataInt = ExpectedColor.template as(); - s::cl_int4 Diff = ExpectedDataInt - PixelDataInt; + s::int4 PixelDataInt = ReadData.template as(); + s::int4 ExpectedDataInt = ExpectedColor.template as(); + s::int4 Diff = ExpectedDataInt - PixelDataInt; bool CorrectData = false; - if (((s::cl_int)Diff.x() <= 1 && (s::cl_int)Diff.x() >= -1) && - ((s::cl_int)Diff.y() <= 1 && (s::cl_int)Diff.y() >= -1) && - ((s::cl_int)Diff.z() <= 1 && (s::cl_int)Diff.z() >= -1) && - ((s::cl_int)Diff.w() <= 1 && (s::cl_int)Diff.w() >= -1)) + if ((Diff.x() <= 1 && Diff.x() >= -1) && (Diff.y() <= 1 && Diff.y() >= -1) && + (Diff.z() <= 1 && Diff.z() >= -1) && (Diff.w() <= 1 && Diff.w() >= -1)) CorrectData = true; #if DEBUG_OUTPUT @@ -59,9 +57,9 @@ void check_read_data(s::cl_float4 ReadData, s::cl_float4 ExpectedColor) { #endif } -void check_read_data(s::cl_half4 ReadData, s::cl_half4 ExpectedColor) { - s::cl_float4 ReadDatafloat = ReadData.convert(); - s::cl_float4 ExpectedColorfloat = ExpectedColor.convert(); +void check_read_data(s::half4 ReadData, s::half4 ExpectedColor) { + s::float4 ReadDatafloat = ReadData.convert(); + s::float4 ExpectedColorfloat = ExpectedColor.convert(); check_read_data(ReadDatafloat, ExpectedColorfloat); } @@ -111,40 +109,40 @@ void check_read_type_order(char *HostPtr, const s::image_channel_order ImgOrder, void check_half4(char *HostPtr) { - // Calling only valid channel types with s::cl_half4. + // Calling only valid channel types with s::half4. 
// s::image_channel_type::snorm_int8, - write_type_order( - HostPtr, s::image_channel_order::rgba, s::cl_half4(2, -2, 0.375f, 0)); - check_read_type_order( + write_type_order( + HostPtr, s::image_channel_order::rgba, s::half4(2, -2, 0.375f, 0)); + check_read_type_order( HostPtr, s::image_channel_order::rgba, - s::cl_half4(1, -1, ((float)48 / 127) /*0.3779527544975280762f*/, 0)); + s::half4(1, -1, ((float)48 / 127) /*0.3779527544975280762f*/, 0)); // s::image_channel_type::snorm_int16, - write_type_order( - HostPtr, s::image_channel_order::rgba, s::cl_half4(2, -2, 0.375f, 0)); - check_read_type_order( + write_type_order( + HostPtr, s::image_channel_order::rgba, s::half4(2, -2, 0.375f, 0)); + check_read_type_order( HostPtr, s::image_channel_order::rgba, - s::cl_half4(1, -1, ((float)12288 / 32767) /*0.375011444091796875f*/, 0)); + s::half4(1, -1, ((float)12288 / 32767) /*0.375011444091796875f*/, 0)); // s::image_channel_type::unorm_int8, - write_type_order( - HostPtr, s::image_channel_order::rgba, s::cl_half4(2, -2, 0.375f, 0)); - check_read_type_order( + write_type_order( + HostPtr, s::image_channel_order::rgba, s::half4(2, -2, 0.375f, 0)); + check_read_type_order( HostPtr, s::image_channel_order::rgba, - s::cl_half4(1, 0, ((float)96 / 255) /*0.3764705955982208252f*/, 0)); + s::half4(1, 0, ((float)96 / 255) /*0.3764705955982208252f*/, 0)); // s::image_channel_type::unorm_int16 - write_type_order( - HostPtr, s::image_channel_order::rgba, s::cl_half4(1, -1, 0.375f, 0)); - check_read_type_order( + write_type_order( + HostPtr, s::image_channel_order::rgba, s::half4(1, -1, 0.375f, 0)); + check_read_type_order( HostPtr, s::image_channel_order::rgba, - s::cl_half4(1, 0, ((float)24576 / 65535) /*0.3750057220458984375f*/, 0)); + s::half4(1, 0, ((float)24576 / 65535) /*0.3750057220458984375f*/, 0)); // s::image_channel_type::fp16 - write_type_order( - HostPtr, s::image_channel_order::rgba, s::cl_half4(2, -2, 0.375f, 0)); - check_read_type_order( - HostPtr, 
s::image_channel_order::rgba, s::cl_half4(2, -2, 0.375f, 0)); + write_type_order( + HostPtr, s::image_channel_order::rgba, s::half4(2, -2, 0.375f, 0)); + check_read_type_order( + HostPtr, s::image_channel_order::rgba, s::half4(2, -2, 0.375f, 0)); }; int main() { From 0e587edf36d68d6c35aa22851dc1889f19289933 Mon Sep 17 00:00:00 2001 From: Alexey Sachkov Date: Fri, 7 Jun 2024 22:07:42 +0200 Subject: [PATCH 49/55] [SYCL] Fix post-commit issue with library dependencies (#14094) --- llvm/lib/SYCLLowerIR/CMakeLists.txt | 5 +++-- llvm/lib/SYCLLowerIR/ModuleSplitter.cpp | 4 +--- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/llvm/lib/SYCLLowerIR/CMakeLists.txt b/llvm/lib/SYCLLowerIR/CMakeLists.txt index 49b9e802e75d3..22ad42e836135 100644 --- a/llvm/lib/SYCLLowerIR/CMakeLists.txt +++ b/llvm/lib/SYCLLowerIR/CMakeLists.txt @@ -88,11 +88,12 @@ add_llvm_component_library(LLVMSYCLLowerIR LLVMDemangle LLVMTargetParser LLVMTransformUtils - + LINK_COMPONENTS Analysis + BitWriter Core - Passes + IRPrinter Support ipo ) diff --git a/llvm/lib/SYCLLowerIR/ModuleSplitter.cpp b/llvm/lib/SYCLLowerIR/ModuleSplitter.cpp index fa6e12f0a07d2..900e1578c7adf 100644 --- a/llvm/lib/SYCLLowerIR/ModuleSplitter.cpp +++ b/llvm/lib/SYCLLowerIR/ModuleSplitter.cpp @@ -20,7 +20,6 @@ #include "llvm/IR/LegacyPassManager.h" #include "llvm/IR/Module.h" #include "llvm/IRPrinter/IRPrintingPasses.h" -#include "llvm/Passes/PassBuilder.h" #include "llvm/SYCLLowerIR/DeviceGlobals.h" #include "llvm/SYCLLowerIR/LowerInvokeSimd.h" #include "llvm/SYCLLowerIR/SYCLUtils.h" @@ -1164,8 +1163,7 @@ static Error saveModuleIRInFile(Module &M, StringRef FilePath, raw_fd_ostream OS(FD, true); ModulePassManager MPM; ModuleAnalysisManager MAM; - PassBuilder PB; - PB.registerModuleAnalyses(MAM); + MAM.registerPass([&] { return PassInstrumentationAnalysis(); }); if (OutputAssembly) MPM.addPass(PrintModulePass(OS)); else From 4222b4ccd6dc499248c8bf026bcdd0f207000b35 Mon Sep 17 00:00:00 2001 From: Udit Agarwal Date: 
Fri, 7 Jun 2024 14:57:14 -0700 Subject: [PATCH 50/55] [SYCL] Restrict `sycl::vec` and swizzle operations to types mentioned in the SPEC (#13947) Follow-up of and blocked by: https://github.com/intel/llvm/pull/13945 For `vec`, we only allow math operations that are valid on `std::byte` itself. (https://en.cppreference.com/w/cpp/types/byte) --- .../sycl/detail/generic_type_traits.hpp | 10 + sycl/include/sycl/detail/vector_arith.hpp | 409 ++++++++++++++++++ sycl/include/sycl/vector_preview.hpp | 403 ++++------------- sycl/test-e2e/Basic/vector/byte.cpp | 30 ++ 4 files changed, 536 insertions(+), 316 deletions(-) create mode 100644 sycl/include/sycl/detail/vector_arith.hpp diff --git a/sycl/include/sycl/detail/generic_type_traits.hpp b/sycl/include/sycl/detail/generic_type_traits.hpp index 3b0ce7988f576..a58493877c3c4 100644 --- a/sycl/include/sycl/detail/generic_type_traits.hpp +++ b/sycl/include/sycl/detail/generic_type_traits.hpp @@ -252,6 +252,16 @@ inline constexpr bool is_genfloatptr_marray_v = (IsDecorated == access::decorated::yes || IsDecorated == access::decorated::no); +template +using is_byte = typename +#if (!defined(_HAS_STD_BYTE) || _HAS_STD_BYTE != 0) + std::is_same; +#else + std::false_type; +#endif + +template inline constexpr bool is_byte_v = is_byte::value; + template using make_floating_point_t = make_type_t; diff --git a/sycl/include/sycl/detail/vector_arith.hpp b/sycl/include/sycl/detail/vector_arith.hpp new file mode 100644 index 0000000000000..fb92a77389d7c --- /dev/null +++ b/sycl/include/sycl/detail/vector_arith.hpp @@ -0,0 +1,409 @@ +//=== vector_arith.hpp --- Implementation of arithmetic ops on sycl::vec ===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include // for half, cl_char, cl_int +#include // for is_sigeninteger, is_s... +#include // for is_contained +#include // for is_floating_point + +#include // bfloat16 + +#include +#include // for enable_if_t, is_same + +namespace sycl { +inline namespace _V1 { + +template class vec; + +namespace detail { + +template class VecAccess; + +// Element type for relational operator return value. +template +using rel_t = typename std::conditional_t< + sizeof(DataT) == sizeof(opencl::cl_char), opencl::cl_char, + typename std::conditional_t< + sizeof(DataT) == sizeof(opencl::cl_short), opencl::cl_short, + typename std::conditional_t< + sizeof(DataT) == sizeof(opencl::cl_int), opencl::cl_int, + typename std::conditional_t>>>; + +// Macros to populate binary operation on sycl::vec. +#if defined(__SYCL_BINOP) || defined(BINOP_BASE) +#error "Undefine __SYCL_BINOP and BINOP_BASE macro" +#endif + +#ifdef __SYCL_DEVICE_ONLY__ +#define BINOP_BASE(BINOP, OPASSIGN, CONVERT, COND) \ + template \ + friend std::enable_if_t<(COND), vec_t> operator BINOP(const vec_t & Lhs, \ + const vec_t & Rhs) { \ + vec_t Ret; \ + if constexpr (vec_t::IsUsingArrayOnDevice) { \ + for (size_t I = 0; I < NumElements; ++I) { \ + detail::VecAccess::setValue( \ + Ret, I, \ + (detail::VecAccess::getValue(Lhs, I) \ + BINOP detail::VecAccess::getValue(Rhs, I))); \ + } \ + } else { \ + Ret.m_Data = Lhs.m_Data BINOP Rhs.m_Data; \ + if constexpr (std::is_same_v && CONVERT) { \ + Ret.ConvertToDataT(); \ + } \ + } \ + return Ret; \ + } +#else // __SYCL_DEVICE_ONLY__ + +#define BINOP_BASE(BINOP, OPASSIGN, CONVERT, COND) \ + template \ + friend std::enable_if_t<(COND), vec_t> operator BINOP(const vec_t & Lhs, \ + const vec_t & Rhs) { \ + vec_t Ret{}; \ + for (size_t I = 0; I < NumElements; ++I) \ + detail::VecAccess::setValue( \ + Ret, I, \ + 
(DataT)(vec_data::get( \ + detail::VecAccess::getValue(Lhs, I)) \ + BINOP vec_data::get( \ + detail::VecAccess::getValue(Rhs, I)))); \ + return Ret; \ + } +#endif // __SYCL_DEVICE_ONLY__ + +#define __SYCL_BINOP(BINOP, OPASSIGN, CONVERT, COND) \ + BINOP_BASE(BINOP, OPASSIGN, CONVERT, COND) \ + \ + template \ + friend std::enable_if_t<(COND), vec_t> operator BINOP(const vec_t & Lhs, \ + const DataT & Rhs) { \ + return Lhs BINOP vec_t(Rhs); \ + } \ + template \ + friend std::enable_if_t<(COND), vec_t> operator BINOP(const DataT & Lhs, \ + const vec_t & Rhs) { \ + return vec_t(Lhs) BINOP Rhs; \ + } \ + template \ + friend std::enable_if_t<(COND), vec_t> &operator OPASSIGN( \ + vec_t & Lhs, const vec_t & Rhs) { \ + Lhs = Lhs BINOP Rhs; \ + return Lhs; \ + } \ + template \ + friend std::enable_if_t<(Num != 1) && (COND), vec_t &> operator OPASSIGN( \ + vec_t & Lhs, const DataT & Rhs) { \ + Lhs = Lhs BINOP vec_t(Rhs); \ + return Lhs; \ + } + +/**************************************************************** + * vec_arith_common + * / | \ + * / | \ + * vec_arith vec_arith ... vec_arith + * \ | / + * \ | / + * sycl::vec + * + * vec_arith_common is the base class for vec_arith. It contains + * the common math operators of sycl::vec for all types. + * vec_arith is the derived class that contains the math operators + * specialized for certain types. sycl::vec inherits from vec_arith. + * *************************************************************/ +template class vec_arith_common; +template struct vec_helper; + +template +class vec_arith : public vec_arith_common { +protected: + using vec_t = vec; + using ocl_t = rel_t; + template using vec_data = vec_helper; + + // operator!. 
+ friend vec, NumElements> operator!(const vec_t &Rhs) { + if constexpr (vec_t::IsUsingArrayOnDevice || vec_t::IsUsingArrayOnHost) { + vec_t Ret{}; + for (size_t I = 0; I < NumElements; ++I) { + detail::VecAccess::setValue( + Ret, I, + !vec_data::get(detail::VecAccess::getValue(Rhs, I))); + } + return Ret.template as, NumElements>>(); + } else { + return vec_t{(typename vec::DataType) !Rhs.m_Data} + .template as, NumElements>>(); + } + } + + // operator +. + friend vec_t operator+(const vec_t &Lhs) { + if constexpr (vec_t::IsUsingArrayOnDevice || vec_t::IsUsingArrayOnHost) { + vec_t Ret{}; + for (size_t I = 0; I < NumElements; ++I) + detail::VecAccess::setValue( + Ret, I, + vec_data::get(+vec_data::get( + detail::VecAccess::getValue(Lhs, I)))); + return Ret; + } else { + return vec_t{+Lhs.m_Data}; + } + } + + // operator -. + friend vec_t operator-(const vec_t &Lhs) { + namespace oneapi = sycl::ext::oneapi; + vec_t Ret{}; + if constexpr (vec_t::IsBfloat16 && NumElements == 1) { + oneapi::bfloat16 v = oneapi::detail::bitsToBfloat16(Lhs.m_Data); + oneapi::bfloat16 w = -v; + Ret.m_Data = oneapi::detail::bfloat16ToBits(w); + } else if constexpr (vec_t::IsBfloat16) { + for (size_t I = 0; I < NumElements; I++) { + oneapi::bfloat16 v = oneapi::detail::bitsToBfloat16(Lhs.m_Data[I]); + oneapi::bfloat16 w = -v; + Ret.m_Data[I] = oneapi::detail::bfloat16ToBits(w); + } + } else if constexpr (vec_t::IsUsingArrayOnDevice || + vec_t::IsUsingArrayOnHost) { + for (size_t I = 0; I < NumElements; ++I) + detail::VecAccess::setValue( + Ret, I, + vec_data::get(-vec_data::get( + detail::VecAccess::getValue(Lhs, I)))); + return Ret; + } else { + Ret = vec_t{-Lhs.m_Data}; + if constexpr (std::is_same_v) { + Ret.ConvertToDataT(); + } + return Ret; + } + } + +// Unary operations on sycl::vec +#ifdef __SYCL_UOP +#error "Undefine __SYCL_UOP macro" +#endif +#define __SYCL_UOP(UOP, OPASSIGN) \ + friend vec_t &operator UOP(vec_t & Rhs) { \ + Rhs OPASSIGN vec_data::get(1); \ + return Rhs; \ + } \ 
+ friend vec_t operator UOP(vec_t &Lhs, int) { \ + vec_t Ret(Lhs); \ + Lhs OPASSIGN vec_data::get(1); \ + return Ret; \ + } + + __SYCL_UOP(++, +=) + __SYCL_UOP(--, -=) +#undef __SYCL_UOP + + // The logical operations on scalar types results in 0/1, while for vec<>, + // logical operations should result in 0 and -1 (similar to OpenCL vectors). + // That's why, for vec, we need to invert the result of the logical + // operations since we store vec as scalar type on the device. +#if defined(__SYCL_RELLOGOP) || defined(RELLOGOP_BASE) +#error "Undefine __SYCL_RELLOGOP and RELLOGOP_BASE macro." +#endif + +#ifdef __SYCL_DEVICE_ONLY__ +#define RELLOGOP_BASE(RELLOGOP, COND) \ + template \ + friend std::enable_if_t<(COND), vec> operator RELLOGOP( \ + const vec_t & Lhs, const vec_t & Rhs) { \ + vec Ret{}; \ + /* This special case is needed since there are no standard operator|| */ \ + /* or operator&& functions for std::array. */ \ + if constexpr (vec_t::IsUsingArrayOnDevice && \ + (std::string_view(#RELLOGOP) == "||" || \ + std::string_view(#RELLOGOP) == "&&")) { \ + for (size_t I = 0; I < NumElements; ++I) { \ + /* We cannot use SetValue here as the operator is not a friend of*/ \ + /* Ret on Windows. */ \ + Ret[I] = static_cast( \ + -(vec_data::get(detail::VecAccess::getValue(Lhs, I)) \ + RELLOGOP vec_data::get( \ + detail::VecAccess::getValue(Rhs, I)))); \ + } \ + } else { \ + Ret = vec( \ + (typename vec::vector_t)( \ + Lhs.m_Data RELLOGOP Rhs.m_Data)); \ + if (NumElements == 1) /*Scalar 0/1 logic was applied, invert*/ \ + Ret *= -1; \ + } \ + return Ret; \ + } +#else // __SYCL_DEVICE_ONLY__ +#define RELLOGOP_BASE(RELLOGOP, COND) \ + template \ + friend std::enable_if_t<(COND), vec> operator RELLOGOP( \ + const vec_t & Lhs, const vec_t & Rhs) { \ + vec Ret{}; \ + for (size_t I = 0; I < NumElements; ++I) { \ + /* We cannot use SetValue here as the operator is not a friend of*/ \ + /* Ret on Windows. 
*/ \ + Ret[I] = static_cast( \ + -(vec_data::get(detail::VecAccess::getValue(Lhs, I)) \ + RELLOGOP vec_data::get( \ + detail::VecAccess::getValue(Rhs, I)))); \ + } \ + return Ret; \ + } +#endif + +#define __SYCL_RELLOGOP(RELLOGOP, COND) \ + RELLOGOP_BASE(RELLOGOP, COND) \ + \ + template \ + friend std::enable_if_t<(COND), vec> operator RELLOGOP( \ + const vec_t & Lhs, const DataT & Rhs) { \ + return Lhs RELLOGOP vec_t(Rhs); \ + } \ + template \ + friend std::enable_if_t<(COND), vec> operator RELLOGOP( \ + const DataT & Lhs, const vec_t & Rhs) { \ + return vec_t(Lhs) RELLOGOP Rhs; \ + } + + // OP is: ==, !=, <, >, <=, >=, &&, || + // vec operatorOP(const vec &Rhs) const; + // vec operatorOP(const DataT &Rhs) const; + __SYCL_RELLOGOP(==, true) + __SYCL_RELLOGOP(!=, true) + __SYCL_RELLOGOP(>, true) + __SYCL_RELLOGOP(<, true) + __SYCL_RELLOGOP(>=, true) + __SYCL_RELLOGOP(<=, true) + + // Only available to integral types. + __SYCL_RELLOGOP(&&, (!detail::is_vgenfloat_v)) + __SYCL_RELLOGOP(||, (!detail::is_vgenfloat_v)) +#undef __SYCL_RELLOGOP +#undef RELLOGOP_BASE + + // Binary operations on sycl::vec<> for all types except std::byte. + __SYCL_BINOP(+, +=, true, true) + __SYCL_BINOP(-, -=, true, true) + __SYCL_BINOP(*, *=, false, true) + __SYCL_BINOP(/, /=, false, true) + + // The following OPs are available only when: DataT != cl_float && + // DataT != cl_double && DataT != cl_half && DataT != BF16. + __SYCL_BINOP(%, %=, false, (!detail::is_vgenfloat_v)) + // Bitwise operations are allowed for std::byte. 
+ __SYCL_BINOP(|, |=, false, (!detail::is_vgenfloat_v)) + __SYCL_BINOP(&, &=, false, (!detail::is_vgenfloat_v)) + __SYCL_BINOP(^, ^=, false, (!detail::is_vgenfloat_v)) + __SYCL_BINOP(>>, >>=, false, (!detail::is_vgenfloat_v)) + __SYCL_BINOP(<<, <<=, true, (!detail::is_vgenfloat_v)) + + // friends + template friend class vec; +}; // class vec_arith<> + +#if (!defined(_HAS_STD_BYTE) || _HAS_STD_BYTE != 0) +template +class vec_arith + : public vec_arith_common { +protected: + // NumElements can never be zero. Still using the redundant check to avoid + // incomplete type errors. + using DataT = typename std::conditional_t; + using vec_t = vec; + template using vec_data = vec_helper; + + // Special <<, >> operators for std::byte. + // std::byte is not an arithmetic type and it only supports the following + // overloads of >> and << operators. + // + // 1 template + // constexpr std::byte operator<<( std::byte b, IntegerType shift ) + // noexcept; + friend vec_t operator<<(const vec_t &Lhs, int shift) { + vec_t Ret; + for (size_t I = 0; I < NumElements; ++I) { + Ret[I] = Lhs[I] << shift; + } + return Ret; + } + friend vec_t &operator<<=(vec_t &Lhs, int shift) { + Lhs = Lhs << shift; + return Lhs; + } + + // 2 template + // constexpr std::byte operator>>( std::byte b, IntegerType shift ) + // noexcept; + friend vec_t operator>>(const vec_t &Lhs, int shift) { + vec_t Ret; + for (size_t I = 0; I < NumElements; ++I) { + Ret[I] = Lhs[I] >> shift; + } + return Ret; + } + friend vec_t &operator>>=(vec_t &Lhs, int shift) { + Lhs = Lhs >> shift; + return Lhs; + } + + __SYCL_BINOP(|, |=, false, true) + __SYCL_BINOP(&, &=, false, true) + __SYCL_BINOP(^, ^=, false, true) + + // friends + template friend class vec; +}; +#endif // (!defined(_HAS_STD_BYTE) || _HAS_STD_BYTE != 0) + +template class vec_arith_common { +protected: + using vec_t = vec; + + // operator~() available only when: dataT != float && dataT != double + // && dataT != half + template + friend std::enable_if_t, vec_t> 
+ operator~(const vec_t &Rhs) { + if constexpr (vec_t::IsUsingArrayOnDevice || vec_t::IsUsingArrayOnHost) { + vec_t Ret{}; + for (size_t I = 0; I < NumElements; ++I) { + detail::VecAccess::setValue( + Ret, I, ~detail::VecAccess::getValue(Rhs, I)); + } + return Ret; + } else { + vec_t Ret{(typename vec_t::DataType) ~Rhs.m_Data}; + if constexpr (std::is_same_v) { + Ret.ConvertToDataT(); + } + return Ret; + } + } + + // friends + template friend class vec; +}; + +#undef __SYCL_BINOP +#undef BINOP_BASE + +} // namespace detail +} // namespace _V1 +} // namespace sycl diff --git a/sycl/include/sycl/vector_preview.hpp b/sycl/include/sycl/vector_preview.hpp index f1bf7fcfcc24d..f70db78e7959a 100644 --- a/sycl/include/sycl/vector_preview.hpp +++ b/sycl/include/sycl/vector_preview.hpp @@ -39,6 +39,7 @@ #include // for memcpy #include // for is_contained #include // for is_floating_point +#include #include // for convertImpl #include // for vector_alignment #include // for StorageT, half, Vec16... @@ -161,18 +162,6 @@ class SwizzleOp; template struct VecStorage; -// Element type for relational operator return value. -template -using rel_t = typename std::conditional_t< - sizeof(DataT) == sizeof(opencl::cl_char), opencl::cl_char, - typename std::conditional_t< - sizeof(DataT) == sizeof(opencl::cl_short), opencl::cl_short, - typename std::conditional_t< - sizeof(DataT) == sizeof(opencl::cl_int), opencl::cl_int, - typename std::conditional_t>>>; - // Special type indicating that SwizzleOp should just read value from vector - // not trying to perform any operations. Should not be called. template class GetOp { @@ -346,6 +335,26 @@ __SYCL_DEFINE_BF16_VECSTORAGE(4) __SYCL_DEFINE_BF16_VECSTORAGE(8) __SYCL_DEFINE_BF16_VECSTORAGE(16) #undef __SYCL_DEFINE_BF16_VECSTORAGE + +// FIXME: Remove this class after eliminating setValue() and getValue() +// dependencies from math operations on sycl::vec. 
+// This class is a friend of sycl::vec and exposes getValue/setValue +// that are used by sycl::vec math operations. +template class VecAccess { +public: + template + constexpr static void setValue(VecT &v, int Index, const DataT &Value) { + if (N == 1) + v.setValue(Index, Value, 0); + else + v.setValue(Index, Value, 0.f); + } + + template + static DataT getValue(VecT v, int Index) { + return (N == 1) ? v.getValue(Index, 0) : v.getValue(Index, 0.f); + } +}; } // namespace detail template using vec_data = detail::vec_helper; @@ -358,7 +367,8 @@ using vec_data_t = typename detail::vec_helper::RetType; /// SYCL devices as well as in host C++ code. /// /// \ingroup sycl_api -template class vec { +template +class vec : public detail::vec_arith { using DataT = Type; // This represent type of underlying value. There should be only one field @@ -983,278 +993,6 @@ template class vec { } } -#ifdef __SYCL_BINOP -#error "Undefine __SYCL_BINOP macro" -#endif - -#ifdef __SYCL_USE_EXT_VECTOR_TYPE__ -#define __SYCL_BINOP(BINOP, OPASSIGN, CONVERT) \ - friend vec operator BINOP(const vec &Lhs, const vec &Rhs) { \ - vec Ret; \ - if constexpr (IsUsingArrayOnDevice) { \ - for (size_t I = 0; I < NumElements; ++I) { \ - Ret.setValue(I, (Lhs.getValue(I) BINOP Rhs.getValue(I))); \ - } \ - } else { \ - Ret.m_Data = Lhs.m_Data BINOP Rhs.m_Data; \ - if constexpr (std::is_same_v && CONVERT) { \ - Ret.ConvertToDataT(); \ - } \ - } \ - return Ret; \ - } \ - friend vec operator BINOP(const vec &Lhs, const DataT &Rhs) { \ - return Lhs BINOP vec(Rhs); \ - } \ - friend vec operator BINOP(const DataT &Lhs, const vec &Rhs) { \ - return vec(Lhs) BINOP Rhs; \ - } \ - friend vec &operator OPASSIGN(vec & Lhs, const vec & Rhs) { \ - Lhs = Lhs BINOP Rhs; \ - return Lhs; \ - } \ - template \ - friend typename std::enable_if_t operator OPASSIGN( \ - vec & Lhs, const DataT & Rhs) { \ - Lhs = Lhs BINOP vec(Rhs); \ - return Lhs; \ - } - -#else // __SYCL_USE_EXT_VECTOR_TYPE__ - -#define __SYCL_BINOP(BINOP, 
OPASSIGN, CONVERT) \ - friend vec operator BINOP(const vec &Lhs, const vec &Rhs) { \ - vec Ret{}; \ - if constexpr (NativeVec) \ - Ret.m_Data = Lhs.m_Data BINOP Rhs.m_Data; \ - else \ - for (size_t I = 0; I < NumElements; ++I) \ - Ret.setValue(I, (DataT)(vec_data::get(Lhs.getValue( \ - I)) BINOP vec_data::get(Rhs.getValue(I)))); \ - return Ret; \ - } \ - friend vec operator BINOP(const vec &Lhs, const DataT &Rhs) { \ - return Lhs BINOP vec(Rhs); \ - } \ - friend vec operator BINOP(const DataT &Lhs, const vec &Rhs) { \ - return vec(Lhs) BINOP Rhs; \ - } \ - friend vec &operator OPASSIGN(vec & Lhs, const vec & Rhs) { \ - Lhs = Lhs BINOP Rhs; \ - return Lhs; \ - } \ - template \ - friend typename std::enable_if_t operator OPASSIGN( \ - vec & Lhs, const DataT & Rhs) { \ - Lhs = Lhs BINOP vec(Rhs); \ - return Lhs; \ - } - -#endif // __SYCL_USE_EXT_VECTOR_TYPE__ - - __SYCL_BINOP(+, +=, true) - __SYCL_BINOP(-, -=, true) - __SYCL_BINOP(*, *=, false) - __SYCL_BINOP(/, /=, false) - - // TODO: The following OPs are available only when: DataT != cl_float && - // DataT != cl_double && DataT != cl_half - __SYCL_BINOP(%, %=, false) - __SYCL_BINOP(|, |=, false) - __SYCL_BINOP(&, &=, false) - __SYCL_BINOP(^, ^=, false) - __SYCL_BINOP(>>, >>=, false) - __SYCL_BINOP(<<, <<=, true) -#undef __SYCL_BINOP -#undef __SYCL_BINOP_HELP - - // Note: vec<>/SwizzleOp logical value is 0/-1 logic, as opposed to 0/1 logic. - // As far as CTS validation is concerned, 0/-1 logic also applies when - // NumElements is equal to one, which is somewhat inconsistent with being - // transparent with scalar data. - // TODO: Determine if vec<, NumElements=1> is needed at all, remove this - // inconsistency if not by disallowing one-element vectors (as in OpenCL) - -#ifdef __SYCL_RELLOGOP -#error "Undefine __SYCL_RELLOGOP macro" -#endif -// Use __SYCL_DEVICE_ONLY__ macro because cast to OpenCL vector type is defined -// by SYCL device compiler only. 
-#ifdef __SYCL_DEVICE_ONLY__ -#define __SYCL_RELLOGOP(RELLOGOP) \ - friend vec operator RELLOGOP(const vec & Lhs, \ - const vec & Rhs) { \ - vec Ret{}; \ - /* This special case is needed since there are no standard operator|| */ \ - /* or operator&& functions for std::array. */ \ - if constexpr (IsUsingArrayOnDevice && \ - (std::string_view(#RELLOGOP) == "||" || \ - std::string_view(#RELLOGOP) == "&&")) { \ - for (size_t I = 0; I < NumElements; ++I) { \ - /* We cannot use SetValue here as the operator is not a friend of*/ \ - /* Ret on Windows. */ \ - Ret[I] = static_cast(-(vec_data::get( \ - Lhs.getValue(I)) RELLOGOP vec_data::get(Rhs.getValue(I)))); \ - } \ - } else { \ - Ret = vec( \ - (typename vec::vector_t)( \ - Lhs.m_Data RELLOGOP Rhs.m_Data)); \ - if (NumElements == 1) /*Scalar 0/1 logic was applied, invert*/ \ - Ret *= -1; \ - } \ - return Ret; \ - } \ - friend vec operator RELLOGOP(const vec & Lhs, \ - const DataT & Rhs) { \ - return Lhs RELLOGOP vec(Rhs); \ - } \ - friend vec operator RELLOGOP(const DataT & Lhs, \ - const vec & Rhs) { \ - return vec(Lhs) RELLOGOP Rhs; \ - } - -#else -#define __SYCL_RELLOGOP(RELLOGOP) \ - friend vec operator RELLOGOP(const vec & Lhs, \ - const vec & Rhs) { \ - vec Ret{}; \ - for (size_t I = 0; I < NumElements; ++I) { \ - /* We cannot use SetValue here as the operator is not a friend of*/ \ - /* Ret on Windows. */ \ - Ret[I] = static_cast(-(vec_data::get( \ - Lhs.getValue(I)) RELLOGOP vec_data::get(Rhs.getValue(I)))); \ - } \ - return Ret; \ - } \ - friend vec operator RELLOGOP(const vec & Lhs, \ - const DataT & Rhs) { \ - return Lhs RELLOGOP vec(Rhs); \ - } \ - friend vec operator RELLOGOP(const DataT & Lhs, \ - const vec & Rhs) { \ - return vec(Lhs) RELLOGOP Rhs; \ - } -#endif - - __SYCL_RELLOGOP(==) - __SYCL_RELLOGOP(!=) - __SYCL_RELLOGOP(>) - __SYCL_RELLOGOP(<) - __SYCL_RELLOGOP(>=) - __SYCL_RELLOGOP(<=) - // TODO: limit to integral types. 
- __SYCL_RELLOGOP(&&) - __SYCL_RELLOGOP(||) -#undef __SYCL_RELLOGOP - -#ifdef __SYCL_UOP -#error "Undefine __SYCL_UOP macro" -#endif -#define __SYCL_UOP(UOP, OPASSIGN) \ - friend vec &operator UOP(vec & Rhs) { \ - Rhs OPASSIGN vec_data::get(1); \ - return Rhs; \ - } \ - friend vec operator UOP(vec &Lhs, int) { \ - vec Ret(Lhs); \ - Lhs OPASSIGN vec_data::get(1); \ - return Ret; \ - } - - __SYCL_UOP(++, +=) - __SYCL_UOP(--, -=) -#undef __SYCL_UOP - - // operator~() available only when: dataT != float && dataT != double - // && dataT != half - friend vec operator~(const vec &Rhs) { - if constexpr (IsUsingArrayOnDevice || IsUsingArrayOnHost) { - vec Ret{}; - for (size_t I = 0; I < NumElements; ++I) { - Ret.setValue(I, ~Rhs.getValue(I)); - } - return Ret; - } else { - vec Ret{(typename vec::DataType) ~Rhs.m_Data}; - if constexpr (std::is_same_v) { - Ret.ConvertToDataT(); - } - return Ret; - } - } - - // operator! - friend vec, NumElements> operator!(const vec &Rhs) { - if constexpr (IsUsingArrayOnDevice || IsUsingArrayOnHost) { - vec Ret{}; - for (size_t I = 0; I < NumElements; ++I) { -#if (!defined(_HAS_STD_BYTE) || _HAS_STD_BYTE != 0) - // std::byte neither supports ! unary op or casting, so special handling - // is needed. And, worse, Windows has a conflict with 'byte'. 
- if constexpr (std::is_same_v) { - Ret.setValue(I, std::byte{!vec_data::get(Rhs.getValue(I))}); - } else -#endif // (!defined(_HAS_STD_BYTE) || _HAS_STD_BYTE != 0) - { - Ret.setValue(I, !vec_data::get(Rhs.getValue(I))); - } - } - return Ret.template as, NumElements>>(); - } else { - return vec{(typename vec::DataType) !Rhs.m_Data} - .template as, NumElements>>(); - } - } - - // operator + - friend vec operator+(const vec &Lhs) { - if constexpr (IsUsingArrayOnDevice || IsUsingArrayOnHost) { - vec Ret{}; - for (size_t I = 0; I < NumElements; ++I) - Ret.setValue( - I, vec_data::get(+vec_data::get(Lhs.getValue(I)))); - return Ret; - } else { - return vec{+Lhs.m_Data}; - } - } - - // operator - - friend vec operator-(const vec &Lhs) { - namespace oneapi = sycl::ext::oneapi; - vec Ret{}; - if constexpr (IsBfloat16 && NumElements == 1) { - oneapi::bfloat16 v = oneapi::detail::bitsToBfloat16(Lhs.m_Data); - oneapi::bfloat16 w = -v; - Ret.m_Data = oneapi::detail::bfloat16ToBits(w); - } else if constexpr (IsBfloat16) { - for (size_t I = 0; I < NumElements; I++) { - oneapi::bfloat16 v = oneapi::detail::bitsToBfloat16(Lhs.m_Data[I]); - oneapi::bfloat16 w = -v; - Ret.m_Data[I] = oneapi::detail::bfloat16ToBits(w); - } - } else if constexpr (IsUsingArrayOnDevice || IsUsingArrayOnHost) { - for (size_t I = 0; I < NumElements; ++I) - Ret.setValue( - I, vec_data::get(-vec_data::get(Lhs.getValue(I)))); - return Ret; - } else { - Ret = vec{-Lhs.m_Data}; - if constexpr (std::is_same_v) { - Ret.ConvertToDataT(); - } - return Ret; - } - } - - // OP is: &&, || - // vec operatorOP(const vec &Rhs) const; - // vec operatorOP(const DataT &Rhs) const; - - // OP is: ==, !=, <, >, <=, >= - // vec operatorOP(const vec &Rhs) const; - // vec operatorOP(const DataT &Rhs) const; private: // Generic method that execute "Operation" on underlying values. @@ -1364,7 +1102,6 @@ template class vec { } // fields - // Alignment is the same as size, to a maximum size of 64. 
// detail::vector_alignment will return that value. alignas(detail::vector_alignment::value) DataType m_Data; @@ -1374,6 +1111,10 @@ template class vec { int... T5> friend class detail::SwizzleOp; template friend class vec; + // To allow arithmetic operators access private members of vec. + template friend class detail::vec_arith; + template friend class detail::vec_arith_common; + template friend class detail::VecAccess; }; ///////////////////////// class sycl::vec ///////////////////////// @@ -1661,7 +1402,7 @@ class SwizzleOp { template friend typename std::enable_if_t< - std::is_same_v && std::is_integral_v>, vec_t> + std::is_same_v && !detail::is_vgenfloat_v, vec_t> operator~(const SwizzleOp &Rhs) { vec_t Tmp = Rhs; return ~Tmp; @@ -1688,34 +1429,57 @@ class SwizzleOp { #ifdef __SYCL_BINOP #error "Undefine __SYCL_BINOP macro" #endif -#define __SYCL_BINOP(BINOP) \ - friend vec_t operator BINOP(const DataT &Lhs, const SwizzleOp &Rhs) { \ +#define __SYCL_BINOP(BINOP, COND) \ + template \ + friend std::enable_if_t<(COND), vec_t> operator BINOP( \ + const DataT & Lhs, const SwizzleOp & Rhs) { \ vec_t Tmp = Rhs; \ return Lhs BINOP Tmp; \ } \ - friend vec_t operator BINOP(const SwizzleOp &Lhs, const DataT &Rhs) { \ + template \ + friend std::enable_if_t<(COND), vec_t> operator BINOP(const SwizzleOp & Lhs, \ + const DataT & Rhs) { \ vec_t Tmp = Lhs; \ return Tmp BINOP Rhs; \ } \ - friend vec_t operator BINOP(const vec_t &Lhs, const SwizzleOp &Rhs) { \ + template \ + friend std::enable_if_t<(COND), vec_t> operator BINOP( \ + const vec_t & Lhs, const SwizzleOp & Rhs) { \ vec_t Tmp = Rhs; \ return Lhs BINOP Tmp; \ } \ - friend vec_t operator BINOP(const SwizzleOp &Lhs, const vec_t &Rhs) { \ + template \ + friend std::enable_if_t<(COND), vec_t> operator BINOP(const SwizzleOp & Lhs, \ + const vec_t & Rhs) { \ vec_t Tmp = Lhs; \ return Tmp BINOP Rhs; \ } - __SYCL_BINOP(+) - __SYCL_BINOP(-) - __SYCL_BINOP(*) - __SYCL_BINOP(/) - __SYCL_BINOP(%) - __SYCL_BINOP(&) - 
__SYCL_BINOP(|) - __SYCL_BINOP(^) - __SYCL_BINOP(>>) - __SYCL_BINOP(<<) + __SYCL_BINOP(+, (!detail::is_byte_v)) + __SYCL_BINOP(-, (!detail::is_byte_v)) + __SYCL_BINOP(*, (!detail::is_byte_v)) + __SYCL_BINOP(/, (!detail::is_byte_v)) + __SYCL_BINOP(%, (!detail::is_byte_v)) + __SYCL_BINOP(&, true) + __SYCL_BINOP(|, true) + __SYCL_BINOP(^, true) + // We have special <<, >> operators for std::byte. + __SYCL_BINOP(>>, (!detail::is_byte_v)) + __SYCL_BINOP(<<, (!detail::is_byte_v)) + + template + friend std::enable_if_t, vec_t> + operator>>(const SwizzleOp &Lhs, const int shift) { + vec_t Tmp = Lhs; + return Tmp >> shift; + } + + template + friend std::enable_if_t, vec_t> + operator<<(const SwizzleOp &Lhs, const int shift) { + vec_t Tmp = Lhs; + return Tmp << shift; + } #undef __SYCL_BINOP // scalar RELLOGOP vec<> @@ -1724,33 +1488,40 @@ class SwizzleOp { #ifdef __SYCL_RELLOGOP #error "Undefine __SYCL_RELLOGOP macro" #endif -#define __SYCL_RELLOGOP(RELLOGOP) \ - friend vec_rel_t operator RELLOGOP(const DataT &Lhs, const SwizzleOp &Rhs) { \ +#define __SYCL_RELLOGOP(RELLOGOP, COND) \ + template \ + friend std::enable_if_t<(COND), vec_rel_t> operator RELLOGOP( \ + const DataT & Lhs, const SwizzleOp & Rhs) { \ vec_t Tmp = Rhs; \ return Lhs RELLOGOP Tmp; \ } \ - friend vec_rel_t operator RELLOGOP(const SwizzleOp &Lhs, const DataT &Rhs) { \ + template \ + friend std::enable_if_t<(COND), vec_rel_t> operator RELLOGOP( \ + const SwizzleOp & Lhs, const DataT & Rhs) { \ vec_t Tmp = Lhs; \ return Tmp RELLOGOP Rhs; \ } \ - friend vec_rel_t operator RELLOGOP(const vec_t &Lhs, const SwizzleOp &Rhs) { \ + template \ + friend std::enable_if_t<(COND), vec_rel_t> operator RELLOGOP( \ + const vec_t & Lhs, const SwizzleOp & Rhs) { \ vec_t Tmp = Rhs; \ return Lhs RELLOGOP Tmp; \ } \ - friend vec_rel_t operator RELLOGOP(const SwizzleOp &Lhs, const vec_t &Rhs) { \ + template \ + friend std::enable_if_t<(COND), vec_rel_t> operator RELLOGOP( \ + const SwizzleOp & Lhs, const vec_t & Rhs) { \ vec_t 
Tmp = Lhs; \ return Tmp RELLOGOP Rhs; \ } - __SYCL_RELLOGOP(==) - __SYCL_RELLOGOP(!=) - __SYCL_RELLOGOP(>) - __SYCL_RELLOGOP(<) - __SYCL_RELLOGOP(>=) - __SYCL_RELLOGOP(<=) - // TODO: limit to integral types. - __SYCL_RELLOGOP(&&) - __SYCL_RELLOGOP(||) + __SYCL_RELLOGOP(==, (!detail::is_byte_v)) + __SYCL_RELLOGOP(!=, (!detail::is_byte_v)) + __SYCL_RELLOGOP(>, (!detail::is_byte_v)) + __SYCL_RELLOGOP(<, (!detail::is_byte_v)) + __SYCL_RELLOGOP(>=, (!detail::is_byte_v)) + __SYCL_RELLOGOP(<=, (!detail::is_byte_v)) + __SYCL_RELLOGOP(&&, (!detail::is_byte_v && !detail::is_vgenfloat_v)) + __SYCL_RELLOGOP(||, (!detail::is_byte_v && !detail::is_vgenfloat_v)) #undef __SYCL_RELLOGOP template should not support artithmetic operations. In the // new implementation of vec<> class, the following will be removed. +#ifndef __INTEL_PREVIEW_BREAKING_CHANGES { // binary op for 2 vec auto vop = VecByte3A + VecByte3B; @@ -352,6 +353,35 @@ int main() { auto bitv2 = !VecByte4A; } +#else + { + // std::byte is not an arithmetic type and it only supports the following + // overloads of >> and << operators. 
+ // + // 1 template + // constexpr std::byte operator<<( std::byte b, IntegerType shift ) + // noexcept; + // 2 template + // constexpr std::byte operator>>( std::byte b, IntegerType shift ) + // noexcept; + auto VecByte3Shift = VecByte3A << 3; + assert(VecByte3Shift[0] == VecByte3A[0] << 3 && + VecByte3Shift[1] == VecByte3A[1] << 3 && + VecByte3Shift[2] == VecByte3A[2] << 3); + + VecByte3Shift = VecByte3A >> 1; + assert(VecByte3Shift[0] == VecByte3A[0] >> 1 && + VecByte3Shift[1] == VecByte3A[1] >> 1 && + VecByte3Shift[2] == VecByte3A[2] >> 1); + + auto SwizByte2Shift = VecByte4A.lo(); + using VecType = sycl::vec; + auto SwizShiftRight = (VecType)(SwizByte2Shift >> 3); + auto SwizShiftLeft = (VecType)(SwizByte2Shift << 3); + assert(SwizShiftRight[0] == SwizByte2Shift[0] >> 3 && + SwizShiftLeft[1] == SwizByte2Shift[1] << 3); + } +#endif // __INTEL_PREVIEW_BREAKING_CHANGES } return 0; From 2e1f14adb3bf6d9e9c55e4b0ced9e1ece2172a4a Mon Sep 17 00:00:00 2001 From: Artur Gainullin Date: Sat, 8 Jun 2024 08:33:52 -0700 Subject: [PATCH 51/55] [SYCL] Fix UB and alignment issues in the SYCL default sorter (#13975) Currently `std::byte*` scratch pointer is not aligned and `reinterpret_cast`ed as `T*` where type `T` may have alignment requirement different from `byte*`, this is UB. As a solution, use `std::align` to align the required buffer in the scratch and use placement `new` so that dynamic type of the buffer in the scratch will be `T*`. 
--- sycl/include/sycl/detail/group_sort_impl.hpp | 59 ++++++------------- .../experimental/group_helpers_sorters.hpp | 48 ++++++++++++--- 2 files changed, 59 insertions(+), 48 deletions(-) diff --git a/sycl/include/sycl/detail/group_sort_impl.hpp b/sycl/include/sycl/detail/group_sort_impl.hpp index af060edbbdc4c..d42c41890d59d 100644 --- a/sycl/include/sycl/detail/group_sort_impl.hpp +++ b/sycl/include/sycl/detail/group_sort_impl.hpp @@ -68,22 +68,10 @@ struct GetValueType> { using type = ElementType; }; -// since we couldn't assign data to raw memory, it's better to use placement -// for first assignment -template -void set_value(Acc ptr, const size_t idx, const T &val, bool is_first) { - if (is_first) { - ::new (ptr + idx) T(val); - } else { - ptr[idx] = val; - } -} - template void merge(const size_t offset, InAcc &in_acc1, OutAcc &out_acc1, const size_t start_1, const size_t end_1, const size_t end_2, - const size_t start_out, Compare comp, const size_t chunk, - bool is_first) { + const size_t start_out, Compare comp, const size_t chunk) { const size_t start_2 = end_1; // Borders of the sequences to merge within this call const size_t local_start_1 = @@ -111,8 +99,7 @@ void merge(const size_t offset, InAcc &in_acc1, OutAcc &out_acc1, const size_t l_shift_1 = local_start_1 - start_1; const size_t l_shift_2 = l_search_bound_2 - start_2; - set_value(out_acc1, start_out + l_shift_1 + l_shift_2, local_l_item_1, - is_first); + out_acc1[start_out + l_shift_1 + l_shift_2] = local_l_item_1; size_t r_search_bound_2{}; // find right border in 2nd sequence @@ -123,8 +110,7 @@ void merge(const size_t offset, InAcc &in_acc1, OutAcc &out_acc1, const auto r_shift_1 = local_end_1 - 1 - start_1; const auto r_shift_2 = r_search_bound_2 - start_2; - set_value(out_acc1, start_out + r_shift_1 + r_shift_2, local_r_item_1, - is_first); + out_acc1[start_out + r_shift_1 + r_shift_2] = local_r_item_1; } // Handle intermediate items @@ -138,8 +124,7 @@ void merge(const size_t offset, InAcc 
&in_acc1, OutAcc &out_acc1, const size_t shift_1 = idx - start_1; const size_t shift_2 = l_search_bound_2 - start_2; - set_value(out_acc1, start_out + shift_1 + shift_2, intermediate_item_1, - is_first); + out_acc1[start_out + shift_1 + shift_2] = intermediate_item_1; } } // Process 2nd sequence @@ -152,8 +137,7 @@ void merge(const size_t offset, InAcc &in_acc1, OutAcc &out_acc1, const size_t l_shift_1 = l_search_bound_1 - start_1; const size_t l_shift_2 = local_start_2 - start_2; - set_value(out_acc1, start_out + l_shift_1 + l_shift_2, local_l_item_2, - is_first); + out_acc1[start_out + l_shift_1 + l_shift_2] = local_l_item_2; size_t r_search_bound_1{}; // find right border in 1st sequence @@ -164,8 +148,7 @@ void merge(const size_t offset, InAcc &in_acc1, OutAcc &out_acc1, const size_t r_shift_1 = r_search_bound_1 - start_1; const size_t r_shift_2 = local_end_2 - 1 - start_2; - set_value(out_acc1, start_out + r_shift_1 + r_shift_2, local_r_item_2, - is_first); + out_acc1[start_out + r_shift_1 + r_shift_2] = local_r_item_2; } // Handle intermediate items @@ -179,8 +162,7 @@ void merge(const size_t offset, InAcc &in_acc1, OutAcc &out_acc1, const size_t shift_1 = l_search_bound_1 - start_1; const size_t shift_2 = idx - start_2; - set_value(out_acc1, start_out + shift_1 + shift_2, intermediate_item_2, - is_first); + out_acc1[start_out + shift_1 + shift_2] = intermediate_item_2; } } } @@ -200,10 +182,9 @@ void bubble_sort(Iter first, const size_t begin, const size_t end, } } -template +template void merge_sort(Group group, Iter first, const size_t n, Compare comp, - std::byte *scratch) { - using T = typename GetValueType::type; + T *scratch) { const size_t idx = group.get_local_linear_id(); const size_t local = group.get_local_range().size(); const size_t chunk = (n - 1) / local + 1; @@ -212,9 +193,7 @@ void merge_sort(Group group, Iter first, const size_t n, Compare comp, bubble_sort(first, idx * chunk, sycl::min((idx + 1) * chunk, n), comp); 
sycl::group_barrier(group); - T *temp = reinterpret_cast(scratch); - bool data_in_temp = false; - bool is_first = true; + bool data_in_scratch = false; size_t sorted_size = 1; while (sorted_size * chunk < n) { const size_t start_1 = @@ -223,26 +202,24 @@ void merge_sort(Group group, Iter first, const size_t n, Compare comp, const size_t end_2 = sycl::min(end_1 + sorted_size * chunk, n); const size_t offset = chunk * (idx % sorted_size); - if (!data_in_temp) { - merge(offset, first, temp, start_1, end_1, end_2, start_1, comp, chunk, - is_first); + if (!data_in_scratch) { + merge(offset, first, scratch, start_1, end_1, end_2, start_1, comp, + chunk); } else { - merge(offset, temp, first, start_1, end_1, end_2, start_1, comp, chunk, - /*is_first*/ false); + merge(offset, scratch, first, start_1, end_1, end_2, start_1, comp, + chunk); } sycl::group_barrier(group); - data_in_temp = !data_in_temp; + data_in_scratch = !data_in_scratch; sorted_size *= 2; - if (is_first) - is_first = false; } // copy back if data is in a temporary storage - if (data_in_temp) { + if (data_in_scratch) { for (size_t i = 0; i < chunk; ++i) { if (idx * chunk + i < n) { - first[idx * chunk + i] = temp[idx * chunk + i]; + first[idx * chunk + i] = scratch[idx * chunk + i]; } } sycl::group_barrier(group); diff --git a/sycl/include/sycl/ext/oneapi/experimental/group_helpers_sorters.hpp b/sycl/include/sycl/ext/oneapi/experimental/group_helpers_sorters.hpp index d33502b7e3f24..82fa4ff53e234 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/group_helpers_sorters.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/group_helpers_sorters.hpp @@ -63,10 +63,27 @@ template > class default_sorter { void operator()([[maybe_unused]] Group g, [[maybe_unused]] Ptr first, [[maybe_unused]] Ptr last) { #ifdef __SYCL_DEVICE_ONLY__ + // Adjust the scratch pointer based on alignment of the type T. 
// Per extension specification if scratch size is less than the value // returned by memory_required then behavior is undefined, so we don't check // that the scratch size statisfies the requirement. - sycl::detail::merge_sort(g, first, last - first, comp, scratch.data()); + using T = typename sycl::detail::GetValueType::type; + T *scratch_begin = nullptr; + size_t n = last - first; + // We must have a barrier here before array placement new because it is + // possible that scratch memory is already in use, so we need to synchronize + // work items. + sycl::group_barrier(g); + if (g.leader()) { + void *scratch_ptr = scratch.data(); + size_t space = scratch.size(); + scratch_ptr = std::align(alignof(T), n * sizeof(T), scratch_ptr, space); + scratch_begin = ::new (scratch_ptr) T[n]; + } + // Broadcast leader's pointer (the beginning of the scratch) to all work + // items in the group. + scratch_begin = sycl::group_broadcast(g, scratch_begin); + sycl::detail::merge_sort(g, first, n, comp, scratch_begin); #else throw sycl::exception( std::error_code(PI_ERROR_INVALID_DEVICE, sycl::sycl_category()), @@ -77,16 +94,33 @@ template > class default_sorter { template T operator()([[maybe_unused]] Group g, T val) { #ifdef __SYCL_DEVICE_ONLY__ + // Adjust the scratch pointer based on alignment of the type T. // Per extension specification if scratch size is less than the value // returned by memory_required then behavior is undefined, so we don't check // that the scratch size statisfies the requirement. 
+ T *scratch_begin = nullptr; + std::size_t local_id = g.get_local_linear_id(); auto range_size = g.get_local_range().size(); - size_t local_id = g.get_local_linear_id(); - T *temp = reinterpret_cast(scratch.data()); - ::new (temp + local_id) T(val); - sycl::detail::merge_sort(g, temp, range_size, comp, - scratch.data() + range_size * sizeof(T)); - val = temp[local_id]; + // We must have a barrier here before array placement new because it is + // possible that scratch memory is already in use, so we need to synchronize + // work items. + sycl::group_barrier(g); + if (g.leader()) { + void *scratch_ptr = scratch.data(); + size_t space = scratch.size(); + scratch_ptr = + std::align(alignof(T), /* output storage and temporary storage */ 2 * + range_size * sizeof(T), + scratch_ptr, space); + scratch_begin = ::new (scratch_ptr) T[2 * range_size]; + } + // Broadcast leader's pointer (the beginning of the scratch) to all work + // items in the group. + scratch_begin = sycl::group_broadcast(g, scratch_begin); + scratch_begin[local_id] = val; + sycl::detail::merge_sort(g, scratch_begin, range_size, comp, + scratch_begin + range_size); + val = scratch_begin[local_id]; #else throw sycl::exception( std::error_code(PI_ERROR_INVALID_DEVICE, sycl::sycl_category()), From 51a061d09013bc058a2270061aa3bfea3612cc0e Mon Sep 17 00:00:00 2001 From: David Garcia Orozco Date: Sat, 8 Jun 2024 09:43:54 -0600 Subject: [PATCH 52/55] [SYCL][E2E] Remove use of deprecated exceptions in USM e2e tests (#14098) --- sycl/test-e2e/USM/memcpy.cpp | 2 +- sycl/test-e2e/USM/memset.cpp | 2 +- sycl/test-e2e/USM/pointer_query.cpp | 2 +- sycl/test-e2e/USM/queue_wait.cpp | 4 ++-- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/sycl/test-e2e/USM/memcpy.cpp b/sycl/test-e2e/USM/memcpy.cpp index 57739533239f7..fc1029964103b 100644 --- a/sycl/test-e2e/USM/memcpy.cpp +++ b/sycl/test-e2e/USM/memcpy.cpp @@ -86,7 +86,7 @@ void check_on_device(queue q, int *arr) { [&](handler &cgh) { cgh.memcpy(nullptr, 
ARR, sizeof(int) * N); }); \ q.wait_and_throw(); \ assert(false && "Expected error from copying to nullptr"); \ - } catch (runtime_error e) { \ + } catch (exception e) { \ } \ /* Copying to nullptr should throw. */ \ q.submit([&](handler &cgh) { cgh.memcpy(nullptr, ARR, 0); }); \ diff --git a/sycl/test-e2e/USM/memset.cpp b/sycl/test-e2e/USM/memset.cpp index ff0d597e85036..dcd201677be5e 100644 --- a/sycl/test-e2e/USM/memset.cpp +++ b/sycl/test-e2e/USM/memset.cpp @@ -127,7 +127,7 @@ int main() { q.submit([&](handler &cgh) { cgh.memset(nullptr, 0, N * sizeof(char)); }); q.wait_and_throw(); assert(false && "Expected error from writing to nullptr"); - } catch (runtime_error e) { + } catch (exception e) { } // Filling to nullptr is skipped if the number of bytes to fill is 0. diff --git a/sycl/test-e2e/USM/pointer_query.cpp b/sycl/test-e2e/USM/pointer_query.cpp index c8c66cd2cfdb3..84282d4859ba8 100644 --- a/sycl/test-e2e/USM/pointer_query.cpp +++ b/sycl/test-e2e/USM/pointer_query.cpp @@ -90,7 +90,7 @@ int main() { } try { D = get_pointer_device(array, ctxt); - } catch (runtime_error) { + } catch (exception) { free(array); return 0; } diff --git a/sycl/test-e2e/USM/queue_wait.cpp b/sycl/test-e2e/USM/queue_wait.cpp index 0f0ebbf02304b..0aa3d375b4120 100644 --- a/sycl/test-e2e/USM/queue_wait.cpp +++ b/sycl/test-e2e/USM/queue_wait.cpp @@ -36,13 +36,13 @@ int main() { Q.memset(nullptr, 42, Size); Q.wait_and_throw(); assert(false && "Expected to have an exception throw instead of assert"); - } catch (runtime_error e) { + } catch (exception e) { } try { Q.memcpy(nullptr, DevArr, Size); Q.wait_and_throw(); assert(false && "Expected to have an exception throw instead of assert"); - } catch (runtime_error e) { + } catch (exception e) { } Q.memset(nullptr, 42, 0); From d6a18e1893b31fb2c67fcd1858961891a658aae7 Mon Sep 17 00:00:00 2001 From: AidanBeltonS <87009434+AidanBeltonS@users.noreply.github.com> Date: Mon, 10 Jun 2024 07:20:12 +0100 Subject: [PATCH 53/55] [SYCL][COMPAT] Fix 
memory_management_test3 (#14080) This change is intended to fix a CI failure on the OpenCL backend for the `memory_management_test3`. This is because the memory is allocated on a different q to the one it is released on. --- sycl/test-e2e/syclcompat/memory/memory_management_test3.cpp | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/sycl/test-e2e/syclcompat/memory/memory_management_test3.cpp b/sycl/test-e2e/syclcompat/memory/memory_management_test3.cpp index fa0b0c1d1ed26..c456cda333fed 100644 --- a/sycl/test-e2e/syclcompat/memory/memory_management_test3.cpp +++ b/sycl/test-e2e/syclcompat/memory/memory_management_test3.cpp @@ -29,8 +29,6 @@ // // // ===----------------------------------------------------------------------===// -// https://github.com/intel/llvm/issues/14086 -// UNSUPPORTED: gpu-intel-gen12 && linux // RUN: %clangxx -std=c++20 -fsycl -fsycl-targets=%{sycl_triple} %s -o %t.out // RUN: %{run} %t.out @@ -69,8 +67,7 @@ void test_free_memory_q() { void test_wait_and_free_memory() { std::cout << __PRETTY_FUNCTION__ << std::endl; - sycl::queue q{{sycl::property::queue::in_order()}}; - float *d_A = (float *)syclcompat::malloc(sizeof(float), q); + float *d_A = (float *)syclcompat::malloc(sizeof(float)); syclcompat::wait_and_free((void *)d_A); syclcompat::wait_and_free(0); From a35f862445b5666c63469cda2656b0a9946df25c Mon Sep 17 00:00:00 2001 From: Guo Yejun Date: Mon, 10 Jun 2024 14:26:46 +0800 Subject: [PATCH 54/55] [SYCL][Graph] fix the address pointer in graph print (#13595) the address pointer should be the USM pointer --- sycl/source/detail/graph_impl.hpp | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/sycl/source/detail/graph_impl.hpp b/sycl/source/detail/graph_impl.hpp index 758d5903af311..80837181ec056 100644 --- a/sycl/source/detail/graph_impl.hpp +++ b/sycl/source/detail/graph_impl.hpp @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -618,6 +619,17 @@ class node_impl { } else if 
(Arg.MType == sycl::detail::kernel_param_kind_t::kind_pointer) { Type = "Pointer"; + auto Fill = Stream.fill(); + Stream << i << ") Type: " << Type << " Ptr: " << Arg.MPtr << "(0x" + << std::hex << std::setfill('0'); + for (int i = Arg.MSize - 1; i >= 0; --i) { + Stream << std::setw(2) + << static_cast( + (static_cast(Arg.MPtr))[i]); + } + Stream.fill(Fill); + Stream << std::dec << ")\\n"; + continue; } else if (Arg.MType == sycl::detail::kernel_param_kind_t:: kind_specialization_constants_buffer) { Type = "Specialization Constants Buffer"; From 4e7f2dc1ded639df85358a691b987a6ef7ff10d0 Mon Sep 17 00:00:00 2001 From: Ewan Crawford Date: Thu, 30 May 2024 12:56:46 +0100 Subject: [PATCH 55/55] [SYCL][Graph] 3D kernel update regression test Add an E2E regression test for updating kernel nodes with 3 dimensions. Test contains a graph with two nodes, the first node with an NDRange containing a user specified local size, and the second node containing a Range with implementation determined local size. 
--- .../Update/update_with_indices_ptr_3D.cpp | 83 +++++++++++++++++++ 1 file changed, 83 insertions(+) create mode 100644 sycl/test-e2e/Graph/Update/update_with_indices_ptr_3D.cpp diff --git a/sycl/test-e2e/Graph/Update/update_with_indices_ptr_3D.cpp b/sycl/test-e2e/Graph/Update/update_with_indices_ptr_3D.cpp new file mode 100644 index 0000000000000..5459eb42de8d4 --- /dev/null +++ b/sycl/test-e2e/Graph/Update/update_with_indices_ptr_3D.cpp @@ -0,0 +1,83 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// + +// Tests updating a 3D ND-Range graph kernel node using index-based explicit +// update + +#include "../graph_common.hpp" + +int main() { + queue Queue{}; + + const range<3> GlobalWorkSize(1, 2, 2); + const range<3> LocalWorkSize(1, 2, 2); + const size_t N = GlobalWorkSize[0] * GlobalWorkSize[1] * GlobalWorkSize[2]; + + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + int *PtrA = malloc_device(N, Queue); + int *PtrB = malloc_device(N, Queue); + + std::vector HostDataA(N); + std::vector HostDataB(N); + + Queue.memset(PtrA, 0, N * sizeof(int)).wait(); + Queue.memset(PtrB, 0, N * sizeof(int)).wait(); + + exp_ext::dynamic_parameter DynParam(Graph, PtrA); + + nd_range<3> NDRange{GlobalWorkSize, LocalWorkSize}; + auto NodeA = Graph.add([&](handler &cgh) { + cgh.set_arg(0, DynParam); + // TODO: Use the free function kernel extension instead of regular kernels + // when available. 
+ cgh.parallel_for(NDRange, [=](nd_item<3> Item) { + size_t GlobalID = Item.get_global_linear_id(); + PtrA[GlobalID] = GlobalID; + }); + }); + + range<3> Range{GlobalWorkSize}; + auto NodeB = Graph.add( + [&](handler &cgh) { + cgh.set_arg(0, DynParam); + // TODO: Use the free function kernel extension instead of regular + // kernels when available. + cgh.parallel_for(Range, [=](item<3> Item) { + size_t GlobalID = Item.get_linear_id(); + PtrA[GlobalID] *= 2; + }); + }, + exp_ext::property::node::depends_on{NodeA}); + + auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{}); + + // PtrA should be filled with values + Queue.ext_oneapi_graph(ExecGraph).wait(); + + Queue.copy(PtrA, HostDataA.data(), N).wait(); + Queue.copy(PtrB, HostDataB.data(), N).wait(); + for (size_t i = 0; i < N; i++) { + assert(HostDataA[i] == (i * 2)); + assert(HostDataB[i] == 0); + } + + // Swap PtrB to be the input/output + DynParam.update(PtrB); + ExecGraph.update({NodeA, NodeB}); + Queue.ext_oneapi_graph(ExecGraph).wait(); + + Queue.copy(PtrA, HostDataA.data(), N).wait(); + Queue.copy(PtrB, HostDataB.data(), N).wait(); + for (size_t i = 0; i < N; i++) { + const size_t Ref = i * 2; + assert(HostDataA[i] == Ref); + assert(HostDataB[i] == Ref); + } + return 0; +}