From b0d92ffe66c18a97f0e6fc39df043a73f60a937d Mon Sep 17 00:00:00 2001
From: aelovikov-intel <andrei.elovikov@intel.com>
Date: Wed, 7 Feb 2024 08:41:29 -0800
Subject: [PATCH 01/26] [SYCL] Split sycl::clamp implementation in two (#12637)

SYCL2020, revision 8 defines two separate clamp functions - one in
integer functions and one in common functions. Follow the same in the
implementation so that clamp's handling is uniform with other
common/integer functions.
---
 .../sycl/detail/builtins/common_functions.inc | 25 +++----------------
 .../detail/builtins/integer_functions.inc     |  9 +++++++
 sycl/source/builtins/common_functions.cpp     | 12 ++-------
 sycl/source/builtins/integer_functions.cpp    |  4 +++
 4 files changed, 19 insertions(+), 31 deletions(-)
diff --git a/sycl/include/sycl/detail/builtins/common_functions.inc b/sycl/include/sycl/detail/builtins/common_functions.inc
index fb10964934cf0..022cab78e51db 100644
--- a/sycl/include/sycl/detail/builtins/common_functions.inc
+++ b/sycl/include/sycl/detail/builtins/common_functions.inc
@@ -72,32 +72,15 @@ min(T x, detail::get_elem_type_t<T> y) {
              detail::simplify_if_swizzle_t<T>{y});
 }
 
-#undef BUILTIN_COMMON
-
-#ifdef __SYCL_DEVICE_ONLY__
-DEVICE_IMPL_TEMPLATE(THREE_ARGS, clamp, builtin_enable_generic_t,
-                     [](auto... xs) {
-                       using ElemTy = detail::get_elem_type_t<T0>;
-                       if constexpr (std::is_integral_v<ElemTy>) {
-                         if constexpr (std::is_signed_v<ElemTy>) {
-                           return __spirv_ocl_s_clamp(xs...);
-                         } else {
-                           return __spirv_ocl_u_clamp(xs...);
-                         }
-                       } else {
-                         return __spirv_ocl_fclamp(xs...);
-                       }
-                     })
-#else
-HOST_IMPL_TEMPLATE(THREE_ARGS, clamp, builtin_enable_generic_t, common,
-                   default_ret_type)
-#endif
+BUILTIN_COMMON(THREE_ARGS, clamp, __spirv_ocl_fclamp)
 template <typename T>
-detail::builtin_enable_generic_non_scalar_t<T>
+detail::builtin_enable_common_non_scalar_t<T>
 clamp(T x, detail::get_elem_type_t<T> y, detail::get_elem_type_t<T> z) {
   return clamp(detail::simplify_if_swizzle_t<T>{x},
                detail::simplify_if_swizzle_t<T>{y},
                detail::simplify_if_swizzle_t<T>{z});
 }
+
+#undef BUILTIN_COMMON
 } // namespace _V1
 } // namespace sycl
diff --git a/sycl/include/sycl/detail/builtins/integer_functions.inc b/sycl/include/sycl/detail/builtins/integer_functions.inc
index dfeb815e52494..455abf07aa250 100644
--- a/sycl/include/sycl/detail/builtins/integer_functions.inc
+++ b/sycl/include/sycl/detail/builtins/integer_functions.inc
@@ -122,6 +122,15 @@ min(T x, detail::get_elem_type_t<T> y) {
              detail::simplify_if_swizzle_t<T>{y});
 }
 
+BUILTIN_GENINT_SU(THREE_ARGS, clamp)
+template <typename T>
+detail::builtin_enable_integer_non_scalar_t<T>
+clamp(T x, detail::get_elem_type_t<T> y, detail::get_elem_type_t<T> z) {
+  return clamp(detail::simplify_if_swizzle_t<T>{x},
+               detail::simplify_if_swizzle_t<T>{y},
+               detail::simplify_if_swizzle_t<T>{z});
+}
+
 BUILTIN_GENINT(ONE_ARG, clz)
 BUILTIN_GENINT(ONE_ARG, ctz)
 BUILTIN_GENINT(ONE_ARG, popcount)
diff --git a/sycl/source/builtins/common_functions.cpp b/sycl/source/builtins/common_functions.cpp
index 09742649ca24d..30d0645f2b517 100644
--- a/sycl/source/builtins/common_functions.cpp
+++ b/sycl/source/builtins/common_functions.cpp
@@ -63,16 +63,8 @@ BUILTIN_COMMON(TWO_ARGS, max,
 BUILTIN_COMMON(TWO_ARGS, min,
                [](auto x, auto y) -> decltype(x) { return (y < x ? y : x); })
 
-// clamp is implemented for INTEGER_TYPES as well, so expand/inline
-// BUILTIN_COMMON manually.
-HOST_IMPL(clamp, [](auto x, auto y, auto z) -> decltype(x) {
-  using ElemTy = detail::get_elem_type_t<decltype(x)>;
-  if constexpr (std::is_integral_v<ElemTy>) {
-    return std::min(std::max(x, y), z);
-  } else {
-    return std::fmin(std::fmax(x, y), z);
-  }
+BUILTIN_COMMON(THREE_ARGS, clamp, [](auto x, auto y, auto z) -> decltype(x) {
+  return std::fmin(std::fmax(x, y), z);
 })
-EXPORT_SCALAR_AND_VEC_1_16(THREE_ARGS, clamp, INTEGER_TYPES, FP_TYPES)
 } // namespace _V1
 } // namespace sycl
diff --git a/sycl/source/builtins/integer_functions.cpp b/sycl/source/builtins/integer_functions.cpp
index 26c4dd9a5788f..381d6f1fa0a10 100644
--- a/sycl/source/builtins/integer_functions.cpp
+++ b/sycl/source/builtins/integer_functions.cpp
@@ -214,6 +214,10 @@ BUILTIN_GENINT_SU(TWO_ARGS, max,
 BUILTIN_GENINT_SU(TWO_ARGS, min,
                   [](auto x, auto y) -> decltype(x) { return y < x ? y : x; })
 
+BUILTIN_GENINT_SU(THREE_ARGS, clamp, [](auto x, auto y, auto z) -> decltype(x) {
+  return std::min(std::max(x, y), z);
+})
+
 template <typename T> static inline constexpr T __clz_impl(T x, T m, T n = 0) {
   return (x & m) ? n : __clz_impl(x, T(m >> 1), ++n);
 }

From e53e5d599454fe3775b360da92b0f774f2ea523d Mon Sep 17 00:00:00 2001
From: John Pennycook <john.pennycook@intel.com>
Date: Wed, 7 Feb 2024 09:31:01 -0800
Subject: [PATCH 02/26] [SYCL][Doc] Fix broken link in forward_progress
 (#12652)

sycl_ext_oneapi_launch_queries was originally part of
sycl_ext_oneapi_forward_progress. When we split things into two
extensions, we forgot to update one of the links.

Signed-off-by: John Pennycook <john.pennycook@intel.com>
---
 .../proposed/sycl_ext_oneapi_forward_progress.asciidoc | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/sycl/doc/extensions/proposed/sycl_ext_oneapi_forward_progress.asciidoc b/sycl/doc/extensions/proposed/sycl_ext_oneapi_forward_progress.asciidoc
index 4c1df1640d021..227b23f1ffdac 100644
--- a/sycl/doc/extensions/proposed/sycl_ext_oneapi_forward_progress.asciidoc
+++ b/sycl/doc/extensions/proposed/sycl_ext_oneapi_forward_progress.asciidoc
@@ -20,7 +20,7 @@
 == Notice
 
 [%hardbreaks]
-Copyright (C) 2022-2023 Intel Corporation.  All rights reserved.
+Copyright (C) 2022-2024 Intel Corporation.  All rights reserved.
 
 Khronos(R) is a registered trademark and SYCL(TM) and SPIR(TM) are trademarks
 of The Khronos Group Inc.  OpenCL(TM) is a trademark of Apple Inc. used by
@@ -383,9 +383,11 @@ and work-groups to also provide concurrent forward progress guarantees). In
 such a case, an implementation must satisfy the strongest request(s).
 
 Devices may not be able to provide the requested forward progress guarantees
-for all launch configurations.  The <<launch, launch queries>> defined in a
-later section allow developers to identify valid launch configurations for
-specific combinations of properties.
+for all launch configurations. Developers should use the launch queries defined
+by the
+link:../proposed/sycl_ext_oneapi_launch_queries.asciidoc[sycl_ext_oneapi_launch_queries]
+extension to identify valid launch configurations for specific combinations of
+properties.
 
 [NOTE]
 ====

From 96179391391caca00b4712fae950cd75af551ff2 Mon Sep 17 00:00:00 2001
From: fineg74 <61437305+fineg74@users.noreply.github.com>
Date: Wed, 7 Feb 2024 10:19:18 -0800
Subject: [PATCH 03/26] [SYCL][ESIMD]Implement slm_scatter accepting compile
 time properties (#12591)

This implements the unified memory API for slm_scatter with local memory
---
 sycl/include/sycl/ext/intel/esimd/memory.hpp  | 216 +++++++++++++++--
 .../ext/intel/experimental/esimd/memory.hpp   |  16 +-
 .../unified_memory_api/Inputs/scatter.hpp     | 219 ++++++++++++++++++
 .../ESIMD/unified_memory_api/slm_scatter.cpp  |  33 +++
 .../slm_scatter_dg2_pvc.cpp                   |  37 +++
 sycl/test/esimd/memory_properties.cpp         |  56 +++++
 6 files changed, 539 insertions(+), 38 deletions(-)
 create mode 100644 sycl/test-e2e/ESIMD/unified_memory_api/slm_scatter.cpp
 create mode 100644 sycl/test-e2e/ESIMD/unified_memory_api/slm_scatter_dg2_pvc.cpp

diff --git a/sycl/include/sycl/ext/intel/esimd/memory.hpp b/sycl/include/sycl/ext/intel/esimd/memory.hpp
index d923821027339..7f4833017d38a 100644
--- a/sycl/include/sycl/ext/intel/esimd/memory.hpp
+++ b/sycl/include/sycl/ext/intel/esimd/memory.hpp
@@ -2685,11 +2685,13 @@ block_store(AccessorT acc, simd<T, N> vals, simd_mask<1> pred,
 namespace detail {
 template <typename T, int N, typename AccessorTy>
 ESIMD_INLINE ESIMD_NODEBUG std::enable_if_t<
-    (sizeof(T) <= 4) && (N == 1 || N == 8 || N == 16 || N == 32) &&
-    (std::is_same_v<detail::LocalAccessorMarker, AccessorTy> ||
-     is_accessor_with_v<AccessorTy, detail::accessor_mode_cap::can_write>)>
+    std::is_same_v<detail::LocalAccessorMarker, AccessorTy> ||
+    is_accessor_with_v<AccessorTy, detail::accessor_mode_cap::can_write>>
 scatter_impl(AccessorTy acc, simd<T, N> vals, simd<uint32_t, N> offsets,
              uint32_t glob_offset, simd_mask<N> mask) {
+
+  static_assert(sizeof(T) <= 4 && detail::isPowerOf2(N, 32),
+                "Unexpected type or vector length");
   constexpr int TypeSizeLog2 = detail::ElemsPerAddrEncoding<sizeof(T)>();
   // TODO (performance) use hardware-supported scale once BE supports it
   constexpr int16_t scale = 0;
@@ -2820,10 +2822,9 @@ gather_impl(AccessorT acc, simd<OffsetT, N / VS> byte_offsets,
 /// @return is a vector of type T and size N * NElts.
 ///
 template <typename T, int NElts, lsc_data_size DS, int N>
-__ESIMD_API __ESIMD_NS::simd<T, N * NElts>
-slm_gather_impl(__ESIMD_NS::simd<uint32_t, N> offsets,
-                __ESIMD_NS::simd_mask<N> pred,
-                __ESIMD_NS::simd<T, N * NElts> pass_thru) {
+__ESIMD_API simd<T, N * NElts> slm_gather_impl(simd<uint32_t, N> offsets,
+                                               simd_mask<N> pred,
+                                               simd<T, N * NElts> pass_thru) {
   check_lsc_vector_size<NElts>();
   check_lsc_data_size<T, DS>();
   constexpr uint16_t AddressScale = 1;
@@ -2832,9 +2833,8 @@ slm_gather_impl(__ESIMD_NS::simd<uint32_t, N> offsets,
   constexpr lsc_vector_size LSCVS = to_lsc_vector_size<NElts>();
   constexpr lsc_data_order Transposed = lsc_data_order::nontranspose;
   using MsgT = typename lsc_expand_type<T>::type;
-  __ESIMD_NS::simd<MsgT, N * NElts> PassThruExpanded =
-      lsc_format_input<MsgT>(pass_thru);
-  __ESIMD_NS::simd<MsgT, N * NElts> Result =
+  simd<MsgT, N * NElts> PassThruExpanded = lsc_format_input<MsgT>(pass_thru);
+  simd<MsgT, N * NElts> Result =
       __esimd_lsc_load_merge_slm<MsgT, cache_hint::none, cache_hint::none,
                                  AddressScale, ImmOffset, EDS, LSCVS,
                                  Transposed, N>(pred.data(), offsets.data(),
@@ -2842,6 +2842,37 @@ slm_gather_impl(__ESIMD_NS::simd<uint32_t, N> offsets,
   return lsc_format_ret<T>(Result);
 }
 
+/// SLM scatter implementation.
+/// Supported platforms: DG2, PVC
+/// VISA instruction: lsc_store.slm
+///
+/// Scatters the elements of 'vals' to SLM at the given byte offsets.
+///
+/// @tparam T is element type.
+/// @tparam NElts is the number of elements to store per address.
+/// @tparam DS is the data size.
+/// @tparam N is the number of channels (platform dependent).
+/// @param offsets is the zero-based offsets for SLM buffer in bytes.
+/// @param vals is values to store.
+/// @param pred is predicates.
+///
+template <typename T, int NElts, lsc_data_size DS, int N>
+__ESIMD_API void slm_scatter_impl(simd<uint32_t, N> offsets,
+                                  simd<T, N * NElts> vals, simd_mask<N> pred) {
+  check_lsc_vector_size<NElts>();
+  check_lsc_data_size<T, DS>();
+  constexpr uint16_t AddressScale = 1;
+  constexpr int ImmOffset = 0;
+  constexpr lsc_data_size EDS = expand_data_size(finalize_data_size<T, DS>());
+  constexpr lsc_vector_size LSCVS = to_lsc_vector_size<NElts>();
+  constexpr lsc_data_order Transposed = lsc_data_order::nontranspose;
+  using MsgT = typename lsc_expand_type<T>::type;
+  simd<MsgT, N * NElts> Tmp = lsc_format_input<MsgT, T>(vals);
+  __esimd_lsc_store_slm<MsgT, cache_hint::none, cache_hint::none, AddressScale,
+                        ImmOffset, EDS, LSCVS, Transposed, N>(
+      pred.data(), offsets.data(), Tmp.data());
+}
+
 } // namespace detail
 
 /// @endcond ESIMD_DETAIL
@@ -3903,7 +3934,7 @@ slm_gather(simd<uint32_t, N / VS> byte_offsets, simd_mask<N / VS> mask,
   static_assert(Alignment >= sizeof(T),
                 "slm_gather() requires at least element-size alignment");
 
-  if constexpr (VS > 1 || (!detail::isPowerOf2(N, 32) &&
+  if constexpr (VS > 1 || (!(detail::isPowerOf2(N, 32) && sizeof(T) <= 4) &&
                            !detail::isMaskedGatherScatterLLVMAvailable())) {
     simd<T, N> PassThru; // Intentionally undefined
     return detail::slm_gather_impl<T, VS, detail::lsc_data_size::default_size>(
@@ -4118,7 +4149,7 @@ slm_gather(OffsetSimdViewT byte_offsets, simd_mask<N / VS> mask,
 /// @param byte_offsets the vector of 32-bit offsets in bytes.
 /// For each i, (byte_offsets[i]) must be element size aligned.
 /// @param props The optional compile-time properties. Only 'alignment'
-/// and cache hint properties are used.
+/// property is used.
 /// @return A vector of elements read.
 template <typename T, int N, int VS = 1, typename OffsetSimdViewT,
           typename PropertyListT =
@@ -4141,17 +4172,156 @@ template <typename T> __ESIMD_API T slm_scalar_load(uint32_t offset) {
   return Res[0];
 }
 
-/// Scatter operation over the Shared Local Memory.
-/// This API has almost the same interface as the @ref accessor_scatter
-/// "accessor-based scatter", except that it does not have the accessor and
-/// the global offset parameters.
-///
-template <typename T, int N>
-__ESIMD_API std::enable_if_t<(N == 1 || N == 8 || N == 16 || N == 32) &&
-                             (sizeof(T) <= 4)>
-slm_scatter(simd<uint32_t, N> offsets, simd<T, N> vals, simd_mask<N> mask = 1) {
-  detail::LocalAccessorMarker acc;
-  detail::scatter_impl<T, N>(acc, vals, offsets, 0, mask);
+/// template <typename T, int N, int VS = 1,
+///           typename PropertyListT = empty_properties_t>
+/// void slm_scatter(simd<uint32_t, N / VS> byte_offsets,
+///                  simd<T, N> vals, simd_mask<N / VS> mask,
+///                  PropertyListT props = {});                   // (slm-sc-1)
+/// void slm_scatter(simd<uint32_t, N / VS> byte_offsets,
+///                   simd<T, N> vals, PropertyListT props = {});  // (slm-sc-2)
+///
+/// The next 2 functions are variations of the first 2 above (slm-sc-1,2)
+/// and were added only to support simd_view instead of simd for byte_offsets.
+/// template <typename T, int N, int VS = 1, typename OffsetSimdViewT,
+///           typename PropertyListT = empty_props_t>
+/// void slm_scatter(OffsetSimdViewT byte_offsets,
+///             simd<T, N> vals, simd_mask<N / VS> mask,
+///             PropertyListT props = {});                         // (slm-sc-3)
+/// void slm_scatter(OffsetSimdViewT byte_offsets,
+///             simd<T, N> vals, PropertyListT props = {});        // (slm-sc-4)
+
+/// template <typename T, int N, int VS = 1,
+///           typename PropertyListT = empty_properties_t>
+/// void slm_scatter(simd<uint32_t, N / VS> byte_offsets,
+///                   simd<T, N> vals, simd_mask<N / VS> mask,
+///                   PropertyListT props = {});                   // (slm-sc-1)
+/// Stores ("scatters") elements of the type 'T' to Shared Local Memory
+/// locations addressed by byte offsets \p byte_offsets. Storage of any element
+/// can be disabled via the input vector of predicates \p mask.
+/// If mask[i] is unset, then the storage to (byte_offsets[i]) is skipped.
+/// @tparam T Element type.
+/// @tparam N Number of elements to write.
+/// @tparam VS Vector size. It can also be read as the number of writes per each
+/// address. The parameter 'N' must be divisible by 'VS'. (VS > 1) is supported
+/// only on DG2 and PVC and only for 4- and 8-byte element vectors.
+/// @param byte_offsets the vector of 32-bit offsets in bytes.
+/// For each i, (byte_offsets[i]) must be element size aligned.
+/// If the alignment property is not passed, then it is assumed that each
+/// accessed address is aligned by element-size.
+/// @param vals The vector of values to store.
+/// @param mask The access mask.
+/// @param props The optional compile-time properties. Only 'alignment' property
+/// is used.
+template <typename T, int N, int VS = 1,
+          typename PropertyListT =
+              ext::oneapi::experimental::detail::empty_properties_t>
+__ESIMD_API std::enable_if_t<
+    ext::oneapi::experimental::is_property_list_v<PropertyListT>>
+slm_scatter(simd<uint32_t, N / VS> byte_offsets, simd<T, N> vals,
+            simd_mask<N / VS> mask, PropertyListT props = {}) {
+  static_assert(N / VS >= 1 && N % VS == 0, "N must be divisible by VS");
+
+  constexpr size_t Alignment =
+      detail::getPropertyValue<PropertyListT, alignment_key>(sizeof(T));
+  static_assert(Alignment >= sizeof(T),
+                "slm_scatter() requires at least element-size alignment");
+
+  // Use LSC lowering if VS > 1 or if scatter_impl cannot handle N or sizeof(T).
+  if constexpr (VS > 1 || !(detail::isPowerOf2(N, 32) && sizeof(T) <= 4)) {
+    __ESIMD_DNS::slm_scatter_impl<T, VS, detail::lsc_data_size::default_size>(
+        byte_offsets, vals, mask);
+  } else {
+    detail::LocalAccessorMarker acc;
+    detail::scatter_impl<T, N>(acc, vals, byte_offsets, 0, mask);
+  }
+}
+
+/// template <typename T, int N, int VS = 1,
+///           typename PropertyListT = empty_properties_t>
+/// void slm_scatter(simd<uint32_t, N / VS> byte_offsets, simd<T, N> vals,
+///                   PropertyListT props = {});                   // (slm-sc-2)
+/// Stores ("scatters") elements of the type 'T' to Shared Local Memory
+/// locations addressed by byte offsets \p byte_offsets.
+/// @tparam T Element type.
+/// @tparam N Number of elements to write.
+/// @tparam VS Vector size. It can also be read as the number of writes per each
+/// address. The parameter 'N' must be divisible by 'VS'. (VS > 1) is supported
+/// only on DG2 and PVC and only for 4- and 8-byte element vectors.
+/// @param byte_offsets the vector of 32-bit offsets in bytes.
+/// For each i, (byte_offsets[i]) must be element size aligned.
+/// @param vals The vector of values to store.
+/// @param props The optional compile-time properties. Only 'alignment'
+/// property is used.
+template <typename T, int N, int VS = 1,
+          typename PropertyListT =
+              ext::oneapi::experimental::detail::empty_properties_t>
+__ESIMD_API std::enable_if_t<
+    ext::oneapi::experimental::is_property_list_v<PropertyListT>>
+slm_scatter(simd<uint32_t, N / VS> byte_offsets, simd<T, N> vals,
+            PropertyListT props = {}) {
+  simd_mask<N / VS> Mask = 1;
+  slm_scatter<T, N, VS>(byte_offsets, vals, Mask, props);
+}
+
+/// template <typename T, int N, int VS = 1, typename OffsetSimdViewT,
+///           typename PropertyListT = empty_props_t>
+/// void slm_scatter(
+///             OffsetSimdViewT byte_offsets, simd<T, N> vals,
+///             simd_mask<N / VS> mask, PropertyListT props = {}); // (slm-sc-3)
+/// Stores ("scatters") elements of the type 'T' to Shared Local Memory
+/// locations addressed by byte offsets \p byte_offsets.
+/// Storage to any element's memory location can be disabled via the
+/// input vector of predicates \p mask. If mask[i] is unset, then the storage to
+/// (byte_offsets[i]) is skipped.
+/// @tparam T Element type.
+/// @tparam N Number of elements to write.
+/// @tparam VS Vector size. It can also be read as the number of writes per each
+/// address. The parameter 'N' must be divisible by 'VS'. (VS > 1) is supported
+/// only on DG2 and PVC and only for 4- and 8-byte element vectors.
+/// @param byte_offsets the vector of 32-bit offsets in bytes.
+/// For each i, (byte_offsets[i]) must be element size aligned.
+/// If the alignment property is not passed, then it is assumed that each
+/// accessed address is aligned by element-size.
+/// @param vals The vector of values to store.
+/// @param mask The access mask.
+/// @param props The optional compile-time properties. Only 'alignment'
+/// property is used.
+template <typename T, int N, int VS = 1, typename OffsetSimdViewT,
+          typename PropertyListT =
+              ext::oneapi::experimental::detail::empty_properties_t>
+__ESIMD_API std::enable_if_t<
+    detail::is_simd_view_type_v<OffsetSimdViewT> &&
+    ext::oneapi::experimental::is_property_list_v<PropertyListT>>
+slm_scatter(OffsetSimdViewT byte_offsets, simd<T, N> vals,
+            simd_mask<N / VS> mask, PropertyListT props = {}) {
+  slm_scatter<T, N, VS>(byte_offsets.read(), vals, mask, props);
+}
+
+/// void slm_scatter(
+///             OffsetSimdViewT byte_offsets, simd<T, N> vals,
+///             PropertyListT props = {});                         // (slm-sc-4)
+/// Stores ("scatters") elements of the type 'T' to Shared Local Memory
+/// locations addressed by byte offsets \p byte_offsets. All elements are
+/// stored; this overload takes no mask.
+/// @tparam T Element type.
+/// @tparam N Number of elements to write.
+/// @tparam VS Vector size. It can also be read as the number of writes per each
+/// address. The parameter 'N' must be divisible by 'VS'. (VS > 1) is supported
+/// only on DG2 and PVC and only for 4- and 8-byte element vectors.
+/// @param byte_offsets the vector of 32-bit offsets in bytes.
+/// For each i, (byte_offsets[i]) must be element size aligned.
+/// @param vals The vector of values to store.
+/// @param props The optional compile-time properties. Only 'alignment'
+/// property is used.
+template <typename T, int N, int VS = 1, typename OffsetSimdViewT,
+          typename PropertyListT =
+              ext::oneapi::experimental::detail::empty_properties_t>
+__ESIMD_API std::enable_if_t<
+    detail::is_simd_view_type_v<OffsetSimdViewT> &&
+    ext::oneapi::experimental::is_property_list_v<PropertyListT>>
+slm_scatter(OffsetSimdViewT byte_offsets, simd<T, N> vals,
+            PropertyListT props = {}) {
+  return slm_scatter<T, N, VS>(byte_offsets.read(), vals, props);
 }
 
 /// Store a scalar value into the Shared Local Memory.
diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp
index 7fb0ae833e793..9d5054b4392ba 100644
--- a/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp
+++ b/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp
@@ -1424,21 +1424,7 @@ template <typename T, int NElts = 1,
 __ESIMD_API void lsc_slm_scatter(__ESIMD_NS::simd<uint32_t, N> offsets,
                                  __ESIMD_NS::simd<T, N * NElts> vals,
                                  __ESIMD_NS::simd_mask<N> pred = 1) {
-  detail::check_lsc_vector_size<NElts>();
-  detail::check_lsc_data_size<T, DS>();
-  constexpr uint16_t _AddressScale = 1;
-  constexpr int _ImmOffset = 0;
-  constexpr lsc_data_size _DS =
-      detail::expand_data_size(detail::finalize_data_size<T, DS>());
-  constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size<NElts>();
-  constexpr detail::lsc_data_order _Transposed =
-      detail::lsc_data_order::nontranspose;
-  using MsgT = typename detail::lsc_expand_type<T>::type;
-  using CstT = __ESIMD_DNS::uint_type_t<sizeof(T)>;
-  __ESIMD_NS::simd<MsgT, N * NElts> Tmp = vals.template bit_cast_view<CstT>();
-  __esimd_lsc_store_slm<MsgT, cache_hint::none, cache_hint::none, _AddressScale,
-                        _ImmOffset, _DS, _VS, _Transposed, N>(
-      pred.data(), offsets.data(), Tmp.data());
+  __ESIMD_DNS::slm_scatter_impl<T, NElts, DS>(offsets, vals, pred);
 }
 
 /// Transposed SLM scatter with 1 channel.
diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp b/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp
index 755c436f0b329..8ea1fcf4a08ad 100644
--- a/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp
+++ b/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp
@@ -230,3 +230,222 @@ template <typename T, TestFeatures Features> bool testUSM(queue Q) {
 
   return Passed;
 }
+
+template <typename T, uint16_t N, uint16_t VS, bool UseMask, bool UseProperties,
+          typename ScatterPropertiesT>
+bool testSLM(queue Q, uint32_t MaskStride,
+             ScatterPropertiesT ScatterProperties) {
+  constexpr uint32_t Groups = 8;
+  constexpr uint32_t Threads = 1;
+  constexpr size_t Size = Groups * Threads * N;
+  static_assert(VS > 0 && N % VS == 0,
+                "Incorrect VS parameter. N must be divisible by VS.");
+  constexpr int NOffsets = N / VS;
+  using Tuint = sycl::_V1::ext::intel::esimd::detail::uint_type_t<sizeof(T)>;
+
+  std::cout << "SLM case: T=" << esimd_test::type_name<T>() << ",N=" << N
+            << ", VS=" << VS << ",UseMask=" << UseMask
+            << ",UseProperties=" << UseProperties << std::endl;
+
+  sycl::range<1> GlobalRange{Groups};
+  sycl::range<1> LocalRange{Threads};
+  sycl::nd_range<1> Range{GlobalRange * LocalRange, LocalRange};
+
+  T *Out = static_cast<T *>(sycl::malloc_shared(Size * sizeof(T), Q));
+  for (size_t i = 0; i < Size; i++)
+    Out[i] = i;
+
+  try {
+    Q.submit([&](handler &cgh) {
+       cgh.parallel_for(Range, [=](sycl::nd_item<1> ndi) SYCL_ESIMD_KERNEL {
+         ScatterPropertiesT Props{};
+         uint16_t GlobalID = ndi.get_global_id(0);
+         uint16_t LocalID = ndi.get_local_id(0);
+         uint32_t GlobalElemOffset = GlobalID * N;
+         uint32_t LocalElemOffset = LocalID * N;
+
+         constexpr uint32_t SLMSize = (Threads * N + 8) * sizeof(T);
+         slm_init<SLMSize>();
+
+         if (LocalID == 0) {
+           for (int I = 0; I < Threads * N; I += 8) {
+             simd<T, 8> InVec(Out + GlobalElemOffset + I);
+             simd<uint32_t, 8> Offsets(I * sizeof(T), sizeof(T));
+             slm_scatter<T>(Offsets, InVec);
+           }
+         }
+         barrier();
+
+         simd<uint32_t, NOffsets> ByteOffsets(LocalElemOffset * sizeof(T),
+                                              VS * sizeof(T));
+         auto ByteOffsetsView = ByteOffsets.template select<NOffsets, 1>();
+
+         simd<T, N> Vals = slm_gather<T, N, VS>(ByteOffsets, Props);
+         Vals *= 2;
+
+         auto ValsView = Vals.template select<N, 1>();
+         simd_mask<NOffsets> Pred = 0;
+         for (int I = 0; I < NOffsets; I++)
+           Pred[I] = (I % MaskStride == 0) ? 1 : 0;
+         if constexpr (VS > 1) { // VS > 1 requires specifying <T, N, VS>
+           if constexpr (UseMask) {
+             if constexpr (UseProperties) {
+               if (GlobalID % 4 == 0)
+                 slm_scatter<T, N, VS>(ByteOffsets, Vals, Pred, Props);
+               else if (GlobalID % 4 == 1)
+                 slm_scatter<T, N, VS>(ByteOffsetsView, Vals, Pred, Props);
+               else if (GlobalID % 4 == 2)
+                 slm_scatter<T, N, VS>(ByteOffsets, ValsView, Pred, Props);
+               else if (GlobalID % 4 == 3)
+                 slm_scatter<T, N, VS>(ByteOffsetsView, ValsView, Pred, Props);
+             } else { // UseProperties == false
+               if (GlobalID % 4 == 0)
+                 slm_scatter<T, N, VS>(ByteOffsets, Vals, Pred);
+               else if (GlobalID % 4 == 1)
+                 slm_scatter<T, N, VS>(ByteOffsetsView, Vals, Pred);
+               else if (GlobalID % 4 == 2)
+                 slm_scatter<T, N, VS>(ByteOffsets, ValsView, Pred);
+               else if (GlobalID % 4 == 3)
+                 slm_scatter<T, N, VS>(ByteOffsetsView, ValsView, Pred);
+             }
+           } else { // UseMask == false
+             if constexpr (UseProperties) {
+               if (GlobalID % 4 == 0)
+                 slm_scatter<T, N, VS>(ByteOffsets, Vals, Props);
+               else if (GlobalID % 4 == 1)
+                 slm_scatter<T, N, VS>(ByteOffsetsView, Vals, Props);
+               else if (GlobalID % 4 == 2)
+                 slm_scatter<T, N, VS>(ByteOffsets, ValsView, Props);
+               else if (GlobalID % 4 == 3)
+                 slm_scatter<T, N, VS>(ByteOffsetsView, ValsView, Props);
+             } else { // UseProperties == false
+               if (GlobalID % 4 == 0)
+                 slm_scatter<T, N, VS>(ByteOffsets, Vals);
+               else if (GlobalID % 4 == 1)
+                 slm_scatter<T, N, VS>(ByteOffsetsView, Vals);
+               else if (GlobalID % 4 == 2)
+                 slm_scatter<T, N, VS>(ByteOffsets, ValsView);
+               else if (GlobalID % 4 == 3)
+                 slm_scatter<T, N, VS>(ByteOffsetsView, ValsView);
+             }
+           }
+         } else { // VS == 1
+           if constexpr (UseMask) {
+             if constexpr (UseProperties) {
+               if (GlobalID % 4 == 0)
+                 slm_scatter(ByteOffsets, Vals, Pred, Props);
+               else if (GlobalID % 4 == 1)
+                 slm_scatter(ByteOffsetsView, Vals, Pred, Props);
+               else if (GlobalID % 4 == 2)
+                 slm_scatter<T, N>(ByteOffsets, ValsView, Pred, Props);
+               else if (GlobalID % 4 == 3)
+                 slm_scatter<T, N>(ByteOffsetsView, ValsView, Pred, Props);
+             } else { // UseProperties == false
+               if (GlobalID % 4 == 0)
+                 slm_scatter(ByteOffsets, Vals, Pred);
+               else if (GlobalID % 4 == 1)
+                 slm_scatter<T, N>(ByteOffsetsView, Vals, Pred);
+               else if (GlobalID % 4 == 2)
+                 slm_scatter<T, N>(ByteOffsets, ValsView, Pred);
+               else if (GlobalID % 4 == 3)
+                 slm_scatter<T, N>(ByteOffsetsView, ValsView, Pred);
+             }
+           } else { // UseMask == false
+             if constexpr (UseProperties) {
+               if (GlobalID % 4 == 0)
+                 slm_scatter(ByteOffsets, Vals, Props);
+               else if (GlobalID % 4 == 1)
+                 slm_scatter(ByteOffsetsView, Vals, Props);
+               else if (GlobalID % 4 == 2)
+                 slm_scatter<T, N>(ByteOffsets, ValsView, Props);
+               else if (GlobalID % 4 == 3)
+                 slm_scatter<T, N>(ByteOffsetsView, ValsView, Props);
+             } else { // UseProperties == false
+               if (GlobalID % 4 == 0)
+                 slm_scatter(ByteOffsets, Vals);
+               else if (GlobalID % 4 == 1)
+                 slm_scatter(ByteOffsetsView, Vals);
+               else if (GlobalID % 4 == 2)
+                 slm_scatter<T, N>(ByteOffsets, ValsView);
+               else if (GlobalID % 4 == 3)
+                 slm_scatter<T, N>(ByteOffsetsView, ValsView);
+             }
+           }
+         }
+         barrier();
+         if (LocalID == 0) {
+           for (int I = 0; I < Threads * N; I++) {
+             simd<uint32_t, 1> Offsets(I * sizeof(T), sizeof(T));
+             simd<T, 1> OutVec = slm_gather<T>(Offsets);
+             OutVec.copy_to(Out + GlobalElemOffset + I);
+           }
+         }
+       });
+     }).wait();
+  } catch (sycl::exception const &e) {
+    std::cout << "SYCL exception caught: " << e.what() << '\n';
+    sycl::free(Out, Q);
+    return false;
+  }
+
+  bool Passed = verify(Out, N, Size, VS, MaskStride, UseMask);
+
+  sycl::free(Out, Q);
+
+  return Passed;
+}
+
+template <typename T, TestFeatures Features> bool testSLM(queue Q) {
+  constexpr bool CheckMask = true;
+  constexpr bool CheckProperties = true;
+  properties EmptyProps;
+  properties AlignElemProps{alignment<sizeof(T)>};
+
+  bool Passed = true;
+
+  // Test scatter() that is available on Gen12 and PVC.
+  Passed &= testSLM<T, 1, 1, !CheckMask, CheckProperties>(Q, 2, EmptyProps);
+  Passed &= testSLM<T, 2, 1, !CheckMask, CheckProperties>(Q, 1, EmptyProps);
+  Passed &= testSLM<T, 4, 1, !CheckMask, CheckProperties>(Q, 2, EmptyProps);
+  Passed &= testSLM<T, 8, 1, !CheckMask, CheckProperties>(Q, 2, EmptyProps);
+  Passed &= testSLM<T, 16, 1, !CheckMask, CheckProperties>(Q, 2, EmptyProps);
+  Passed &= testSLM<T, 32, 1, !CheckMask, CheckProperties>(Q, 2, EmptyProps);
+  Passed &= testSLM<T, 2, 1, CheckMask, CheckProperties>(Q, 1, EmptyProps);
+  Passed &= testSLM<T, 4, 1, CheckMask, CheckProperties>(Q, 2, EmptyProps);
+  Passed &= testSLM<T, 8, 1, CheckMask, CheckProperties>(Q, 2, EmptyProps);
+  Passed &= testSLM<T, 16, 1, CheckMask, CheckProperties>(Q, 2, EmptyProps);
+  Passed &= testSLM<T, 32, 1, CheckMask, CheckProperties>(Q, 2, EmptyProps);
+
+  // Test scatter() without passing compile-time properties argument.
+  Passed &= testSLM<T, 16, 1, !CheckMask, !CheckProperties>(Q, 2, EmptyProps);
+  Passed &= testSLM<T, 32, 1, !CheckMask, !CheckProperties>(Q, 2, EmptyProps);
+  Passed &= testSLM<T, 16, 1, CheckMask, !CheckProperties>(Q, 2, EmptyProps);
+  Passed &= testSLM<T, 32, 1, CheckMask, !CheckProperties>(Q, 2, EmptyProps);
+
+  if constexpr (Features == TestFeatures::PVC ||
+                Features == TestFeatures::DG2) {
+    properties LSCProps{alignment<sizeof(T)>};
+    Passed &= testSLM<T, 1, 1, !CheckMask, CheckProperties>(Q, 2, LSCProps);
+    Passed &= testSLM<T, 2, 1, CheckMask, CheckProperties>(Q, 2, LSCProps);
+    Passed &= testSLM<T, 4, 1, CheckMask, CheckProperties>(Q, 2, LSCProps);
+    Passed &= testSLM<T, 8, 1, CheckMask, CheckProperties>(Q, 2, LSCProps);
+
+    Passed &= testSLM<T, 32, 1, CheckMask, CheckProperties>(Q, 2, LSCProps);
+
+    // Check VS > 1. GPU supports only dwords and qwords in this mode.
+    if constexpr (sizeof(T) >= 4) {
+      // TODO: This test case causes flaky fail. Enable it after the issue
+      // in GPU driver is fixed.
+      // Passed &=
+      //     testSLM<T, 16, 2, CheckMask, CheckProperties>(Q, 2, AlignElemProps)
+      Passed &=
+          testSLM<T, 32, 2, !CheckMask, CheckProperties>(Q, 2, AlignElemProps);
+      Passed &=
+          testSLM<T, 32, 2, CheckMask, CheckProperties>(Q, 2, AlignElemProps);
+      Passed &=
+          testSLM<T, 32, 2, CheckMask, !CheckProperties>(Q, 2, AlignElemProps);
+    }
+  } // TestPVCFeatures
+
+  return Passed;
+}
diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/slm_scatter.cpp b/sycl/test-e2e/ESIMD/unified_memory_api/slm_scatter.cpp
new file mode 100644
index 0000000000000..96c49fcdae8af
--- /dev/null
+++ b/sycl/test-e2e/ESIMD/unified_memory_api/slm_scatter.cpp
@@ -0,0 +1,33 @@
+//==------- slm_scatter.cpp - DPC++ ESIMD on-device test ---------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===------------------------------------------------------------------===//
+// RUN: %{build} -fsycl-device-code-split=per_kernel -o %t.out
+// RUN: %{run} %t.out
+
+// The test verifies esimd::slm_scatter() functions accepting
+// optional compile-time esimd::properties.
+// The slm_scatter() calls in this test do not use DG2/PVC features.
+
+#include "Inputs/scatter.hpp"
+
+int main() {
+  auto Q = queue{gpu_selector_v};
+  esimd_test::printTestLabel(Q);
+
+  constexpr auto TestFeatures = TestFeatures::Generic;
+  bool Passed = true;
+
+  Passed &= testSLM<int8_t, TestFeatures>(Q);
+  Passed &= testSLM<int16_t, TestFeatures>(Q);
+  if (Q.get_device().has(sycl::aspect::fp16))
+    Passed &= testSLM<sycl::half, TestFeatures>(Q);
+  Passed &= testSLM<uint32_t, TestFeatures>(Q);
+  Passed &= testSLM<float, TestFeatures>(Q);
+
+  std::cout << (Passed ? "Passed\n" : "FAILED\n");
+  return Passed ? 0 : 1;
+}
diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/slm_scatter_dg2_pvc.cpp b/sycl/test-e2e/ESIMD/unified_memory_api/slm_scatter_dg2_pvc.cpp
new file mode 100644
index 0000000000000..4ebe20e376cf1
--- /dev/null
+++ b/sycl/test-e2e/ESIMD/unified_memory_api/slm_scatter_dg2_pvc.cpp
@@ -0,0 +1,37 @@
+//==------- slm_scatter_dg2_pvc.cpp - DPC++ ESIMD on-device test--------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===------------------------------------------------------------------===//
+// REQUIRES: gpu-intel-pvc || gpu-intel-dg2
+// RUN: %{build} -fsycl-device-code-split=per_kernel -o %t.out
+// RUN: %{run} %t.out
+
+// The test verifies esimd::slm_scatter() functions accepting
+// optional compile-time esimd::properties.
+// The slm_scatter() calls in this test use DG2 or PVC features.
+
+#include "Inputs/scatter.hpp"
+
+int main() {
+  auto Q = queue{gpu_selector_v};
+  esimd_test::printTestLabel(Q);
+
+  constexpr auto TestFeatures = TestFeatures::PVC;
+  bool Passed = true;
+
+  Passed &= testSLM<int8_t, TestFeatures>(Q);
+  Passed &= testSLM<int16_t, TestFeatures>(Q);
+  if (Q.get_device().has(sycl::aspect::fp16))
+    Passed &= testSLM<sycl::half, TestFeatures>(Q);
+  Passed &= testSLM<uint32_t, TestFeatures>(Q);
+  Passed &= testSLM<float, TestFeatures>(Q);
+  Passed &= testSLM<uint64_t, TestFeatures>(Q);
+  if (Q.get_device().has(sycl::aspect::fp64))
+    Passed &= testSLM<double, TestFeatures>(Q);
+
+  std::cout << (Passed ? "Passed\n" : "FAILED\n");
+  return Passed ? 0 : 1;
+}
diff --git a/sycl/test/esimd/memory_properties.cpp b/sycl/test/esimd/memory_properties.cpp
index 3f653adb5e476..eb629935347db 100644
--- a/sycl/test/esimd/memory_properties.cpp
+++ b/sycl/test/esimd/memory_properties.cpp
@@ -1302,6 +1302,7 @@ test_slm_gather_scatter(int byte_offset32) {
   simd<float, 32> slm;
   simd<float, 32> pass_thru;
   auto pass_thru_view = pass_thru.select<32, 1>();
+  auto slm_view = slm.select<32, 1>();
 
   // Test SLM gather using this plan:
   // 1) slm_gather(offsets): offsets is simd or simd_view
@@ -1373,4 +1374,59 @@ test_slm_gather_scatter(int byte_offset32) {
                                  props_align4);
   slm = slm_gather<float, 32, 2>(ioffset_n16_view, mask_n16, pass_thru_view,
                                  props_align4);
+
+  // Test SLM scatter using this plan:
+  // 1) slm_scatter(offsets, vals): offsets/vals is simd or simd_view
+  // 2) slm_scatter(offsets, vals, mask): offsets/vals is simd or simd_view
+  // 3) slm_scatter(...): same as (1), (2) above, but with VS > 1.
+
+  // 1) slm_scatter(offsets, vals): offsets/vals is simd or simd_view
+  // CHECK-COUNT-4: call void @llvm.genx.scatter.scaled.v32i1.v32i32.v32f32(<32 x i1> {{[^)]+}}, i32 2, i16 0, i32 {{[^)]+}}, i32 {{[^)]+}}, <32 x i32> {{[^)]+}}, <32 x float> {{[^)]+}})
+  slm_scatter<float>(ioffset_n32, slm);
+  slm_scatter<float, 32>(ioffset_n32_view, slm);
+  slm_scatter<float, 32>(ioffset_n32, slm_view);
+  slm_scatter<float, 32>(ioffset_n32_view, slm_view);
+
+  // CHECK-COUNT-4: call void @llvm.genx.scatter.scaled.v32i1.v32i32.v32f32(<32 x i1> {{[^)]+}}, i32 2, i16 0, i32 {{[^)]+}}, i32 {{[^)]+}}, <32 x i32> {{[^)]+}}, <32 x float> {{[^)]+}})
+  slm_scatter<float>(ioffset_n32, slm, props_align8);
+  slm_scatter<float, 32>(ioffset_n32_view, slm, props_align8);
+  slm_scatter<float, 32>(ioffset_n32, slm_view, props_align8);
+  slm_scatter<float, 32>(ioffset_n32_view, slm_view, props_align8);
+
+  // 2) slm_scatter(offsets, vals, mask): offsets/vals is simd or simd_view
+  // CHECK-COUNT-4: call void @llvm.genx.scatter.scaled.v32i1.v32i32.v32f32(<32 x i1> {{[^)]+}}, i32 2, i16 0, i32 {{[^)]+}}, i32 {{[^)]+}}, <32 x i32> {{[^)]+}}, <32 x float> {{[^)]+}})
+  slm_scatter<float>(ioffset_n32, slm, mask_n32);
+  slm_scatter<float, 32>(ioffset_n32_view, slm, mask_n32);
+  slm_scatter<float, 32>(ioffset_n32, slm_view, mask_n32);
+  slm_scatter<float, 32>(ioffset_n32_view, slm_view, mask_n32);
+
+  // CHECK-COUNT-4: call void @llvm.genx.scatter.scaled.v32i1.v32i32.v32f32(<32 x i1> {{[^)]+}}, i32 2, i16 0, i32 {{[^)]+}}, i32 {{[^)]+}}, <32 x i32> {{[^)]+}}, <32 x float> {{[^)]+}})
+  slm_scatter<float>(ioffset_n32, slm, mask_n32, props_align8);
+  slm_scatter<float, 32>(ioffset_n32_view, slm, mask_n32, props_align8);
+  slm_scatter<float, 32>(ioffset_n32, slm_view, mask_n32, props_align8);
+  slm_scatter<float, 32>(ioffset_n32_view, slm_view, mask_n32, props_align8);
+
+  // 3) slm_scatter(...): same as (1), (2) above, but with VS > 1.
+  // CHECK-COUNT-16: call void @llvm.genx.lsc.store.slm.v16i1.v16i32.v32i32(<16 x i1> {{[^)]+}}, i8 4, i8 0, i8 0, i16 1, i32 0, i8 3, i8 2, i8 1, i8 0, <16 x i32> {{[^)]+}}, <32 x i32>{{[^)]+}}, i32 0)
+  // 3a) check VS > 1. no 'mask' operand first.
+  slm_scatter<float, 32, 2>(ioffset_n16, slm);
+  slm_scatter<float, 32, 2>(ioffset_n16_view, slm);
+  slm_scatter<float, 32, 2>(ioffset_n16, slm_view);
+  slm_scatter<float, 32, 2>(ioffset_n16_view, slm_view);
+
+  slm_scatter<float, 32, 2>(ioffset_n16, slm, props_align4);
+  slm_scatter<float, 32, 2>(ioffset_n16_view, slm, props_align4);
+  slm_scatter<float, 32, 2>(ioffset_n16, slm_view, props_align4);
+  slm_scatter<float, 32, 2>(ioffset_n16_view, slm_view, props_align4);
+
+  // 3b) check VS > 1. Pass the 'mask' operand this time.
+  slm_scatter<float, 32, 2>(ioffset_n16, slm, mask_n16);
+  slm_scatter<float, 32, 2>(ioffset_n16_view, slm, mask_n16);
+  slm_scatter<float, 32, 2>(ioffset_n16, slm_view, mask_n16);
+  slm_scatter<float, 32, 2>(ioffset_n16_view, slm_view, mask_n16);
+
+  slm_scatter<float, 32, 2>(ioffset_n16, slm, mask_n16, props_align4);
+  slm_scatter<float, 32, 2>(ioffset_n16_view, slm, mask_n16, props_align4);
+  slm_scatter<float, 32, 2>(ioffset_n16, slm_view, mask_n16, props_align4);
+  slm_scatter<float, 32, 2>(ioffset_n16_view, slm_view, mask_n16, props_align4);
 }

From 557df1aa57c810732f1635abb85045f407c5d0f3 Mon Sep 17 00:00:00 2001
From: Nikita Kornev <nikita.kornev@intel.com>
Date: Wed, 7 Feb 2024 19:26:21 +0100
Subject: [PATCH 04/26] [SYCL][DOC] Extend sycl_ext_oneapi_root_group (#12643)

Add a new enum "execution_scope" and a few member functions to root_group.

---------

Co-authored-by: John Pennycook <john.pennycook@intel.com>
---
 .../sycl_ext_oneapi_root_group.asciidoc       | 96 ++++++++++++++++++-
 1 file changed, 95 insertions(+), 1 deletion(-)

diff --git a/sycl/doc/extensions/proposed/sycl_ext_oneapi_root_group.asciidoc b/sycl/doc/extensions/proposed/sycl_ext_oneapi_root_group.asciidoc
index 0ef2126d86fa1..44eb1b376d113 100644
--- a/sycl/doc/extensions/proposed/sycl_ext_oneapi_root_group.asciidoc
+++ b/sycl/doc/extensions/proposed/sycl_ext_oneapi_root_group.asciidoc
@@ -182,7 +182,8 @@ inline constexpr use_root_sync_key::value_t use_root_sync;
 === The `root_group` class
 
 The `root_group` class implements all member functions common to the
-`sycl::group` and `sycl::sub_group` classes.
+`sycl::group` and `sycl::sub_group` classes, and also provides additional
+member functions of its own.
 
 [source,c++]
 ----
@@ -191,6 +192,13 @@ namespace ext {
 namespace oneapi {
 namespace experimental {
 
+enum class execution_scope {
+  work_item,
+  sub_group,
+  work_group,
+  root_group,
+};
+
 template <int Dimensions>
 class root_group {
 public:
@@ -221,6 +229,31 @@ public:
 
   bool leader() const;
 
+  template <execution_scope Scope>
+  std::enable_if_t<(Scope == execution_scope::work_item ||
+                    Scope == execution_scope::work_group),
+                   id<Dimensions>>
+  get_id() const;
+
+  template <execution_scope Scope>
+  std::enable_if_t<Scope == execution_scope::sub_group, id<1>> get_id() const;
+
+  template <execution_scope Scope>
+  size_t get_linear_id() const;
+
+  template <execution_scope Scope>
+  std::enable_if_t<(Scope == execution_scope::work_item ||
+                    Scope == execution_scope::work_group),
+                   range<Dimensions>>
+  get_range() const;
+
+  template <execution_scope Scope>
+  std::enable_if_t<Scope == execution_scope::sub_group, range<1>>
+  get_range() const;
+
+  template <execution_scope Scope>
+  size_t get_linear_range() const;
+
 };
 
 } // namespace experimental
@@ -307,6 +340,67 @@ work-item is the leader of the root-group, and `false` for all other work-items
 in the root-group. The leader of the root-group is guaranteed to be the
 work-item for which `get_local_id()` returns 0.
 
+[source,c++]
+----
+template <execution_scope Scope>
+std::enable_if_t<(Scope == execution_scope::work_item ||
+                  Scope == execution_scope::work_group),
+                 id<Dimensions>>
+get_id() const;
+----
+_Returns_: An `id` representing the index of the current work-group or work-item at the
+`Scope` hierarchy level within the `root_group` object.
+
+[source,c++]
+----
+template <execution_scope Scope>
+std::enable_if_t<Scope == execution_scope::sub_group, id<1>> get_id() const;
+----
+_Returns_: An `id` representing the index of the current sub-group within the
+`root_group` object.
+
+[source,c++]
+----
+template <execution_scope Scope>
+size_t get_linear_id() const;
+----
+_Constraints_: `Scope` must be narrower than
+`execution_scope::root_group`.
+
+_Returns_: The linearized index of the current work-group or work-item at the `Scope`
+hierarchy level within the `root_group` object.
+
+[source,c++]
+----
+template <execution_scope Scope>
+std::enable_if_t<(Scope == execution_scope::work_item ||
+                  Scope == execution_scope::work_group),
+                 range<Dimensions>>
+get_range() const;
+----
+_Returns_: A `range` representing the number of work-groups or work-items at the `Scope`
+hierarchy level within the `root_group` object.
+
+[source,c++]
+----
+template <execution_scope Scope>
+std::enable_if_t<Scope == execution_scope::sub_group, range<1>>
+get_range() const;
+----
+_Returns_: A `range` representing the number of sub-groups within the `root_group`
+object.
+
+[source,c++]
+----
+template <execution_scope Scope>
+size_t get_linear_range() const;
+----
+_Constraints_: `Scope` must be narrower than
+`execution_scope::root_group`.
+
+_Returns_: The number of work-groups or work-items at the `Scope` hierarchy level within
+the `root_group` object.
+
 
 === Using a `root_group`
 

From 7bacc52cd2e14955cdd80784aa9d6961c39d6cfc Mon Sep 17 00:00:00 2001
From: Nick Sarnie <sarnex@users.noreply.github.com>
Date: Wed, 7 Feb 2024 20:06:14 +0000
Subject: [PATCH 05/26] [SYCL][ESIMD][E2E] Fix aot_mixed.cpp (#12650)

Arg was wrong. Manually tested this. CI is currently broken because of
this.

Signed-off-by: Nick Sarnie <nick.sarnie@intel.com>
---
 sycl/test-e2e/ESIMD/aot_mixed.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sycl/test-e2e/ESIMD/aot_mixed.cpp b/sycl/test-e2e/ESIMD/aot_mixed.cpp
index 32c10969ee51b..5204d413e69ad 100644
--- a/sycl/test-e2e/ESIMD/aot_mixed.cpp
+++ b/sycl/test-e2e/ESIMD/aot_mixed.cpp
@@ -7,9 +7,9 @@
 //===----------------------------------------------------------------------===//
 // TODO: Enable on other GPUs once internal ticket is fixed
 // REQUIRES: ocloc && gpu-intel-gen12
-// RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend=spir64_gen -Xs "-device tgllp" -o %t.sycl.out -DENABLE_SYCL=0 %s
+// RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend=spir64_gen "-device tgllp" -o %t.sycl.out -DENABLE_SYCL=0 %s
 // RUN: %{run} %t.sycl.out
-// RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend=spir64_gen -Xs "-device tgllp" -o %t.out %s
+// RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend=spir64_gen "-device tgllp" -o %t.out %s
 // RUN: %{run} %t.out
 
 // This test checks the following ESIMD ahead-of-time compilation scenarios:

From 5f1d98a6e9da8065e8c211479113385ff16cd71b Mon Sep 17 00:00:00 2001
From: Vyacheslav Klochkov <vyacheslav.n.klochkov@intel.com>
Date: Wed, 7 Feb 2024 15:04:47 -0600
Subject: [PATCH 06/26] [ESIMD][E2E] Enable few cases in atomic_update after
 gpu driver update (#12639)

Signed-off-by: Klochkov, Vyacheslav N <vyacheslav.n.klochkov@intel.com>
---
 .../Inputs/atomic_update.hpp                  | 16 ++++----------
 .../Inputs/atomic_update_slm.hpp              | 22 +++----------------
 2 files changed, 7 insertions(+), 31 deletions(-)

diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/atomic_update.hpp b/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/atomic_update.hpp
index 3da82ee8e15d1..ef8646ae4892a 100644
--- a/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/atomic_update.hpp
+++ b/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/atomic_update.hpp
@@ -715,13 +715,10 @@ bool test_int_types_and_sizes(queue q, const Config &cfg) {
         q, cfg);
     passed &= test_int_types<64, Op, UseMask, UseLSCFeatures, UseAcc, SignMask>(
         q, cfg);
-    // non power of two values are supported only in newer driver.
-    // TODO: Enable this when the new driver reaches test infrastructure
-    // (v27556).
-#if 0
-    passed &= test_int_types<12, Op, UseMask, UseLSCFeatures, UseAcc, SignMask>(q, cfg);
-    passed &= test_int_types<33, Op, UseMask, UseLSCFeatures, UseAcc, SignMask>(q, cfg);
-#endif
+    passed &= test_int_types<12, Op, UseMask, UseLSCFeatures, UseAcc, SignMask>(
+        q, cfg);
+    passed &= test_int_types<33, Op, UseMask, UseLSCFeatures, UseAcc, SignMask>(
+        q, cfg);
   }
 
   return passed;
@@ -742,13 +739,8 @@ bool test_fp_types_and_sizes(queue q, const Config &cfg) {
     passed &= test_fp_types<32, Op, UseMask, UseLSCFeatures, UseAcc>(q, cfg);
     passed &= test_fp_types<64, Op, UseMask, UseLSCFeatures, UseAcc>(q, cfg);
 
-    // non power of two values are supported only in newer driver.
-    // TODO: Enable this when the new driver reaches test infrastructure
-    // (v27556).
-#if 0
     passed &= test_fp_types<12, Op, UseMask, UseLSCFeatures, UseAcc>(q, cfg);
     passed &= test_fp_types<35, Op, UseMask, UseLSCFeatures, UseAcc>(q, cfg);
-#endif
   }
   return passed;
 }
diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/atomic_update_slm.hpp b/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/atomic_update_slm.hpp
index 92f000711e137..6b69d99158cb6 100644
--- a/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/atomic_update_slm.hpp
+++ b/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/atomic_update_slm.hpp
@@ -612,14 +612,9 @@ bool test_fp_types(queue q) {
 
   if constexpr (Features == TestFeatures::DG2 ||
                 Features == TestFeatures::PVC) {
-    // TODO: fmin/fmax/fcmpxchg for double requires a newer GPU driver.
-    if constexpr (!std::is_same_v<Op<double, N>, ImplLSCFmax<double, N>> &&
-                  !std::is_same_v<Op<double, N>, ImplLSCFmin<double, N>> &&
-                  !std::is_same_v<Op<double, N>, ImplLSCFcmpwr<double, N>>) {
-      if (q.get_device().has(sycl::aspect::atomic64) &&
-          q.get_device().has(sycl::aspect::fp64)) {
-        passed &= run_test<UseAcc, double, N, Op, UseMask>(q);
-      }
+    if (q.get_device().has(sycl::aspect::atomic64) &&
+        q.get_device().has(sycl::aspect::fp64)) {
+      passed &= run_test<UseAcc, double, N, Op, UseMask>(q);
     }
   }
   return passed;
@@ -633,7 +628,6 @@ bool test_int_types_and_sizes(queue q) {
   passed &= test_int_types<2, Op, UseMask, Features, UseAcc, SignMask>(q);
   passed &= test_int_types<4, Op, UseMask, Features, UseAcc, SignMask>(q);
   passed &= test_int_types<8, Op, UseMask, Features, UseAcc, SignMask>(q);
-  // TODO: N=16 and N=32 does not pass on Gen12 with mask due to older driver.
   if (UseMask && Features == TestFeatures::Generic &&
       esimd_test::isGPUDriverGE(q, esimd_test::GPUDriverOS::LinuxAndWindows,
                                 "26918", "101.4953", false)) {
@@ -645,13 +639,8 @@ bool test_int_types_and_sizes(queue q) {
   if constexpr (Features == TestFeatures::DG2 ||
                 Features == TestFeatures::PVC) {
     passed &= test_int_types<64, Op, UseMask, Features, UseAcc, SignMask>(q);
-    // non power of two values are supported only in newer driver.
-    // TODO: Enable this when the new driver reaches test infrastructure
-    // (v27556).
-#if 0
     passed &= test_int_types<12, Op, UseMask, Features, UseAcc, SignMask>(q);
     passed &= test_int_types<33, Op, UseMask, Features, UseAcc, SignMask>(q);
-#endif
   }
 
   return passed;
@@ -672,13 +661,8 @@ bool test_fp_types_and_sizes(queue q) {
   if constexpr (Features == TestFeatures::DG2 ||
                 Features == TestFeatures::PVC) {
     passed &= test_fp_types<64, Op, UseMask, Features, UseAcc>(q);
-    // non power of two values are supported only in newer driver.
-    // TODO: Enable this when the new driver reaches test infrastructure
-    // (v27556).
-#if 0
     passed &= test_fp_types<33, Op, UseMask, Features, UseAcc>(q);
     passed &= test_fp_types<65, Op, UseMask, Features, UseAcc>(q);
-#endif
   }
   return passed;
 }

From 8185f30da27161289487b664a3bb1df37ce647f9 Mon Sep 17 00:00:00 2001
From: lbushi25 <113361374+lbushi25@users.noreply.github.com>
Date: Thu, 8 Feb 2024 03:32:34 -0500
Subject: [PATCH 07/26] Rewrite tests that fail when usm_shared_allocations not
 supported. (#12636)

Nearly 40 E2E tests use malloc_shared and do not expect an exception to
be thrown on failure. That is not what the spec says: malloc_shared
should throw when the usm_shared_allocations aspect is not supported by
the device. At the moment malloc_shared simply returns nullptr; however,
once the implementation of malloc_shared is changed to throw an
exception, these tests will fail.

This PR is the first of a series of PRs to rewrite these tests so that
they don't fail once malloc_shared is changed to conform to the spec.
These changes will only affect devices that do not support the relevant
aspect. This is done either:

- by having llvm-lit require that the aspect is supported for tests that
rely heavily on malloc_shared,
- by disabling, through if statements, sections of code that use
malloc_shared for tests that use it sparsely,
- by completely rewriting the malloc_shared code with equivalent
buffer/accessor code that is guaranteed to work on all platforms. The
latter approach is taken on tests that do not have the purpose of
testing malloc_shared but simply use it to allocate memory for the test.

These changes aim to eliminate test failures while maintaining test
coverage.
Subsequent PRs will omit this description and refer to this PR instead.
---
 .../Annotated_arg_ptr/annotated_arg.cpp       |   2 +-
 .../Annotated_arg_ptr/annotated_ptr.cpp       |   2 +-
 .../Annotated_usm/annotated_usm_kind.cpp      |  35 +++---
 sycl/test-e2e/Basic/group_local_memory.cpp    |  26 ++--
 sycl/test-e2e/Basic/large-range.cpp           |  46 +++----
 sycl/test-e2e/Basic/span.cpp                  |   2 +-
 sycl/test-e2e/Basic/wrapped_usm_pointers.cpp  |   2 +-
 .../Complex/sycl_complex_math_test.cpp        | 112 ++++++++----------
 .../Complex/sycl_complex_operator_test.cpp    |  79 ++++++------
 .../Complex/sycl_complex_pow_test.cpp         |  65 +++++-----
 .../Complex/sycl_complex_stream_test.cpp      |   8 +-
 11 files changed, 185 insertions(+), 194 deletions(-)

diff --git a/sycl/test-e2e/Annotated_arg_ptr/annotated_arg.cpp b/sycl/test-e2e/Annotated_arg_ptr/annotated_arg.cpp
index 8fad623752873..ea572bfc4c5d7 100644
--- a/sycl/test-e2e/Annotated_arg_ptr/annotated_arg.cpp
+++ b/sycl/test-e2e/Annotated_arg_ptr/annotated_arg.cpp
@@ -1,6 +1,6 @@
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
-//
+// REQUIRES: aspect-usm_shared_allocations
 
 #include "common.hpp"
 
diff --git a/sycl/test-e2e/Annotated_arg_ptr/annotated_ptr.cpp b/sycl/test-e2e/Annotated_arg_ptr/annotated_ptr.cpp
index 7e4a35d09b994..c478fd96afb34 100644
--- a/sycl/test-e2e/Annotated_arg_ptr/annotated_ptr.cpp
+++ b/sycl/test-e2e/Annotated_arg_ptr/annotated_ptr.cpp
@@ -1,6 +1,6 @@
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
-//
+// REQUIRES: aspect-usm_shared_allocations
 
 #include "common.hpp"
 
diff --git a/sycl/test-e2e/Annotated_usm/annotated_usm_kind.cpp b/sycl/test-e2e/Annotated_usm/annotated_usm_kind.cpp
index 812a0bb115737..8be1db398a1a1 100644
--- a/sycl/test-e2e/Annotated_usm/annotated_usm_kind.cpp
+++ b/sycl/test-e2e/Annotated_usm/annotated_usm_kind.cpp
@@ -124,26 +124,27 @@ template <typename T> void testUsmKind(sycl::queue &q) {
           [&]() { return TAnnotated(dev, Ctx, properties{usm_kind_host}); },
           [&]() { return ATHost(1, q); }, [&]() { return ATHost(1, Ctx); },
           [&]() { return ATAnnotated(1, dev, Ctx, alloc::host); }});
-
-  CheckUsmKindAll(
-      alloc::shared,
-      std::tuple{
-          [&]() { return MShared(q); }, [&]() { return MShared(dev, Ctx); },
-          [&]() { return MAnnotated(dev, Ctx, alloc::shared); },
-          [&]() { return MAnnotated(dev, Ctx, properties{usm_kind_shared}); },
-          [&]() { return AShared(1, q); },
-          [&]() { return AShared(1, dev, Ctx); },
-          [&]() { return AAnnotated(1, dev, Ctx, alloc::shared); },
-          [&]() { return TShared(q); }, [&]() { return TShared(dev, Ctx); },
-          [&]() { return TAnnotated(dev, Ctx, alloc::shared); },
-          [&]() { return TAnnotated(dev, Ctx, properties{usm_kind_shared}); },
-          [&]() { return ATShared(1, q); },
-          [&]() { return ATShared(1, dev, Ctx); },
-          [&]() { return ATAnnotated(1, dev, Ctx, alloc::shared); }});
+  if (dev.has(sycl::aspect::usm_shared_allocations)) {
+    CheckUsmKindAll(
+        alloc::shared,
+        std::tuple{
+            [&]() { return MShared(q); }, [&]() { return MShared(dev, Ctx); },
+            [&]() { return MAnnotated(dev, Ctx, alloc::shared); },
+            [&]() { return MAnnotated(dev, Ctx, properties{usm_kind_shared}); },
+            [&]() { return AShared(1, q); },
+            [&]() { return AShared(1, dev, Ctx); },
+            [&]() { return AAnnotated(1, dev, Ctx, alloc::shared); },
+            [&]() { return TShared(q); }, [&]() { return TShared(dev, Ctx); },
+            [&]() { return TAnnotated(dev, Ctx, alloc::shared); },
+            [&]() { return TAnnotated(dev, Ctx, properties{usm_kind_shared}); },
+            [&]() { return ATShared(1, q); },
+            [&]() { return ATShared(1, dev, Ctx); },
+            [&]() { return ATAnnotated(1, dev, Ctx, alloc::shared); }});
+  }
 }
 
 int main() {
   sycl::queue q;
   testUsmKind<int>(q);
   return 0;
-}
\ No newline at end of file
+}
diff --git a/sycl/test-e2e/Basic/group_local_memory.cpp b/sycl/test-e2e/Basic/group_local_memory.cpp
index 8185378071c75..b5bb26917ec0d 100644
--- a/sycl/test-e2e/Basic/group_local_memory.cpp
+++ b/sycl/test-e2e/Basic/group_local_memory.cpp
@@ -7,17 +7,21 @@ constexpr int N = 5;
 
 int main() {
   sycl::queue q;
-  int *ptr = sycl::malloc_shared<int>(N, q);
-  q.parallel_for(sycl::nd_range<1>{1, 1}, [=](sycl::nd_item<1> it) {
-     auto g = it.get_group();
-     auto mem = sycl::ext::oneapi::group_local_memory<int[N]>(g, 1, 2, 3, 4, 5);
-     auto ref = *mem;
-     for (int i = 0; i < N; ++i) {
-       ptr[i] = ref[i];
-     }
-   }).wait();
+  sycl::buffer<int> buf{sycl::range{N}};
+  q.submit([&](sycl::handler &h) {
+    sycl::accessor acc{buf, h};
+    h.parallel_for(sycl::nd_range<1>{1, 1}, [=](sycl::nd_item<1> it) {
+      auto g = it.get_group();
+      auto mem =
+          sycl::ext::oneapi::group_local_memory<int[N]>(g, 1, 2, 3, 4, 5);
+      auto ref = *mem;
+      for (int i = 0; i < N; ++i) {
+        acc[i] = ref[i];
+      }
+    });
+  });
+  sycl::host_accessor result{buf};
   for (int i = 0; i < N; ++i) {
-    assert(ptr[i] == (i + 1));
+    assert(result[i] == (i + 1));
   }
-  sycl::free(ptr, q);
 }
diff --git a/sycl/test-e2e/Basic/large-range.cpp b/sycl/test-e2e/Basic/large-range.cpp
index 354276065a739..f2c38cbb5dd8d 100644
--- a/sycl/test-e2e/Basic/large-range.cpp
+++ b/sycl/test-e2e/Basic/large-range.cpp
@@ -26,15 +26,18 @@ void check_sum(std::string_view desc, const ContainerT &data, size_t N) {
 template <typename RangeT>
 void test_regular(std::string_view desc, queue &q, size_t B, RangeT range) {
   auto N = range.size();
-  std::vector accumulators_v(B, 0, usm_allocator<int, usm::alloc::shared>(q));
-  auto *accumulators = accumulators_v.data();
-
-  q.parallel_for(range, [=](auto it) {
-     atomic_ref<int, memory_order::relaxed, memory_scope::device> ref(
-         accumulators[it.get_linear_id() % B]);
-     ++ref;
-   }).wait();
-
+  std::vector accumulators_v(B, 0);
+  {
+    sycl::buffer accumulator_buf{accumulators_v};
+    q.submit([&](sycl::handler &h) {
+      sycl::accessor accumulators{accumulator_buf, h};
+      h.parallel_for(range, [=](auto it) {
+        atomic_ref<int, memory_order::relaxed, memory_scope::device> ref(
+            accumulators[it.get_linear_id() % B]);
+        ++ref;
+      });
+    });
+  } // destruction of accumulator_buf here writes back data to accumulators_v
   check_sum(desc, accumulators_v, N);
 }
 
@@ -42,18 +45,19 @@ template <typename RangeT>
 void test_spec_constant(std::string_view desc, queue &q, size_t B,
                         RangeT range) {
   auto N = range.size();
-  std::vector accumulators_v(B, 0, usm_allocator<int, usm::alloc::shared>(q));
-  auto *accumulators = accumulators_v.data();
-
-  q.submit([&](handler &cgh) {
-     cgh.set_specialization_constant<C>(2);
-     cgh.parallel_for(range, [=](auto it, kernel_handler h) {
-       atomic_ref<int, memory_order::relaxed, memory_scope::device> ref(
-           accumulators[it.get_linear_id() % B]);
-       ref += h.get_specialization_constant<C>();
-     });
-   }).wait();
-
+  std::vector accumulators_v(B, 0);
+  {
+    sycl::buffer accumulators_buf{accumulators_v};
+    q.submit([&](handler &cgh) {
+      sycl::accessor accumulators{accumulators_buf, cgh};
+      cgh.set_specialization_constant<C>(2);
+      cgh.parallel_for(range, [=](auto it, kernel_handler h) {
+        atomic_ref<int, memory_order::relaxed, memory_scope::device> ref(
+            accumulators[it.get_linear_id() % B]);
+        ref += h.get_specialization_constant<C>();
+      });
+    });
+  } // destruction of accumulators_buf here writes data back to accumulators_v
   check_sum(desc, accumulators_v, N * 2);
 }
 
diff --git a/sycl/test-e2e/Basic/span.cpp b/sycl/test-e2e/Basic/span.cpp
index 43d16ff4d16d6..ec341d39d0eac 100644
--- a/sycl/test-e2e/Basic/span.cpp
+++ b/sycl/test-e2e/Basic/span.cpp
@@ -3,7 +3,7 @@
 //
 // Fails to release USM pointer on HIP for NVIDIA
 // XFAIL: hip_nvidia
-
+// REQUIRES: aspect-usm_shared_allocations
 #include <numeric>
 #include <sycl/sycl.hpp>
 
diff --git a/sycl/test-e2e/Basic/wrapped_usm_pointers.cpp b/sycl/test-e2e/Basic/wrapped_usm_pointers.cpp
index 7b81493db6dce..d2fabb6f7e967 100644
--- a/sycl/test-e2e/Basic/wrapped_usm_pointers.cpp
+++ b/sycl/test-e2e/Basic/wrapped_usm_pointers.cpp
@@ -1,6 +1,6 @@
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
-
+// REQUIRES: aspect-usm_shared_allocations
 //==---------- wrapped_usm_pointer.cpp - test pointers in struct ---------==//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
diff --git a/sycl/test-e2e/Complex/sycl_complex_math_test.cpp b/sycl/test-e2e/Complex/sycl_complex_math_test.cpp
index 9bb5122ffc7f7..1178bdf0b5ffe 100644
--- a/sycl/test-e2e/Complex/sycl_complex_math_test.cpp
+++ b/sycl/test-e2e/Complex/sycl_complex_math_test.cpp
@@ -13,31 +13,26 @@
     bool operator()(sycl::queue &Q, cmplx<T> init,                             \
                     cmplx<T> ref = cmplx<T>(0, 0), bool use_ref = false) {     \
       bool pass = true;                                                        \
-                                                                               \
       auto std_in = init_std_complex(init.re, init.im);                        \
       experimental::complex<T> cplx_input{init.re, init.im};                   \
-                                                                               \
-      auto *cplx_out = sycl::malloc_shared<experimental::complex<T>>(1, Q);    \
-                                                                               \
+      sycl::buffer<experimental::complex<T>> cplx_out_buf{sycl::range{1}};     \
       /*Get std::complex output*/                                              \
       std::complex<T> std_out{ref.re, ref.im};                                 \
       if (!use_ref)                                                            \
         std_out = std::math_func(std_in);                                      \
-                                                                               \
       /*Check cplx::complex output from device*/                               \
-      Q.single_task([=]() {                                                    \
-         cplx_out[0] = experimental::math_func<T>(cplx_input);                 \
-       }).wait();                                                              \
-                                                                               \
-      pass &= check_results(cplx_out[0], std_out, /*is_device*/ true);         \
+      Q.submit([&](sycl::handler &h) {                                         \
+        sycl::accessor cplx_out{cplx_out_buf, h};                              \
+        h.single_task(                                                         \
+            [=]() { cplx_out[0] = experimental::math_func<T>(cplx_input); });  \
+      });                                                                      \
+      sycl::host_accessor cplx_out_acc{cplx_out_buf};                          \
+      pass &= check_results(cplx_out_acc[0], std_out, /*is_device*/ true);     \
                                                                                \
       /*Check cplx::complex output from host*/                                 \
-      cplx_out[0] = experimental::math_func<T>(cplx_input);                    \
-                                                                               \
-      pass &= check_results(cplx_out[0], std_out, /*is_device*/ false);        \
-                                                                               \
-      sycl::free(cplx_out, Q);                                                 \
+      cplx_out_acc[0] = experimental::math_func<T>(cplx_input);                \
                                                                                \
+      pass &= check_results(cplx_out_acc[0], std_out, /*is_device*/ false);    \
       return pass;                                                             \
     }                                                                          \
   };
@@ -73,8 +68,7 @@ TEST_MATH_OP_TYPE(tanh)
                                                                                \
       auto std_in = init_std_complex(init.re, init.im);                        \
       experimental::complex<T> cplx_input{init.re, init.im};                   \
-                                                                               \
-      auto *cplx_out = sycl::malloc_shared<T>(1, Q);                           \
+      sycl::buffer<T> cplx_out_buf{sycl::range{1}};                            \
                                                                                \
       /*Get std::complex output*/                                              \
       T std_out = ref.re;                                                      \
@@ -82,19 +76,18 @@ TEST_MATH_OP_TYPE(tanh)
         std_out = std::math_func(std_in);                                      \
                                                                                \
       /*Check cplx::complex output from device*/                               \
-      Q.single_task([=]() {                                                    \
-         cplx_out[0] = experimental::math_func<T>(cplx_input);                 \
-       }).wait();                                                              \
-                                                                               \
-      pass &= check_results(cplx_out[0], std_out, /*is_device*/ true);         \
+      Q.submit([&](sycl::handler &h) {                                         \
+        sycl::accessor cplx_out{cplx_out_buf, h};                              \
+        h.single_task(                                                         \
+            [=]() { cplx_out[0] = experimental::math_func<T>(cplx_input); });  \
+      });                                                                      \
+      sycl::host_accessor cplx_out_acc{cplx_out_buf};                          \
+      pass &= check_results(cplx_out_acc[0], std_out, /*is_device*/ true);     \
                                                                                \
       /*Check cplx::complex output from host*/                                 \
-      cplx_out[0] = experimental::math_func<T>(cplx_input);                    \
-                                                                               \
-      pass &= check_results(cplx_out[0], std_out, /*is_device*/ false);        \
-                                                                               \
-      sycl::free(cplx_out, Q);                                                 \
+      cplx_out_acc[0] = experimental::math_func<T>(cplx_input);                \
                                                                                \
+      pass &= check_results(cplx_out_acc[0], std_out, /*is_device*/ false);    \
       return pass;                                                             \
     }                                                                          \
   };
@@ -121,23 +114,21 @@ TEST_MATH_OP_TYPE(imag)
       std::complex<T> std_out = ref;                                           \
       if (!use_ref)                                                            \
         std_out = std::math_func(std_in);                                      \
-                                                                               \
-      auto *cplx_out = sycl::malloc_shared<experimental::complex<T>>(1, Q);    \
-                                                                               \
+      sycl::buffer<experimental::complex<T>> cplx_out_buf{sycl::range{1}};     \
       /*Check cplx::complex output from device*/                               \
-      Q.single_task([=]() {                                                    \
-         cplx_out[0] = experimental::math_func<X>(std_in);                     \
-       }).wait();                                                              \
+      Q.submit([&](sycl::handler &h) {                                         \
+        sycl::accessor cplx_out{cplx_out_buf, h};                              \
+        h.single_task(                                                         \
+            [=]() { cplx_out[0] = experimental::math_func<X>(std_in); });      \
+      });                                                                      \
+      sycl::host_accessor cplx_out_acc{cplx_out_buf};                          \
                                                                                \
-      pass &= check_results(cplx_out[0], std_out, /*is_device*/ true);         \
+      pass &= check_results(cplx_out_acc[0], std_out, /*is_device*/ true);     \
                                                                                \
       /*Check cplx::complex output from host*/                                 \
-      cplx_out[0] = experimental::math_func<X>(std_in);                        \
-                                                                               \
-      pass &= check_results(cplx_out[0], std_out, /*is_device*/ false);        \
-                                                                               \
-      sycl::free(cplx_out, Q);                                                 \
+      cplx_out_acc[0] = experimental::math_func<X>(std_in);                    \
                                                                                \
+      pass &= check_results(cplx_out_acc[0], std_out, /*is_device*/ false);    \
       return pass;                                                             \
     }                                                                          \
   };
@@ -161,23 +152,21 @@ TEST_MATH_OP_TYPE(proj)
       T std_out = ref;                                                         \
       if (!use_ref)                                                            \
         std_out = std::math_func(std_in);                                      \
-                                                                               \
-      auto *cplx_out = sycl::malloc_shared<T>(1, Q);                           \
-                                                                               \
+      sycl::buffer<T> cplx_out_buf{sycl::range{1}};                            \
       /*Check cplx::complex output from device*/                               \
-      Q.single_task([=]() {                                                    \
-         cplx_out[0] = experimental::math_func<X>(init);                       \
-       }).wait();                                                              \
+      Q.submit([&](sycl::handler &h) {                                         \
+        sycl::accessor cplx_out{cplx_out_buf, h};                              \
+        h.single_task(                                                         \
+            [=]() { cplx_out[0] = experimental::math_func<X>(init); });        \
+      });                                                                      \
+      sycl::host_accessor cplx_out_acc{cplx_out_buf};                          \
                                                                                \
-      pass &= check_results(cplx_out[0], std_out, /*is_device*/ true);         \
+      pass &= check_results(cplx_out_acc[0], std_out, /*is_device*/ true);     \
                                                                                \
       /*Check cplx::complex output from host*/                                 \
-      cplx_out[0] = experimental::math_func<X>(init);                          \
-                                                                               \
-      pass &= check_results(cplx_out[0], std_out, /*is_device*/ false);        \
-                                                                               \
-      sycl::free(cplx_out, Q);                                                 \
+      cplx_out_acc[0] = experimental::math_func<X>(init);                      \
                                                                                \
+      pass &= check_results(cplx_out_acc[0], std_out, /*is_device*/ false);    \
       return pass;                                                             \
     }                                                                          \
   };
@@ -197,26 +186,25 @@ template <typename T> struct test_polar {
                   bool use_ref = false) {
     bool pass = true;
 
-    auto *cplx_out = sycl::malloc_shared<experimental::complex<T>>(1, Q);
-
+    sycl::buffer<experimental::complex<T>> cplx_out_buf{sycl::range(1)};
     /*Get std::complex output*/
     std::complex<T> std_out{ref.re, ref.im};
     if (!use_ref)
       std_out = std::polar(init.re, init.im);
 
     /*Check cplx::complex output from device*/
-    Q.single_task([=]() {
-       cplx_out[0] = experimental::polar<T>(init.re, init.im);
-     }).wait();
-
-    pass &= check_results(cplx_out[0], std_out, /*is_device*/ true);
+    Q.submit([&](sycl::handler &h) {
+      sycl::accessor cplx_out{cplx_out_buf, h};
+      h.single_task(
+          [=]() { cplx_out[0] = experimental::polar<T>(init.re, init.im); });
+    });
+    sycl::host_accessor cplx_out_acc{cplx_out_buf};
+    pass &= check_results(cplx_out_acc[0], std_out, /*is_device*/ true);
 
     /*Check cplx::complex output from host*/
-    cplx_out[0] = experimental::polar<T>(init.re, init.im);
-
-    pass &= check_results(cplx_out[0], std_out, /*is_device*/ false);
+    cplx_out_acc[0] = experimental::polar<T>(init.re, init.im);
 
-    sycl::free(cplx_out, Q);
+    pass &= check_results(cplx_out_acc[0], std_out, /*is_device*/ false);
 
     return pass;
   }
diff --git a/sycl/test-e2e/Complex/sycl_complex_operator_test.cpp b/sycl/test-e2e/Complex/sycl_complex_operator_test.cpp
index ed68332f60bdd..8894f5d965462 100644
--- a/sycl/test-e2e/Complex/sycl_complex_operator_test.cpp
+++ b/sycl/test-e2e/Complex/sycl_complex_operator_test.cpp
@@ -15,23 +15,19 @@
       experimental::complex<T> cplx_input1{init_re1, init_im1};                \
       experimental::complex<T> cplx_input2{init_re2, init_im2};                \
                                                                                \
-      auto *cplx_out = sycl::malloc_shared<experimental::complex<T>>(1, Q);    \
-                                                                               \
+      sycl::buffer<experimental::complex<T>> cplx_out_buf{sycl::range{1}};     \
       std::complex<T> std_out;                                                 \
       std_out = std_in1 op std_in2;                                            \
+      Q.submit([&](sycl::handler &h) {                                         \
+        sycl::accessor cplx_out{cplx_out_buf, h};                              \
+        h.single_task([=]() { cplx_out[0] = cplx_input1 op cplx_input2; });    \
+      });                                                                      \
+      sycl::host_accessor cplx_out_acc{cplx_out_buf};                          \
+      pass &= check_results(cplx_out_acc[0], std_out, /*is_device*/ true);     \
                                                                                \
-      Q.single_task([=]() {                                                    \
-         cplx_out[0] = cplx_input1 op cplx_input2;                             \
-       }).wait();                                                              \
-                                                                               \
-      pass &= check_results(cplx_out[0], std_out, /*is_device*/ true);         \
-                                                                               \
-      cplx_out[0] = cplx_input1 op cplx_input2;                                \
-                                                                               \
-      pass &= check_results(cplx_out[0], std_out, /*is_device*/ false);        \
-                                                                               \
-      sycl::free(cplx_out, Q);                                                 \
+      cplx_out_acc[0] = cplx_input1 op cplx_input2;                            \
                                                                                \
+      pass &= check_results(cplx_out_acc[0], std_out, /*is_device*/ false);    \
       return pass;                                                             \
     }                                                                          \
   };
@@ -53,28 +49,32 @@ test_op(test_div, /);
       experimental::complex<T> cplx_input{init_re1, init_im1};                 \
                                                                                \
       auto std_inout = init_std_complex(init_re2, init_im2);                   \
-      auto *cplx_inout = sycl::malloc_shared<experimental::complex<T>>(1, Q);  \
-      cplx_inout[0].real(init_re2);                                            \
-      cplx_inout[0].imag(init_im2);                                            \
-                                                                               \
+      experimental::complex<T> cplx;                                           \
+      cplx.real(init_re2);                                                     \
+      cplx.imag(init_im2);                                                     \
+      sycl::buffer<experimental::complex<T>> cplx_out_buf{&cplx,               \
+                                                          sycl::range{1}};     \
       std_inout op_assign std_in;                                              \
+      Q.submit([&](sycl::handler &h) {                                         \
+        sycl::accessor cplx_out{cplx_out_buf, h};                              \
+        h.single_task([=]() { cplx_out[0] op_assign cplx_input; });            \
+      });                                                                      \
+      sycl::host_accessor cplx_out_acc{cplx_out_buf};                          \
                                                                                \
-      Q.single_task([=]() { cplx_inout[0] op_assign cplx_input; }).wait();     \
+      pass &=                                                                  \
+          check_results(cplx_out_acc[0],                                       \
+                        std::complex<T>(std_inout.real(), std_inout.imag()),   \
+                        /*is_device*/ true);                                   \
                                                                                \
-      pass &= check_results(                                                   \
-          cplx_inout[0], std::complex<T>(std_inout.real(), std_inout.imag()),  \
-          /*is_device*/ true);                                                 \
+      cplx_out_acc[0].real(init_re2);                                          \
+      cplx_out_acc[0].imag(init_im2);                                          \
                                                                                \
-      cplx_inout[0].real(init_re2);                                            \
-      cplx_inout[0].imag(init_im2);                                            \
+      cplx_out_acc[0] op_assign cplx_input;                                    \
                                                                                \
-      cplx_inout[0] op_assign cplx_input;                                      \
-                                                                               \
-      pass &= check_results(                                                   \
-          cplx_inout[0], std::complex<T>(std_inout.real(), std_inout.imag()),  \
-          /*is_device*/ false);                                                \
-                                                                               \
-      sycl::free(cplx_inout, Q);                                               \
+      pass &=                                                                  \
+          check_results(cplx_out_acc[0],                                       \
+                        std::complex<T>(std_inout.real(), std_inout.imag()),   \
+                        /*is_device*/ false);                                  \
                                                                                \
       return pass;                                                             \
     }                                                                          \
@@ -99,19 +99,18 @@ test_op_assign(test_div_assign, /=);
       experimental::complex<T> cplx_input{init_re1, init_im1};                 \
                                                                                \
       std::complex<T> std_out{};                                               \
-      auto *cplx_out = sycl::malloc_shared<experimental::complex<T>>(1, Q);    \
-                                                                               \
+      sycl::buffer<experimental::complex<T>> cplx_out_buf{sycl::range{1}};     \
       std_out = op std_in;                                                     \
+      Q.submit([&](sycl::handler &h) {                                         \
+        sycl::accessor cplx_out{cplx_out_buf, h};                              \
+        h.single_task([=]() { cplx_out[0] = op cplx_input; });                 \
+      });                                                                      \
+      sycl::host_accessor cplx_out_acc{cplx_out_buf};                          \
+      pass &= check_results(cplx_out_acc[0], std_out, /*is_device*/ true);     \
                                                                                \
-      Q.single_task([=]() { cplx_out[0] = op cplx_input; }).wait();            \
-                                                                               \
-      pass &= check_results(cplx_out[0], std_out, /*is_device*/ true);         \
-                                                                               \
-      cplx_out[0] = op cplx_input;                                             \
-                                                                               \
-      pass &= check_results(cplx_out[0], std_out, /*is_device*/ false);        \
+      cplx_out_acc[0] = op cplx_input;                                         \
                                                                                \
-      sycl::free(cplx_out, Q);                                                 \
+      pass &= check_results(cplx_out_acc[0], std_out, /*is_device*/ false);    \
                                                                                \
       return pass;                                                             \
     }                                                                          \
diff --git a/sycl/test-e2e/Complex/sycl_complex_pow_test.cpp b/sycl/test-e2e/Complex/sycl_complex_pow_test.cpp
index c8555687eb49b..91d58a4cdeebb 100644
--- a/sycl/test-e2e/Complex/sycl_complex_pow_test.cpp
+++ b/sycl/test-e2e/Complex/sycl_complex_pow_test.cpp
@@ -51,26 +51,26 @@ template <typename T> struct test_pow_cplx_cplx {
     experimental::complex<T> cplx_input1{init1.re, init1.im};
     experimental::complex<T> cplx_input2{init2.re, init2.im};
 
-    auto *cplx_out = sycl::malloc_shared<experimental::complex<T>>(1, Q);
-
+    sycl::buffer<experimental::complex<T>> cplx_out_buf{sycl::range{1}};
     // Get std::complex output
     std::complex<T> std_out{ref.re, ref.im};
     if (!use_ref)
       std_out = std::pow(std_in1, std_in2);
 
     // Check cplx::complex output from device
-    Q.single_task([=]() {
-       cplx_out[0] = experimental::pow<T>(cplx_input1, cplx_input2);
-     }).wait();
-
-    pass &= check_results(cplx_out[0], std_out, /*is_device*/ true);
+    Q.submit([&](sycl::handler &h) {
+      sycl::accessor cplx_out{cplx_out_buf, h};
+      h.single_task([=]() {
+        cplx_out[0] = experimental::pow<T>(cplx_input1, cplx_input2);
+      });
+    });
+    sycl::host_accessor cplx_out_acc{cplx_out_buf};
+    pass &= check_results(cplx_out_acc[0], std_out, /*is_device*/ true);
 
     // Check cplx::complex output from host
-    cplx_out[0] = experimental::pow<T>(cplx_input1, cplx_input2);
-
-    pass &= check_results(cplx_out[0], std_out, /*is_device*/ false);
+    cplx_out_acc[0] = experimental::pow<T>(cplx_input1, cplx_input2);
 
-    sycl::free(cplx_out, Q);
+    pass &= check_results(cplx_out_acc[0], std_out, /*is_device*/ false);
 
     return pass;
   }
@@ -87,26 +87,25 @@ template <typename T> struct test_pow_cplx_deci {
     experimental::complex<T> cplx_input{init1.re, init1.im};
     T deci_input = init2.re;
 
-    auto *cplx_out = sycl::malloc_shared<experimental::complex<T>>(1, Q);
-
+    sycl::buffer<experimental::complex<T>> cplx_out_buf{sycl::range{1}};
     // Get std::complex output
     std::complex<T> std_out{ref.re, ref.im};
     if (!use_ref)
       std_out = std::pow(std_in, std_deci_in);
 
     // Check cplx::complex output from device
-    Q.single_task([=]() {
-       cplx_out[0] = experimental::pow(cplx_input, deci_input);
-     }).wait();
-
-    pass &= check_results(cplx_out[0], std_out, /*is_device*/ true);
+    Q.submit([&](sycl::handler &h) {
+      sycl::accessor cplx_out{cplx_out_buf, h};
+      h.single_task(
+          [=]() { cplx_out[0] = experimental::pow(cplx_input, deci_input); });
+    });
+    sycl::host_accessor cplx_out_acc{cplx_out_buf};
+    pass &= check_results(cplx_out_acc[0], std_out, /*is_device*/ true);
 
     // Check cplx::complex output from host
-    cplx_out[0] = experimental::pow(cplx_input, deci_input);
+    cplx_out_acc[0] = experimental::pow(cplx_input, deci_input);
 
-    pass &= check_results(cplx_out[0], std_out, /*is_device*/ false);
-
-    sycl::free(cplx_out, Q);
+    pass &= check_results(cplx_out_acc[0], std_out, /*is_device*/ false);
 
     return pass;
   }
@@ -123,26 +122,24 @@ template <typename T> struct test_pow_deci_cplx {
     experimental::complex<T> cplx_input{init2.re, init2.im};
     T deci_input = init1.re;
 
-    auto *cplx_out = sycl::malloc_shared<experimental::complex<T>>(1, Q);
-
+    sycl::buffer<experimental::complex<T>> cplx_out_buf{sycl::range{1}};
     // Get std::complex output
     std::complex<T> std_out{ref.re, ref.im};
     if (!use_ref)
       std_out = std::pow(std_deci_in, std_in);
 
     // Check cplx::complex output from device
-    Q.single_task([=]() {
-       cplx_out[0] = experimental::pow(deci_input, cplx_input);
-     }).wait();
-
-    pass &= check_results(cplx_out[0], std_out, /*is_device*/ true);
-
+    Q.submit([&](sycl::handler &h) {
+      sycl::accessor cplx_out{cplx_out_buf, h};
+      h.single_task(
+          [=]() { cplx_out[0] = experimental::pow(deci_input, cplx_input); });
+    });
+    sycl::host_accessor cplx_out_acc{cplx_out_buf};
+    pass &= check_results(cplx_out_acc[0], std_out, /*is_device*/ true);
     // Check cplx::complex output from host
-    cplx_out[0] = experimental::pow(deci_input, cplx_input);
-
-    pass &= check_results(cplx_out[0], std_out, /*is_device*/ false);
+    cplx_out_acc[0] = experimental::pow(deci_input, cplx_input);
 
-    sycl::free(cplx_out, Q);
+    pass &= check_results(cplx_out_acc[0], std_out, /*is_device*/ false);
 
     return pass;
   }
diff --git a/sycl/test-e2e/Complex/sycl_complex_stream_test.cpp b/sycl/test-e2e/Complex/sycl_complex_stream_test.cpp
index b9cb8c519366d..d8d645f6ac7dc 100644
--- a/sycl/test-e2e/Complex/sycl_complex_stream_test.cpp
+++ b/sycl/test-e2e/Complex/sycl_complex_stream_test.cpp
@@ -7,17 +7,15 @@
 
 template <typename T> struct test_sycl_stream_operator {
   bool operator()(sycl::queue &Q, cmplx<T> init) {
-    auto *cplx_out = sycl::malloc_shared<experimental::complex<T>>(1, Q);
-    cplx_out[0] = experimental::complex<T>(init.re, init.im);
-
+    experimental::complex<T> cplx(init.re, init.im);
+    sycl::buffer<experimental::complex<T>> cplx_out_buf{&cplx, sycl::range{1}};
     Q.submit([&](sycl::handler &CGH) {
+       sycl::accessor cplx_out{cplx_out_buf, CGH};
        sycl::stream Out(512, 20, CGH);
        CGH.parallel_for<>(sycl::range<1>(1), [=](sycl::id<1> idx) {
          Out << cplx_out[idx] << sycl::endl;
        });
      }).wait();
-
-    sycl::free(cplx_out, Q);
     return true;
   }
 };

From 96073b9ead9ac27cd4a63e7e00e65de2d39b895c Mon Sep 17 00:00:00 2001
From: Steffen Larsen <steffen.larsen@intel.com>
Date: Thu, 8 Feb 2024 11:37:22 +0100
Subject: [PATCH 08/26] [SYCL] Change vec operators to be friends (#12396)

This commit changes operators for sycl::vec to be defined like they are
in the SYCL specification, i.e. friend functions instead of members.

---------

Signed-off-by: Larsen, Steffen <steffen.larsen@intel.com>
---
 sycl/include/sycl/types.hpp                   | 451 ++++++++++++------
 .../Basic/vector/vec_binary_scalar_order.cpp  | 150 ++++++
 sycl/test/basic_tests/types.cpp               |   5 +-
 3 files changed, 464 insertions(+), 142 deletions(-)
 create mode 100644 sycl/test-e2e/Basic/vector/vec_binary_scalar_order.cpp

diff --git a/sycl/include/sycl/types.hpp b/sycl/include/sycl/types.hpp
index a2f934a147c33..c12b9e6781eb9 100644
--- a/sycl/include/sycl/types.hpp
+++ b/sycl/include/sycl/types.hpp
@@ -987,6 +987,41 @@ template <typename Type, int NumElements> class vec {
 #endif
 
 #ifdef __SYCL_USE_EXT_VECTOR_TYPE__
+#if defined(__INTEL_PREVIEW_BREAKING_CHANGES)
+#define __SYCL_BINOP(BINOP, OPASSIGN, CONVERT)                                 \
+  friend vec operator BINOP(const vec &Lhs, const vec &Rhs) {                  \
+    vec Ret;                                                                   \
+    if constexpr (IsUsingArrayOnDevice) {                                      \
+      for (size_t I = 0; I < NumElements; ++I) {                               \
+        Ret.setValue(I, (Lhs.getValue(I) BINOP Rhs.getValue(I)));              \
+      }                                                                        \
+    } else {                                                                   \
+      Ret.m_Data = Lhs.m_Data BINOP Rhs.m_Data;                                \
+      if constexpr (std::is_same_v<Type, bool> && CONVERT) {                   \
+        Ret.ConvertToDataT();                                                  \
+      }                                                                        \
+    }                                                                          \
+    return Ret;                                                                \
+  }                                                                            \
+  friend vec operator BINOP(const vec &Lhs, const DataT &Rhs) {                \
+    return Lhs BINOP vec(Rhs);                                                 \
+  }                                                                            \
+  friend vec operator BINOP(const DataT &Lhs, const vec &Rhs) {                \
+    return vec(Lhs) BINOP Rhs;                                                 \
+  }                                                                            \
+  friend vec &operator OPASSIGN(vec & Lhs, const vec & Rhs) {                  \
+    Lhs = Lhs BINOP Rhs;                                                       \
+    return Lhs;                                                                \
+  }                                                                            \
+  template <int Num = NumElements>                                             \
+  friend typename std::enable_if_t<Num != 1, vec &> operator OPASSIGN(         \
+      vec & Lhs, const DataT & Rhs) {                                          \
+    Lhs = Lhs BINOP vec(Rhs);                                                  \
+    return Lhs;                                                                \
+  }
+#endif // defined(__INTEL_PREVIEW_BREAKING_CHANGES)
+
+#if !defined(__INTEL_PREVIEW_BREAKING_CHANGES)
 #define __SYCL_BINOP(BINOP, OPASSIGN, CONVERT)                                 \
   template <typename Ty = vec>                                                 \
   vec operator BINOP(const EnableIfNotUsingArrayOnDevice<Ty> &Rhs) const {     \
@@ -1024,38 +1059,37 @@ template <typename Type, int NumElements> class vec {
     *this = *this BINOP vec(Rhs);                                              \
     return *this;                                                              \
   }
+#endif // !defined(__INTEL_PREVIEW_BREAKING_CHANGES)
+
 #else // __SYCL_USE_EXT_VECTOR_TYPE__
 
 #if defined(__INTEL_PREVIEW_BREAKING_CHANGES)
 #define __SYCL_BINOP(BINOP, OPASSIGN, CONVERT)                                 \
-  vec operator BINOP(const vec &Rhs) const {                                   \
+  friend vec operator BINOP(const vec &Lhs, const vec &Rhs) {                  \
     vec Ret{};                                                                 \
     if constexpr (NativeVec)                                                   \
-      Ret.m_Data = m_Data BINOP Rhs.m_Data;                                    \
+      Ret.m_Data = Lhs.m_Data BINOP Rhs.m_Data;                                \
     else                                                                       \
       for (size_t I = 0; I < NumElements; ++I)                                 \
-        Ret.setValue(I, (DataT)(vec_data<DataT>::get(getValue(                 \
+        Ret.setValue(I, (DataT)(vec_data<DataT>::get(Lhs.getValue(             \
                             I)) BINOP vec_data<DataT>::get(Rhs.getValue(I)))); \
     return Ret;                                                                \
   }                                                                            \
-  template <typename T>                                                        \
-  typename std::enable_if_t<                                                   \
-      std::is_convertible_v<DataT, T> &&                                       \
-          (std::is_fundamental_v<vec_data_t<T>> ||                             \
-           detail::is_half_or_bf16_v<typename std::remove_const_t<T>>),        \
-      vec>                                                                     \
-  operator BINOP(const T & Rhs) const {                                        \
-    return *this BINOP vec(static_cast<const DataT &>(Rhs));                   \
+  friend vec operator BINOP(const vec &Lhs, const DataT &Rhs) {                \
+    return Lhs BINOP vec(Rhs);                                                 \
   }                                                                            \
-  vec &operator OPASSIGN(const vec & Rhs) {                                    \
-    *this = *this BINOP Rhs;                                                   \
-    return *this;                                                              \
+  friend vec operator BINOP(const DataT &Lhs, const vec &Rhs) {                \
+    return vec(Lhs) BINOP Rhs;                                                 \
+  }                                                                            \
+  friend vec &operator OPASSIGN(vec & Lhs, const vec & Rhs) {                  \
+    Lhs = Lhs BINOP Rhs;                                                       \
+    return Lhs;                                                                \
   }                                                                            \
   template <int Num = NumElements>                                             \
-  typename std::enable_if_t<Num != 1, vec &> operator OPASSIGN(                \
-      const DataT & Rhs) {                                                     \
-    *this = *this BINOP vec(Rhs);                                              \
-    return *this;                                                              \
+  friend typename std::enable_if_t<Num != 1, vec &> operator OPASSIGN(         \
+      vec & Lhs, const DataT & Rhs) {                                          \
+    Lhs = Lhs BINOP vec(Rhs);                                                  \
+    return Lhs;                                                                \
   }
 #endif // defined(__INTEL_PREVIEW_BREAKING_CHANGES)
 
@@ -1120,6 +1154,42 @@ template <typename Type, int NumElements> class vec {
 // Use __SYCL_DEVICE_ONLY__ macro because cast to OpenCL vector type is defined
 // by SYCL device compiler only.
 #ifdef __SYCL_DEVICE_ONLY__
+#if defined(__INTEL_PREVIEW_BREAKING_CHANGES)
+#define __SYCL_RELLOGOP(RELLOGOP)                                              \
+  friend vec<rel_t, NumElements> operator RELLOGOP(const vec & Lhs,            \
+                                                   const vec & Rhs) {          \
+    vec<rel_t, NumElements> Ret{};                                             \
+    /* This special case is needed since there are no standard operator||   */ \
+    /* or operator&& functions for std::array.                              */ \
+    if constexpr (IsUsingArrayOnDevice &&                                      \
+                  (std::string_view(#RELLOGOP) == "||" ||                      \
+                   std::string_view(#RELLOGOP) == "&&")) {                     \
+      for (size_t I = 0; I < NumElements; ++I) {                               \
+        /* We cannot use setValue here as the operator is not a friend of*/    \
+        /* Ret on Windows. */                                                  \
+        Ret[I] = static_cast<rel_t>(-(vec_data<DataT>::get(                    \
+            Lhs.getValue(I)) RELLOGOP vec_data<DataT>::get(Rhs.getValue(I)))); \
+      }                                                                        \
+    } else {                                                                   \
+      Ret = vec<rel_t, NumElements>(                                           \
+          (typename vec<rel_t, NumElements>::vector_t)(                        \
+              Lhs.m_Data RELLOGOP Rhs.m_Data));                                \
+      if (NumElements == 1) /*Scalar 0/1 logic was applied, invert*/           \
+        Ret *= -1;                                                             \
+    }                                                                          \
+    return Ret;                                                                \
+  }                                                                            \
+  friend vec<rel_t, NumElements> operator RELLOGOP(const vec & Lhs,            \
+                                                   const DataT & Rhs) {        \
+    return Lhs RELLOGOP vec(Rhs);                                              \
+  }                                                                            \
+  friend vec<rel_t, NumElements> operator RELLOGOP(const DataT & Lhs,          \
+                                                   const vec & Rhs) {          \
+    return vec(Lhs) RELLOGOP Rhs;                                              \
+  }
+#endif // defined(__INTEL_PREVIEW_BREAKING_CHANGES)
+
+#if !defined(__INTEL_PREVIEW_BREAKING_CHANGES)
 #define __SYCL_RELLOGOP(RELLOGOP)                                              \
   vec<rel_t, NumElements> operator RELLOGOP(const vec & Rhs) const {           \
     vec<rel_t, NumElements> Ret{};                                             \
@@ -1129,9 +1199,10 @@ template <typename Type, int NumElements> class vec {
                   (std::string_view(#RELLOGOP) == "||" ||                      \
                    std::string_view(#RELLOGOP) == "&&")) {                     \
       for (size_t I = 0; I < NumElements; ++I) {                               \
-        Ret.setValue(I,                                                        \
-                     -(vec_data<DataT>::get(getValue(I))                       \
-                           RELLOGOP vec_data<DataT>::get(Rhs.getValue(I))));   \
+        /* We cannot use setValue here as the operator is not a friend of*/    \
+        /* Ret on Windows. */                                                  \
+        Ret[I] = static_cast<rel_t>(-(vec_data<DataT>::get(                    \
+            getValue(I)) RELLOGOP vec_data<DataT>::get(Rhs.getValue(I))));     \
       }                                                                        \
     } else {                                                                   \
       Ret = vec<rel_t, NumElements>(                                           \
@@ -1150,13 +1221,38 @@ template <typename Type, int NumElements> class vec {
   operator RELLOGOP(const T & Rhs) const {                                     \
     return *this RELLOGOP vec(static_cast<const DataT &>(Rhs));                \
   }
+#endif // !defined(__INTEL_PREVIEW_BREAKING_CHANGES)
 #else
+#if defined(__INTEL_PREVIEW_BREAKING_CHANGES)
+#define __SYCL_RELLOGOP(RELLOGOP)                                              \
+  friend vec<rel_t, NumElements> operator RELLOGOP(const vec & Lhs,            \
+                                                   const vec & Rhs) {          \
+    vec<rel_t, NumElements> Ret{};                                             \
+    for (size_t I = 0; I < NumElements; ++I) {                                 \
+      /* We cannot use setValue here as the operator is not a friend of*/      \
+      /* Ret on Windows. */                                                    \
+      Ret[I] = static_cast<rel_t>(-(vec_data<DataT>::get(                      \
+          Lhs.getValue(I)) RELLOGOP vec_data<DataT>::get(Rhs.getValue(I))));   \
+    }                                                                          \
+    return Ret;                                                                \
+  }                                                                            \
+  friend vec<rel_t, NumElements> operator RELLOGOP(const vec & Lhs,            \
+                                                   const DataT & Rhs) {        \
+    return Lhs RELLOGOP vec(Rhs);                                              \
+  }                                                                            \
+  friend vec<rel_t, NumElements> operator RELLOGOP(const DataT & Lhs,          \
+                                                   const vec & Rhs) {          \
+    return vec(Lhs) RELLOGOP Rhs;                                              \
+  }
+#else // defined(__INTEL_PREVIEW_BREAKING_CHANGES)
 #define __SYCL_RELLOGOP(RELLOGOP)                                              \
   vec<rel_t, NumElements> operator RELLOGOP(const vec & Rhs) const {           \
     vec<rel_t, NumElements> Ret{};                                             \
     for (size_t I = 0; I < NumElements; ++I) {                                 \
-      Ret.setValue(I, -(vec_data<DataT>::get(getValue(I))                      \
-                            RELLOGOP vec_data<DataT>::get(Rhs.getValue(I))));  \
+      /* We cannot use setValue here as the operator is not a friend of*/      \
+      /* Ret on Windows. */                                                    \
+      Ret[I] = static_cast<rel_t>(-(vec_data<DataT>::get(                      \
+          getValue(I)) RELLOGOP vec_data<DataT>::get(Rhs.getValue(I))));       \
     }                                                                          \
     return Ret;                                                                \
   }                                                                            \
@@ -1168,6 +1264,7 @@ template <typename Type, int NumElements> class vec {
   operator RELLOGOP(const T & Rhs) const {                                     \
     return *this RELLOGOP vec(static_cast<const DataT &>(Rhs));                \
   }
+#endif // defined(__INTEL_PREVIEW_BREAKING_CHANGES)
 #endif
 
   __SYCL_RELLOGOP(==)
@@ -1184,6 +1281,18 @@ template <typename Type, int NumElements> class vec {
 #ifdef __SYCL_UOP
 #error "Undefine __SYCL_UOP macro"
 #endif
+#if defined(__INTEL_PREVIEW_BREAKING_CHANGES)
+#define __SYCL_UOP(UOP, OPASSIGN)                                              \
+  friend vec &operator UOP(vec & Rhs) {                                        \
+    Rhs OPASSIGN vec_data<DataT>::get(1);                                      \
+    return Rhs;                                                                \
+  }                                                                            \
+  friend vec operator UOP(vec &Lhs, int) {                                     \
+    vec Ret(Lhs);                                                              \
+    Lhs OPASSIGN vec_data<DataT>::get(1);                                      \
+    return Ret;                                                                \
+  }
+#else // defined(__INTEL_PREVIEW_BREAKING_CHANGES)
 #define __SYCL_UOP(UOP, OPASSIGN)                                              \
   vec &operator UOP() {                                                        \
     *this OPASSIGN vec_data<DataT>::get(1);                                    \
@@ -1194,6 +1303,7 @@ template <typename Type, int NumElements> class vec {
     *this OPASSIGN vec_data<DataT>::get(1);                                    \
     return Ret;                                                                \
   }
+#endif // defined(__INTEL_PREVIEW_BREAKING_CHANGES)
 
   __SYCL_UOP(++, +=)
   __SYCL_UOP(--, -=)
@@ -1203,150 +1313,84 @@ template <typename Type, int NumElements> class vec {
 
   // operator~() available only when: dataT != float && dataT != double
   // && dataT != half
-  template <typename T = DataT>
-  typename std::enable_if_t<!std::is_floating_point_v<vec_data_t<T>> &&
-                                (!IsUsingArrayOnDevice && !IsUsingArrayOnHost),
-                            vec>
-  operator~() const {
-    vec Ret{(typename vec::DataType) ~m_Data};
-    if constexpr (std::is_same_v<Type, bool>) {
-      Ret.ConvertToDataT();
-    }
-    return Ret;
-  }
-  template <typename T = DataT>
-  typename std::enable_if_t<!std::is_floating_point_v<vec_data_t<T>> &&
-                                (IsUsingArrayOnDevice || IsUsingArrayOnHost),
-                            vec>
-  operator~() const {
-    vec Ret{};
-    for (size_t I = 0; I < NumElements; ++I) {
-      Ret.setValue(I, ~getValue(I));
+  friend vec operator~(const vec &Rhs) {
+    if constexpr (IsUsingArrayOnDevice || IsUsingArrayOnHost) {
+      vec Ret{};
+      for (size_t I = 0; I < NumElements; ++I) {
+        Ret.setValue(I, ~Rhs.getValue(I));
+      }
+      return Ret;
+    } else {
+      vec Ret{(typename vec::DataType) ~Rhs.m_Data};
+      if constexpr (std::is_same_v<Type, bool>) {
+        Ret.ConvertToDataT();
+      }
+      return Ret;
     }
-    return Ret;
   }
 
-  template <typename T>
-#if defined(__INTEL_PREVIEW_BREAKING_CHANGES)
-  using OpNotRet = detail::rel_t<T>;
-#else
-  using OpNotRet = T;
-#endif // __INTEL_PREVIEW_BREAKING_CHANGES
-
   // operator!
-  template <typename T = DataT, int N = NumElements>
-  EnableIfNotUsingArray<vec<OpNotRet<T>, N>> operator!() const {
-    return vec<T, N>{(typename vec<DataT, NumElements>::DataType) !m_Data}
-#if defined(__INTEL_PREVIEW_BREAKING_CHANGES)
-        .template as<vec<OpNotRet<T>, N>>();
-#else
-    ;
-#endif // __INTEL_PREVIEW_BREAKING_CHANGES
-  }
-
-  // std::byte neither supports ! unary op or casting, so special handling is
-  // needed. And, worse, Windows has a conflict with 'byte'.
+  friend vec<detail::rel_t<DataT>, NumElements> operator!(const vec &Rhs) {
+    if constexpr (IsUsingArrayOnDevice || IsUsingArrayOnHost) {
+      vec Ret{};
+      for (size_t I = 0; I < NumElements; ++I) {
 #if (!defined(_HAS_STD_BYTE) || _HAS_STD_BYTE != 0)
-  template <typename T = DataT, int N = NumElements>
-  typename std::enable_if_t<std::is_same_v<std::byte, T> &&
-                                (IsUsingArrayOnDevice || IsUsingArrayOnHost),
-                            vec<OpNotRet<T>, N>>
-  operator!() const {
-    vec Ret{};
-    for (size_t I = 0; I < NumElements; ++I) {
-      Ret.setValue(I, std::byte{!vec_data<DataT>::get(getValue(I))});
+        // std::byte supports neither unary operator! nor casting, so special
+        // handling is needed. And, worse, Windows has a conflict with 'byte'.
+        if constexpr (std::is_same_v<std::byte, DataT>) {
+          Ret.setValue(I, std::byte{!vec_data<DataT>::get(Rhs.getValue(I))});
+        } else
+#endif // (!defined(_HAS_STD_BYTE) || _HAS_STD_BYTE != 0)
+        {
+          Ret.setValue(I, !vec_data<DataT>::get(Rhs.getValue(I)));
+        }
+      }
+      return Ret.template as<vec<detail::rel_t<DataT>, NumElements>>();
+    } else {
+      return vec{(typename vec<DataT, NumElements>::DataType) !Rhs.m_Data}
+          .template as<vec<detail::rel_t<DataT>, NumElements>>();
     }
-#if defined(__INTEL_PREVIEW_BREAKING_CHANGES)
-    return Ret.template as<vec<OpNotRet<T>, N>>();
-#else
-    return Ret;
-#endif // __INTEL_PREVIEW_BREAKING_CHANGES
   }
 
-  template <typename T = DataT, int N = NumElements>
-  typename std::enable_if_t<!std::is_same_v<std::byte, T> &&
-                                (IsUsingArrayOnDevice || IsUsingArrayOnHost),
-                            vec<OpNotRet<T>, N>>
-  operator!() const {
-    vec Ret{};
-    for (size_t I = 0; I < NumElements; ++I)
-      Ret.setValue(I, !vec_data<DataT>::get(getValue(I)));
-#if defined(__INTEL_PREVIEW_BREAKING_CHANGES)
-    return Ret.template as<vec<OpNotRet<T>, N>>();
-#else
-    return Ret;
-#endif // __INTEL_PREVIEW_BREAKING_CHANGES
-  }
-#else
-  template <typename T = DataT, int N = NumElements>
-  EnableIfUsingArray<vec<OpNotRet<T>, N>> operator!() const {
-    vec Ret{};
-    for (size_t I = 0; I < NumElements; ++I)
-      Ret.setValue(I, !vec_data<DataT>::get(getValue(I)));
-#if defined(__INTEL_PREVIEW_BREAKING_CHANGES)
-    return Ret.template as<vec<OpNotRet<T>, N>>();
-#else
-    return Ret;
-#endif // __INTEL_PREVIEW_BREAKING_CHANGES
-  }
-#endif
-
   // operator +
-  template <typename T = vec> EnableIfNotUsingArray<T> operator+() const {
-    return vec{+m_Data};
-  }
-
-  template <typename T = vec> EnableIfUsingArray<T> operator+() const {
-    vec Ret{};
-    for (size_t I = 0; I < NumElements; ++I)
-      Ret.setValue(I, vec_data<DataT>::get(+vec_data<DataT>::get(getValue(I))));
-    return Ret;
-  }
-
-  // operator -
-  template <typename T = vec> EnableIfNotUsingArray<T> operator-() const {
-    namespace oneapi = sycl::ext::oneapi;
-    if constexpr (IsBfloat16 && NumElements == 1) {
-      vec Ret{};
-      oneapi::bfloat16 v = oneapi::detail::bitsToBfloat16(m_Data);
-      oneapi::bfloat16 w = -v;
-      Ret.m_Data = oneapi::detail::bfloat16ToBits(w);
-    } else if constexpr (IsBfloat16) {
+  friend vec operator+(const vec &Lhs) {
+    if constexpr (IsUsingArrayOnDevice || IsUsingArrayOnHost) {
       vec Ret{};
-      for (size_t I = 0; I < NumElements; ++I) {
-        oneapi::bfloat16 v = oneapi::detail::bitsToBfloat16(m_Data[I]);
-        oneapi::bfloat16 w = -v;
-        Ret.m_Data[I] = oneapi::detail::bfloat16ToBits(w);
-      }
+      for (size_t I = 0; I < NumElements; ++I)
+        Ret.setValue(
+            I, vec_data<DataT>::get(+vec_data<DataT>::get(Lhs.getValue(I))));
       return Ret;
     } else {
-      vec Ret{-m_Data};
-      if constexpr (std::is_same_v<Type, bool>) {
-        Ret.ConvertToDataT();
-      }
-      return Ret;
+      return vec{+Lhs.m_Data};
     }
   }
 
-  template <typename T = vec> EnableIfUsingArray<T> operator-() const {
+  // operator - (unary negation, returns a new vec)
+  friend vec operator-(const vec &Lhs) {
     namespace oneapi = sycl::ext::oneapi;
     vec Ret{};
     if constexpr (IsBfloat16 && NumElements == 1) {
-      oneapi::bfloat16 v = oneapi::detail::bitsToBfloat16(m_Data);
+      oneapi::bfloat16 v = oneapi::detail::bitsToBfloat16(Lhs.m_Data);
       oneapi::bfloat16 w = -v;
       Ret.m_Data = oneapi::detail::bfloat16ToBits(w);
     } else if constexpr (IsBfloat16) {
       for (size_t I = 0; I < NumElements; I++) {
-        oneapi::bfloat16 v = oneapi::detail::bitsToBfloat16(m_Data[I]);
+        oneapi::bfloat16 v = oneapi::detail::bitsToBfloat16(Lhs.m_Data[I]);
         oneapi::bfloat16 w = -v;
         Ret.m_Data[I] = oneapi::detail::bfloat16ToBits(w);
       }
-    } else {
+    } else if constexpr (IsUsingArrayOnDevice || IsUsingArrayOnHost) {
       for (size_t I = 0; I < NumElements; ++I)
-        Ret.setValue(I,
-                     vec_data<DataT>::get(-vec_data<DataT>::get(getValue(I))));
+        Ret.setValue(
+            I, vec_data<DataT>::get(-vec_data<DataT>::get(Lhs.getValue(I))));
+    } else {
+      Ret = vec{-Lhs.m_Data};
+      if constexpr (std::is_same_v<Type, bool>) {
+        Ret.ConvertToDataT();
+      }
     }
+    // Single exit: the bfloat16 branches above must return their result too.
     return Ret;
   }
 #endif // defined(__INTEL_PREVIEW_BREAKING_CHANGES)
 
@@ -1770,6 +1814,19 @@ class SwizzleOp {
 #ifdef __SYCL_OPASSIGN
 #error "Undefine __SYCL_OPASSIGN macro."
 #endif
+#if defined(__INTEL_PREVIEW_BREAKING_CHANGES)
+#define __SYCL_OPASSIGN(OPASSIGN, OP)                                          \
+  friend SwizzleOp &operator OPASSIGN(SwizzleOp & Lhs, const DataT & Rhs) {    \
+    Lhs.operatorHelper<OP>(vec_t(Rhs));                                        \
+    return Lhs;                                                                \
+  }                                                                            \
+  template <typename RhsOperation>                                             \
+  friend SwizzleOp &operator OPASSIGN(SwizzleOp & Lhs,                         \
+                                      const RhsOperation & Rhs) {              \
+    Lhs.operatorHelper<OP>(Rhs);                                               \
+    return Lhs;                                                                \
+  }
+#else // defined(__INTEL_PREVIEW_BREAKING_CHANGES)
 #define __SYCL_OPASSIGN(OPASSIGN, OP)                                          \
   SwizzleOp &operator OPASSIGN(const DataT & Rhs) {                            \
     operatorHelper<OP>(vec_t(Rhs));                                            \
@@ -1780,6 +1837,7 @@ class SwizzleOp {
     operatorHelper<OP>(Rhs);                                                   \
     return *this;                                                              \
   }
+#endif // defined(__INTEL_PREVIEW_BREAKING_CHANGES)
 
   __SYCL_OPASSIGN(+=, std::plus)
   __SYCL_OPASSIGN(-=, std::minus)
@@ -1796,6 +1854,18 @@ class SwizzleOp {
 #ifdef __SYCL_UOP
 #error "Undefine __SYCL_UOP macro"
 #endif
+#if defined(__INTEL_PREVIEW_BREAKING_CHANGES)
+#define __SYCL_UOP(UOP, OPASSIGN)                                              \
+  friend SwizzleOp &operator UOP(SwizzleOp & Rhs) {                            \
+    Rhs OPASSIGN static_cast<DataT>(1);                                        \
+    return Rhs;                                                                \
+  }                                                                            \
+  friend vec_t operator UOP(SwizzleOp &Lhs, int) {                             \
+    vec_t Ret = Lhs;                                                           \
+    Lhs OPASSIGN static_cast<DataT>(1);                                        \
+    return Ret;                                                                \
+  }
+#else // defined(__INTEL_PREVIEW_BREAKING_CHANGES)
 #define __SYCL_UOP(UOP, OPASSIGN)                                              \
   SwizzleOp &operator UOP() {                                                  \
     *this OPASSIGN static_cast<DataT>(1);                                      \
@@ -1806,11 +1876,36 @@ class SwizzleOp {
     *this OPASSIGN static_cast<DataT>(1);                                      \
     return Ret;                                                                \
   }
+#endif // defined(__INTEL_PREVIEW_BREAKING_CHANGES)
 
   __SYCL_UOP(++, +=)
   __SYCL_UOP(--, -=)
 #undef __SYCL_UOP
 
+#if defined(__INTEL_PREVIEW_BREAKING_CHANGES)
+  template <typename T = DataT>
+  friend typename std::enable_if_t<
+      std::is_same_v<T, DataT> && std::is_integral_v<vec_data_t<T>>, vec_t>
+  operator~(const SwizzleOp &Rhs) {
+    vec_t Tmp = Rhs;
+    return ~Tmp;
+  }
+
+  friend vec_rel_t operator!(const SwizzleOp &Rhs) {
+    vec_t Tmp = Rhs;
+    return !Tmp;
+  }
+
+  friend vec_t operator+(const SwizzleOp &Rhs) {
+    vec_t Tmp = Rhs;
+    return +Tmp;
+  }
+
+  friend vec_t operator-(const SwizzleOp &Rhs) {
+    vec_t Tmp = Rhs;
+    return -Tmp;
+  }
+#else  // defined(__INTEL_PREVIEW_BREAKING_CHANGES)
   template <typename T = DataT>
   typename std::enable_if_t<std::is_integral_v<vec_data_t<T>>, vec_t>
   operator~() {
@@ -1832,6 +1927,80 @@ class SwizzleOp {
     vec_t Tmp = *this;
     return -Tmp;
   }
+#endif // defined(__INTEL_PREVIEW_BREAKING_CHANGES)
+
+#if defined(__INTEL_PREVIEW_BREAKING_CHANGES)
+// Friend binary operators mixing a SwizzleOp with a scalar or a vec<>:
+// scalar BINOP SwizzleOp, SwizzleOp BINOP scalar,
+// vec<> BINOP SwizzleOp, SwizzleOp BINOP vec<>
+#ifdef __SYCL_BINOP
+#error "Undefine __SYCL_BINOP macro"
+#endif
+#define __SYCL_BINOP(BINOP)                                                    \
+  friend vec_t operator BINOP(const DataT &Lhs, const SwizzleOp &Rhs) {        \
+    vec_t Tmp = Rhs;                                                           \
+    return Lhs BINOP Tmp;                                                      \
+  }                                                                            \
+  friend vec_t operator BINOP(const SwizzleOp &Lhs, const DataT &Rhs) {        \
+    vec_t Tmp = Lhs;                                                           \
+    return Tmp BINOP Rhs;                                                      \
+  }                                                                            \
+  friend vec_t operator BINOP(const vec_t &Lhs, const SwizzleOp &Rhs) {        \
+    vec_t Tmp = Rhs;                                                           \
+    return Lhs BINOP Tmp;                                                      \
+  }                                                                            \
+  friend vec_t operator BINOP(const SwizzleOp &Lhs, const vec_t &Rhs) {        \
+    vec_t Tmp = Lhs;                                                           \
+    return Tmp BINOP Rhs;                                                      \
+  }
+
+  __SYCL_BINOP(+)
+  __SYCL_BINOP(-)
+  __SYCL_BINOP(*)
+  __SYCL_BINOP(/)
+  __SYCL_BINOP(%)
+  __SYCL_BINOP(&)
+  __SYCL_BINOP(|)
+  __SYCL_BINOP(^)
+  __SYCL_BINOP(>>)
+  __SYCL_BINOP(<<)
+#undef __SYCL_BINOP
+
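+// Friend relational/logical operators mixing a SwizzleOp with a scalar or vec<>:
+// scalar RELLOGOP SwizzleOp, SwizzleOp RELLOGOP scalar,
+// vec<> RELLOGOP SwizzleOp, SwizzleOp RELLOGOP vec<>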
+#ifdef __SYCL_RELLOGOP
+#error "Undefine __SYCL_RELLOGOP macro"
+#endif
+#define __SYCL_RELLOGOP(RELLOGOP)                                              \
+  friend vec_rel_t operator RELLOGOP(const DataT &Lhs, const SwizzleOp &Rhs) { \
+    vec_t Tmp = Rhs;                                                           \
+    return Lhs RELLOGOP Tmp;                                                   \
+  }                                                                            \
+  friend vec_rel_t operator RELLOGOP(const SwizzleOp &Lhs, const DataT &Rhs) { \
+    vec_t Tmp = Lhs;                                                           \
+    return Tmp RELLOGOP Rhs;                                                   \
+  }                                                                            \
+  friend vec_rel_t operator RELLOGOP(const vec_t &Lhs, const SwizzleOp &Rhs) { \
+    vec_t Tmp = Rhs;                                                           \
+    return Lhs RELLOGOP Tmp;                                                   \
+  }                                                                            \
+  friend vec_rel_t operator RELLOGOP(const SwizzleOp &Lhs, const vec_t &Rhs) { \
+    vec_t Tmp = Lhs;                                                           \
+    return Tmp RELLOGOP Rhs;                                                   \
+  }
+
+  __SYCL_RELLOGOP(==)
+  __SYCL_RELLOGOP(!=)
+  __SYCL_RELLOGOP(>)
+  __SYCL_RELLOGOP(<)
+  __SYCL_RELLOGOP(>=)
+  __SYCL_RELLOGOP(<=)
+  // TODO: limit to integral types.
+  __SYCL_RELLOGOP(&&)
+  __SYCL_RELLOGOP(||)
+#undef __SYCL_RELLOGOP
+#endif // defined(__INTEL_PREVIEW_BREAKING_CHANGES)
 
   template <int IdxNum = getNumElements(),
             typename = EnableIfMultipleIndexes<IdxNum>>
@@ -2265,6 +2434,7 @@ class SwizzleOp {
 };
 } // namespace detail
 
+#if !defined(__INTEL_PREVIEW_BREAKING_CHANGES)
 // scalar BINOP vec<>
 // scalar BINOP SwizzleOp
 // vec<> BINOP SwizzleOp
@@ -2374,6 +2544,7 @@ __SYCL_RELLOGOP(<=)
 __SYCL_RELLOGOP(&&)
 __SYCL_RELLOGOP(||)
 #undef __SYCL_RELLOGOP
+#endif // !defined(__INTEL_PREVIEW_BREAKING_CHANGES)
 
 namespace detail {
 
diff --git a/sycl/test-e2e/Basic/vector/vec_binary_scalar_order.cpp b/sycl/test-e2e/Basic/vector/vec_binary_scalar_order.cpp
new file mode 100644
index 0000000000000..158b52ab5f27e
--- /dev/null
+++ b/sycl/test-e2e/Basic/vector/vec_binary_scalar_order.cpp
@@ -0,0 +1,150 @@
+// REQUIRES: preview-breaking-changes-supported
+// RUN: %{build} -fpreview-breaking-changes -o %t.out
+// RUN: %{run} %t.out
+
+// This test currently fails on AMD HIP due to an unresolved memcmp function.
+// XFAIL: hip_amd
+
+// Checks scalar/vec operator ordering.
+
+#include <sycl.hpp>
+
+template <typename T>
+using rel_t = std::conditional_t<
+    sizeof(T) == 1, int8_t,
+    std::conditional_t<
+        sizeof(T) == 2, int16_t,
+        std::conditional_t<sizeof(T) == 4, int32_t,
+                           std::conditional_t<sizeof(T) == 8, int64_t, void>>>>;
+
+template <bool IsRelOp, typename T1, int N, typename T2>
+bool CheckResult(sycl::vec<T1, N> V, T2 Ref) {
+  // Walks every element of V once and reports failure on the first mismatch.
+  // For relational/logical operators only the boolean representation of each
+  // element is compared against the boolean representation of the scalar;
+  // for every other operator each element must equal the scalar reference.
+  for (size_t Lane = 0; Lane < N; ++Lane) {
+    if constexpr (IsRelOp) {
+      if (static_cast<bool>(V[Lane]) != static_cast<bool>(Ref))
+        return false;
+    } else {
+      if (V[Lane] != Ref)
+        return false;
+    }
+  }
+  return true;
+}
+
+#define CHECK(Q, C, T, N, IS_RELOP, OP)                                        \
+  {                                                                            \
+    using VecT = sycl::vec<T, N>;                                              \
+    using ResT = sycl::vec<std::conditional_t<IS_RELOP, rel_t<T>, T>, N>;      \
+    constexpr T RefVal = 2;                                                    \
+    VecT InVec{static_cast<T>(RefVal)};                                        \
+    {                                                                          \
+      VecT OutVecsDevice[2];                                                   \
+      T OutRefsDevice[2];                                                      \
+      {                                                                        \
+        sycl::buffer<VecT, 1> OutVecsBuff{OutVecsDevice, 2};                   \
+        sycl::buffer<T, 1> OutRefsBuff{OutRefsDevice, 2};                      \
+        Q.submit([&](sycl::handler &CGH) {                                     \
+          sycl::accessor OutVecsAcc{OutVecsBuff, CGH, sycl::read_write};       \
+          sycl::accessor OutRefsAcc{OutRefsBuff, CGH, sycl::read_write};       \
+          CGH.single_task([=]() {                                              \
+            auto OutVec1 = InVec OP RefVal;                                    \
+            auto OutVec2 = RefVal OP InVec;                                    \
+            static_assert(std::is_same_v<decltype(OutVec1), ResT>);            \
+            static_assert(std::is_same_v<decltype(OutVec2), ResT>);            \
+            OutVecsAcc[0] = OutVec1;                                           \
+            OutVecsAcc[1] = OutVec2;                                           \
+            OutRefsAcc[0] = RefVal OP RefVal;                                  \
+            OutRefsAcc[1] = RefVal OP RefVal;                                  \
+          });                                                                  \
+        });                                                                    \
+      }                                                                        \
+      if (!CheckResult<IS_RELOP>(OutVecsDevice[0], OutRefsDevice[0])) {        \
+        std::cout << ("Check of vector " #OP                                   \
+                      " scalar from device failed for " #T " and " #N)         \
+                  << std::endl;                                                \
+        ++C;                                                                   \
+      }                                                                        \
+      if (!CheckResult<IS_RELOP>(OutVecsDevice[1], OutRefsDevice[1])) {        \
+        std::cout << ("Check of scalar " #OP                                   \
+                      " vector from device failed for " #T " and " #N)         \
+                  << std::endl;                                                \
+        ++C;                                                                   \
+      }                                                                        \
+    }                                                                          \
+    {                                                                          \
+      auto OutVec1 = InVec OP RefVal;                                          \
+      auto OutVec2 = RefVal OP InVec;                                          \
+      static_assert(std::is_same_v<decltype(OutVec1), ResT>);                  \
+      static_assert(std::is_same_v<decltype(OutVec2), ResT>);                  \
+      if (!CheckResult<IS_RELOP>(OutVec1, RefVal OP RefVal)) {                 \
+        std::cout << ("Check of vector " #OP                                   \
+                      " scalar from host failed for " #T " and " #N)           \
+                  << std::endl;                                                \
+        ++C;                                                                   \
+      }                                                                        \
+      if (!CheckResult<IS_RELOP>(OutVec2, RefVal OP RefVal)) {                 \
+        std::cout << ("Check of scalar " #OP                                   \
+                      " vector from host failed for " #T " and " #N)           \
+                  << std::endl;                                                \
+        ++C;                                                                   \
+      }                                                                        \
+    }                                                                          \
+  }
+
+#define CHECK_SIZES(Q, C, T, IS_RELOP, OP)                                     \
+  CHECK(Q, C, T, 1, IS_RELOP, OP)                                              \
+  CHECK(Q, C, T, 2, IS_RELOP, OP)                                              \
+  CHECK(Q, C, T, 4, IS_RELOP, OP)                                              \
+  CHECK(Q, C, T, 8, IS_RELOP, OP)                                              \
+  CHECK(Q, C, T, 16, IS_RELOP, OP)
+
+// NOTE: For the sake of compile-time we pick only a few operators per category.
+#define CHECK_SIZES_AND_COMMON_OPS(Q, C, T)                                    \
+  CHECK_SIZES(Q, C, T, false, *)                                               \
+  CHECK_SIZES(Q, C, T, true, &&)                                               \
+  CHECK_SIZES(Q, C, T, true, ==)                                               \
+  CHECK_SIZES(Q, C, T, true, <)                                                \
+  CHECK_SIZES(Q, C, T, true, >=)
+#define CHECK_SIZES_AND_INT_ONLY_OPS(Q, C, T)                                  \
+  CHECK_SIZES(Q, C, T, false, %)                                               \
+  CHECK_SIZES(Q, C, T, false, >>)                                              \
+  CHECK_SIZES(Q, C, T, false, ^)
+
+int main() {
+  sycl::queue Q;
+  int Failures = 0;
+
+  // Check operators on types with requirements if they are supported.
+  if (Q.get_device().has(sycl::aspect::fp16)) {
+    CHECK_SIZES_AND_COMMON_OPS(Q, Failures, sycl::half);
+  }
+  if (Q.get_device().has(sycl::aspect::fp64)) {
+    CHECK_SIZES_AND_COMMON_OPS(Q, Failures, double);
+  }
+
+  // Check all operators without requirements.
+  CHECK_SIZES_AND_COMMON_OPS(Q, Failures, float);
+  CHECK_SIZES_AND_COMMON_OPS(Q, Failures, int8_t);
+  CHECK_SIZES_AND_COMMON_OPS(Q, Failures, int16_t);
+  CHECK_SIZES_AND_COMMON_OPS(Q, Failures, int32_t);
+  CHECK_SIZES_AND_COMMON_OPS(Q, Failures, int64_t);
+  CHECK_SIZES_AND_COMMON_OPS(Q, Failures, uint8_t);
+  CHECK_SIZES_AND_COMMON_OPS(Q, Failures, uint16_t);
+  CHECK_SIZES_AND_COMMON_OPS(Q, Failures, uint32_t);
+  CHECK_SIZES_AND_COMMON_OPS(Q, Failures, uint64_t);
+
+  // Check integer only operators.
+  CHECK_SIZES_AND_INT_ONLY_OPS(Q, Failures, int8_t);
+  CHECK_SIZES_AND_INT_ONLY_OPS(Q, Failures, int16_t);
+  CHECK_SIZES_AND_INT_ONLY_OPS(Q, Failures, int32_t);
+  CHECK_SIZES_AND_INT_ONLY_OPS(Q, Failures, int64_t);
+  CHECK_SIZES_AND_INT_ONLY_OPS(Q, Failures, uint8_t);
+  CHECK_SIZES_AND_INT_ONLY_OPS(Q, Failures, uint16_t);
+  CHECK_SIZES_AND_INT_ONLY_OPS(Q, Failures, uint32_t);
+  CHECK_SIZES_AND_INT_ONLY_OPS(Q, Failures, uint64_t);
+  return Failures != 0; // Avoid 8-bit exit-status wrap-around hiding failures.
+}
diff --git a/sycl/test/basic_tests/types.cpp b/sycl/test/basic_tests/types.cpp
index 6aab1e433c7a7..14a1070567274 100644
--- a/sycl/test/basic_tests/types.cpp
+++ b/sycl/test/basic_tests/types.cpp
@@ -134,12 +134,13 @@ template <typename T, typename Expected> inline void checkVecNotReturnType() {
   using Vector = sycl::vec<T, N>;
 #if defined(__INTEL_PREVIEW_BREAKING_CHANGES)
   using ExpectedVector = sycl::vec<Expected, N>;
+  using OpNotResult = decltype(operator!(std::declval<Vector>()));
 #else
   using ExpectedVector = sycl::vec<T, N>;
-#endif
   using OpNotResult = decltype(std::declval<Vector>().operator!());
+#endif
   static_assert(std::is_same_v<OpNotResult, ExpectedVector>,
-                "Incorrect vec::operator! return type");
+                "Incorrect operator! return type");
 }
 
 // the math built-in testing ensures that the vec binary ops get tested,

From 0eac61876fce2d524f16b9b1d4239a7733f9cbbd Mon Sep 17 00:00:00 2001
From: Dmitry Sidorov <dmitry.sidorov@intel.com>
Date: Thu, 8 Feb 2024 11:59:38 +0100
Subject: [PATCH 09/26] [SYCL][Matrix] Correct Prefetch instruction usage
 (#12623)

Signed-off-by: Sidorov, Dmitry <dmitry.sidorov@intel.com>
---
 sycl/include/CL/__spirv/spirv_ops.hpp                  | 9 +++++----
 sycl/include/sycl/ext/oneapi/matrix/matrix-unified.hpp | 5 +++--
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/sycl/include/CL/__spirv/spirv_ops.hpp b/sycl/include/CL/__spirv/spirv_ops.hpp
index ea1a6580d30e6..9af5b7e75ae38 100644
--- a/sycl/include/CL/__spirv/spirv_ops.hpp
+++ b/sycl/include/CL/__spirv/spirv_ops.hpp
@@ -174,10 +174,11 @@ extern __DPCPP_SYCL_EXTERNAL __spv::__spirv_JointMatrixINTEL<T, R, C, L, S, U> *
 __spirv_VectorInsertDynamic(__spv::__spirv_JointMatrixINTEL<T, R, C, L, S, U> *,
                             Ts val, size_t i);
 
-template <typename T, std::size_t NumRows, std::size_t NumCols>
-extern __DPCPP_SYCL_EXTERNAL void __spirv_JointMatrixPrefetchINTEL(
-    T *Ptr, std::size_t coordX, std::size_t coordY, unsigned int CacheLevel,
-    __spv::MatrixLayout Layout, std::size_t Stride);
+template <typename T>
+extern __DPCPP_SYCL_EXTERNAL void __spirv_CooperativeMatrixPrefetchINTEL(
+    T *Ptr, std::size_t coordX, std::size_t coordY, std::size_t NumRows,
+    std::size_t NumCols, unsigned int CacheLevel, __spv::MatrixLayout Layout,
+    std::size_t Stride);
 
 #ifndef __SPIRV_BUILTIN_DECLARATIONS__
 #error                                                                         \
diff --git a/sycl/include/sycl/ext/oneapi/matrix/matrix-unified.hpp b/sycl/include/sycl/ext/oneapi/matrix/matrix-unified.hpp
index 98aea6f04a48b..a07e9c144ba6a 100644
--- a/sycl/include/sycl/ext/oneapi/matrix/matrix-unified.hpp
+++ b/sycl/include/sycl/ext/oneapi/matrix/matrix-unified.hpp
@@ -524,8 +524,9 @@ joint_matrix_prefetch(Group sg, T *Ptr, size_t stride,
   // Will be removed once SPIRV implementation also uses offsetpointer
   size_t coordX = 0;
   size_t coordY = 0;
-  __spirv_JointMatrixPrefetchINTEL<T, NumRows, NumCols>(
-      Ptr, coordX, coordY, detail::PropertyMetaInfo<decltype(prop)>::value,
+  __spirv_CooperativeMatrixPrefetchINTEL<T>(
+      Ptr, coordX, coordY, NumRows, NumCols,
+      detail::PropertyMetaInfo<decltype(prop)>::value,
       sycl::detail::joint_matrix_layout_to_spv(Layout), stride);
 #endif // defined(__NVPTX__)
 #else

From b1d81d739fda76d7c45f36799676bff2e52e1fb9 Mon Sep 17 00:00:00 2001
From: Maxime France-Pillois <maxime.francepillois@codeplay.com>
Date: Thu, 8 Feb 2024 11:33:22 +0000
Subject: [PATCH 10/26] [SYCL][Graph] Disable USM tests on gpu-intel-dg2
 (#12648)

After PR https://github.com/intel/llvm/pull/12279, the USM tests fail in
post-commit CI on gpu-intel-dg2
(https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479).
Temporarily disable these tests on that device while the bug is investigated.
---
 sycl/test-e2e/Graph/Explicit/add_nodes_after_finalize.cpp   | 4 ++++
 sycl/test-e2e/Graph/Explicit/basic_usm.cpp                  | 4 ++++
 sycl/test-e2e/Graph/Explicit/basic_usm_host.cpp             | 4 ++++
 sycl/test-e2e/Graph/Explicit/basic_usm_mixed.cpp            | 4 ++++
 sycl/test-e2e/Graph/Explicit/basic_usm_shared.cpp           | 4 ++++
 sycl/test-e2e/Graph/Explicit/empty_node.cpp                 | 4 ++++
 sycl/test-e2e/Graph/Explicit/host_task.cpp                  | 4 ++++
 sycl/test-e2e/Graph/Explicit/host_task2.cpp                 | 4 ++++
 sycl/test-e2e/Graph/Explicit/host_task_last.cpp             | 4 ++++
 sycl/test-e2e/Graph/Explicit/host_task_multiple_deps.cpp    | 4 ++++
 sycl/test-e2e/Graph/Explicit/host_task_single.cpp           | 4 ++++
 sycl/test-e2e/Graph/Explicit/host_task_successive.cpp       | 4 ++++
 sycl/test-e2e/Graph/Explicit/memadvise.cpp                  | 5 ++++-
 sycl/test-e2e/Graph/Explicit/multiple_exec_graphs.cpp       | 4 ++++
 sycl/test-e2e/Graph/Explicit/prefetch.cpp                   | 5 ++++-
 sycl/test-e2e/Graph/Explicit/queue_constructor_usm.cpp      | 5 +++++
 sycl/test-e2e/Graph/Explicit/queue_shortcuts.cpp            | 4 ++++
 sycl/test-e2e/Graph/Explicit/repeated_exec.cpp              | 4 ++++
 sycl/test-e2e/Graph/Explicit/sub_graph.cpp                  | 4 ++++
 .../Graph/Explicit/sub_graph_execute_without_parent.cpp     | 4 ++++
 .../Graph/Explicit/sub_graph_multiple_submission.cpp        | 4 ++++
 sycl/test-e2e/Graph/Explicit/sub_graph_nested.cpp           | 4 ++++
 .../test-e2e/Graph/Explicit/sub_graph_two_parent_graphs.cpp | 4 ++++
 sycl/test-e2e/Graph/Explicit/usm_copy.cpp                   | 5 ++++-
 sycl/test-e2e/Graph/Explicit/usm_fill.cpp                   | 4 ++++
 sycl/test-e2e/Graph/Explicit/usm_fill_shared.cpp            | 4 ++++
 sycl/test-e2e/Graph/Explicit/usm_memset.cpp                 | 5 ++++-
 .../Graph/RecordReplay/add_nodes_after_finalize.cpp         | 5 ++++-
 sycl/test-e2e/Graph/RecordReplay/after_use.cpp              | 4 ++++
 sycl/test-e2e/Graph/RecordReplay/barrier_with_work.cpp      | 4 ++++
 sycl/test-e2e/Graph/RecordReplay/basic_usm.cpp              | 4 ++++
 sycl/test-e2e/Graph/RecordReplay/basic_usm_host.cpp         | 4 ++++
 sycl/test-e2e/Graph/RecordReplay/basic_usm_mixed.cpp        | 4 ++++
 sycl/test-e2e/Graph/RecordReplay/basic_usm_shared.cpp       | 4 ++++
 sycl/test-e2e/Graph/RecordReplay/empty_node.cpp             | 4 ++++
 sycl/test-e2e/Graph/RecordReplay/host_task.cpp              | 4 ++++
 sycl/test-e2e/Graph/RecordReplay/host_task2.cpp             | 4 ++++
 sycl/test-e2e/Graph/RecordReplay/host_task_in_order.cpp     | 4 ++++
 sycl/test-e2e/Graph/RecordReplay/host_task_last.cpp         | 4 ++++
 .../test-e2e/Graph/RecordReplay/host_task_multiple_deps.cpp | 4 ++++
 sycl/test-e2e/Graph/RecordReplay/host_task_single.cpp       | 4 ++++
 sycl/test-e2e/Graph/RecordReplay/host_task_successive.cpp   | 4 ++++
 sycl/test-e2e/Graph/RecordReplay/memadvise.cpp              | 5 ++++-
 sycl/test-e2e/Graph/RecordReplay/multiple_exec_graphs.cpp   | 4 ++++
 sycl/test-e2e/Graph/RecordReplay/prefetch.cpp               | 5 ++++-
 sycl/test-e2e/Graph/RecordReplay/queue_constructor_usm.cpp  | 5 +++++
 sycl/test-e2e/Graph/RecordReplay/queue_shortcuts.cpp        | 4 ++++
 sycl/test-e2e/Graph/RecordReplay/repeated_exec.cpp          | 4 ++++
 sycl/test-e2e/Graph/RecordReplay/sub_graph.cpp              | 4 ++++
 .../Graph/RecordReplay/sub_graph_execute_without_parent.cpp | 4 ++++
 .../Graph/RecordReplay/sub_graph_multiple_submission.cpp    | 4 ++++
 sycl/test-e2e/Graph/RecordReplay/sub_graph_nested.cpp       | 4 ++++
 .../Graph/RecordReplay/sub_graph_two_parent_graphs.cpp      | 4 ++++
 sycl/test-e2e/Graph/RecordReplay/usm_copy.cpp               | 6 ++++--
 sycl/test-e2e/Graph/RecordReplay/usm_fill.cpp               | 4 ++++
 sycl/test-e2e/Graph/RecordReplay/usm_fill_shared.cpp        | 4 ++++
 sycl/test-e2e/Graph/RecordReplay/usm_memset.cpp             | 5 ++++-
 sycl/test-e2e/Graph/empty_graph.cpp                         | 4 ++++
 58 files changed, 234 insertions(+), 10 deletions(-)

diff --git a/sycl/test-e2e/Graph/Explicit/add_nodes_after_finalize.cpp b/sycl/test-e2e/Graph/Explicit/add_nodes_after_finalize.cpp
index cf17e3b6c3dba..b39f3b4fc889b 100644
--- a/sycl/test-e2e/Graph/Explicit/add_nodes_after_finalize.cpp
+++ b/sycl/test-e2e/Graph/Explicit/add_nodes_after_finalize.cpp
@@ -5,6 +5,10 @@
 // Extra run to check for immediate-command-list in Level Zero
 // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
 //
+// Post-commit test failed
+// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479
+// Temporarily disable USM based tests while investigating the bug.
+// UNSUPPORTED: gpu-intel-dg2
 
 #define GRAPH_E2E_EXPLICIT
 
diff --git a/sycl/test-e2e/Graph/Explicit/basic_usm.cpp b/sycl/test-e2e/Graph/Explicit/basic_usm.cpp
index 6e76ec42c079e..00940dcd6d68c 100644
--- a/sycl/test-e2e/Graph/Explicit/basic_usm.cpp
+++ b/sycl/test-e2e/Graph/Explicit/basic_usm.cpp
@@ -5,6 +5,10 @@
 // Extra run to check for immediate-command-list in Level Zero
 // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
 //
+// Post-commit test failed
+// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479
+// Temporarily disable the tests while investigating the bug.
+// UNSUPPORTED: gpu-intel-dg2
 
 #define GRAPH_E2E_EXPLICIT
 
diff --git a/sycl/test-e2e/Graph/Explicit/basic_usm_host.cpp b/sycl/test-e2e/Graph/Explicit/basic_usm_host.cpp
index 040ce8c30c38b..fd8fcec4b89d2 100644
--- a/sycl/test-e2e/Graph/Explicit/basic_usm_host.cpp
+++ b/sycl/test-e2e/Graph/Explicit/basic_usm_host.cpp
@@ -5,6 +5,10 @@
 // Extra run to check for immediate-command-list in Level Zero
 // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
 //
+// Post-commit test failed
+// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479
+// Temporarily disable the tests while investigating the bug.
+// UNSUPPORTED: gpu-intel-dg2
 
 #define GRAPH_E2E_EXPLICIT
 
diff --git a/sycl/test-e2e/Graph/Explicit/basic_usm_mixed.cpp b/sycl/test-e2e/Graph/Explicit/basic_usm_mixed.cpp
index bb12ff6006004..5855fc25ed4cc 100644
--- a/sycl/test-e2e/Graph/Explicit/basic_usm_mixed.cpp
+++ b/sycl/test-e2e/Graph/Explicit/basic_usm_mixed.cpp
@@ -5,6 +5,10 @@
 // Extra run to check for immediate-command-list in Level Zero
 // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
 //
+// Post-commit test failed
+// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479
+// Temporarily disable the tests while investigating the bug.
+// UNSUPPORTED: gpu-intel-dg2
 
 #define GRAPH_E2E_EXPLICIT
 
diff --git a/sycl/test-e2e/Graph/Explicit/basic_usm_shared.cpp b/sycl/test-e2e/Graph/Explicit/basic_usm_shared.cpp
index f989a27a1ca39..3be8febbe008e 100644
--- a/sycl/test-e2e/Graph/Explicit/basic_usm_shared.cpp
+++ b/sycl/test-e2e/Graph/Explicit/basic_usm_shared.cpp
@@ -5,6 +5,10 @@
 // Extra run to check for immediate-command-list in Level Zero
 // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
 //
+// Post-commit test failed
+// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479
+// Temporarily disable the tests while investigating the bug.
+// UNSUPPORTED: gpu-intel-dg2
 
 #define GRAPH_E2E_EXPLICIT
 
diff --git a/sycl/test-e2e/Graph/Explicit/empty_node.cpp b/sycl/test-e2e/Graph/Explicit/empty_node.cpp
index 7f7501175899c..301c53aa69795 100644
--- a/sycl/test-e2e/Graph/Explicit/empty_node.cpp
+++ b/sycl/test-e2e/Graph/Explicit/empty_node.cpp
@@ -5,6 +5,10 @@
 // Extra run to check for immediate-command-list in Level Zero
 // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
 //
+// Post-commit test failed
+// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479
+// Temporarily disable USM based tests while investigating the bug.
+// UNSUPPORTED: gpu-intel-dg2
 
 #define GRAPH_E2E_EXPLICIT
 
diff --git a/sycl/test-e2e/Graph/Explicit/host_task.cpp b/sycl/test-e2e/Graph/Explicit/host_task.cpp
index 167219bf4b8ef..62bcd167e484f 100644
--- a/sycl/test-e2e/Graph/Explicit/host_task.cpp
+++ b/sycl/test-e2e/Graph/Explicit/host_task.cpp
@@ -5,6 +5,10 @@
 // Extra run to check for immediate-command-list in Level Zero
 // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
 //
+// Post-commit test failed
+// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479
+// Temporarily disable USM based tests while investigating the bug.
+// UNSUPPORTED: gpu-intel-dg2
 
 #define GRAPH_E2E_EXPLICIT
 
diff --git a/sycl/test-e2e/Graph/Explicit/host_task2.cpp b/sycl/test-e2e/Graph/Explicit/host_task2.cpp
index f7d59f48aac09..78be02a7bc991 100644
--- a/sycl/test-e2e/Graph/Explicit/host_task2.cpp
+++ b/sycl/test-e2e/Graph/Explicit/host_task2.cpp
@@ -5,6 +5,10 @@
 // Extra run to check for immediate-command-list in Level Zero
 // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
 //
+// Post-commit test failed
+// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479
+// Temporarily disable USM based tests while investigating the bug.
+// UNSUPPORTED: gpu-intel-dg2
 
 #define GRAPH_E2E_EXPLICIT
 
diff --git a/sycl/test-e2e/Graph/Explicit/host_task_last.cpp b/sycl/test-e2e/Graph/Explicit/host_task_last.cpp
index aed2916284cc1..26dcf148d6f7d 100644
--- a/sycl/test-e2e/Graph/Explicit/host_task_last.cpp
+++ b/sycl/test-e2e/Graph/Explicit/host_task_last.cpp
@@ -5,6 +5,10 @@
 // Extra run to check for immediate-command-list in Level Zero
 // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
 //
+// Post-commit test failed
+// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479
+// Temporarily disable USM based tests while investigating the bug.
+// UNSUPPORTED: gpu-intel-dg2
 
 #define GRAPH_E2E_EXPLICIT
 
diff --git a/sycl/test-e2e/Graph/Explicit/host_task_multiple_deps.cpp b/sycl/test-e2e/Graph/Explicit/host_task_multiple_deps.cpp
index 3d03b7f6013ad..32c7157d0958c 100644
--- a/sycl/test-e2e/Graph/Explicit/host_task_multiple_deps.cpp
+++ b/sycl/test-e2e/Graph/Explicit/host_task_multiple_deps.cpp
@@ -5,6 +5,10 @@
 // Extra run to check for immediate-command-list in Level Zero
 // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
 //
+// Post-commit test failed
+// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479
+// Temporarily disable USM based tests while investigating the bug.
+// UNSUPPORTED: gpu-intel-dg2
 
 #define GRAPH_E2E_EXPLICIT
 
diff --git a/sycl/test-e2e/Graph/Explicit/host_task_single.cpp b/sycl/test-e2e/Graph/Explicit/host_task_single.cpp
index 6fc8744e04c18..b2236b9a0d91b 100644
--- a/sycl/test-e2e/Graph/Explicit/host_task_single.cpp
+++ b/sycl/test-e2e/Graph/Explicit/host_task_single.cpp
@@ -5,6 +5,10 @@
 // Extra run to check for immediate-command-list in Level Zero
 // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
 //
+// Post-commit test failed
+// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479
+// Temporarily disable USM based tests while investigating the bug.
+// UNSUPPORTED: gpu-intel-dg2
 
 #define GRAPH_E2E_EXPLICIT
 
diff --git a/sycl/test-e2e/Graph/Explicit/host_task_successive.cpp b/sycl/test-e2e/Graph/Explicit/host_task_successive.cpp
index da26b8b182296..2147d075715af 100644
--- a/sycl/test-e2e/Graph/Explicit/host_task_successive.cpp
+++ b/sycl/test-e2e/Graph/Explicit/host_task_successive.cpp
@@ -5,6 +5,10 @@
 // Extra run to check for immediate-command-list in Level Zero
 // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
 //
+// Post-commit test failed
+// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479
+// Temporarily disable USM based tests while investigating the bug.
+// UNSUPPORTED: gpu-intel-dg2
 
 #define GRAPH_E2E_EXPLICIT
 
diff --git a/sycl/test-e2e/Graph/Explicit/memadvise.cpp b/sycl/test-e2e/Graph/Explicit/memadvise.cpp
index 11e0d6296290f..1a8313478ddb5 100644
--- a/sycl/test-e2e/Graph/Explicit/memadvise.cpp
+++ b/sycl/test-e2e/Graph/Explicit/memadvise.cpp
@@ -2,7 +2,10 @@
 // RUN: %if linux && (level_zero || cuda) %{ env SYCL_PI_TRACE=2 %{run} %t.out 2>&1 FileCheck %s %} %else %{ %{run} %t.out %}
 
 // Mem advise command not supported for OpenCL
-// UNSUPPORTED: opencl
+// Post-commit test failed
+// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479
+// Temporarily disable USM based tests while investigating the bug.
+// UNSUPPORTED: opencl, gpu-intel-dg2
 
 // Since Mem advise is only a memory hint that doesn't
 // impact results but only performances, we verify
diff --git a/sycl/test-e2e/Graph/Explicit/multiple_exec_graphs.cpp b/sycl/test-e2e/Graph/Explicit/multiple_exec_graphs.cpp
index a83775b8ecb8a..e9c76ad01113b 100644
--- a/sycl/test-e2e/Graph/Explicit/multiple_exec_graphs.cpp
+++ b/sycl/test-e2e/Graph/Explicit/multiple_exec_graphs.cpp
@@ -5,6 +5,10 @@
 // Extra run to check for immediate-command-list in Level Zero
 // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
 //
+// Post-commit test failed
+// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479
+// Temporarily disable USM based tests while investigating the bug.
+// UNSUPPORTED: gpu-intel-dg2
 
 #define GRAPH_E2E_EXPLICIT
 
diff --git a/sycl/test-e2e/Graph/Explicit/prefetch.cpp b/sycl/test-e2e/Graph/Explicit/prefetch.cpp
index 8e0e6e15c292d..4b670ebc89357 100644
--- a/sycl/test-e2e/Graph/Explicit/prefetch.cpp
+++ b/sycl/test-e2e/Graph/Explicit/prefetch.cpp
@@ -2,7 +2,10 @@
 // RUN: %if linux && (level_zero || cuda) %{ env SYCL_PI_TRACE=2 %{run} %t.out 2>&1 FileCheck %s %} %else %{ %{run} %t.out %}
 
 // prefetch command not supported for OpenCL
-// UNSUPPORTED: opencl
+// Post-commit test failed
+// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479
+// Temporarily disable USM based tests while investigating the bug.
+// UNSUPPORTED: opencl, gpu-intel-dg2
 
 // Since Prefetch is only a memory hint that doesn't
 // impact results but only performances, we verify
diff --git a/sycl/test-e2e/Graph/Explicit/queue_constructor_usm.cpp b/sycl/test-e2e/Graph/Explicit/queue_constructor_usm.cpp
index a4f3710eea9f0..0760b5805e6d9 100644
--- a/sycl/test-e2e/Graph/Explicit/queue_constructor_usm.cpp
+++ b/sycl/test-e2e/Graph/Explicit/queue_constructor_usm.cpp
@@ -4,6 +4,11 @@
 // RUN: %if level_zero %{env UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s %}
 //
 // CHECK-NOT: LEAK
+//
+// Post-commit test failed
+// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479
+// Temporarily disable the tests while investigating the bug.
+// UNSUPPORTED: gpu-intel-dg2
 
 #define GRAPH_E2E_EXPLICIT
 
diff --git a/sycl/test-e2e/Graph/Explicit/queue_shortcuts.cpp b/sycl/test-e2e/Graph/Explicit/queue_shortcuts.cpp
index a2589cda54320..85d3b106bc3cd 100644
--- a/sycl/test-e2e/Graph/Explicit/queue_shortcuts.cpp
+++ b/sycl/test-e2e/Graph/Explicit/queue_shortcuts.cpp
@@ -5,6 +5,10 @@
 // Extra run to check for immediate-command-list in Level Zero
 // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
 //
+// Post-commit test failed
+// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479
+// Temporarily disable USM based tests while investigating the bug.
+// UNSUPPORTED: gpu-intel-dg2
 
 #define GRAPH_E2E_EXPLICIT
 
diff --git a/sycl/test-e2e/Graph/Explicit/repeated_exec.cpp b/sycl/test-e2e/Graph/Explicit/repeated_exec.cpp
index 8b1f6ba3d4455..935f431b09332 100644
--- a/sycl/test-e2e/Graph/Explicit/repeated_exec.cpp
+++ b/sycl/test-e2e/Graph/Explicit/repeated_exec.cpp
@@ -5,6 +5,10 @@
 // Extra run to check for immediate-command-list in Level Zero
 // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
 //
+// Post-commit test failed
+// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479
+// Temporarily disable USM based tests while investigating the bug.
+// UNSUPPORTED: gpu-intel-dg2
 
 #define GRAPH_E2E_EXPLICIT
 
diff --git a/sycl/test-e2e/Graph/Explicit/sub_graph.cpp b/sycl/test-e2e/Graph/Explicit/sub_graph.cpp
index 830cc5a42a668..c60aa2f63e34f 100644
--- a/sycl/test-e2e/Graph/Explicit/sub_graph.cpp
+++ b/sycl/test-e2e/Graph/Explicit/sub_graph.cpp
@@ -5,6 +5,10 @@
 // Extra run to check for immediate-command-list in Level Zero
 // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
 //
+// Post-commit test failed
+// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479
+// Temporarily disable the tests while investigating the bug.
+// UNSUPPORTED: gpu-intel-dg2
 
 #define GRAPH_E2E_EXPLICIT
 
diff --git a/sycl/test-e2e/Graph/Explicit/sub_graph_execute_without_parent.cpp b/sycl/test-e2e/Graph/Explicit/sub_graph_execute_without_parent.cpp
index bc3cb1c8f352d..772c07599300f 100644
--- a/sycl/test-e2e/Graph/Explicit/sub_graph_execute_without_parent.cpp
+++ b/sycl/test-e2e/Graph/Explicit/sub_graph_execute_without_parent.cpp
@@ -5,6 +5,10 @@
 // Extra run to check for immediate-command-list in Level Zero
 // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
 //
+// Post-commit test failed
+// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479
+// Temporarily disable USM based tests while investigating the bug.
+// UNSUPPORTED: gpu-intel-dg2
 
 #define GRAPH_E2E_EXPLICIT
 
diff --git a/sycl/test-e2e/Graph/Explicit/sub_graph_multiple_submission.cpp b/sycl/test-e2e/Graph/Explicit/sub_graph_multiple_submission.cpp
index 92add086ee20d..17cdde9b87956 100644
--- a/sycl/test-e2e/Graph/Explicit/sub_graph_multiple_submission.cpp
+++ b/sycl/test-e2e/Graph/Explicit/sub_graph_multiple_submission.cpp
@@ -5,6 +5,10 @@
 // Extra run to check for immediate-command-list in Level Zero
 // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
 //
+// Post-commit test failed
+// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479
+// Temporarily disable USM based tests while investigating the bug.
+// UNSUPPORTED: gpu-intel-dg2
 
 #define GRAPH_E2E_EXPLICIT
 
diff --git a/sycl/test-e2e/Graph/Explicit/sub_graph_nested.cpp b/sycl/test-e2e/Graph/Explicit/sub_graph_nested.cpp
index 05ecfbd982a01..7fd72c79722d4 100644
--- a/sycl/test-e2e/Graph/Explicit/sub_graph_nested.cpp
+++ b/sycl/test-e2e/Graph/Explicit/sub_graph_nested.cpp
@@ -5,6 +5,10 @@
 // Extra run to check for immediate-command-list in Level Zero
 // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
 //
+// Post-commit test failed
+// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479
+// Temporarily disable USM based tests while investigating the bug.
+// UNSUPPORTED: gpu-intel-dg2
 
 #define GRAPH_E2E_EXPLICIT
 
diff --git a/sycl/test-e2e/Graph/Explicit/sub_graph_two_parent_graphs.cpp b/sycl/test-e2e/Graph/Explicit/sub_graph_two_parent_graphs.cpp
index 8fa8e0c334de2..d43e7e221e68c 100644
--- a/sycl/test-e2e/Graph/Explicit/sub_graph_two_parent_graphs.cpp
+++ b/sycl/test-e2e/Graph/Explicit/sub_graph_two_parent_graphs.cpp
@@ -5,6 +5,10 @@
 // Extra run to check for immediate-command-list in Level Zero
 // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
 //
+// Post-commit test failed
+// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479
+// Temporarily disable USM based tests while investigating the bug.
+// UNSUPPORTED: gpu-intel-dg2
 
 #define GRAPH_E2E_EXPLICIT
 
diff --git a/sycl/test-e2e/Graph/Explicit/usm_copy.cpp b/sycl/test-e2e/Graph/Explicit/usm_copy.cpp
index 323d3e35935ac..6b60911d4627e 100644
--- a/sycl/test-e2e/Graph/Explicit/usm_copy.cpp
+++ b/sycl/test-e2e/Graph/Explicit/usm_copy.cpp
@@ -7,7 +7,10 @@
 //
 //
 // USM copy command not supported for OpenCL
-// UNSUPPORTED: opencl
+// Post-commit test failed
+// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479
+// Temporarily disable USM based tests while investigating the bug.
+// UNSUPPORTED: opencl, gpu-intel-dg2
 
 #define GRAPH_E2E_EXPLICIT
 
diff --git a/sycl/test-e2e/Graph/Explicit/usm_fill.cpp b/sycl/test-e2e/Graph/Explicit/usm_fill.cpp
index 86076768f5d32..966809cf5a8d3 100644
--- a/sycl/test-e2e/Graph/Explicit/usm_fill.cpp
+++ b/sycl/test-e2e/Graph/Explicit/usm_fill.cpp
@@ -5,6 +5,10 @@
 // Extra run to check for immediate-command-list in Level Zero
 // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
 //
+// Post-commit test failed
+// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479
+// Temporarily disable USM based tests while investigating the bug.
+// UNSUPPORTED: gpu-intel-dg2
 
 #define GRAPH_E2E_EXPLICIT
 
diff --git a/sycl/test-e2e/Graph/Explicit/usm_fill_shared.cpp b/sycl/test-e2e/Graph/Explicit/usm_fill_shared.cpp
index e0100d42d7267..53feb52cad798 100644
--- a/sycl/test-e2e/Graph/Explicit/usm_fill_shared.cpp
+++ b/sycl/test-e2e/Graph/Explicit/usm_fill_shared.cpp
@@ -5,6 +5,10 @@
 // Extra run to check for immediate-command-list in Level Zero
 // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
 //
+// Post-commit test failed
+// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479
+// Temporarily disable USM based tests while investigating the bug.
+// UNSUPPORTED: gpu-intel-dg2
 
 #define GRAPH_E2E_EXPLICIT
 
diff --git a/sycl/test-e2e/Graph/Explicit/usm_memset.cpp b/sycl/test-e2e/Graph/Explicit/usm_memset.cpp
index e9bb9ebab0fa9..50821fd8d008b 100644
--- a/sycl/test-e2e/Graph/Explicit/usm_memset.cpp
+++ b/sycl/test-e2e/Graph/Explicit/usm_memset.cpp
@@ -6,7 +6,10 @@
 // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
 //
 // USM memset command not supported for OpenCL
-// UNSUPPORTED: opencl
+// Post-commit test failed
+// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479
+// Temporarily disable USM based tests while investigating the bug.
+// UNSUPPORTED: opencl, gpu-intel-dg2
 
 #define GRAPH_E2E_EXPLICIT
 
diff --git a/sycl/test-e2e/Graph/RecordReplay/add_nodes_after_finalize.cpp b/sycl/test-e2e/Graph/RecordReplay/add_nodes_after_finalize.cpp
index ba81666277e44..c0729ebb3a015 100644
--- a/sycl/test-e2e/Graph/RecordReplay/add_nodes_after_finalize.cpp
+++ b/sycl/test-e2e/Graph/RecordReplay/add_nodes_after_finalize.cpp
@@ -7,7 +7,10 @@
 //
 //
 // Temporarily disabled until failure is addressed.
-// UNSUPPORTED: windows
+// Post-commit test failed
+// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479
+// Temporarily disable USM based tests while investigating the bug.
+// UNSUPPORTED: windows, gpu-intel-dg2
 
 #define GRAPH_E2E_RECORD_REPLAY
 
diff --git a/sycl/test-e2e/Graph/RecordReplay/after_use.cpp b/sycl/test-e2e/Graph/RecordReplay/after_use.cpp
index 767ca8b425ab7..4a87a09e0fbce 100644
--- a/sycl/test-e2e/Graph/RecordReplay/after_use.cpp
+++ b/sycl/test-e2e/Graph/RecordReplay/after_use.cpp
@@ -5,6 +5,10 @@
 // Extra run to check for immediate-command-list in Level Zero
 // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
 //
+// Post-commit test failed
+// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479
+// Temporarily disable the tests while investigating the bug.
+// UNSUPPORTED: gpu-intel-dg2
 
 // This test attempts recording a set of kernels after they have already been
 // executed once before.
diff --git a/sycl/test-e2e/Graph/RecordReplay/barrier_with_work.cpp b/sycl/test-e2e/Graph/RecordReplay/barrier_with_work.cpp
index c12038ba46185..951a016994261 100644
--- a/sycl/test-e2e/Graph/RecordReplay/barrier_with_work.cpp
+++ b/sycl/test-e2e/Graph/RecordReplay/barrier_with_work.cpp
@@ -5,6 +5,10 @@
 // Extra run to check for immediate-command-list in Level Zero
 // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
 //
+// Post-commit test failed
+// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479
+// Temporarily disable the tests while investigating the bug.
+// UNSUPPORTED: gpu-intel-dg2
 
 #include "../graph_common.hpp"
 
diff --git a/sycl/test-e2e/Graph/RecordReplay/basic_usm.cpp b/sycl/test-e2e/Graph/RecordReplay/basic_usm.cpp
index ffddf70716250..bcdfb702b78c7 100644
--- a/sycl/test-e2e/Graph/RecordReplay/basic_usm.cpp
+++ b/sycl/test-e2e/Graph/RecordReplay/basic_usm.cpp
@@ -5,6 +5,10 @@
 // Extra run to check for immediate-command-list in Level Zero
 // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
 //
+// Post-commit test failed
+// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479
+// Temporarily disable the tests while investigating the bug.
+// UNSUPPORTED: gpu-intel-dg2
 
 #define GRAPH_E2E_RECORD_REPLAY
 
diff --git a/sycl/test-e2e/Graph/RecordReplay/basic_usm_host.cpp b/sycl/test-e2e/Graph/RecordReplay/basic_usm_host.cpp
index a90931934f0ce..b71fd0a690220 100644
--- a/sycl/test-e2e/Graph/RecordReplay/basic_usm_host.cpp
+++ b/sycl/test-e2e/Graph/RecordReplay/basic_usm_host.cpp
@@ -5,6 +5,10 @@
 // Extra run to check for immediate-command-list in Level Zero
 // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
 //
+// Post-commit test failed
+// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479
+// Temporarily disable the tests while investigating the bug.
+// UNSUPPORTED: gpu-intel-dg2
 
 #define GRAPH_E2E_RECORD_REPLAY
 
diff --git a/sycl/test-e2e/Graph/RecordReplay/basic_usm_mixed.cpp b/sycl/test-e2e/Graph/RecordReplay/basic_usm_mixed.cpp
index 410850f5ff7a8..74bc18a7b3496 100644
--- a/sycl/test-e2e/Graph/RecordReplay/basic_usm_mixed.cpp
+++ b/sycl/test-e2e/Graph/RecordReplay/basic_usm_mixed.cpp
@@ -5,6 +5,10 @@
 // Extra run to check for immediate-command-list in Level Zero
 // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
 //
+// Post-commit test failed
+// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479
+// Temporarily disable the tests while investigating the bug.
+// UNSUPPORTED: gpu-intel-dg2
 
 #define GRAPH_E2E_RECORD_REPLAY
 
diff --git a/sycl/test-e2e/Graph/RecordReplay/basic_usm_shared.cpp b/sycl/test-e2e/Graph/RecordReplay/basic_usm_shared.cpp
index 2f19c2706380b..51e4597354473 100644
--- a/sycl/test-e2e/Graph/RecordReplay/basic_usm_shared.cpp
+++ b/sycl/test-e2e/Graph/RecordReplay/basic_usm_shared.cpp
@@ -5,6 +5,10 @@
 // Extra run to check for immediate-command-list in Level Zero
 // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
 //
+// Post-commit test failed
+// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479
+// Temporarily disable the tests while investigating the bug.
+// UNSUPPORTED: gpu-intel-dg2
 
 #define GRAPH_E2E_RECORD_REPLAY
 
diff --git a/sycl/test-e2e/Graph/RecordReplay/empty_node.cpp b/sycl/test-e2e/Graph/RecordReplay/empty_node.cpp
index 70128db712f78..2155b226e2fc3 100644
--- a/sycl/test-e2e/Graph/RecordReplay/empty_node.cpp
+++ b/sycl/test-e2e/Graph/RecordReplay/empty_node.cpp
@@ -5,6 +5,10 @@
 // Extra run to check for immediate-command-list in Level Zero
 // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
 //
+// Post-commit test failed
+// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479
+// Temporarily disable USM based tests while investigating the bug.
+// UNSUPPORTED: gpu-intel-dg2
 
 #define GRAPH_E2E_RECORD_REPLAY
 
diff --git a/sycl/test-e2e/Graph/RecordReplay/host_task.cpp b/sycl/test-e2e/Graph/RecordReplay/host_task.cpp
index 582a3fb3f14ff..4f4d50fe3b3f3 100644
--- a/sycl/test-e2e/Graph/RecordReplay/host_task.cpp
+++ b/sycl/test-e2e/Graph/RecordReplay/host_task.cpp
@@ -5,6 +5,10 @@
 // Extra run to check for immediate-command-list in Level Zero
 // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
 //
+// Post-commit test failed
+// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479
+// Temporarily disable USM based tests while investigating the bug.
+// UNSUPPORTED: gpu-intel-dg2
 
 #define GRAPH_E2E_RECORD_REPLAY
 
diff --git a/sycl/test-e2e/Graph/RecordReplay/host_task2.cpp b/sycl/test-e2e/Graph/RecordReplay/host_task2.cpp
index f947a13ffd63d..cd867eb4caadd 100644
--- a/sycl/test-e2e/Graph/RecordReplay/host_task2.cpp
+++ b/sycl/test-e2e/Graph/RecordReplay/host_task2.cpp
@@ -5,6 +5,10 @@
 // Extra run to check for immediate-command-list in Level Zero
 // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
 //
+// Post-commit test failed
+// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479
+// Temporarily disable USM based tests while investigating the bug.
+// UNSUPPORTED: gpu-intel-dg2
 
 #define GRAPH_E2E_RECORD_REPLAY
 
diff --git a/sycl/test-e2e/Graph/RecordReplay/host_task_in_order.cpp b/sycl/test-e2e/Graph/RecordReplay/host_task_in_order.cpp
index e40fe3a0e963c..c3037c78d8eaf 100644
--- a/sycl/test-e2e/Graph/RecordReplay/host_task_in_order.cpp
+++ b/sycl/test-e2e/Graph/RecordReplay/host_task_in_order.cpp
@@ -5,6 +5,10 @@
 // Extra run to check for immediate-command-list in Level Zero
 // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
 //
+// Post-commit test failed
+// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479
+// Temporarily disable the tests while investigating the bug.
+// UNSUPPORTED: gpu-intel-dg2
 
 // This test uses a host_task when adding a command_graph node to an
 // in-order queue.
diff --git a/sycl/test-e2e/Graph/RecordReplay/host_task_last.cpp b/sycl/test-e2e/Graph/RecordReplay/host_task_last.cpp
index c8dfa02c63ea7..64d625197f089 100644
--- a/sycl/test-e2e/Graph/RecordReplay/host_task_last.cpp
+++ b/sycl/test-e2e/Graph/RecordReplay/host_task_last.cpp
@@ -5,6 +5,10 @@
 // Extra run to check for immediate-command-list in Level Zero
 // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
 //
+// Post-commit test failed
+// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479
+// Temporarily disable USM based tests while investigating the bug.
+// UNSUPPORTED: gpu-intel-dg2
 
 #define GRAPH_E2E_RECORD_REPLAY
 
diff --git a/sycl/test-e2e/Graph/RecordReplay/host_task_multiple_deps.cpp b/sycl/test-e2e/Graph/RecordReplay/host_task_multiple_deps.cpp
index 7e312df58d092..a8ff9b53637cd 100644
--- a/sycl/test-e2e/Graph/RecordReplay/host_task_multiple_deps.cpp
+++ b/sycl/test-e2e/Graph/RecordReplay/host_task_multiple_deps.cpp
@@ -5,6 +5,10 @@
 // Extra run to check for immediate-command-list in Level Zero
 // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
 //
+// Post-commit test failed
+// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479
+// Temporarily disable USM based tests while investigating the bug.
+// UNSUPPORTED: gpu-intel-dg2
 
 #define GRAPH_E2E_RECORD_REPLAY
 
diff --git a/sycl/test-e2e/Graph/RecordReplay/host_task_single.cpp b/sycl/test-e2e/Graph/RecordReplay/host_task_single.cpp
index 6d86cef96c862..4e2730592ef95 100644
--- a/sycl/test-e2e/Graph/RecordReplay/host_task_single.cpp
+++ b/sycl/test-e2e/Graph/RecordReplay/host_task_single.cpp
@@ -5,6 +5,10 @@
 // Extra run to check for immediate-command-list in Level Zero
 // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
 //
+// Post-commit test failed
+// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479
+// Temporarily disable USM based tests while investigating the bug.
+// UNSUPPORTED: gpu-intel-dg2
 
 #define GRAPH_E2E_RECORD_REPLAY
 
diff --git a/sycl/test-e2e/Graph/RecordReplay/host_task_successive.cpp b/sycl/test-e2e/Graph/RecordReplay/host_task_successive.cpp
index 51da588ab9444..e5570cb61a2d3 100644
--- a/sycl/test-e2e/Graph/RecordReplay/host_task_successive.cpp
+++ b/sycl/test-e2e/Graph/RecordReplay/host_task_successive.cpp
@@ -5,6 +5,10 @@
 // Extra run to check for immediate-command-list in Level Zero
 // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
 //
+// Post-commit test failed
+// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479
+// Temporarily disable USM based tests while investigating the bug.
+// UNSUPPORTED: gpu-intel-dg2
 
 #define GRAPH_E2E_RECORD_REPLAY
 
diff --git a/sycl/test-e2e/Graph/RecordReplay/memadvise.cpp b/sycl/test-e2e/Graph/RecordReplay/memadvise.cpp
index a5b87f8143832..1f7989f9a0361 100644
--- a/sycl/test-e2e/Graph/RecordReplay/memadvise.cpp
+++ b/sycl/test-e2e/Graph/RecordReplay/memadvise.cpp
@@ -2,7 +2,10 @@
 // RUN: %if linux && (level_zero || cuda) %{ env SYCL_PI_TRACE=2 %{run} %t.out 2>&1 FileCheck %s %} %else %{ %{run} %t.out %}
 
 // Mem advise command not supported for OpenCL
-// UNSUPPORTED: opencl
+// Post-commit test failed
+// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479
+// Temporarily disable USM based tests while investigating the bug.
+// UNSUPPORTED: opencl, gpu-intel-dg2
 
 // Since Mem advise is only a memory hint that doesn't
 // impact results but only performances, we verify
diff --git a/sycl/test-e2e/Graph/RecordReplay/multiple_exec_graphs.cpp b/sycl/test-e2e/Graph/RecordReplay/multiple_exec_graphs.cpp
index 67b5335de4383..0709d25e225bd 100644
--- a/sycl/test-e2e/Graph/RecordReplay/multiple_exec_graphs.cpp
+++ b/sycl/test-e2e/Graph/RecordReplay/multiple_exec_graphs.cpp
@@ -5,6 +5,10 @@
 // Extra run to check for immediate-command-list in Level Zero
 // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
 //
+// Post-commit test failed
+// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479
+// Temporarily disable USM based tests while investigating the bug.
+// UNSUPPORTED: gpu-intel-dg2
 
 #define GRAPH_E2E_RECORD_REPLAY
 
diff --git a/sycl/test-e2e/Graph/RecordReplay/prefetch.cpp b/sycl/test-e2e/Graph/RecordReplay/prefetch.cpp
index dca297f7772b3..a5a18c08c6b66 100644
--- a/sycl/test-e2e/Graph/RecordReplay/prefetch.cpp
+++ b/sycl/test-e2e/Graph/RecordReplay/prefetch.cpp
@@ -2,7 +2,10 @@
 // RUN: %if linux && (level_zero || cuda) %{ env SYCL_PI_TRACE=2 %{run} %t.out 2>&1 FileCheck %s %} %else %{ %{run} %t.out %}
 
 // prefetch command not supported for OpenCL
-// UNSUPPORTED: opencl
+// Post-commit test failed
+// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479
+// Temporarily disable USM based tests while investigating the bug.
+// UNSUPPORTED: opencl, gpu-intel-dg2
 
 // Since Prefetch is only a memory hint that doesn't
 // impact results but only performances, we verify
diff --git a/sycl/test-e2e/Graph/RecordReplay/queue_constructor_usm.cpp b/sycl/test-e2e/Graph/RecordReplay/queue_constructor_usm.cpp
index aa81923251cb6..a88e462461a04 100644
--- a/sycl/test-e2e/Graph/RecordReplay/queue_constructor_usm.cpp
+++ b/sycl/test-e2e/Graph/RecordReplay/queue_constructor_usm.cpp
@@ -4,6 +4,11 @@
 // RUN: %if level_zero %{env UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s %}
 //
 // CHECK-NOT: LEAK
+//
+// Post-commit test failed
+// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479
+// Temporarily disable the tests while investigating the bug.
+// UNSUPPORTED: gpu-intel-dg2
 
 #define GRAPH_E2E_RECORD_REPLAY
 
diff --git a/sycl/test-e2e/Graph/RecordReplay/queue_shortcuts.cpp b/sycl/test-e2e/Graph/RecordReplay/queue_shortcuts.cpp
index 99a28bd745853..d5e9520417a71 100644
--- a/sycl/test-e2e/Graph/RecordReplay/queue_shortcuts.cpp
+++ b/sycl/test-e2e/Graph/RecordReplay/queue_shortcuts.cpp
@@ -5,6 +5,10 @@
 // Extra run to check for immediate-command-list in Level Zero
 // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
 //
+// Post-commit test failed
+// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479
+// Temporarily disable USM based tests while investigating the bug.
+// UNSUPPORTED: gpu-intel-dg2
 
 #define GRAPH_E2E_RECORD_REPLAY
 
diff --git a/sycl/test-e2e/Graph/RecordReplay/repeated_exec.cpp b/sycl/test-e2e/Graph/RecordReplay/repeated_exec.cpp
index badf7293f49cb..9e2416541e091 100644
--- a/sycl/test-e2e/Graph/RecordReplay/repeated_exec.cpp
+++ b/sycl/test-e2e/Graph/RecordReplay/repeated_exec.cpp
@@ -5,6 +5,10 @@
 // Extra run to check for immediate-command-list in Level Zero
 // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
 //
+// Post-commit test failed
+// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479
+// Temporarily disable USM based tests while investigating the bug.
+// UNSUPPORTED: gpu-intel-dg2
 
 #define GRAPH_E2E_RECORD_REPLAY
 
diff --git a/sycl/test-e2e/Graph/RecordReplay/sub_graph.cpp b/sycl/test-e2e/Graph/RecordReplay/sub_graph.cpp
index 2096a7c7a21f7..37e29e16edf40 100644
--- a/sycl/test-e2e/Graph/RecordReplay/sub_graph.cpp
+++ b/sycl/test-e2e/Graph/RecordReplay/sub_graph.cpp
@@ -5,6 +5,10 @@
 // Extra run to check for immediate-command-list in Level Zero
 // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
 //
+// Post-commit test failed
+// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479
+// Temporarily disable the tests while investigating the bug.
+// UNSUPPORTED: gpu-intel-dg2
 
 #define GRAPH_E2E_RECORD_REPLAY
 
diff --git a/sycl/test-e2e/Graph/RecordReplay/sub_graph_execute_without_parent.cpp b/sycl/test-e2e/Graph/RecordReplay/sub_graph_execute_without_parent.cpp
index f1aeef51f1c31..b7442d24c6f1d 100644
--- a/sycl/test-e2e/Graph/RecordReplay/sub_graph_execute_without_parent.cpp
+++ b/sycl/test-e2e/Graph/RecordReplay/sub_graph_execute_without_parent.cpp
@@ -5,6 +5,10 @@
 // Extra run to check for immediate-command-list in Level Zero
 // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
 //
+// Post-commit test failed
+// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479
+// Temporarily disable USM based tests while investigating the bug.
+// UNSUPPORTED: gpu-intel-dg2
 
 #define GRAPH_E2E_RECORD_REPLAY
 
diff --git a/sycl/test-e2e/Graph/RecordReplay/sub_graph_multiple_submission.cpp b/sycl/test-e2e/Graph/RecordReplay/sub_graph_multiple_submission.cpp
index d26f99a34c290..57b4c1cd3abdd 100644
--- a/sycl/test-e2e/Graph/RecordReplay/sub_graph_multiple_submission.cpp
+++ b/sycl/test-e2e/Graph/RecordReplay/sub_graph_multiple_submission.cpp
@@ -5,6 +5,10 @@
 // Extra run to check for immediate-command-list in Level Zero
 // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
 //
+// Post-commit test failed
+// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479
+// Temporarily disable USM based tests while investigating the bug.
+// UNSUPPORTED: gpu-intel-dg2
 
 #define GRAPH_E2E_RECORD_REPLAY
 
diff --git a/sycl/test-e2e/Graph/RecordReplay/sub_graph_nested.cpp b/sycl/test-e2e/Graph/RecordReplay/sub_graph_nested.cpp
index f2b7ca35a77d7..e78794733dc42 100644
--- a/sycl/test-e2e/Graph/RecordReplay/sub_graph_nested.cpp
+++ b/sycl/test-e2e/Graph/RecordReplay/sub_graph_nested.cpp
@@ -5,6 +5,10 @@
 // Extra run to check for immediate-command-list in Level Zero
 // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
 //
+// Post-commit test failed
+// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479
+// Temporarily disable USM based tests while investigating the bug.
+// UNSUPPORTED: gpu-intel-dg2
 
 #define GRAPH_E2E_RECORD_REPLAY
 
diff --git a/sycl/test-e2e/Graph/RecordReplay/sub_graph_two_parent_graphs.cpp b/sycl/test-e2e/Graph/RecordReplay/sub_graph_two_parent_graphs.cpp
index a1d0373290cc6..8c3c5f0980c55 100644
--- a/sycl/test-e2e/Graph/RecordReplay/sub_graph_two_parent_graphs.cpp
+++ b/sycl/test-e2e/Graph/RecordReplay/sub_graph_two_parent_graphs.cpp
@@ -5,6 +5,10 @@
 // Extra run to check for immediate-command-list in Level Zero
 // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
 //
+// Post-commit test failed
+// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479
+// Temporarily disable USM based tests while investigating the bug.
+// UNSUPPORTED: gpu-intel-dg2
 
 #define GRAPH_E2E_RECORD_REPLAY
 
diff --git a/sycl/test-e2e/Graph/RecordReplay/usm_copy.cpp b/sycl/test-e2e/Graph/RecordReplay/usm_copy.cpp
index 09486c4ebc8da..1fdf104310623 100644
--- a/sycl/test-e2e/Graph/RecordReplay/usm_copy.cpp
+++ b/sycl/test-e2e/Graph/RecordReplay/usm_copy.cpp
@@ -5,9 +5,11 @@
 // Extra run to check for immediate-command-list in Level Zero
 // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
 //
-//
 // USM copy command not supported for OpenCL
-// UNSUPPORTED: opencl
+// Post-commit test failed
+// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479
+// Temporarily disable the tests while investigating the bug.
+// UNSUPPORTED: opencl, gpu-intel-dg2
 
 #define GRAPH_E2E_RECORD_REPLAY
 
diff --git a/sycl/test-e2e/Graph/RecordReplay/usm_fill.cpp b/sycl/test-e2e/Graph/RecordReplay/usm_fill.cpp
index 8e89ecc693ed7..66a9589d9e242 100644
--- a/sycl/test-e2e/Graph/RecordReplay/usm_fill.cpp
+++ b/sycl/test-e2e/Graph/RecordReplay/usm_fill.cpp
@@ -5,6 +5,10 @@
 // Extra run to check for immediate-command-list in Level Zero
 // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
 //
+// Post-commit test failed
+// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479
+// Temporarily disable USM based tests while investigating the bug.
+// UNSUPPORTED: gpu-intel-dg2
 
 #define GRAPH_E2E_RECORD_REPLAY
 
diff --git a/sycl/test-e2e/Graph/RecordReplay/usm_fill_shared.cpp b/sycl/test-e2e/Graph/RecordReplay/usm_fill_shared.cpp
index 401cdc7aa430f..ed959ca5a400b 100644
--- a/sycl/test-e2e/Graph/RecordReplay/usm_fill_shared.cpp
+++ b/sycl/test-e2e/Graph/RecordReplay/usm_fill_shared.cpp
@@ -5,6 +5,10 @@
 // Extra run to check for immediate-command-list in Level Zero
 // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
 //
+// Post-commit test failed
+// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479
+// Temporarily disable USM based tests while investigating the bug.
+// UNSUPPORTED: gpu-intel-dg2
 
 #define GRAPH_E2E_RECORD_REPLAY
 
diff --git a/sycl/test-e2e/Graph/RecordReplay/usm_memset.cpp b/sycl/test-e2e/Graph/RecordReplay/usm_memset.cpp
index 137b3df19cf02..86c533110f26b 100644
--- a/sycl/test-e2e/Graph/RecordReplay/usm_memset.cpp
+++ b/sycl/test-e2e/Graph/RecordReplay/usm_memset.cpp
@@ -6,7 +6,10 @@
 // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
 
 // USM memset command not supported for OpenCL
-// UNSUPPORTED: opencl
+// Post-commit test failed
+// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479
+// Temporarily disable USM based tests while investigating the bug.
+// UNSUPPORTED: opencl, gpu-intel-dg2
 
 #define GRAPH_E2E_RECORD_REPLAY
 
diff --git a/sycl/test-e2e/Graph/empty_graph.cpp b/sycl/test-e2e/Graph/empty_graph.cpp
index ff4984603d908..c574d65e9357f 100644
--- a/sycl/test-e2e/Graph/empty_graph.cpp
+++ b/sycl/test-e2e/Graph/empty_graph.cpp
@@ -5,6 +5,10 @@
 // Extra run to check for immediate-command-list in Level Zero
 // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
 //
+// Post-commit test failed
+// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479
+// Temporarily disable USM based tests while investigating the bug.
+// UNSUPPORTED: gpu-intel-dg2
 
 // Tests the ability to finalize and submit a command graph which doesn't
 // contain any nodes.

From 4c8569c1ca3153b56ed03af71184292c8afdfa6d Mon Sep 17 00:00:00 2001
From: HPS-1 <65581261+HPS-1@users.noreply.github.com>
Date: Thu, 8 Feb 2024 07:29:38 -0500
Subject: [PATCH 11/26] [SYCL] Fix handling of unsupported alignment by
 aligned_alloc_xxx<T> (#12569)

Implementation is supposed to return `nullptr` when requested alignment is not supported. Since our runtime performs all allocations through Unified Runtime that means for us that any alignment which is not a power of two is unsupported.

Note that the resulting alignment may not be the same as the requested one (per the SYCL 2020 specification) and therefore we can't just rely on the return value of the underlying non-templated version of the alloc function and have to perform the check explicitly.

There is an issue with some backends not properly returning an error on an unsupported alignment; it was reported in #11642.

---------

Signed-off-by: Hu, Peisen <peisen.hu@intel.com>
---
 sycl/include/sycl/usm.hpp   |  31 +++++++++++
 sycl/test-e2e/USM/align.cpp | 102 ++++++++++++++++++++++++++++++++++++
 2 files changed, 133 insertions(+)
 create mode 100755 sycl/test-e2e/USM/align.cpp

diff --git a/sycl/include/sycl/usm.hpp b/sycl/include/sycl/usm.hpp
index f9ab634edc81a..ff941e865dff8 100644
--- a/sycl/include/sycl/usm.hpp
+++ b/sycl/include/sycl/usm.hpp
@@ -154,6 +154,13 @@ __SYCL_EXPORT void *aligned_alloc(
     const property_list &propList,
     const detail::code_location &CodeLoc = detail::code_location::current());
 
+///
+// Helper: returns non-zero iff Alignment is neither zero nor a power of 2
+///
+inline size_t is_not_power_of_two(size_t Alignment) {
+  return (Alignment & (Alignment - 1));
+}
+
 ///
 // Template forms
 ///
@@ -179,6 +186,9 @@ T *aligned_alloc_device(
     size_t Alignment, size_t Count, const device &Dev, const context &Ctxt,
     const property_list &PropList = {},
     const detail::code_location &CodeLoc = detail::code_location::current()) {
+  if (is_not_power_of_two(Alignment)) {
+    return nullptr;
+  }
   return static_cast<T *>(aligned_alloc_device(max(Alignment, alignof(T)),
                                                Count * sizeof(T), Dev, Ctxt,
                                                PropList, CodeLoc));
@@ -189,6 +199,9 @@ T *aligned_alloc_device(
     size_t Alignment, size_t Count, const queue &Q,
     const property_list &PropList = {},
     const detail::code_location &CodeLoc = detail::code_location::current()) {
+  if (is_not_power_of_two(Alignment)) {
+    return nullptr;
+  }
   return aligned_alloc_device<T>(Alignment, Count, Q.get_device(),
                                  Q.get_context(), PropList, CodeLoc);
 }
@@ -230,6 +243,9 @@ T *aligned_alloc_host(
     size_t Alignment, size_t Count, const context &Ctxt,
     const property_list &PropList = {},
     const detail::code_location &CodeLoc = detail::code_location::current()) {
+  if (is_not_power_of_two(Alignment)) {
+    return nullptr;
+  }
   return static_cast<T *>(aligned_alloc_host(std ::max(Alignment, alignof(T)),
                                              Count * sizeof(T), Ctxt, PropList,
                                              CodeLoc));
@@ -240,6 +256,9 @@ T *aligned_alloc_host(
     size_t Alignment, size_t Count, const queue &Q,
     const property_list &PropList = {},
     const detail::code_location &CodeLoc = detail::code_location::current()) {
+  if (is_not_power_of_two(Alignment)) {
+    return nullptr;
+  }
   return aligned_alloc_host<T>(Alignment, Count, Q.get_context(), PropList,
                                CodeLoc);
 }
@@ -249,6 +268,9 @@ T *aligned_alloc_shared(
     size_t Alignment, size_t Count, const device &Dev, const context &Ctxt,
     const property_list &PropList = {},
     const detail::code_location &CodeLoc = detail::code_location::current()) {
+  if (is_not_power_of_two(Alignment)) {
+    return nullptr;
+  }
   return static_cast<T *>(aligned_alloc_shared(max(Alignment, alignof(T)),
                                                Count * sizeof(T), Dev, Ctxt,
                                                PropList, CodeLoc));
@@ -259,6 +281,9 @@ T *aligned_alloc_shared(
     size_t Alignment, size_t Count, const queue &Q,
     const property_list &PropList = {},
     const detail::code_location &CodeLoc = detail::code_location::current()) {
+  if (is_not_power_of_two(Alignment)) {
+    return nullptr;
+  }
   return aligned_alloc_shared<T>(Alignment, Count, Q.get_device(),
                                  Q.get_context(), PropList, CodeLoc);
 }
@@ -286,6 +311,9 @@ T *aligned_alloc(
     size_t Alignment, size_t Count, const device &Dev, const context &Ctxt,
     usm::alloc Kind, const property_list &PropList = {},
     const detail::code_location &CodeLoc = detail::code_location::current()) {
+  if (is_not_power_of_two(Alignment)) {
+    return nullptr;
+  }
   return static_cast<T *>(aligned_alloc(max(Alignment, alignof(T)),
                                         Count * sizeof(T), Dev, Ctxt, Kind,
                                         PropList, CodeLoc));
@@ -296,6 +324,9 @@ T *aligned_alloc(
     size_t Alignment, size_t Count, const queue &Q, usm::alloc Kind,
     const property_list &PropList = {},
     const detail::code_location &CodeLoc = detail::code_location::current()) {
+  if (is_not_power_of_two(Alignment)) {
+    return nullptr;
+  }
   return aligned_alloc<T>(Alignment, Count, Q.get_device(), Q.get_context(),
                           Kind, PropList, CodeLoc);
 }
diff --git a/sycl/test-e2e/USM/align.cpp b/sycl/test-e2e/USM/align.cpp
new file mode 100755
index 0000000000000..9659ec78b9a88
--- /dev/null
+++ b/sycl/test-e2e/USM/align.cpp
@@ -0,0 +1,102 @@
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+
+// UNSUPPORTED: gpu
+
+// E2E tests for annotated USM allocation functions with alignment arguments
+// that are not powers of 2. Note this test does not work on gpu because some
+// tests expect non-templated aligned_alloc_xxx functions to return nullptr,
+// e.g. when the alignment argument is not a power of 2, while they fail to do
+// so when run on gpu. This may be because the gpu runtime has different
+// behavior. Therefore, GPU is unsupported until issue #12638 gets resolved.
+
+#include <sycl/sycl.hpp>
+
+#include <complex>
+#include <numeric>
+
+using namespace sycl;
+using namespace ext::oneapi::experimental;
+using namespace ext::intel::experimental;
+using alloc = usm::alloc;
+
+template <typename T> void testAlign(sycl::queue &q, unsigned align) {
+  const sycl::context &Ctx = q.get_context();
+  auto dev = q.get_device();
+
+  constexpr int N = 10;
+  assert(align > 0 && (align & (align - 1)) == 0); // non-zero power of two
+
+  auto ADevice = [&](size_t align, auto... args) {
+    return aligned_alloc_device(align, N, args...);
+  };
+  auto AHost = [&](size_t align, auto... args) {
+    return aligned_alloc_host(align, N, args...);
+  };
+  auto AShared = [&](size_t align, auto... args) {
+    return aligned_alloc_shared(align, N, args...);
+  };
+  auto AAnnotated = [&](size_t align, auto... args) {
+    return aligned_alloc(align, N, args...);
+  };
+
+  auto ATDevice = [&](size_t align, auto... args) {
+    return aligned_alloc_device<T>(align, N, args...);
+  };
+  auto ATHost = [&](size_t align, auto... args) {
+    return aligned_alloc_host<T>(align, N, args...);
+  };
+  auto ATShared = [&](size_t align, auto... args) {
+    return aligned_alloc_shared<T>(align, N, args...);
+  };
+  auto ATAnnotated = [&](size_t align, auto... args) {
+    return aligned_alloc<T>(align, N, args...);
+  };
+
+  // Test cases that are expected to return null
+  auto check_null = [&q](auto AllocFn, int Line, int Case) {
+    decltype(AllocFn()) Ptr = AllocFn();
+    if (Ptr != nullptr) {
+      free(Ptr, q);
+      std::cout << "Failed at line " << Line << ", case " << Case << std::endl;
+      assert(false && "The return is not null!");
+    }
+  };
+
+  auto CheckNullAll = [&](auto Funcs, int Line = __builtin_LINE()) {
+    std::apply(
+        [&](auto... Fs) {
+          int Case = 0;
+          (void)std::initializer_list<int>{
+              (check_null(Fs, Line, Case++), 0)...};
+        },
+        Funcs);
+  };
+
+  CheckNullAll(std::tuple{
+      // Case: aligned_alloc_xxx with no alignment property, and the alignment
+      // argument is not a power of 2, the result is nullptr
+      [&]() { return ADevice(3, q); }, [&]() { return ADevice(5, dev, Ctx); },
+      [&]() { return AHost(7, q); }, [&]() { return AHost(9, Ctx); },
+      [&]() { return AShared(114, q); },
+      [&]() { return AShared(1023, dev, Ctx); },
+      [&]() { return AAnnotated(15, q, alloc::device); },
+      [&]() { return AAnnotated(17, dev, Ctx, alloc::host); }
+      // Case: aligned_alloc_xxx<T> with no alignment property, and the
+      // alignment argument is not a power of 2, the result is nullptr
+      ,
+      [&]() { return ATDevice(3, q); }, [&]() { return ATDevice(5, dev, Ctx); },
+      [&]() { return ATHost(7, q); }, [&]() { return ATHost(9, Ctx); },
+      [&]() { return ATShared(1919, q); },
+      [&]() { return ATShared(11, dev, Ctx); },
+      [&]() { return ATAnnotated(15, q, alloc::device); },
+      [&]() { return ATAnnotated(17, dev, Ctx, alloc::host); }});
+}
+
+int main() {
+  sycl::queue q; // default-constructed queue, shared by every run below
+  testAlign<char>(q, 4);
+  testAlign<int>(q, 128);
+  testAlign<std::complex<double>>(q, 4);
+  return 0;
+}

From 170be1f07ee0fc92add0bbee3df35f30cdecc067 Mon Sep 17 00:00:00 2001
From: Jefferson Le Quellec <jefferson.lequellec@codeplay.com>
Date: Thu, 8 Feb 2024 15:39:10 +0100
Subject: [PATCH 12/26] [SYCL][DOC] Update the 'constraints' for the complex
 specification (#12649)

This PR applies the modifications requested in the
https://github.com/intel/llvm/pull/11792 PR made by @Pennycook

---------

Co-authored-by: John Pennycook <john.pennycook@intel.com>
---
 .../experimental/sycl_ext_oneapi_complex.asciidoc    | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/sycl/doc/extensions/experimental/sycl_ext_oneapi_complex.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_oneapi_complex.asciidoc
index 41700a898fecb..e4eb444f1ed60 100644
--- a/sycl/doc/extensions/experimental/sycl_ext_oneapi_complex.asciidoc
+++ b/sycl/doc/extensions/experimental/sycl_ext_oneapi_complex.asciidoc
@@ -10,6 +10,7 @@
 :encoding: utf-8
 :lang: en
 :dpcpp: pass:[DPC++]
+:endnote: &#8212;{nbsp}end{nbsp}note
 
 // Set the default source code type in this document to C++,
 // for syntax highlighting purposes.  This is needed because
@@ -101,8 +102,8 @@ available only in host code as noted below.
 The complex type is trivially copyable and type trait `is_device_copyable`
 should resolve to `std::true_type`.
 
-The `T` template parameter must be one of the types float, double, or
-sycl::half.
+_Constraints_: The `T` template parameter must be one of the types `float`,
+`double`, or `sycl::half`.
 
 Note: When performing operations between complex numbers and decimals,
 the decimal is treated as a complex number with a real component equal to
@@ -335,6 +336,13 @@ Additionally, this extension introduces support for the `real` and `imag` free
 functions, which returns the real and imaginary component of a number,
 respectively.
 
+[_Note:_ The overloads of the functions `real(T)` and `imag(T)` match the
+behavior in ISO C++ where `T` would be treated as a complex number with a zero
+imaginary component. This is subject to the constraint that `T` must be one of
+the types `float`, `double`, `sycl::half`, or evaluate to `true` for
+`std::is_integral`.
+_{endnote}_]
+
 These functions are available in both host and device code, and each math
 function should follow the C++ standard for handling `NaN` and `Inf` values.
 

From 698417a1571bb8f0d9c37487fa75f103ff8b1c37 Mon Sep 17 00:00:00 2001
From: Udit Agarwal <16324601+uditagarwal97@users.noreply.github.com>
Date: Thu, 8 Feb 2024 07:37:50 -0800
Subject: [PATCH 13/26] [SYCL] Drop 'acc' in favor of 'fpga' from
 ONEAPI_DEVICE_SELECTOR (#12614)

As per the ONEAPI_DEVICE_SELECTOR
[documentation](https://github.com/intel/llvm/blob/sycl/sycl/doc/EnvironmentVariables.md#oneapi_device_selector),
the device type can only be cpu, gpu, or fpga (or any combination of
those). Currently, 'acc' is also accepted by ONEAPI_DEVICE_SELECTOR as a
valid device type, which is incorrect.

This PR drops support for 'acc' in ONEAPI_DEVICE_SELECTOR in
favor of 'fpga'. We have already updated existing test cases (#12551) and
testing scripts (#12596) to use 'fpga' with ONEAPI_DEVICE_SELECTOR.
---
 sycl/source/detail/allowlist.cpp            |  6 ++++--
 sycl/source/detail/config.cpp               | 16 +++++++-------
 sycl/source/detail/config.hpp               |  6 ++++--
 sycl/source/detail/device_filter.cpp        | 23 ++++++++++++++-------
 sycl/unittests/allowlist/ParseAllowList.cpp |  3 ++-
 5 files changed, 35 insertions(+), 19 deletions(-)

diff --git a/sycl/source/detail/allowlist.cpp b/sycl/source/detail/allowlist.cpp
index 881a014c4831f..83309ec9f2d92 100644
--- a/sycl/source/detail/allowlist.cpp
+++ b/sycl/source/detail/allowlist.cpp
@@ -166,7 +166,8 @@ AllowListParsedT parseAllowList(const std::string &AllowListRaw) {
         // valid. E.g., for BackendName key, the allowed values are only ones
         // described in SyclBeMap
         ValidateEnumValues(BackendNameKeyName, getSyclBeMap());
-        ValidateEnumValues(DeviceTypeKeyName, getSyclDeviceTypeMap());
+        ValidateEnumValues(DeviceTypeKeyName,
+                           getSyclDeviceTypeMap(true /*Enable 'acc'*/));
 
         if (Key == DeviceVendorIdKeyName) {
           // DeviceVendorId should have hex format
@@ -380,7 +381,8 @@ void applyAllowList(std::vector<sycl::detail::pi::PiDevice> &PiDevices,
         Device, PI_DEVICE_INFO_TYPE, sizeof(sycl::detail::pi::PiDeviceType),
         &PiDevType, nullptr);
     sycl::info::device_type DeviceType = pi::cast<info::device_type>(PiDevType);
-    for (const auto &SyclDeviceType : getSyclDeviceTypeMap()) {
+    for (const auto &SyclDeviceType :
+         getSyclDeviceTypeMap(true /*Enable 'acc'*/)) {
       if (SyclDeviceType.second == DeviceType) {
         const auto &DeviceTypeValue = SyclDeviceType.first;
         DeviceDesc[DeviceTypeKeyName] = DeviceTypeValue;
diff --git a/sycl/source/detail/config.cpp b/sycl/source/detail/config.cpp
index f7760aa227168..7ae96d42e220d 100644
--- a/sycl/source/detail/config.cpp
+++ b/sycl/source/detail/config.cpp
@@ -165,14 +165,30 @@ void dumpConfig() {
 // TODO: host device type will be removed once sycl_ext_oneapi_filter_selector
 // is removed.
 const std::array<std::pair<std::string, info::device_type>, 6> &
-getSyclDeviceTypeMap() {
-  static const std::array<std::pair<std::string, info::device_type>, 6>
-      SyclDeviceTypeMap = {{{"host", info::device_type::host},
-                            {"cpu", info::device_type::cpu},
-                            {"gpu", info::device_type::gpu},
-                            {"acc", info::device_type::accelerator},
-                            {"fpga", info::device_type::accelerator},
-                            {"*", info::device_type::all}}};
-  return SyclDeviceTypeMap;
+getSyclDeviceTypeMap(bool supportAcc) {
+  // 'supportAcc' selects whether the "acc" spelling of the accelerator type
+  // is part of the returned table.
+  //
+  // A function-local static is initialized exactly once, on the first call, so
+  // each variant needs its own object: a single array whose initializer reads
+  // 'supportAcc' would stay frozen at whatever the first caller passed.
+  using DeviceTypeMapT =
+      std::array<std::pair<std::string, info::device_type>, 6>;
+  static const DeviceTypeMapT MapWithAcc = {
+      {{"host", info::device_type::host},
+       {"cpu", info::device_type::cpu},
+       {"gpu", info::device_type::gpu},
+       {"acc", info::device_type::accelerator},
+       {"fpga", info::device_type::accelerator},
+       {"*", info::device_type::all}}};
+  /* Duplicate entries are fine as this map is one-directional.*/
+  static const DeviceTypeMapT MapWithoutAcc = {
+      {{"host", info::device_type::host},
+       {"cpu", info::device_type::cpu},
+       {"gpu", info::device_type::gpu},
+       {"fpga", info::device_type::accelerator},
+       {"fpga", info::device_type::accelerator},
+       {"*", info::device_type::all}}};
+  return supportAcc ? MapWithAcc : MapWithoutAcc;
 }
 
diff --git a/sycl/source/detail/config.hpp b/sycl/source/detail/config.hpp
index 8f048e0f95f60..1079f32caa388 100644
--- a/sycl/source/detail/config.hpp
+++ b/sycl/source/detail/config.hpp
@@ -232,8 +232,10 @@ template <> class SYCLConfig<SYCL_PARALLEL_FOR_RANGE_ROUNDING_PARAMS> {
 };
 
 // Array is used by SYCL_DEVICE_ALLOWLIST and ONEAPI_DEVICE_SELECTOR.
+// The 'supportAcc' parameter is used by SYCL_DEVICE_ALLOWLIST which,
+// unlike ONEAPI_DEVICE_SELECTOR, also accepts 'acc' as a valid device type.
 const std::array<std::pair<std::string, info::device_type>, 6> &
-getSyclDeviceTypeMap();
+getSyclDeviceTypeMap(bool supportAcc = false);
 
 // Array is used by SYCL_DEVICE_FILTER and SYCL_DEVICE_ALLOWLIST and
 // ONEAPI_DEVICE_SELECTOR
@@ -514,7 +516,7 @@ template <> class SYCLConfig<SYCL_REDUCTION_PREFERRED_WORKGROUP_SIZE> {
       return Result;
 
     std::string ValueStr{ValueRaw};
-    auto DeviceTypeMap = getSyclDeviceTypeMap();
+    auto DeviceTypeMap = getSyclDeviceTypeMap(true /*Enable 'acc'*/);
 
     // Iterate over all configurations.
     size_t Start = 0, End = 0;
diff --git a/sycl/source/detail/device_filter.cpp b/sycl/source/detail/device_filter.cpp
index 311ebeaa174b8..eb3d0f83ed26e 100644
--- a/sycl/source/detail/device_filter.cpp
+++ b/sycl/source/detail/device_filter.cpp
@@ -93,9 +93,13 @@ static void Parse_ODS_Device(ods_target &Target,
   std::string_view TopDeviceStr = DeviceSubTuple[0];
 
   // Handle explicit device type (e.g. 'gpu').
-  auto DeviceTypeMap =
-      getSyclDeviceTypeMap(); // <-- std::array<std::pair<std::string,
-                              // info::device::type>>
+  auto DeviceTypeMap = getSyclDeviceTypeMap(
+#ifndef __INTEL_PREVIEW_BREAKING_CHANGES
+      true /*Enable 'acc'*/
+#endif
+  ); // <-- std::array<std::pair<std::string,
+     // info::device::type>>
+
   auto It =
       std::find_if(std::begin(DeviceTypeMap), std::end(DeviceTypeMap),
                    [&](auto DtPair) { return TopDeviceStr == DtPair.first; });
@@ -262,7 +266,11 @@ Parse_ONEAPI_DEVICE_SELECTOR(const std::string &envString) {
 std::ostream &operator<<(std::ostream &Out, const ods_target &Target) {
   Out << Target.Backend;
   if (Target.DeviceType) {
-    auto DeviceTypeMap = getSyclDeviceTypeMap();
+    auto DeviceTypeMap = getSyclDeviceTypeMap(
+#ifndef __INTEL_PREVIEW_BREAKING_CHANGES
+        true /*Enable 'acc'*/
+#endif
+    );
     auto Match = std::find_if(
         DeviceTypeMap.begin(), DeviceTypeMap.end(),
         [&](auto Pair) { return (Pair.second == Target.DeviceType); });
@@ -335,11 +343,12 @@ device_filter::device_filter(const std::string &FilterString) {
   if (TripleValueID >= Tokens.size()) {
     DeviceType = info::device_type::all;
   } else {
-    auto Iter = std::find_if(std::begin(getSyclDeviceTypeMap()),
-                             std::end(getSyclDeviceTypeMap()), FindElement);
+    auto Iter = std::find_if(
+        std::begin(getSyclDeviceTypeMap(true /*Enable 'acc'*/)),
+        std::end(getSyclDeviceTypeMap(true /*Enable 'acc'*/)), FindElement);
     // If no match is found, set device_type 'all',
     // which actually means 'any device_type' will be a match.
-    if (Iter == getSyclDeviceTypeMap().end())
+    if (Iter == getSyclDeviceTypeMap(true /*Enable 'acc'*/).end())
       DeviceType = info::device_type::all;
     else {
       DeviceType = Iter->second;
diff --git a/sycl/unittests/allowlist/ParseAllowList.cpp b/sycl/unittests/allowlist/ParseAllowList.cpp
index faecab30aaeaf..40fbceb76616e 100644
--- a/sycl/unittests/allowlist/ParseAllowList.cpp
+++ b/sycl/unittests/allowlist/ParseAllowList.cpp
@@ -178,7 +178,8 @@ TEST(ParseAllowListTests, CheckAllValidBackendNameValuesAreProcessed) {
 
 TEST(ParseAllowListTests, CheckAllValidDeviceTypeValuesAreProcessed) {
   std::string AllowList;
-  for (const auto &SyclDeviceType : sycl::detail::getSyclDeviceTypeMap()) {
+  for (const auto &SyclDeviceType :
+       sycl::detail::getSyclDeviceTypeMap(true /*Enable 'acc'*/)) {
     if (!AllowList.empty())
       AllowList += "|";
     AllowList += "DeviceType:" + SyclDeviceType.first;

From e2b48a4f2d40cd69d5324cbefdb3cb36c457380d Mon Sep 17 00:00:00 2001
From: Jefferson Le Quellec <jefferson.lequellec@codeplay.com>
Date: Thu, 8 Feb 2024 16:56:38 +0100
Subject: [PATCH 14/26] [SYCL][DOC] Trim trailing whitespace and fix typo
 (#12646)

This simple PR fixes the `libspriv` typo and removes the unnecessary
trailing whitespace.
---
 sycl/doc/design/CompilerAndRuntimeDesign.md | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/sycl/doc/design/CompilerAndRuntimeDesign.md b/sycl/doc/design/CompilerAndRuntimeDesign.md
index c9d9e2b7eba90..ccc0f642f992f 100644
--- a/sycl/doc/design/CompilerAndRuntimeDesign.md
+++ b/sycl/doc/design/CompilerAndRuntimeDesign.md
@@ -567,8 +567,8 @@ Unlike other AOT targets, the bitcode module linked from intermediate compiled
 objects never goes through SPIR-V. Instead it is passed directly in bitcode form
 down to the NVPTX Back End. All produced bitcode depends on two libraries,
 `libdevice.bc` (provided by the CUDA SDK) and `libspirv-nvptx64--nvidiacl.bc` variants
-(built by the libclc project). `libspirv-nvptx64--nvidiacl.bc` is not used directly. 
-Instead it is used to generate remangled variants 
+(built by the libclc project). `libspirv-nvptx64--nvidiacl.bc` is not used directly.
+Instead it is used to generate remangled variants
 `remangled-l64-signed_char.libspirv-nvptx64--nvidiacl.bc` and
 `remangled-l32-signed_char.libspirv-nvptx64--nvidiacl.bc` to handle primitive type
 differences between Linux and Windows.
@@ -600,14 +600,14 @@ path in SYCL kernels.
 
 ##### NVPTX Builtins
 
-Builtins are implemented in OpenCL C within libclc. OpenCL C treats `long` 
+Builtins are implemented in OpenCL C within libclc. OpenCL C treats `long`
 types as 64 bit and has no `long long` types while Windows DPC++ treats `long`
-types like 32-bit integers and `long long` types like 64-bit integers. 
-Differences between the primitive types can cause applications to use 
-incompatible libclc built-ins. A remangler creates multiple libspriv files 
-with different remangled function names to support both Windows and Linux. 
-When building a SYCL application targeting the CUDA backend the driver 
-will link the device code with 
+types like 32-bit integers and `long long` types like 64-bit integers.
+Differences between the primitive types can cause applications to use
+incompatible libclc built-ins. A remangler creates multiple libspirv files
+with different remangled function names to support both Windows and Linux.
+When building a SYCL application targeting the CUDA backend the driver
+will link the device code with
 `remangled-l32-signed_char.libspirv-nvptx64--nvidiacl.bc` if the host target is
 Windows or it will link the device code with
 `remangled-l64-signed_char.libspirv-nvptx64--nvidiacl.bc` if the host target is
@@ -916,7 +916,7 @@ template <typename T, address_space AS> class multi_ptr {
   // DecoratedType<T, global_space>::type == "__attribute__((opencl_global)) T"
   // See sycl/include/sycl/access/access.hpp for more details
   using pointer_t = typename DecoratedType<T, AS>::type *;
- 
+
   pointer_t m_Pointer;
   public:
   pointer_t get() { return m_Pointer; }

From 48f60d5c603de9dc74dd30936976c0d94c0afdaf Mon Sep 17 00:00:00 2001
From: Sergey Semenov <sergey.semenov@intel.com>
Date: Thu, 8 Feb 2024 17:06:11 +0100
Subject: [PATCH 15/26] [SYCL][NFC] Reduce ambiguity between discarding SYCL
 and PI events (#12547)

The old variable name for support of discarding PI events
(MHasDiscardEventsSupport) made it easy to confuse with discarding SYCL
events (MDiscardEvents), especially since we could have a situation
where MHasDiscardEventsSupport is false, but MDiscardEvents is true.

Rename the variable for discarding PI events and reword its comment to
make the distinction clearer.
---
 sycl/source/detail/queue_impl.cpp         |  2 +-
 sycl/source/detail/queue_impl.hpp         | 27 ++++++++++++-----------
 sycl/source/detail/scheduler/commands.cpp | 10 ++++-----
 sycl/source/handler.cpp                   |  2 +-
 4 files changed, 21 insertions(+), 20 deletions(-)

diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp
index 545cff3b9c5c3..ab066ed1906a4 100644
--- a/sycl/source/detail/queue_impl.cpp
+++ b/sycl/source/detail/queue_impl.cpp
@@ -354,7 +354,7 @@ event queue_impl::submitMemOpHelper(const std::shared_ptr<queue_impl> &Self,
         getExtendDependencyList(DepEvents, MutableDepEvents, Lock);
 
     if (areEventsSafeForSchedulerBypass(ExpandedDepEvents, MContext)) {
-      if (MHasDiscardEventsSupport) {
+      if (MSupportsDiscardingPiEvents) {
         MemOpFunc(MemOpArgs..., getPIEvents(ExpandedDepEvents),
                   /*PiEvent*/ nullptr, /*EventImplPtr*/ nullptr);
         return createDiscardedEvent();
diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp
index ddd6a71d7db80..0fe4242cc9472 100644
--- a/sycl/source/detail/queue_impl.hpp
+++ b/sycl/source/detail/queue_impl.hpp
@@ -115,8 +115,8 @@ class queue_impl {
         MDiscardEvents(
             has_property<ext::oneapi::property::queue::discard_events>()),
         MIsProfilingEnabled(has_property<property::queue::enable_profiling>()),
-        MHasDiscardEventsSupport(MDiscardEvents &&
-                                 (MHostQueue ? true : MIsInorder)),
+        MSupportsDiscardingPiEvents(MDiscardEvents &&
+                                    (MHostQueue ? true : MIsInorder)),
         MQueueID{
             MNextAvailableQueueID.fetch_add(1, std::memory_order_relaxed)} {
     if (has_property<property::queue::enable_profiling>()) {
@@ -292,8 +292,8 @@ class queue_impl {
         MDiscardEvents(
             has_property<ext::oneapi::property::queue::discard_events>()),
         MIsProfilingEnabled(has_property<property::queue::enable_profiling>()),
-        MHasDiscardEventsSupport(MDiscardEvents &&
-                                 (MHostQueue ? true : MIsInorder)),
+        MSupportsDiscardingPiEvents(MDiscardEvents &&
+                                    (MHostQueue ? true : MIsInorder)),
         MQueueID{
             MNextAvailableQueueID.fetch_add(1, std::memory_order_relaxed)} {
     queue_impl_interop(PiQueue);
@@ -317,8 +317,8 @@ class queue_impl {
         MDiscardEvents(
             has_property<ext::oneapi::property::queue::discard_events>()),
         MIsProfilingEnabled(has_property<property::queue::enable_profiling>()),
-        MHasDiscardEventsSupport(MDiscardEvents &&
-                                 (MHostQueue ? true : MIsInorder)) {
+        MSupportsDiscardingPiEvents(MDiscardEvents &&
+                                    (MHostQueue ? true : MIsInorder)) {
     queue_impl_interop(PiQueue);
   }
 
@@ -374,7 +374,9 @@ class queue_impl {
   bool is_host() const { return MHostQueue; }
 
   /// \return true if this queue has discard_events support.
-  bool has_discard_events_support() const { return MHasDiscardEventsSupport; }
+  bool supportsDiscardingPiEvents() const {
+    return MSupportsDiscardingPiEvents;
+  }
 
   bool isInOrder() const { return MIsInorder; }
 
@@ -970,12 +972,11 @@ class queue_impl {
   const bool MIsProfilingEnabled;
 
 protected:
-  // This flag says if we can discard events based on a queue "setup" which will
-  // be common for all operations submitted to the queue. This is a must
-  // condition for discarding, but even if it's true, in some cases, we won't be
-  // able to discard events, because the final decision is made right before the
-  // operation itself.
-  const bool MHasDiscardEventsSupport;
+  // Indicates whether the queue supports discarding PI events for tasks
+  // submitted to it. This condition is necessary but not sufficient, PI events
+  // should be discarded only if they also don't represent potential implicit
+  // dependencies for future tasks in other queues.
+  const bool MSupportsDiscardingPiEvents;
 
   // Command graph which is associated with this queue for the purposes of
   // recording commands to it.
diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp
index 23b6eec33886e..955adae8423dc 100644
--- a/sycl/source/detail/scheduler/commands.cpp
+++ b/sycl/source/detail/scheduler/commands.cpp
@@ -2704,7 +2704,7 @@ pi_int32 ExecCGCommand::enqueueImpCommandBuffer() {
   }
 
   sycl::detail::pi::PiEvent *Event =
-      (MQueue->has_discard_events_support() &&
+      (MQueue->supportsDiscardingPiEvents() &&
        MCommandGroup->getRequirements().size() == 0)
           ? nullptr
           : &MEvent->getHandleRef();
@@ -2851,11 +2851,11 @@ pi_int32 ExecCGCommand::enqueueImpQueue() {
   auto RawEvents = getPiEvents(EventImpls);
   flushCrossQueueDeps(EventImpls, getWorkerQueue());
 
-  bool DiscardEvent = (MQueue->has_discard_events_support() &&
-                       MCommandGroup->getRequirements().size() == 0);
+  bool DiscardPiEvent = (MQueue->supportsDiscardingPiEvents() &&
+                         MCommandGroup->getRequirements().size() == 0);
   sycl::detail::pi::PiEvent *Event =
-      DiscardEvent ? nullptr : &MEvent->getHandleRef();
-  detail::EventImplPtr EventImpl = DiscardEvent ? nullptr : MEvent;
+      DiscardPiEvent ? nullptr : &MEvent->getHandleRef();
+  detail::EventImplPtr EventImpl = DiscardPiEvent ? nullptr : MEvent;
 
   switch (MCommandGroup->getType()) {
 
diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp
index 02ffef951d1b5..70681a7504358 100644
--- a/sycl/source/handler.cpp
+++ b/sycl/source/handler.cpp
@@ -292,7 +292,7 @@ event handler::finalize() {
       };
 
       bool DiscardEvent = false;
-      if (MQueue->has_discard_events_support()) {
+      if (MQueue->supportsDiscardingPiEvents()) {
         // Kernel only uses assert if it's non interop one
         bool KernelUsesAssert =
             !(MKernel && MKernel->isInterop()) &&

From 2dc080eba3b071853b7587d53654f8b544c5f2fa Mon Sep 17 00:00:00 2001
From: Alexey Sachkov <alexey.sachkov@intel.com>
Date: Thu, 8 Feb 2024 17:18:08 +0100
Subject: [PATCH 16/26] [SYCL][NFC] Refresh inline asm E2E tests (#12642)

This PR should improve test stability and transparency (when it comes
to unsupported tests). List of changes:

- switched from `[[intel::reqd_sub_group_size]]` to
`[[sycl::reqd_sub_group_size]]` attribute;
- removed uses of deprecated `device::has_extension` API: there is no
need for that check, because `[[sycl::reqd_sub_group_size]]` is core
SYCL 2020 functionality;
- simplified helpers to remove unnecessary extra arguments which could
be inferred otherwise;
- updated `REQUIRES` directives to include required sub-group size
requirements in there. Adjusted calls to `launchInlineASMTest` helper to
consistently pass required sub-group size in there as well;
---
 .../InlineAsm/Negative/asm_bad_opcode.cpp     |  6 +--
 .../Negative/asm_bad_operand_syntax.cpp       |  6 +--
 .../Negative/asm_duplicate_label.cpp          |  6 +--
 .../Negative/asm_illegal_exec_size.cpp        |  6 +--
 .../InlineAsm/Negative/asm_missing_label.cpp  |  6 +--
 .../InlineAsm/Negative/asm_missing_region.cpp |  6 +--
 .../InlineAsm/Negative/asm_simple.cpp         |  6 +--
 .../InlineAsm/Negative/asm_undefined_decl.cpp |  6 +--
 .../InlineAsm/Negative/asm_undefined_pred.cpp |  6 +--
 .../InlineAsm/Negative/asm_wrong_declare.cpp  |  6 +--
 sycl/test-e2e/InlineAsm/asm_16_empty.cpp      |  6 +--
 .../test-e2e/InlineAsm/asm_16_matrix_mult.cpp |  6 +--
 .../InlineAsm/asm_16_no_input_int.cpp         |  6 +--
 sycl/test-e2e/InlineAsm/asm_16_no_opts.cpp    |  6 +--
 sycl/test-e2e/InlineAsm/asm_8_empty.cpp       |  6 +--
 .../test-e2e/InlineAsm/asm_8_no_input_int.cpp |  6 +--
 .../InlineAsm/asm_arbitrary_ops_order.cpp     |  6 +--
 sycl/test-e2e/InlineAsm/asm_decl_in_scope.cpp |  6 +--
 sycl/test-e2e/InlineAsm/asm_float_add.cpp     |  6 +--
 sycl/test-e2e/InlineAsm/asm_float_imm_arg.cpp |  6 +--
 sycl/test-e2e/InlineAsm/asm_float_neg.cpp     |  6 +--
 sycl/test-e2e/InlineAsm/asm_if.cpp            |  6 +--
 sycl/test-e2e/InlineAsm/asm_imm_arg.cpp       |  6 +--
 sycl/test-e2e/InlineAsm/asm_loop.cpp          |  6 +--
 sycl/test-e2e/InlineAsm/asm_mul.cpp           |  6 +--
 .../InlineAsm/asm_multiple_instructions.cpp   |  6 +--
 sycl/test-e2e/InlineAsm/asm_no_operands.cpp   | 18 ++++----
 sycl/test-e2e/InlineAsm/asm_no_output.cpp     |  6 +--
 sycl/test-e2e/InlineAsm/asm_plus_mod.cpp      |  6 +--
 sycl/test-e2e/InlineAsm/asm_switch.cpp        |  6 +--
 sycl/test-e2e/InlineAsm/include/asmhelper.h   | 41 ++++++++-----------
 sycl/test-e2e/InlineAsm/letter_example.cpp    |  9 ++--
 sycl/test-e2e/InlineAsm/malloc_shared_32.cpp  |  9 ++--
 .../InlineAsm/malloc_shared_in_out_dif.cpp    |  9 ++--
 .../InlineAsm/malloc_shared_no_input.cpp      |  9 ++--
 35 files changed, 125 insertions(+), 144 deletions(-)

diff --git a/sycl/test-e2e/InlineAsm/Negative/asm_bad_opcode.cpp b/sycl/test-e2e/InlineAsm/Negative/asm_bad_opcode.cpp
index 935df8cab7afa..a16b903c09a84 100644
--- a/sycl/test-e2e/InlineAsm/Negative/asm_bad_opcode.cpp
+++ b/sycl/test-e2e/InlineAsm/Negative/asm_bad_opcode.cpp
@@ -1,6 +1,6 @@
 // UNSUPPORTED: cuda || hip
 // UNSUPPORTED: ze_debug
-// REQUIRES: gpu,linux
+// REQUIRES: gpu,linux,sg-16
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
@@ -13,7 +13,7 @@ struct KernelFunctor {
   void operator()(sycl::handler &cgh) {
     cgh.parallel_for<KernelFunctor>(
         sycl::range<1>{16},
-        [=](sycl::id<1> wiID) [[intel::reqd_sub_group_size(16)]] {
+        [=](sycl::id<1> wiID) [[sycl::reqd_sub_group_size(16)]] {
 #if defined(__SYCL_DEVICE_ONLY__)
           asm volatile(".decl tmp1 v_type=G type=d num_elts=16 align=GRF\n"
                        ".decl tmp2 v_type=G type=d num_elts=16 align=GRF\n"
@@ -25,6 +25,6 @@ struct KernelFunctor {
 
 int main() {
   KernelFunctor f;
-  launchInlineASMTest(f, /* sg size */ true, /* exception expected */ true);
+  launchInlineASMTest(f, {16}, /* exception expected */ true);
   return 0;
 }
diff --git a/sycl/test-e2e/InlineAsm/Negative/asm_bad_operand_syntax.cpp b/sycl/test-e2e/InlineAsm/Negative/asm_bad_operand_syntax.cpp
index cf21d2cd84714..ea7434dd31498 100644
--- a/sycl/test-e2e/InlineAsm/Negative/asm_bad_operand_syntax.cpp
+++ b/sycl/test-e2e/InlineAsm/Negative/asm_bad_operand_syntax.cpp
@@ -1,6 +1,6 @@
 // UNSUPPORTED: cuda || hip
 // UNSUPPORTED: ze_debug
-// REQUIRES: gpu,linux
+// REQUIRES: gpu,linux,sg-16
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
@@ -13,7 +13,7 @@ struct KernelFunctor {
   void operator()(sycl::handler &cgh) {
     cgh.parallel_for<KernelFunctor>(
         sycl::range<1>{16},
-        [=](sycl::id<1> wiID) [[intel::reqd_sub_group_size(16)]] {
+        [=](sycl::id<1> wiID) [[sycl::reqd_sub_group_size(16)]] {
 #if defined(__SYCL_DEVICE_ONLY__)
           asm volatile(".decl tmp1 v_type=G type=d num_elts=16 align=GRF\n"
                        ".decl tmp2 v_type=G type=d num_elts=16 align=GRF\n"
@@ -25,6 +25,6 @@ struct KernelFunctor {
 
 int main() {
   KernelFunctor f;
-  launchInlineASMTest(f, /* sg size */ true, /* exception expected */ true);
+  launchInlineASMTest(f, {16}, /* exception expected */ true);
   return 0;
 }
diff --git a/sycl/test-e2e/InlineAsm/Negative/asm_duplicate_label.cpp b/sycl/test-e2e/InlineAsm/Negative/asm_duplicate_label.cpp
index 50f63b777d647..58978e203d8a8 100644
--- a/sycl/test-e2e/InlineAsm/Negative/asm_duplicate_label.cpp
+++ b/sycl/test-e2e/InlineAsm/Negative/asm_duplicate_label.cpp
@@ -1,6 +1,6 @@
 // UNSUPPORTED: cuda || hip
 // UNSUPPORTED: ze_debug
-// REQUIRES: gpu,linux
+// REQUIRES: gpu,linux,sg-16
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
@@ -13,7 +13,7 @@ struct KernelFunctor {
   void operator()(sycl::handler &cgh) {
     cgh.parallel_for<KernelFunctor>(
         sycl::range<1>{16},
-        [=](sycl::id<1> wiID) [[intel::reqd_sub_group_size(16)]] {
+        [=](sycl::id<1> wiID) [[sycl::reqd_sub_group_size(16)]] {
 #if defined(__SYCL_DEVICE_ONLY__)
           asm volatile(".decl tmp1 v_type=G type=d num_elts=16 align=GRF\n"
                        ".decl tmp2 v_type=G type=d num_elts=16 align=GRF\n"
@@ -25,6 +25,6 @@ struct KernelFunctor {
 
 int main() {
   KernelFunctor f;
-  launchInlineASMTest(f, /* sg size */ true, /* exception expected */ true);
+  launchInlineASMTest(f, {16}, /* exception expected */ true);
   return 0;
 }
diff --git a/sycl/test-e2e/InlineAsm/Negative/asm_illegal_exec_size.cpp b/sycl/test-e2e/InlineAsm/Negative/asm_illegal_exec_size.cpp
index 053c48e53e353..98d9b1ff1520d 100644
--- a/sycl/test-e2e/InlineAsm/Negative/asm_illegal_exec_size.cpp
+++ b/sycl/test-e2e/InlineAsm/Negative/asm_illegal_exec_size.cpp
@@ -1,6 +1,6 @@
 // UNSUPPORTED: cuda || hip
 // UNSUPPORTED: ze_debug
-// REQUIRES: gpu,linux
+// REQUIRES: gpu,linux,sg-16
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
@@ -13,7 +13,7 @@ struct KernelFunctor {
   void operator()(sycl::handler &cgh) {
     cgh.parallel_for<KernelFunctor>(
         sycl::range<1>{16},
-        [=](sycl::id<1> wiID) [[intel::reqd_sub_group_size(16)]] {
+        [=](sycl::id<1> wiID) [[sycl::reqd_sub_group_size(16)]] {
 #if defined(__SYCL_DEVICE_ONLY__)
           asm volatile(".decl tmp1 v_type=G type=d num_elts=16 align=GRF\n"
                        ".decl tmp2 v_type=G type=d num_elts=16 align=GRF\n"
@@ -25,6 +25,6 @@ struct KernelFunctor {
 
 int main() {
   KernelFunctor f;
-  launchInlineASMTest(f, /* sg size */ true, /* exception expected */ true);
+  launchInlineASMTest(f, {16}, /* exception expected */ true);
   return 0;
 }
diff --git a/sycl/test-e2e/InlineAsm/Negative/asm_missing_label.cpp b/sycl/test-e2e/InlineAsm/Negative/asm_missing_label.cpp
index 4754d32a57b77..47f0fd98311ae 100644
--- a/sycl/test-e2e/InlineAsm/Negative/asm_missing_label.cpp
+++ b/sycl/test-e2e/InlineAsm/Negative/asm_missing_label.cpp
@@ -1,6 +1,6 @@
 // UNSUPPORTED: cuda || hip
 // UNSUPPORTED: ze_debug
-// REQUIRES: gpu,linux
+// REQUIRES: gpu,linux,sg-16
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
@@ -13,7 +13,7 @@ struct KernelFunctor {
   void operator()(sycl::handler &cgh) {
     cgh.parallel_for<KernelFunctor>(
         sycl::range<1>{16},
-        [=](sycl::id<1> wiID) [[intel::reqd_sub_group_size(16)]] {
+        [=](sycl::id<1> wiID) [[sycl::reqd_sub_group_size(16)]] {
 #if defined(__SYCL_DEVICE_ONLY__)
           asm volatile(".decl tmp1 v_type=G type=d num_elts=16 align=GRF\n"
                        ".decl tmp2 v_type=G type=d num_elts=16 align=GRF\n"
@@ -25,6 +25,6 @@ struct KernelFunctor {
 
 int main() {
   KernelFunctor f;
-  launchInlineASMTest(f, /* sg size */ true, /* exception expected */ true);
+  launchInlineASMTest(f, {16}, /* exception expected */ true);
   return 0;
 }
diff --git a/sycl/test-e2e/InlineAsm/Negative/asm_missing_region.cpp b/sycl/test-e2e/InlineAsm/Negative/asm_missing_region.cpp
index cf7ab9f6d8a6e..395eb4af68c03 100644
--- a/sycl/test-e2e/InlineAsm/Negative/asm_missing_region.cpp
+++ b/sycl/test-e2e/InlineAsm/Negative/asm_missing_region.cpp
@@ -1,6 +1,6 @@
 // UNSUPPORTED: cuda || hip
 // UNSUPPORTED: ze_debug
-// REQUIRES: gpu,linux
+// REQUIRES: gpu,linux,sg-16
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
@@ -13,7 +13,7 @@ struct KernelFunctor {
   void operator()(sycl::handler &cgh) {
     cgh.parallel_for<KernelFunctor>(
         sycl::range<1>{16},
-        [=](sycl::id<1> wiID) [[intel::reqd_sub_group_size(16)]] {
+        [=](sycl::id<1> wiID) [[sycl::reqd_sub_group_size(16)]] {
 #if defined(__SYCL_DEVICE_ONLY__)
           asm volatile(".decl tmp1 v_type=G type=d num_elts=16 align=GRF\n"
                        ".decl tmp2 v_type=G type=d num_elts=16 align=GRF\n"
@@ -25,6 +25,6 @@ struct KernelFunctor {
 
 int main() {
   KernelFunctor f;
-  launchInlineASMTest(f, /* sg size */ true, /* exception expected */ true);
+  launchInlineASMTest(f, {16}, /* exception expected */ true);
   return 0;
 }
diff --git a/sycl/test-e2e/InlineAsm/Negative/asm_simple.cpp b/sycl/test-e2e/InlineAsm/Negative/asm_simple.cpp
index 4a24d6dbd441e..e36a15cf1cbf4 100644
--- a/sycl/test-e2e/InlineAsm/Negative/asm_simple.cpp
+++ b/sycl/test-e2e/InlineAsm/Negative/asm_simple.cpp
@@ -1,6 +1,6 @@
 // UNSUPPORTED: cuda || hip
 // UNSUPPORTED: ze_debug
-// REQUIRES: gpu,linux
+// REQUIRES: gpu,linux,sg-16
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
@@ -13,7 +13,7 @@ struct KernelFunctor {
   void operator()(sycl::handler &cgh) {
     cgh.parallel_for<KernelFunctor>(
         sycl::range<1>{16},
-        [=](sycl::id<1> wiID) [[intel::reqd_sub_group_size(16)]] {
+        [=](sycl::id<1> wiID) [[sycl::reqd_sub_group_size(16)]] {
 #if defined(__SYCL_DEVICE_ONLY__)
           asm volatile(".decl tmp1 v_type=G type=d num_elts=16 align=GRF\n"
                        ".decl tmp2 v_type=G type=d num_elts=16 align=GRF\n"
@@ -25,6 +25,6 @@ struct KernelFunctor {
 
 int main() {
   KernelFunctor f;
-  launchInlineASMTest(f, /* sg size */ true, /* exception expected */ true);
+  launchInlineASMTest(f, {16}, /* exception expected */ true);
   return 0;
 }
diff --git a/sycl/test-e2e/InlineAsm/Negative/asm_undefined_decl.cpp b/sycl/test-e2e/InlineAsm/Negative/asm_undefined_decl.cpp
index ec6f1026d9bea..53b64cf2ba2fd 100644
--- a/sycl/test-e2e/InlineAsm/Negative/asm_undefined_decl.cpp
+++ b/sycl/test-e2e/InlineAsm/Negative/asm_undefined_decl.cpp
@@ -1,6 +1,6 @@
 // UNSUPPORTED: cuda || hip
 // UNSUPPORTED: ze_debug
-// REQUIRES: gpu,linux
+// REQUIRES: gpu,linux,sg-16
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
@@ -13,7 +13,7 @@ struct KernelFunctor {
   void operator()(sycl::handler &cgh) {
     cgh.parallel_for<KernelFunctor>(
         sycl::range<1>{16},
-        [=](sycl::id<1> wiID) [[intel::reqd_sub_group_size(16)]] {
+        [=](sycl::id<1> wiID) [[sycl::reqd_sub_group_size(16)]] {
 #if defined(__SYCL_DEVICE_ONLY__)
           asm volatile(".decl tmp1 v_type=G type=d num_elts=16 align=GRF\n"
                        ".decl tmp2 v_type=G type=d num_elts=16 align=GRF\n"
@@ -25,6 +25,6 @@ struct KernelFunctor {
 
 int main() {
   KernelFunctor f;
-  launchInlineASMTest(f, /* sg size */ true, /* exception expected */ true);
+  launchInlineASMTest(f, {16}, /* exception expected */ true);
   return 0;
 }
diff --git a/sycl/test-e2e/InlineAsm/Negative/asm_undefined_pred.cpp b/sycl/test-e2e/InlineAsm/Negative/asm_undefined_pred.cpp
index b76357962f0fc..0ce42082fe2ef 100644
--- a/sycl/test-e2e/InlineAsm/Negative/asm_undefined_pred.cpp
+++ b/sycl/test-e2e/InlineAsm/Negative/asm_undefined_pred.cpp
@@ -1,6 +1,6 @@
 // UNSUPPORTED: cuda || hip
 // UNSUPPORTED: ze_debug
-// REQUIRES: gpu,linux
+// REQUIRES: gpu,linux,sg-16
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
@@ -13,7 +13,7 @@ struct KernelFunctor {
   void operator()(sycl::handler &cgh) {
     cgh.parallel_for<KernelFunctor>(
         sycl::range<1>{16},
-        [=](sycl::id<1> wiID) [[intel::reqd_sub_group_size(16)]] {
+        [=](sycl::id<1> wiID) [[sycl::reqd_sub_group_size(16)]] {
 #if defined(__SYCL_DEVICE_ONLY__)
           asm volatile(".decl tmp1 v_type=G type=d num_elts=16 align=GRF\n"
                        ".decl tmp2 v_type=G type=d num_elts=16 align=GRF\n"
@@ -25,6 +25,6 @@ struct KernelFunctor {
 
 int main() {
   KernelFunctor f;
-  launchInlineASMTest(f, /* sg size */ true, /* exception expected */ true);
+  launchInlineASMTest(f, {16}, /* exception expected */ true);
   return 0;
 }
diff --git a/sycl/test-e2e/InlineAsm/Negative/asm_wrong_declare.cpp b/sycl/test-e2e/InlineAsm/Negative/asm_wrong_declare.cpp
index 3b2d8aa1353ee..86151cdc7cab7 100644
--- a/sycl/test-e2e/InlineAsm/Negative/asm_wrong_declare.cpp
+++ b/sycl/test-e2e/InlineAsm/Negative/asm_wrong_declare.cpp
@@ -1,6 +1,6 @@
 // UNSUPPORTED: cuda || hip
 // UNSUPPORTED: ze_debug
-// REQUIRES: gpu,linux
+// REQUIRES: gpu,linux,sg-16
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
@@ -13,7 +13,7 @@ struct KernelFunctor {
   void operator()(sycl::handler &cgh) {
     cgh.parallel_for<KernelFunctor>(
         sycl::range<1>{16},
-        [=](sycl::id<1> wiID) [[intel::reqd_sub_group_size(16)]] {
+        [=](sycl::id<1> wiID) [[sycl::reqd_sub_group_size(16)]] {
 #if defined(__SYCL_DEVICE_ONLY__)
           asm volatile(".decl tmp1 v_type=G type=d num_elts=16 align=GRF\n"
                        ".decl tmp2 v_type=G type=d num_elts=16 align=GRF\n"
@@ -25,6 +25,6 @@ struct KernelFunctor {
 
 int main() {
   KernelFunctor f;
-  launchInlineASMTest(f, /* sg size */ true, /* exception expected */ true);
+  launchInlineASMTest(f, {16}, /* exception expected */ true);
   return 0;
 }
diff --git a/sycl/test-e2e/InlineAsm/asm_16_empty.cpp b/sycl/test-e2e/InlineAsm/asm_16_empty.cpp
index 71ca270f231cd..78321716777ac 100644
--- a/sycl/test-e2e/InlineAsm/asm_16_empty.cpp
+++ b/sycl/test-e2e/InlineAsm/asm_16_empty.cpp
@@ -1,5 +1,5 @@
 // UNSUPPORTED: cuda || hip_nvidia
-// REQUIRES: gpu,linux
+// REQUIRES: gpu,linux,sg-16
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
@@ -19,7 +19,7 @@ template <typename T = dataType> struct KernelFunctor : WithOutputBuffer<T> {
             cgh);
     cgh.parallel_for<KernelFunctor<T>>(
         sycl::range<1>{this->getOutputBufferSize()},
-        [=](sycl::id<1> wiID) [[intel::reqd_sub_group_size(16)]] {
+        [=](sycl::id<1> wiID) [[sycl::reqd_sub_group_size(16)]] {
           C[wiID] = 43;
 #if defined(__SYCL_DEVICE_ONLY__)
           asm volatile("");
@@ -30,7 +30,7 @@ template <typename T = dataType> struct KernelFunctor : WithOutputBuffer<T> {
 
 int main() {
   KernelFunctor<> f(DEFAULT_PROBLEM_SIZE);
-  if (!launchInlineASMTest(f))
+  if (!launchInlineASMTest(f, {16}))
     return 0;
 
   if (verify_all_the_same(f.getOutputBufferData(), 43))
diff --git a/sycl/test-e2e/InlineAsm/asm_16_matrix_mult.cpp b/sycl/test-e2e/InlineAsm/asm_16_matrix_mult.cpp
index c28616ee70750..00c3eb3830f38 100644
--- a/sycl/test-e2e/InlineAsm/asm_16_matrix_mult.cpp
+++ b/sycl/test-e2e/InlineAsm/asm_16_matrix_mult.cpp
@@ -1,5 +1,5 @@
 // UNSUPPORTED: cuda, hip
-// REQUIRES: gpu,linux
+// REQUIRES: gpu,linux,sg-16
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
@@ -19,7 +19,7 @@ template <typename T = dataType> struct KernelFunctor : WithOutputBuffer<T> {
             cgh);
     cgh.parallel_for<KernelFunctor<T>>(
         sycl::range<1>{this->getOutputBufferSize()},
-        [=](sycl::id<1> wiID) [[intel::reqd_sub_group_size(16)]] {
+        [=](sycl::id<1> wiID) [[sycl::reqd_sub_group_size(16)]] {
           volatile int output = 0;
 #if defined(__SYCL_DEVICE_ONLY__)
           asm volatile("mov (M1,16) %0(0,0)<1> 0x7:d" : "=rw"(output));
@@ -33,7 +33,7 @@ template <typename T = dataType> struct KernelFunctor : WithOutputBuffer<T> {
 
 int main() {
   KernelFunctor<> f(DEFAULT_PROBLEM_SIZE);
-  if (!launchInlineASMTest(f))
+  if (!launchInlineASMTest(f, {16}))
     return 0;
 
   if (verify_all_the_same(f.getOutputBufferData(), 7))
diff --git a/sycl/test-e2e/InlineAsm/asm_16_no_input_int.cpp b/sycl/test-e2e/InlineAsm/asm_16_no_input_int.cpp
index c28616ee70750..00c3eb3830f38 100644
--- a/sycl/test-e2e/InlineAsm/asm_16_no_input_int.cpp
+++ b/sycl/test-e2e/InlineAsm/asm_16_no_input_int.cpp
@@ -1,5 +1,5 @@
 // UNSUPPORTED: cuda, hip
-// REQUIRES: gpu,linux
+// REQUIRES: gpu,linux,sg-16
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
@@ -19,7 +19,7 @@ template <typename T = dataType> struct KernelFunctor : WithOutputBuffer<T> {
             cgh);
     cgh.parallel_for<KernelFunctor<T>>(
         sycl::range<1>{this->getOutputBufferSize()},
-        [=](sycl::id<1> wiID) [[intel::reqd_sub_group_size(16)]] {
+        [=](sycl::id<1> wiID) [[sycl::reqd_sub_group_size(16)]] {
           volatile int output = 0;
 #if defined(__SYCL_DEVICE_ONLY__)
           asm volatile("mov (M1,16) %0(0,0)<1> 0x7:d" : "=rw"(output));
@@ -33,7 +33,7 @@ template <typename T = dataType> struct KernelFunctor : WithOutputBuffer<T> {
 
 int main() {
   KernelFunctor<> f(DEFAULT_PROBLEM_SIZE);
-  if (!launchInlineASMTest(f))
+  if (!launchInlineASMTest(f, {16}))
     return 0;
 
   if (verify_all_the_same(f.getOutputBufferData(), 7))
diff --git a/sycl/test-e2e/InlineAsm/asm_16_no_opts.cpp b/sycl/test-e2e/InlineAsm/asm_16_no_opts.cpp
index 11d647cdfbe1d..812e30a7c9f56 100644
--- a/sycl/test-e2e/InlineAsm/asm_16_no_opts.cpp
+++ b/sycl/test-e2e/InlineAsm/asm_16_no_opts.cpp
@@ -1,5 +1,5 @@
 // UNSUPPORTED: cuda, hip
-// REQUIRES: gpu,linux
+// REQUIRES: gpu,linux,sg-16
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
@@ -19,7 +19,7 @@ template <typename T = dataType> struct KernelFunctor : WithOutputBuffer<T> {
             cgh);
     cgh.parallel_for<KernelFunctor<T>>(
         sycl::range<1>{this->getOutputBufferSize()},
-        [=](sycl::id<1> wiID) [[intel::reqd_sub_group_size(16)]] {
+        [=](sycl::id<1> wiID) [[sycl::reqd_sub_group_size(16)]] {
           for (int i = 0; i < 10; ++i) {
 #if defined(__SYCL_DEVICE_ONLY__)
             asm("fence_sw");
@@ -35,7 +35,7 @@ template <typename T = dataType> struct KernelFunctor : WithOutputBuffer<T> {
 
 int main() {
   KernelFunctor<> f(DEFAULT_PROBLEM_SIZE);
-  if (!launchInlineASMTest(f))
+  if (!launchInlineASMTest(f, {16}))
     return 0;
 
   if (verify_all_the_same(f.getOutputBufferData(), 45))
diff --git a/sycl/test-e2e/InlineAsm/asm_8_empty.cpp b/sycl/test-e2e/InlineAsm/asm_8_empty.cpp
index d43b9cd83ac81..26d0952225901 100644
--- a/sycl/test-e2e/InlineAsm/asm_8_empty.cpp
+++ b/sycl/test-e2e/InlineAsm/asm_8_empty.cpp
@@ -1,5 +1,5 @@
 // UNSUPPORTED: cuda || hip_nvidia
-// REQUIRES: gpu,linux
+// REQUIRES: gpu,linux,sg-8
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
@@ -19,7 +19,7 @@ template <typename T = dataType> struct KernelFunctor : WithOutputBuffer<T> {
             cgh);
     cgh.parallel_for<KernelFunctor<T>>(
         sycl::range<1>{this->getOutputBufferSize()},
-        [=](sycl::id<1> wiID) [[intel::reqd_sub_group_size(8)]] {
+        [=](sycl::id<1> wiID) [[sycl::reqd_sub_group_size(8)]] {
           C[wiID] = 43;
 #if defined(__SYCL_DEVICE_ONLY__)
           asm volatile("");
@@ -30,7 +30,7 @@ template <typename T = dataType> struct KernelFunctor : WithOutputBuffer<T> {
 
 int main() {
   KernelFunctor<> f(DEFAULT_PROBLEM_SIZE);
-  if (!launchInlineASMTest(f, true, false, {8}))
+  if (!launchInlineASMTest(f, {8}))
     return 0;
 
   if (verify_all_the_same(f.getOutputBufferData(), 43))
diff --git a/sycl/test-e2e/InlineAsm/asm_8_no_input_int.cpp b/sycl/test-e2e/InlineAsm/asm_8_no_input_int.cpp
index ab15d58164e3f..e671acc825e7b 100644
--- a/sycl/test-e2e/InlineAsm/asm_8_no_input_int.cpp
+++ b/sycl/test-e2e/InlineAsm/asm_8_no_input_int.cpp
@@ -1,5 +1,5 @@
 // UNSUPPORTED: cuda, hip
-// REQUIRES: gpu,linux
+// REQUIRES: gpu,linux,sg-8
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
@@ -19,7 +19,7 @@ template <typename T = dataType> struct KernelFunctor : WithOutputBuffer<T> {
             cgh);
     cgh.parallel_for<KernelFunctor<T>>(
         sycl::range<1>{this->getOutputBufferSize()},
-        [=](sycl::id<1> wiID) [[intel::reqd_sub_group_size(8)]] {
+        [=](sycl::id<1> wiID) [[sycl::reqd_sub_group_size(8)]] {
           volatile int output = 0;
 #if defined(__SYCL_DEVICE_ONLY__)
           asm volatile("mov (M1,8) %0(0,0)<1> 0x7:d" : "=rw"(output));
@@ -33,7 +33,7 @@ template <typename T = dataType> struct KernelFunctor : WithOutputBuffer<T> {
 
 int main() {
   KernelFunctor<> f(DEFAULT_PROBLEM_SIZE);
-  if (!launchInlineASMTest(f, true, false, {8}))
+  if (!launchInlineASMTest(f, {8}))
     return 0;
 
   if (verify_all_the_same(f.getOutputBufferData(), 7))
diff --git a/sycl/test-e2e/InlineAsm/asm_arbitrary_ops_order.cpp b/sycl/test-e2e/InlineAsm/asm_arbitrary_ops_order.cpp
index 401dfbcacb63c..739feed41d6c9 100644
--- a/sycl/test-e2e/InlineAsm/asm_arbitrary_ops_order.cpp
+++ b/sycl/test-e2e/InlineAsm/asm_arbitrary_ops_order.cpp
@@ -1,5 +1,5 @@
 // UNSUPPORTED: cuda, hip
-// REQUIRES: gpu,linux
+// REQUIRES: gpu,linux,sg-16
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
@@ -33,7 +33,7 @@ struct KernelFunctor : WithInputBuffers<T, 3>, WithOutputBuffer<T> {
 
     cgh.parallel_for<KernelFunctor<T>>(
         sycl::range<1>{this->getOutputBufferSize()},
-        [=](sycl::id<1> wiID) [[intel::reqd_sub_group_size(16)]] {
+        [=](sycl::id<1> wiID) [[sycl::reqd_sub_group_size(16)]] {
 #if defined(__SYCL_DEVICE_ONLY__)
           asm("mad (M1, 16) %0(0, 0)<1> %1(0, 0)<1;1,0> %2(0, 0)<1;1,0> %3(0, "
               "0)<1;1,0>"
@@ -56,7 +56,7 @@ int main() {
   }
 
   KernelFunctor<> f(inputA, inputB, inputC);
-  if (!launchInlineASMTest(f))
+  if (!launchInlineASMTest(f, {16}))
     return 0;
 
   auto &D = f.getOutputBufferData();
diff --git a/sycl/test-e2e/InlineAsm/asm_decl_in_scope.cpp b/sycl/test-e2e/InlineAsm/asm_decl_in_scope.cpp
index dd7cbc6739816..60d200d1c99df 100644
--- a/sycl/test-e2e/InlineAsm/asm_decl_in_scope.cpp
+++ b/sycl/test-e2e/InlineAsm/asm_decl_in_scope.cpp
@@ -1,5 +1,5 @@
 // UNSUPPORTED: cuda, hip
-// REQUIRES: gpu,linux
+// REQUIRES: gpu,linux,sg-16
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
@@ -29,7 +29,7 @@ struct KernelFunctor : WithInputBuffers<T, 2>, WithOutputBuffer<T> {
 
     cgh.parallel_for<KernelFunctor<T>>(
         sycl::range<1>{this->getOutputBufferSize()},
-        [=](sycl::id<1> wiID) [[intel::reqd_sub_group_size(16)]] {
+        [=](sycl::id<1> wiID) [[sycl::reqd_sub_group_size(16)]] {
     // declaration of temp within and outside the scope
 #if defined(__SYCL_DEVICE_ONLY__)
           asm("{\n"
@@ -59,7 +59,7 @@ int main() {
   }
 
   KernelFunctor<> f(inputA, inputB);
-  if (!launchInlineASMTest(f))
+  if (!launchInlineASMTest(f, {16}))
     return 0;
 
   auto &C = f.getOutputBufferData();
diff --git a/sycl/test-e2e/InlineAsm/asm_float_add.cpp b/sycl/test-e2e/InlineAsm/asm_float_add.cpp
index bd4298b0ddad3..b6374c96d2010 100644
--- a/sycl/test-e2e/InlineAsm/asm_float_add.cpp
+++ b/sycl/test-e2e/InlineAsm/asm_float_add.cpp
@@ -1,5 +1,5 @@
 // UNSUPPORTED: cuda, hip
-// REQUIRES: gpu,linux
+// REQUIRES: gpu,linux,sg-16
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
@@ -30,7 +30,7 @@ struct KernelFunctor : WithInputBuffers<T, 2>, WithOutputBuffer<T> {
 
     cgh.parallel_for<KernelFunctor<T>>(
         sycl::range<1>{this->getOutputBufferSize()},
-        [=](sycl::id<1> wiID) [[intel::reqd_sub_group_size(16)]] {
+        [=](sycl::id<1> wiID) [[sycl::reqd_sub_group_size(16)]] {
 #if defined(__SYCL_DEVICE_ONLY__)
           asm("add (M1, 16) %0(0, 0)<1> %1(0, 0)<1;1,0> %2(0, 0)<1;1,0>"
               : "=rw"(C[wiID])
@@ -51,7 +51,7 @@ int main() {
   }
 
   KernelFunctor<> f(inputA, inputB);
-  if (!launchInlineASMTest(f))
+  if (!launchInlineASMTest(f, {16}))
     return 0;
 
   auto &C = f.getOutputBufferData();
diff --git a/sycl/test-e2e/InlineAsm/asm_float_imm_arg.cpp b/sycl/test-e2e/InlineAsm/asm_float_imm_arg.cpp
index 51cfa81149118..948d983554c4e 100644
--- a/sycl/test-e2e/InlineAsm/asm_float_imm_arg.cpp
+++ b/sycl/test-e2e/InlineAsm/asm_float_imm_arg.cpp
@@ -1,5 +1,5 @@
 // UNSUPPORTED: cuda, hip
-// REQUIRES: gpu,linux
+// REQUIRES: gpu,linux,sg-16
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
@@ -27,7 +27,7 @@ struct KernelFunctor : WithInputBuffers<T, 1>, WithOutputBuffer<T> {
 
     cgh.parallel_for<KernelFunctor<T>>(
         sycl::range<1>{this->getOutputBufferSize()},
-        [=](sycl::id<1> wiID) [[intel::reqd_sub_group_size(16)]] {
+        [=](sycl::id<1> wiID) [[sycl::reqd_sub_group_size(16)]] {
 #if defined(__SYCL_DEVICE_ONLY__)
           asm("mul (M1, 16) %0(0, 0)<1> %1(0, 0)<1;1,0> %2"
               : "=rw"(B[wiID])
@@ -45,7 +45,7 @@ int main() {
     input[i] = (float)1 / std::pow(2, i);
 
   KernelFunctor<> f(input);
-  if (!launchInlineASMTest(f))
+  if (!launchInlineASMTest(f, {16}))
     return 0;
 
   auto &B = f.getOutputBufferData();
diff --git a/sycl/test-e2e/InlineAsm/asm_float_neg.cpp b/sycl/test-e2e/InlineAsm/asm_float_neg.cpp
index 39176ab355c96..43ad56e41222d 100644
--- a/sycl/test-e2e/InlineAsm/asm_float_neg.cpp
+++ b/sycl/test-e2e/InlineAsm/asm_float_neg.cpp
@@ -1,5 +1,5 @@
 // UNSUPPORTED: cuda, hip
-// REQUIRES: gpu,linux
+// REQUIRES: gpu,linux,sg-16
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
@@ -25,7 +25,7 @@ struct KernelFunctor : WithInputBuffers<T, 1>, WithOutputBuffer<T> {
 
     cgh.parallel_for<KernelFunctor<T>>(
         sycl::range<1>{this->getOutputBufferSize()},
-        [=](sycl::id<1> wiID) [[intel::reqd_sub_group_size(16)]] {
+        [=](sycl::id<1> wiID) [[sycl::reqd_sub_group_size(16)]] {
 #if defined(__SYCL_DEVICE_ONLY__)
           asm("mov (M1, 16) %0(0, 0)<1> (-)%1(0, 0)<1;1,0>"
               : "=rw"(B[wiID])
@@ -45,7 +45,7 @@ int main() {
     input[i] = 1.0 / i;
 
   KernelFunctor<> f(input);
-  if (!launchInlineASMTest(f))
+  if (!launchInlineASMTest(f, {16}))
     return 0;
 
   auto &R = f.getOutputBufferData();
diff --git a/sycl/test-e2e/InlineAsm/asm_if.cpp b/sycl/test-e2e/InlineAsm/asm_if.cpp
index 9cee76efee2e0..679980a62aaf7 100644
--- a/sycl/test-e2e/InlineAsm/asm_if.cpp
+++ b/sycl/test-e2e/InlineAsm/asm_if.cpp
@@ -1,5 +1,5 @@
 // UNSUPPORTED: cuda, hip
-// REQUIRES: gpu,linux
+// REQUIRES: gpu,linux,sg-16
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
@@ -18,7 +18,7 @@ template <typename T = DataType> struct KernelFunctor : WithOutputBuffer<T> {
     bool switchField = false;
     CGH.parallel_for<KernelFunctor<T>>(
         sycl::range<1>{this->getOutputBufferSize()},
-        [=](sycl::id<1> wiID) [[intel::reqd_sub_group_size(16)]] {
+        [=](sycl::id<1> wiID) [[sycl::reqd_sub_group_size(16)]] {
           int Output = 0;
 #if defined(__SYCL_DEVICE_ONLY__)
           asm volatile("{\n"
@@ -42,7 +42,7 @@ template <typename T = DataType> struct KernelFunctor : WithOutputBuffer<T> {
 
 int main() {
   KernelFunctor<> Functor(DEFAULT_PROBLEM_SIZE);
-  if (!launchInlineASMTest(Functor))
+  if (!launchInlineASMTest(Functor, {16}))
     return 0;
 
   if (verify_all_the_same(Functor.getOutputBufferData(), 7))
diff --git a/sycl/test-e2e/InlineAsm/asm_imm_arg.cpp b/sycl/test-e2e/InlineAsm/asm_imm_arg.cpp
index 48415e1445af0..d2a53f14691da 100644
--- a/sycl/test-e2e/InlineAsm/asm_imm_arg.cpp
+++ b/sycl/test-e2e/InlineAsm/asm_imm_arg.cpp
@@ -1,5 +1,5 @@
 // UNSUPPORTED: cuda, hip
-// REQUIRES: gpu,linux
+// REQUIRES: gpu,linux,sg-16
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
@@ -26,7 +26,7 @@ struct KernelFunctor : WithInputBuffers<T, 1>, WithOutputBuffer<T> {
 
     cgh.parallel_for<KernelFunctor<T>>(
         sycl::range<1>{this->getOutputBufferSize()},
-        [=](sycl::id<1> wiID) [[intel::reqd_sub_group_size(16)]] {
+        [=](sycl::id<1> wiID) [[sycl::reqd_sub_group_size(16)]] {
 #if defined(__SYCL_DEVICE_ONLY__)
           asm("add (M1, 16) %0(0, 0)<1> %1(0, 0)<1;1,0> %2"
               : "=rw"(B[wiID])
@@ -44,7 +44,7 @@ int main() {
     input[i] = i;
 
   KernelFunctor<> f(input);
-  if (!launchInlineASMTest(f))
+  if (!launchInlineASMTest(f, {16}))
     return 0;
 
   auto &B = f.getOutputBufferData();
diff --git a/sycl/test-e2e/InlineAsm/asm_loop.cpp b/sycl/test-e2e/InlineAsm/asm_loop.cpp
index 0c1cc06568d6a..56518dffbf4e3 100644
--- a/sycl/test-e2e/InlineAsm/asm_loop.cpp
+++ b/sycl/test-e2e/InlineAsm/asm_loop.cpp
@@ -1,5 +1,5 @@
 // UNSUPPORTED: cuda, hip
-// REQUIRES: gpu,linux
+// REQUIRES: gpu,linux,sg-16
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
@@ -29,7 +29,7 @@ struct KernelFunctor : WithInputBuffers<T, 2>, WithOutputBuffer<T> {
             CGH);
     CGH.parallel_for<KernelFunctor<T>>(
         sycl::range<1>{this->getOutputBufferSize()},
-        [=](sycl::id<1> wiID) [[intel::reqd_sub_group_size(16)]] {
+        [=](sycl::id<1> wiID) [[sycl::reqd_sub_group_size(16)]] {
 #if defined(__SYCL_DEVICE_ONLY__)
           asm volatile("{\n"
                        ".decl P1 v_type=P num_elts=16\n"
@@ -67,7 +67,7 @@ int main() {
   }
 
   KernelFunctor<> Functor(InputA, InputB);
-  if (!launchInlineASMTest(Functor))
+  if (!launchInlineASMTest(Functor, {16}))
     return 0;
 
   auto &C = Functor.getOutputBufferData();
diff --git a/sycl/test-e2e/InlineAsm/asm_mul.cpp b/sycl/test-e2e/InlineAsm/asm_mul.cpp
index 3f2b91c53cf59..b038a79f8abb1 100644
--- a/sycl/test-e2e/InlineAsm/asm_mul.cpp
+++ b/sycl/test-e2e/InlineAsm/asm_mul.cpp
@@ -1,5 +1,5 @@
 // UNSUPPORTED: cuda, hip
-// REQUIRES: gpu,linux
+// REQUIRES: gpu,linux,sg-16
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
@@ -28,7 +28,7 @@ struct KernelFunctor : WithInputBuffers<T, 2>, WithOutputBuffer<T> {
 
     cgh.parallel_for<KernelFunctor<T>>(
         sycl::range<1>{this->getOutputBufferSize()},
-        [=](sycl::id<1> wiID) [[intel::reqd_sub_group_size(16)]] {
+        [=](sycl::id<1> wiID) [[sycl::reqd_sub_group_size(16)]] {
 #if defined(__SYCL_DEVICE_ONLY__)
           asm("mul (M1, 16) %0(0, 0)<1> %1(0, 0)<1;1,0> %2(0, 0)<1;1,0>"
               : "=rw"(C[wiID])
@@ -49,7 +49,7 @@ int main() {
   }
 
   KernelFunctor<> f(inputA, inputB);
-  if (!launchInlineASMTest(f))
+  if (!launchInlineASMTest(f, {16}))
     return 0;
 
   auto &C = f.getOutputBufferData();
diff --git a/sycl/test-e2e/InlineAsm/asm_multiple_instructions.cpp b/sycl/test-e2e/InlineAsm/asm_multiple_instructions.cpp
index f51f5f58501ca..cdcf08b0f809d 100644
--- a/sycl/test-e2e/InlineAsm/asm_multiple_instructions.cpp
+++ b/sycl/test-e2e/InlineAsm/asm_multiple_instructions.cpp
@@ -1,5 +1,5 @@
 // UNSUPPORTED: cuda || hip
-// REQUIRES: gpu,linux
+// REQUIRES: gpu,linux,sg-16
 // RUN: %{build} -DTO_PASS -o %t.out.pass
 // RUN: %{run} %t.out.pass
 // RUN: %{build} -o %t.out
@@ -36,7 +36,7 @@ struct KernelFunctor : WithInputBuffers<T, 3>, WithOutputBuffer<T> {
 
     cgh.parallel_for<KernelFunctor<T>>(
         sycl::range<1>{this->getOutputBufferSize()},
-        [=](sycl::id<1> wiID) [[intel::reqd_sub_group_size(16)]] {
+        [=](sycl::id<1> wiID) [[sycl::reqd_sub_group_size(16)]] {
 #if defined(TO_PASS)
           // The code below passing verification
           volatile int output = -1;
@@ -85,7 +85,7 @@ int main() {
   }
 
   KernelFunctor<> f(inputA, inputB, inputC);
-  if (!launchInlineASMTest(f))
+  if (!launchInlineASMTest(f, {16}))
     return 0;
 
   if (verify_all_the_same(f.getOutputBufferData(),
diff --git a/sycl/test-e2e/InlineAsm/asm_no_operands.cpp b/sycl/test-e2e/InlineAsm/asm_no_operands.cpp
index 5037fd68af41c..5b770bb5baa0d 100644
--- a/sycl/test-e2e/InlineAsm/asm_no_operands.cpp
+++ b/sycl/test-e2e/InlineAsm/asm_no_operands.cpp
@@ -1,5 +1,5 @@
 // UNSUPPORTED: cuda, hip
-// REQUIRES: gpu,linux
+// REQUIRES: gpu,linux,sg-16
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
@@ -12,10 +12,7 @@ int main() {
   sycl::queue Queue;
   sycl::device Device = Queue.get_device();
 
-  auto Vec = Device.get_info<sycl::info::device::extensions>();
-  if (!isInlineASMSupported(Device) ||
-      std::find(Vec.begin(), Vec.end(), "cl_intel_required_subgroup_size") ==
-          std::end(Vec)) {
+  if (!isInlineASMSupported(Device)) {
     std::cout << "Skipping test\n";
     return 0;
   }
@@ -25,12 +22,13 @@ int main() {
   // Submitting command group(work) to queue
   Queue.submit([&](sycl::handler &cgh) {
     // Executing kernel
-    cgh.parallel_for<no_operands_kernel>(
-        NumOfWorkItems,
-        [=](sycl::id<1> WIid) [[intel::reqd_sub_group_size(16)]] {
+    // clang-format off
+    cgh.parallel_for<no_operands_kernel>(NumOfWorkItems,
+        [=](sycl::id<1> WIid) [[sycl::reqd_sub_group_size(16)]] {
 #if defined(__SYCL_DEVICE_ONLY__)
-          asm("barrier");
+      asm("barrier");
 #endif
-        });
+    });
+    // clang-format on
   });
 }
diff --git a/sycl/test-e2e/InlineAsm/asm_no_output.cpp b/sycl/test-e2e/InlineAsm/asm_no_output.cpp
index a6d44b35a63dd..0622f4f8edb38 100644
--- a/sycl/test-e2e/InlineAsm/asm_no_output.cpp
+++ b/sycl/test-e2e/InlineAsm/asm_no_output.cpp
@@ -1,5 +1,5 @@
 // UNSUPPORTED: cuda || hip
-// REQUIRES: gpu,linux
+// REQUIRES: gpu,linux,sg-16
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
@@ -19,7 +19,7 @@ template <typename T = dataType> struct KernelFunctor : WithOutputBuffer<T> {
             cgh);
     cgh.parallel_for<KernelFunctor<T>>(
         sycl::range<1>{this->getOutputBufferSize()},
-        [=](sycl::id<1> wiID) [[intel::reqd_sub_group_size(16)]] {
+        [=](sycl::id<1> wiID) [[sycl::reqd_sub_group_size(16)]] {
           volatile int local_var = 47;
           local_var += C[0];
 #if defined(__SYCL_DEVICE_ONLY__)
@@ -37,7 +37,7 @@ template <typename T = dataType> struct KernelFunctor : WithOutputBuffer<T> {
 
 int main() {
   KernelFunctor<> f(DEFAULT_PROBLEM_SIZE);
-  if (!launchInlineASMTest(f))
+  if (!launchInlineASMTest(f, {16}))
     return 0;
 
   if (verify_all_the_same(f.getOutputBufferData(), 0))
diff --git a/sycl/test-e2e/InlineAsm/asm_plus_mod.cpp b/sycl/test-e2e/InlineAsm/asm_plus_mod.cpp
index 10157938517bf..db2659d8e7077 100644
--- a/sycl/test-e2e/InlineAsm/asm_plus_mod.cpp
+++ b/sycl/test-e2e/InlineAsm/asm_plus_mod.cpp
@@ -1,5 +1,5 @@
 // UNSUPPORTED: cuda || hip
-// REQUIRES: gpu,linux
+// REQUIRES: gpu,linux,sg-16
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
@@ -25,7 +25,7 @@ struct KernelFunctor : WithInputBuffers<T, 1>, WithOutputBuffer<T> {
 
     cgh.parallel_for<KernelFunctor<T>>(
         sycl::range<1>{this->getOutputBufferSize()},
-        [=](sycl::id<1> wiID) [[intel::reqd_sub_group_size(16)]] {
+        [=](sycl::id<1> wiID) [[sycl::reqd_sub_group_size(16)]] {
 #if defined(__SYCL_DEVICE_ONLY__)
           asm("add (M1, 16) %0(0, 0)<1> %0(0, 0)<1;1,0> %1(0, 0)<1;1,0>"
               : "+rw"(B[wiID])
@@ -47,7 +47,7 @@ int main() {
   }
 
   KernelFunctor<> f(inputA, inputB);
-  if (!launchInlineASMTest(f))
+  if (!launchInlineASMTest(f, {16}))
     return 0;
 
   auto &B = f.getOutputBufferData();
diff --git a/sycl/test-e2e/InlineAsm/asm_switch.cpp b/sycl/test-e2e/InlineAsm/asm_switch.cpp
index e77a7e6edc531..ccd81fd34af45 100644
--- a/sycl/test-e2e/InlineAsm/asm_switch.cpp
+++ b/sycl/test-e2e/InlineAsm/asm_switch.cpp
@@ -1,5 +1,5 @@
 // UNSUPPORTED: cuda || hip
-// REQUIRES: gpu,linux
+// REQUIRES: gpu,linux,sg-16
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
@@ -18,7 +18,7 @@ template <typename T = DataType> struct KernelFunctor : WithOutputBuffer<T> {
     int switchField = 2;
     CGH.parallel_for<KernelFunctor<T>>(
         sycl::range<1>{this->getOutputBufferSize()},
-        [=](sycl::id<1> wiID) [[intel::reqd_sub_group_size(16)]] {
+        [=](sycl::id<1> wiID) [[sycl::reqd_sub_group_size(16)]] {
           int Output = 0;
 #if defined(__SYCL_DEVICE_ONLY__)
           asm volatile("{\n"
@@ -62,7 +62,7 @@ template <typename T = DataType> struct KernelFunctor : WithOutputBuffer<T> {
 
 int main() {
   KernelFunctor<> Functor(DEFAULT_PROBLEM_SIZE);
-  if (!launchInlineASMTest(Functor))
+  if (!launchInlineASMTest(Functor, {16}))
     return 0;
 
   if (verify_all_the_same(Functor.getOutputBufferData(), 7))
diff --git a/sycl/test-e2e/InlineAsm/include/asmhelper.h b/sycl/test-e2e/InlineAsm/include/asmhelper.h
index f96d056dcfec7..abc7d24668691 100644
--- a/sycl/test-e2e/InlineAsm/include/asmhelper.h
+++ b/sycl/test-e2e/InlineAsm/include/asmhelper.h
@@ -91,8 +91,8 @@ auto exception_handler = [](sycl::exception_list exceptions) {
 };
 
 template <typename F>
-bool launchInlineASMTestImpl(F &f, bool requires_particular_sg_size = true,
-                             std::vector<int> RequiredSGSizes = {}) {
+bool launchInlineASMTestImpl(F &f,
+                             const std::vector<int> &RequiredSGSizes = {}) {
   sycl::queue deviceQueue(sycl::gpu_selector_v, exception_handler);
   sycl::device device = deviceQueue.get_device();
 
@@ -101,22 +101,19 @@ bool launchInlineASMTestImpl(F &f, bool requires_particular_sg_size = true,
     return false;
   }
 
-  auto Vec = device.get_info<sycl::info::device::extensions>();
-  if (requires_particular_sg_size &&
-      std::find(Vec.begin(), Vec.end(), "cl_intel_required_subgroup_size") ==
-          std::end(Vec)) {
-    std::cout << "Skipping test\n";
-    return false;
-  }
-
-  auto sg_sizes = device.get_info<sycl::info::device::sub_group_sizes>();
-  if (std::any_of(RequiredSGSizes.begin(), RequiredSGSizes.end(),
-                  [&](size_t RequiredSGSize) {
-                    return std::find(sg_sizes.begin(), sg_sizes.end(),
-                                     RequiredSGSize) == sg_sizes.end();
-                  })) {
-    std::cout << "Skipping test\n";
-    return false;
+  if (!RequiredSGSizes.empty()) {
+    auto supported_sg_sizes =
+        device.get_info<sycl::info::device::sub_group_sizes>();
+    if (std::any_of(RequiredSGSizes.begin(), RequiredSGSizes.end(),
+                    [&](size_t RequiredSGSize) {
+                      return std::find(supported_sg_sizes.begin(),
+                                       supported_sg_sizes.end(),
+                                       RequiredSGSize) ==
+                             supported_sg_sizes.end();
+                    })) {
+      std::cout << "Skipping test\n";
+      return false;
+    }
   }
 
   deviceQueue.submit(f).wait_and_throw();
@@ -128,13 +125,11 @@ bool launchInlineASMTestImpl(F &f, bool requires_particular_sg_size = true,
 ///
 /// \returns false if test wasn't launched (i.e.was skipped) and true otherwise
 template <typename F>
-bool launchInlineASMTest(F &f, bool requires_particular_sg_size = true,
-                         bool exception_expected = false,
-                         std::vector<int> RequiredSGSizes = {}) {
+bool launchInlineASMTest(F &f, const std::vector<int> &RequiredSGSizes = {},
+                         bool exception_expected = false) {
   bool result = false;
   try {
-    result = launchInlineASMTestImpl(f, requires_particular_sg_size,
-                                     RequiredSGSizes);
+    result = launchInlineASMTestImpl(f, RequiredSGSizes);
   } catch (sycl::exception &e) {
     std::string what = e.what();
     if (exception_expected &&
diff --git a/sycl/test-e2e/InlineAsm/letter_example.cpp b/sycl/test-e2e/InlineAsm/letter_example.cpp
index 393c362c33fe0..c09c6d9f75ae4 100644
--- a/sycl/test-e2e/InlineAsm/letter_example.cpp
+++ b/sycl/test-e2e/InlineAsm/letter_example.cpp
@@ -1,5 +1,5 @@
 // UNSUPPORTED: cuda, hip
-// REQUIRES: gpu,linux
+// REQUIRES: gpu,linux,sg-16,aspect-usm_shared_allocations
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
@@ -15,10 +15,7 @@ int main() {
   sycl::queue q;
   sycl::device Device = q.get_device();
 
-  auto Vec = Device.get_info<sycl::info::device::extensions>();
-  if (!isInlineASMSupported(Device) ||
-      std::find(Vec.begin(), Vec.end(), "cl_intel_required_subgroup_size") ==
-          std::end(Vec)) {
+  if (!isInlineASMSupported(Device)) {
     std::cout << "Skipping test\n";
     return 0;
   }
@@ -31,7 +28,7 @@ int main() {
   q.submit([&](sycl::handler &cgh) {
      cgh.parallel_for<kernel_name>(
          sycl::range<1>(problem_size),
-         [=](sycl::id<1> idx) [[intel::reqd_sub_group_size(16)]] {
+         [=](sycl::id<1> idx) [[sycl::reqd_sub_group_size(16)]] {
 #if defined(__SYCL_DEVICE_ONLY__)
            int i = idx[0];
            asm volatile("{\n.decl V52 v_type=G type=d num_elts=16 align=GRF\n"
diff --git a/sycl/test-e2e/InlineAsm/malloc_shared_32.cpp b/sycl/test-e2e/InlineAsm/malloc_shared_32.cpp
index be2ee81836261..c541ac4d17838 100644
--- a/sycl/test-e2e/InlineAsm/malloc_shared_32.cpp
+++ b/sycl/test-e2e/InlineAsm/malloc_shared_32.cpp
@@ -1,5 +1,5 @@
 // UNSUPPORTED: cuda, hip
-// REQUIRES: gpu,linux
+// REQUIRES: gpu,linux,sg-32,aspect-usm_shared_allocations
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
@@ -16,10 +16,7 @@ int main() {
 
   sycl::device Device = q.get_device();
 
-  auto Vec = Device.get_info<sycl::info::device::extensions>();
-  if (!isInlineASMSupported(Device) ||
-      std::find(Vec.begin(), Vec.end(), "cl_intel_required_subgroup_size") ==
-          std::end(Vec)) {
+  if (!isInlineASMSupported(Device)) {
     std::cout << "Skipping test\n";
     return 0;
   }
@@ -40,7 +37,7 @@ int main() {
   q.submit([&](sycl::handler &cgh) {
      cgh.parallel_for<kernel_name>(
          sycl::range<1>(problem_size),
-         [=](sycl::id<1> idx) [[intel::reqd_sub_group_size(32)]] {
+         [=](sycl::id<1> idx) [[sycl::reqd_sub_group_size(32)]] {
            int i = idx[0];
 #if defined(__SYCL_DEVICE_ONLY__)
            asm volatile(R"a(
diff --git a/sycl/test-e2e/InlineAsm/malloc_shared_in_out_dif.cpp b/sycl/test-e2e/InlineAsm/malloc_shared_in_out_dif.cpp
index 0470bb2431e9a..5ea597d5d1064 100644
--- a/sycl/test-e2e/InlineAsm/malloc_shared_in_out_dif.cpp
+++ b/sycl/test-e2e/InlineAsm/malloc_shared_in_out_dif.cpp
@@ -1,5 +1,5 @@
 // UNSUPPORTED: cuda, hip
-// REQUIRES: gpu,linux
+// REQUIRES: gpu,linux,sg-16,aspect-usm_shared_allocations
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
@@ -16,10 +16,7 @@ int main() {
 
   sycl::device Device = q.get_device();
 
-  auto Vec = Device.get_info<sycl::info::device::extensions>();
-  if (!isInlineASMSupported(Device) ||
-      std::find(Vec.begin(), Vec.end(), "cl_intel_required_subgroup_size") ==
-          std::end(Vec)) {
+  if (!isInlineASMSupported(Device)) {
     std::cout << "Skipping test\n";
     return 0;
   }
@@ -37,7 +34,7 @@ int main() {
   q.submit([&](sycl::handler &cgh) {
      cgh.parallel_for<kernel_name>(
          sycl::range<1>(problem_size),
-         [=](sycl::id<1> idx) [[intel::reqd_sub_group_size(16)]] {
+         [=](sycl::id<1> idx) [[sycl::reqd_sub_group_size(16)]] {
            int i = idx[0];
            volatile int tmp = a[i];
            tmp += 1;
diff --git a/sycl/test-e2e/InlineAsm/malloc_shared_no_input.cpp b/sycl/test-e2e/InlineAsm/malloc_shared_no_input.cpp
index 76ad3f6e95260..3da4e4c7e7e71 100644
--- a/sycl/test-e2e/InlineAsm/malloc_shared_no_input.cpp
+++ b/sycl/test-e2e/InlineAsm/malloc_shared_no_input.cpp
@@ -1,5 +1,5 @@
 // UNSUPPORTED: cuda, hip
-// REQUIRES: gpu,linux
+// REQUIRES: gpu,linux,sg-16,aspect-usm_shared_allocations
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
@@ -15,10 +15,7 @@ int main() {
   sycl::queue q;
   sycl::device Device = q.get_device();
 
-  auto Vec = Device.get_info<sycl::info::device::extensions>();
-  if (!isInlineASMSupported(Device) ||
-      std::find(Vec.begin(), Vec.end(), "cl_intel_required_subgroup_size") ==
-          std::end(Vec)) {
+  if (!isInlineASMSupported(Device)) {
     std::cout << "Skipping test\n";
     return 0;
   }
@@ -31,7 +28,7 @@ int main() {
   q.submit([&](sycl::handler &cgh) {
      cgh.parallel_for<kernel_name>(
          sycl::range<1>(problem_size),
-         [=](sycl::id<1> idx) [[intel::reqd_sub_group_size(16)]] {
+         [=](sycl::id<1> idx) [[sycl::reqd_sub_group_size(16)]] {
            int i = idx[0];
 #if defined(__SYCL_DEVICE_ONLY__)
            asm volatile("mov (M1, 16) %0(0,0)<1> 0x7:d" : "=rw"(a[i]));

From 89eccced17d0654659516b2709327bd09729729e Mon Sep 17 00:00:00 2001
From: Maksim Sabianin <maksim.sabianin@intel.com>
Date: Thu, 8 Feb 2024 17:37:03 +0100
Subject: [PATCH 17/26] [SYCL] Move module splitting functionality from
 sycl-post-link to SYCLLowerIR (#12622)

This is part of the migration to the new offloading model and the
clang-linker-wrapper tool.
The signature of verifyNoCrossModuleDeviceGlobalUsage is changed so
that it returns an Error instead of aborting the executable.
---
 .../llvm/SYCLLowerIR}/ModuleSplitter.h        | 10 +++--
 llvm/lib/SYCLLowerIR/CMakeLists.txt           |  1 +
 .../SYCLLowerIR}/ModuleSplitter.cpp           | 43 ++++++++++++-------
 llvm/tools/sycl-post-link/CMakeLists.txt      |  1 -
 .../sycl-post-link/SYCLDeviceRequirements.cpp |  2 +-
 llvm/tools/sycl-post-link/sycl-post-link.cpp  |  9 ++--
 6 files changed, 42 insertions(+), 24 deletions(-)
 rename llvm/{tools/sycl-post-link => include/llvm/SYCLLowerIR}/ModuleSplitter.h (97%)
 rename llvm/{tools/sycl-post-link => lib/SYCLLowerIR}/ModuleSplitter.cpp (97%)

diff --git a/llvm/tools/sycl-post-link/ModuleSplitter.h b/llvm/include/llvm/SYCLLowerIR/ModuleSplitter.h
similarity index 97%
rename from llvm/tools/sycl-post-link/ModuleSplitter.h
rename to llvm/include/llvm/SYCLLowerIR/ModuleSplitter.h
index aaf6108325765..eb09e7528ff49 100644
--- a/llvm/tools/sycl-post-link/ModuleSplitter.h
+++ b/llvm/include/llvm/SYCLLowerIR/ModuleSplitter.h
@@ -10,7 +10,8 @@
 // of the split is new modules containing corresponding callgraph.
 //===----------------------------------------------------------------------===//
 
-#pragma once
+#ifndef LLVM_SYCLLOWERIR_MODULE_SPLITTER_H
+#define LLVM_SYCLLOWERIR_MODULE_SPLITTER_H
 
 #include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/StringRef.h"
@@ -18,6 +19,7 @@
 #include "llvm/Support/Error.h"
 
 #include <memory>
+#include <string>
 #include <vector>
 
 namespace llvm {
@@ -229,8 +231,8 @@ class ModuleSplitterBase {
   // For device global variables with the 'device_image_scope' property,
   // the function checks that there are no usages of a single device global
   // variable from kernels grouped to different modules. Otherwise, an error is
-  // issued and the tool is aborted.
-  void verifyNoCrossModuleDeviceGlobalUsage();
+  // returned.
+  Error verifyNoCrossModuleDeviceGlobalUsage();
 
   virtual ~ModuleSplitterBase() = default;
 
@@ -262,3 +264,5 @@ void dumpEntryPoints(const Module &M, bool OnlyKernelsAreEntryPoints = false,
 } // namespace module_split
 
 } // namespace llvm
+
+#endif // LLVM_SYCLLOWERIR_MODULE_SPLITTER_H
diff --git a/llvm/lib/SYCLLowerIR/CMakeLists.txt b/llvm/lib/SYCLLowerIR/CMakeLists.txt
index cfafbe1a66c87..b2afa8150aa4f 100644
--- a/llvm/lib/SYCLLowerIR/CMakeLists.txt
+++ b/llvm/lib/SYCLLowerIR/CMakeLists.txt
@@ -62,6 +62,7 @@ add_llvm_component_library(LLVMSYCLLowerIR
   LowerInvokeSimd.cpp
   LowerWGLocalMemory.cpp
   LowerWGScope.cpp
+  ModuleSplitter.cpp
   MutatePrintfAddrspace.cpp
   SYCLAddOptLevelAttribute.cpp
   SYCLPropagateAspectsUsage.cpp
diff --git a/llvm/tools/sycl-post-link/ModuleSplitter.cpp b/llvm/lib/SYCLLowerIR/ModuleSplitter.cpp
similarity index 97%
rename from llvm/tools/sycl-post-link/ModuleSplitter.cpp
rename to llvm/lib/SYCLLowerIR/ModuleSplitter.cpp
index 87cbf42da2df2..92ff992141945 100644
--- a/llvm/tools/sycl-post-link/ModuleSplitter.cpp
+++ b/llvm/lib/SYCLLowerIR/ModuleSplitter.cpp
@@ -8,9 +8,7 @@
 // See comments in the header.
 //===----------------------------------------------------------------------===//
 
-#include "ModuleSplitter.h"
-#include "Support.h"
-
+#include "llvm/SYCLLowerIR/ModuleSplitter.h"
 #include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/StringExtras.h"
@@ -23,6 +21,7 @@
 #include "llvm/SYCLLowerIR/DeviceGlobals.h"
 #include "llvm/SYCLLowerIR/LowerInvokeSimd.h"
 #include "llvm/SYCLLowerIR/SYCLUtils.h"
+#include "llvm/Support/Error.h"
 #include "llvm/Transforms/IPO.h"
 #include "llvm/Transforms/IPO/GlobalDCE.h"
 #include "llvm/Transforms/IPO/StripDeadPrototypes.h"
@@ -426,14 +425,15 @@ class ModuleSplitter : public ModuleSplitterBase {
   DependencyGraph CG;
 };
 } // namespace
+
 namespace llvm {
 namespace module_split {
 
-void ModuleSplitterBase::verifyNoCrossModuleDeviceGlobalUsage() {
+Error ModuleSplitterBase::verifyNoCrossModuleDeviceGlobalUsage() {
   const Module &M = getInputModule();
   // Early exit if there is only one group
   if (Groups.size() < 2)
-    return;
+    return Error::success();
 
   // Reverse the EntryPointGroupMap to get a map of entry point -> module's name
   unsigned EntryPointNumber = 0;
@@ -451,19 +451,25 @@ void ModuleSplitterBase::verifyNoCrossModuleDeviceGlobalUsage() {
 
     std::optional<StringRef> VarEntryPointModule{};
     auto CheckEntryPointModule = [&VarEntryPointModule, &EntryPointModules,
-                                  &GV](const auto *F) {
+                                  &GV](const auto *F) -> Error {
       auto EntryPointModulesIt = EntryPointModules.find(F);
-      assert(EntryPointModulesIt != EntryPointModules.end() &&
-             "There is no group for an entry point");
+      if (EntryPointModulesIt == EntryPointModules.end())
+        return createStringError(inconvertibleErrorCode(),
+                                 "There is no group for an entry point");
+
       if (!VarEntryPointModule.has_value()) {
         VarEntryPointModule = EntryPointModulesIt->second;
-        return;
-      }
-      if (EntryPointModulesIt->second != *VarEntryPointModule) {
-        error("device_global variable '" + Twine(GV.getName()) +
-              "' with property \"device_image_scope\" is used in more "
-              "than one device image.");
+        return Error::success();
       }
+
+      if (EntryPointModulesIt->second != *VarEntryPointModule)
+        return createStringError(
+            inconvertibleErrorCode(),
+            "device_global variable '" + Twine(GV.getName()) +
+                "' with property \"device_image_scope\" is used in more "
+                "than one device image.");
+
+      return Error::success();
     };
 
     SmallSetVector<const User *, 32> Workqueue;
@@ -478,13 +484,18 @@ void ModuleSplitterBase::verifyNoCrossModuleDeviceGlobalUsage() {
         continue;
       }
       if (auto *F = dyn_cast<const Function>(U)) {
-        if (EntryPointModules.count(F))
-          CheckEntryPointModule(F);
+        if (EntryPointModules.count(F)) {
+          auto E = CheckEntryPointModule(F);
+          if (E)
+            return E;
+        }
       }
       for (auto *UU : U->users())
         Workqueue.insert(UU);
     }
   }
+
+  return Error::success();
 }
 
 #ifndef NDEBUG
diff --git a/llvm/tools/sycl-post-link/CMakeLists.txt b/llvm/tools/sycl-post-link/CMakeLists.txt
index 40bd3f899e487..3905e836aaae8 100644
--- a/llvm/tools/sycl-post-link/CMakeLists.txt
+++ b/llvm/tools/sycl-post-link/CMakeLists.txt
@@ -24,7 +24,6 @@ include_directories(
 
 add_llvm_tool(sycl-post-link
   sycl-post-link.cpp
-  ModuleSplitter.cpp
   SpecConstants.cpp
   SYCLDeviceLibReqMask.cpp
   SYCLKernelParamOptInfo.cpp
diff --git a/llvm/tools/sycl-post-link/SYCLDeviceRequirements.cpp b/llvm/tools/sycl-post-link/SYCLDeviceRequirements.cpp
index 70d3a90f51785..5255ce7bf2a66 100644
--- a/llvm/tools/sycl-post-link/SYCLDeviceRequirements.cpp
+++ b/llvm/tools/sycl-post-link/SYCLDeviceRequirements.cpp
@@ -7,11 +7,11 @@
 //===----------------------------------------------------------------------===//
 
 #include "SYCLDeviceRequirements.h"
-#include "ModuleSplitter.h"
 
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/IR/Module.h"
+#include "llvm/SYCLLowerIR/ModuleSplitter.h"
 #include "llvm/Support/PropertySetIO.h"
 
 #include <set>
diff --git a/llvm/tools/sycl-post-link/sycl-post-link.cpp b/llvm/tools/sycl-post-link/sycl-post-link.cpp
index 85728be8aabf0..068962b63c1fc 100644
--- a/llvm/tools/sycl-post-link/sycl-post-link.cpp
+++ b/llvm/tools/sycl-post-link/sycl-post-link.cpp
@@ -13,7 +13,6 @@
 // - specialization constant intrinsic transformation
 //===----------------------------------------------------------------------===//
 
-#include "ModuleSplitter.h"
 #include "SYCLDeviceLibReqMask.h"
 #include "SYCLDeviceRequirements.h"
 #include "SYCLKernelParamOptInfo.h"
@@ -40,6 +39,7 @@
 #include "llvm/SYCLLowerIR/ESIMD/LowerESIMD.h"
 #include "llvm/SYCLLowerIR/HostPipes.h"
 #include "llvm/SYCLLowerIR/LowerInvokeSimd.h"
+#include "llvm/SYCLLowerIR/ModuleSplitter.h"
 #include "llvm/SYCLLowerIR/SYCLUtils.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/FileSystem.h"
@@ -1009,8 +1009,11 @@ processInputModule(std::unique_ptr<Module> M) {
   Modified |= SplitOccurred;
 
   // FIXME: this check is not performed for ESIMD splits
-  if (DeviceGlobals)
-    Splitter->verifyNoCrossModuleDeviceGlobalUsage();
+  if (DeviceGlobals) {
+    auto E = Splitter->verifyNoCrossModuleDeviceGlobalUsage();
+    if (E)
+      error(toString(std::move(E)));
+  }
 
   // It is important that we *DO NOT* preserve all the splits in memory at the
   // same time, because it leads to a huge RAM consumption by the tool on bigger

From 7f5ade2bd05cc8686bd50f6356000dc7eba5bcb7 Mon Sep 17 00:00:00 2001
From: aelovikov-intel <andrei.elovikov@intel.com>
Date: Thu, 8 Feb 2024 09:44:15 -0800
Subject: [PATCH 18/26] [NFC][SYCL] Move helpers from builtins_preview.hpp to
 their single uses (#12654)

There are some issues with them as well (both in naming and bugs in
implementation), to be fixed in a separate PR to ease review.
---
 sycl/include/sycl/builtins_preview.hpp        | 23 -------------------
 .../detail/builtins/integer_functions.inc     | 12 ++++++++++
 .../detail/builtins/relational_functions.inc  | 14 +++++++++++
 3 files changed, 26 insertions(+), 23 deletions(-)

diff --git a/sycl/include/sycl/builtins_preview.hpp b/sycl/include/sycl/builtins_preview.hpp
index bc497c540693b..dfc44e5848f91 100644
--- a/sycl/include/sycl/builtins_preview.hpp
+++ b/sycl/include/sycl/builtins_preview.hpp
@@ -175,12 +175,6 @@ auto builtin_delegate_to_scalar(FuncTy F, const Ts &...x) {
   }
 }
 
-template <typename T>
-struct any_elem_type
-    : std::bool_constant<check_type_in_v<
-          get_elem_type_t<T>, float, double, half, char, signed char, short,
-          int, long, long long, unsigned char, unsigned short, unsigned int,
-          unsigned long, unsigned long long>> {};
 template <typename T>
 struct fp_elem_type
     : std::bool_constant<
@@ -188,16 +182,6 @@ struct fp_elem_type
 template <typename T>
 struct float_elem_type
     : std::bool_constant<check_type_in_v<get_elem_type_t<T>, float>> {};
-template <typename T>
-struct integer_elem_type
-    : std::bool_constant<
-          check_type_in_v<get_elem_type_t<T>, char, signed char, short, int,
-                          long, long long, unsigned char, unsigned short,
-                          unsigned int, unsigned long, unsigned long long>> {};
-template <typename T>
-struct suint32_elem_type
-    : std::bool_constant<
-          check_type_in_v<get_elem_type_t<T>, int32_t, uint32_t>> {};
 
 template <typename... Ts>
 struct same_basic_shape : std::bool_constant<builtin_same_shape_v<Ts...>> {};
@@ -244,13 +228,6 @@ struct builtin_enable
                               SHAPE_CHECKER, EXTRA_CONDITIONS, Ts...>::type;   \
   }
 } // namespace detail
-
-BUILTIN_CREATE_ENABLER(builtin_enable_generic, default_ret_type, any_elem_type,
-                       any_shape, same_elem_type)
-BUILTIN_CREATE_ENABLER(builtin_enable_generic_scalar, default_ret_type,
-                       any_elem_type, scalar_only, same_elem_type)
-BUILTIN_CREATE_ENABLER(builtin_enable_generic_non_scalar, default_ret_type,
-                       any_elem_type, non_scalar_only, same_elem_type)
 } // namespace _V1
 } // namespace sycl
 
diff --git a/sycl/include/sycl/detail/builtins/integer_functions.inc b/sycl/include/sycl/detail/builtins/integer_functions.inc
index 455abf07aa250..dab0cee8a647a 100644
--- a/sycl/include/sycl/detail/builtins/integer_functions.inc
+++ b/sycl/include/sycl/detail/builtins/integer_functions.inc
@@ -12,6 +12,18 @@
 
 namespace sycl {
 inline namespace _V1 {
+namespace detail {
+template <typename T>
+struct integer_elem_type
+    : std::bool_constant<
+          check_type_in_v<get_elem_type_t<T>, char, signed char, short, int,
+                          long, long long, unsigned char, unsigned short,
+                          unsigned int, unsigned long, unsigned long long>> {};
+template <typename T>
+struct suint32_elem_type
+    : std::bool_constant<
+          check_type_in_v<get_elem_type_t<T>, int32_t, uint32_t>> {};
+} // namespace detail
 BUILTIN_CREATE_ENABLER(builtin_enable_integer, default_ret_type,
                        integer_elem_type, any_shape, same_elem_type)
 BUILTIN_CREATE_ENABLER(builtin_enable_integer_non_scalar, default_ret_type,
diff --git a/sycl/include/sycl/detail/builtins/relational_functions.inc b/sycl/include/sycl/detail/builtins/relational_functions.inc
index d23f144676cdb..0a38d79d20589 100644
--- a/sycl/include/sycl/detail/builtins/relational_functions.inc
+++ b/sycl/include/sycl/detail/builtins/relational_functions.inc
@@ -13,6 +13,13 @@
 namespace sycl {
 inline namespace _V1 {
 namespace detail {
+template <typename T>
+struct any_elem_type
+    : std::bool_constant<check_type_in_v<
+          get_elem_type_t<T>, float, double, half, char, signed char, short,
+          int, long, long long, unsigned char, unsigned short, unsigned int,
+          unsigned long, unsigned long long>> {};
+
 template <typename T>
 struct rel_ret_traits
     : std::conditional<is_scalar_arithmetic_v<T>, bool,
@@ -21,6 +28,13 @@ struct rel_ret_traits
                            same_size_signed_int_t<simplify_if_swizzle_t<T>>>> {
 };
 } // namespace detail
+
+BUILTIN_CREATE_ENABLER(builtin_enable_generic, default_ret_type, any_elem_type,
+                       any_shape, same_elem_type)
+BUILTIN_CREATE_ENABLER(builtin_enable_generic_scalar, default_ret_type,
+                       any_elem_type, scalar_only, same_elem_type)
+BUILTIN_CREATE_ENABLER(builtin_enable_generic_non_scalar, default_ret_type,
+                       any_elem_type, non_scalar_only, same_elem_type)
 BUILTIN_CREATE_ENABLER(builtin_enable_rel, rel_ret_traits, fp_elem_type,
                        non_scalar_only, same_elem_type)
 

From 573e28b4085c89c1f65ab70da5d83d34ae9f0bb7 Mon Sep 17 00:00:00 2001
From: Piotr Balcer <piotr.balcer@intel.com>
Date: Thu, 8 Feb 2024 20:35:04 +0100
Subject: [PATCH 19/26] [UR] update UR commit (#12658)

---
 sycl/plugins/unified_runtime/CMakeLists.txt | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/sycl/plugins/unified_runtime/CMakeLists.txt b/sycl/plugins/unified_runtime/CMakeLists.txt
index 6a91815bc890f..2ea2b2f37622a 100644
--- a/sycl/plugins/unified_runtime/CMakeLists.txt
+++ b/sycl/plugins/unified_runtime/CMakeLists.txt
@@ -57,13 +57,13 @@ if(SYCL_PI_UR_USE_FETCH_CONTENT)
   include(FetchContent)
 
   set(UNIFIED_RUNTIME_REPO "https://github.com/oneapi-src/unified-runtime.git")
-  # commit 1cd402ead34a54459a6acb85777fbec105a178a0
-  # Merge: 5b89ee8b c8e150c5
+  # commit 9babc4d092a92c1036791d26ef328e5eeaf19803
+  # Merge: 3be8f205 90498ec5
   # Author: aarongreig <aaron.greig@codeplay.com>
-  # Date:   Tue Feb 6 14:48:55 2024 +0000
-  #     Merge pull request #1218 from Bensuo/maxime/imm-cmd-list-support
-  #     [EXP][CMDBUF] L0 Immediate command-list support
-  set(UNIFIED_RUNTIME_TAG 1cd402ead34a54459a6acb85777fbec105a178a0)
+  # Date:   Thu Feb 8 15:44:54 2024 +0000
+  #     Merge pull request #1321 from pbalcer/adapter-compute-constructor
+  #     [L0] move adapter init into its constructor from urAdapterGet
+  set(UNIFIED_RUNTIME_TAG 9babc4d092a92c1036791d26ef328e5eeaf19803)
   
   if(SYCL_PI_UR_OVERRIDE_FETCH_CONTENT_REPO)
     set(UNIFIED_RUNTIME_REPO "${SYCL_PI_UR_OVERRIDE_FETCH_CONTENT_REPO}")

From 7ee7e906e858c229cc256185560968754b773091 Mon Sep 17 00:00:00 2001
From: fineg74 <61437305+fineg74@users.noreply.github.com>
Date: Thu, 8 Feb 2024 11:50:46 -0800
Subject: [PATCH 20/26] [SYCL][ESIMD] Use LLVM IR for USM/SLM scatter (#12628)

---
 llvm/lib/SYCLLowerIR/ESIMD/LowerESIMD.cpp     | 38 +++++++++++++++++++
 .../ext/intel/esimd/detail/memory_intrin.hpp  | 14 +++++++
 sycl/include/sycl/ext/intel/esimd/memory.hpp  | 19 +++++++++-
 .../ESIMD/unified_memory_api/scatter_usm.cpp  |  6 +--
 .../unified_memory_api/scatter_usm_legacy.cpp | 21 ++++++++++
 .../ESIMD/unified_memory_api/slm_scatter.cpp  |  8 ++--
 .../unified_memory_api/slm_scatter_legacy.cpp | 20 ++++++++++
 sycl/test/esimd/memory_properties.cpp         | 25 +++++++++---
 8 files changed, 137 insertions(+), 14 deletions(-)
 create mode 100644 sycl/test-e2e/ESIMD/unified_memory_api/scatter_usm_legacy.cpp
 create mode 100644 sycl/test-e2e/ESIMD/unified_memory_api/slm_scatter_legacy.cpp

diff --git a/llvm/lib/SYCLLowerIR/ESIMD/LowerESIMD.cpp b/llvm/lib/SYCLLowerIR/ESIMD/LowerESIMD.cpp
index 6ec1102f402ba..15c5e7c9a625e 100644
--- a/llvm/lib/SYCLLowerIR/ESIMD/LowerESIMD.cpp
+++ b/llvm/lib/SYCLLowerIR/ESIMD/LowerESIMD.cpp
@@ -1004,6 +1004,37 @@ static void translateGatherLoad(CallInst &CI, bool IsSLM) {
   CI.replaceAllUsesWith(LI);
 }
 
+static void translateScatterStore(CallInst &CI, bool IsSLM) {
+  IRBuilder<> Builder(&CI);
+  constexpr int AlignmentTemplateArgIdx = 2;
+  APInt Val = parseTemplateArg(CI, AlignmentTemplateArgIdx,
+                               ESIMDIntrinDesc::GenXArgConversion::TO_I64);
+  Align AlignValue(Val.getZExtValue());
+
+  auto ValsOp = CI.getArgOperand(0);
+  auto OffsetsOp = CI.getArgOperand(1);
+  auto MaskOp = CI.getArgOperand(2);
+  auto DataType = ValsOp->getType();
+
+  // Convert the mask from <N x i16> to <N x i1>.
+  Value *Zero = ConstantInt::get(MaskOp->getType(), 0);
+  MaskOp = Builder.CreateICmp(ICmpInst::ICMP_NE, MaskOp, Zero);
+
+  // The address space may be SLM (3), global (1) or private.
+  // By the time the 'scatter()' operation is called, the pointer passed to it
+  // is already generic (4). Thus, simply use generic (4) for global and
+  // private and let the GPU BE deduce the actual address space from the use-def graph.
+  unsigned AS = IsSLM ? 3 : 4;
+  auto ElemType = DataType->getScalarType();
+  auto NumElems = (cast<VectorType>(DataType))->getElementCount();
+  auto VPtrType = VectorType::get(PointerType::get(ElemType, AS), NumElems);
+  auto VPtrOp = Builder.CreateIntToPtr(OffsetsOp, VPtrType);
+
+  auto SI = Builder.CreateMaskedScatter(ValsOp, VPtrOp, AlignValue, MaskOp);
+  SI->setDebugLoc(CI.getDebugLoc());
+  CI.replaceAllUsesWith(SI);
+}
+
 // TODO Specify document behavior for slm_init and nbarrier_init when:
 // 1) they are called not from kernels
 // 2) there are multiple such calls reachable from a kernel
@@ -1987,6 +2018,13 @@ size_t SYCLLowerESIMDPass::runOnFunction(Function &F,
         continue;
       }
 
+      if (Name.starts_with("__esimd_scatter_st") ||
+          Name.starts_with("__esimd_slm_scatter_st")) {
+        translateScatterStore(*CI, Name.starts_with("__esimd_slm_scatter_st"));
+        ToErase.push_back(CI);
+        continue;
+      }
+
       if (Name.starts_with("__esimd_nbarrier_init")) {
         translateNbarrierInit(*CI);
         ToErase.push_back(CI);
diff --git a/sycl/include/sycl/ext/intel/esimd/detail/memory_intrin.hpp b/sycl/include/sycl/ext/intel/esimd/detail/memory_intrin.hpp
index d723d03353cb2..c9d0d6cc4d410 100644
--- a/sycl/include/sycl/ext/intel/esimd/detail/memory_intrin.hpp
+++ b/sycl/include/sycl/ext/intel/esimd/detail/memory_intrin.hpp
@@ -213,6 +213,20 @@ __ESIMD_INTRIN __ESIMD_DNS::vector_type_t<T, N> __esimd_slm_gather_ld(
     __ESIMD_DNS::simd_mask_storage_t<N> pred,
     __ESIMD_DNS::vector_type_t<T, N> pass_thru) __ESIMD_INTRIN_END;
 
+// Scatter data to given global or private addresses.
+template <typename T, int N, size_t Align>
+__ESIMD_INTRIN void
+__esimd_scatter_st(__ESIMD_DNS::vector_type_t<T, N> vals,
+                   __ESIMD_DNS::vector_type_t<uint64_t, N> vptr,
+                   __ESIMD_DNS::simd_mask_storage_t<N> pred) __ESIMD_INTRIN_END;
+
+// Scatter data to given SLM addresses.
+template <typename T, int N, size_t Align>
+__ESIMD_INTRIN void __esimd_slm_scatter_st(
+    __ESIMD_DNS::vector_type_t<T, N> vals,
+    __ESIMD_DNS::vector_type_t<uint32_t, N> vptr,
+    __ESIMD_DNS::simd_mask_storage_t<N> pred) __ESIMD_INTRIN_END;
+
 /// Surface-based gather.
 /// Supported platforms: DG2, PVC
 ///
diff --git a/sycl/include/sycl/ext/intel/esimd/memory.hpp b/sycl/include/sycl/ext/intel/esimd/memory.hpp
index 7f4833017d38a..4ffd5c418138c 100644
--- a/sycl/include/sycl/ext/intel/esimd/memory.hpp
+++ b/sycl/include/sycl/ext/intel/esimd/memory.hpp
@@ -724,11 +724,20 @@ scatter(T *p, simd<OffsetT, N / VS> byte_offsets, simd<T, N> vals,
 
   // Use LSC lowering if L1/L2 or VS > 1.
   if constexpr (L1Hint != cache_hint::none || L2Hint != cache_hint::none ||
-                VS > 1 || !__ESIMD_DNS::isPowerOf2(N, 32)) {
+                VS > 1 ||
+                (!__ESIMD_DNS::isPowerOf2(N, 32) &&
+                 !detail::isMaskedGatherScatterLLVMAvailable())) {
     static_assert(VS == 1 || sizeof(T) >= 4,
                   "VS > 1 is supprted only for 4- and 8-byte elements");
     return detail::scatter_impl<T, VS, detail::lsc_data_size::default_size,
                                 L1Hint, L2Hint>(p, byte_offsets, vals, mask);
+  } else if constexpr (detail::isMaskedGatherScatterLLVMAvailable()) {
+    simd<uint64_t, N> Addrs(reinterpret_cast<uint64_t>(p));
+    Addrs = Addrs + convert<uint64_t>(byte_offsets);
+    using MsgT = detail::__raw_t<T>;
+    __esimd_scatter_st<MsgT, N, Alignment>(
+        sycl::bit_cast<__ESIMD_DNS::vector_type_t<MsgT, N>>(vals.data()),
+        Addrs.data(), mask.data());
   } else {
     using Tx = detail::__raw_t<T>;
     simd<uint64_t, N> byte_offsets_i = convert<uint64_t>(byte_offsets);
@@ -4227,9 +4236,15 @@ slm_scatter(simd<uint32_t, N / VS> byte_offsets, simd<T, N> vals,
                 "slm_scatter() requires at least element-size alignment");
 
   // Use LSC lowering if VS > 1.
-  if constexpr (VS > 1 || !(detail::isPowerOf2(N, 32) && sizeof(T) <= 4)) {
+  if constexpr (VS > 1 || (!(detail::isPowerOf2(N, 32) && sizeof(T) <= 4) &&
+                           !detail::isMaskedGatherScatterLLVMAvailable())) {
     __ESIMD_DNS::slm_scatter_impl<T, VS, detail::lsc_data_size::default_size>(
         byte_offsets, vals, mask);
+  } else if constexpr (detail::isMaskedGatherScatterLLVMAvailable()) {
+    using MsgT = detail::__raw_t<T>;
+    __esimd_slm_scatter_st<MsgT, N, Alignment>(
+        sycl::bit_cast<__ESIMD_DNS::vector_type_t<MsgT, N>>(vals.data()),
+        byte_offsets.data(), mask.data());
   } else {
     detail::LocalAccessorMarker acc;
     detail::scatter_impl<T, N>(acc, vals, byte_offsets, 0, mask);
diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/scatter_usm.cpp b/sycl/test-e2e/ESIMD/unified_memory_api/scatter_usm.cpp
index 929d3c6fc04f7..ff331a421ccef 100644
--- a/sycl/test-e2e/ESIMD/unified_memory_api/scatter_usm.cpp
+++ b/sycl/test-e2e/ESIMD/unified_memory_api/scatter_usm.cpp
@@ -5,13 +5,13 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===------------------------------------------------------------------===//
-// RUN: %{build} -fsycl-device-code-split=per_kernel -o %t.out
+// RUN: %{build} -fsycl-device-code-split=per_kernel -D__ESIMD_GATHER_SCATTER_LLVM_IR -o %t.out
 // RUN: %{run} %t.out
 
 // The test verifies esimd::scatter() functions accepting USM pointer
 // and optional compile-time esimd::properties.
-// The scatter() calls in this test do not use cache-hint
-// properties to not impose using DG2/PVC features.
+// The scatter() calls in this test do not use cache-hint properties
+// or VS > 1 (number of stores per offset) to not impose using PVC features.
 
 #include "Inputs/scatter.hpp"
 
diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/scatter_usm_legacy.cpp b/sycl/test-e2e/ESIMD/unified_memory_api/scatter_usm_legacy.cpp
new file mode 100644
index 0000000000000..08ac29ba6b605
--- /dev/null
+++ b/sycl/test-e2e/ESIMD/unified_memory_api/scatter_usm_legacy.cpp
@@ -0,0 +1,21 @@
+//==------- scatter_usm_legacy.cpp - DPC++ ESIMD on-device test -----------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// Use per-kernel compilation to have more information about failing cases.
+// RUN: %{build} -fsycl-device-code-split=per_kernel -o %t.out
+// RUN: %{run} %t.out
+
+// The test verifies esimd::scatter() functions accepting USM pointer
+// and optional compile-time esimd::properties.
+// The scatter() calls in this test do not use cache-hint properties
+// or VS > 1 (number of stores per offset) to not impose using PVC features.
+//
+// TODO: Remove this test when GPU driver issue with llvm.masked.scatter is
+// resolved and ESIMD starts using llvm.masked.scatter by default.
+// "-D__ESIMD_GATHER_SCATTER_LLVM_IR" is not used here.
+
+#include "scatter_usm.cpp"
diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/slm_scatter.cpp b/sycl/test-e2e/ESIMD/unified_memory_api/slm_scatter.cpp
index 96c49fcdae8af..ffa0a718e7689 100644
--- a/sycl/test-e2e/ESIMD/unified_memory_api/slm_scatter.cpp
+++ b/sycl/test-e2e/ESIMD/unified_memory_api/slm_scatter.cpp
@@ -5,12 +5,12 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===------------------------------------------------------------------===//
-// RUN: %{build} -fsycl-device-code-split=per_kernel -o %t.out
+// RUN: %{build} -fsycl-device-code-split=per_kernel -D__ESIMD_GATHER_SCATTER_LLVM_IR -o %t.out
 // RUN: %{run} %t.out
 
-// The test verifies esimd::slm_scatter() functions accepting
-// optional compile-time esimd::properties.
-// The scatter() calls in this test do not use DG2/PVC features.
+// The test verifies esimd::slm_scatter() functions accepting optional
+// compile-time esimd::properties. The slm_scatter() calls in this test do not
+// use VS > 1 (number of stores per offset) to not impose using PVC features.
 
 #include "Inputs/scatter.hpp"
 
diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/slm_scatter_legacy.cpp b/sycl/test-e2e/ESIMD/unified_memory_api/slm_scatter_legacy.cpp
new file mode 100644
index 0000000000000..0aca53311ef6b
--- /dev/null
+++ b/sycl/test-e2e/ESIMD/unified_memory_api/slm_scatter_legacy.cpp
@@ -0,0 +1,20 @@
+//==------- slm_scatter_legacy.cpp - DPC++ ESIMD on-device test -----------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// Use per-kernel compilation to have more information about failing cases.
+// RUN: %{build} -fsycl-device-code-split=per_kernel -o %t.out
+// RUN: %{run} %t.out
+
+// The test verifies esimd::slm_scatter() functions accepting optional
+// compile-time esimd::properties. The slm_scatter() calls in this test do not
+// use VS > 1 (number of stores per offset) to not impose using PVC features.
+//
+// TODO: Remove this test when GPU driver issue with llvm.masked.scatter is
+// resolved and ESIMD starts using llvm.masked.scatter by default.
+// "-D__ESIMD_GATHER_SCATTER_LLVM_IR" is not used here.
+
+#include "slm_scatter.cpp"
diff --git a/sycl/test/esimd/memory_properties.cpp b/sycl/test/esimd/memory_properties.cpp
index eb629935347db..8305bd9b83b18 100644
--- a/sycl/test/esimd/memory_properties.cpp
+++ b/sycl/test/esimd/memory_properties.cpp
@@ -1224,7 +1224,7 @@ test_gather_scatter(AccType &acc, LocalAccType &local_acc, float *ptrf,
   acc_res = gather<float, 32>(local_acc, ioffset_n32, 0);
   acc_res = gather<float, 32>(local_acc, ioffset_n32, 0, mask_n32);
 
-  // CHECK-COUNT-4: call void @llvm.genx.svm.scatter.v32i1.v32i64.v32f32(<32 x i1> {{[^)]+}}, i32 0, <32 x i64> {{[^)]+}}, <32 x float> {{[^)]+}})
+  // CHECK-COUNT-4: call void @llvm.masked.scatter.v32f32.v32p4(<32 x float> {{[^)]+}}, <32 x ptr addrspace(4)> {{[^)]+}}, i32 4, <32 x i1> {{[^)]+}})
   scatter(ptrf, ioffset_n32, usm, mask_n32);
 
   scatter(ptrf, ioffset_n32, usm);
@@ -1281,6 +1281,14 @@ test_gather_scatter(AccType &acc, LocalAccType &local_acc, float *ptrf,
   scatter<float, 32, 2>(ptrf, ioffset_n16_view, usm_view, mask_n16);
 
   scatter<float, 32, 2>(ptrf, ioffset_n16_view, usm_view);
+
+  simd<uint32_t, 10> ioffset_n10(byte_offset32, 8);
+  simd<float, 10> usm_n10;
+
+  // Special case: verify that the LLVM intrinsic is used even when N is not
+  // a power of 2.
+  // CHECK-COUNT-1: call void @llvm.masked.scatter.v10f32.v10p4(<10 x float> {{[^)]+}}, <10 x ptr addrspace(4)> {{[^)]+}}, i32 4, <10 x i1> {{[^)]+}})
+  scatter(ptrf, ioffset_n10, usm_n10);
 }
 
 // CHECK-LABEL: define {{.*}} @_Z23test_slm_gather_scatter{{.*}}
@@ -1381,26 +1389,26 @@ test_slm_gather_scatter(int byte_offset32) {
   // 3) slm_scatter(...): same as (1), (2) above, but with VS > 1.
 
   // 1) slm_scatter(offsets): offsets is simd or simd_view
-  // CHECK-COUNT-4: call void @llvm.genx.scatter.scaled.v32i1.v32i32.v32f32(<32 x i1> {{[^)]+}}, i32 2, i16 0, i32 {{[^)]+}}, i32 {{[^)]+}}, <32 x i32> {{[^)]+}}, <32 x float> {{[^)]+}})
+  // CHECK-COUNT-4: call void @llvm.masked.scatter.v32f32.v32p3(<32 x float> {{[^)]+}}, <32 x ptr addrspace(3)> {{[^)]+}}, i32 4, <32 x i1> {{[^)]+}})
   slm_scatter<float>(ioffset_n32, slm);
   slm_scatter<float, 32>(ioffset_n32_view, slm);
   slm_scatter<float, 32>(ioffset_n32, slm_view);
   slm_scatter<float, 32>(ioffset_n32_view, slm_view);
 
-  // CHECK-COUNT-4: call void @llvm.genx.scatter.scaled.v32i1.v32i32.v32f32(<32 x i1> {{[^)]+}}, i32 2, i16 0, i32 {{[^)]+}}, i32 {{[^)]+}}, <32 x i32> {{[^)]+}}, <32 x float> {{[^)]+}})
+  // CHECK-COUNT-4: call void @llvm.masked.scatter.v32f32.v32p3(<32 x float> {{[^)]+}}, <32 x ptr addrspace(3)> {{[^)]+}}, i32 8, <32 x i1> {{[^)]+}})
   slm_scatter<float>(ioffset_n32, slm, props_align8);
   slm_scatter<float, 32>(ioffset_n32_view, slm, props_align8);
   slm_scatter<float, 32>(ioffset_n32, slm_view, props_align8);
   slm_scatter<float, 32>(ioffset_n32_view, slm_view, props_align8);
 
   // 2) slm_gather(offsets, mask): offsets is simd or simd_view
-  // CHECK-COUNT-4: call void @llvm.genx.scatter.scaled.v32i1.v32i32.v32f32(<32 x i1> {{[^)]+}}, i32 2, i16 0, i32 {{[^)]+}}, i32 {{[^)]+}}, <32 x i32> {{[^)]+}}, <32 x float> {{[^)]+}})
+  // CHECK-COUNT-4: call void @llvm.masked.scatter.v32f32.v32p3(<32 x float> {{[^)]+}}, <32 x ptr addrspace(3)> {{[^)]+}}, i32 4, <32 x i1> {{[^)]+}})
   slm_scatter<float>(ioffset_n32, slm, mask_n32);
   slm_scatter<float, 32>(ioffset_n32_view, slm, mask_n32);
   slm_scatter<float, 32>(ioffset_n32, slm_view, mask_n32);
   slm_scatter<float, 32>(ioffset_n32_view, slm_view, mask_n32);
 
-  // CHECK-COUNT-4: call void @llvm.genx.scatter.scaled.v32i1.v32i32.v32f32(<32 x i1> {{[^)]+}}, i32 2, i16 0, i32 {{[^)]+}}, i32 {{[^)]+}}, <32 x i32> {{[^)]+}}, <32 x float> {{[^)]+}})
+  // CHECK-COUNT-4: call void @llvm.masked.scatter.v32f32.v32p3(<32 x float> {{[^)]+}}, <32 x ptr addrspace(3)> {{[^)]+}}, i32 8, <32 x i1> {{[^)]+}})
   slm_scatter<float>(ioffset_n32, slm, mask_n32, props_align8);
   slm_scatter<float, 32>(ioffset_n32_view, slm, mask_n32, props_align8);
   slm_scatter<float, 32>(ioffset_n32, slm_view, mask_n32, props_align8);
@@ -1429,4 +1437,11 @@ test_slm_gather_scatter(int byte_offset32) {
   slm_scatter<float, 32, 2>(ioffset_n16_view, slm, mask_n16, props_align4);
   slm_scatter<float, 32, 2>(ioffset_n16, slm_view, mask_n16, props_align4);
   slm_scatter<float, 32, 2>(ioffset_n16_view, slm_view, mask_n16, props_align4);
+
+  simd<uint32_t, 10> ioffset_n10(byte_offset32, 8);
+  simd<float, 10> usm_n10;
+  // Check the special case where N is not a power of 2 and verify that the
+  // LLVM masked scatter intrinsic is used.
+  // CHECK-COUNT-1: call void @llvm.masked.scatter.v10f32.v10p3(<10 x float> {{[^)]+}}, <10 x ptr addrspace(3)> {{[^)]+}}, i32 4, <10 x i1> {{[^)]+}})
+  slm_scatter(ioffset_n10, usm_n10);
 }

From 34051496442355315f6f096c6022dbff9b2e510c Mon Sep 17 00:00:00 2001
From: Vyacheslav Klochkov <vyacheslav.n.klochkov@intel.com>
Date: Thu, 8 Feb 2024 15:11:35 -0600
Subject: [PATCH 21/26] [ESIMD] Fix obsolete TODO comments + enable more cases
 in ballot.cpp (#12657)

Signed-off-by: Klochkov, Vyacheslav N <vyacheslav.n.klochkov@intel.com>
---
 sycl/test-e2e/ESIMD/api/ballot.cpp       | 12 +++++-------
 sycl/test-e2e/ESIMD/dpas/dpas_common.hpp |  5 +----
 2 files changed, 6 insertions(+), 11 deletions(-)

diff --git a/sycl/test-e2e/ESIMD/api/ballot.cpp b/sycl/test-e2e/ESIMD/api/ballot.cpp
index a636bcee147e0..4f87e562972db 100644
--- a/sycl/test-e2e/ESIMD/api/ballot.cpp
+++ b/sycl/test-e2e/ESIMD/api/ballot.cpp
@@ -71,15 +71,13 @@ template <class T, int N> bool test(queue &Q) {
 template <class T> bool test(queue &Q) {
   bool Pass = true;
 
-  // TODO: uncomment calls below once simd<...>.copy_from() starts supporting
-  // sizes other than 8, 16 and 32.
-  //  Pass &= test<T, 4>(Q);
+  Pass &= test<T, 4>(Q);
   Pass &= test<T, 8>(Q);
-  //  Pass &= test<T, 12>(Q);
+  Pass &= test<T, 12>(Q);
   Pass &= test<T, 16>(Q);
-  //  Pass &= test<T, 20>(Q);
-  //  Pass &= test<T, 24>(Q);
-  //  Pass &= test<T, 28>(Q);
+  Pass &= test<T, 20>(Q);
+  Pass &= test<T, 24>(Q);
+  Pass &= test<T, 28>(Q);
   Pass &= test<T, 32>(Q);
 
   return Pass;
diff --git a/sycl/test-e2e/ESIMD/dpas/dpas_common.hpp b/sycl/test-e2e/ESIMD/dpas/dpas_common.hpp
index 3d50a0915afe1..4c9e528646545 100644
--- a/sycl/test-e2e/ESIMD/dpas/dpas_common.hpp
+++ b/sycl/test-e2e/ESIMD/dpas/dpas_common.hpp
@@ -94,7 +94,6 @@ template <dpas_argument_type T> struct DpasNaturalOperandType {
   static constexpr bool is_bf16 = T == dpas_argument_type::bf16;
   static constexpr bool is_tf32 = T == dpas_argument_type::tf32;
 
-  // TODO: support tf32 here.
   using type = std::conditional_t<
       is_sint, signed char,
       std::conditional_t<
@@ -149,7 +148,7 @@ void writeToHorizontallyPackedMatrix(void *VVec, int Row, int Col,
   ElemT *Vec = reinterpret_cast<ElemT *>(VVec);
 
   // 1. Find and read the target 'unsigned int' element.
-  // THe unpacked matrix has dimensions: NumRows*NumCols
+  // The unpacked matrix dimensions are NumRows*NumCols.
   constexpr int ElemsInElemT = sizeof(ElemT) * 8 / ElemBitSize;
   int UnpackedLinearIndex = Row * NumCols + Col;
   int PackedLinearIndex = UnpackedLinearIndex / ElemsInElemT;
@@ -160,7 +159,6 @@ void writeToHorizontallyPackedMatrix(void *VVec, int Row, int Col,
   } else {
     ElemT TargetElem = Vec[PackedLinearIndex];
     // TargetElem has 2 or more elements in it. Need to extract one.
-    // TODO: for now assume that is the case only for 2 or 4-bit integers.
     assert((ElemBitSize == 2 || ElemBitSize == 4) && "Unexpected element type");
 
     unsigned int Offset = (UnpackedLinearIndex % ElemsInElemT) * ElemBitSize;
@@ -196,7 +194,6 @@ ReadT readFromHorizontallyPackedMatrix(void *VVec, int Row, int Col) {
     return static_cast<ReadT>(TargetElem);
   } else {
     // TargetElem has 2 or more elements in it. Need to extract one.
-    // TODO: for now assume that is the case only for 2 or 4-bit integers.
     assert((ElemBitSize == 2 || ElemBitSize == 4) && "Unexpected element type");
     unsigned int Offset = (UnpackedLinearIndex % ElemsInElemT) * ElemBitSize;
     unsigned int Mask = (static_cast<uint64_t>(1) << ElemBitSize) - 1;

From dbd65abb45fea9ca0fd80feba56085141acf0944 Mon Sep 17 00:00:00 2001
From: aelovikov-intel <andrei.elovikov@intel.com>
Date: Thu, 8 Feb 2024 14:30:32 -0800
Subject: [PATCH 22/26] [NFC][SYCL] Rename builtin_enabler_* for bitselect
 (#12664)

---
 .../detail/builtins/relational_functions.inc    | 17 +++++++----------
 1 file changed, 7 insertions(+), 10 deletions(-)

diff --git a/sycl/include/sycl/detail/builtins/relational_functions.inc b/sycl/include/sycl/detail/builtins/relational_functions.inc
index 0a38d79d20589..fb0b3f7682b0a 100644
--- a/sycl/include/sycl/detail/builtins/relational_functions.inc
+++ b/sycl/include/sycl/detail/builtins/relational_functions.inc
@@ -14,7 +14,7 @@ namespace sycl {
 inline namespace _V1 {
 namespace detail {
 template <typename T>
-struct any_elem_type
+struct bitselect_elem_type
     : std::bool_constant<check_type_in_v<
           get_elem_type_t<T>, float, double, half, char, signed char, short,
           int, long, long long, unsigned char, unsigned short, unsigned int,
@@ -29,12 +29,8 @@ struct rel_ret_traits
 };
 } // namespace detail
 
-BUILTIN_CREATE_ENABLER(builtin_enable_generic, default_ret_type, any_elem_type,
-                       any_shape, same_elem_type)
-BUILTIN_CREATE_ENABLER(builtin_enable_generic_scalar, default_ret_type,
-                       any_elem_type, scalar_only, same_elem_type)
-BUILTIN_CREATE_ENABLER(builtin_enable_generic_non_scalar, default_ret_type,
-                       any_elem_type, non_scalar_only, same_elem_type)
+BUILTIN_CREATE_ENABLER(builtin_enable_bitselect, default_ret_type,
+                       bitselect_elem_type, any_shape, same_elem_type)
 BUILTIN_CREATE_ENABLER(builtin_enable_rel, rel_ret_traits, fp_elem_type,
                        non_scalar_only, same_elem_type)
 
@@ -133,13 +129,14 @@ BUILTIN_REL(ONE_ARG, signbit, __spirv_SignBitSet)
 
 #ifdef __SYCL_DEVICE_ONLY__
 DEVICE_IMPL_TEMPLATE(
-    THREE_ARGS, bitselect, builtin_enable_generic_t, [](auto... xs) {
-      using ret_ty = detail::builtin_enable_generic_t<THREE_ARGS_TEMPLATE_TYPE>;
+    THREE_ARGS, bitselect, builtin_enable_bitselect_t, [](auto... xs) {
+      using ret_ty =
+          detail::builtin_enable_bitselect_t<THREE_ARGS_TEMPLATE_TYPE>;
       using detail::builtins::convert_result;
       return convert_result<ret_ty>(__spirv_ocl_bitselect(xs...));
     })
 #else
-HOST_IMPL_TEMPLATE(THREE_ARGS, bitselect, builtin_enable_generic_t, rel,
+HOST_IMPL_TEMPLATE(THREE_ARGS, bitselect, builtin_enable_bitselect_t, rel,
                    default_ret_type)
 #endif
 

From d69a4c2ac50eddee27425a0a07b78bc84e194dfd Mon Sep 17 00:00:00 2001
From: ldrumm <ldrumm@rtps.co>
Date: Thu, 8 Feb 2024 23:21:16 +0000
Subject: [PATCH 23/26] Revert "[ESIMD][E2E] Enable few cases in atomic_update
 after gpu driver update" (#12669)

Reverts intel/llvm#12639

This regressed atomic_update_acc_dg2_pvc_cmpxchg.cpp
---
 .../Inputs/atomic_update.hpp                  | 16 ++++++++++----
 .../Inputs/atomic_update_slm.hpp              | 22 ++++++++++++++++---
 2 files changed, 31 insertions(+), 7 deletions(-)

diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/atomic_update.hpp b/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/atomic_update.hpp
index ef8646ae4892a..3da82ee8e15d1 100644
--- a/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/atomic_update.hpp
+++ b/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/atomic_update.hpp
@@ -715,10 +715,13 @@ bool test_int_types_and_sizes(queue q, const Config &cfg) {
         q, cfg);
     passed &= test_int_types<64, Op, UseMask, UseLSCFeatures, UseAcc, SignMask>(
         q, cfg);
-    passed &= test_int_types<12, Op, UseMask, UseLSCFeatures, UseAcc, SignMask>(
-        q, cfg);
-    passed &= test_int_types<33, Op, UseMask, UseLSCFeatures, UseAcc, SignMask>(
-        q, cfg);
+    // non power of two values are supported only in newer driver.
+    // TODO: Enable this when the new driver reaches test infrastructure
+    // (v27556).
+#if 0
+    passed &= test_int_types<12, Op, UseMask, UseLSCFeatures, UseAcc, SignMask>(q, cfg);
+    passed &= test_int_types<33, Op, UseMask, UseLSCFeatures, UseAcc, SignMask>(q, cfg);
+#endif
   }
 
   return passed;
@@ -739,8 +742,13 @@ bool test_fp_types_and_sizes(queue q, const Config &cfg) {
     passed &= test_fp_types<32, Op, UseMask, UseLSCFeatures, UseAcc>(q, cfg);
     passed &= test_fp_types<64, Op, UseMask, UseLSCFeatures, UseAcc>(q, cfg);
 
+    // non power of two values are supported only in newer driver.
+    // TODO: Enable this when the new driver reaches test infrastructure
+    // (v27556).
+#if 0
     passed &= test_fp_types<12, Op, UseMask, UseLSCFeatures, UseAcc>(q, cfg);
     passed &= test_fp_types<35, Op, UseMask, UseLSCFeatures, UseAcc>(q, cfg);
+#endif
   }
   return passed;
 }
diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/atomic_update_slm.hpp b/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/atomic_update_slm.hpp
index 6b69d99158cb6..92f000711e137 100644
--- a/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/atomic_update_slm.hpp
+++ b/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/atomic_update_slm.hpp
@@ -612,9 +612,14 @@ bool test_fp_types(queue q) {
 
   if constexpr (Features == TestFeatures::DG2 ||
                 Features == TestFeatures::PVC) {
-    if (q.get_device().has(sycl::aspect::atomic64) &&
-        q.get_device().has(sycl::aspect::fp64)) {
-      passed &= run_test<UseAcc, double, N, Op, UseMask>(q);
+    // TODO: fmin/fmax/fcmpxchg for double requires a newer GPU driver.
+    if constexpr (!std::is_same_v<Op<double, N>, ImplLSCFmax<double, N>> &&
+                  !std::is_same_v<Op<double, N>, ImplLSCFmin<double, N>> &&
+                  !std::is_same_v<Op<double, N>, ImplLSCFcmpwr<double, N>>) {
+      if (q.get_device().has(sycl::aspect::atomic64) &&
+          q.get_device().has(sycl::aspect::fp64)) {
+        passed &= run_test<UseAcc, double, N, Op, UseMask>(q);
+      }
     }
   }
   return passed;
@@ -628,6 +633,7 @@ bool test_int_types_and_sizes(queue q) {
   passed &= test_int_types<2, Op, UseMask, Features, UseAcc, SignMask>(q);
   passed &= test_int_types<4, Op, UseMask, Features, UseAcc, SignMask>(q);
   passed &= test_int_types<8, Op, UseMask, Features, UseAcc, SignMask>(q);
+  // TODO: N=16 and N=32 do not pass on Gen12 with mask due to older driver.
   if (UseMask && Features == TestFeatures::Generic &&
       esimd_test::isGPUDriverGE(q, esimd_test::GPUDriverOS::LinuxAndWindows,
                                 "26918", "101.4953", false)) {
@@ -639,8 +645,13 @@ bool test_int_types_and_sizes(queue q) {
   if constexpr (Features == TestFeatures::DG2 ||
                 Features == TestFeatures::PVC) {
     passed &= test_int_types<64, Op, UseMask, Features, UseAcc, SignMask>(q);
+    // non power of two values are supported only in newer driver.
+    // TODO: Enable this when the new driver reaches test infrastructure
+    // (v27556).
+#if 0
     passed &= test_int_types<12, Op, UseMask, Features, UseAcc, SignMask>(q);
     passed &= test_int_types<33, Op, UseMask, Features, UseAcc, SignMask>(q);
+#endif
   }
 
   return passed;
@@ -661,8 +672,13 @@ bool test_fp_types_and_sizes(queue q) {
   if constexpr (Features == TestFeatures::DG2 ||
                 Features == TestFeatures::PVC) {
     passed &= test_fp_types<64, Op, UseMask, Features, UseAcc>(q);
+    // non power of two values are supported only in newer driver.
+    // TODO: Enable this when the new driver reaches test infrastructure
+    // (v27556).
+#if 0
     passed &= test_fp_types<33, Op, UseMask, Features, UseAcc>(q);
     passed &= test_fp_types<65, Op, UseMask, Features, UseAcc>(q);
+#endif
   }
   return passed;
 }

From db04b1ee8e955d342e07ad2cc2a02dbd4317be93 Mon Sep 17 00:00:00 2001
From: aelovikov-intel <andrei.elovikov@intel.com>
Date: Thu, 8 Feb 2024 16:01:15 -0800
Subject: [PATCH 24/26] [NFC][SYCL] Move builtin tests
 sycl/test/basic_tests/{*builtins*.cpp,builtins/} (#12667)

While at it, fix formatting and modernize include to SYCL 2020
(`<sycl/sycl.hpp>`).
---
 .../builtins_implicitly_convertible_args.cpp       |  0
 .../{ => builtins}/builtins_templates.cpp          |  0
 .../{ => builtins}/relational_builtins.cpp         | 14 +++++++-------
 3 files changed, 7 insertions(+), 7 deletions(-)
 rename sycl/test/basic_tests/{ => builtins}/builtins_implicitly_convertible_args.cpp (100%)
 rename sycl/test/basic_tests/{ => builtins}/builtins_templates.cpp (100%)
 rename sycl/test/basic_tests/{ => builtins}/relational_builtins.cpp (96%)

diff --git a/sycl/test/basic_tests/builtins_implicitly_convertible_args.cpp b/sycl/test/basic_tests/builtins/builtins_implicitly_convertible_args.cpp
similarity index 100%
rename from sycl/test/basic_tests/builtins_implicitly_convertible_args.cpp
rename to sycl/test/basic_tests/builtins/builtins_implicitly_convertible_args.cpp
diff --git a/sycl/test/basic_tests/builtins_templates.cpp b/sycl/test/basic_tests/builtins/builtins_templates.cpp
similarity index 100%
rename from sycl/test/basic_tests/builtins_templates.cpp
rename to sycl/test/basic_tests/builtins/builtins_templates.cpp
diff --git a/sycl/test/basic_tests/relational_builtins.cpp b/sycl/test/basic_tests/builtins/relational_builtins.cpp
similarity index 96%
rename from sycl/test/basic_tests/relational_builtins.cpp
rename to sycl/test/basic_tests/builtins/relational_builtins.cpp
index ab9f3aa5c4326..eabd7e26c35cb 100644
--- a/sycl/test/basic_tests/relational_builtins.cpp
+++ b/sycl/test/basic_tests/builtins/relational_builtins.cpp
@@ -4,7 +4,7 @@
 // NOTE: Compile the test fully to ensure the library exports the right host
 // symbols.
 
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 
 // Some helper macros to verify return type of the builtins. To be used like
 // this
@@ -19,15 +19,15 @@ template <class... Args> struct CheckHelper {
   template <class F> static auto call(F f) { return f(Args()...); }
 };
 
-#define CHECK(EXPECTED, FUNC, ...)                            \
+#define CHECK(EXPECTED, FUNC, ...)                                             \
   {                                                                            \
     auto ret = CheckHelper<__VA_ARGS__>::call(                                 \
-        [](auto... args) { return cl::sycl::FUNC(args...); });                 \
-    static_assert(std::is_same_v<decltype(ret), EXPECTED>);                \
+        [](auto... args) { return sycl::FUNC(args...); });                     \
+    static_assert(std::is_same_v<decltype(ret), EXPECTED>);                    \
   }
 
 void foo() {
-  using namespace cl::sycl;
+  using namespace sycl;
   using boolm = marray<bool, 2>;
 
   using int16v = vec<int16_t, 2>;
@@ -247,9 +247,9 @@ void foo() {
 }
 
 int main() {
-  cl::sycl::queue q;
+  sycl::queue q;
   foo(); // Verify host.
-  q.submit([&](cl::sycl::handler &cgh) {
+  q.submit([&](sycl::handler &cgh) {
     cgh.single_task<class test>([]() {
       foo(); // verify device
     });

From 6098a75b3092da2a9efc34b169b09e76e0336b4c Mon Sep 17 00:00:00 2001
From: Udit Agarwal <16324601+uditagarwal97@users.noreply.github.com>
Date: Thu, 8 Feb 2024 23:40:51 -0800
Subject: [PATCH 25/26] [SYCL][E2E] Fix the test script after changing sycl-ls
 output (#12672)

In PR https://github.com/intel/llvm/pull/12596, we changed the output of
sycl-ls to use 'fpga' instead of 'acc'. This caused several tests to
fail when an FPGA is available on the system. This PR fixes that.

In retrospect, we should have enabled testing on FPGA in pre-commit
testing as this bug could have been caught earlier.
---
 sycl/test-e2e/lit.cfg.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sycl/test-e2e/lit.cfg.py b/sycl/test-e2e/lit.cfg.py
index a25f775edc9e1..ef44dc972797b 100644
--- a/sycl/test-e2e/lit.cfg.py
+++ b/sycl/test-e2e/lit.cfg.py
@@ -669,7 +669,7 @@
     features.update(sg_size_features)
 
     be, dev = sycl_device.split(":")
-    features.add(dev.replace("acc", "accelerator"))
+    features.add(dev.replace("fpga", "accelerator"))
     # Use short names for LIT rules.
     features.add(be)
 

From 1f37b5ed804123d9068d78a739f8451584fdafca Mon Sep 17 00:00:00 2001
From: aelovikov-intel <andrei.elovikov@intel.com>
Date: Fri, 9 Feb 2024 02:33:50 -0800
Subject: [PATCH 26/26] [SYCL] Fix SFINAE rules for integer builtins/bitselect
 (#12671)

For vectors/swizzles of integer types, only fixed-width element types are
allowed per SYCL 2020 revision 8. Update the implementation to match
that.
---
 sycl/include/sycl/builtins_preview.hpp        |   9 +-
 sycl/include/sycl/builtins_utils_scalar.hpp   |  11 ++
 .../sycl/detail/builtins/helper_macros.hpp    |   8 ++
 .../detail/builtins/integer_functions.inc     |   7 +-
 .../detail/builtins/relational_functions.inc  |  10 +-
 sycl/source/builtins/host_helper_macros.hpp   |   7 +
 sycl/source/builtins/integer_functions.cpp    |   3 +-
 sycl/source/builtins/relational_functions.cpp |   3 +-
 .../builtins/builtin_unit_tests.cpp           | 133 ++++++++++++++++++
 9 files changed, 181 insertions(+), 10 deletions(-)
 create mode 100644 sycl/test/basic_tests/builtins/builtin_unit_tests.cpp

diff --git a/sycl/include/sycl/builtins_preview.hpp b/sycl/include/sycl/builtins_preview.hpp
index dfc44e5848f91..91fd50d2ec4f2 100644
--- a/sycl/include/sycl/builtins_preview.hpp
+++ b/sycl/include/sycl/builtins_preview.hpp
@@ -137,7 +137,14 @@ auto builtin_marray_impl(FuncTy F, const Ts &...x) {
   marray<ret_elem_type, T::size()> Res;
   constexpr auto N = T::size();
   for (size_t I = 0; I < N / 2; ++I) {
-    auto PartialRes = F(to_vec2(x, I * 2)...);
+    auto PartialRes = [&]() {
+      using elem_ty = get_elem_type_t<T>;
+      if constexpr (std::is_integral_v<elem_ty>)
+        return F(to_vec2(x, I * 2)
+                     .template as<vec<get_fixed_sized_int_t<elem_ty>, 2>>()...);
+      else
+        return F(to_vec2(x, I * 2)...);
+    }();
     std::memcpy(&Res[I * 2], &PartialRes, sizeof(decltype(PartialRes)));
   }
   if (N % 2)
diff --git a/sycl/include/sycl/builtins_utils_scalar.hpp b/sycl/include/sycl/builtins_utils_scalar.hpp
index 62d86df045cdf..31f267892243b 100644
--- a/sycl/include/sycl/builtins_utils_scalar.hpp
+++ b/sycl/include/sycl/builtins_utils_scalar.hpp
@@ -128,6 +128,17 @@ template <size_t Size> struct get_unsigned_int_by_size {
 template <typename T> struct same_size_unsigned_int {
   using type = typename get_unsigned_int_by_size<sizeof(T)>::type;
 };
+template <typename T>
+using same_size_unsigned_int_t = typename same_size_unsigned_int<T>::type;
+
+template <typename T> struct get_fixed_sized_int {
+  static_assert(std::is_integral_v<T>);
+  using type =
+      std::conditional_t<std::is_signed_v<T>, same_size_signed_int_t<T>,
+                         same_size_unsigned_int_t<T>>;
+};
+template <typename T>
+using get_fixed_sized_int_t = typename get_fixed_sized_int<T>::type;
 
 // Utility trait for getting an upsampled integer type.
 // NOTE: For upsampling we look for an integer of double the size of the
diff --git a/sycl/include/sycl/detail/builtins/helper_macros.hpp b/sycl/include/sycl/detail/builtins/helper_macros.hpp
index 49d4af8981d70..38014c4b62c8f 100644
--- a/sycl/include/sycl/detail/builtins/helper_macros.hpp
+++ b/sycl/include/sycl/detail/builtins/helper_macros.hpp
@@ -48,6 +48,11 @@
   FOR_EACH4_A6(BASE_CASE, FIXED1, FIXED2, FIXED3, FIXED4, ARG1, ARG2, ARG3,    \
                ARG4, ARG5, ARG6)                                               \
   BASE_CASE(FIXED1, FIXED2, FIXED3, FIXED4, ARG7)
+#define FOR_EACH4_A8(BASE_CASE, FIXED1, FIXED2, FIXED3, FIXED4, ARG1, ARG2,    \
+                     ARG3, ARG4, ARG5, ARG6, ARG7, ARG8)                       \
+  FOR_EACH4_A7(BASE_CASE, FIXED1, FIXED2, FIXED3, FIXED4, ARG1, ARG2, ARG3,    \
+               ARG4, ARG5, ARG6, ARG7)                                         \
+  BASE_CASE(FIXED1, FIXED2, FIXED3, FIXED4, ARG8)
 #define FOR_EACH4_A11(BASE_CASE, FIXED1, FIXED2, FIXED3, FIXED4, ARG1, ARG2,   \
                       ARG3, ARG4, ARG5, ARG6, ARG7, ARG8, ARG9, ARG10, ARG11)  \
   FOR_EACH4_A7(BASE_CASE, FIXED1, FIXED2, FIXED3, FIXED4, ARG1, ARG2, ARG3,    \
@@ -169,6 +174,9 @@
   unsigned char, unsigned short, unsigned int, unsigned long, unsigned long long
 // 11 types
 #define INTEGER_TYPES SIGNED_TYPES, UNSIGNED_TYPES
+// 8 types
+#define FIXED_WIDTH_INTEGER_TYPES                                              \
+  int8_t, int16_t, int32_t, int64_t, uint8_t, uint16_t, uint32_t, uint64_t
 
 #define DEVICE_IMPL_TEMPLATE_CUSTOM_DELEGATE(                                  \
     NUM_ARGS, NAME, ENABLER, DELEGATOR, NS, /*SCALAR_VEC_IMPL*/...)            \
diff --git a/sycl/include/sycl/detail/builtins/integer_functions.inc b/sycl/include/sycl/detail/builtins/integer_functions.inc
index dab0cee8a647a..44699765ff7fd 100644
--- a/sycl/include/sycl/detail/builtins/integer_functions.inc
+++ b/sycl/include/sycl/detail/builtins/integer_functions.inc
@@ -16,9 +16,10 @@ namespace detail {
 template <typename T>
 struct integer_elem_type
     : std::bool_constant<
-          check_type_in_v<get_elem_type_t<T>, char, signed char, short, int,
-                          long, long long, unsigned char, unsigned short,
-                          unsigned int, unsigned long, unsigned long long>> {};
+          (is_vec_or_swizzle_v<T> &&
+           check_type_in_v<get_elem_type_t<T>, FIXED_WIDTH_INTEGER_TYPES>) ||
+          (!is_vec_or_swizzle_v<T> &&
+           check_type_in_v<get_elem_type_t<T>, INTEGER_TYPES>)> {};
 template <typename T>
 struct suint32_elem_type
     : std::bool_constant<
diff --git a/sycl/include/sycl/detail/builtins/relational_functions.inc b/sycl/include/sycl/detail/builtins/relational_functions.inc
index fb0b3f7682b0a..d63a7716aa9b7 100644
--- a/sycl/include/sycl/detail/builtins/relational_functions.inc
+++ b/sycl/include/sycl/detail/builtins/relational_functions.inc
@@ -15,10 +15,12 @@ inline namespace _V1 {
 namespace detail {
 template <typename T>
 struct bitselect_elem_type
-    : std::bool_constant<check_type_in_v<
-          get_elem_type_t<T>, float, double, half, char, signed char, short,
-          int, long, long long, unsigned char, unsigned short, unsigned int,
-          unsigned long, unsigned long long>> {};
+    : std::bool_constant<
+          check_type_in_v<get_elem_type_t<T>, FP_TYPES> ||
+          (is_vec_or_swizzle_v<T> &&
+           check_type_in_v<get_elem_type_t<T>, FIXED_WIDTH_INTEGER_TYPES>) ||
+          (!is_vec_or_swizzle_v<T> &&
+           check_type_in_v<get_elem_type_t<T>, INTEGER_TYPES>)> {};
 
 template <typename T>
 struct rel_ret_traits
diff --git a/sycl/source/builtins/host_helper_macros.hpp b/sycl/source/builtins/host_helper_macros.hpp
index 484b0bc95fb8b..41aac2148db71 100644
--- a/sycl/source/builtins/host_helper_macros.hpp
+++ b/sycl/source/builtins/host_helper_macros.hpp
@@ -56,6 +56,9 @@
 #define EXPORT_VEC(NUM_ARGS, NAME, TYPE, VL)                                   \
   EXPORT_VEC_NS(NUM_ARGS, NAME, sycl, TYPE, VL)
 
+#define EXPORT_VEC_1_16_IMPL(NUM_ARGS, NAME, NS, TYPE)                         \
+  FOR_VEC_1_16(EXPORT_VEC_NS, NUM_ARGS, NAME, NS, TYPE)
+
 #define EXPORT_SCALAR_AND_VEC_1_16_IMPL(NUM_ARGS, NAME, NS, TYPE)              \
   EXPORT_SCALAR_NS(NUM_ARGS, NAME, NS, TYPE)                                   \
   FOR_VEC_1_16(EXPORT_VEC_NS, NUM_ARGS, NAME, NS, TYPE)
@@ -69,8 +72,12 @@
 
 #define EXPORT_SCALAR_AND_VEC_1_16_NS(NUM_ARGS, NAME, NS, ...)                 \
   FOR_EACH3(EXPORT_SCALAR_AND_VEC_1_16_IMPL, NUM_ARGS, NAME, NS, __VA_ARGS__)
+#define EXPORT_VEC_1_16_NS(NUM_ARGS, NAME, NS, ...)                            \
+  FOR_EACH3(EXPORT_VEC_1_16_IMPL, NUM_ARGS, NAME, NS, __VA_ARGS__)
 #define EXPORT_SCALAR_AND_VEC_1_16(NUM_ARGS, NAME, ...)                        \
   EXPORT_SCALAR_AND_VEC_1_16_NS(NUM_ARGS, NAME, sycl, __VA_ARGS__)
+#define EXPORT_VEC_1_16(NUM_ARGS, NAME, ...)                                   \
+  EXPORT_VEC_1_16_NS(NUM_ARGS, NAME, sycl, __VA_ARGS__)
 
 #define EXPORT_SCALAR_AND_VEC_2_4(NUM_ARGS, NAME, ...)                         \
   FOR_EACH2(EXPORT_SCALAR_AND_VEC_2_4_IMPL, NUM_ARGS, NAME, __VA_ARGS__)
diff --git a/sycl/source/builtins/integer_functions.cpp b/sycl/source/builtins/integer_functions.cpp
index 381d6f1fa0a10..cd92b2180df73 100644
--- a/sycl/source/builtins/integer_functions.cpp
+++ b/sycl/source/builtins/integer_functions.cpp
@@ -76,7 +76,8 @@ namespace sycl {
 inline namespace _V1 {
 #define BUILTIN_GENINT(NUM_ARGS, NAME, IMPL)                                   \
   HOST_IMPL(NAME, IMPL)                                                        \
-  EXPORT_SCALAR_AND_VEC_1_16(NUM_ARGS, NAME, INTEGER_TYPES)
+  FOR_EACH2(EXPORT_SCALAR, NUM_ARGS, NAME, INTEGER_TYPES)                      \
+  EXPORT_VEC_1_16(NUM_ARGS, NAME, FIXED_WIDTH_INTEGER_TYPES)
 #define BUILTIN_GENINT_SU(NUM_ARGS, NAME, IMPL)                                \
   BUILTIN_GENINT(NUM_ARGS, NAME, IMPL)
 
diff --git a/sycl/source/builtins/relational_functions.cpp b/sycl/source/builtins/relational_functions.cpp
index b54c55e283e5e..b8b7795f6fb79 100644
--- a/sycl/source/builtins/relational_functions.cpp
+++ b/sycl/source/builtins/relational_functions.cpp
@@ -103,6 +103,7 @@ HOST_IMPL(bitselect, [](auto x, auto y, auto z) {
   assert((ures & std::numeric_limits<utype>::max()) == ures);
   return bit_cast<T0>(static_cast<utype>(ures));
 })
-EXPORT_SCALAR_AND_VEC_1_16(THREE_ARGS, bitselect, INTEGER_TYPES, FP_TYPES)
+FOR_EACH2(EXPORT_SCALAR, THREE_ARGS, bitselect, INTEGER_TYPES, FP_TYPES)
+EXPORT_VEC_1_16(THREE_ARGS, bitselect, FIXED_WIDTH_INTEGER_TYPES, FP_TYPES)
 } // namespace _V1
 } // namespace sycl
diff --git a/sycl/test/basic_tests/builtins/builtin_unit_tests.cpp b/sycl/test/basic_tests/builtins/builtin_unit_tests.cpp
new file mode 100644
index 0000000000000..d241a90568fc3
--- /dev/null
+++ b/sycl/test/basic_tests/builtins/builtin_unit_tests.cpp
@@ -0,0 +1,133 @@
+// RUN: %clangxx -fsycl -fpreview-breaking-changes -fsyntax-only %s -Xclang -verify
+// REQUIRES: preview-breaking-changes-supported
+
+#include <sycl/sycl.hpp>
+
+using namespace sycl;
+using namespace sycl::detail;
+
+namespace builtin_same_shape_v_tests {
+using swizzle1 = decltype(std::declval<vec<float, 2>>().swizzle<0>());
+using swizzle2 = decltype(std::declval<vec<float, 2>>().swizzle<0, 0>());
+using swizzle3 = decltype(std::declval<vec<float, 2>>().swizzle<0, 0, 1>());
+// Must hold: a single type, or operands of one shape (any element types).
+static_assert(builtin_same_shape_v<float>);
+static_assert(builtin_same_shape_v<int, float>);
+static_assert(builtin_same_shape_v<marray<int, 2>>);
+static_assert(builtin_same_shape_v<marray<int, 2>, marray<float, 2>>);
+static_assert(builtin_same_shape_v<vec<int, 2>>);
+static_assert(builtin_same_shape_v<vec<int, 2>, vec<float, 2>>);
+static_assert(builtin_same_shape_v<vec<int, 2>, swizzle2>); // vec + swizzle
+// Must fail: scalar/marray/vec-or-swizzle mixes, or unequal swizzle sizes.
+static_assert(!builtin_same_shape_v<float, marray<float, 1>>);
+static_assert(!builtin_same_shape_v<float, vec<float, 1>>);
+static_assert(!builtin_same_shape_v<marray<float, 1>, vec<float, 1>>);
+static_assert(!builtin_same_shape_v<float, swizzle1>);
+static_assert(!builtin_same_shape_v<marray<float, 1>, swizzle1>);
+static_assert(!builtin_same_shape_v<swizzle2, swizzle1>);
+} // namespace builtin_same_shape_v_tests
+
+namespace builtin_marray_impl_tests {
+// Integer functions/relational bitselect only accept fixed-width integer
+// element types for vector/swizzle elements. Make sure that our marray->vec
+// delegator can handle that.
+// foo() mirrors that overload set: char scalars, int8_t/uint8_t vectors.
+auto foo(char x) { return x; }
+auto foo(signed char x) { return x; }
+auto foo(unsigned char x) { return x; }
+auto foo(vec<int8_t, 2> x) { return x; }
+auto foo(vec<uint8_t, 2> x) { return x; }
+// Run the delegator on marrays of every char flavor; it only has to compile.
+auto test() {
+  marray<char, 2> x;
+  marray<signed char, 2> y;
+  marray<unsigned char, 2> z;
+  auto TestOne = [](auto x) {
+    std::ignore = builtin_marray_impl([](auto x) { return foo(x); }, x);
+  };
+  TestOne(x);
+  TestOne(y);
+  TestOne(z);
+}
+} // namespace builtin_marray_impl_tests
+
+namespace builtin_enable_integer_tests {
+using swizzle1 = decltype(std::declval<vec<int8_t, 2>>().swizzle<0>());
+using swizzle2 = decltype(std::declval<vec<int8_t, 2>>().swizzle<0, 0>());
+template <typename... Ts> void ignore() {}
+// ignore<Ts...>() does nothing; naming Ts is what instantiates the aliases.
+void test() {
+  // clang-format off
+  ignore<builtin_enable_integer_t<char>,
+         builtin_enable_integer_t<signed char>,
+         builtin_enable_integer_t<unsigned char>>();
+  // clang-format on
+  // Vectors over the fixed-width 8-bit element types are accepted.
+  ignore<builtin_enable_integer_t<vec<int8_t, 2>>,
+         builtin_enable_integer_t<vec<uint8_t, 2>>>();
+  // So are multiple arguments (scalar, vec, swizzle) sharing one element type.
+  ignore<builtin_enable_integer_t<char, char>>();
+  ignore<builtin_enable_integer_t<vec<int8_t, 2>, vec<int8_t, 2>>>();
+  ignore<builtin_enable_integer_t<vec<int8_t, 2>, swizzle2>>();
+  ignore<builtin_enable_integer_t<swizzle2, swizzle2>>();
+
+  {
+    // Only one of char/signed char maps onto int8_t. The other type isn't a
+    // valid vector element type for integer builtins.
+    // NOTE(review): the assert below fails where plain char is unsigned.
+    static_assert(std::is_signed_v<char>);
+
+    // clang-format off
+    // expected-error-re@*:* {{no type named 'type' in 'sycl::detail::builtin_enable<sycl::detail::default_ret_type, sycl::detail::integer_elem_type, sycl::detail::any_shape, sycl::detail::same_elem_type, sycl::vec<{{.*}}, 2>>'}}
+    // expected-note@+1 {{in instantiation of template type alias 'builtin_enable_integer_t' requested here}}
+    ignore<builtin_enable_integer_t<vec<signed char, 2>>, builtin_enable_integer_t<vec<char, 2>>>();
+    // clang-format on
+  }
+  // Arguments whose element types differ (char vs. signed char) are rejected.
+  // expected-error@*:* {{no type named 'type' in 'sycl::detail::builtin_enable<sycl::detail::default_ret_type, sycl::detail::integer_elem_type, sycl::detail::any_shape, sycl::detail::same_elem_type, char, signed char>'}}
+  // expected-note@+1 {{in instantiation of template type alias 'builtin_enable_integer_t' requested here}}
+  ignore<builtin_enable_integer_t<char, signed char>>();
+}
+} // namespace builtin_enable_integer_tests
+
+namespace builtin_enable_bitselect_tests {
+// Essentially the same as builtin_enable_integer_t + FP types support.
+using swizzle1 = decltype(std::declval<vec<int8_t, 2>>().swizzle<0>());
+using swizzle2 = decltype(std::declval<vec<int8_t, 2>>().swizzle<0, 0>());
+template <typename... Ts> void ignore() {}
+// ignore<Ts...>() does nothing; naming Ts is what instantiates the aliases.
+void test() {
+  // clang-format off
+  ignore<builtin_enable_bitselect_t<char>,
+         builtin_enable_bitselect_t<signed char>,
+         builtin_enable_bitselect_t<unsigned char>,
+         builtin_enable_bitselect_t<float>>();
+  // clang-format on
+  // Vectors over fixed-width 8-bit integers and over float are accepted.
+  ignore<builtin_enable_bitselect_t<vec<int8_t, 2>>,
+         builtin_enable_bitselect_t<vec<uint8_t, 2>>,
+         builtin_enable_bitselect_t<vec<float, 2>>>();
+  // So are multiple arguments (scalar, vec, swizzle) sharing one element type.
+  ignore<builtin_enable_bitselect_t<char, char>>();
+  ignore<builtin_enable_bitselect_t<vec<int8_t, 2>, vec<int8_t, 2>>>();
+  ignore<builtin_enable_bitselect_t<vec<int8_t, 2>, swizzle2>>();
+  ignore<builtin_enable_bitselect_t<swizzle2, swizzle2>>();
+
+  {
+    // Only one of char/signed char maps onto int8_t. The other type isn't a
+    // valid vector element type for bitselect.
+    // NOTE(review): the assert below fails where plain char is unsigned.
+    static_assert(std::is_signed_v<char>);
+
+    // clang-format off
+    // expected-error-re@*:* {{no type named 'type' in 'sycl::detail::builtin_enable<sycl::detail::default_ret_type, sycl::detail::bitselect_elem_type, sycl::detail::any_shape, sycl::detail::same_elem_type, sycl::vec<{{.*}}, 2>>'}}
+    // expected-note@+1 {{in instantiation of template type alias 'builtin_enable_bitselect_t' requested here}}
+    ignore<builtin_enable_bitselect_t<vec<signed char, 2>>, builtin_enable_bitselect_t<vec<char, 2>>>();
+    // clang-format on
+  }
+  // Arguments whose element types differ (char vs. signed char) are rejected.
+  // expected-error@*:* {{no type named 'type' in 'sycl::detail::builtin_enable<sycl::detail::default_ret_type, sycl::detail::bitselect_elem_type, sycl::detail::any_shape, sycl::detail::same_elem_type, char, signed char>'}}
+  // expected-note@+1 {{in instantiation of template type alias 'builtin_enable_bitselect_t' requested here}}
+  ignore<builtin_enable_bitselect_t<char, signed char>>();
+}
+} // namespace builtin_enable_bitselect_tests