From 0bf2e666c1d59ba5435bfa1d30f0c3ecb2db640d Mon Sep 17 00:00:00 2001
From: Nick Sarnie <sarnex@users.noreply.github.com>
Date: Wed, 31 Jan 2024 18:21:26 +0000
Subject: [PATCH 01/14] [SYCL][ESIMD] Implement unified memory API for
 scatter(usm, ...) (#12510)

This implements the unified memory API for scatter with USM pointers.

---------

Signed-off-by: Sarnie, Nick <nick.sarnie@intel.com>
---
 sycl/include/sycl/ext/intel/esimd/memory.hpp  | 264 +++++++++++++++---
 .../ext/intel/experimental/esimd/common.hpp   |  10 -
 .../ext/intel/experimental/esimd/memory.hpp   |  25 +-
 .../unified_memory_api/Inputs/scatter.hpp     | 227 +++++++++++++++
 .../ESIMD/unified_memory_api/scatter_usm.cpp  |  37 +++
 .../scatter_usm_dg2_pvc.cpp                   |  38 +++
 sycl/test/esimd/memory_properties.cpp         |  60 ++++
 7 files changed, 597 insertions(+), 64 deletions(-)
 create mode 100644 sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp
 create mode 100644 sycl/test-e2e/ESIMD/unified_memory_api/scatter_usm.cpp
 create mode 100644 sycl/test-e2e/ESIMD/unified_memory_api/scatter_usm_dg2_pvc.cpp
diff --git a/sycl/include/sycl/ext/intel/esimd/memory.hpp b/sycl/include/sycl/ext/intel/esimd/memory.hpp
index 7af684ee99cfb..5d8f13ce82619 100644
--- a/sycl/include/sycl/ext/intel/esimd/memory.hpp
+++ b/sycl/include/sycl/ext/intel/esimd/memory.hpp
@@ -179,6 +179,45 @@ __ESIMD_API simd<T, N * NElts> gather_impl(const T *p, simd<OffsetT, N> offsets,
   return lsc_format_ret<T>(Result);
 }
 
+/// USM pointer scatter.
+/// Supported platforms: DG2, PVC
+/// VISA instruction: lsc_store.ugm
+///
+/// Scatters elements to specific address.
+///
+/// @tparam T is element type.
+/// @tparam NElts is the number of elements to store per address.
+/// @tparam DS is the data size.
+/// @tparam L1H is L1 cache hint.
+/// @tparam L2H is L2 cache hint.
+/// @tparam N is the number of channels (platform dependent).
+/// @param p is the base pointer.
+/// @param offsets is the zero-based offsets in bytes.
+/// @param vals is values to store.
+/// @param pred is predicates.
+///
+template <typename T, int NElts, lsc_data_size DS, cache_hint L1H,
+          cache_hint L2H, int N, typename Toffset>
+__ESIMD_API void scatter_impl(T *p, simd<Toffset, N> offsets,
+                              simd<T, N * NElts> vals, simd_mask<N> pred) {
+  static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");
+  check_lsc_vector_size<NElts>();
+  check_lsc_data_size<T, DS>();
+  check_cache_hint<cache_action::store, L1H, L2H>();
+  constexpr uint16_t AddressScale = 1;
+  constexpr int ImmOffset = 0;
+  constexpr lsc_data_size EDS = expand_data_size(finalize_data_size<T, DS>());
+  constexpr lsc_vector_size VS = to_lsc_vector_size<NElts>();
+  constexpr lsc_data_order Transposed = lsc_data_order::nontranspose;
+  using MsgT = typename lsc_expand_type<T>::type;
+  simd<uintptr_t, N> addrs = reinterpret_cast<uintptr_t>(p);
+  addrs += convert<uintptr_t>(offsets);
+  simd<MsgT, N * NElts> Tmp = lsc_format_input<MsgT, T>(vals);
+  __esimd_lsc_store_stateless<MsgT, L1H, L2H, AddressScale, ImmOffset, EDS, VS,
+                              Transposed, N>(pred.data(), addrs.data(),
+                                             Tmp.data());
+}
+
 // Returns true iff it is Ok to use llvm.masked.gather and llvm.masked.scatter.
 // By default (without use specifying __ESIMD_GATHER_SCATTER_LLVM_IR) it is
 // not used because of an issue in GPU driver, which does not recognize
@@ -616,44 +655,203 @@ gather(const Tx *p, Toffset offset, simd_mask<N> mask = 1) {
   return gather<Tx, N>(p, simd<Toffset, N>(offset), mask);
 }
 
+/// template <typename T, int N, int VS = 1, typename OffsetT,
+/// 	  typename PropertyListT = empty_properties_t>
+/// void scatter(T *p, simd<OffsetT, N / VS> byte_offsets, simd<T, N> vals,
+/// 	simd_mask<N / VS> mask, PropertyListT props = {}); // (usm-sc-1)
+
+/// template <typename T, int N, int VS = 1, typename OffsetT,
+/// 	  typename PropertyListT = empty_properties_t>
+/// void scatter(T *p, simd<OffsetT, N / VS> byte_offsets, simd<T, N> vals,
+/// 	PropertyListT props = {});                         // (usm-sc-2)
+
+/// The next two functions are similar to usm-sc-{1,2} with the 'byte_offsets'
+/// parameter represerented as 'simd_view'.
+
+/// template <typename T, int N, int VS = 1, typename OffsetSimdViewT,
+/// 	  typename PropertyListT = empty_properties_t>
+/// void scatter(T *p, OffsetSimdViewT byte_offsets, simd<T, N> vals,
+/// 	simd_mask<N / VS> mask, PropertyListT props = {}); // (usm-sc-3)
+
+/// template <typename T, int N, int VS = 1, typename OffsetSimdViewT,
+/// 	  typename PropertyListT = empty_properties_t>
+/// void scatter(T *p, OffsetSimdViewT byte_offsets, simd<T, N> vals,
+///      PropertyListT props = {});                         // (usm-sc-4)
+
+/// template <typename T, int N, int VS = 1, typename OffsetT,
+/// 	  typename PropertyListT = empty_properties_t>
+/// void scatter(T *p, simd<OffsetT, N / VS> byte_offsets, simd<T, N> vals,
+/// 	simd_mask<N / VS> mask, PropertyListT props = {}); // (usm-sc-1)
+///
 /// Writes ("scatters") elements of the input vector to different memory
 /// locations. Each memory location is base address plus an offset - a
 /// value of the corresponding element in the input offset vector. Access to
 /// any element's memory location can be disabled via the input mask.
-/// @tparam Tx Element type, must be of size 4 or less.
-/// @tparam N Number of elements to write; can be \c 1, \c 2, \c 4, \c 8, \c 16
-///   or \c 32.
+/// @tparam T Element type.
+/// @tparam N Number of elements to write.
+/// @tparam VS Vector size. It can also be read as the number of writes per each
+/// address. The parameter 'N' must be divisible by 'VS'. (VS > 1) is supported
+/// only on DG2 and PVC and only for 4- and 8-byte element vectors.
 /// @param p The base address.
-/// @param offsets A vector of 32-bit or 64-bit offsets in bytes. For each lane
-/// \c i,   ((byte*)p + offsets[i]) must be element size aligned.
+/// @param byte_offsets the vector of 32-bit or 64-bit offsets in bytes.
+/// For each i, ((byte*)p + byte_offsets[i]) must be element size aligned.
+/// If the alignment property is not passed, then it is assumed that each
+/// accessed address is aligned by element-size.
 /// @param vals The vector to scatter.
-/// @param mask The access mask, defaults to all 1s.
+/// @param mask The access mask.
+/// @param props The optional compile-time properties. Only 'alignment'
+/// and cache hint properties are used.
+template <typename T, int N, int VS = 1, typename OffsetT,
+          typename PropertyListT =
+              ext::oneapi::experimental::detail::empty_properties_t>
+__ESIMD_API std::enable_if_t<
+    ext::oneapi::experimental::is_property_list_v<PropertyListT>>
+scatter(T *p, simd<OffsetT, N / VS> byte_offsets, simd<T, N> vals,
+        simd_mask<N / VS> mask, PropertyListT props = {}) {
+  static_assert(std::is_integral_v<OffsetT>, "Unsupported offset type");
+  static_assert(N / VS >= 1 && N % VS == 0, "N must be divisible by VS");
+
+  constexpr size_t Alignment =
+      detail::getPropertyValue<PropertyListT, alignment_key>(sizeof(T));
+  static_assert(Alignment >= sizeof(T),
+                "scatter() requires at least element-size alignment");
+  constexpr auto L1Hint =
+      detail::getPropertyValue<PropertyListT, cache_hint_L1_key>(
+          cache_hint::none);
+  constexpr auto L2Hint =
+      detail::getPropertyValue<PropertyListT, cache_hint_L2_key>(
+          cache_hint::none);
+
+  // Use LSC lowering if L1/L2 or VS > 1.
+  if constexpr (L1Hint != cache_hint::none || L2Hint != cache_hint::none ||
+                VS > 1 || !__ESIMD_DNS::isPowerOf2(N, 32)) {
+    static_assert(VS == 1 || sizeof(T) >= 4,
+                  "VS > 1 is supprted only for 4- and 8-byte elements");
+    return detail::scatter_impl<T, VS, detail::lsc_data_size::default_size,
+                                L1Hint, L2Hint>(p, byte_offsets, vals, mask);
+  } else {
+    using Tx = detail::__raw_t<T>;
+    simd<uint64_t, N> byte_offsets_i = convert<uint64_t>(byte_offsets);
+    simd<uint64_t, N> addrs(reinterpret_cast<uint64_t>(p));
+    addrs = addrs + byte_offsets_i;
+    if constexpr (sizeof(T) == 1) {
+      simd<T, N * 4> D = __esimd_wrregion<Tx, N * 4, N, /*VS*/ 0, N, 4>(
+          D.data(), vals.data(), 0);
+      __esimd_svm_scatter<Tx, N, detail::ElemsPerAddrEncoding<4>(),
+                          detail::ElemsPerAddrEncoding<1>()>(
+          addrs.data(), D.data(), mask.data());
+    } else if constexpr (sizeof(T) == 2) {
+      simd<Tx, N * 2> D = __esimd_wrregion<Tx, N * 2, N, /*VS*/ 0, N, 2>(
+          D.data(), vals.data(), 0);
+      __esimd_svm_scatter<Tx, N, detail::ElemsPerAddrEncoding<2>(),
+                          detail::ElemsPerAddrEncoding<2>()>(
+          addrs.data(), D.data(), mask.data());
+    } else
+      __esimd_svm_scatter<Tx, N, detail::ElemsPerAddrEncoding<1>(),
+                          detail::ElemsPerAddrEncoding<1>()>(
+          addrs.data(), vals.data(), mask.data());
+  }
+}
+
+// template <typename T, int N, int VS = 1, typename OffsetT,
+// 	  typename PropertyListT = empty_properties_t>
+// void scatter(T *p, simd<OffsetT, N / VS> byte_offsets, simd<T, N> vals,
+// 	PropertyListT props = {});                               // (usm-sc-2)
 ///
-template <typename Tx, int N, typename Toffset>
-__ESIMD_API void scatter(Tx *p, simd<Toffset, N> offsets, simd<Tx, N> vals,
-                         simd_mask<N> mask = 1) {
-  using T = detail::__raw_t<Tx>;
-  static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");
-  static_assert(detail::isPowerOf2(N, 32), "Unsupported value of N");
-  simd<uint64_t, N> offsets_i = convert<uint64_t>(offsets);
-  simd<uint64_t, N> addrs(reinterpret_cast<uint64_t>(p));
-  addrs = addrs + offsets_i;
-  if constexpr (sizeof(T) == 1) {
-    simd<T, N * 4> D;
-    D = __esimd_wrregion<T, N * 4, N, /*VS*/ 0, N, 4>(D.data(), vals.data(), 0);
-    __esimd_svm_scatter<T, N, detail::ElemsPerAddrEncoding<4>(),
-                        detail::ElemsPerAddrEncoding<1>()>(
-        addrs.data(), D.data(), mask.data());
-  } else if constexpr (sizeof(T) == 2) {
-    simd<T, N * 2> D;
-    D = __esimd_wrregion<T, N * 2, N, /*VS*/ 0, N, 2>(D.data(), vals.data(), 0);
-    __esimd_svm_scatter<T, N, detail::ElemsPerAddrEncoding<2>(),
-                        detail::ElemsPerAddrEncoding<2>()>(
-        addrs.data(), D.data(), mask.data());
-  } else
-    __esimd_svm_scatter<T, N, detail::ElemsPerAddrEncoding<1>(),
-                        detail::ElemsPerAddrEncoding<1>()>(
-        addrs.data(), vals.data(), mask.data());
+/// Writes ("scatters") elements of the input vector to different memory
+/// locations. Each memory location is base address plus an offset - a
+/// value of the corresponding element in the input offset vector.
+/// @tparam T Element type.
+/// @tparam N Number of elements to write.
+/// @tparam VS Vector size. It can also be read as the number of writes per each
+/// address. The parameter 'N' must be divisible by 'VS'. (VS > 1) is supported
+/// only on DG2 and PVC and only for 4- and 8-byte element vectors.
+/// @param p The base address.
+/// @param byte_offsets the vector of 32-bit or 64-bit offsets in bytes.
+/// For each i, ((byte*)p + byte_offsets[i]) must be element size aligned.
+/// If the alignment property is not passed, then it is assumed that each
+/// accessed address is aligned by element-size.
+/// @param vals The vector to scatter.
+/// @param props The optional compile-time properties. Only 'alignment'
+/// and cache hint properties are used.
+template <typename T, int N, int VS = 1, typename OffsetT,
+          typename PropertyListT =
+              ext::oneapi::experimental::detail::empty_properties_t>
+__ESIMD_API std::enable_if_t<
+    ext::oneapi::experimental::is_property_list_v<PropertyListT>>
+scatter(T *p, simd<OffsetT, N / VS> byte_offsets, simd<T, N> vals,
+        PropertyListT props = {}) {
+  simd_mask<N / VS> Mask = 1;
+  scatter<T, N, VS>(p, byte_offsets, vals, Mask, props);
+}
+
+// template <typename T, int N, int VS = 1, typename OffsetSimdViewT,
+// 	  typename PropertyListT = empty_properties_t>
+// void scatter(T *p, OffsetSimdViewT byte_offsets, simd<T, N> vals,
+// 	simd_mask<N / VS> mask, PropertyListT props = {}); // (usm-sc-3)
+///
+/// Writes ("scatters") elements of the input vector to different memory
+/// locations. Each memory location is base address plus an offset - a
+/// value of the corresponding element in the input offset vector. Access to
+/// any element's memory location can be disabled via the input mask.
+/// @tparam T Element type.
+/// @tparam N Number of elements to write.
+/// @tparam VS Vector size. It can also be read as the number of writes per each
+/// address. The parameter 'N' must be divisible by 'VS'. (VS > 1) is supported
+/// only on DG2 and PVC and only for 4- and 8-byte element vectors.
+/// @param p The base address.
+/// @param byte_offsets the vector of 32-bit or 64-bit offsets in bytes
+/// represented as a 'simd_view' object.
+/// For each i, ((byte*)p + byte_offsets[i]) must be element size aligned.
+/// If the alignment property is not passed, then it is assumed that each
+/// accessed address is aligned by element-size.
+/// @param vals The vector to scatter.
+/// @param mask The access mask.
+/// @param props The optional compile-time properties. Only 'alignment'
+/// and cache hint properties are used.
+template <typename T, int N, int VS = 1, typename OffsetSimdViewT,
+          typename PropertyListT =
+              ext::oneapi::experimental::detail::empty_properties_t>
+__ESIMD_API std::enable_if_t<
+    detail::is_simd_view_type_v<OffsetSimdViewT> &&
+    ext::oneapi::experimental::is_property_list_v<PropertyListT>>
+scatter(T *p, OffsetSimdViewT byte_offsets, simd<T, N> vals,
+        simd_mask<N / VS> mask, PropertyListT props = {}) {
+  scatter<T, N, VS>(p, byte_offsets.read(), vals, mask, props);
+}
+
+/// template <typename T, int N, int VS = 1, typename OffsetSimdViewT,
+/// 	  typename PropertyListT = empty_properties_t>
+/// void scatter(T *p, OffsetSimdViewT byte_offsets, simd<T, N> vals,
+///      PropertyListT props = {});                         // (usm-sc-4)
+///
+/// Writes ("scatters") elements of the input vector to different memory
+/// locations. Each memory location is base address plus an offset - a
+/// value of the corresponding element in the input offset vector.
+/// @tparam T Element type.
+/// @tparam N Number of elements to write.
+/// @tparam VS Vector size. It can also be read as the number of writes per each
+/// address. The parameter 'N' must be divisible by 'VS'. (VS > 1) is supported
+/// only on DG2 and PVC and only for 4- and 8-byte element vectors.
+/// @param p The base address.
+/// @param byte_offsets the vector of 32-bit or 64-bit offsets in bytes
+/// represented as a 'simd_view' object.
+/// For each i, ((byte*)p + byte_offsets[i]) must be element size aligned.
+/// If the alignment property is not passed, then it is assumed that each
+/// accessed address is aligned by element-size.
+/// @param vals The vector to scatter.
+/// @param props The optional compile-time properties. Only 'alignment'
+/// and cache hint properties are used.
+template <typename T, int N, int VS = 1, typename OffsetSimdViewT,
+          typename PropertyListT =
+              ext::oneapi::experimental::detail::empty_properties_t>
+__ESIMD_API std::enable_if_t<
+    detail::is_simd_view_type_v<OffsetSimdViewT> &&
+    ext::oneapi::experimental::is_property_list_v<PropertyListT>>
+scatter(T *p, OffsetSimdViewT byte_offsets, simd<T, N> vals,
+        PropertyListT props = {}) {
+  simd_mask<N / VS> Mask = 1;
+  scatter<T, N, VS>(p, byte_offsets.read(), vals, Mask, props);
 }
 
 /// A variation of \c scatter API with \c offsets represented as \c simd_view
@@ -671,7 +869,7 @@ __ESIMD_API void scatter(Tx *p, simd<Toffset, N> offsets, simd<Tx, N> vals,
 template <typename Tx, int N, typename OffsetObjT, typename RegionTy>
 __ESIMD_API void scatter(Tx *p, simd_view<OffsetObjT, RegionTy> offsets,
                          simd<Tx, N> vals, simd_mask<N> mask = 1) {
-  scatter<Tx, N>(p, offsets.read(), vals, mask);
+  scatter<Tx, N, 1>(p, offsets.read(), vals, mask);
 }
 
 /// A variation of \c scatter API with \c offsets represented as scalar.
@@ -688,7 +886,7 @@ __ESIMD_API void scatter(Tx *p, simd_view<OffsetObjT, RegionTy> offsets,
 template <typename Tx, int N, typename Toffset>
 __ESIMD_API std::enable_if_t<std::is_integral_v<Toffset> && N == 1>
 scatter(Tx *p, Toffset offset, simd<Tx, N> vals, simd_mask<N> mask = 1) {
-  scatter<Tx, N>(p, simd<Toffset, N>(offset), vals, mask);
+  scatter<Tx, N, 1>(p, simd<Toffset, N>(offset), vals, mask);
 }
 
 namespace detail {
diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/common.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/common.hpp
index 774154dfc0f97..a71981de54bec 100644
--- a/sycl/include/sycl/ext/intel/experimental/esimd/common.hpp
+++ b/sycl/include/sycl/ext/intel/experimental/esimd/common.hpp
@@ -102,16 +102,6 @@ template <typename T> struct lsc_expand_type {
   using type = __ESIMD_DNS::lsc_expand_type<T>::type;
 };
 
-template <typename T> struct lsc_bitcast_type {
-public:
-  using type = std::conditional_t<
-      sizeof(T) == 1, uint8_t,
-      std::conditional_t<
-          sizeof(T) == 2, uint16_t,
-          std::conditional_t<sizeof(T) == 4, uint32_t,
-                             std::conditional_t<sizeof(T) == 8, uint64_t, T>>>>;
-};
-
 } // namespace detail
 
 /// L1 or L3 cache hint kinds.
diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp
index 069898f14e747..7fb0ae833e793 100644
--- a/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp
+++ b/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp
@@ -1434,7 +1434,7 @@ __ESIMD_API void lsc_slm_scatter(__ESIMD_NS::simd<uint32_t, N> offsets,
   constexpr detail::lsc_data_order _Transposed =
       detail::lsc_data_order::nontranspose;
   using MsgT = typename detail::lsc_expand_type<T>::type;
-  using CstT = typename detail::lsc_bitcast_type<T>::type;
+  using CstT = __ESIMD_DNS::uint_type_t<sizeof(T)>;
   __ESIMD_NS::simd<MsgT, N * NElts> Tmp = vals.template bit_cast_view<CstT>();
   __esimd_lsc_store_slm<MsgT, cache_hint::none, cache_hint::none, _AddressScale,
                         _ImmOffset, _DS, _VS, _Transposed, N>(
@@ -1487,25 +1487,8 @@ template <typename T, int NElts = 1,
 __ESIMD_API void lsc_scatter(T *p, __ESIMD_NS::simd<Toffset, N> offsets,
                              __ESIMD_NS::simd<T, N * NElts> vals,
                              __ESIMD_NS::simd_mask<N> pred = 1) {
-  static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");
-  detail::check_lsc_vector_size<NElts>();
-  detail::check_lsc_data_size<T, DS>();
-  detail::check_lsc_cache_hint<detail::lsc_action::store, L1H, L3H>();
-  constexpr uint16_t _AddressScale = 1;
-  constexpr int _ImmOffset = 0;
-  constexpr lsc_data_size _DS =
-      detail::expand_data_size(detail::finalize_data_size<T, DS>());
-  constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size<NElts>();
-  constexpr detail::lsc_data_order _Transposed =
-      detail::lsc_data_order::nontranspose;
-  using MsgT = typename detail::lsc_expand_type<T>::type;
-  using _CstT = typename detail::lsc_bitcast_type<T>::type;
-  __ESIMD_NS::simd<MsgT, N * NElts> Tmp = vals.template bit_cast_view<_CstT>();
-  __ESIMD_NS::simd<uintptr_t, N> addrs = reinterpret_cast<uintptr_t>(p);
-  addrs += convert<uintptr_t>(offsets);
-  __esimd_lsc_store_stateless<MsgT, L1H, L3H, _AddressScale, _ImmOffset, _DS,
-                              _VS, _Transposed, N>(pred.data(), addrs.data(),
-                                                   Tmp.data());
+  __ESIMD_DNS::scatter_impl<T, NElts, DS, L1H, L3H, N, Toffset>(p, offsets,
+                                                                vals, pred);
 }
 
 template <typename T, int NElts = 1,
@@ -1573,7 +1556,7 @@ lsc_scatter(AccessorTy acc,
   constexpr detail::lsc_data_order _Transposed =
       detail::lsc_data_order::nontranspose;
   using MsgT = typename detail::lsc_expand_type<T>::type;
-  using _CstT = typename detail::lsc_bitcast_type<T>::type;
+  using _CstT = __ESIMD_DNS::uint_type_t<sizeof(T)>;
   __ESIMD_NS::simd<MsgT, N * NElts> Tmp = vals.template bit_cast_view<_CstT>();
   auto si = __ESIMD_NS::get_surface_index(acc);
   __esimd_lsc_store_bti<MsgT, L1H, L3H, _AddressScale, _ImmOffset, _DS, _VS,
diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp b/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp
new file mode 100644
index 0000000000000..270ecc4e698af
--- /dev/null
+++ b/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp
@@ -0,0 +1,227 @@
+//==------- scatter.hpp - DPC++ ESIMD on-device test ----------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===-------------------------------------------------------------------===//
+
+#include "common.hpp"
+
+using namespace sycl;
+using namespace sycl::ext::intel::esimd;
+
+template <typename T>
+bool verify(const T *Out, int N, int Size, int VS, uint32_t MaskStride,
+            bool UseMask) {
+  using Tuint = esimd_test::uint_type_t<sizeof(T)>;
+  int NumErrors = 0;
+  int NOffsets = N / VS;
+  for (uint32_t I = 0; I < Size; I += N) { // Verify by 1 vector at once
+    for (int VSI = 0; VSI < VS; VSI++) {
+      for (int OffsetI = 0; OffsetI < NOffsets; OffsetI++) {
+        size_t OutIndex = I + VSI * NOffsets + OffsetI;
+        bool IsMaskSet = UseMask ? ((OutIndex / VS) % MaskStride == 0) : true;
+        Tuint Expected = sycl::bit_cast<Tuint>((T)OutIndex);
+        if (!UseMask || IsMaskSet)
+          Expected = sycl::bit_cast<Tuint>((T)(OutIndex * 2));
+        Tuint Computed = sycl::bit_cast<Tuint>(Out[OutIndex]);
+        if (Computed != Expected && ++NumErrors < 16) {
+          std::cout << "Out[" << OutIndex << "] = " << std::to_string(Computed)
+                    << " vs " << std::to_string(Expected) << std::endl;
+        }
+      }
+    }
+  }
+  return NumErrors == 0;
+}
+
+template <typename T, uint16_t N, uint16_t VS, bool UseMask, bool UseProperties,
+          typename ScatterPropertiesT>
+bool testUSM(queue Q, uint32_t MaskStride,
+             ScatterPropertiesT ScatterProperties) {
+  uint32_t Groups = 8;
+  uint32_t Threads = 16;
+  size_t Size = Groups * Threads * N;
+  static_assert(VS > 0 && N % VS == 0,
+                "Incorrect VS parameter. N must be divisible by VS.");
+  constexpr int NOffsets = N / VS;
+  using Tuint = sycl::_V1::ext::intel::esimd::detail::uint_type_t<sizeof(T)>;
+
+  std::cout << "USM case: T=" << esimd_test::type_name<T>() << ",N=" << N
+            << ", VS=" << VS << ",UseMask=" << UseMask
+            << ",UseProperties=" << UseProperties << std::endl;
+
+  sycl::range<1> GlobalRange{Groups};
+  sycl::range<1> LocalRange{Threads};
+  sycl::nd_range<1> Range{GlobalRange * LocalRange, LocalRange};
+
+  T *Out = static_cast<T *>(sycl::malloc_shared(Size * sizeof(T), Q));
+  for (size_t i = 0; i < Size; i++)
+    Out[i] = i;
+
+  try {
+    Q.submit([&](handler &cgh) {
+       cgh.parallel_for(Range, [=](sycl::nd_item<1> ndi) SYCL_ESIMD_KERNEL {
+         ScatterPropertiesT Props{};
+         uint16_t GlobalID = ndi.get_global_id(0);
+         simd<int32_t, NOffsets> ByteOffsets(GlobalID * N * sizeof(T),
+                                             VS * sizeof(T));
+         auto ByteOffsetsView = ByteOffsets.template select<NOffsets, 1>();
+         simd<T, N> Vals = gather<T, N, VS>(Out, ByteOffsets);
+         Vals *= 2;
+         auto ValsView = Vals.template select<N, 1>();
+         simd_mask<NOffsets> Pred = 0;
+         for (int I = 0; I < NOffsets; I++)
+           Pred[I] = (I % MaskStride == 0) ? 1 : 0;
+         if constexpr (VS > 1) { // VS > 1 requires specifying <T, N, VS>
+           if constexpr (UseMask) {
+             if constexpr (UseProperties) {
+               if (GlobalID % 4 == 0)
+                 scatter<T, N, VS>(Out, ByteOffsets, Vals, Pred, Props);
+               else if (GlobalID % 4 == 1)
+                 scatter<T, N, VS>(Out, ByteOffsetsView, Vals, Pred, Props);
+               else if (GlobalID % 4 == 2)
+                 scatter<T, N, VS>(Out, ByteOffsets, ValsView, Pred, Props);
+               else if (GlobalID % 4 == 3)
+                 scatter<T, N, VS>(Out, ByteOffsetsView, ValsView, Pred, Props);
+             } else { // UseProperties == false
+               if (GlobalID % 4 == 0)
+                 scatter<T, N, VS>(Out, ByteOffsets, Vals, Pred);
+               else if (GlobalID % 4 == 1)
+                 scatter<T, N, VS>(Out, ByteOffsetsView, Vals, Pred);
+               else if (GlobalID % 4 == 2)
+                 scatter<T, N, VS>(Out, ByteOffsets, ValsView, Pred);
+               else if (GlobalID % 4 == 3)
+                 scatter<T, N, VS>(Out, ByteOffsetsView, ValsView, Pred);
+             }
+           } else { // UseMask == false
+             if constexpr (UseProperties) {
+               if (GlobalID % 4 == 0)
+                 scatter<T, N, VS>(Out, ByteOffsets, Vals, Props);
+               else if (GlobalID % 4 == 1)
+                 scatter<T, N, VS>(Out, ByteOffsetsView, Vals, Props);
+               else if (GlobalID % 4 == 2)
+                 scatter<T, N, VS>(Out, ByteOffsets, ValsView, Props);
+               else if (GlobalID % 4 == 3)
+                 scatter<T, N, VS>(Out, ByteOffsetsView, ValsView, Props);
+             } else { // UseProperties == false
+               if (GlobalID % 4 == 0)
+                 scatter<T, N, VS>(Out, ByteOffsets, Vals);
+               else if (GlobalID % 4 == 1)
+                 scatter<T, N, VS>(Out, ByteOffsetsView, Vals);
+               else if (GlobalID % 4 == 2)
+                 scatter<T, N, VS>(Out, ByteOffsets, ValsView);
+               else if (GlobalID % 4 == 3)
+                 scatter<T, N, VS>(Out, ByteOffsetsView, ValsView);
+             }
+           }
+         } else { // VS == 1
+           if constexpr (UseMask) {
+             if constexpr (UseProperties) {
+               if (GlobalID % 4 == 0)
+                 scatter(Out, ByteOffsets, Vals, Pred, Props);
+               else if (GlobalID % 4 == 1)
+                 scatter(Out, ByteOffsetsView, Vals, Pred, Props);
+               else if (GlobalID % 4 == 2)
+                 scatter<T, N>(Out, ByteOffsets, ValsView, Pred, Props);
+               else if (GlobalID % 4 == 3)
+                 scatter<T, N>(Out, ByteOffsetsView, ValsView, Pred, Props);
+             } else { // UseProperties == false
+               if (GlobalID % 4 == 0)
+                 scatter(Out, ByteOffsets, Vals, Pred);
+               else if (GlobalID % 4 == 1)
+                 scatter(Out, ByteOffsetsView, Vals, Pred);
+               else if (GlobalID % 4 == 2)
+                 scatter(Out, ByteOffsets, ValsView, Pred);
+               else if (GlobalID % 4 == 3)
+                 scatter(Out, ByteOffsetsView, ValsView, Pred);
+             }
+           } else { // UseMask == false
+             if constexpr (UseProperties) {
+               if (GlobalID % 4 == 0)
+                 scatter(Out, ByteOffsets, Vals, Props);
+               else if (GlobalID % 4 == 1)
+                 scatter(Out, ByteOffsetsView, Vals, Props);
+               else if (GlobalID % 4 == 2)
+                 scatter<T, N>(Out, ByteOffsets, ValsView, Props);
+               else if (GlobalID % 4 == 3)
+                 scatter<T, N>(Out, ByteOffsetsView, ValsView, Props);
+             } else { // UseProperties == false
+               if (GlobalID % 4 == 0)
+                 scatter(Out, ByteOffsets, Vals);
+               else if (GlobalID % 4 == 1)
+                 scatter(Out, ByteOffsetsView, Vals);
+               else if (GlobalID % 4 == 2)
+                 scatter<T, N>(Out, ByteOffsets, ValsView);
+               else if (GlobalID % 4 == 3)
+                 scatter<T, N>(Out, ByteOffsetsView, ValsView);
+             }
+           }
+         }
+       });
+     }).wait();
+  } catch (sycl::exception const &e) {
+    std::cout << "SYCL exception caught: " << e.what() << '\n';
+    sycl::free(Out, Q);
+    return false;
+  }
+
+  bool Passed = verify(Out, N, Size, VS, MaskStride, UseMask);
+
+  sycl::free(Out, Q);
+
+  return Passed;
+}
+
+template <typename T, TestFeatures Features> bool testUSM(queue Q) {
+  constexpr bool CheckMask = true;
+  constexpr bool CheckProperties = true;
+  properties EmptyProps;
+  properties AlignElemProps{alignment<sizeof(T)>};
+
+  bool Passed = true;
+
+  // // Test scatter() that is available on Gen12 and PVC.
+  Passed &= testUSM<T, 1, 1, !CheckMask, CheckProperties>(Q, 2, EmptyProps);
+  Passed &= testUSM<T, 2, 1, !CheckMask, CheckProperties>(Q, 1, EmptyProps);
+  Passed &= testUSM<T, 1, 1, !CheckMask, CheckProperties>(Q, 2, EmptyProps);
+  Passed &= testUSM<T, 8, 1, !CheckMask, CheckProperties>(Q, 2, EmptyProps);
+  Passed &= testUSM<T, 16, 1, !CheckMask, CheckProperties>(Q, 2, EmptyProps);
+  Passed &= testUSM<T, 16, 1, !CheckMask, CheckProperties>(Q, 2, EmptyProps);
+
+  Passed &= testUSM<T, 32, 1, !CheckMask, CheckProperties>(Q, 2, EmptyProps);
+
+  // // Test scatter() without passing compile-time properties argument.
+  Passed &= testUSM<T, 16, 1, !CheckMask, !CheckProperties>(Q, 2, EmptyProps);
+  Passed &= testUSM<T, 32, 1, !CheckMask, !CheckProperties>(Q, 2, EmptyProps);
+
+  if constexpr (Features == TestFeatures::PVC ||
+                Features == TestFeatures::DG2) {
+    properties LSCProps{cache_hint_L1<cache_hint::streaming>,
+                        cache_hint_L2<cache_hint::uncached>,
+                        alignment<sizeof(T)>};
+    Passed &= testUSM<T, 1, 1, !CheckMask, CheckProperties>(Q, 2, LSCProps);
+    Passed &= testUSM<T, 2, 1, CheckMask, CheckProperties>(Q, 2, LSCProps);
+    Passed &= testUSM<T, 4, 1, CheckMask, CheckProperties>(Q, 2, LSCProps);
+    Passed &= testUSM<T, 8, 1, CheckMask, CheckProperties>(Q, 2, LSCProps);
+
+    Passed &= testUSM<T, 32, 1, CheckMask, CheckProperties>(Q, 2, LSCProps);
+
+    // Check VS > 1. GPU supports only dwords and qwords in this mode.
+    if constexpr (sizeof(T) >= 4) {
+      // TODO: This test case causes flaky fail. Enable it after the issue
+      // in GPU driver is fixed.
+      // Passed &=
+      //     testUSM<T, 16, 2, CheckMask, CheckProperties>(Q, 2, AlignElemProps)
+      Passed &=
+          testUSM<T, 32, 2, !CheckMask, CheckProperties>(Q, 2, AlignElemProps);
+      Passed &=
+          testUSM<T, 32, 2, CheckMask, CheckProperties>(Q, 2, AlignElemProps);
+      Passed &=
+          testUSM<T, 32, 2, CheckMask, CheckProperties>(Q, 2, AlignElemProps);
+    }
+  } // TestPVCFeatures
+
+  return Passed;
+}
diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/scatter_usm.cpp b/sycl/test-e2e/ESIMD/unified_memory_api/scatter_usm.cpp
new file mode 100644
index 0000000000000..929d3c6fc04f7
--- /dev/null
+++ b/sycl/test-e2e/ESIMD/unified_memory_api/scatter_usm.cpp
@@ -0,0 +1,37 @@
+//==------- scatter_usm.cpp - DPC++ ESIMD on-device test ---------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===------------------------------------------------------------------===//
+// RUN: %{build} -fsycl-device-code-split=per_kernel -o %t.out
+// RUN: %{run} %t.out
+
+// The test verifies esimd::scatter() functions accepting USM pointer
+// and optional compile-time esimd::properties.
+// The scatter() calls in this test do not use cache-hint
+// properties to not impose using DG2/PVC features.
+
+#include "Inputs/scatter.hpp"
+
+int main() {
+  auto Q = queue{gpu_selector_v};
+  esimd_test::printTestLabel(Q);
+
+  constexpr auto TestFeatures = TestFeatures::Generic;
+  bool Passed = true;
+
+  Passed &= testUSM<int8_t, TestFeatures>(Q);
+  Passed &= testUSM<int16_t, TestFeatures>(Q);
+  if (Q.get_device().has(sycl::aspect::fp16))
+    Passed &= testUSM<sycl::half, TestFeatures>(Q);
+  Passed &= testUSM<uint32_t, TestFeatures>(Q);
+  Passed &= testUSM<float, TestFeatures>(Q);
+  Passed &= testUSM<int64_t, TestFeatures>(Q);
+  if (Q.get_device().has(sycl::aspect::fp64))
+    Passed &= testUSM<double, TestFeatures>(Q);
+
+  std::cout << (Passed ? "Passed\n" : "FAILED\n");
+  return Passed ? 0 : 1;
+}
diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/scatter_usm_dg2_pvc.cpp b/sycl/test-e2e/ESIMD/unified_memory_api/scatter_usm_dg2_pvc.cpp
new file mode 100644
index 0000000000000..aa466795e9b06
--- /dev/null
+++ b/sycl/test-e2e/ESIMD/unified_memory_api/scatter_usm_dg2_pvc.cpp
@@ -0,0 +1,38 @@
+//==------- scatter_usm_dg2_pvc.cpp - DPC++ ESIMD on-device test--------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===------------------------------------------------------------------===//
+// REQUIRES: gpu-intel-pvc || gpu-intel-dg2
+// RUN: %{build} -fsycl-device-code-split=per_kernel -o %t.out
+// RUN: %{run} %t.out
+
+// The test verifies esimd::scatter() functions accepting USM pointer
+// and optional compile-time esimd::properties.
+// The scatter() calls in this test uses cache-hint
+// properties and requires DG2 or PVC.
+
+#include "Inputs/scatter.hpp"
+
+int main() {
+  auto Q = queue{gpu_selector_v};
+  esimd_test::printTestLabel(Q);
+
+  constexpr auto TestFeatures = TestFeatures::PVC;
+  bool Passed = true;
+
+  Passed &= testUSM<int8_t, TestFeatures>(Q);
+  Passed &= testUSM<int16_t, TestFeatures>(Q);
+  if (Q.get_device().has(sycl::aspect::fp16))
+    Passed &= testUSM<sycl::half, TestFeatures>(Q);
+  Passed &= testUSM<uint32_t, TestFeatures>(Q);
+  Passed &= testUSM<float, TestFeatures>(Q);
+  Passed &= testUSM<int64_t, TestFeatures>(Q);
+  if (Q.get_device().has(sycl::aspect::fp64))
+    Passed &= testUSM<double, TestFeatures>(Q);
+
+  std::cout << (Passed ? "Passed\n" : "FAILED\n");
+  return Passed ? 0 : 1;
+}
diff --git a/sycl/test/esimd/memory_properties.cpp b/sycl/test/esimd/memory_properties.cpp
index 737d7b4fabfad..31dbc3e889f4b 100644
--- a/sycl/test/esimd/memory_properties.cpp
+++ b/sycl/test/esimd/memory_properties.cpp
@@ -969,6 +969,8 @@ test_gather_scatter(AccType &acc, float *ptrf, int byte_offset32,
   simd<float, 32> pass_thru;
   auto pass_thru_view = pass_thru.select<32, 1>();
 
+  auto usm_view = usm.select<32, 1>();
+
   // Test USM and ACC gather using this plan:
   // 1) gather(usm, offsets): offsets is simd or simd_view
   // 2) gather(usm, offsets, mask): offsets is simd or simd_view
@@ -1151,6 +1153,64 @@ test_gather_scatter(AccType &acc, float *ptrf, int byte_offset32,
                                  props_align4);
   acc_res = gather<float, 32, 2>(acc, ioffset_n16_view, mask_n16,
                                  pass_thru_view, props_align4);
+
+  // CHECK-COUNT-4: call void @llvm.genx.svm.scatter.v32i1.v32i64.v32f32(<32 x i1> {{[^)]+}}, i32 0, <32 x i64> {{[^)]+}}, <32 x float> {{[^)]+}})
+  scatter(ptrf, ioffset_n32, usm, mask_n32);
+
+  scatter(ptrf, ioffset_n32, usm);
+
+  scatter(ptrf, ioffset_n32, usm, mask_n32, props_align4);
+
+  scatter(ptrf, ioffset_n32, usm, props_align4);
+
+  // CHECK-COUNT-8: call void @llvm.genx.lsc.store.stateless.v32i1.v32i64.v32i32(<32 x i1> {{[^)]+}}, i8 4, i8 1, i8 1, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <32 x i64> {{[^)]+}}, <32 x i32> {{[^)]+}}, i32 0)
+  scatter(ptrf, ioffset_n32, usm, mask_n32, props_cache_load);
+  scatter(ptrf, ioffset_n32, usm, props_cache_load);
+
+  scatter(ptrf, ioffset_n32_view, usm, mask_n32, props_cache_load);
+  scatter(ptrf, ioffset_n32_view, usm, props_cache_load);
+
+  scatter<float, 32>(ptrf, ioffset_n32, usm_view, mask_n32, props_cache_load);
+  scatter<float, 32>(ptrf, ioffset_n32, usm_view, props_cache_load);
+
+  scatter<float, 32>(ptrf, ioffset_n32_view, usm_view, mask_n32,
+                     props_cache_load);
+  scatter<float, 32>(ptrf, ioffset_n32_view, usm_view, props_cache_load);
+
+  // VS > 1
+  // CHECK-COUNT-8: call void @llvm.genx.lsc.store.stateless.v16i1.v16i64.v32i32(<16 x i1> {{[^)]+}}, i8 4, i8 1, i8 1, i16 1, i32 0, i8 3, i8 2, i8 1, i8 0, <16 x i64> {{[^)]+}}, <32 x i32> {{[^)]+}}, i32 0)
+  scatter<float, 32, 2>(ptrf, ioffset_n16, usm, mask_n16, props_cache_load);
+
+  scatter<float, 32, 2>(ptrf, ioffset_n16, usm, props_cache_load);
+
+  scatter<float, 32, 2>(ptrf, ioffset_n16_view, usm, mask_n16,
+                        props_cache_load);
+  scatter<float, 32, 2>(ptrf, ioffset_n16_view, usm, props_cache_load);
+
+  scatter<float, 32, 2>(ptrf, ioffset_n16, usm_view, mask_n16,
+                        props_cache_load);
+  scatter<float, 32, 2>(ptrf, ioffset_n16, usm_view, props_cache_load);
+
+  scatter<float, 32, 2>(ptrf, ioffset_n16_view, usm_view, mask_n16,
+                        props_cache_load);
+  scatter<float, 32, 2>(ptrf, ioffset_n16_view, usm_view, props_cache_load);
+
+  // CHECK-COUNT-8: call void @llvm.genx.lsc.store.stateless.v16i1.v16i64.v32i32(<16 x i1> {{[^)]+}}, i8 4, i8 0, i8 0, i16 1, i32 0, i8 3, i8 2, i8 1, i8 0, <16 x i64> {{[^)]+}}, <32 x i32> {{[^)]+}}, i32 0)
+  scatter<float, 32, 2>(ptrf, ioffset_n16, usm, mask_n16);
+
+  scatter<float, 32, 2>(ptrf, ioffset_n16, usm);
+
+  scatter<float, 32, 2>(ptrf, ioffset_n16_view, usm, mask_n16);
+
+  scatter<float, 32, 2>(ptrf, ioffset_n16_view, usm);
+
+  scatter<float, 32, 2>(ptrf, ioffset_n16, usm_view, mask_n16);
+
+  scatter<float, 32, 2>(ptrf, ioffset_n16, usm_view);
+
+  scatter<float, 32, 2>(ptrf, ioffset_n16_view, usm_view, mask_n16);
+
+  scatter<float, 32, 2>(ptrf, ioffset_n16_view, usm_view);
 }
 
 // CHECK-LABEL: define {{.*}} @_Z23test_slm_gather_scatter{{.*}}

From 16a368c8a711b79e84940d2aab4d1d217d0c5697 Mon Sep 17 00:00:00 2001
From: Alexey Bader <alexey.bader@intel.com>
Date: Wed, 31 Jan 2024 11:20:46 -0800
Subject: [PATCH 02/14] [CI][NFC] Unify naming scheme for SYCL workflows.
 (#12525)

All GitHub Actions workflows added by intel/llvm project follow similar naming notation:
1. Name starts with `sycl` prefix.
2. Use dash `-` instead of underscore `_` to separate words.
---
 .github/workflows/{sycl_aws.yml => sycl-aws.yml}                  | 0
 .github/workflows/{sycl_containers.yaml => sycl-containers.yaml}  | 0
 .../{sycl_detect_changes.yml => sycl-detect-changes.yml}          | 0
 .github/workflows/{gh_pages.yml => sycl-docs.yml}                 | 0
 .github/workflows/{sycl_linux_build.yml => sycl-linux-build.yml}  | 0
 ...ix_e2e_on_nightly.yml => sycl-linux-matrix-e2e-on-nightly.yml} | 0
 ...{sycl_linux_precommit_aws.yml => sycl-linux-precommit-aws.yml} | 0
 .../{sycl_linux_precommit.yml => sycl-linux-precommit.yml}        | 0
 .../{sycl_linux_run_tests.yml => sycl-linux-run-tests.yml}        | 0
 ...ycl_macos_build_and_test.yml => sycl-macos-build-and-test.yml} | 0
 .github/workflows/{sycl_nightly.yml => sycl-nightly.yml}          | 0
 .github/workflows/{sycl_post_commit.yml => sycl-post-commit.yml}  | 0
 .../workflows/{sycl_stale_issues.yml => sycl-stale-issues.yml}    | 0
 .github/workflows/{sync-main.yml => sycl-sync-main.yml}           | 0
 .../{sycl_update_gpu_driver.yml => sycl-update-gpu-driver.yml}    | 0
 .../workflows/{sycl_windows_build.yml => sycl-windows-build.yml}  | 0
 .../{sycl_windows_precommit.yml => sycl-windows-precommit.yml}    | 0
 .../{sycl_windows_run_tests.yml => sycl-windows-run-tests.yml}    | 0
 18 files changed, 0 insertions(+), 0 deletions(-)
 rename .github/workflows/{sycl_aws.yml => sycl-aws.yml} (100%)
 rename .github/workflows/{sycl_containers.yaml => sycl-containers.yaml} (100%)
 rename .github/workflows/{sycl_detect_changes.yml => sycl-detect-changes.yml} (100%)
 rename .github/workflows/{gh_pages.yml => sycl-docs.yml} (100%)
 rename .github/workflows/{sycl_linux_build.yml => sycl-linux-build.yml} (100%)
 rename .github/workflows/{sycl_linux_matrix_e2e_on_nightly.yml => sycl-linux-matrix-e2e-on-nightly.yml} (100%)
 rename .github/workflows/{sycl_linux_precommit_aws.yml => sycl-linux-precommit-aws.yml} (100%)
 rename .github/workflows/{sycl_linux_precommit.yml => sycl-linux-precommit.yml} (100%)
 rename .github/workflows/{sycl_linux_run_tests.yml => sycl-linux-run-tests.yml} (100%)
 rename .github/workflows/{sycl_macos_build_and_test.yml => sycl-macos-build-and-test.yml} (100%)
 rename .github/workflows/{sycl_nightly.yml => sycl-nightly.yml} (100%)
 rename .github/workflows/{sycl_post_commit.yml => sycl-post-commit.yml} (100%)
 rename .github/workflows/{sycl_stale_issues.yml => sycl-stale-issues.yml} (100%)
 rename .github/workflows/{sync-main.yml => sycl-sync-main.yml} (100%)
 rename .github/workflows/{sycl_update_gpu_driver.yml => sycl-update-gpu-driver.yml} (100%)
 rename .github/workflows/{sycl_windows_build.yml => sycl-windows-build.yml} (100%)
 rename .github/workflows/{sycl_windows_precommit.yml => sycl-windows-precommit.yml} (100%)
 rename .github/workflows/{sycl_windows_run_tests.yml => sycl-windows-run-tests.yml} (100%)

diff --git a/.github/workflows/sycl_aws.yml b/.github/workflows/sycl-aws.yml
similarity index 100%
rename from .github/workflows/sycl_aws.yml
rename to .github/workflows/sycl-aws.yml
diff --git a/.github/workflows/sycl_containers.yaml b/.github/workflows/sycl-containers.yaml
similarity index 100%
rename from .github/workflows/sycl_containers.yaml
rename to .github/workflows/sycl-containers.yaml
diff --git a/.github/workflows/sycl_detect_changes.yml b/.github/workflows/sycl-detect-changes.yml
similarity index 100%
rename from .github/workflows/sycl_detect_changes.yml
rename to .github/workflows/sycl-detect-changes.yml
diff --git a/.github/workflows/gh_pages.yml b/.github/workflows/sycl-docs.yml
similarity index 100%
rename from .github/workflows/gh_pages.yml
rename to .github/workflows/sycl-docs.yml
diff --git a/.github/workflows/sycl_linux_build.yml b/.github/workflows/sycl-linux-build.yml
similarity index 100%
rename from .github/workflows/sycl_linux_build.yml
rename to .github/workflows/sycl-linux-build.yml
diff --git a/.github/workflows/sycl_linux_matrix_e2e_on_nightly.yml b/.github/workflows/sycl-linux-matrix-e2e-on-nightly.yml
similarity index 100%
rename from .github/workflows/sycl_linux_matrix_e2e_on_nightly.yml
rename to .github/workflows/sycl-linux-matrix-e2e-on-nightly.yml
diff --git a/.github/workflows/sycl_linux_precommit_aws.yml b/.github/workflows/sycl-linux-precommit-aws.yml
similarity index 100%
rename from .github/workflows/sycl_linux_precommit_aws.yml
rename to .github/workflows/sycl-linux-precommit-aws.yml
diff --git a/.github/workflows/sycl_linux_precommit.yml b/.github/workflows/sycl-linux-precommit.yml
similarity index 100%
rename from .github/workflows/sycl_linux_precommit.yml
rename to .github/workflows/sycl-linux-precommit.yml
diff --git a/.github/workflows/sycl_linux_run_tests.yml b/.github/workflows/sycl-linux-run-tests.yml
similarity index 100%
rename from .github/workflows/sycl_linux_run_tests.yml
rename to .github/workflows/sycl-linux-run-tests.yml
diff --git a/.github/workflows/sycl_macos_build_and_test.yml b/.github/workflows/sycl-macos-build-and-test.yml
similarity index 100%
rename from .github/workflows/sycl_macos_build_and_test.yml
rename to .github/workflows/sycl-macos-build-and-test.yml
diff --git a/.github/workflows/sycl_nightly.yml b/.github/workflows/sycl-nightly.yml
similarity index 100%
rename from .github/workflows/sycl_nightly.yml
rename to .github/workflows/sycl-nightly.yml
diff --git a/.github/workflows/sycl_post_commit.yml b/.github/workflows/sycl-post-commit.yml
similarity index 100%
rename from .github/workflows/sycl_post_commit.yml
rename to .github/workflows/sycl-post-commit.yml
diff --git a/.github/workflows/sycl_stale_issues.yml b/.github/workflows/sycl-stale-issues.yml
similarity index 100%
rename from .github/workflows/sycl_stale_issues.yml
rename to .github/workflows/sycl-stale-issues.yml
diff --git a/.github/workflows/sync-main.yml b/.github/workflows/sycl-sync-main.yml
similarity index 100%
rename from .github/workflows/sync-main.yml
rename to .github/workflows/sycl-sync-main.yml
diff --git a/.github/workflows/sycl_update_gpu_driver.yml b/.github/workflows/sycl-update-gpu-driver.yml
similarity index 100%
rename from .github/workflows/sycl_update_gpu_driver.yml
rename to .github/workflows/sycl-update-gpu-driver.yml
diff --git a/.github/workflows/sycl_windows_build.yml b/.github/workflows/sycl-windows-build.yml
similarity index 100%
rename from .github/workflows/sycl_windows_build.yml
rename to .github/workflows/sycl-windows-build.yml
diff --git a/.github/workflows/sycl_windows_precommit.yml b/.github/workflows/sycl-windows-precommit.yml
similarity index 100%
rename from .github/workflows/sycl_windows_precommit.yml
rename to .github/workflows/sycl-windows-precommit.yml
diff --git a/.github/workflows/sycl_windows_run_tests.yml b/.github/workflows/sycl-windows-run-tests.yml
similarity index 100%
rename from .github/workflows/sycl_windows_run_tests.yml
rename to .github/workflows/sycl-windows-run-tests.yml

From 1b5daa810a2178dc7fa094e7962a52b88f085998 Mon Sep 17 00:00:00 2001
From: Alexey Bader <alexey.bader@intel.com>
Date: Wed, 31 Jan 2024 11:25:19 -0800
Subject: [PATCH 03/14] Revert "[CI][NFC] Unify naming scheme for SYCL
 workflows." (#12567)

Reverts intel/llvm#12525

In addition to file renaming, we need to update file names referenced
inside the workflow files.
---
 .github/workflows/{sycl-docs.yml => gh_pages.yml}                 | 0
 .github/workflows/{sycl-aws.yml => sycl_aws.yml}                  | 0
 .github/workflows/{sycl-containers.yaml => sycl_containers.yaml}  | 0
 .../{sycl-detect-changes.yml => sycl_detect_changes.yml}          | 0
 .github/workflows/{sycl-linux-build.yml => sycl_linux_build.yml}  | 0
 ...ix-e2e-on-nightly.yml => sycl_linux_matrix_e2e_on_nightly.yml} | 0
 .../{sycl-linux-precommit.yml => sycl_linux_precommit.yml}        | 0
 ...{sycl-linux-precommit-aws.yml => sycl_linux_precommit_aws.yml} | 0
 .../{sycl-linux-run-tests.yml => sycl_linux_run_tests.yml}        | 0
 ...ycl-macos-build-and-test.yml => sycl_macos_build_and_test.yml} | 0
 .github/workflows/{sycl-nightly.yml => sycl_nightly.yml}          | 0
 .github/workflows/{sycl-post-commit.yml => sycl_post_commit.yml}  | 0
 .../workflows/{sycl-stale-issues.yml => sycl_stale_issues.yml}    | 0
 .../{sycl-update-gpu-driver.yml => sycl_update_gpu_driver.yml}    | 0
 .../workflows/{sycl-windows-build.yml => sycl_windows_build.yml}  | 0
 .../{sycl-windows-precommit.yml => sycl_windows_precommit.yml}    | 0
 .../{sycl-windows-run-tests.yml => sycl_windows_run_tests.yml}    | 0
 .github/workflows/{sycl-sync-main.yml => sync-main.yml}           | 0
 18 files changed, 0 insertions(+), 0 deletions(-)
 rename .github/workflows/{sycl-docs.yml => gh_pages.yml} (100%)
 rename .github/workflows/{sycl-aws.yml => sycl_aws.yml} (100%)
 rename .github/workflows/{sycl-containers.yaml => sycl_containers.yaml} (100%)
 rename .github/workflows/{sycl-detect-changes.yml => sycl_detect_changes.yml} (100%)
 rename .github/workflows/{sycl-linux-build.yml => sycl_linux_build.yml} (100%)
 rename .github/workflows/{sycl-linux-matrix-e2e-on-nightly.yml => sycl_linux_matrix_e2e_on_nightly.yml} (100%)
 rename .github/workflows/{sycl-linux-precommit.yml => sycl_linux_precommit.yml} (100%)
 rename .github/workflows/{sycl-linux-precommit-aws.yml => sycl_linux_precommit_aws.yml} (100%)
 rename .github/workflows/{sycl-linux-run-tests.yml => sycl_linux_run_tests.yml} (100%)
 rename .github/workflows/{sycl-macos-build-and-test.yml => sycl_macos_build_and_test.yml} (100%)
 rename .github/workflows/{sycl-nightly.yml => sycl_nightly.yml} (100%)
 rename .github/workflows/{sycl-post-commit.yml => sycl_post_commit.yml} (100%)
 rename .github/workflows/{sycl-stale-issues.yml => sycl_stale_issues.yml} (100%)
 rename .github/workflows/{sycl-update-gpu-driver.yml => sycl_update_gpu_driver.yml} (100%)
 rename .github/workflows/{sycl-windows-build.yml => sycl_windows_build.yml} (100%)
 rename .github/workflows/{sycl-windows-precommit.yml => sycl_windows_precommit.yml} (100%)
 rename .github/workflows/{sycl-windows-run-tests.yml => sycl_windows_run_tests.yml} (100%)
 rename .github/workflows/{sycl-sync-main.yml => sync-main.yml} (100%)

diff --git a/.github/workflows/sycl-docs.yml b/.github/workflows/gh_pages.yml
similarity index 100%
rename from .github/workflows/sycl-docs.yml
rename to .github/workflows/gh_pages.yml
diff --git a/.github/workflows/sycl-aws.yml b/.github/workflows/sycl_aws.yml
similarity index 100%
rename from .github/workflows/sycl-aws.yml
rename to .github/workflows/sycl_aws.yml
diff --git a/.github/workflows/sycl-containers.yaml b/.github/workflows/sycl_containers.yaml
similarity index 100%
rename from .github/workflows/sycl-containers.yaml
rename to .github/workflows/sycl_containers.yaml
diff --git a/.github/workflows/sycl-detect-changes.yml b/.github/workflows/sycl_detect_changes.yml
similarity index 100%
rename from .github/workflows/sycl-detect-changes.yml
rename to .github/workflows/sycl_detect_changes.yml
diff --git a/.github/workflows/sycl-linux-build.yml b/.github/workflows/sycl_linux_build.yml
similarity index 100%
rename from .github/workflows/sycl-linux-build.yml
rename to .github/workflows/sycl_linux_build.yml
diff --git a/.github/workflows/sycl-linux-matrix-e2e-on-nightly.yml b/.github/workflows/sycl_linux_matrix_e2e_on_nightly.yml
similarity index 100%
rename from .github/workflows/sycl-linux-matrix-e2e-on-nightly.yml
rename to .github/workflows/sycl_linux_matrix_e2e_on_nightly.yml
diff --git a/.github/workflows/sycl-linux-precommit.yml b/.github/workflows/sycl_linux_precommit.yml
similarity index 100%
rename from .github/workflows/sycl-linux-precommit.yml
rename to .github/workflows/sycl_linux_precommit.yml
diff --git a/.github/workflows/sycl-linux-precommit-aws.yml b/.github/workflows/sycl_linux_precommit_aws.yml
similarity index 100%
rename from .github/workflows/sycl-linux-precommit-aws.yml
rename to .github/workflows/sycl_linux_precommit_aws.yml
diff --git a/.github/workflows/sycl-linux-run-tests.yml b/.github/workflows/sycl_linux_run_tests.yml
similarity index 100%
rename from .github/workflows/sycl-linux-run-tests.yml
rename to .github/workflows/sycl_linux_run_tests.yml
diff --git a/.github/workflows/sycl-macos-build-and-test.yml b/.github/workflows/sycl_macos_build_and_test.yml
similarity index 100%
rename from .github/workflows/sycl-macos-build-and-test.yml
rename to .github/workflows/sycl_macos_build_and_test.yml
diff --git a/.github/workflows/sycl-nightly.yml b/.github/workflows/sycl_nightly.yml
similarity index 100%
rename from .github/workflows/sycl-nightly.yml
rename to .github/workflows/sycl_nightly.yml
diff --git a/.github/workflows/sycl-post-commit.yml b/.github/workflows/sycl_post_commit.yml
similarity index 100%
rename from .github/workflows/sycl-post-commit.yml
rename to .github/workflows/sycl_post_commit.yml
diff --git a/.github/workflows/sycl-stale-issues.yml b/.github/workflows/sycl_stale_issues.yml
similarity index 100%
rename from .github/workflows/sycl-stale-issues.yml
rename to .github/workflows/sycl_stale_issues.yml
diff --git a/.github/workflows/sycl-update-gpu-driver.yml b/.github/workflows/sycl_update_gpu_driver.yml
similarity index 100%
rename from .github/workflows/sycl-update-gpu-driver.yml
rename to .github/workflows/sycl_update_gpu_driver.yml
diff --git a/.github/workflows/sycl-windows-build.yml b/.github/workflows/sycl_windows_build.yml
similarity index 100%
rename from .github/workflows/sycl-windows-build.yml
rename to .github/workflows/sycl_windows_build.yml
diff --git a/.github/workflows/sycl-windows-precommit.yml b/.github/workflows/sycl_windows_precommit.yml
similarity index 100%
rename from .github/workflows/sycl-windows-precommit.yml
rename to .github/workflows/sycl_windows_precommit.yml
diff --git a/.github/workflows/sycl-windows-run-tests.yml b/.github/workflows/sycl_windows_run_tests.yml
similarity index 100%
rename from .github/workflows/sycl-windows-run-tests.yml
rename to .github/workflows/sycl_windows_run_tests.yml
diff --git a/.github/workflows/sycl-sync-main.yml b/.github/workflows/sync-main.yml
similarity index 100%
rename from .github/workflows/sycl-sync-main.yml
rename to .github/workflows/sync-main.yml

From 7348207426ef239650af8e3a4b1ebf8a088f9c51 Mon Sep 17 00:00:00 2001
From: Nick Sarnie <sarnex@users.noreply.github.com>
Date: Wed, 31 Jan 2024 19:28:39 +0000
Subject: [PATCH 04/14] [SYCL][ESIMD][E2E] Disable two LSC tests on DG2
 (#12565)

They started failing in the recent driver update. I can't reproduce it
locally with the same driver version but the hardware we have is a
little different, maybe that's why. I made an internal tracker for this.

Signed-off-by: Sarnie, Nick <nick.sarnie@intel.com>
---
 sycl/test-e2e/ESIMD/lsc/local_accessor_atomic_smoke_cmpxchg.cpp | 2 +-
 sycl/test-e2e/ESIMD/lsc/lsc_slm_atomic_smoke.cpp                | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/sycl/test-e2e/ESIMD/lsc/local_accessor_atomic_smoke_cmpxchg.cpp b/sycl/test-e2e/ESIMD/lsc/local_accessor_atomic_smoke_cmpxchg.cpp
index 77f6a6c4122d9..83bb92eca5440 100644
--- a/sycl/test-e2e/ESIMD/lsc/local_accessor_atomic_smoke_cmpxchg.cpp
+++ b/sycl/test-e2e/ESIMD/lsc/local_accessor_atomic_smoke_cmpxchg.cpp
@@ -7,7 +7,7 @@
 //===----------------------------------------------------------------------===//
 // This test checks local accessor cmpxchg atomic operations.
 //===----------------------------------------------------------------------===//
-// REQUIRES: gpu-intel-pvc || gpu-intel-dg2
+// REQUIRES: gpu-intel-pvc
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 //
diff --git a/sycl/test-e2e/ESIMD/lsc/lsc_slm_atomic_smoke.cpp b/sycl/test-e2e/ESIMD/lsc/lsc_slm_atomic_smoke.cpp
index 25bce1b2b2244..4fde6446bdff1 100644
--- a/sycl/test-e2e/ESIMD/lsc/lsc_slm_atomic_smoke.cpp
+++ b/sycl/test-e2e/ESIMD/lsc/lsc_slm_atomic_smoke.cpp
@@ -7,7 +7,7 @@
 //===----------------------------------------------------------------------===//
 // This test checks LSC SLM atomic operations.
 //===----------------------------------------------------------------------===//
-// REQUIRES: gpu-intel-pvc || gpu-intel-dg2
+// REQUIRES: gpu-intel-pvc
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 

From b478d2fac950e88c0049844418b5daedc1c9df89 Mon Sep 17 00:00:00 2001
From: aelovikov-intel <andrei.elovikov@intel.com>
Date: Wed, 31 Jan 2024 12:08:09 -0800
Subject: [PATCH 05/14] [SYCL] Fix resource leak related to
 SYCL_FALLBACK_ASSERT (#12532)

https://github.com/intel/llvm/pull/6837 enabled asynchronous buffer
destruction for buffers constructed without host data. However, initial
fallback assert implementation in
https://github.com/intel/llvm/pull/3767 predates it and as such had to
place the buffer inside `queue_impl` to avoid unintended synchronization
point. I don't know if there was the same crash observed on the
end-to-end test added as part of this PR prior to
https://github.com/intel/llvm/pull/3767, but it doesn't even matter
because the "new" implementation is both simpler and doesn't result in a
crash.

I suspect that without it (with the buffer for fallback assert
implementation being a data member of `sycl::queue_impl`) we had a
cyclic dependency somewhere leading to resource leak and ultimately to
the assert in `DeviceGlobalUSMMem::~DeviceGlobalUSMMem()`.
---
 sycl/include/sycl/queue.hpp                  |  6 ++--
 sycl/source/detail/queue_impl.hpp            | 13 +++++++-
 sycl/source/queue.cpp                        |  2 ++
 sycl/test-e2e/Assert/check_resource_leak.cpp | 32 ++++++++++++++++++++
 4 files changed, 49 insertions(+), 4 deletions(-)
 create mode 100644 sycl/test-e2e/Assert/check_resource_leak.cpp

diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp
index 4706639127022..b693cbaae62c8 100644
--- a/sycl/include/sycl/queue.hpp
+++ b/sycl/include/sycl/queue.hpp
@@ -2965,7 +2965,9 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase<queue> {
         Rest...);
   }
 
+#ifndef __INTEL_PREVIEW_BREAKING_CHANGES
   buffer<detail::AssertHappened, 1> &getAssertHappenedBuffer();
+#endif
 
   event memcpyToDeviceGlobal(void *DeviceGlobalPtr, const void *Src,
                              bool IsDeviceImageScope, size_t NumBytes,
@@ -3019,9 +3021,7 @@ class AssertInfoCopier;
  */
 event submitAssertCapture(queue &Self, event &Event, queue *SecondaryQueue,
                           const detail::code_location &CodeLoc) {
-  using AHBufT = buffer<detail::AssertHappened, 1>;
-
-  AHBufT &Buffer = Self.getAssertHappenedBuffer();
+  buffer<detail::AssertHappened, 1> Buffer{1};
 
   event CopierEv, CheckerEv, PostCheckerEv;
   auto CopierCGF = [&](handler &CGH) {
diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp
index 810f991d6667f..ddd6a71d7db80 100644
--- a/sycl/source/detail/queue_impl.hpp
+++ b/sycl/source/detail/queue_impl.hpp
@@ -108,7 +108,9 @@ class queue_impl {
              const async_handler &AsyncHandler, const property_list &PropList)
       : MDevice(Device), MContext(Context), MAsyncHandler(AsyncHandler),
         MPropList(PropList), MHostQueue(MDevice->is_host()),
+#ifndef __INTEL_PREVIEW_BREAKING_CHANGES
         MAssertHappenedBuffer(range<1>{1}),
+#endif
         MIsInorder(has_property<property::queue::in_order>()),
         MDiscardEvents(
             has_property<ext::oneapi::property::queue::discard_events>()),
@@ -283,7 +285,9 @@ class queue_impl {
   queue_impl(sycl::detail::pi::PiQueue PiQueue, const ContextImplPtr &Context,
              const async_handler &AsyncHandler)
       : MContext(Context), MAsyncHandler(AsyncHandler), MHostQueue(false),
+#ifndef __INTEL_PREVIEW_BREAKING_CHANGES
         MAssertHappenedBuffer(range<1>{1}),
+#endif
         MIsInorder(has_property<property::queue::in_order>()),
         MDiscardEvents(
             has_property<ext::oneapi::property::queue::discard_events>()),
@@ -305,7 +309,10 @@ class queue_impl {
   queue_impl(sycl::detail::pi::PiQueue PiQueue, const ContextImplPtr &Context,
              const async_handler &AsyncHandler, const property_list &PropList)
       : MContext(Context), MAsyncHandler(AsyncHandler), MPropList(PropList),
-        MHostQueue(false), MAssertHappenedBuffer(range<1>{1}),
+        MHostQueue(false),
+#ifndef __INTEL_PREVIEW_BREAKING_CHANGES
+        MAssertHappenedBuffer(range<1>{1}),
+#endif
         MIsInorder(has_property<property::queue::in_order>()),
         MDiscardEvents(
             has_property<ext::oneapi::property::queue::discard_events>()),
@@ -673,9 +680,11 @@ class queue_impl {
   /// \return a native handle.
   pi_native_handle getNative(int32_t &NativeHandleDesc) const;
 
+#ifndef __INTEL_PREVIEW_BREAKING_CHANGES
   buffer<AssertHappened, 1> &getAssertHappenedBuffer() {
     return MAssertHappenedBuffer;
   }
+#endif
 
   void registerStreamServiceEvent(const EventImplPtr &Event) {
     std::lock_guard<std::mutex> Lock(MMutex);
@@ -918,8 +927,10 @@ class queue_impl {
   /// need to emulate it with multiple native in-order queues.
   bool MEmulateOOO = false;
 
+#ifndef __INTEL_PREVIEW_BREAKING_CHANGES
   // Buffer to store assert failure descriptor
   buffer<AssertHappened, 1> MAssertHappenedBuffer;
+#endif
 
   // This event is employed for enhanced dependency tracking with in-order queue
   // Access to the event should be guarded with MMutex
diff --git a/sycl/source/queue.cpp b/sycl/source/queue.cpp
index 47b6b29b89bab..1b877a31da4e0 100644
--- a/sycl/source/queue.cpp
+++ b/sycl/source/queue.cpp
@@ -298,9 +298,11 @@ pi_native_handle queue::getNative(int32_t &NativeHandleDesc) const {
   return impl->getNative(NativeHandleDesc);
 }
 
+#ifndef __INTEL_PREVIEW_BREAKING_CHANGES
 buffer<detail::AssertHappened, 1> &queue::getAssertHappenedBuffer() {
   return impl->getAssertHappenedBuffer();
 }
+#endif
 
 event queue::memcpyToDeviceGlobal(void *DeviceGlobalPtr, const void *Src,
                                   bool IsDeviceImageScope, size_t NumBytes,
diff --git a/sycl/test-e2e/Assert/check_resource_leak.cpp b/sycl/test-e2e/Assert/check_resource_leak.cpp
new file mode 100644
index 0000000000000..252d2ed9e0c49
--- /dev/null
+++ b/sycl/test-e2e/Assert/check_resource_leak.cpp
@@ -0,0 +1,32 @@
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+
+// Device globals aren't supported on opencl:gpu yet.
+// UNSUPPORTED: opencl && gpu
+
+// TODO: Fails at JIT compilation for some reason.
+// UNSUPPORTED: hip
+#define SYCL_FALLBACK_ASSERT 1
+
+#include <sycl/sycl.hpp>
+
+// DeviceGlobalUSMMem::~DeviceGlobalUSMMem() has asserts to ensure some
+// resources have been cleaned up when it's executed. Those asserts used to fail
+// when "AssertHappened" buffer used in fallback implementation of the device
+// assert was a data member of the queue_impl.
+sycl::ext::oneapi::experimental::device_global<int32_t> dg;
+
+int main() {
+  sycl::queue q;
+  q.submit([&](sycl::handler &cgh) {
+     sycl::range<1> R{16};
+     cgh.parallel_for(sycl::nd_range<1>{R, R}, [=](sycl::nd_item<1> ndi) {
+       if (ndi.get_global_linear_id() == 0)
+         dg.get() = 42;
+       auto sg = sycl::ext::oneapi::experimental::this_sub_group();
+       auto active = sycl::ext::oneapi::group_ballot(sg, 1);
+     });
+   }).wait();
+
+  return 0;
+}

From 85e461e5688dae634b3c2f1026f525cffd0182b3 Mon Sep 17 00:00:00 2001
From: aelovikov-intel <andrei.elovikov@intel.com>
Date: Wed, 31 Jan 2024 12:23:27 -0800
Subject: [PATCH 06/14] [SYCL][E2E] Disable USM/usm_pooling.cpp on
 gpu-intel-dg2 (#12564)

See https://github.com/intel/llvm/issues/12397, the test is flaky in
post-commit.
---
 sycl/test-e2e/USM/usm_pooling.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/sycl/test-e2e/USM/usm_pooling.cpp b/sycl/test-e2e/USM/usm_pooling.cpp
index 4a9d16ec5a34e..2f2d4009dce1e 100644
--- a/sycl/test-e2e/USM/usm_pooling.cpp
+++ b/sycl/test-e2e/USM/usm_pooling.cpp
@@ -1,6 +1,9 @@
 // REQUIRES: level_zero
 // RUN: %{build} -o %t.out
 
+// https://github.com/intel/llvm/issues/12397
+// UNSUPPORTED: gpu-intel-dg2
+
 // Allocate 2 items of 2MB. Free 2. Allocate 3 more of 2MB.
 
 // With no pooling: 1,2,3,4,5 allocs lead to ZE call.

From 435845bc23f68ad146915018aa5dd5b451d8bc5e Mon Sep 17 00:00:00 2001
From: Alexey Bader <alexey.bader@intel.com>
Date: Wed, 31 Jan 2024 15:13:43 -0800
Subject: [PATCH 07/14] [CI][NFC] Unify naming scheme for SYCL workflows.
 (#12568)

All GitHub Actions workflows added by intel/llvm project are expected to
use following naming notation:

1. Name starts with `sycl` prefix.
2. Use dash `-` to separate words (instead of underscore `_`).

This patches fixes naming of workflows which do not follow this
notation.
---
 .../workflows/{sycl_aws.yml => sycl-aws.yml}   |  0
 ...cl_containers.yaml => sycl-containers.yaml} |  4 ++--
 ...ect_changes.yml => sycl-detect-changes.yml} |  0
 .../workflows/{gh_pages.yml => sycl-docs.yml}  |  2 +-
 ...cl_linux_build.yml => sycl-linux-build.yml} |  0
 ...ml => sycl-linux-matrix-e2e-on-nightly.yml} |  8 ++++----
 ...it_aws.yml => sycl-linux-precommit-aws.yml} |  2 +-
 ..._precommit.yml => sycl-linux-precommit.yml} | 12 ++++++------
 ..._run_tests.yml => sycl-linux-run-tests.yml} |  0
 ..._test.yml => sycl-macos-build-and-test.yml} |  0
 .../{sycl_nightly.yml => sycl-nightly.yml}     |  8 ++++----
 ...cl_post_commit.yml => sycl-post-commit.yml} | 18 +++++++++---------
 ..._stale_issues.yml => sycl-stale-issues.yml} |  0
 .../{sync-main.yml => sycl-sync-main.yml}      |  0
 ...u_driver.yml => sycl-update-gpu-driver.yml} |  0
 ...indows_build.yml => sycl-windows-build.yml} |  0
 ...recommit.yml => sycl-windows-precommit.yml} | 12 ++++++------
 ...un_tests.yml => sycl-windows-run-tests.yml} |  0
 README.md                                      |  4 ++--
 19 files changed, 35 insertions(+), 35 deletions(-)
 rename .github/workflows/{sycl_aws.yml => sycl-aws.yml} (100%)
 rename .github/workflows/{sycl_containers.yaml => sycl-containers.yaml} (98%)
 rename .github/workflows/{sycl_detect_changes.yml => sycl-detect-changes.yml} (100%)
 rename .github/workflows/{gh_pages.yml => sycl-docs.yml} (97%)
 rename .github/workflows/{sycl_linux_build.yml => sycl-linux-build.yml} (100%)
 rename .github/workflows/{sycl_linux_matrix_e2e_on_nightly.yml => sycl-linux-matrix-e2e-on-nightly.yml} (94%)
 rename .github/workflows/{sycl_linux_precommit_aws.yml => sycl-linux-precommit-aws.yml} (98%)
 rename .github/workflows/{sycl_linux_precommit.yml => sycl-linux-precommit.yml} (94%)
 rename .github/workflows/{sycl_linux_run_tests.yml => sycl-linux-run-tests.yml} (100%)
 rename .github/workflows/{sycl_macos_build_and_test.yml => sycl-macos-build-and-test.yml} (100%)
 rename .github/workflows/{sycl_nightly.yml => sycl-nightly.yml} (96%)
 rename .github/workflows/{sycl_post_commit.yml => sycl-post-commit.yml} (88%)
 rename .github/workflows/{sycl_stale_issues.yml => sycl-stale-issues.yml} (100%)
 rename .github/workflows/{sync-main.yml => sycl-sync-main.yml} (100%)
 rename .github/workflows/{sycl_update_gpu_driver.yml => sycl-update-gpu-driver.yml} (100%)
 rename .github/workflows/{sycl_windows_build.yml => sycl-windows-build.yml} (100%)
 rename .github/workflows/{sycl_windows_precommit.yml => sycl-windows-precommit.yml} (80%)
 rename .github/workflows/{sycl_windows_run_tests.yml => sycl-windows-run-tests.yml} (100%)

diff --git a/.github/workflows/sycl_aws.yml b/.github/workflows/sycl-aws.yml
similarity index 100%
rename from .github/workflows/sycl_aws.yml
rename to .github/workflows/sycl-aws.yml
diff --git a/.github/workflows/sycl_containers.yaml b/.github/workflows/sycl-containers.yaml
similarity index 98%
rename from .github/workflows/sycl_containers.yaml
rename to .github/workflows/sycl-containers.yaml
index 011d45ee9fee5..bb2e086929847 100644
--- a/.github/workflows/sycl_containers.yaml
+++ b/.github/workflows/sycl-containers.yaml
@@ -12,14 +12,14 @@ on:
       - 'devops/dependencies.json'
       - 'devops/scripts/install_drivers.sh'
       - 'devops/scripts/install_build_tools.sh'
-      - '.github/workflows/sycl_containers.yaml'
+      - '.github/workflows/sycl-containers.yaml'
   pull_request:
     paths:
       - 'devops/containers/**'
       - 'devops/dependencies.json'
       - 'devops/scripts/install_drivers.sh'
       - 'devops/scripts/install_build_tools.sh'
-      - '.github/workflows/sycl_containers.yaml'
+      - '.github/workflows/sycl-containers.yaml'
 
 jobs:
   base_image_ubuntu2204:
diff --git a/.github/workflows/sycl_detect_changes.yml b/.github/workflows/sycl-detect-changes.yml
similarity index 100%
rename from .github/workflows/sycl_detect_changes.yml
rename to .github/workflows/sycl-detect-changes.yml
diff --git a/.github/workflows/gh_pages.yml b/.github/workflows/sycl-docs.yml
similarity index 97%
rename from .github/workflows/gh_pages.yml
rename to .github/workflows/sycl-docs.yml
index d0d4a130d6f78..73062642535cf 100644
--- a/.github/workflows/gh_pages.yml
+++ b/.github/workflows/sycl-docs.yml
@@ -7,7 +7,7 @@ on:
     branches:
     - sycl
     paths:
-    - '.github/workflows/gh_pages.yml'
+    - '.github/workflows/sycl-docs.yml'
     - 'clang/docs/**'
     - 'sycl/doc/**'
 
diff --git a/.github/workflows/sycl_linux_build.yml b/.github/workflows/sycl-linux-build.yml
similarity index 100%
rename from .github/workflows/sycl_linux_build.yml
rename to .github/workflows/sycl-linux-build.yml
diff --git a/.github/workflows/sycl_linux_matrix_e2e_on_nightly.yml b/.github/workflows/sycl-linux-matrix-e2e-on-nightly.yml
similarity index 94%
rename from .github/workflows/sycl_linux_matrix_e2e_on_nightly.yml
rename to .github/workflows/sycl-linux-matrix-e2e-on-nightly.yml
index 50dc1ca700035..06a5f13674768 100644
--- a/.github/workflows/sycl_linux_matrix_e2e_on_nightly.yml
+++ b/.github/workflows/sycl-linux-matrix-e2e-on-nightly.yml
@@ -48,7 +48,7 @@ jobs:
             image: ghcr.io/intel/llvm/sycl_ubuntu2204_nightly:build
             image_options: -u 1001 --gpus all --cap-add SYS_ADMIN
             target_devices: ext_oneapi_cuda:gpu
-    uses: ./.github/workflows/sycl_linux_run_tests.yml
+    uses: ./.github/workflows/sycl-linux-run-tests.yml
     with:
       name: ${{ matrix.name }}
       runner: ${{ matrix. runner }}
@@ -62,7 +62,7 @@ jobs:
 
   aws_start:
     name: AWS Start
-    uses: ./.github/workflows/sycl_aws.yml
+    uses: ./.github/workflows/sycl-aws.yml
     secrets: inherit
     with:
       mode: start
@@ -71,7 +71,7 @@ jobs:
   linux_e2e_on_nightly_aws:
     name: '[AWS][CUDA] E2E on Nightly'
     needs: [aws_start]
-    uses: ./.github/workflows/sycl_linux_run_tests.yml
+    uses: ./.github/workflows/sycl-linux-run-tests.yml
     with:
       name: CUDA
       runner: '["aws-cuda_${{ github.run_id }}-${{ github.run_attempt }}"]'
@@ -86,7 +86,7 @@ jobs:
     name: AWS Stop
     needs: [aws_start, linux_e2e_on_nightly_aws]
     if: always()
-    uses: ./.github/workflows/sycl_aws.yml
+    uses: ./.github/workflows/sycl-aws.yml
     secrets: inherit
     with:
       mode: stop
diff --git a/.github/workflows/sycl_linux_precommit_aws.yml b/.github/workflows/sycl-linux-precommit-aws.yml
similarity index 98%
rename from .github/workflows/sycl_linux_precommit_aws.yml
rename to .github/workflows/sycl-linux-precommit-aws.yml
index 3fce40107d90e..8ff68e725e447 100644
--- a/.github/workflows/sycl_linux_precommit_aws.yml
+++ b/.github/workflows/sycl-linux-precommit-aws.yml
@@ -54,7 +54,7 @@ jobs:
 
   e2e-cuda:
     needs: [aws-start]
-    uses: ./.github/workflows/sycl_linux_run_tests.yml
+    uses: ./.github/workflows/sycl-linux-run-tests.yml
     with:
       name: CUDA E2E
       runner: '["aws_cuda-${{ github.event.workflow_run.id }}-${{ github.event.workflow_run.run_attempt }}"]'
diff --git a/.github/workflows/sycl_linux_precommit.yml b/.github/workflows/sycl-linux-precommit.yml
similarity index 94%
rename from .github/workflows/sycl_linux_precommit.yml
rename to .github/workflows/sycl-linux-precommit.yml
index 8df5237d63d04..eb3f30aebcfe6 100644
--- a/.github/workflows/sycl_linux_precommit.yml
+++ b/.github/workflows/sycl-linux-precommit.yml
@@ -18,8 +18,8 @@ on:
     - 'clang/docs/**'
     - '**.md'
     - '**.rst'
-    - '.github/workflows/sycl_windows_*.yml'
-    - '.github/workflows/sycl_macos_*.yml'
+    - '.github/workflows/sycl-windows-*.yml'
+    - '.github/workflows/sycl-macos-*.yml'
     - 'devops/containers/**'
     - 'devops/actions/build_container/**'
 
@@ -30,12 +30,12 @@ concurrency:
 
 jobs:
   detect_changes:
-    uses: ./.github/workflows/sycl_detect_changes.yml
+    uses: ./.github/workflows/sycl-detect-changes.yml
 
   build:
     needs: [detect_changes]
     if: always() && success()
-    uses: ./.github/workflows/sycl_linux_build.yml
+    uses: ./.github/workflows/sycl-linux-build.yml
     with:
       build_ref: ${{ github.sha }}
       merge_ref: ''
@@ -75,7 +75,7 @@ jobs:
             install_drivers: ${{ contains(needs.detect_changes.outputs.filters, 'drivers') }}
             extra_lit_opts: --param matrix-xmx8=True --param gpu-intel-dg2=True
             env: '{"LIT_FILTER":"Matrix/"}'
-    uses: ./.github/workflows/sycl_linux_run_tests.yml
+    uses: ./.github/workflows/sycl-linux-run-tests.yml
     with:
       name: ${{ matrix.name }}
       runner: ${{ matrix. runner }}
@@ -111,7 +111,7 @@ jobs:
             runner: '["Linux", "gen12"]'
           - name: Perf tests on Intel Arc A-Series Graphics system
             runner: '["Linux", "arc"]'
-    uses: ./.github/workflows/sycl_linux_run_tests.yml
+    uses: ./.github/workflows/sycl-linux-run-tests.yml
     with:
       name: ${{ matrix.name }}
       runner: ${{ matrix. runner }}
diff --git a/.github/workflows/sycl_linux_run_tests.yml b/.github/workflows/sycl-linux-run-tests.yml
similarity index 100%
rename from .github/workflows/sycl_linux_run_tests.yml
rename to .github/workflows/sycl-linux-run-tests.yml
diff --git a/.github/workflows/sycl_macos_build_and_test.yml b/.github/workflows/sycl-macos-build-and-test.yml
similarity index 100%
rename from .github/workflows/sycl_macos_build_and_test.yml
rename to .github/workflows/sycl-macos-build-and-test.yml
diff --git a/.github/workflows/sycl_nightly.yml b/.github/workflows/sycl-nightly.yml
similarity index 96%
rename from .github/workflows/sycl_nightly.yml
rename to .github/workflows/sycl-nightly.yml
index e802af89a29b4..28df4a8b8de13 100644
--- a/.github/workflows/sycl_nightly.yml
+++ b/.github/workflows/sycl-nightly.yml
@@ -8,7 +8,7 @@ on:
 jobs:
   ubuntu2204_build:
     if: github.repository == 'intel/llvm'
-    uses: ./.github/workflows/sycl_linux_build.yml
+    uses: ./.github/workflows/sycl-linux-build.yml
     secrets: inherit
     with:
       build_cache_root: "/__w/"
@@ -59,7 +59,7 @@ jobs:
             image: ghcr.io/intel/llvm/ubuntu2204_build:latest
             image_options: -u 1001 --gpus all --cap-add SYS_ADMIN
             target_devices: ext_oneapi_cuda:gpu
-    uses: ./.github/workflows/sycl_linux_run_tests.yml
+    uses: ./.github/workflows/sycl-linux-run-tests.yml
     with:
       name: ${{ matrix.name }}
       runner: ${{ matrix.runner }}
@@ -74,7 +74,7 @@ jobs:
       sycl_toolchain_decompress_command: ${{ needs.ubuntu2204_build.outputs.artifact_decompress_command }}
 
   build-win:
-    uses: ./.github/workflows/sycl_windows_build.yml
+    uses: ./.github/workflows/sycl-windows-build.yml
     if: github.repository == 'intel/llvm'
     with:
       retention-days: 90
@@ -89,7 +89,7 @@ jobs:
       always()
       && !cancelled()
       && needs.build.outputs.build_conclusion == 'success'
-    uses: ./.github/workflows/sycl_windows_run_tests.yml
+    uses: ./.github/workflows/sycl-windows-run-tests.yml
     with:
       name: Intel GEN12 Graphics with Level Zero
       runner: '["Windows","gen12"]'
diff --git a/.github/workflows/sycl_post_commit.yml b/.github/workflows/sycl-post-commit.yml
similarity index 88%
rename from .github/workflows/sycl_post_commit.yml
rename to .github/workflows/sycl-post-commit.yml
index 76dd3ac88e3d2..00205a52b03e1 100644
--- a/.github/workflows/sycl_post_commit.yml
+++ b/.github/workflows/sycl-post-commit.yml
@@ -12,10 +12,10 @@ on:
     - sycl
     - sycl-devops-pr/**
     paths:
-    - .github/workflows/sycl_post_commit.yml
-    - .github/workflows/sycl_linux_build.yml
-    - .github/workflows/sycl_linux_run_tests.yml
-    - .github/workflows/sycl_macos_build_and_test.yml
+    - .github/workflows/sycl-post-commit.yml
+    - .github/workflows/sycl-linux-build.yml
+    - .github/workflows/sycl-linux-run-tests.yml
+    - .github/workflows/sycl-macos-build-and-test.yml
     - ./devops/actions/cleanup
     - ./devops/actions/cached_checkout
 
@@ -23,7 +23,7 @@ jobs:
   build-lin:
     name: Linux (Self build + shared libraries + no-assertions)
     if: github.repository == 'intel/llvm'
-    uses: ./.github/workflows/sycl_linux_build.yml
+    uses: ./.github/workflows/sycl-linux-build.yml
     with:
       build_cache_root: "/__w/llvm"
       build_cache_suffix: sprod_shared
@@ -63,7 +63,7 @@ jobs:
             env: '{"LIT_FILTER":"PerformanceTests/"}'
             extra_lit_opts: -a -j 1 --param enable-perf-tests=True
             target_devices: all
-    uses: ./.github/workflows/sycl_linux_run_tests.yml
+    uses: ./.github/workflows/sycl-linux-run-tests.yml
     with:
       name: ${{ matrix.name }}
       runner: ${{ matrix. runner }}
@@ -87,7 +87,7 @@ jobs:
       always()
       && success()
       && github.repository == 'intel/llvm'
-    uses: ./.github/workflows/sycl_windows_build.yml
+    uses: ./.github/workflows/sycl-windows-build.yml
 
   e2e-win:
     needs: build-win
@@ -96,7 +96,7 @@ jobs:
       always()
       && !cancelled()
       && needs.build-win.outputs.build_conclusion == 'success'
-    uses: ./.github/workflows/sycl_windows_run_tests.yml
+    uses: ./.github/workflows/sycl-windows-run-tests.yml
     with:
       name: Intel GEN12 Graphics with Level Zero
       runner: '["Windows","gen12"]'
@@ -105,4 +105,4 @@ jobs:
   macos_default:
     name: macOS
     if: github.repository == 'intel/llvm'
-    uses: ./.github/workflows/sycl_macos_build_and_test.yml
+    uses: ./.github/workflows/sycl-macos-build-and-test.yml
diff --git a/.github/workflows/sycl_stale_issues.yml b/.github/workflows/sycl-stale-issues.yml
similarity index 100%
rename from .github/workflows/sycl_stale_issues.yml
rename to .github/workflows/sycl-stale-issues.yml
diff --git a/.github/workflows/sync-main.yml b/.github/workflows/sycl-sync-main.yml
similarity index 100%
rename from .github/workflows/sync-main.yml
rename to .github/workflows/sycl-sync-main.yml
diff --git a/.github/workflows/sycl_update_gpu_driver.yml b/.github/workflows/sycl-update-gpu-driver.yml
similarity index 100%
rename from .github/workflows/sycl_update_gpu_driver.yml
rename to .github/workflows/sycl-update-gpu-driver.yml
diff --git a/.github/workflows/sycl_windows_build.yml b/.github/workflows/sycl-windows-build.yml
similarity index 100%
rename from .github/workflows/sycl_windows_build.yml
rename to .github/workflows/sycl-windows-build.yml
diff --git a/.github/workflows/sycl_windows_precommit.yml b/.github/workflows/sycl-windows-precommit.yml
similarity index 80%
rename from .github/workflows/sycl_windows_precommit.yml
rename to .github/workflows/sycl-windows-precommit.yml
index 2b7883895d97c..3b96b463560a9 100644
--- a/.github/workflows/sycl_windows_precommit.yml
+++ b/.github/workflows/sycl-windows-precommit.yml
@@ -16,9 +16,9 @@ on:
     - 'clang/docs/**'
     - '**.md'
     - '**.rst'
-    - '.github/workflows/sycl_linux_*.yml'
-    - '.github/workflows/sycl_precommit_aws.yml'
-    - '.github/workflows/sycl_macos_*.yml'
+    - '.github/workflows/sycl-linux-*.yml'
+    - '.github/workflows/sycl-precommit-aws.yml'
+    - '.github/workflows/sycl-macos-*.yml'
     - 'devops/containers/**'
     - 'devops/actions/build_container/**'
 
@@ -32,14 +32,14 @@ concurrency:
 
 jobs:
   detect_changes:
-    uses: ./.github/workflows/sycl_detect_changes.yml
+    uses: ./.github/workflows/sycl-detect-changes.yml
 
   build:
     needs: [detect_changes]
     if: |
       always() && success()
       && github.repository == 'intel/llvm'
-    uses: ./.github/workflows/sycl_windows_build.yml
+    uses: ./.github/workflows/sycl-windows-build.yml
     with:
       changes: ${{ needs.detect_changes.outputs.filters }}
 
@@ -50,7 +50,7 @@ jobs:
       always()
       && !cancelled()
       && needs.build.outputs.build_conclusion == 'success'
-    uses: ./.github/workflows/sycl_windows_run_tests.yml
+    uses: ./.github/workflows/sycl-windows-run-tests.yml
     with:
       name: Intel GEN12 Graphics with Level Zero
       runner: '["Windows","gen12"]'
diff --git a/.github/workflows/sycl_windows_run_tests.yml b/.github/workflows/sycl-windows-run-tests.yml
similarity index 100%
rename from .github/workflows/sycl_windows_run_tests.yml
rename to .github/workflows/sycl-windows-run-tests.yml
diff --git a/README.md b/README.md
index bf72abab3e686..8ab5cda1621bb 100644
--- a/README.md
+++ b/README.md
@@ -10,8 +10,8 @@ For general contribution process see [CONTRIBUTING.md](./CONTRIBUTING.md)
 
 [![oneAPI logo](https://spec.oneapi.io/oneapi-logo-white-scaled.jpg)](https://www.oneapi.io/)
 
-[![SYCL Post Commit](https://github.com/intel/llvm/actions/workflows/sycl_post_commit.yml/badge.svg?branch=sycl)](https://github.com/intel/llvm/actions/workflows/sycl_post_commit.yml)
-[![Generate Doxygen documentation](https://github.com/intel/llvm/actions/workflows/gh_pages.yml/badge.svg?branch=sycl)](https://github.com/intel/llvm/actions/workflows/gh_pages.yml)
+[![SYCL Post Commit](https://github.com/intel/llvm/actions/workflows/sycl-post-commit.yml/badge.svg?branch=sycl)](https://github.com/intel/llvm/actions/workflows/sycl-post-commit.yml)
+[![Generate Doxygen documentation](https://github.com/intel/llvm/actions/workflows/sycl-docs.yml/badge.svg?branch=sycl)](https://github.com/intel/llvm/actions/workflows/sycl-docs.yml)
 
 The DPC++ is a LLVM-based compiler project that implements compiler and runtime
 support for the SYCL\* language. The project is hosted in the

From af448b06107a41b409987b36a7d1bd6f6bb50411 Mon Sep 17 00:00:00 2001
From: Victor Perez <victor.perez@codeplay.com>
Date: Thu, 1 Feb 2024 12:43:49 +0100
Subject: [PATCH 08/14] [SYCL][Fusion] Handle fusion leading to synchronization
 issues  (#12538)

Do not allow fusion when one of the kernels has an explicit local size
and it requires ID remapping, i.e., it has a different number of
dimensions w.r.t. the fused ND-range or different global size in
dimensions [2, N). In this case, two work-items belonging to the same
work-group may not belong to the same work-group in the fused ND-range.

Signed-off-by: Victor Perez <victor.perez@codeplay.com>

---------

Signed-off-by: Victor Perez <victor.perez@codeplay.com>
---
 sycl-fusion/common/lib/NDRangesHelper.cpp   | 12 ++++++++
 sycl/doc/design/KernelFusionJIT.md          |  3 +-
 sycl/test-e2e/KernelFusion/abort_fusion.cpp | 33 +++++++++++++++++----
 3 files changed, 41 insertions(+), 7 deletions(-)

diff --git a/sycl-fusion/common/lib/NDRangesHelper.cpp b/sycl-fusion/common/lib/NDRangesHelper.cpp
index 96f26d96a4ea5..7c418022ef4ee 100644
--- a/sycl-fusion/common/lib/NDRangesHelper.cpp
+++ b/sycl-fusion/common/lib/NDRangesHelper.cpp
@@ -170,6 +170,18 @@ jit_compiler::FusedNDRange::get(ArrayRef<NDRange> NDRanges) {
                                "Cannot fuse kernels whose fusion would "
                                "yield non-uniform work-group sizes");
     }
+
+    // Work-items in the same work-group in the original ND-ranges must be in
+    // the same work-group in the fused one.
+    if (LocalSize && any_of(NDRanges, [&Fused](const NDRange &NDR) {
+          return NDR.hasSpecificLocalSize() && requireIDRemapping(Fused, NDR);
+        })) {
+      return createStringError(
+          inconvertibleErrorCode(),
+          "Cannot fuse kernels when any of the fused kernels with a specific "
+          "local size has different global sizes in dimensions [2, N) or "
+          "different number of dimensions");
+    }
   }
 
   return FusedNDRange{Fused, IsHeterogeneousList, NDRanges};
diff --git a/sycl/doc/design/KernelFusionJIT.md b/sycl/doc/design/KernelFusionJIT.md
index b83a8a26eeae0..224a1984d2902 100644
--- a/sycl/doc/design/KernelFusionJIT.md
+++ b/sycl/doc/design/KernelFusionJIT.md
@@ -212,7 +212,8 @@ These restrictions can be simplified to:
 
 - No two local sizes specified by the nd-ranges will be different;
 - No global id remapping is needed ([see](#work-item-remapping)) or all input offsets are 0;
-- All the fused nd-ranges must have the same offset.
+- All the fused nd-ranges must have the same offset;
+- No global id remapping is needed for kernels specifying a local size.
 
 As we can see, there is no restrictions in the number of dimensions or global sizes of the input nd-ranges.
 
diff --git a/sycl/test-e2e/KernelFusion/abort_fusion.cpp b/sycl/test-e2e/KernelFusion/abort_fusion.cpp
index 709befa514915..930fcc12eff86 100644
--- a/sycl/test-e2e/KernelFusion/abort_fusion.cpp
+++ b/sycl/test-e2e/KernelFusion/abort_fusion.cpp
@@ -15,14 +15,19 @@ enum class Internalization { None, Local, Private };
 
 template <typename Range> size_t getSize(Range r);
 
-template <> size_t getSize(range<1> r) { return r.size(); }
-template <> size_t getSize(nd_range<1> r) {
+template <int Dimensions> size_t getSize(range<Dimensions> r) {
+  return r.size();
+}
+template <int Dimensions> size_t getSize(nd_range<Dimensions> r) {
   return r.get_global_range().size();
 }
 
 template <typename Kernel1Name, typename Kernel2Name, typename Range1,
           typename Range2>
 void performFusion(queue &q, Range1 R1, Range2 R2) {
+  using IndexTy1 = item<Range1::dimensions>;
+  using IndexTy2 = item<Range2::dimensions>;
+
   int in[dataSize], tmp[dataSize], out[dataSize];
 
   for (size_t i = 0; i < dataSize; ++i) {
@@ -43,15 +48,19 @@ void performFusion(queue &q, Range1 R1, Range2 R2) {
     q.submit([&](handler &cgh) {
       auto accIn = bIn.get_access(cgh);
       auto accTmp = bTmp.get_access(cgh);
-      cgh.parallel_for<Kernel1Name>(
-          R1, [=](item<1> i) { accTmp[i] = accIn[i] + 5; });
+      cgh.parallel_for<Kernel1Name>(R1, [=](IndexTy1 i) {
+        size_t j = i.get_linear_id();
+        accTmp[j] = accIn[j] + 5;
+      });
     });
 
     q.submit([&](handler &cgh) {
       auto accTmp = bTmp.get_access(cgh);
       auto accOut = bOut.get_access(cgh);
-      cgh.parallel_for<Kernel2Name>(
-          R2, [=](id<1> i) { accOut[i] = accTmp[i] * 2; });
+      cgh.parallel_for<Kernel2Name>(R2, [=](IndexTy2 i) {
+        size_t j = i.get_linear_id();
+        accOut[j] = accTmp[j] * 2;
+      });
     });
 
     fw.complete_fusion({ext::codeplay::experimental::property::no_barriers{}});
@@ -117,5 +126,17 @@ int main() {
   // CHECK-NEXT: Cannot fuse kernels whose fusion would yield non-uniform work-group sizes
   // CHECK: COMPUTATION OK
 
+  // Scenario: Fusing two kernels that may lead to synchronization issues as two
+  // work-items in the same work-group may not be in the same work-group in the
+  // fused ND-range.
+  performFusion<class Kernel1_5, class Kernel2_5>(
+      q, nd_range<2>{range<2>{2, 2}, range<2>{2, 2}},
+      nd_range<2>{range<2>{4, 4}, range<2>{2, 2}});
+  // CHECK: ERROR: JIT compilation for kernel fusion failed with message:
+  // CHECK-NEXT: Illegal ND-range combination
+  // CHECK-NEXT: Detailed information:
+  // CHECK-NEXT: Cannot fuse kernels when any of the fused kernels with a specific local size has different global sizes in dimensions [2, N) or different number of dimensions
+  // CHECK: COMPUTATION OK
+
   return 0;
 }

From e402523e67e9bb91ec44d714ab4d57a9ecf2effe Mon Sep 17 00:00:00 2001
From: Pierre-Andre Saulais <pierre-andre@codeplay.com>
Date: Thu, 1 Feb 2024 12:33:29 +0000
Subject: [PATCH 09/14] [UR][CUDA] Use new variant of the enableCUDATracing
 function (#12521)

https://github.com/oneapi-src/unified-runtime/pull/1070 and
https://github.com/intel/llvm/pull/11952 introduced a new variant of the
`enableCUDATracing` function that takes a context pointer parameter,
replacing the parameterless variant of that function. The older variant
will be removed from UR once this PR is merged.
---
 sycl/plugins/cuda/pi_cuda.cpp | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp
index 4b8163b03efbd..6d50d120aa2f9 100644
--- a/sycl/plugins/cuda/pi_cuda.cpp
+++ b/sycl/plugins/cuda/pi_cuda.cpp
@@ -17,7 +17,12 @@
 #include <sycl/detail/pi.hpp>
 
 // Forward declarations
-void enableCUDATracing();
+struct cuda_tracing_context_t_;
+
+void enableCUDATracing(cuda_tracing_context_t_ *ctx);
+void disableCUDATracing(cuda_tracing_context_t_ *ctx);
+cuda_tracing_context_t_ *createCUDATracingContext();
+void freeCUDATracingContext(cuda_tracing_context_t_ *Ctx);
 
 //-- PI API implementation
 extern "C" {
@@ -1237,7 +1242,8 @@ pi_result piPluginInit(pi_plugin *PluginInit) {
   std::memset(&(PluginInit->PiFunctionTable), 0,
               sizeof(PluginInit->PiFunctionTable));
 
-  enableCUDATracing();
+  cuda_tracing_context_t_ *Ctx = createCUDATracingContext();
+  enableCUDATracing(Ctx);
 
 // Forward calls to CUDA RT.
 #define _PI_API(api)                                                           \

From f9e4f10662c15fdd5d77313b9fb1ca8614976c2c Mon Sep 17 00:00:00 2001
From: JackAKirk <jack.kirk@codeplay.com>
Date: Thu, 1 Feb 2024 12:35:25 +0000
Subject: [PATCH 10/14] [SYCL][CUDA] Improved joint_matrix layout test
 coverage. (#12483)

Improved joint_matrix layout test coverage.

The test framework that the cuda backend tests use has been updated to
support all possible `joint_matrix` gemm API combinations, including all
matrix layouts. the gemm header is backend agnostic; hence all backends
could use this test framework in the future.

This test framework can also act as an example to show how to deal with
different layout combinations when computing a general GEMM.

Signed-off-by: JackAKirk <jack.kirk@codeplay.com>
---
 .../Matrix/joint_matrix_gemm_cuda.hpp         | 105 ++++++++++++------
 .../Matrix/joint_matrix_tensorcores_sm70.cpp  |  15 ++-
 .../Matrix/joint_matrix_tensorcores_sm72.cpp  |  19 +++-
 .../Matrix/joint_matrix_tensorcores_sm80.cpp  |  22 +++-
 4 files changed, 118 insertions(+), 43 deletions(-)

diff --git a/sycl/test-e2e/Matrix/joint_matrix_gemm_cuda.hpp b/sycl/test-e2e/Matrix/joint_matrix_gemm_cuda.hpp
index fe5b110864e6b..9fd4f184692be 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_gemm_cuda.hpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_gemm_cuda.hpp
@@ -11,7 +11,7 @@ using namespace sycl::ext::oneapi;
 using namespace sycl::ext::oneapi::experimental::matrix;
 constexpr float bf16_eps = 0.00390625;
 
-// Example usage of Nvidia matrix multiply.
+// Example usage of joint_matrix matrix multiply.
 // Optimizations such as memory paddings for avoiding bank conflicts are not
 // included in this test which aids clarity for what is going on. This example
 // forms a "Big matrix" corresponding to a single "TILE" using cuda example
@@ -30,37 +30,47 @@ constexpr float bf16_eps = 0.00390625;
 constexpr int N_THREADS_PER_MATRIX_OP = 32;
 
 // number of submatrices per row of accumulator ("C", "D") matrices.
-constexpr int SUB_TILES_M = 3;
+constexpr int SUB_TILES_M = 2;
 // number of submatrices per col of accumulator matrices.
 constexpr int SUB_TILES_N = 2;
 // number of submatrices per col of "A"/per row of "B", matrices.
-constexpr int SUB_TILES_K = 1;
+constexpr int SUB_TILES_K = 2;
 
-template <typename Tm, typename Tc, typename Td, size_t M, size_t K, size_t N>
+template <typename Tm, typename Tc, typename Td, size_t M, size_t K, size_t N,
+          layout layout_A, layout layout_B, layout layout_C>
 class TypeHelper;
 
-template <typename Tm, typename Tc, typename Td, size_t M, size_t K, size_t N>
-using KernelName = class TypeHelper<Tm, Tc, Td, M, K, N>;
+template <typename Tm, typename Tc, typename Td, size_t M, size_t K, size_t N,
+          layout layout_A, layout layout_B, layout layout_C>
+using KernelName =
+    class TypeHelper<Tm, Tc, Td, M, K, N, layout_A, layout_B, layout_C>;
 
-template <size_t Big_N, size_t Big_K, typename Tm, typename Tc>
+template <size_t Big_N, size_t Big_K, size_t Big_M, layout layout_A,
+          layout layout_B, typename Tm, typename Tc>
 Tc matrix_ref_mn(const int &m, const int &n, Tm *A, Tm *B, Tc *C) {
   Tc res = C[m * Big_N + n];
 
-  if constexpr (std::is_same<Tm, bfloat16>::value) {
-    for (int k = 0; k < Big_K; k++)
-      res += A[m * Big_K + k] * B[k * Big_N + n];
-  } else {
-    for (int k = 0; k < Big_K; k++)
-      res +=
-          static_cast<Tc>(A[m * Big_K + k]) * static_cast<Tc>(B[k * Big_N + n]);
+  for (int k = 0; k < Big_K; k++) {
+    auto index_a =
+        layout_A == layout::row_major ? m * Big_K + k : m + k * Big_M;
+    auto index_b =
+        layout_B == layout::row_major ? k * Big_N + n : k + n * Big_K;
+
+    if constexpr (std::is_same<Tm, bfloat16>::value) {
+      res += A[index_a] * B[index_b];
+    } else {
+      res += static_cast<Tc>(A[index_a]) * static_cast<Tc>(B[index_b]);
+    }
   }
 
   return res;
 }
 
-template <typename Tm, typename Tc, typename Td, size_t Sub_Tiles_M,
-          size_t Sub_Tiles_K, size_t Sub_Tiles_N, size_t M, size_t K, size_t N,
-          typename T3 = std::remove_const_t<Tm>>
+template <
+    typename Tm, typename Tc, typename Td, size_t Sub_Tiles_M,
+    size_t Sub_Tiles_K, size_t Sub_Tiles_N, size_t M, size_t K, size_t N,
+    layout layout_A = layout::row_major, layout layout_B = layout::row_major,
+    layout layout_C = layout::row_major, typename T3 = std::remove_const_t<Tm>>
 void test(queue &q) {
   // total number of M dimension matrix elements for the "Big matrix".
   constexpr auto Big_M = Sub_Tiles_M * M;
@@ -97,7 +107,8 @@ void test(queue &q) {
         accessor<bfloat16, 1, access::mode::write, target::device> accA(bufA,
                                                                         cgh);
 
-        cgh.parallel_for<KernelName<Tm, Tc, class copyA, M, K, N>>(
+        cgh.parallel_for<KernelName<Tm, Tc, class copyA, M, K, N, layout_A,
+                                    layout_B, layout_C>>(
             range<1>(Big_M * Big_K), [=](item<1> item) {
               auto i = item.get_linear_id();
               accA[i] = 0.1f * (i % 10);
@@ -107,7 +118,8 @@ void test(queue &q) {
         accessor<bfloat16, 1, access::mode::write, target::device> accB(bufB,
                                                                         cgh);
 
-        cgh.parallel_for<KernelName<Tm, Tc, class copyB, M, K, N>>(
+        cgh.parallel_for<KernelName<Tm, Tc, class copyB, M, K, N, layout_A,
+                                    layout_B, layout_C>>(
             range<1>(Big_K * Big_N), [=](item<1> item) {
               auto i = item.get_linear_id();
               accB[i] = 0.1f * (i % 10);
@@ -130,7 +142,8 @@ void test(queue &q) {
       range<2> GlobalRange = {Sub_Tiles_M,
                               Sub_Tiles_N * N_THREADS_PER_MATRIX_OP};
 
-      cgh.parallel_for<KernelName<Tm, Tc, Td, M, K, N>>(
+      cgh.parallel_for<
+          KernelName<Tm, Tc, Td, M, K, N, layout_A, layout_B, layout_C>>(
           nd_range<2>(GlobalRange, LocalRange), [=](nd_item<2> item) {
             sycl::sub_group sg = item.get_sub_group();
             // row id of current submatrix of BIG C matrix
@@ -138,33 +151,46 @@ void test(queue &q) {
             // column id of current submatrix of BIG C matrix
             const auto n = item.get_group().get_group_id()[1];
 
-            joint_matrix<sycl::sub_group, T3, use::a, M, K, layout::row_major>
-                sub_a;
-            joint_matrix<sycl::sub_group, T3, use::b, K, N, layout::row_major>
-                sub_b;
+            joint_matrix<sycl::sub_group, T3, use::a, M, K, layout_A> sub_a;
+            joint_matrix<sycl::sub_group, T3, use::b, K, N, layout_B> sub_b;
             joint_matrix<sycl::sub_group, std::remove_const_t<Tc>,
                          use::accumulator, M, N>
                 sub_c;
             joint_matrix<sycl::sub_group, Td, use::accumulator, M, N> sub_d;
+            auto stride_C = layout_C == layout::row_major ? Big_N : Big_M;
+            auto load_stride_C = layout_C == layout::row_major
+                                     ? (m * M) * Big_N + n * N
+                                     : (m * M) + n * N * Big_M;
 
             joint_matrix_load(
                 sg, sub_c,
                 accC.template get_multi_ptr<access::decorated::no>() +
-                    (m * M) * Big_N + n * N,
-                Big_N, layout::row_major);
+                    load_stride_C,
+                stride_C, layout_C);
+
+            auto stride_A = layout_A == layout::row_major ? Big_K : Big_M;
+            auto stride_B = layout_B == layout::row_major ? Big_N : Big_K;
+
             // k = row/col id of current submatrix of BIG A/B matrices
             for (int k = 0; k < Sub_Tiles_K; k++) {
+              auto load_stride_A = layout_A == layout::row_major
+                                       ? (k * K) + (m * M * Big_K)
+                                       : (k * K * Big_M) + (m * M);
+              auto load_stride_B = layout_B == layout::row_major
+                                       ? (k * K * Big_N) + (n * N)
+                                       : (k * K) + (n * N * Big_K);
+
               joint_matrix_load(
                   sg, sub_a,
                   accA.template get_multi_ptr<access::decorated::no>() +
-                      (k * K) + (m * M * Big_K),
-                  Big_K);
+                      load_stride_A,
+                  stride_A);
 
               joint_matrix_load(
                   sg, sub_b,
                   accB.template get_multi_ptr<access::decorated::no>() +
-                      (k * K * Big_N) + (n * N),
-                  Big_N);
+                      load_stride_B,
+                  stride_B);
 
               // round values to correct precision if using tf32
               if constexpr (std::is_same<T3, precision::tf32>::value) {
@@ -174,12 +200,13 @@ void test(queue &q) {
               }
 
               joint_matrix_mad(sg, sub_d, sub_a, sub_b, sub_c);
+              joint_matrix_copy(sg, sub_d, sub_c);
             }
             joint_matrix_store(
                 sg, sub_d,
                 accD.template get_multi_ptr<access::decorated::no>() +
-                    (m * M) * Big_N + n * N,
-                Big_N, layout::row_major);
+                    load_stride_C,
+                stride_C, layout_C);
           });
     });
     q.wait();
@@ -187,14 +214,18 @@ void test(queue &q) {
 
   for (int m = 0; m < Big_M; m++) {
     for (int n = 0; n < Big_N; n++) {
+      auto index_D =
+          layout_C == layout::row_major ? m * Big_N + n : m + n * Big_M;
       if constexpr (std::is_same<std::remove_const_t<Tm>, bfloat16>::value) {
-        auto res_device = matrix_ref_mn<Big_N, Big_K>(m, n, A, B, C);
-        assert(fabs(2 * (D[m * Big_N + n] - res_device)) /
-                   (D[m * Big_N + n] + res_device) <
+        auto res_device =
+            matrix_ref_mn<Big_N, Big_K, Big_M, layout_A, layout_B>(m, n, A, B,
+                                                                   C);
+        assert(fabs(2 * (D[index_D] - res_device)) / (D[index_D] + res_device) <
                bf16_eps * 2);
       } else {
-        assert(
-            (D[m * Big_N + n] == matrix_ref_mn<Big_N, Big_K>(m, n, A, B, C)));
+        assert((D[index_D] ==
+                matrix_ref_mn<Big_N, Big_K, Big_M, layout_A, layout_B>(m, n, A,
+                                                                       B, C)));
       }
     }
   }
diff --git a/sycl/test-e2e/Matrix/joint_matrix_tensorcores_sm70.cpp b/sycl/test-e2e/Matrix/joint_matrix_tensorcores_sm70.cpp
index f28372b6277dc..a558600ad390c 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_tensorcores_sm70.cpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_tensorcores_sm70.cpp
@@ -80,12 +80,23 @@ int main() {
     test<const half, const half, float, SUB_TILES_M, SUB_TILES_K, SUB_TILES_N,
          32, 16, 8>(Q);
 
+    // test different layout combinations for one case
+
+    test<const half, const half, float, SUB_TILES_M, SUB_TILES_K, SUB_TILES_N,
+         32, 16, 8, layout::row_major, layout::row_major, layout::col_major>(Q);
+    test<const half, const half, float, SUB_TILES_M, SUB_TILES_K, SUB_TILES_N,
+         32, 16, 8, layout::row_major, layout::col_major, layout::row_major>(Q);
+    test<const half, const half, float, SUB_TILES_M, SUB_TILES_K, SUB_TILES_N,
+         32, 16, 8, layout::col_major, layout::row_major, layout::row_major>(Q);
+    test<const half, const half, float, SUB_TILES_M, SUB_TILES_K, SUB_TILES_N,
+         32, 16, 8, layout::col_major, layout::col_major, layout::row_major>(Q);
+
+    // joint_matrix_apply tests
+
     auto apply_add = [](auto &x) { x = x + 2; };
     float D[MATRIX_M][MATRIX_N];
     big_matrix<float, MATRIX_M, MATRIX_N> MD_f((float *)&D);
 
-    // joint_matrix_apply tests
-
     matrix_verify_lambda<half, float, M, 16, N>(Q, MD_f, 0.0, apply_add);
   }
 
diff --git a/sycl/test-e2e/Matrix/joint_matrix_tensorcores_sm72.cpp b/sycl/test-e2e/Matrix/joint_matrix_tensorcores_sm72.cpp
index cea15392408cc..1dea8c879b5eb 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_tensorcores_sm72.cpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_tensorcores_sm72.cpp
@@ -50,13 +50,28 @@ int main() {
     test<const uint8_t, const int32_t, int32_t, SUB_TILES_M, SUB_TILES_K,
          SUB_TILES_N, 32, 16, 8>(Q);
 
+    // test different layout combinations for one case
+
+    test<const uint8_t, const int32_t, int32_t, SUB_TILES_M, SUB_TILES_K,
+         SUB_TILES_N, 32, 16, 8, layout::row_major, layout::row_major,
+         layout::col_major>(Q);
+    test<const uint8_t, const int32_t, int32_t, SUB_TILES_M, SUB_TILES_K,
+         SUB_TILES_N, 32, 16, 8, layout::col_major, layout::row_major,
+         layout::row_major>(Q);
+    test<const uint8_t, const int32_t, int32_t, SUB_TILES_M, SUB_TILES_K,
+         SUB_TILES_N, 32, 16, 8, layout::row_major, layout::col_major,
+         layout::row_major>(Q);
+    test<const uint8_t, const int32_t, int32_t, SUB_TILES_M, SUB_TILES_K,
+         SUB_TILES_N, 32, 16, 8, layout::col_major, layout::col_major,
+         layout::row_major>(Q);
+
+    // joint_matrix_apply tests
+
     auto apply_add = [](auto &x) { x = x + 2; };
 
     int32_t D_i[MATRIX_M][MATRIX_N];
     big_matrix<int32_t, MATRIX_M, MATRIX_N> MD_i((int32_t *)&D_i);
 
-    // joint_matrix_apply tests
-
     matrix_verify_lambda<uint8_t, int32_t, M, 16, N>(Q, MD_i, 0, apply_add);
     matrix_verify_lambda<int8_t, int32_t, M, 16, N>(Q, MD_i, 0, apply_add);
   }
diff --git a/sycl/test-e2e/Matrix/joint_matrix_tensorcores_sm80.cpp b/sycl/test-e2e/Matrix/joint_matrix_tensorcores_sm80.cpp
index 2a0731d9b988e..ca823161b6197 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_tensorcores_sm80.cpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_tensorcores_sm80.cpp
@@ -43,9 +43,28 @@ int main() {
 
     // A/B tf32
     test<float, float, float, SUB_TILES_M, SUB_TILES_K, SUB_TILES_N, 16, 8, 16,
+         layout::row_major, layout::row_major, layout::row_major,
          precision::tf32>(Q);
     test<const float, const float, float, SUB_TILES_M, SUB_TILES_K, SUB_TILES_N,
-         16, 8, 16, precision::tf32>(Q);
+         16, 8, 16, layout::row_major, layout::row_major, layout::row_major,
+         precision::tf32>(Q);
+
+    // test different layout combinations for one case
+
+    test<const bfloat16, const float, float, SUB_TILES_M, SUB_TILES_K,
+         SUB_TILES_N, 8, 16, 32, layout::row_major, layout::col_major,
+         layout::row_major>(Q);
+    test<const bfloat16, const float, float, SUB_TILES_M, SUB_TILES_K,
+         SUB_TILES_N, 8, 16, 32, layout::col_major, layout::row_major,
+         layout::row_major>(Q);
+    test<const bfloat16, const float, float, SUB_TILES_M, SUB_TILES_K,
+         SUB_TILES_N, 8, 16, 32, layout::col_major, layout::col_major,
+         layout::row_major>(Q);
+    test<const bfloat16, const float, float, SUB_TILES_M, SUB_TILES_K,
+         SUB_TILES_N, 8, 16, 32, layout::col_major, layout::col_major,
+         layout::col_major>(Q);
+
+    // joint_matrix_apply tests
 
     float D[MATRIX_M][MATRIX_N];
     big_matrix<float, MATRIX_M, MATRIX_N> MD_f((float *)&D);
@@ -54,7 +73,6 @@ int main() {
     big_matrix<double, 8 * nWGperDim, 8 * nWGperDim> MD_d((double *)&D_d);
     auto apply_add = [](auto &x) { x = x + 2; };
 
-    // joint_matrix_apply tests
     matrix_verify_lambda<bfloat16, float, 16, 16, 16>(Q, MD_f, 0.0, apply_add);
 
     matrix_verify_lambda<double, double, 8, 4, 8>(Q, MD_d, -60.0, apply_add);

From f7a360deecc69ce11629f5bc540a236ea38f29a5 Mon Sep 17 00:00:00 2001
From: jinge90 <ge.jin@intel.com>
Date: Fri, 2 Feb 2024 00:14:44 +0800
Subject: [PATCH 11/14] [SYCL][NFC] Fix some 'startswith/endswith' related to
 SYCL (#12573)

Replace some deprecated 'startswith' and 'endswith' with 'starts_with'
and 'ends_with' to clear some warnings when building SYCL compiler.

---------

Signed-off-by: jinge90 <ge.jin@intel.com>
---
 clang/lib/CodeGen/CodeGenTypes.cpp                     |  4 ++--
 clang/lib/Driver/Driver.cpp                            |  4 ++--
 clang/lib/Driver/ToolChains/Linux.cpp                  |  2 +-
 clang/lib/Driver/ToolChains/SYCL.cpp                   |  9 +++++----
 clang/lib/Driver/ToolChains/SYCL.h                     |  2 +-
 clang/lib/Sema/SemaExpr.cpp                            | 10 +++++-----
 clang/lib/Sema/SemaSYCL.cpp                            |  2 +-
 clang/tools/clang-offload-deps/ClangOffloadDeps.cpp    |  2 +-
 llvm/lib/SYCLLowerIR/ESIMD/ESIMDUtils.cpp              |  4 ++--
 llvm/lib/SYCLLowerIR/ESIMD/LowerESIMD.cpp              | 10 +++++-----
 llvm/lib/SYCLLowerIR/PrepareSYCLNativeCPU.cpp          |  2 +-
 llvm/lib/Support/PropertySetIO.cpp                     |  2 +-
 llvm/lib/Support/SimpleTable.cpp                       |  4 ++--
 llvm/lib/TargetParser/Triple.cpp                       |  2 +-
 llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp   |  2 +-
 .../Transforms/Instrumentation/SPIRITTAnnotations.cpp  |  6 +++---
 llvm/tools/sycl-post-link/ModuleSplitter.cpp           |  6 +++---
 llvm/tools/sycl-post-link/SYCLDeviceLibReqMask.cpp     |  2 +-
 llvm/tools/sycl-post-link/SpecConstants.cpp            |  8 ++++----
 llvm/tools/sycl-post-link/sycl-post-link.cpp           |  2 +-
 20 files changed, 43 insertions(+), 42 deletions(-)

diff --git a/clang/lib/CodeGen/CodeGenTypes.cpp b/clang/lib/CodeGen/CodeGenTypes.cpp
index 05fc203797f25..7642d3d79a666 100644
--- a/clang/lib/CodeGen/CodeGenTypes.cpp
+++ b/clang/lib/CodeGen/CodeGenTypes.cpp
@@ -342,8 +342,8 @@ llvm::Type *CodeGenTypes::ConvertSYCLJointMatrixINTELType(RecordDecl *RD) {
   if (CompTy->isStructTy()) {
     StringRef LlvmTyName = CompTy->getStructName();
     // Emit half/int16/float for sycl[::*]::{half,bfloat16,tf32}
-    if (LlvmTyName.startswith("class.sycl::") ||
-        LlvmTyName.startswith("class.__sycl_internal::"))
+    if (LlvmTyName.starts_with("class.sycl::") ||
+        LlvmTyName.starts_with("class.__sycl_internal::"))
       LlvmTyName = LlvmTyName.rsplit("::").second;
     if (LlvmTyName == "half") {
       CompTy = llvm::Type::getHalfTy(getLLVMContext());
diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp
index 21a563ce99c45..6265280877e9f 100644
--- a/clang/lib/Driver/Driver.cpp
+++ b/clang/lib/Driver/Driver.cpp
@@ -3518,7 +3518,7 @@ getLinkerArgs(Compilation &C, DerivedArgList &Args, bool IncludeObj = false) {
   // manner than the OpenMP processing.  We should try and refactor this
   // to use the OpenMP flow (adding -l<name> to the llvm-link step)
   auto resolveStaticLib = [&](StringRef LibName, bool IsStatic) -> bool {
-    if (!LibName.startswith("-l"))
+    if (!LibName.starts_with("-l"))
       return false;
     for (auto &LPath : LibPaths) {
       if (!IsStatic) {
@@ -3663,7 +3663,7 @@ static bool IsSYCLDeviceLibObj(std::string ObjFilePath, bool isMSVCEnv) {
   StringRef ObjFileName = llvm::sys::path::filename(ObjFilePath);
   StringRef ObjSuffix = isMSVCEnv ? ".obj" : ".o";
   bool Ret =
-      (ObjFileName.startswith("libsycl-") && ObjFileName.endswith(ObjSuffix))
+      (ObjFileName.starts_with("libsycl-") && ObjFileName.ends_with(ObjSuffix))
           ? true
           : false;
   return Ret;
diff --git a/clang/lib/Driver/ToolChains/Linux.cpp b/clang/lib/Driver/ToolChains/Linux.cpp
index ae33f79198692..01887706d8757 100644
--- a/clang/lib/Driver/ToolChains/Linux.cpp
+++ b/clang/lib/Driver/ToolChains/Linux.cpp
@@ -344,7 +344,7 @@ Linux::Linux(const Driver &D, const llvm::Triple &Triple, const ArgList &Args)
   // The deprecated -DLLVM_ENABLE_PROJECTS=libcxx configuration installs
   // libc++.so in D.Dir+"/../lib/". Detect this path.
   // TODO Remove once LLVM_ENABLE_PROJECTS=libcxx is unsupported.
-  if (StringRef(D.Dir).startswith(SysRoot) &&
+  if (StringRef(D.Dir).starts_with(SysRoot) &&
       (Args.hasArg(options::OPT_fsycl) ||
        D.getVFS().exists(D.Dir + "/../lib/libsycl.so")))
     addPathIfExists(D, D.Dir + "/../lib", Paths);
diff --git a/clang/lib/Driver/ToolChains/SYCL.cpp b/clang/lib/Driver/ToolChains/SYCL.cpp
index 32ad99ba384b0..ca68b97858027 100644
--- a/clang/lib/Driver/ToolChains/SYCL.cpp
+++ b/clang/lib/Driver/ToolChains/SYCL.cpp
@@ -401,8 +401,9 @@ const char *SYCL::Linker::constructLLVMLinkCommand(
           LibPostfix = ".cubin";
       }
       StringRef LibSyclPrefix("libsycl-");
-      if (!InputFilename.startswith(LibSyclPrefix) ||
-          !InputFilename.endswith(LibPostfix) || (InputFilename.count('-') < 2))
+      if (!InputFilename.starts_with(LibSyclPrefix) ||
+          !InputFilename.ends_with(LibPostfix) ||
+          (InputFilename.count('-') < 2))
         return false;
       // Skip the prefix "libsycl-"
       std::string PureLibName =
@@ -419,7 +420,7 @@ const char *SYCL::Linker::constructLLVMLinkCommand(
             PureLibName.substr(0, FinalDashPos) + PureLibName.substr(DotPos);
       }
       for (const auto &L : SYCLDeviceLibList) {
-        if (StringRef(PureLibName).startswith(L))
+        if (StringRef(PureLibName).starts_with(L))
           return true;
       }
       return false;
@@ -1354,7 +1355,7 @@ void SYCLToolChain::AddImpliedTargetArgs(const llvm::Triple &Triple,
       for (auto *A : Args) {
         if (!A->getOption().matches(options::OPT_Xsycl_backend_EQ))
           continue;
-        if (StringRef(A->getValue()).startswith("intel_gpu"))
+        if (StringRef(A->getValue()).starts_with("intel_gpu"))
           TargArgs.push_back(A->getValue(1));
       }
       if (llvm::find_if(TargArgs, [&](auto Cur) {
diff --git a/clang/lib/Driver/ToolChains/SYCL.h b/clang/lib/Driver/ToolChains/SYCL.h
index 8e1c97dbbb546..2e1a30be76eda 100644
--- a/clang/lib/Driver/ToolChains/SYCL.h
+++ b/clang/lib/Driver/ToolChains/SYCL.h
@@ -125,7 +125,7 @@ constexpr char AmdGPU[] = "amd_gpu_";
 template <auto GPUArh> std::optional<StringRef> isGPUTarget(StringRef Target) {
   // Handle target specifications that resemble '(intel, nvidia, amd)_gpu_*'
   // here.
-  if (Target.startswith(GPUArh)) {
+  if (Target.starts_with(GPUArh)) {
     return resolveGenDevice(Target);
   }
   return  std::nullopt;
diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp
index dcd567d32b698..2c5f907b4797e 100644
--- a/clang/lib/Sema/SemaExpr.cpp
+++ b/clang/lib/Sema/SemaExpr.cpp
@@ -282,11 +282,11 @@ bool Sema::DiagnoseUseOfDecl(NamedDecl *D, ArrayRef<SourceLocation> Locs,
             .Default(false);
       };
       if ((getEmissionReason(FDecl) == Sema::DeviceDiagnosticReason::Sycl) &&
-          Id && !Id->getName().startswith("__spirv_") &&
-          !Id->getName().startswith("__sycl_") &&
-          !Id->getName().startswith("__devicelib_ConvertBF16ToFINTEL") &&
-          !Id->getName().startswith("__devicelib_ConvertFToBF16INTEL") &&
-          !Id->getName().startswith("__assert_fail") &&
+          Id && !Id->getName().starts_with("__spirv_") &&
+          !Id->getName().starts_with("__sycl_") &&
+          !Id->getName().starts_with("__devicelib_ConvertBF16ToFINTEL") &&
+          !Id->getName().starts_with("__devicelib_ConvertFToBF16INTEL") &&
+          !Id->getName().starts_with("__assert_fail") &&
           !isMsvcMathFn(Id->getName())) {
         SYCLDiagIfDeviceCode(
             *Locs.begin(), diag::err_sycl_device_function_is_called_from_esimd,
diff --git a/clang/lib/Sema/SemaSYCL.cpp b/clang/lib/Sema/SemaSYCL.cpp
index e2a8d10ded1ab..d0f8ddbe7193c 100644
--- a/clang/lib/Sema/SemaSYCL.cpp
+++ b/clang/lib/Sema/SemaSYCL.cpp
@@ -4295,7 +4295,7 @@ static void CheckSYCL2020SubGroupSizes(Sema &S, FunctionDecl *SYCLKernel,
   // No need to validate __spirv routines here since they
   // are mapped to the equivalent SPIRV operations.
   const IdentifierInfo *II = FD->getIdentifier();
-  if (II && II->getName().startswith("__spirv_"))
+  if (II && II->getName().starts_with("__spirv_"))
     return;
 
   // Else we need to figure out why they don't match.
diff --git a/clang/tools/clang-offload-deps/ClangOffloadDeps.cpp b/clang/tools/clang-offload-deps/ClangOffloadDeps.cpp
index c9fa2f87a072f..5dbf044675106 100644
--- a/clang/tools/clang-offload-deps/ClangOffloadDeps.cpp
+++ b/clang/tools/clang-offload-deps/ClangOffloadDeps.cpp
@@ -173,7 +173,7 @@ int main(int argc, const char **argv) {
       // possibly reusing ClangOffloadBundler's 'OffloadTargetInfo'.
       for (const std::string &Target : Targets) {
         std::string Prefix = Target + ".";
-        if (Symbol.startswith(Prefix))
+        if (Symbol.starts_with(Prefix))
           Target2Symbols[Target].insert(
               Symbol.substr(Prefix.size(), Len - Prefix.size()));
       }
diff --git a/llvm/lib/SYCLLowerIR/ESIMD/ESIMDUtils.cpp b/llvm/lib/SYCLLowerIR/ESIMD/ESIMDUtils.cpp
index e104ab115e970..480ba1944ebba 100644
--- a/llvm/lib/SYCLLowerIR/ESIMD/ESIMDUtils.cpp
+++ b/llvm/lib/SYCLLowerIR/ESIMD/ESIMDUtils.cpp
@@ -34,13 +34,13 @@ constexpr char SLM_ALLOCATOR_DTOR_SUFFIX[] = "EED2Ev";
 bool isSlmAllocatorConstructor(const Function &F) {
   auto Name = F.getName();
   return Name.starts_with(SLM_ALLOCATOR_CTOR_DTOR_PREFIX) &&
-         Name.endswith(SLM_ALLOCATOR_CTOR_SUFFIX);
+         Name.ends_with(SLM_ALLOCATOR_CTOR_SUFFIX);
 }
 
 bool isSlmAllocatorDestructor(const Function &F) {
   auto Name = F.getName();
   return Name.starts_with(SLM_ALLOCATOR_CTOR_DTOR_PREFIX) &&
-         Name.endswith(SLM_ALLOCATOR_DTOR_SUFFIX);
+         Name.ends_with(SLM_ALLOCATOR_DTOR_SUFFIX);
 }
 
 bool isSlmInit(const Function &F) {
diff --git a/llvm/lib/SYCLLowerIR/ESIMD/LowerESIMD.cpp b/llvm/lib/SYCLLowerIR/ESIMD/LowerESIMD.cpp
index 9a3aa8437a590..6ec1102f402ba 100644
--- a/llvm/lib/SYCLLowerIR/ESIMD/LowerESIMD.cpp
+++ b/llvm/lib/SYCLLowerIR/ESIMD/LowerESIMD.cpp
@@ -1596,15 +1596,15 @@ SmallPtrSet<Type *, 4> collectGenXVolatileTypes(Module &M) {
     // TODO FIXME relying on type name in LLVM IR is fragile, needs rework
     if (!GTy || !GTy->getName()
                      .rtrim(".0123456789")
-                     .endswith("sycl::_V1::ext::intel::esimd::simd"))
+                     .ends_with("sycl::_V1::ext::intel::esimd::simd"))
       continue;
     assert(GTy->getNumContainedTypes() == 1);
     auto VTy = GTy->getContainedType(0);
     if ((GTy = dyn_cast<StructType>(VTy))) {
-      assert(
-          GTy->getName()
-              .rtrim(".0123456789")
-              .endswith("sycl::_V1::ext::intel::esimd::detail::simd_obj_impl"));
+      assert(GTy->getName()
+                 .rtrim(".0123456789")
+                 .ends_with(
+                     "sycl::_V1::ext::intel::esimd::detail::simd_obj_impl"));
       VTy = GTy->getContainedType(0);
     }
     assert(VTy->isVectorTy());
diff --git a/llvm/lib/SYCLLowerIR/PrepareSYCLNativeCPU.cpp b/llvm/lib/SYCLLowerIR/PrepareSYCLNativeCPU.cpp
index 264a8b4bc817a..2624e26116823 100644
--- a/llvm/lib/SYCLLowerIR/PrepareSYCLNativeCPU.cpp
+++ b/llvm/lib/SYCLLowerIR/PrepareSYCLNativeCPU.cpp
@@ -283,7 +283,7 @@ static Function *addGetFunc(Module &M, StringRef Name, Type *StateType) {
 static Function *addReplaceFunc(Module &M, StringRef Name, Type *StateType) {
   Function *Res;
   const char GetPrefix[] = "__dpcpp_nativecpu_get";
-  if (Name.startswith(GetPrefix)) {
+  if (Name.starts_with(GetPrefix)) {
     Res = addGetFunc(M, Name, StateType);
   } else if (Name == NativeCPUSetLocalId) {
     Res = addSetLocalIdFunc(M, Name, StateType);
diff --git a/llvm/lib/Support/PropertySetIO.cpp b/llvm/lib/Support/PropertySetIO.cpp
index 2a3586ba49339..ffb6394913fc9 100644
--- a/llvm/lib/Support/PropertySetIO.cpp
+++ b/llvm/lib/Support/PropertySetIO.cpp
@@ -37,7 +37,7 @@ PropertySetRegistry::read(const MemoryBuffer *Buf) {
 
   for (line_iterator LI(*Buf); !LI.is_at_end(); LI++) {
     // see if this line starts a new property set
-    if (LI->startswith("[")) {
+    if (LI->starts_with("[")) {
       // yes - parse the category (property name)
       auto EndPos = LI->rfind(']');
       if (EndPos == StringRef::npos)
diff --git a/llvm/lib/Support/SimpleTable.cpp b/llvm/lib/Support/SimpleTable.cpp
index 434e7d7cb7670..ec47391450100 100644
--- a/llvm/lib/Support/SimpleTable.cpp
+++ b/llvm/lib/Support/SimpleTable.cpp
@@ -214,8 +214,8 @@ Expected<SimpleTable::UPtrTy> SimpleTable::read(MemoryBuffer *Buf,
     return std::make_unique<SimpleTable>();
   UPtrTy Res;
 
-  if (LI->startswith(COL_TITLE_LINE_OPEN)) {
-    if (!LI->endswith(COL_TITLE_LINE_CLOSE))
+  if (LI->starts_with(COL_TITLE_LINE_OPEN)) {
+    if (!LI->ends_with(COL_TITLE_LINE_CLOSE))
       return createStringError(errc::invalid_argument, "malformed title line");
     // column titles present
     StringRef L = LI->substr(1, LI->size() - 2); // trim '[' and ']'
diff --git a/llvm/lib/TargetParser/Triple.cpp b/llvm/lib/TargetParser/Triple.cpp
index f0c4cd875f6fe..1e923ebae21f9 100644
--- a/llvm/lib/TargetParser/Triple.cpp
+++ b/llvm/lib/TargetParser/Triple.cpp
@@ -726,7 +726,7 @@ static Triple::SubArchType parseSubArch(StringRef SubArchName) {
       (SubArchName.ends_with("r6el") || SubArchName.ends_with("r6")))
     return Triple::MipsSubArch_r6;
 
-  if (SubArchName.startswith("spir")) {
+  if (SubArchName.starts_with("spir")) {
     StringRef SA(SubArchName);
     if (SA.consume_front("spir64_") || SA.consume_front("spir_")) {
       if (SA == "fpga")
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
index 1891e10679a0f..f37f1fdcfc2e9 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -838,7 +838,7 @@ Instruction *InstCombinerImpl::visitTrunc(TruncInst &Trunc) {
   //   //   extractelement <8 x i32> (bitcast <4 x i64> %X to <8 x i32>), i32 0
   // ```
   // can't be lowered by SPIR-V translator to "standard" format.
-  if (StringRef(Trunc.getModule()->getTargetTriple()).startswith("spir"))
+  if (StringRef(Trunc.getModule()->getTargetTriple()).starts_with("spir"))
     return nullptr;
 
   // Whenever an element is extracted from a vector, and then truncated,
diff --git a/llvm/lib/Transforms/Instrumentation/SPIRITTAnnotations.cpp b/llvm/lib/Transforms/Instrumentation/SPIRITTAnnotations.cpp
index b8dd7cda8883c..105643062aa90 100644
--- a/llvm/lib/Transforms/Instrumentation/SPIRITTAnnotations.cpp
+++ b/llvm/lib/Transforms/Instrumentation/SPIRITTAnnotations.cpp
@@ -243,7 +243,7 @@ bool insertAtomicInstrumentationCall(Module &M, StringRef Name,
 
 PreservedAnalyses SPIRITTAnnotationsPass::run(Module &M,
                                               ModuleAnalysisManager &MAM) {
-  assert(StringRef(M.getTargetTriple()).startswith("spir"));
+  assert(StringRef(M.getTargetTriple()).starts_with("spir"));
   bool IRModified = false;
   std::vector<StringRef> SPIRVCrossWGInstuctions = {
       SPIRV_CONTROL_BARRIER, SPIRV_GROUP_ALL,  SPIRV_GROUP_ANY,
@@ -299,7 +299,7 @@ PreservedAnalyses SPIRITTAnnotationsPass::run(Module &M,
         if (std::any_of(SPIRVCrossWGInstuctions.begin(),
                         SPIRVCrossWGInstuctions.end(),
                         [&CalleeName](StringRef Name) {
-                          return CalleeName.startswith(Name);
+                          return CalleeName.starts_with(Name);
                         })) {
           Instruction *InstAfterBarrier = CI->getNextNode();
           const DebugLoc &DL = CI->getDebugLoc();
@@ -307,7 +307,7 @@ PreservedAnalyses SPIRITTAnnotationsPass::run(Module &M,
           insertSimpleInstrumentationCall(M, ITT_ANNOTATION_WI_RESUME,
                                           InstAfterBarrier, DL);
           IRModified = true;
-        } else if (CalleeName.startswith(SPIRV_ATOMIC_INST)) {
+        } else if (CalleeName.starts_with(SPIRV_ATOMIC_INST)) {
           Instruction *InstAfterAtomic = CI->getNextNode();
           IRModified |= insertAtomicInstrumentationCall(
               M, ITT_ANNOTATION_ATOMIC_START, CI, CI, CalleeName);
diff --git a/llvm/tools/sycl-post-link/ModuleSplitter.cpp b/llvm/tools/sycl-post-link/ModuleSplitter.cpp
index a31ad613f9918..87cbf42da2df2 100644
--- a/llvm/tools/sycl-post-link/ModuleSplitter.cpp
+++ b/llvm/tools/sycl-post-link/ModuleSplitter.cpp
@@ -79,7 +79,7 @@ bool isSpirvSyclBuiltin(StringRef FName) {
   // now skip the digits
   FName = FName.drop_while([](char C) { return std::isdigit(C); });
 
-  return FName.startswith("__spirv_") || FName.startswith("__sycl_");
+  return FName.starts_with("__spirv_") || FName.starts_with("__sycl_");
 }
 
 // Return true if the function is a ESIMD builtin
@@ -91,12 +91,12 @@ bool isESIMDBuiltin(StringRef FName) {
   // now skip the digits
   FName = FName.drop_while([](char C) { return std::isdigit(C); });
 
-  return FName.startswith("__esimd_");
+  return FName.starts_with("__esimd_");
 }
 
 // Return true if the function name starts with "__builtin_"
 bool isGenericBuiltin(StringRef FName) {
-  return FName.startswith("__builtin_");
+  return FName.starts_with("__builtin_");
 }
 
 bool isKernel(const Function &F) {
diff --git a/llvm/tools/sycl-post-link/SYCLDeviceLibReqMask.cpp b/llvm/tools/sycl-post-link/SYCLDeviceLibReqMask.cpp
index 6ca581d8caf14..1b6cdefb9a541 100644
--- a/llvm/tools/sycl-post-link/SYCLDeviceLibReqMask.cpp
+++ b/llvm/tools/sycl-post-link/SYCLDeviceLibReqMask.cpp
@@ -696,7 +696,7 @@ uint32_t llvm::getSYCLDeviceLibReqMask(const Module &M) {
     return 0;
   uint32_t ReqMask = 0;
   for (const Function &SF : M) {
-    if (SF.getName().startswith(DEVICELIB_FUNC_PREFIX) && SF.isDeclaration()) {
+    if (SF.getName().starts_with(DEVICELIB_FUNC_PREFIX) && SF.isDeclaration()) {
       assert(SF.getCallingConv() == CallingConv::SPIR_FUNC);
       uint32_t DeviceLibBits = getDeviceLibBits(SF.getName().str());
       ReqMask |= DeviceLibBits;
diff --git a/llvm/tools/sycl-post-link/SpecConstants.cpp b/llvm/tools/sycl-post-link/SpecConstants.cpp
index dc5f1b2fd3f2c..66c2cc5ef1818 100644
--- a/llvm/tools/sycl-post-link/SpecConstants.cpp
+++ b/llvm/tools/sycl-post-link/SpecConstants.cpp
@@ -801,8 +801,8 @@ PreservedAnalyses SpecConstantsPass::run(Module &M,
     if (!F.isDeclaration())
       continue;
 
-    if (!F.getName().startswith(SYCL_GET_SCALAR_2020_SPEC_CONST_VAL) &&
-        !F.getName().startswith(SYCL_GET_COMPOSITE_2020_SPEC_CONST_VAL))
+    if (!F.getName().starts_with(SYCL_GET_SCALAR_2020_SPEC_CONST_VAL) &&
+        !F.getName().starts_with(SYCL_GET_COMPOSITE_2020_SPEC_CONST_VAL))
       continue;
 
     SmallVector<CallInst *, 32> SCIntrCalls;
@@ -1014,8 +1014,8 @@ bool SpecConstantsPass::collectSpecConstantDefaultValuesMetadata(
 
 bool llvm::checkModuleContainsSpecConsts(const Module &M) {
   for (const Function &F : M.functions()) {
-    if (F.getName().startswith(SYCL_GET_SCALAR_2020_SPEC_CONST_VAL) ||
-        F.getName().startswith(SYCL_GET_COMPOSITE_2020_SPEC_CONST_VAL))
+    if (F.getName().starts_with(SYCL_GET_SCALAR_2020_SPEC_CONST_VAL) ||
+        F.getName().starts_with(SYCL_GET_COMPOSITE_2020_SPEC_CONST_VAL))
       return true;
   }
 
diff --git a/llvm/tools/sycl-post-link/sycl-post-link.cpp b/llvm/tools/sycl-post-link/sycl-post-link.cpp
index 69f43b8b486ea..50cb225f6cb18 100644
--- a/llvm/tools/sycl-post-link/sycl-post-link.cpp
+++ b/llvm/tools/sycl-post-link/sycl-post-link.cpp
@@ -340,7 +340,7 @@ std::string makeResultFileName(Twine Ext, int I, StringRef Suffix) {
                              : sys::path::parent_path(OutputFilename);
   const StringRef Sep = sys::path::get_separator();
   std::string Dir = Dir0.str();
-  if (!Dir0.empty() && !Dir0.endswith(Sep))
+  if (!Dir0.empty() && !Dir0.ends_with(Sep))
     Dir += Sep.str();
   return Dir + sys::path::stem(OutputFilename).str() + Suffix.str() + "_" +
          std::to_string(I) + Ext.str();

From 0af4ac7d2f38286c6e6ab32fe8f0c4437cb81d30 Mon Sep 17 00:00:00 2001
From: Michael Toguchi <michael.d.toguchi@intel.com>
Date: Thu, 1 Feb 2024 08:54:29 -0800
Subject: [PATCH 12/14] [Driver] Allow for -O3 on Windows using clang-cl
 (#12504)

We currently support -O3 for Linux compilations, expand this to also be
available on Windows. This also better aligns with our existing product
offerings.
---
 clang/include/clang/Driver/Options.td |  3 +++
 clang/lib/Driver/ToolChains/MSVC.cpp  |  9 +++++++--
 clang/test/Driver/Xarch.c             | 12 ++++++------
 clang/test/Driver/cl-options.c        |  4 ++++
 4 files changed, 20 insertions(+), 8 deletions(-)

diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 2d0f05f6ca35b..e32047292ab8a 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -8404,6 +8404,9 @@ def : CLFlag<"O1">, Alias<_SLASH_O>, AliasArgs<["1"]>,
   HelpText<"Optimize for size  (like /Og     /Os /Oy /Ob2 /GF /Gy)">;
 def : CLFlag<"O2">, Alias<_SLASH_O>, AliasArgs<["2"]>,
   HelpText<"Optimize for speed (like /Og /Oi /Ot /Oy /Ob2 /GF /Gy)">;
+def : CLFlag<"O3">, Alias<_SLASH_O>, AliasArgs<["3"]>,
+  HelpText<"Optimize for maximum speed and enable more aggressive optimizations"
+  " that may not improve performance on some programs">;
 def : CLFlag<"Ob0">, Alias<_SLASH_O>, AliasArgs<["b0"]>,
   HelpText<"Disable function inlining">;
 def : CLFlag<"Ob1">, Alias<_SLASH_O>, AliasArgs<["b1"]>,
diff --git a/clang/lib/Driver/ToolChains/MSVC.cpp b/clang/lib/Driver/ToolChains/MSVC.cpp
index c1b912db596e8..52f811621ba82 100644
--- a/clang/lib/Driver/ToolChains/MSVC.cpp
+++ b/clang/lib/Driver/ToolChains/MSVC.cpp
@@ -930,6 +930,7 @@ static void TranslateOptArg(Arg *A, llvm::opt::DerivedArgList &DAL,
       break;
     case '1':
     case '2':
+    case '3':
     case 'x':
     case 'd':
       // Ignore /O[12xd] flags that aren't the last one on the command line.
@@ -946,11 +947,14 @@ static void TranslateOptArg(Arg *A, llvm::opt::DerivedArgList &DAL,
         } else if (OptChar == '2' || OptChar == 'x') {
           DAL.AddFlagArg(A, Opts.getOption(options::OPT_fbuiltin));
           DAL.AddJoinedArg(A, Opts.getOption(options::OPT_O), "2");
+        } else if (OptChar == '3') {
+          DAL.AddFlagArg(A, Opts.getOption(options::OPT_fbuiltin));
+          DAL.AddJoinedArg(A, Opts.getOption(options::OPT_O), "3");
         }
         if (SupportsForcingFramePointer &&
             !DAL.hasArgNoClaim(options::OPT_fno_omit_frame_pointer))
           DAL.AddFlagArg(A, Opts.getOption(options::OPT_fomit_frame_pointer));
-        if (OptChar == '1' || OptChar == '2')
+        if (OptChar == '1' || OptChar == '2' || OptChar == '3')
           DAL.AddFlagArg(A, Opts.getOption(options::OPT_ffunction_sections));
       }
       break;
@@ -1070,7 +1074,8 @@ MSVCToolChain::TranslateArgs(const llvm::opt::DerivedArgList &Args,
         // OptChar does not expand; it's an argument to the previous char.
         continue;
       }
-      if (OptChar == '1' || OptChar == '2' || OptChar == 'x' || OptChar == 'd')
+      if (OptChar == '1' || OptChar == '2' || OptChar == 'x' ||
+          OptChar == 'd' || OptChar == '3')
         ExpandChar = OptStr.data() + I;
     }
   }
diff --git a/clang/test/Driver/Xarch.c b/clang/test/Driver/Xarch.c
index f7693fb689d58..ec58211d6fba0 100644
--- a/clang/test/Driver/Xarch.c
+++ b/clang/test/Driver/Xarch.c
@@ -1,10 +1,10 @@
-// RUN: %clang -target i386-apple-darwin11 -m32 -Xarch_i386 -O3 %s -S -### 2>&1 | FileCheck -check-prefix=O3ONCE %s
-// O3ONCE: "-O3"
-// O3ONCE-NOT: "-O3"
+// RUN: %clang -target i386-apple-darwin11 -m32 -Xarch_i386 -O5 %s -S -### 2>&1 | FileCheck -check-prefix=O5ONCE %s
+// O5ONCE: "-O5"
+// O5ONCE-NOT: "-O5"
 
-// RUN: %clang -target i386-apple-darwin11 -m64 -Xarch_i386 -O3 %s -S -### 2>&1 | FileCheck -check-prefix=O3NONE %s
-// O3NONE-NOT: "-O3"
-// O3NONE: argument unused during compilation: '-Xarch_i386 -O3'
+// RUN: %clang -target i386-apple-darwin11 -m64 -Xarch_i386 -O5 %s -S -### 2>&1 | FileCheck -check-prefix=O5NONE %s
+// O5NONE-NOT: "-O5"
+// O5NONE: argument unused during compilation: '-Xarch_i386 -O5'
 
 // RUN: not %clang -target i386-apple-darwin11 -m32 -Xarch_i386 -o -Xarch_i386 -S %s -S -Xarch_i386 -o 2>&1 | FileCheck -check-prefix=INVALID %s
 // INVALID: error: invalid Xarch argument: '-Xarch_i386 -o'
diff --git a/clang/test/Driver/cl-options.c b/clang/test/Driver/cl-options.c
index 5b6dfe308a76e..2f60637f7c010 100644
--- a/clang/test/Driver/cl-options.c
+++ b/clang/test/Driver/cl-options.c
@@ -207,6 +207,10 @@
 // RUN: %clang_cl --target=i686-pc-win32 -Werror -Wno-msvc-not-found /O2 /O2 -### -- %s 2>&1 | FileCheck -check-prefix=O2O2 %s
 // O2O2: "-O2"
 
+// RUN: %clang_cl --target=i686-pc-win32 -Werror -Wno-msvc-not-found /O3 -### -- %s 2>&1 | FileCheck -check-prefix=O3 %s
+// O3: -mframe-pointer=none
+// O3: -O3
+
 // RUN: %clang_cl /Zs -Werror /Oy -- %s 2>&1
 
 // RUN: %clang_cl --target=i686-pc-win32 -Werror -Wno-msvc-not-found /Oy- -### -- %s 2>&1 | FileCheck -check-prefix=Oy_ %s

From 4fdcb5835a3f8a25d0a519d4bb56a26bf5d6f318 Mon Sep 17 00:00:00 2001
From: Zahira Ammarguellat <zahira.ammarguellat@intel.com>
Date: Thu, 1 Feb 2024 09:07:59 -0800
Subject: [PATCH 13/14] [SYCL] Fix compiler crash. (#12324)

The compiler was crashing when the user requested fp-accuracy for the
functions in a call of the form f1(f2(f3 ...), where f1, f2 and f3 were
fpbuiltin but the innermost function didn't have an fpbuiltin. The
current builtinID was used instead of getting the builtinID from the
current function. that created a crash in the compiler.
This patch fixes the issue and renames the function
EmitFPBuiltinIndirectCall to MaybeEmitFPBuiltinofFD .
---
 clang/lib/CodeGen/CGBuiltin.cpp             | 23 ++----
 clang/lib/CodeGen/CGCall.cpp                |  5 +-
 clang/lib/CodeGen/CodeGenFunction.h         |  7 +-
 clang/test/CodeGen/fp-accuracy.c            | 87 ++++++++++++++++++++-
 sycl/test/check_device_code/fp-accuracy.cpp | 29 +++++++
 5 files changed, 127 insertions(+), 24 deletions(-)
 create mode 100644 sycl/test/check_device_code/fp-accuracy.cpp

diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index ca1752b24b062..0a90073cf50d0 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -23271,21 +23271,11 @@ static bool hasFuncNameRequestedFPAccuracy(StringRef Name,
   return (FuncMapIt != LangOpts.FPAccuracyFuncMap.end());
 }
 
-llvm::CallInst *CodeGenFunction::EmitFPBuiltinIndirectCall(
+llvm::CallInst *CodeGenFunction::MaybeEmitFPBuiltinofFD(
     llvm::FunctionType *IRFuncTy, const SmallVectorImpl<llvm::Value *> &IRArgs,
-    llvm::Value *FnPtr, const FunctionDecl *FD) {
-  llvm::Function *Func;
+    llvm::Value *FnPtr, StringRef Name, unsigned FDBuiltinID) {
   unsigned FPAccuracyIntrinsicID = 0;
-  StringRef Name;
-  if (CurrentBuiltinID == 0) {
-    // Even if the current function doesn't have a clang builtin, create
-    // an 'fpbuiltin-max-error' attribute for it; unless it's marked with
-    // an NoBuiltin attribute.
-    if (FD->hasAttr<NoBuiltinAttr>() ||
-        !FD->getNameInfo().getName().isIdentifier())
-      return nullptr;
-
-    Name = FD->getName();
+  if (FDBuiltinID == 0) {
     FPAccuracyIntrinsicID =
         llvm::StringSwitch<unsigned>(Name)
             .Case("fadd", llvm::Intrinsic::fpbuiltin_fadd)
@@ -23300,9 +23290,7 @@ llvm::CallInst *CodeGenFunction::EmitFPBuiltinIndirectCall(
   } else {
     // The function has a clang builtin. Create an attribute for it
     // only if it has an fpbuiltin intrinsic.
-    unsigned BuiltinID = getCurrentBuiltinID();
-    Name = CGM.getContext().BuiltinInfo.getName(BuiltinID);
-    switch (BuiltinID) {
+    switch (FDBuiltinID) {
     default:
       // If the function has a clang builtin but doesn't have an
       // fpbuiltin, it will be generated with no 'fpbuiltin-max-error'
@@ -23384,7 +23372,8 @@ llvm::CallInst *CodeGenFunction::EmitFPBuiltinIndirectCall(
   const LangOptions &LangOpts = getLangOpts();
   if (hasFuncNameRequestedFPAccuracy(Name, LangOpts) ||
       !LangOpts.FPAccuracyVal.empty()) {
-    Func = CGM.getIntrinsic(FPAccuracyIntrinsicID, IRArgs[0]->getType());
+    llvm::Function *Func =
+        CGM.getIntrinsic(FPAccuracyIntrinsicID, IRArgs[0]->getType());
     return CreateBuiltinCallWithAttr(*this, Name, Func, ArrayRef(IRArgs),
                                      FPAccuracyIntrinsicID);
   }
diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp
index d55023875f2fc..442059fb03789 100644
--- a/clang/lib/CodeGen/CGCall.cpp
+++ b/clang/lib/CodeGen/CGCall.cpp
@@ -5707,8 +5707,9 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo,
     if (!getLangOpts().FPAccuracyFuncMap.empty() ||
         !getLangOpts().FPAccuracyVal.empty()) {
       const auto *FD = dyn_cast_if_present<FunctionDecl>(TargetDecl);
-      if (FD) {
-        CI = EmitFPBuiltinIndirectCall(IRFuncTy, IRCallArgs, CalleePtr, FD);
+      if (FD && FD->getNameInfo().getName().isIdentifier()) {
+        CI = MaybeEmitFPBuiltinofFD(IRFuncTy, IRCallArgs, CalleePtr,
+                                    FD->getName(), FD->getBuiltinID());
         if (CI)
           return RValue::get(CI);
       }
diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h
index 3ddd05cb53d8e..04c3a715da205 100644
--- a/clang/lib/CodeGen/CodeGenFunction.h
+++ b/clang/lib/CodeGen/CodeGenFunction.h
@@ -4434,9 +4434,10 @@ class CodeGenFunction : public CodeGenTypeCache {
   RValue EmitIntelSYCLPtrAnnotationBuiltin(const CallExpr *E);
 
   llvm::CallInst *
-  EmitFPBuiltinIndirectCall(llvm::FunctionType *IRFuncTy,
-                            const SmallVectorImpl<llvm::Value *> &IRArgs,
-                            llvm::Value *FnPtr, const FunctionDecl *FD);
+  MaybeEmitFPBuiltinofFD(llvm::FunctionType *IRFuncTy,
+                         const SmallVectorImpl<llvm::Value *> &IRArgs,
+                         llvm::Value *FnPtr, StringRef Name,
+                         unsigned FDBuiltinID);
 
   enum class MSVCIntrin;
   llvm::Value *EmitMSVCBuiltinExpr(MSVCIntrin BuiltinID, const CallExpr *E);
diff --git a/clang/test/CodeGen/fp-accuracy.c b/clang/test/CodeGen/fp-accuracy.c
index 74de6a1f72c80..3fdde4443b8c8 100644
--- a/clang/test/CodeGen/fp-accuracy.c
+++ b/clang/test/CodeGen/fp-accuracy.c
@@ -177,7 +177,7 @@ double rsqrt(double);
 // CHECK-F3: call double @llvm.fpbuiltin.atanh.f64(double {{.*}}) #[[ATTR_F3_HIGH]]
 // CHECK-F3: call double @llvm.fpbuiltin.cos.f64(double {{.*}}) #[[ATTR_F3_HIGH]]
 // CHECK-F3: call double @llvm.fpbuiltin.cosh.f64(double {{.*}}) #[[ATTR_F3_HIGH]]
-// CHECk-F3: call double @llvm.fpbuiltin.erf.f64(double {{.*}}) #[[ATTR_F3_HIGH]]
+// CHECK-F3: call double @llvm.fpbuiltin.erf.f64(double {{.*}}) #[[ATTR_F3_HIGH]]
 // CHECK-F3: call double @llvm.fpbuiltin.erfc.f64(double {{.*}}) #[[ATTR_F3_HIGH]]
 // CHECK-F3: call double @llvm.fpbuiltin.exp.f64(double {{.*}}) #[[ATTR_F3_HIGH]]
 // CHECK-F3: call double @llvm.fpbuiltin.exp10.f64(double {{.*}}) #[[ATTR_F3_HIGH]]
@@ -203,6 +203,19 @@ double rsqrt(double);
 // CHECK-F3: call double @llvm.fpbuiltin.tan.f64(double {{.*}}) #[[ATTR_F3_LOW:[0-9]+]]
 // CHECK-F3: call double @llvm.fpbuiltin.tanh.f64(double {{.*}}) #[[ATTR_F3_HIGH]]
 
+// CHECK-F3-LABEL: define dso_local void @f2
+// CHECK-F3: call float @llvm.fpbuiltin.cos.f32(float {{.*}}) #[[ATTR_F3_HIGH]]
+// CHECK-F3: call float @llvm.fpbuiltin.sin.f32(float {{.*}}) #[[ATTR_F3_HIGH]]
+// CHECK-F3: call double @llvm.fpbuiltin.tan.f64(double {{.*}}) #[[ATTR_F3_LOW]]
+// CHECK-F3: call double @llvm.fpbuiltin.log10.f64(double {{.*}}) #[[ATTR_F3_MEDIUM]]
+// CHECK-F3: call void @llvm.fpbuiltin.sincos.f64(double {{.*}}, ptr {{.*}}, ptr {{.*}}) #[[ATTR_F3_MEDIUM]]
+
+// CHECK-F3-LABEL: define dso_local float @fake_exp10
+
+// CHECK-F3-LABEL: define dso_local void @f4
+// CHECK-F3: call double @llvm.fpbuiltin.exp.f64(double {{.*}}) #[[ATTR_F3_HIGH]]
+// CHECK-F3: call double @llvm.fpbuiltin.cos.f64(double {{.*}}) #[[ATTR_F3_HIGH]]
+
 // CHECK-F3: attributes #[[ATTR_F3_HIGH]] = {{.*}}"fpbuiltin-max-error"="1.0"
 // CHECK-F3: attributes #[[ATTR_F3_MEDIUM]] = {{.*}}"fpbuiltin-max-error"="4.0"
 // CHECK-F3: attributes #[[ATTR_F3_LOW]] = {{.*}}"fpbuiltin-max-error"="67108864.0"
@@ -414,7 +427,7 @@ void f1(float a, float b) {
 // CHECK-F2: call void @llvm.fpbuiltin.sincos.f64(double {{.*}}, ptr {{.*}}, ptr {{.*}}) #[[ATTR_F2_MEDIUM]]
 // CHECK-F2: call float @tanf(float {{.*}})
 //
-// CHECK-LABEL-F4: define dso_local void @f2
+// CHECK-F4-LABEL: define dso_local void @f2
 // CHECK-F4: call float @llvm.fpbuiltin.cos.f32(float {{.*}}) #[[ATTR_F4_MEDIUM]]
 // CHECK-F4: call float @llvm.fpbuiltin.sin.f32(float {{.*}}) #[[ATTR_F4_MEDIUM]]
 // CHECK-F4: call double @llvm.fpbuiltin.tan.f64(double {{.*}}) #[[ATTR_F4_MEDIUM]]
@@ -422,6 +435,17 @@ void f1(float a, float b) {
 // CHECK-F4: call void @llvm.fpbuiltin.sincos.f64(double {{.*}}, ptr {{.*}}, ptr {{.*}}) #[[ATTR_F4_MEDIUM]]
 // CHECK-F4: call float @tanf(float {{.*}})
 //
+// CHECK-F4-LABEL: define dso_local float @fake_exp10
+
+// CHECK-F4-LABEL: define dso_local void @f3
+// CHECK-F4: call float @fake_exp10(float {{.*}})
+
+// CHECK-F4-LABEL: define dso_local void @f4
+// CHECK-F4: call double @llvm.fpbuiltin.exp.f64(double {{.*}}) #[[ATTR_F4_MEDIUM]]
+// CHECK-F4: call double @llvm.fpbuiltin.cos.f64(double {{.*}}) #[[ATTR_F4_MEDIUM]]
+
+// CHECK-F4: attributes #[[ATTR_F4_MEDIUM]] = {{.*}}"fpbuiltin-max-error"="4.0"
+
 // CHECK-F5-LABEL: define dso_local void @f2
 // CHECK-F5: call float @llvm.cos.f32(float {{.*}})
 // CHECK-F5: call float @llvm.sin.f32(float {{.*}})
@@ -430,6 +454,15 @@ void f1(float a, float b) {
 // CHECK-F5: call i32 (double, ptr, ptr, ...) @sincos(double {{.*}}, ptr {{.*}}, ptr {{.*}})
 // CHECK-F5: call float @tanf(float {{.*}})
 //
+// CHECK-F5-LABEL: define dso_local float @fake_exp10
+
+// CHECK-F5-LABEL: define dso_local void @f3
+// CHECK-F5: call float @fake_exp10(float {{.*}})
+
+// CHECK-F5-LABEL: define dso_local void @f4
+// CHECK-F5: call double @llvm.exp.f64(double {{.*}})
+// CHECK-F5: call double @llvm.fpbuiltin.cos.f64(double {{.*}}) #[[ATTR_F5_MEDIUM]]
+
 // CHECK-F5: attributes #[[ATTR_F5_MEDIUM]] = {{.*}}"fpbuiltin-max-error"="4.0"
 // CHECK-F5: attributes #[[ATTR_F5_HIGH]] = {{.*}}"fpbuiltin-max-error"="1.0"
 //
@@ -441,6 +474,15 @@ void f1(float a, float b) {
 // CHECK-F6: call void @llvm.fpbuiltin.sincos.f64(double {{.*}}, ptr {{.*}}, ptr {{.*}}) #[[ATTR_F6_MEDIUM]]
 // CHECK-F6: call float @tanf(float {{.*}}) #[[ATTR8:[0-9]+]]
 //
+// CHECK-F6-LABEL: define dso_local float @fake_exp10
+//
+// CHECK-F6-LABEL: define dso_local void @f3
+// CHECK-F6: call float @fake_exp10(float {{.*}})
+
+// CHECK-F6-LABEL: define dso_local void @f4
+// CHECK-F6: call double @llvm.fpbuiltin.exp.f64(double {{.*}}) #[[ATTR_F6_MEDIUM]]
+// CHECK-F6: call double @llvm.fpbuiltin.cos.f64(double {{.*}}) #[[ATTR_F6_MEDIUM]]
+
 // CHECK-F6: attributes #[[ATTR_F6_MEDIUM]] = {{.*}}"fpbuiltin-max-error"="4.0"
 // CHECK-F6: attributes #[[ATTR_F6_HIGH]] = {{.*}}"fpbuiltin-max-error"="1.0"
 //
@@ -454,12 +496,36 @@ void f1(float a, float b) {
 
 // CHECK-LABEL: define dso_local void @f3
 // CHECK: call float @fake_exp10(float {{.*}})
+
+// CHECK-LABEL: define dso_local void @f4
+// CHECK: call double @llvm.fpbuiltin.exp.f64(double {{.*}}) #[[ATTR_HIGH]]
+// CHECK: call double @llvm.fpbuiltin.cos.f64(double {{.*}}) #[[ATTR_HIGH]]
+
+// CHECK-F1-LABEL: define dso_local void @f3
 // CHECK-F1: call float @fake_exp10(float {{.*}})
+
+// CHECK-F1-LABEL: define dso_local void @f4
+// CHECK-F1: call double @llvm.exp.f64(double {{.*}})
+// CHECK-F1: call double @llvm.fpbuiltin.cos.f64(double {{.*}}) #[[ATTR_F1_HIGH]]
+
+// CHECK-F2-LABEL: define dso_local float @fake_exp10
+
+// CHECK-F2-LABEL: define dso_local void @f3
 // CHECK-F2: call float @fake_exp10(float {{.*}})
 
+// CHECK-F2-LABEL: define dso_local void @f4
+// CHECK-F2: call double @llvm.fpbuiltin.exp.f64(double {{.*}}) #[[ATTR_F2_MEDIUM]]
+// CHECK-F2: call double @llvm.fpbuiltin.cos.f64(double {{.*}})  #[[ATTR_F2_CUDA]]
+
+// CHECK-SPIR-LABEL: define dso_local spir_func float @fake_exp10
+//
 // CHECK-SPIR-LABEL: define dso_local spir_func void @f3
 // CHECK-SPIR: call spir_func float @fake_exp10(float {{.*}})
 
+// CHECK-SPIR-LABEL: define dso_local spir_func void @f4
+// CHECK-SPIR: call double @llvm.fpbuiltin.exp.f64(double {{.*}}) #[[ATTR_SYCL5]]
+// CHECK-SPIR: call double @llvm.fpbuiltin.cos.f64(double {{.*}}) #[[ATTR_SYCL1]]
+
 // CHECK: attributes #[[ATTR_HIGH]] = {{.*}}"fpbuiltin-max-error"="1.0"
 
 // CHECK-F1: attributes #[[ATTR_F1_HIGH]] = {{.*}}"fpbuiltin-max-error"="1.0"
@@ -526,6 +592,10 @@ void f1(float a, float b) {
 // CHECK-DEFAULT-LABEL: define dso_local void @f3
 // CHECK-DEFAULT: call float @fake_exp10(float {{.*}})
 
+// CHECK-DEFAULT-LABEL: define dso_local void @f4
+// CHECK-DEFAULT: call double @llvm.exp.f64(double {{.*}})
+// CHECK-DEFAULT: call double @llvm.cos.f64(double {{.*}})
+
 void f2(float a, float b) {
   float sin = 0.f, cos = 0.f;
 
@@ -541,3 +611,16 @@ float fake_exp10(float a) __attribute__((no_builtin)){}
 void f3(float a, float b) {
   a = fake_exp10(b);
 }
+
+#define sz 2
+double in[sz];
+double out[sz];
+
+double getInput(int i) {
+  return in[i];
+}
+
+void f4() {
+  for (int i = 0; i < sz; i++)
+    out[i] = cos(exp(getInput(i)));
+}
diff --git a/sycl/test/check_device_code/fp-accuracy.cpp b/sycl/test/check_device_code/fp-accuracy.cpp
new file mode 100644
index 0000000000000..f5a42c2bbc436
--- /dev/null
+++ b/sycl/test/check_device_code/fp-accuracy.cpp
@@ -0,0 +1,29 @@
+// DEFINE: %{common_opts} = -fsycl -fsycl-device-only -fno-math-errno \
+// DEFINE: -ffp-accuracy=high -S -emit-llvm -o - %s
+
+// RUN: %clangxx %{common_opts} | FileCheck %s
+
+// RUN: %clangxx %{common_opts} -ffp-accuracy=low:exp \
+// RUN: | FileCheck %s --check-prefix=CHECK-LOW-EXP
+
+#include <sycl/sycl.hpp>
+
+SYCL_EXTERNAL auto foo(double x) {
+  using namespace sycl;
+  return cos(exp(log(x)));
+}
+
+// CHECK-LABEL: define {{.*}}food
+// CHECK: tail call double @llvm.fpbuiltin.log.f64(double {{.*}}) #[[ATTR_HIGH:[0-9]+]]
+// CHECK: tail call double @llvm.fpbuiltin.exp.f64(double {{.*}}) #[[ATTR_HIGH]]
+// CHECK: tail call double @llvm.fpbuiltin.cos.f64(double {{.*}}) #[[ATTR_HIGH]]
+
+// CHECK: attributes #[[ATTR_HIGH]] = {{.*}}"fpbuiltin-max-error"="1.0"
+
+// CHECK-LOW-EXP-LABEL: define {{.*}}food
+// CHECK-LOW-EXP: tail call double @llvm.fpbuiltin.log.f64(double {{.*}}) #[[ATTR_F1_HIGH:[0-9]+]]
+// CHECK-LOW-EXP: tail call double @llvm.fpbuiltin.exp.f64(double {{.*}}) #[[ATTR_F1_LOW:[0-9]+]]
+// CHECK-LOW-EXP: tail call double @llvm.fpbuiltin.cos.f64(double {{.*}}) #[[ATTR_F1_HIGH]]
+
+// CHECK-F1: attributes #[[ATTR_F1_HIGH]] = {{.*}}"fpbuiltin-max-error"="1.0"
+// CHECK-F1: attributes #[[ATTR_F1_LOW]] = {{.*}}"fpbuiltin-max-error"="67108864.0"

From 8427bd224b3db2fa7df6138156620fec4b5ed323 Mon Sep 17 00:00:00 2001
From: Hugh Delaney <46290137+hdelan@users.noreply.github.com>
Date: Thu, 1 Feb 2024 17:12:12 +0000
Subject: [PATCH 14/14] [SYCL][HIP][CUDA] Use new version of
 piMemGetNativeHandle and add test (#12297)

We want to change the signature of `piMemGetNativeHandle` for reasons
explained here https://github.com/oneapi-src/unified-runtime/pull/1199

Corresponding UR PR:
https://github.com/oneapi-src/unified-runtime/pull/1226

A previous PR added a new entry point
https://github.com/intel/llvm/pull/12199 but it was decided that it is
better to modify the existing entry point
---
 sycl/include/sycl/detail/pi.h                 |   9 +-
 sycl/plugins/cuda/pi_cuda.cpp                 |   5 +-
 sycl/plugins/hip/pi_hip.cpp                   |   5 +-
 sycl/plugins/level_zero/pi_level_zero.cpp     |   5 +-
 sycl/plugins/native_cpu/pi_native_cpu.cpp     |   5 +-
 sycl/plugins/opencl/pi_opencl.cpp             |   5 +-
 sycl/plugins/unified_runtime/CMakeLists.txt   |  14 +-
 sycl/plugins/unified_runtime/pi2ur.hpp        |   5 +-
 .../unified_runtime/pi_unified_runtime.cpp    |   6 +-
 sycl/source/detail/buffer_impl.cpp            |   6 +-
 sycl/source/detail/memory_manager.cpp         |  12 +-
 sycl/source/interop_handle.cpp                |   4 +-
 .../HostInteropTask/interop-task-hip.cpp      | 136 ++++++++++++++++++
 sycl/unittests/helpers/PiMockPlugin.hpp       |   2 +-
 14 files changed, 188 insertions(+), 31 deletions(-)
 create mode 100644 sycl/test-e2e/HostInteropTask/interop-task-hip.cpp

diff --git a/sycl/include/sycl/detail/pi.h b/sycl/include/sycl/detail/pi.h
index 4d0da9995908a..31c6a86f3300d 100644
--- a/sycl/include/sycl/detail/pi.h
+++ b/sycl/include/sycl/detail/pi.h
@@ -149,9 +149,11 @@
 // 14.40 Add HIP _pi_mem_advice alises to match the PI_MEM_ADVICE_CUDA* ones.
 // 14.41 Added piextCommandBufferMemBufferFill & piextCommandBufferFillUSM
 // 14.42 Added piextCommandBufferPrefetchUSM and piextCommandBufferAdviseUSM
+// 15.43 Changed the signature of piextMemGetNativeHandle to also take a
+// pi_device
 
-#define _PI_H_VERSION_MAJOR 14
-#define _PI_H_VERSION_MINOR 42
+#define _PI_H_VERSION_MAJOR 15
+#define _PI_H_VERSION_MINOR 43
 
 #define _PI_STRING_HELPER(a) #a
 #define _PI_CONCAT(a, b) _PI_STRING_HELPER(a.b)
@@ -1424,8 +1426,9 @@ __SYCL_EXPORT pi_result piMemBufferPartition(
 /// Gets the native handle of a PI mem object.
 ///
 /// \param mem is the PI mem to get the native handle of.
+/// \param dev is the PI device that the native allocation will be resident on
 /// \param nativeHandle is the native handle of mem.
-__SYCL_EXPORT pi_result piextMemGetNativeHandle(pi_mem mem,
+__SYCL_EXPORT pi_result piextMemGetNativeHandle(pi_mem mem, pi_device dev,
                                                 pi_native_handle *nativeHandle);
 
 /// Creates PI mem object from a native handle.
diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp
index 6d50d120aa2f9..de715de0835fd 100644
--- a/sycl/plugins/cuda/pi_cuda.cpp
+++ b/sycl/plugins/cuda/pi_cuda.cpp
@@ -233,8 +233,9 @@ pi_result piMemImageCreate(pi_context Context, pi_mem_flags Flags,
                                  HostPtr, RetImage);
 }
 
-pi_result piextMemGetNativeHandle(pi_mem Mem, pi_native_handle *NativeHandle) {
-  return pi2ur::piextMemGetNativeHandle(Mem, NativeHandle);
+pi_result piextMemGetNativeHandle(pi_mem Mem, pi_device Dev,
+                                  pi_native_handle *NativeHandle) {
+  return pi2ur::piextMemGetNativeHandle(Mem, Dev, NativeHandle);
 }
 
 pi_result piextMemCreateWithNativeHandle(pi_native_handle NativeHandle,
diff --git a/sycl/plugins/hip/pi_hip.cpp b/sycl/plugins/hip/pi_hip.cpp
index 266c72a3b3587..126ada92348f6 100644
--- a/sycl/plugins/hip/pi_hip.cpp
+++ b/sycl/plugins/hip/pi_hip.cpp
@@ -236,8 +236,9 @@ pi_result piMemImageCreate(pi_context Context, pi_mem_flags Flags,
                                  HostPtr, RetImage);
 }
 
-pi_result piextMemGetNativeHandle(pi_mem Mem, pi_native_handle *NativeHandle) {
-  return pi2ur::piextMemGetNativeHandle(Mem, NativeHandle);
+pi_result piextMemGetNativeHandle(pi_mem Mem, pi_device Dev,
+                                  pi_native_handle *NativeHandle) {
+  return pi2ur::piextMemGetNativeHandle(Mem, Dev, NativeHandle);
 }
 
 pi_result piextMemCreateWithNativeHandle(pi_native_handle NativeHandle,
diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp
index 79e047850af88..0fc36a231be6c 100644
--- a/sycl/plugins/level_zero/pi_level_zero.cpp
+++ b/sycl/plugins/level_zero/pi_level_zero.cpp
@@ -243,8 +243,9 @@ pi_result piMemImageCreate(pi_context Context, pi_mem_flags Flags,
                                  HostPtr, RetImage);
 }
 
-pi_result piextMemGetNativeHandle(pi_mem Mem, pi_native_handle *NativeHandle) {
-  return pi2ur::piextMemGetNativeHandle(Mem, NativeHandle);
+pi_result piextMemGetNativeHandle(pi_mem Mem, pi_device Dev,
+                                  pi_native_handle *NativeHandle) {
+  return pi2ur::piextMemGetNativeHandle(Mem, Dev, NativeHandle);
 }
 
 pi_result piextMemCreateWithNativeHandle(pi_native_handle NativeHandle,
diff --git a/sycl/plugins/native_cpu/pi_native_cpu.cpp b/sycl/plugins/native_cpu/pi_native_cpu.cpp
index c7e71f9791d35..48ce104a94e90 100644
--- a/sycl/plugins/native_cpu/pi_native_cpu.cpp
+++ b/sycl/plugins/native_cpu/pi_native_cpu.cpp
@@ -240,8 +240,9 @@ pi_result piMemImageCreate(pi_context Context, pi_mem_flags Flags,
                                  HostPtr, RetImage);
 }
 
-pi_result piextMemGetNativeHandle(pi_mem Mem, pi_native_handle *NativeHandle) {
-  return pi2ur::piextMemGetNativeHandle(Mem, NativeHandle);
+pi_result piextMemGetNativeHandle(pi_mem Mem, pi_device Dev,
+                                  pi_native_handle *NativeHandle) {
+  return pi2ur::piextMemGetNativeHandle(Mem, Dev, NativeHandle);
 }
 
 pi_result piextMemCreateWithNativeHandle(pi_native_handle NativeHandle,
diff --git a/sycl/plugins/opencl/pi_opencl.cpp b/sycl/plugins/opencl/pi_opencl.cpp
index 3e7f3aea4dfed..c09be92f89406 100644
--- a/sycl/plugins/opencl/pi_opencl.cpp
+++ b/sycl/plugins/opencl/pi_opencl.cpp
@@ -222,8 +222,9 @@ pi_result piMemImageCreate(pi_context Context, pi_mem_flags Flags,
                                  HostPtr, RetImage);
 }
 
-pi_result piextMemGetNativeHandle(pi_mem Mem, pi_native_handle *NativeHandle) {
-  return pi2ur::piextMemGetNativeHandle(Mem, NativeHandle);
+pi_result piextMemGetNativeHandle(pi_mem Mem, pi_device Dev,
+                                  pi_native_handle *NativeHandle) {
+  return pi2ur::piextMemGetNativeHandle(Mem, Dev, NativeHandle);
 }
 
 pi_result piextMemCreateWithNativeHandle(pi_native_handle NativeHandle,
diff --git a/sycl/plugins/unified_runtime/CMakeLists.txt b/sycl/plugins/unified_runtime/CMakeLists.txt
index af56733f4b72f..c990359e39e96 100644
--- a/sycl/plugins/unified_runtime/CMakeLists.txt
+++ b/sycl/plugins/unified_runtime/CMakeLists.txt
@@ -56,14 +56,14 @@ endif()
 if(SYCL_PI_UR_USE_FETCH_CONTENT)
   include(FetchContent)
 
-  set(UNIFIED_RUNTIME_REPO "https://github.com/oneapi-src/unified-runtime")
-  # commit 3225b822b5d8cbfa85d7fc1bd5a5bf96e5bb8c1a
-  # Merge: edb281f3 5fc41099
+  set(UNIFIED_RUNTIME_REPO "https://github.com/oneapi-src/unified-runtime.git")
+  # commit d216eb44d5c9fe3433eecdd09b10e3e79ac25bd7
+  # Merge: 40517d2b fc1f3066
   # Author: Kenneth Benzie (Benie) <k.benzie@codeplay.com>
-  # Date:   Tue Jan 30 12:31:44 2024 +0000
-  #     Merge pull request #1168 from Seanst98/sean/unique-addr-mode-per-dim-adapters
-  #     [Bindless][CUDA] Unique addressing modes per dimension
-  set(UNIFIED_RUNTIME_TAG 3225b822b5d8cbfa85d7fc1bd5a5bf96e5bb8c1a)
+  # Date:   Wed Jan 31 10:38:07 2024 +0000
+  #     Merge pull request #1226 from hdelan/get-native-mem-on-device2
+  #     [UR] Add extra param to urMemGetNativeHandle
+  set(UNIFIED_RUNTIME_TAG d216eb44d5c9fe3433eecdd09b10e3e79ac25bd7)
 
   if(SYCL_PI_UR_OVERRIDE_FETCH_CONTENT_REPO)
     set(UNIFIED_RUNTIME_REPO "${SYCL_PI_UR_OVERRIDE_FETCH_CONTENT_REPO}")
diff --git a/sycl/plugins/unified_runtime/pi2ur.hpp b/sycl/plugins/unified_runtime/pi2ur.hpp
index df841b786bfb1..c19c93a6af53a 100644
--- a/sycl/plugins/unified_runtime/pi2ur.hpp
+++ b/sycl/plugins/unified_runtime/pi2ur.hpp
@@ -3081,13 +3081,14 @@ inline pi_result piMemBufferPartition(pi_mem Buffer, pi_mem_flags Flags,
   return PI_SUCCESS;
 }
 
-inline pi_result piextMemGetNativeHandle(pi_mem Mem,
+inline pi_result piextMemGetNativeHandle(pi_mem Mem, pi_device Dev,
                                          pi_native_handle *NativeHandle) {
   PI_ASSERT(Mem, PI_ERROR_INVALID_MEM_OBJECT);
 
   ur_mem_handle_t UrMem = reinterpret_cast<ur_mem_handle_t>(Mem);
+  ur_device_handle_t UrDev = reinterpret_cast<ur_device_handle_t>(Dev);
   ur_native_handle_t NativeMem{};
-  HANDLE_ERRORS(urMemGetNativeHandle(UrMem, &NativeMem));
+  HANDLE_ERRORS(urMemGetNativeHandle(UrMem, UrDev, &NativeMem));
 
   *NativeHandle = reinterpret_cast<pi_native_handle>(NativeMem);
 
diff --git a/sycl/plugins/unified_runtime/pi_unified_runtime.cpp b/sycl/plugins/unified_runtime/pi_unified_runtime.cpp
index adbeb652bf613..b9742b8697fa8 100644
--- a/sycl/plugins/unified_runtime/pi_unified_runtime.cpp
+++ b/sycl/plugins/unified_runtime/pi_unified_runtime.cpp
@@ -235,9 +235,9 @@ __SYCL_EXPORT pi_result piMemBufferPartition(
                                      BufferCreateInfo, RetMem);
 }
 
-__SYCL_EXPORT pi_result
-piextMemGetNativeHandle(pi_mem Mem, pi_native_handle *NativeHandle) {
-  return pi2ur::piextMemGetNativeHandle(Mem, NativeHandle);
+__SYCL_EXPORT pi_result piextMemGetNativeHandle(
+    pi_mem Mem, pi_device Dev, pi_native_handle *NativeHandle) {
+  return pi2ur::piextMemGetNativeHandle(Mem, Dev, NativeHandle);
 }
 
 __SYCL_EXPORT pi_result
diff --git a/sycl/source/detail/buffer_impl.cpp b/sycl/source/detail/buffer_impl.cpp
index 04c055465a9cf..835c732a40bf9 100644
--- a/sycl/source/detail/buffer_impl.cpp
+++ b/sycl/source/detail/buffer_impl.cpp
@@ -84,7 +84,11 @@ buffer_impl::getNativeVector(backend BackendName) const {
     }
 
     pi_native_handle Handle;
-    Plugin->call<PiApiKind::piextMemGetNativeHandle>(NativeMem, &Handle);
+    // When doing buffer interop we don't know what device the memory should be
+    // resident on, so pass nullptr for Device param. Buffer interop may not be
+    // supported by all backends.
+    Plugin->call<PiApiKind::piextMemGetNativeHandle>(NativeMem, /*Dev*/ nullptr,
+                                                     &Handle);
     Handles.push_back(Handle);
   }
 
diff --git a/sycl/source/detail/memory_manager.cpp b/sycl/source/detail/memory_manager.cpp
index 0daa53587ed4d..e59fb94a09f65 100644
--- a/sycl/source/detail/memory_manager.cpp
+++ b/sycl/source/detail/memory_manager.cpp
@@ -144,7 +144,11 @@ void memBufferCreateHelper(const PluginPtr &Plugin, pi_context Ctx,
       // Always use call_nocheck here, because call may throw an exception,
       // and this lambda will be called from destructor, which in combination
       // rewards us with UB.
-      Plugin->call_nocheck<PiApiKind::piextMemGetNativeHandle>(*RetMem, &Ptr);
+      // When doing buffer interop we don't know what device the memory should
+      // be resident on, so pass nullptr for Device param. Buffer interop may
+      // not be supported by all backends.
+      Plugin->call_nocheck<PiApiKind::piextMemGetNativeHandle>(
+          *RetMem, /*Dev*/ nullptr, &Ptr);
       emitMemAllocEndTrace(MemObjID, (uintptr_t)(Ptr), Size, 0 /* guard zone */,
                            CorrID);
     }};
@@ -167,7 +171,11 @@ void memReleaseHelper(const PluginPtr &Plugin, pi_mem Mem) {
   // Do not make unnecessary PI calls without instrumentation enabled
   if (xptiTraceEnabled()) {
     pi_native_handle PtrHandle = 0;
-    Plugin->call<PiApiKind::piextMemGetNativeHandle>(Mem, &PtrHandle);
+    // When doing buffer interop we don't know what device the memory should be
+    // resident on, so pass nullptr for Device param. Buffer interop may not be
+    // supported by all backends.
+    Plugin->call<PiApiKind::piextMemGetNativeHandle>(Mem, /*Dev*/ nullptr,
+                                                     &PtrHandle);
     Ptr = (uintptr_t)(PtrHandle);
   }
 #endif
diff --git a/sycl/source/interop_handle.cpp b/sycl/source/interop_handle.cpp
index 0b0ab39199370..cd479493bbae3 100644
--- a/sycl/source/interop_handle.cpp
+++ b/sycl/source/interop_handle.cpp
@@ -34,8 +34,8 @@ pi_native_handle interop_handle::getNativeMem(detail::Requirement *Req) const {
 
   auto Plugin = MQueue->getPlugin();
   pi_native_handle Handle;
-  Plugin->call<detail::PiApiKind::piextMemGetNativeHandle>(Iter->second,
-                                                           &Handle);
+  Plugin->call<detail::PiApiKind::piextMemGetNativeHandle>(
+      Iter->second, MDevice->getHandleRef(), &Handle);
   return Handle;
 }
 
diff --git a/sycl/test-e2e/HostInteropTask/interop-task-hip.cpp b/sycl/test-e2e/HostInteropTask/interop-task-hip.cpp
new file mode 100644
index 0000000000000..1f98b2da24c33
--- /dev/null
+++ b/sycl/test-e2e/HostInteropTask/interop-task-hip.cpp
@@ -0,0 +1,136 @@
+// FIXME: the rocm include path and link path are highly platform dependent,
+// we should set this with some variable instead.
+// RUN: %{build} -o %t.out -I/opt/rocm/include -L/opt/rocm/lib -lamdhip64
+// RUN: %{run} %t.out
+// REQUIRES: hip
+
+#include <iostream>
+#include <sycl/sycl.hpp>
+
+#define __HIP_PLATFORM_AMD__
+
+#include <hip/hip_runtime.h>
+
+using namespace sycl;
+using namespace sycl::access;
+
+static constexpr size_t BUFFER_SIZE = 1024;
+
+template <typename T> class Modifier;
+
+template <typename T> class Init;
+
+template <typename BufferT, typename ValueT>
+void checkBufferValues(BufferT Buffer, ValueT Value) {
+  auto Acc = Buffer.get_host_access();
+  for (size_t Idx = 0; Idx < Acc.get_count(); ++Idx) {
+    if (Acc[Idx] != Value) {
+      std::cerr << "buffer[" << Idx << "] = " << Acc[Idx]
+                << ", expected val = " << Value << '\n';
+      exit(1);
+    }
+  }
+}
+
+template <typename DataT>
+void copy(buffer<DataT, 1> &Src, buffer<DataT, 1> &Dst, queue &Q) {
+  Q.submit([&](handler &CGH) {
+    auto SrcA = Src.template get_access<mode::read>(CGH);
+    auto DstA = Dst.template get_access<mode::write>(CGH);
+
+    auto Func = [=](interop_handle IH) {
+      auto HipStream = IH.get_native_queue<backend::ext_oneapi_hip>();
+      auto SrcMem = IH.get_native_mem<backend::ext_oneapi_hip>(SrcA);
+      auto DstMem = IH.get_native_mem<backend::ext_oneapi_hip>(DstA);
+
+      if (hipMemcpyWithStream(DstMem, SrcMem, sizeof(DataT) * SrcA.get_count(),
+                              hipMemcpyDefault, HipStream) != hipSuccess) {
+        throw;
+      }
+
+      if (hipStreamSynchronize(HipStream) != hipSuccess) {
+        throw;
+      }
+
+      if (Q.get_backend() != IH.get_backend())
+        throw;
+    };
+    CGH.host_task(Func);
+  });
+}
+
+template <typename DataT> void modify(buffer<DataT, 1> &B, queue &Q) {
+  Q.submit([&](handler &CGH) {
+    auto Acc = B.template get_access<mode::read_write>(CGH);
+
+    auto Kernel = [=](item<1> Id) { Acc[Id] += 1; };
+
+    CGH.parallel_for<Modifier<DataT>>(Acc.get_count(), Kernel);
+  });
+}
+
+template <typename DataT, DataT B1Init, DataT B2Init>
+void init(buffer<DataT, 1> &B1, buffer<DataT, 1> &B2, queue &Q) {
+  Q.submit([&](handler &CGH) {
+    auto Acc1 = B1.template get_access<mode::write>(CGH);
+    auto Acc2 = B2.template get_access<mode::write>(CGH);
+
+    CGH.parallel_for<Init<DataT>>(BUFFER_SIZE, [=](item<1> Id) {
+      Acc1[Id] = B1Init;
+      Acc2[Id] = B2Init;
+    });
+  });
+}
+
+// Check that a single host-interop-task with a buffer will work.
+void test_ht_buffer(queue &Q) {
+  buffer<int, 1> Buffer{BUFFER_SIZE};
+
+  Q.submit([&](handler &CGH) {
+    auto Acc = Buffer.get_access<mode::write>(CGH);
+    auto Func = [=](interop_handle IH) { /*A no-op */ };
+    CGH.host_task(Func);
+  });
+}
+
+// A test that uses HIP interop to copy data from buffer A to buffer B, by
+// getting HIP ptrs and calling the hipMemcpyWithStream. Then run a SYCL
+// kernel that modifies the data in place for B, e.g. increment one, then copy
+// back to buffer A. Run it on a loop, to ensure the dependencies and the
+// reference counting of the objects is not leaked.
+void test_ht_kernel_dependencies(queue &Q) {
+  static constexpr int COUNT = 4;
+  buffer<int, 1> Buffer1{BUFFER_SIZE};
+  buffer<int, 1> Buffer2{BUFFER_SIZE};
+
+  // Init the buffer with a'priori invalid data.
+  init<int, -1, -2>(Buffer1, Buffer2, Q);
+
+  // Repeat a couple of times.
+  for (size_t Idx = 0; Idx < COUNT; ++Idx) {
+    copy(Buffer1, Buffer2, Q);
+    modify(Buffer2, Q);
+    copy(Buffer2, Buffer1, Q);
+  }
+
+  checkBufferValues(Buffer1, COUNT - 1);
+  checkBufferValues(Buffer2, COUNT - 1);
+}
+
+void tests(queue &Q) {
+  test_ht_buffer(Q);
+  test_ht_kernel_dependencies(Q);
+}
+
+int main() {
+  queue Q([](sycl::exception_list ExceptionList) {
+    if (ExceptionList.size() != 1) {
+      std::cerr << "Should be one exception in exception list" << std::endl;
+      std::abort();
+    }
+    std::rethrow_exception(*ExceptionList.begin());
+  });
+  tests(Q);
+  std::cout << "Test PASSED" << std::endl;
+  return 0;
+}
diff --git a/sycl/unittests/helpers/PiMockPlugin.hpp b/sycl/unittests/helpers/PiMockPlugin.hpp
index c1f0a58f82274..31eac5598f588 100644
--- a/sycl/unittests/helpers/PiMockPlugin.hpp
+++ b/sycl/unittests/helpers/PiMockPlugin.hpp
@@ -613,7 +613,7 @@ mock_piMemBufferPartition(pi_mem buffer, pi_mem_flags flags,
   return PI_SUCCESS;
 }
 
-inline pi_result mock_piextMemGetNativeHandle(pi_mem mem,
+inline pi_result mock_piextMemGetNativeHandle(pi_mem mem, pi_device dev,
                                               pi_native_handle *nativeHandle) {
   *nativeHandle = reinterpret_cast<pi_native_handle>(mem);
   return PI_SUCCESS;