diff --git a/llvm/tools/sycl-post-link/ModuleSplitter.h b/llvm/include/llvm/SYCLLowerIR/ModuleSplitter.h
similarity index 97%
rename from llvm/tools/sycl-post-link/ModuleSplitter.h
rename to llvm/include/llvm/SYCLLowerIR/ModuleSplitter.h
index aaf6108325765..eb09e7528ff49 100644
--- a/llvm/tools/sycl-post-link/ModuleSplitter.h
+++ b/llvm/include/llvm/SYCLLowerIR/ModuleSplitter.h
@@ -10,7 +10,8 @@
 // of the split is new modules containing corresponding callgraph.
 //===----------------------------------------------------------------------===//
 
-#pragma once
+#ifndef LLVM_SYCLLOWERIR_MODULE_SPLITTER_H
+#define LLVM_SYCLLOWERIR_MODULE_SPLITTER_H
 
 #include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/StringRef.h"
@@ -18,6 +19,7 @@
 #include "llvm/Support/Error.h"
 
 #include <memory>
+#include <string>
 #include <vector>
 
 namespace llvm {
@@ -229,8 +231,8 @@ class ModuleSplitterBase {
   // For device global variables with the 'device_image_scope' property,
   // the function checks that there are no usages of a single device global
   // variable from kernels grouped to different modules. Otherwise, an error is
-  // issued and the tool is aborted.
-  void verifyNoCrossModuleDeviceGlobalUsage();
+  // returned.
+  Error verifyNoCrossModuleDeviceGlobalUsage();
 
   virtual ~ModuleSplitterBase() = default;
 
@@ -262,3 +264,5 @@ void dumpEntryPoints(const Module &M, bool OnlyKernelsAreEntryPoints = false,
 } // namespace module_split
 
 } // namespace llvm
+
+#endif // LLVM_SYCLLOWERIR_MODULE_SPLITTER_H
diff --git a/llvm/lib/SYCLLowerIR/CMakeLists.txt b/llvm/lib/SYCLLowerIR/CMakeLists.txt
index cfafbe1a66c87..b2afa8150aa4f 100644
--- a/llvm/lib/SYCLLowerIR/CMakeLists.txt
+++ b/llvm/lib/SYCLLowerIR/CMakeLists.txt
@@ -62,6 +62,7 @@ add_llvm_component_library(LLVMSYCLLowerIR
   LowerInvokeSimd.cpp
   LowerWGLocalMemory.cpp
   LowerWGScope.cpp
+  ModuleSplitter.cpp
   MutatePrintfAddrspace.cpp
   SYCLAddOptLevelAttribute.cpp
   SYCLPropagateAspectsUsage.cpp
diff --git a/llvm/lib/SYCLLowerIR/ESIMD/LowerESIMD.cpp b/llvm/lib/SYCLLowerIR/ESIMD/LowerESIMD.cpp
index 6ec1102f402ba..15c5e7c9a625e 100644
--- a/llvm/lib/SYCLLowerIR/ESIMD/LowerESIMD.cpp
+++ b/llvm/lib/SYCLLowerIR/ESIMD/LowerESIMD.cpp
@@ -1004,6 +1004,37 @@ static void translateGatherLoad(CallInst &CI, bool IsSLM) {
   CI.replaceAllUsesWith(LI);
 }
 
+static void translateScatterStore(CallInst &CI, bool IsSLM) {
+  IRBuilder<> Builder(&CI);
+  constexpr int AlignmentTemplateArgIdx = 2;
+  APInt Val = parseTemplateArg(CI, AlignmentTemplateArgIdx,
+                               ESIMDIntrinDesc::GenXArgConversion::TO_I64);
+  Align AlignValue(Val.getZExtValue());
+
+  auto ValsOp = CI.getArgOperand(0);
+  auto OffsetsOp = CI.getArgOperand(1);
+  auto MaskOp = CI.getArgOperand(2);
+  auto DataType = ValsOp->getType();
+
+  // Convert the mask from <N x i16> to <N x i1>.
+  Value *Zero = ConstantInt::get(MaskOp->getType(), 0);
+  MaskOp = Builder.CreateICmp(ICmpInst::ICMP_NE, MaskOp, Zero);
+
+  // The address space may be 3-SLM, 1-global or private.
+  // At the moment of calling 'scatter()' operation the pointer passed to it
+  // is already 4-generic. Thus, simply use 4-generic for global and private
+  // and let GPU BE deduce the actual address space from the use-def graph.
+  unsigned AS = IsSLM ? 3 : 4;
+  auto ElemType = DataType->getScalarType();
+  auto NumElems = (cast<VectorType>(DataType))->getElementCount();
+  auto VPtrType = VectorType::get(PointerType::get(ElemType, AS), NumElems);
+  auto VPtrOp = Builder.CreateIntToPtr(OffsetsOp, VPtrType);
+
+  auto SI = Builder.CreateMaskedScatter(ValsOp, VPtrOp, AlignValue, MaskOp);
+  SI->setDebugLoc(CI.getDebugLoc());
+  CI.replaceAllUsesWith(SI);
+}
+
 // TODO Specify document behavior for slm_init and nbarrier_init when:
 // 1) they are called not from kernels
 // 2) there are multiple such calls reachable from a kernel
@@ -1987,6 +2018,13 @@ size_t SYCLLowerESIMDPass::runOnFunction(Function &F,
         continue;
       }
 
+      if (Name.starts_with("__esimd_scatter_st") ||
+          Name.starts_with("__esimd_slm_scatter_st")) {
+        translateScatterStore(*CI, Name.starts_with("__esimd_slm_scatter_st"));
+        ToErase.push_back(CI);
+        continue;
+      }
+
       if (Name.starts_with("__esimd_nbarrier_init")) {
         translateNbarrierInit(*CI);
         ToErase.push_back(CI);
diff --git a/llvm/tools/sycl-post-link/ModuleSplitter.cpp b/llvm/lib/SYCLLowerIR/ModuleSplitter.cpp
similarity index 97%
rename from llvm/tools/sycl-post-link/ModuleSplitter.cpp
rename to llvm/lib/SYCLLowerIR/ModuleSplitter.cpp
index 87cbf42da2df2..92ff992141945 100644
--- a/llvm/tools/sycl-post-link/ModuleSplitter.cpp
+++ b/llvm/lib/SYCLLowerIR/ModuleSplitter.cpp
@@ -8,9 +8,7 @@
 // See comments in the header.
 //===----------------------------------------------------------------------===//
 
-#include "ModuleSplitter.h"
-#include "Support.h"
-
+#include "llvm/SYCLLowerIR/ModuleSplitter.h"
 #include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/StringExtras.h"
@@ -23,6 +21,7 @@
 #include "llvm/SYCLLowerIR/DeviceGlobals.h"
 #include "llvm/SYCLLowerIR/LowerInvokeSimd.h"
 #include "llvm/SYCLLowerIR/SYCLUtils.h"
+#include "llvm/Support/Error.h"
 #include "llvm/Transforms/IPO.h"
 #include "llvm/Transforms/IPO/GlobalDCE.h"
 #include "llvm/Transforms/IPO/StripDeadPrototypes.h"
@@ -426,14 +425,15 @@ class ModuleSplitter : public ModuleSplitterBase {
   DependencyGraph CG;
 };
 } // namespace
+
 namespace llvm {
 namespace module_split {
 
-void ModuleSplitterBase::verifyNoCrossModuleDeviceGlobalUsage() {
+Error ModuleSplitterBase::verifyNoCrossModuleDeviceGlobalUsage() {
   const Module &M = getInputModule();
   // Early exit if there is only one group
   if (Groups.size() < 2)
-    return;
+    return Error::success();
 
   // Reverse the EntryPointGroupMap to get a map of entry point -> module's name
   unsigned EntryPointNumber = 0;
@@ -451,19 +451,25 @@ void ModuleSplitterBase::verifyNoCrossModuleDeviceGlobalUsage() {
 
     std::optional<StringRef> VarEntryPointModule{};
     auto CheckEntryPointModule = [&VarEntryPointModule, &EntryPointModules,
-                                  &GV](const auto *F) {
+                                  &GV](const auto *F) -> Error {
       auto EntryPointModulesIt = EntryPointModules.find(F);
-      assert(EntryPointModulesIt != EntryPointModules.end() &&
-             "There is no group for an entry point");
+      if (EntryPointModulesIt == EntryPointModules.end())
+        return createStringError(inconvertibleErrorCode(),
+                                 "There is no group for an entry point");
+
       if (!VarEntryPointModule.has_value()) {
         VarEntryPointModule = EntryPointModulesIt->second;
-        return;
-      }
-      if (EntryPointModulesIt->second != *VarEntryPointModule) {
-        error("device_global variable '" + Twine(GV.getName()) +
-              "' with property \"device_image_scope\" is used in more "
-              "than one device image.");
+        return Error::success();
       }
+
+      if (EntryPointModulesIt->second != *VarEntryPointModule)
+        return createStringError(
+            inconvertibleErrorCode(),
+            "device_global variable '" + Twine(GV.getName()) +
+                "' with property \"device_image_scope\" is used in more "
+                "than one device image.");
+
+      return Error::success();
     };
 
     SmallSetVector<const User *, 32> Workqueue;
@@ -478,13 +484,18 @@ void ModuleSplitterBase::verifyNoCrossModuleDeviceGlobalUsage() {
         continue;
       }
       if (auto *F = dyn_cast<const Function>(U)) {
-        if (EntryPointModules.count(F))
-          CheckEntryPointModule(F);
+        if (EntryPointModules.count(F)) {
+          auto E = CheckEntryPointModule(F);
+          if (E)
+            return E;
+        }
       }
       for (auto *UU : U->users())
         Workqueue.insert(UU);
     }
   }
+
+  return Error::success();
 }
 
 #ifndef NDEBUG
diff --git a/llvm/tools/sycl-post-link/CMakeLists.txt b/llvm/tools/sycl-post-link/CMakeLists.txt
index 40bd3f899e487..3905e836aaae8 100644
--- a/llvm/tools/sycl-post-link/CMakeLists.txt
+++ b/llvm/tools/sycl-post-link/CMakeLists.txt
@@ -24,7 +24,6 @@ include_directories(
 
 add_llvm_tool(sycl-post-link
   sycl-post-link.cpp
-  ModuleSplitter.cpp
   SpecConstants.cpp
   SYCLDeviceLibReqMask.cpp
   SYCLKernelParamOptInfo.cpp
diff --git a/llvm/tools/sycl-post-link/SYCLDeviceRequirements.cpp b/llvm/tools/sycl-post-link/SYCLDeviceRequirements.cpp
index 70d3a90f51785..5255ce7bf2a66 100644
--- a/llvm/tools/sycl-post-link/SYCLDeviceRequirements.cpp
+++ b/llvm/tools/sycl-post-link/SYCLDeviceRequirements.cpp
@@ -7,11 +7,11 @@
 //===----------------------------------------------------------------------===//
 
 #include "SYCLDeviceRequirements.h"
-#include "ModuleSplitter.h"
 
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/IR/Module.h"
+#include "llvm/SYCLLowerIR/ModuleSplitter.h"
 #include "llvm/Support/PropertySetIO.h"
 
 #include <set>
diff --git a/llvm/tools/sycl-post-link/sycl-post-link.cpp b/llvm/tools/sycl-post-link/sycl-post-link.cpp
index 85728be8aabf0..068962b63c1fc 100644
--- a/llvm/tools/sycl-post-link/sycl-post-link.cpp
+++ b/llvm/tools/sycl-post-link/sycl-post-link.cpp
@@ -13,7 +13,6 @@
 // - specialization constant intrinsic transformation
 //===----------------------------------------------------------------------===//
 
-#include "ModuleSplitter.h"
 #include "SYCLDeviceLibReqMask.h"
 #include "SYCLDeviceRequirements.h"
 #include "SYCLKernelParamOptInfo.h"
@@ -40,6 +39,7 @@
 #include "llvm/SYCLLowerIR/ESIMD/LowerESIMD.h"
 #include "llvm/SYCLLowerIR/HostPipes.h"
 #include "llvm/SYCLLowerIR/LowerInvokeSimd.h"
+#include "llvm/SYCLLowerIR/ModuleSplitter.h"
 #include "llvm/SYCLLowerIR/SYCLUtils.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/FileSystem.h"
@@ -1009,8 +1009,11 @@ processInputModule(std::unique_ptr<Module> M) {
   Modified |= SplitOccurred;
 
   // FIXME: this check is not performed for ESIMD splits
-  if (DeviceGlobals)
-    Splitter->verifyNoCrossModuleDeviceGlobalUsage();
+  if (DeviceGlobals) {
+    auto E = Splitter->verifyNoCrossModuleDeviceGlobalUsage();
+    if (E)
+      error(toString(std::move(E)));
+  }
 
   // It is important that we *DO NOT* preserve all the splits in memory at the
   // same time, because it leads to a huge RAM consumption by the tool on bigger
diff --git a/sycl/doc/design/CompilerAndRuntimeDesign.md b/sycl/doc/design/CompilerAndRuntimeDesign.md
index c9d9e2b7eba90..ccc0f642f992f 100644
--- a/sycl/doc/design/CompilerAndRuntimeDesign.md
+++ b/sycl/doc/design/CompilerAndRuntimeDesign.md
@@ -567,8 +567,8 @@ Unlike other AOT targets, the bitcode module linked from intermediate compiled
 objects never goes through SPIR-V. Instead it is passed directly in bitcode form
 down to the NVPTX Back End. All produced bitcode depends on two libraries,
 `libdevice.bc` (provided by the CUDA SDK) and `libspirv-nvptx64--nvidiacl.bc` variants
-(built by the libclc project). `libspirv-nvptx64--nvidiacl.bc` is not used directly. 
-Instead it is used to generate remangled variants 
+(built by the libclc project). `libspirv-nvptx64--nvidiacl.bc` is not used directly.
+Instead it is used to generate remangled variants
 `remangled-l64-signed_char.libspirv-nvptx64--nvidiacl.bc` and
 `remangled-l32-signed_char.libspirv-nvptx64--nvidiacl.bc` to handle primitive type
 differences between Linux and Windows.
@@ -600,14 +600,14 @@ path in SYCL kernels.
 
 ##### NVPTX Builtins
 
-Builtins are implemented in OpenCL C within libclc. OpenCL C treats `long` 
+Builtins are implemented in OpenCL C within libclc. OpenCL C treats `long`
 types as 64 bit and has no `long long` types while Windows DPC++ treats `long`
-types like 32-bit integers and `long long` types like 64-bit integers. 
-Differences between the primitive types can cause applications to use 
-incompatible libclc built-ins. A remangler creates multiple libspriv files 
-with different remangled function names to support both Windows and Linux. 
-When building a SYCL application targeting the CUDA backend the driver 
-will link the device code with 
+types like 32-bit integers and `long long` types like 64-bit integers.
+Differences between the primitive types can cause applications to use
+incompatible libclc built-ins. A remangler creates multiple libspirv files
+with different remangled function names to support both Windows and Linux.
+When building a SYCL application targeting the CUDA backend the driver
+will link the device code with
 `remangled-l32-signed_char.libspirv-nvptx64--nvidiacl.bc` if the host target is
 Windows or it will link the device code with
 `remangled-l64-signed_char.libspirv-nvptx64--nvidiacl.bc` if the host target is
@@ -916,7 +916,7 @@ template <typename T, address_space AS> class multi_ptr {
   // DecoratedType<T, global_space>::type == "__attribute__((opencl_global)) T"
   // See sycl/include/sycl/access/access.hpp for more details
   using pointer_t = typename DecoratedType<T, AS>::type *;
- 
+
   pointer_t m_Pointer;
   public:
   pointer_t get() { return m_Pointer; }
diff --git a/sycl/doc/extensions/experimental/sycl_ext_oneapi_complex.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_oneapi_complex.asciidoc
index 41700a898fecb..e4eb444f1ed60 100644
--- a/sycl/doc/extensions/experimental/sycl_ext_oneapi_complex.asciidoc
+++ b/sycl/doc/extensions/experimental/sycl_ext_oneapi_complex.asciidoc
@@ -10,6 +10,7 @@
 :encoding: utf-8
 :lang: en
 :dpcpp: pass:[DPC++]
+:endnote: &#8212;{nbsp}end{nbsp}note
 
 // Set the default source code type in this document to C++,
 // for syntax highlighting purposes.  This is needed because
@@ -101,8 +102,8 @@ available only in host code as noted below.
 The complex type is trivially copyable and type trait `is_device_copyable`
 should resolve to `std::true_type`.
 
-The `T` template parameter must be one of the types float, double, or
-sycl::half.
+_Constraints_: The `T` template parameter must be one of the types `float`,
+`double`, or `sycl::half`.
 
 Note: When performing operations between complex numbers and decimals,
 the decimal is treated as a complex number with a real component equal to
@@ -335,6 +336,13 @@ Additionally, this extension introduces support for the `real` and `imag` free
 functions, which returns the real and imaginary component of a number,
 respectively.
 
+[_Note:_ The overloads of the functions `real(T)` and `imag(T)` match the
+behavior in ISO C++ where `T` would be treated as a complex number with a zero
+imaginary component. This is subject to the constraint that `T` must be one of
+the types `float`, `double`, `sycl::half`, or evaluate to `true` for
+`std::is_integral`.
+_{endnote}_]
+
 These functions are available in both host and device code, and each math
 function should follow the C++ standard for handling `NaN` and `Inf` values.
 
diff --git a/sycl/doc/extensions/proposed/sycl_ext_oneapi_forward_progress.asciidoc b/sycl/doc/extensions/proposed/sycl_ext_oneapi_forward_progress.asciidoc
index 4c1df1640d021..227b23f1ffdac 100644
--- a/sycl/doc/extensions/proposed/sycl_ext_oneapi_forward_progress.asciidoc
+++ b/sycl/doc/extensions/proposed/sycl_ext_oneapi_forward_progress.asciidoc
@@ -20,7 +20,7 @@
 == Notice
 
 [%hardbreaks]
-Copyright (C) 2022-2023 Intel Corporation.  All rights reserved.
+Copyright (C) 2022-2024 Intel Corporation.  All rights reserved.
 
 Khronos(R) is a registered trademark and SYCL(TM) and SPIR(TM) are trademarks
 of The Khronos Group Inc.  OpenCL(TM) is a trademark of Apple Inc. used by
@@ -383,9 +383,11 @@ and work-groups to also provide concurrent forward progress guarantees). In
 such a case, an implementation must satisfy the strongest request(s).
 
 Devices may not be able to provide the requested forward progress guarantees
-for all launch configurations.  The <<launch, launch queries>> defined in a
-later section allow developers to identify valid launch configurations for
-specific combinations of properties.
+for all launch configurations. Developers should use the launch queries defined
+by the
+link:../proposed/sycl_ext_oneapi_launch_queries.asciidoc[sycl_ext_oneapi_launch_queries]
+extension to identify valid launch configurations for specific combinations of
+properties.
 
 [NOTE]
 ====
diff --git a/sycl/doc/extensions/proposed/sycl_ext_oneapi_root_group.asciidoc b/sycl/doc/extensions/proposed/sycl_ext_oneapi_root_group.asciidoc
index 0ef2126d86fa1..44eb1b376d113 100644
--- a/sycl/doc/extensions/proposed/sycl_ext_oneapi_root_group.asciidoc
+++ b/sycl/doc/extensions/proposed/sycl_ext_oneapi_root_group.asciidoc
@@ -182,7 +182,8 @@ inline constexpr use_root_sync_key::value_t use_root_sync;
 === The `root_group` class
 
 The `root_group` class implements all member functions common to the
-`sycl::group` and `sycl::sub_group` classes.
+`sycl::group` and `sycl::sub_group` classes and also contains own
+additional functions.
 
 [source,c++]
 ----
@@ -191,6 +192,13 @@ namespace ext {
 namespace oneapi {
 namespace experimental {
 
+enum class execution_scope {
+  work_item,
+  sub_group,
+  work_group,
+  root_group,
+};
+
 template <int Dimensions>
 class root_group {
 public:
@@ -221,6 +229,31 @@ public:
 
   bool leader() const;
 
+  template <execution_scope Scope>
+  std::enable_if_t<(Scope == execution_scope::work_item ||
+                    Scope == execution_scope::work_group),
+                   id<Dimensions>>
+  get_id() const;
+
+  template <execution_scope Scope>
+  std::enable_if_t<Scope == execution_scope::sub_group, id<1>> get_id() const;
+
+  template <execution_scope Scope>
+  size_t get_linear_id() const;
+
+  template <execution_scope Scope>
+  std::enable_if_t<(Scope == execution_scope::work_item ||
+                    Scope == execution_scope::work_group),
+                   range<Dimensions>>
+  get_range() const;
+
+  template <execution_scope Scope>
+  std::enable_if_t<Scope == execution_scope::sub_group, range<1>>
+  get_range() const;
+
+  template <execution_scope Scope>
+  size_t get_linear_range() const;
+
 };
 
 } // namespace experimental
@@ -307,6 +340,67 @@ work-item is the leader of the root-group, and `false` for all other work-items
 in the root-group. The leader of the root-group is guaranteed to be the
 work-item for which `get_local_id()` returns 0.
 
+[source,c++]
+----
+template <execution_scope Scope>
+std::enable_if_t<(Scope == execution_scope::work_item ||
+                  Scope == execution_scope::work_group),
+                 id<Dimensions>>
+get_id() const;
+----
+_Returns_: An `id` representing the index of the current work-group or work-item at `Scope`
+hierarchy level within the `root_group` object.
+
+[source,c++]
+----
+template <execution_scope Scope>
+std::enable_if_t<Scope == execution_scope::sub_group, id<1>> get_id() const;
+----
+_Returns_: An `id` representing the index of the current sub-group within the
+`root_group` object.
+
+[source,c++]
+----
+template <execution_scope Scope>
+size_t get_linear_id() const;
+----
+_Constraints_: `Scope` must be narrower than
+`execution_scope::root_group`.
+
+_Returns_: A linearized number of the current work-group or work-item at `Scope` hierarchy
+level within the `root_group` object.
+
+[source,c++]
+----
+template <execution_scope Scope>
+std::enable_if_t<(Scope == execution_scope::work_item ||
+                  Scope == execution_scope::work_group),
+                 range<Dimensions>>
+get_range() const;
+----
+_Returns_: A `range` representing the number of work-groups or work-items of `Scope`
+hierarchy level within the `root_group` object.
+
+[source,c++]
+----
+template <execution_scope Scope>
+std::enable_if_t<Scope == execution_scope::sub_group, range<1>>
+get_range() const;
+----
+_Returns_: A `range` representing the number of sub-groups within the `root_group`
+object.
+
+[source,c++]
+----
+template <execution_scope Scope>
+size_t get_linear_range() const;
+----
+_Constraints_: `Scope` must be narrower than
+`execution_scope::root_group`.
+
+_Returns_: The number of work-groups or work-items of `Scope` hierarchy level within the
+`root_group` object.
+
 
 === Using a `root_group`
 
diff --git a/sycl/include/CL/__spirv/spirv_ops.hpp b/sycl/include/CL/__spirv/spirv_ops.hpp
index ea1a6580d30e6..9af5b7e75ae38 100644
--- a/sycl/include/CL/__spirv/spirv_ops.hpp
+++ b/sycl/include/CL/__spirv/spirv_ops.hpp
@@ -174,10 +174,11 @@ extern __DPCPP_SYCL_EXTERNAL __spv::__spirv_JointMatrixINTEL<T, R, C, L, S, U> *
 __spirv_VectorInsertDynamic(__spv::__spirv_JointMatrixINTEL<T, R, C, L, S, U> *,
                             Ts val, size_t i);
 
-template <typename T, std::size_t NumRows, std::size_t NumCols>
-extern __DPCPP_SYCL_EXTERNAL void __spirv_JointMatrixPrefetchINTEL(
-    T *Ptr, std::size_t coordX, std::size_t coordY, unsigned int CacheLevel,
-    __spv::MatrixLayout Layout, std::size_t Stride);
+template <typename T>
+extern __DPCPP_SYCL_EXTERNAL void __spirv_CooperativeMatrixPrefetchINTEL(
+    T *Ptr, std::size_t coordX, std::size_t coordY, std::size_t NumRows,
+    std::size_t NumCols, unsigned int CacheLevel, __spv::MatrixLayout Layout,
+    std::size_t Stride);
 
 #ifndef __SPIRV_BUILTIN_DECLARATIONS__
 #error                                                                         \
diff --git a/sycl/include/sycl/builtins_preview.hpp b/sycl/include/sycl/builtins_preview.hpp
index bc497c540693b..91fd50d2ec4f2 100644
--- a/sycl/include/sycl/builtins_preview.hpp
+++ b/sycl/include/sycl/builtins_preview.hpp
@@ -137,7 +137,14 @@ auto builtin_marray_impl(FuncTy F, const Ts &...x) {
   marray<ret_elem_type, T::size()> Res;
   constexpr auto N = T::size();
   for (size_t I = 0; I < N / 2; ++I) {
-    auto PartialRes = F(to_vec2(x, I * 2)...);
+    auto PartialRes = [&]() {
+      using elem_ty = get_elem_type_t<T>;
+      if constexpr (std::is_integral_v<elem_ty>)
+        return F(to_vec2(x, I * 2)
+                     .template as<vec<get_fixed_sized_int_t<elem_ty>, 2>>()...);
+      else
+        return F(to_vec2(x, I * 2)...);
+    }();
     std::memcpy(&Res[I * 2], &PartialRes, sizeof(decltype(PartialRes)));
   }
   if (N % 2)
@@ -175,12 +182,6 @@ auto builtin_delegate_to_scalar(FuncTy F, const Ts &...x) {
   }
 }
 
-template <typename T>
-struct any_elem_type
-    : std::bool_constant<check_type_in_v<
-          get_elem_type_t<T>, float, double, half, char, signed char, short,
-          int, long, long long, unsigned char, unsigned short, unsigned int,
-          unsigned long, unsigned long long>> {};
 template <typename T>
 struct fp_elem_type
     : std::bool_constant<
@@ -188,16 +189,6 @@ struct fp_elem_type
 template <typename T>
 struct float_elem_type
     : std::bool_constant<check_type_in_v<get_elem_type_t<T>, float>> {};
-template <typename T>
-struct integer_elem_type
-    : std::bool_constant<
-          check_type_in_v<get_elem_type_t<T>, char, signed char, short, int,
-                          long, long long, unsigned char, unsigned short,
-                          unsigned int, unsigned long, unsigned long long>> {};
-template <typename T>
-struct suint32_elem_type
-    : std::bool_constant<
-          check_type_in_v<get_elem_type_t<T>, int32_t, uint32_t>> {};
 
 template <typename... Ts>
 struct same_basic_shape : std::bool_constant<builtin_same_shape_v<Ts...>> {};
@@ -244,13 +235,6 @@ struct builtin_enable
                               SHAPE_CHECKER, EXTRA_CONDITIONS, Ts...>::type;   \
   }
 } // namespace detail
-
-BUILTIN_CREATE_ENABLER(builtin_enable_generic, default_ret_type, any_elem_type,
-                       any_shape, same_elem_type)
-BUILTIN_CREATE_ENABLER(builtin_enable_generic_scalar, default_ret_type,
-                       any_elem_type, scalar_only, same_elem_type)
-BUILTIN_CREATE_ENABLER(builtin_enable_generic_non_scalar, default_ret_type,
-                       any_elem_type, non_scalar_only, same_elem_type)
 } // namespace _V1
 } // namespace sycl
 
diff --git a/sycl/include/sycl/builtins_utils_scalar.hpp b/sycl/include/sycl/builtins_utils_scalar.hpp
index 62d86df045cdf..31f267892243b 100644
--- a/sycl/include/sycl/builtins_utils_scalar.hpp
+++ b/sycl/include/sycl/builtins_utils_scalar.hpp
@@ -128,6 +128,17 @@ template <size_t Size> struct get_unsigned_int_by_size {
 template <typename T> struct same_size_unsigned_int {
   using type = typename get_unsigned_int_by_size<sizeof(T)>::type;
 };
+template <typename T>
+using same_size_unsigned_int_t = typename same_size_unsigned_int<T>::type;
+
+template <typename T> struct get_fixed_sized_int {
+  static_assert(std::is_integral_v<T>);
+  using type =
+      std::conditional_t<std::is_signed_v<T>, same_size_signed_int_t<T>,
+                         same_size_unsigned_int_t<T>>;
+};
+template <typename T>
+using get_fixed_sized_int_t = typename get_fixed_sized_int<T>::type;
 
 // Utility trait for getting an upsampled integer type.
 // NOTE: For upsampling we look for an integer of double the size of the
diff --git a/sycl/include/sycl/detail/builtins/common_functions.inc b/sycl/include/sycl/detail/builtins/common_functions.inc
index fb10964934cf0..022cab78e51db 100644
--- a/sycl/include/sycl/detail/builtins/common_functions.inc
+++ b/sycl/include/sycl/detail/builtins/common_functions.inc
@@ -72,32 +72,15 @@ min(T x, detail::get_elem_type_t<T> y) {
              detail::simplify_if_swizzle_t<T>{y});
 }
 
-#undef BUILTIN_COMMON
-
-#ifdef __SYCL_DEVICE_ONLY__
-DEVICE_IMPL_TEMPLATE(THREE_ARGS, clamp, builtin_enable_generic_t,
-                     [](auto... xs) {
-                       using ElemTy = detail::get_elem_type_t<T0>;
-                       if constexpr (std::is_integral_v<ElemTy>) {
-                         if constexpr (std::is_signed_v<ElemTy>) {
-                           return __spirv_ocl_s_clamp(xs...);
-                         } else {
-                           return __spirv_ocl_u_clamp(xs...);
-                         }
-                       } else {
-                         return __spirv_ocl_fclamp(xs...);
-                       }
-                     })
-#else
-HOST_IMPL_TEMPLATE(THREE_ARGS, clamp, builtin_enable_generic_t, common,
-                   default_ret_type)
-#endif
+BUILTIN_COMMON(THREE_ARGS, clamp, __spirv_ocl_fclamp)
 template <typename T>
-detail::builtin_enable_generic_non_scalar_t<T>
+detail::builtin_enable_common_non_scalar_t<T>
 clamp(T x, detail::get_elem_type_t<T> y, detail::get_elem_type_t<T> z) {
   return clamp(detail::simplify_if_swizzle_t<T>{x},
                detail::simplify_if_swizzle_t<T>{y},
                detail::simplify_if_swizzle_t<T>{z});
 }
+
+#undef BUILTIN_COMMON
 } // namespace _V1
 } // namespace sycl
diff --git a/sycl/include/sycl/detail/builtins/helper_macros.hpp b/sycl/include/sycl/detail/builtins/helper_macros.hpp
index 49d4af8981d70..38014c4b62c8f 100644
--- a/sycl/include/sycl/detail/builtins/helper_macros.hpp
+++ b/sycl/include/sycl/detail/builtins/helper_macros.hpp
@@ -48,6 +48,11 @@
   FOR_EACH4_A6(BASE_CASE, FIXED1, FIXED2, FIXED3, FIXED4, ARG1, ARG2, ARG3,    \
                ARG4, ARG5, ARG6)                                               \
   BASE_CASE(FIXED1, FIXED2, FIXED3, FIXED4, ARG7)
+#define FOR_EACH4_A8(BASE_CASE, FIXED1, FIXED2, FIXED3, FIXED4, ARG1, ARG2,    \
+                     ARG3, ARG4, ARG5, ARG6, ARG7, ARG8)                       \
+  FOR_EACH4_A7(BASE_CASE, FIXED1, FIXED2, FIXED3, FIXED4, ARG1, ARG2, ARG3,    \
+               ARG4, ARG5, ARG6, ARG7)                                         \
+  BASE_CASE(FIXED1, FIXED2, FIXED3, FIXED4, ARG8)
 #define FOR_EACH4_A11(BASE_CASE, FIXED1, FIXED2, FIXED3, FIXED4, ARG1, ARG2,   \
                       ARG3, ARG4, ARG5, ARG6, ARG7, ARG8, ARG9, ARG10, ARG11)  \
   FOR_EACH4_A7(BASE_CASE, FIXED1, FIXED2, FIXED3, FIXED4, ARG1, ARG2, ARG3,    \
@@ -169,6 +174,9 @@
   unsigned char, unsigned short, unsigned int, unsigned long, unsigned long long
 // 11 types
 #define INTEGER_TYPES SIGNED_TYPES, UNSIGNED_TYPES
+// 8 types
+#define FIXED_WIDTH_INTEGER_TYPES                                              \
+  int8_t, int16_t, int32_t, int64_t, uint8_t, uint16_t, uint32_t, uint64_t
 
 #define DEVICE_IMPL_TEMPLATE_CUSTOM_DELEGATE(                                  \
     NUM_ARGS, NAME, ENABLER, DELEGATOR, NS, /*SCALAR_VEC_IMPL*/...)            \
diff --git a/sycl/include/sycl/detail/builtins/integer_functions.inc b/sycl/include/sycl/detail/builtins/integer_functions.inc
index dfeb815e52494..44699765ff7fd 100644
--- a/sycl/include/sycl/detail/builtins/integer_functions.inc
+++ b/sycl/include/sycl/detail/builtins/integer_functions.inc
@@ -12,6 +12,19 @@
 
 namespace sycl {
 inline namespace _V1 {
+namespace detail {
+template <typename T>
+struct integer_elem_type
+    : std::bool_constant<
+          (is_vec_or_swizzle_v<T> &&
+           check_type_in_v<get_elem_type_t<T>, FIXED_WIDTH_INTEGER_TYPES>) ||
+          (!is_vec_or_swizzle_v<T> &&
+           check_type_in_v<get_elem_type_t<T>, INTEGER_TYPES>)> {};
+template <typename T>
+struct suint32_elem_type
+    : std::bool_constant<
+          check_type_in_v<get_elem_type_t<T>, int32_t, uint32_t>> {};
+} // namespace detail
 BUILTIN_CREATE_ENABLER(builtin_enable_integer, default_ret_type,
                        integer_elem_type, any_shape, same_elem_type)
 BUILTIN_CREATE_ENABLER(builtin_enable_integer_non_scalar, default_ret_type,
@@ -122,6 +135,15 @@ min(T x, detail::get_elem_type_t<T> y) {
              detail::simplify_if_swizzle_t<T>{y});
 }
 
+BUILTIN_GENINT_SU(THREE_ARGS, clamp)
+template <typename T>
+detail::builtin_enable_integer_non_scalar_t<T>
+clamp(T x, detail::get_elem_type_t<T> y, detail::get_elem_type_t<T> z) {
+  return clamp(detail::simplify_if_swizzle_t<T>{x},
+               detail::simplify_if_swizzle_t<T>{y},
+               detail::simplify_if_swizzle_t<T>{z});
+}
+
 BUILTIN_GENINT(ONE_ARG, clz)
 BUILTIN_GENINT(ONE_ARG, ctz)
 BUILTIN_GENINT(ONE_ARG, popcount)
diff --git a/sycl/include/sycl/detail/builtins/relational_functions.inc b/sycl/include/sycl/detail/builtins/relational_functions.inc
index d23f144676cdb..d63a7716aa9b7 100644
--- a/sycl/include/sycl/detail/builtins/relational_functions.inc
+++ b/sycl/include/sycl/detail/builtins/relational_functions.inc
@@ -13,6 +13,15 @@
 namespace sycl {
 inline namespace _V1 {
 namespace detail {
+template <typename T>
+struct bitselect_elem_type
+    : std::bool_constant<
+          check_type_in_v<get_elem_type_t<T>, FP_TYPES> ||
+          (is_vec_or_swizzle_v<T> &&
+           check_type_in_v<get_elem_type_t<T>, FIXED_WIDTH_INTEGER_TYPES>) ||
+          (!is_vec_or_swizzle_v<T> &&
+           check_type_in_v<get_elem_type_t<T>, INTEGER_TYPES>)> {};
+
 template <typename T>
 struct rel_ret_traits
     : std::conditional<is_scalar_arithmetic_v<T>, bool,
@@ -21,6 +30,9 @@ struct rel_ret_traits
                            same_size_signed_int_t<simplify_if_swizzle_t<T>>>> {
 };
 } // namespace detail
+
+BUILTIN_CREATE_ENABLER(builtin_enable_bitselect, default_ret_type,
+                       bitselect_elem_type, any_shape, same_elem_type)
 BUILTIN_CREATE_ENABLER(builtin_enable_rel, rel_ret_traits, fp_elem_type,
                        non_scalar_only, same_elem_type)
 
@@ -119,13 +131,14 @@ BUILTIN_REL(ONE_ARG, signbit, __spirv_SignBitSet)
 
 #ifdef __SYCL_DEVICE_ONLY__
 DEVICE_IMPL_TEMPLATE(
-    THREE_ARGS, bitselect, builtin_enable_generic_t, [](auto... xs) {
-      using ret_ty = detail::builtin_enable_generic_t<THREE_ARGS_TEMPLATE_TYPE>;
+    THREE_ARGS, bitselect, builtin_enable_bitselect_t, [](auto... xs) {
+      using ret_ty =
+          detail::builtin_enable_bitselect_t<THREE_ARGS_TEMPLATE_TYPE>;
       using detail::builtins::convert_result;
       return convert_result<ret_ty>(__spirv_ocl_bitselect(xs...));
     })
 #else
-HOST_IMPL_TEMPLATE(THREE_ARGS, bitselect, builtin_enable_generic_t, rel,
+HOST_IMPL_TEMPLATE(THREE_ARGS, bitselect, builtin_enable_bitselect_t, rel,
                    default_ret_type)
 #endif
 
diff --git a/sycl/include/sycl/ext/intel/esimd/detail/memory_intrin.hpp b/sycl/include/sycl/ext/intel/esimd/detail/memory_intrin.hpp
index d723d03353cb2..c9d0d6cc4d410 100644
--- a/sycl/include/sycl/ext/intel/esimd/detail/memory_intrin.hpp
+++ b/sycl/include/sycl/ext/intel/esimd/detail/memory_intrin.hpp
@@ -213,6 +213,20 @@ __ESIMD_INTRIN __ESIMD_DNS::vector_type_t<T, N> __esimd_slm_gather_ld(
     __ESIMD_DNS::simd_mask_storage_t<N> pred,
     __ESIMD_DNS::vector_type_t<T, N> pass_thru) __ESIMD_INTRIN_END;
 
+// Scatter data to given global or private addresses.
+template <typename T, int N, size_t Align>
+__ESIMD_INTRIN void
+__esimd_scatter_st(__ESIMD_DNS::vector_type_t<T, N> vals,
+                   __ESIMD_DNS::vector_type_t<uint64_t, N> vptr,
+                   __ESIMD_DNS::simd_mask_storage_t<N> pred) __ESIMD_INTRIN_END;
+
+// Scatter data to given SLM addresses.
+template <typename T, int N, size_t Align>
+__ESIMD_INTRIN void __esimd_slm_scatter_st(
+    __ESIMD_DNS::vector_type_t<T, N> vals,
+    __ESIMD_DNS::vector_type_t<uint32_t, N> vptr,
+    __ESIMD_DNS::simd_mask_storage_t<N> pred) __ESIMD_INTRIN_END;
+
 /// Surface-based gather.
 /// Supported platforms: DG2, PVC
 ///
diff --git a/sycl/include/sycl/ext/intel/esimd/memory.hpp b/sycl/include/sycl/ext/intel/esimd/memory.hpp
index d923821027339..4ffd5c418138c 100644
--- a/sycl/include/sycl/ext/intel/esimd/memory.hpp
+++ b/sycl/include/sycl/ext/intel/esimd/memory.hpp
@@ -724,11 +724,20 @@ scatter(T *p, simd<OffsetT, N / VS> byte_offsets, simd<T, N> vals,
 
   // Use LSC lowering if L1/L2 or VS > 1.
   if constexpr (L1Hint != cache_hint::none || L2Hint != cache_hint::none ||
-                VS > 1 || !__ESIMD_DNS::isPowerOf2(N, 32)) {
+                VS > 1 ||
+                (!__ESIMD_DNS::isPowerOf2(N, 32) &&
+                 !detail::isMaskedGatherScatterLLVMAvailable())) {
     static_assert(VS == 1 || sizeof(T) >= 4,
                   "VS > 1 is supprted only for 4- and 8-byte elements");
     return detail::scatter_impl<T, VS, detail::lsc_data_size::default_size,
                                 L1Hint, L2Hint>(p, byte_offsets, vals, mask);
+  } else if constexpr (detail::isMaskedGatherScatterLLVMAvailable()) {
+    simd<uint64_t, N> Addrs(reinterpret_cast<uint64_t>(p));
+    Addrs = Addrs + convert<uint64_t>(byte_offsets);
+    using MsgT = detail::__raw_t<T>;
+    __esimd_scatter_st<MsgT, N, Alignment>(
+        sycl::bit_cast<__ESIMD_DNS::vector_type_t<MsgT, N>>(vals.data()),
+        Addrs.data(), mask.data());
   } else {
     using Tx = detail::__raw_t<T>;
     simd<uint64_t, N> byte_offsets_i = convert<uint64_t>(byte_offsets);
@@ -2685,11 +2694,13 @@ block_store(AccessorT acc, simd<T, N> vals, simd_mask<1> pred,
 namespace detail {
 template <typename T, int N, typename AccessorTy>
 ESIMD_INLINE ESIMD_NODEBUG std::enable_if_t<
-    (sizeof(T) <= 4) && (N == 1 || N == 8 || N == 16 || N == 32) &&
-    (std::is_same_v<detail::LocalAccessorMarker, AccessorTy> ||
-     is_accessor_with_v<AccessorTy, detail::accessor_mode_cap::can_write>)>
+    std::is_same_v<detail::LocalAccessorMarker, AccessorTy> ||
+    is_accessor_with_v<AccessorTy, detail::accessor_mode_cap::can_write>>
 scatter_impl(AccessorTy acc, simd<T, N> vals, simd<uint32_t, N> offsets,
              uint32_t glob_offset, simd_mask<N> mask) {
+
+  static_assert(sizeof(T) <= 4 && detail::isPowerOf2(N, 32),
+                "Unexpected type or vector length");
   constexpr int TypeSizeLog2 = detail::ElemsPerAddrEncoding<sizeof(T)>();
   // TODO (performance) use hardware-supported scale once BE supports it
   constexpr int16_t scale = 0;
@@ -2820,10 +2831,9 @@ gather_impl(AccessorT acc, simd<OffsetT, N / VS> byte_offsets,
 /// @return is a vector of type T and size N * NElts.
 ///
 template <typename T, int NElts, lsc_data_size DS, int N>
-__ESIMD_API __ESIMD_NS::simd<T, N * NElts>
-slm_gather_impl(__ESIMD_NS::simd<uint32_t, N> offsets,
-                __ESIMD_NS::simd_mask<N> pred,
-                __ESIMD_NS::simd<T, N * NElts> pass_thru) {
+__ESIMD_API simd<T, N * NElts> slm_gather_impl(simd<uint32_t, N> offsets,
+                                               simd_mask<N> pred,
+                                               simd<T, N * NElts> pass_thru) {
   check_lsc_vector_size<NElts>();
   check_lsc_data_size<T, DS>();
   constexpr uint16_t AddressScale = 1;
@@ -2832,9 +2842,8 @@ slm_gather_impl(__ESIMD_NS::simd<uint32_t, N> offsets,
   constexpr lsc_vector_size LSCVS = to_lsc_vector_size<NElts>();
   constexpr lsc_data_order Transposed = lsc_data_order::nontranspose;
   using MsgT = typename lsc_expand_type<T>::type;
-  __ESIMD_NS::simd<MsgT, N * NElts> PassThruExpanded =
-      lsc_format_input<MsgT>(pass_thru);
-  __ESIMD_NS::simd<MsgT, N * NElts> Result =
+  simd<MsgT, N * NElts> PassThruExpanded = lsc_format_input<MsgT>(pass_thru);
+  simd<MsgT, N * NElts> Result =
       __esimd_lsc_load_merge_slm<MsgT, cache_hint::none, cache_hint::none,
                                  AddressScale, ImmOffset, EDS, LSCVS,
                                  Transposed, N>(pred.data(), offsets.data(),
@@ -2842,6 +2851,37 @@ slm_gather_impl(__ESIMD_NS::simd<uint32_t, N> offsets,
   return lsc_format_ret<T>(Result);
 }
 
+/// SLM scatter implementation.
+/// Supported platforms: DG2, PVC
+/// VISA instruction: lsc_store.slm
+///
+/// Scatters elements located to slm.
+///
+/// @tparam T is element type.
+/// @tparam NElts is the number of elements to store per address.
+/// @tparam DS is the data size.
+/// @tparam N is the number of channels (platform dependent).
+/// @param offsets is the zero-based offsets for SLM buffer in bytes.
+/// @param vals is values to store.
+/// @param pred is predicates.
+///
+template <typename T, int NElts, lsc_data_size DS, int N>
+__ESIMD_API void slm_scatter_impl(simd<uint32_t, N> offsets,
+                                  simd<T, N * NElts> vals, simd_mask<N> pred) {
+  check_lsc_vector_size<NElts>();
+  check_lsc_data_size<T, DS>();
+  constexpr uint16_t AddressScale = 1;
+  constexpr int ImmOffset = 0;
+  constexpr lsc_data_size EDS = expand_data_size(finalize_data_size<T, DS>());
+  constexpr lsc_vector_size LSCVS = to_lsc_vector_size<NElts>();
+  constexpr lsc_data_order Transposed = lsc_data_order::nontranspose;
+  using MsgT = typename lsc_expand_type<T>::type;
+  simd<MsgT, N * NElts> Tmp = lsc_format_input<MsgT, T>(vals);
+  __esimd_lsc_store_slm<MsgT, cache_hint::none, cache_hint::none, AddressScale,
+                        ImmOffset, EDS, LSCVS, Transposed, N>(
+      pred.data(), offsets.data(), Tmp.data());
+}
+
 } // namespace detail
 
 /// @endcond ESIMD_DETAIL
@@ -3903,7 +3943,7 @@ slm_gather(simd<uint32_t, N / VS> byte_offsets, simd_mask<N / VS> mask,
   static_assert(Alignment >= sizeof(T),
                 "slm_gather() requires at least element-size alignment");
 
-  if constexpr (VS > 1 || (!detail::isPowerOf2(N, 32) &&
+  if constexpr (VS > 1 || (!(detail::isPowerOf2(N, 32) && sizeof(T) <= 4) &&
                            !detail::isMaskedGatherScatterLLVMAvailable())) {
     simd<T, N> PassThru; // Intentionally undefined
     return detail::slm_gather_impl<T, VS, detail::lsc_data_size::default_size>(
@@ -4118,7 +4158,7 @@ slm_gather(OffsetSimdViewT byte_offsets, simd_mask<N / VS> mask,
 /// @param byte_offsets the vector of 32-bit offsets in bytes.
 /// For each i, (byte_offsets[i]) must be element size aligned.
 /// @param props The optional compile-time properties. Only 'alignment'
-/// and cache hint properties are used.
+/// property is used.
 /// @return A vector of elements read.
 template <typename T, int N, int VS = 1, typename OffsetSimdViewT,
           typename PropertyListT =
@@ -4141,17 +4181,162 @@ template <typename T> __ESIMD_API T slm_scalar_load(uint32_t offset) {
   return Res[0];
 }
 
-/// Scatter operation over the Shared Local Memory.
-/// This API has almost the same interface as the @ref accessor_scatter
-/// "accessor-based scatter", except that it does not have the accessor and
-/// the global offset parameters.
-///
-template <typename T, int N>
-__ESIMD_API std::enable_if_t<(N == 1 || N == 8 || N == 16 || N == 32) &&
-                             (sizeof(T) <= 4)>
-slm_scatter(simd<uint32_t, N> offsets, simd<T, N> vals, simd_mask<N> mask = 1) {
-  detail::LocalAccessorMarker acc;
-  detail::scatter_impl<T, N>(acc, vals, offsets, 0, mask);
+/// template <typename T, int N, int VS = 1,
+///           typename PropertyListT = empty_properties_t>
+/// void slm_scatter(simd<uint32_t, N / VS> byte_offsets,
+///                  simd<T, N> vals, simd_mask<N / VS> mask,
+///                  PropertyListT props = {});                   // (slm-sc-1)
+/// void slm_scatter(simd<uint32_t, N / VS> byte_offsets,
+///                   simd<T, N> vals, PropertyListT props = {});  // (slm-sc-2)
+///
+/// The next 2 functions are variations of the first 2 above (slm-sc-1,2)
+/// and were added only to support simd_view instead of simd for byte_offsets.
+/// template <typename T, int N, int VS = 1, typename OffsetObjT,
+///           typename OffsetRegionT, typename PropertyListT = empty_props_t>
+/// void slm_scatter(OffsetSimdViewT byte_offsets,
+///             simd<T, N> vals, simd_mask<N / VS> mask,
+///             PropertyListT props = {});                         // (slm-sc-3)
+/// void slm_scatter(OffsetSimdViewT byte_offsets,
+///             simd<T, N> vals, PropertyListT props = {});        // (slm-sc-4)
+
+/// template <typename T, int N, int VS = 1,
+///           typename PropertyListT = empty_properties_t>
+/// void slm_scatter(simd<uint32_t, N / VS> byte_offsets,
+///                   simd<T, N> vals, simd_mask<N / VS> mask,
+///                   PropertyListT props = {});                   // (slm-sc-1)
+/// Stores ("scatters") elements of the type 'T' to Shared Local Memory
+/// locations addressed by byte offsets \p byte_offsets. Storage of any element
+/// can be disabled via the input vector of predicates \p mask.
+/// If mask[i] is unset, then the storage to (byte_offsets[i]) is skipped.
+/// @tparam T Element type.
+/// @tparam N Number of elements to read.
+/// @tparam VS Vector size. It can also be read as the number of writes per each
+/// address. The parameter 'N' must be divisible by 'VS'. (VS > 1) is supported
+/// only on DG2 and PVC and only for 4- and 8-byte element vectors.
+/// @param byte_offsets the vector of 32-bit offsets in bytes.
+/// For each i, (byte_offsets[i]) must be element size aligned.
+/// If the alignment property is not passed, then it is assumed that each
+/// accessed address is aligned by element-size.
+/// @param vals The vector of values to store.
+/// @param mask The access mask, defaults to all 1s.
+/// @param props The optional compile-time properties. Only 'alignment' property
+/// is used.
+template <typename T, int N, int VS = 1,
+          typename PropertyListT =
+              ext::oneapi::experimental::detail::empty_properties_t>
+__ESIMD_API std::enable_if_t<
+    ext::oneapi::experimental::is_property_list_v<PropertyListT>>
+slm_scatter(simd<uint32_t, N / VS> byte_offsets, simd<T, N> vals,
+            simd_mask<N / VS> mask, PropertyListT props = {}) {
+  static_assert(N / VS >= 1 && N % VS == 0, "N must be divisible by VS");
+
+  constexpr size_t Alignment =
+      detail::getPropertyValue<PropertyListT, alignment_key>(sizeof(T));
+  static_assert(Alignment >= sizeof(T),
+                "slm_scatter() requires at least element-size alignment");
+
+  // Use LSC lowering if VS > 1.
+  if constexpr (VS > 1 || (!(detail::isPowerOf2(N, 32) && sizeof(T) <= 4) &&
+                           !detail::isMaskedGatherScatterLLVMAvailable())) {
+    __ESIMD_DNS::slm_scatter_impl<T, VS, detail::lsc_data_size::default_size>(
+        byte_offsets, vals, mask);
+  } else if constexpr (detail::isMaskedGatherScatterLLVMAvailable()) {
+    using MsgT = detail::__raw_t<T>;
+    __esimd_slm_scatter_st<MsgT, N, Alignment>(
+        sycl::bit_cast<__ESIMD_DNS::vector_type_t<MsgT, N>>(vals.data()),
+        byte_offsets.data(), mask.data());
+  } else {
+    detail::LocalAccessorMarker acc;
+    detail::scatter_impl<T, N>(acc, vals, byte_offsets, 0, mask);
+  }
+}
+
+/// template <typename T, int N, int VS = 1,
+///           typename PropertyListT = empty_properties_t>
+/// void slm_scatter(simd<uint32_t, N / VS> byte_offsets, simd<T, N> vals,
+///                   PropertyListT props = {});                   // (slm-sc-2)
+/// Stores ("scatters") elements of the type 'T' to Shared Local Memory
+/// locations addressed by byte offsets \p byte_offsets.
+/// @tparam T Element type.
+/// @tparam N Number of elements to read.
+/// @tparam VS Vector size. It can also be read as the number of reads per each
+/// address. The parameter 'N' must be divisible by 'VS'. (VS > 1) is supported
+/// only on DG2 and PVC and only for 4- and 8-byte element vectors..
+/// @param byte_offsets the vector of 32-bit offsets in bytes.
+/// For each i, (byte_offsets[i]) must be element size aligned.
+/// @param vals The vector of values to store.
+/// @param props The optional compile-time properties. Only 'alignment'
+/// property is used.
+template <typename T, int N, int VS = 1,
+          typename PropertyListT =
+              ext::oneapi::experimental::detail::empty_properties_t>
+__ESIMD_API std::enable_if_t<
+    ext::oneapi::experimental::is_property_list_v<PropertyListT>>
+slm_scatter(simd<uint32_t, N / VS> byte_offsets, simd<T, N> vals,
+            PropertyListT props = {}) {
+  simd_mask<N / VS> Mask = 1;
+  slm_scatter<T, N, VS>(byte_offsets, vals, Mask, props);
+}
+
+/// template <typename T, int N, int VS = 1, typename OffsetSimdViewT,
+///           typename PropertyListT = empty_props_t>
+/// void slm_scatter(
+///             OffsetSimdViewT byte_offsets, simd<T, N> vals,
+///             simd_mask<N / VS> mask, PropertyListT props = {}); // (slm-sc-3)
+/// Stores ("scatters") elements of the type 'T' to Shared Local Memory
+/// locations addressed by byte offsets \p byte_offsets.
+/// Storage to any element's memory location can be disabled via the
+/// input vector of predicates \p mask. If mask[i] is unset, then the storage to
+/// (byte_offsets[i]) is skipped.
+/// @tparam T Element type.
+/// @tparam N Number of elements to read.
+/// @tparam VS Vector size. It can also be read as the number of reads per each
+/// address. The parameter 'N' must be divisible by 'VS'. (VS > 1) is supported
+/// only on DG2 and PVC and only for 4- and 8-byte element vectors..
+/// @param byte_offsets the vector of 32-bit offsets in bytes.
+/// For each i, (byte_offsets[i]) must be element size aligned.
+/// If the alignment property is not passed, then it is assumed that each
+/// accessed address is aligned by element-size.
+/// @param vals The vector of values to store.
+/// @param mask The access mask, defaults to all 1s.
+/// @param props The optional compile-time properties. Only 'alignment'
+/// property is used.
+template <typename T, int N, int VS = 1, typename OffsetSimdViewT,
+          typename PropertyListT =
+              ext::oneapi::experimental::detail::empty_properties_t>
+__ESIMD_API std::enable_if_t<
+    detail::is_simd_view_type_v<OffsetSimdViewT> &&
+    ext::oneapi::experimental::is_property_list_v<PropertyListT>>
+slm_scatter(OffsetSimdViewT byte_offsets, simd<T, N> vals,
+            simd_mask<N / VS> mask, PropertyListT props = {}) {
+  slm_scatter<T, N, VS>(byte_offsets.read(), vals, mask, props);
+}
+
+/// void slm_scatter(
+///             OffsetSimdViewT byte_offsets, simd<T, N> vals,
+///             PropertyListT props = {});                         // (slm-sc-4)
+/// Loads ("gathers") elements of the type 'T' from Shared Local Memory
+/// locations addressed by byte offsets \p byte_offsets, and returns the loaded
+/// elements.
+/// @tparam T Element type.
+/// @tparam N Number of elements to read.
+/// @tparam VS Vector size. It can also be read as the number of reads per each
+/// address. The parameter 'N' must be divisible by 'VS'. (VS > 1) is supported
+/// only on DG2 and PVC and only for 4- and 8-byte element vectors.
+/// @param byte_offsets the vector of 32-bit offsets in bytes.
+/// For each i, (byte_offsets[i]) must be element size aligned.
+/// @param vals The vector of values to store.
+/// @param props The optional compile-time properties. Only 'alignment'
+/// property is used.
+template <typename T, int N, int VS = 1, typename OffsetSimdViewT,
+          typename PropertyListT =
+              ext::oneapi::experimental::detail::empty_properties_t>
+__ESIMD_API std::enable_if_t<
+    detail::is_simd_view_type_v<OffsetSimdViewT> &&
+    ext::oneapi::experimental::is_property_list_v<PropertyListT>>
+slm_scatter(OffsetSimdViewT byte_offsets, simd<T, N> vals,
+            PropertyListT props = {}) {
+  return slm_scatter<T, N, VS>(byte_offsets.read(), vals, props);
 }
 
 /// Store a scalar value into the Shared Local Memory.
diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp
index 7fb0ae833e793..9d5054b4392ba 100644
--- a/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp
+++ b/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp
@@ -1424,21 +1424,7 @@ template <typename T, int NElts = 1,
 __ESIMD_API void lsc_slm_scatter(__ESIMD_NS::simd<uint32_t, N> offsets,
                                  __ESIMD_NS::simd<T, N * NElts> vals,
                                  __ESIMD_NS::simd_mask<N> pred = 1) {
-  detail::check_lsc_vector_size<NElts>();
-  detail::check_lsc_data_size<T, DS>();
-  constexpr uint16_t _AddressScale = 1;
-  constexpr int _ImmOffset = 0;
-  constexpr lsc_data_size _DS =
-      detail::expand_data_size(detail::finalize_data_size<T, DS>());
-  constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size<NElts>();
-  constexpr detail::lsc_data_order _Transposed =
-      detail::lsc_data_order::nontranspose;
-  using MsgT = typename detail::lsc_expand_type<T>::type;
-  using CstT = __ESIMD_DNS::uint_type_t<sizeof(T)>;
-  __ESIMD_NS::simd<MsgT, N * NElts> Tmp = vals.template bit_cast_view<CstT>();
-  __esimd_lsc_store_slm<MsgT, cache_hint::none, cache_hint::none, _AddressScale,
-                        _ImmOffset, _DS, _VS, _Transposed, N>(
-      pred.data(), offsets.data(), Tmp.data());
+  __ESIMD_DNS::slm_scatter_impl<T, NElts, DS>(offsets, vals, pred);
 }
 
 /// Transposed SLM scatter with 1 channel.
diff --git a/sycl/include/sycl/ext/oneapi/matrix/matrix-unified.hpp b/sycl/include/sycl/ext/oneapi/matrix/matrix-unified.hpp
index 98aea6f04a48b..a07e9c144ba6a 100644
--- a/sycl/include/sycl/ext/oneapi/matrix/matrix-unified.hpp
+++ b/sycl/include/sycl/ext/oneapi/matrix/matrix-unified.hpp
@@ -524,8 +524,9 @@ joint_matrix_prefetch(Group sg, T *Ptr, size_t stride,
   // Will be removed once SPIRV implementation also uses offsetpointer
   size_t coordX = 0;
   size_t coordY = 0;
-  __spirv_JointMatrixPrefetchINTEL<T, NumRows, NumCols>(
-      Ptr, coordX, coordY, detail::PropertyMetaInfo<decltype(prop)>::value,
+  __spirv_CooperativeMatrixPrefetchINTEL<T>(
+      Ptr, coordX, coordY, NumRows, NumCols,
+      detail::PropertyMetaInfo<decltype(prop)>::value,
       sycl::detail::joint_matrix_layout_to_spv(Layout), stride);
 #endif // defined(__NVPTX__)
 #else
diff --git a/sycl/include/sycl/types.hpp b/sycl/include/sycl/types.hpp
index a2f934a147c33..c12b9e6781eb9 100644
--- a/sycl/include/sycl/types.hpp
+++ b/sycl/include/sycl/types.hpp
@@ -987,6 +987,41 @@ template <typename Type, int NumElements> class vec {
 #endif
 
 #ifdef __SYCL_USE_EXT_VECTOR_TYPE__
+#if defined(__INTEL_PREVIEW_BREAKING_CHANGES)
+#define __SYCL_BINOP(BINOP, OPASSIGN, CONVERT)                                 \
+  friend vec operator BINOP(const vec &Lhs, const vec &Rhs) {                  \
+    vec Ret;                                                                   \
+    if constexpr (IsUsingArrayOnDevice) {                                      \
+      for (size_t I = 0; I < NumElements; ++I) {                               \
+        Ret.setValue(I, (Lhs.getValue(I) BINOP Rhs.getValue(I)));              \
+      }                                                                        \
+    } else {                                                                   \
+      Ret.m_Data = Lhs.m_Data BINOP Rhs.m_Data;                                \
+      if constexpr (std::is_same_v<Type, bool> && CONVERT) {                   \
+        Ret.ConvertToDataT();                                                  \
+      }                                                                        \
+    }                                                                          \
+    return Ret;                                                                \
+  }                                                                            \
+  friend vec operator BINOP(const vec &Lhs, const DataT &Rhs) {                \
+    return Lhs BINOP vec(Rhs);                                                 \
+  }                                                                            \
+  friend vec operator BINOP(const DataT &Lhs, const vec &Rhs) {                \
+    return vec(Lhs) BINOP Rhs;                                                 \
+  }                                                                            \
+  friend vec &operator OPASSIGN(vec & Lhs, const vec & Rhs) {                  \
+    Lhs = Lhs BINOP Rhs;                                                       \
+    return Lhs;                                                                \
+  }                                                                            \
+  template <int Num = NumElements>                                             \
+  friend typename std::enable_if_t<Num != 1, vec &> operator OPASSIGN(         \
+      vec & Lhs, const DataT & Rhs) {                                          \
+    Lhs = Lhs BINOP vec(Rhs);                                                  \
+    return Lhs;                                                                \
+  }
+#endif // defined(__INTEL_PREVIEW_BREAKING_CHANGES)
+
+#if !defined(__INTEL_PREVIEW_BREAKING_CHANGES)
 #define __SYCL_BINOP(BINOP, OPASSIGN, CONVERT)                                 \
   template <typename Ty = vec>                                                 \
   vec operator BINOP(const EnableIfNotUsingArrayOnDevice<Ty> &Rhs) const {     \
@@ -1024,38 +1059,37 @@ template <typename Type, int NumElements> class vec {
     *this = *this BINOP vec(Rhs);                                              \
     return *this;                                                              \
   }
+#endif // !defined(__INTEL_PREVIEW_BREAKING_CHANGES)
+
 #else // __SYCL_USE_EXT_VECTOR_TYPE__
 
 #if defined(__INTEL_PREVIEW_BREAKING_CHANGES)
 #define __SYCL_BINOP(BINOP, OPASSIGN, CONVERT)                                 \
-  vec operator BINOP(const vec &Rhs) const {                                   \
+  friend vec operator BINOP(const vec &Lhs, const vec &Rhs) {                  \
     vec Ret{};                                                                 \
     if constexpr (NativeVec)                                                   \
-      Ret.m_Data = m_Data BINOP Rhs.m_Data;                                    \
+      Ret.m_Data = Lhs.m_Data BINOP Rhs.m_Data;                                \
     else                                                                       \
       for (size_t I = 0; I < NumElements; ++I)                                 \
-        Ret.setValue(I, (DataT)(vec_data<DataT>::get(getValue(                 \
+        Ret.setValue(I, (DataT)(vec_data<DataT>::get(Lhs.getValue(             \
                             I)) BINOP vec_data<DataT>::get(Rhs.getValue(I)))); \
     return Ret;                                                                \
   }                                                                            \
-  template <typename T>                                                        \
-  typename std::enable_if_t<                                                   \
-      std::is_convertible_v<DataT, T> &&                                       \
-          (std::is_fundamental_v<vec_data_t<T>> ||                             \
-           detail::is_half_or_bf16_v<typename std::remove_const_t<T>>),        \
-      vec>                                                                     \
-  operator BINOP(const T & Rhs) const {                                        \
-    return *this BINOP vec(static_cast<const DataT &>(Rhs));                   \
+  friend vec operator BINOP(const vec &Lhs, const DataT &Rhs) {                \
+    return Lhs BINOP vec(Rhs);                                                 \
   }                                                                            \
-  vec &operator OPASSIGN(const vec & Rhs) {                                    \
-    *this = *this BINOP Rhs;                                                   \
-    return *this;                                                              \
+  friend vec operator BINOP(const DataT &Lhs, const vec &Rhs) {                \
+    return vec(Lhs) BINOP Rhs;                                                 \
+  }                                                                            \
+  friend vec &operator OPASSIGN(vec & Lhs, const vec & Rhs) {                  \
+    Lhs = Lhs BINOP Rhs;                                                       \
+    return Lhs;                                                                \
   }                                                                            \
   template <int Num = NumElements>                                             \
-  typename std::enable_if_t<Num != 1, vec &> operator OPASSIGN(                \
-      const DataT & Rhs) {                                                     \
-    *this = *this BINOP vec(Rhs);                                              \
-    return *this;                                                              \
+  friend typename std::enable_if_t<Num != 1, vec &> operator OPASSIGN(         \
+      vec & Lhs, const DataT & Rhs) {                                          \
+    Lhs = Lhs BINOP vec(Rhs);                                                  \
+    return Lhs;                                                                \
   }
 #endif // defined(__INTEL_PREVIEW_BREAKING_CHANGES)
 
@@ -1120,6 +1154,42 @@ template <typename Type, int NumElements> class vec {
 // Use __SYCL_DEVICE_ONLY__ macro because cast to OpenCL vector type is defined
 // by SYCL device compiler only.
 #ifdef __SYCL_DEVICE_ONLY__
+#if defined(__INTEL_PREVIEW_BREAKING_CHANGES)
+#define __SYCL_RELLOGOP(RELLOGOP)                                              \
+  friend vec<rel_t, NumElements> operator RELLOGOP(const vec & Lhs,            \
+                                                   const vec & Rhs) {          \
+    vec<rel_t, NumElements> Ret{};                                             \
+    /* This special case is needed since there are no standard operator||   */ \
+    /* or operator&& functions for std::array.                              */ \
+    if constexpr (IsUsingArrayOnDevice &&                                      \
+                  (std::string_view(#RELLOGOP) == "||" ||                      \
+                   std::string_view(#RELLOGOP) == "&&")) {                     \
+      for (size_t I = 0; I < NumElements; ++I) {                               \
+        /* We cannot use SetValue here as the operator is not a friend of*/    \
+        /* Ret on Windows. */                                                  \
+        Ret[I] = static_cast<rel_t>(-(vec_data<DataT>::get(                    \
+            Lhs.getValue(I)) RELLOGOP vec_data<DataT>::get(Rhs.getValue(I)))); \
+      }                                                                        \
+    } else {                                                                   \
+      Ret = vec<rel_t, NumElements>(                                           \
+          (typename vec<rel_t, NumElements>::vector_t)(                        \
+              Lhs.m_Data RELLOGOP Rhs.m_Data));                                \
+      if (NumElements == 1) /*Scalar 0/1 logic was applied, invert*/           \
+        Ret *= -1;                                                             \
+    }                                                                          \
+    return Ret;                                                                \
+  }                                                                            \
+  friend vec<rel_t, NumElements> operator RELLOGOP(const vec & Lhs,            \
+                                                   const DataT & Rhs) {        \
+    return Lhs RELLOGOP vec(Rhs);                                              \
+  }                                                                            \
+  friend vec<rel_t, NumElements> operator RELLOGOP(const DataT & Lhs,          \
+                                                   const vec & Rhs) {          \
+    return vec(Lhs) RELLOGOP Rhs;                                              \
+  }
+#endif // defined(__INTEL_PREVIEW_BREAKING_CHANGES)
+
+#if !defined(__INTEL_PREVIEW_BREAKING_CHANGES)
 #define __SYCL_RELLOGOP(RELLOGOP)                                              \
   vec<rel_t, NumElements> operator RELLOGOP(const vec & Rhs) const {           \
     vec<rel_t, NumElements> Ret{};                                             \
@@ -1129,9 +1199,10 @@ template <typename Type, int NumElements> class vec {
                   (std::string_view(#RELLOGOP) == "||" ||                      \
                    std::string_view(#RELLOGOP) == "&&")) {                     \
       for (size_t I = 0; I < NumElements; ++I) {                               \
-        Ret.setValue(I,                                                        \
-                     -(vec_data<DataT>::get(getValue(I))                       \
-                           RELLOGOP vec_data<DataT>::get(Rhs.getValue(I))));   \
+        /* We cannot use SetValue here as the operator is not a friend of*/    \
+        /* Ret on Windows. */                                                  \
+        Ret[I] = static_cast<rel_t>(-(vec_data<DataT>::get(                    \
+            getValue(I)) RELLOGOP vec_data<DataT>::get(Rhs.getValue(I))));     \
       }                                                                        \
     } else {                                                                   \
       Ret = vec<rel_t, NumElements>(                                           \
@@ -1150,13 +1221,38 @@ template <typename Type, int NumElements> class vec {
   operator RELLOGOP(const T & Rhs) const {                                     \
     return *this RELLOGOP vec(static_cast<const DataT &>(Rhs));                \
   }
+#endif // !defined(__INTEL_PREVIEW_BREAKING_CHANGES)
 #else
+#if defined(__INTEL_PREVIEW_BREAKING_CHANGES)
+#define __SYCL_RELLOGOP(RELLOGOP)                                              \
+  friend vec<rel_t, NumElements> operator RELLOGOP(const vec & Lhs,            \
+                                                   const vec & Rhs) {          \
+    vec<rel_t, NumElements> Ret{};                                             \
+    for (size_t I = 0; I < NumElements; ++I) {                                 \
+      /* We cannot use SetValue here as the operator is not a friend of*/      \
+      /* Ret on Windows. */                                                    \
+      Ret[I] = static_cast<rel_t>(-(vec_data<DataT>::get(                      \
+          Lhs.getValue(I)) RELLOGOP vec_data<DataT>::get(Rhs.getValue(I))));   \
+    }                                                                          \
+    return Ret;                                                                \
+  }                                                                            \
+  friend vec<rel_t, NumElements> operator RELLOGOP(const vec & Lhs,            \
+                                                   const DataT & Rhs) {        \
+    return Lhs RELLOGOP vec(Rhs);                                              \
+  }                                                                            \
+  friend vec<rel_t, NumElements> operator RELLOGOP(const DataT & Lhs,          \
+                                                   const vec & Rhs) {          \
+    return vec(Lhs) RELLOGOP Rhs;                                              \
+  }
+#else // defined(__INTEL_PREVIEW_BREAKING_CHANGES)
 #define __SYCL_RELLOGOP(RELLOGOP)                                              \
   vec<rel_t, NumElements> operator RELLOGOP(const vec & Rhs) const {           \
     vec<rel_t, NumElements> Ret{};                                             \
     for (size_t I = 0; I < NumElements; ++I) {                                 \
-      Ret.setValue(I, -(vec_data<DataT>::get(getValue(I))                      \
-                            RELLOGOP vec_data<DataT>::get(Rhs.getValue(I))));  \
+      /* We cannot use SetValue here as the operator is not a friend of*/      \
+      /* Ret on Windows. */                                                    \
+      Ret[I] = static_cast<rel_t>(-(vec_data<DataT>::get(                      \
+          getValue(I)) RELLOGOP vec_data<DataT>::get(Rhs.getValue(I))));       \
     }                                                                          \
     return Ret;                                                                \
   }                                                                            \
@@ -1168,6 +1264,7 @@ template <typename Type, int NumElements> class vec {
   operator RELLOGOP(const T & Rhs) const {                                     \
     return *this RELLOGOP vec(static_cast<const DataT &>(Rhs));                \
   }
+#endif // defined(__INTEL_PREVIEW_BREAKING_CHANGES)
 #endif
 
   __SYCL_RELLOGOP(==)
@@ -1184,6 +1281,18 @@ template <typename Type, int NumElements> class vec {
 #ifdef __SYCL_UOP
 #error "Undefine __SYCL_UOP macro"
 #endif
+#if defined(__INTEL_PREVIEW_BREAKING_CHANGES)
+#define __SYCL_UOP(UOP, OPASSIGN)                                              \
+  friend vec &operator UOP(vec & Rhs) {                                        \
+    Rhs OPASSIGN vec_data<DataT>::get(1);                                      \
+    return Rhs;                                                                \
+  }                                                                            \
+  friend vec operator UOP(vec &Lhs, int) {                                     \
+    vec Ret(Lhs);                                                              \
+    Lhs OPASSIGN vec_data<DataT>::get(1);                                      \
+    return Ret;                                                                \
+  }
+#else // defined(__INTEL_PREVIEW_BREAKING_CHANGES)
 #define __SYCL_UOP(UOP, OPASSIGN)                                              \
   vec &operator UOP() {                                                        \
     *this OPASSIGN vec_data<DataT>::get(1);                                    \
@@ -1194,6 +1303,7 @@ template <typename Type, int NumElements> class vec {
     *this OPASSIGN vec_data<DataT>::get(1);                                    \
     return Ret;                                                                \
   }
+#endif // defined(__INTEL_PREVIEW_BREAKING_CHANGES)
 
   __SYCL_UOP(++, +=)
   __SYCL_UOP(--, -=)
@@ -1203,150 +1313,84 @@ template <typename Type, int NumElements> class vec {
 
   // operator~() available only when: dataT != float && dataT != double
   // && dataT != half
-  template <typename T = DataT>
-  typename std::enable_if_t<!std::is_floating_point_v<vec_data_t<T>> &&
-                                (!IsUsingArrayOnDevice && !IsUsingArrayOnHost),
-                            vec>
-  operator~() const {
-    vec Ret{(typename vec::DataType) ~m_Data};
-    if constexpr (std::is_same_v<Type, bool>) {
-      Ret.ConvertToDataT();
-    }
-    return Ret;
-  }
-  template <typename T = DataT>
-  typename std::enable_if_t<!std::is_floating_point_v<vec_data_t<T>> &&
-                                (IsUsingArrayOnDevice || IsUsingArrayOnHost),
-                            vec>
-  operator~() const {
-    vec Ret{};
-    for (size_t I = 0; I < NumElements; ++I) {
-      Ret.setValue(I, ~getValue(I));
+  friend vec operator~(const vec &Rhs) {
+    if constexpr (IsUsingArrayOnDevice || IsUsingArrayOnHost) {
+      vec Ret{};
+      for (size_t I = 0; I < NumElements; ++I) {
+        Ret.setValue(I, ~Rhs.getValue(I));
+      }
+      return Ret;
+    } else {
+      vec Ret{(typename vec::DataType) ~Rhs.m_Data};
+      if constexpr (std::is_same_v<Type, bool>) {
+        Ret.ConvertToDataT();
+      }
+      return Ret;
     }
-    return Ret;
   }
 
-  template <typename T>
-#if defined(__INTEL_PREVIEW_BREAKING_CHANGES)
-  using OpNotRet = detail::rel_t<T>;
-#else
-  using OpNotRet = T;
-#endif // __INTEL_PREVIEW_BREAKING_CHANGES
-
   // operator!
-  template <typename T = DataT, int N = NumElements>
-  EnableIfNotUsingArray<vec<OpNotRet<T>, N>> operator!() const {
-    return vec<T, N>{(typename vec<DataT, NumElements>::DataType) !m_Data}
-#if defined(__INTEL_PREVIEW_BREAKING_CHANGES)
-        .template as<vec<OpNotRet<T>, N>>();
-#else
-    ;
-#endif // __INTEL_PREVIEW_BREAKING_CHANGES
-  }
-
-  // std::byte neither supports ! unary op or casting, so special handling is
-  // needed. And, worse, Windows has a conflict with 'byte'.
+  friend vec<detail::rel_t<DataT>, NumElements> operator!(const vec &Rhs) {
+    if constexpr (IsUsingArrayOnDevice || IsUsingArrayOnHost) {
+      vec Ret{};
+      for (size_t I = 0; I < NumElements; ++I) {
 #if (!defined(_HAS_STD_BYTE) || _HAS_STD_BYTE != 0)
-  template <typename T = DataT, int N = NumElements>
-  typename std::enable_if_t<std::is_same_v<std::byte, T> &&
-                                (IsUsingArrayOnDevice || IsUsingArrayOnHost),
-                            vec<OpNotRet<T>, N>>
-  operator!() const {
-    vec Ret{};
-    for (size_t I = 0; I < NumElements; ++I) {
-      Ret.setValue(I, std::byte{!vec_data<DataT>::get(getValue(I))});
+        // std::byte neither supports ! unary op or casting, so special handling
+        // is needed. And, worse, Windows has a conflict with 'byte'.
+        if constexpr (std::is_same_v<std::byte, DataT>) {
+          Ret.setValue(I, std::byte{!vec_data<DataT>::get(Rhs.getValue(I))});
+        } else
+#endif // (!defined(_HAS_STD_BYTE) || _HAS_STD_BYTE != 0)
+        {
+          Ret.setValue(I, !vec_data<DataT>::get(Rhs.getValue(I)));
+        }
+      }
+      return Ret.template as<vec<detail::rel_t<DataT>, NumElements>>();
+    } else {
+      return vec{(typename vec<DataT, NumElements>::DataType) !Rhs.m_Data}
+          .template as<vec<detail::rel_t<DataT>, NumElements>>();
     }
-#if defined(__INTEL_PREVIEW_BREAKING_CHANGES)
-    return Ret.template as<vec<OpNotRet<T>, N>>();
-#else
-    return Ret;
-#endif // __INTEL_PREVIEW_BREAKING_CHANGES
   }
 
-  template <typename T = DataT, int N = NumElements>
-  typename std::enable_if_t<!std::is_same_v<std::byte, T> &&
-                                (IsUsingArrayOnDevice || IsUsingArrayOnHost),
-                            vec<OpNotRet<T>, N>>
-  operator!() const {
-    vec Ret{};
-    for (size_t I = 0; I < NumElements; ++I)
-      Ret.setValue(I, !vec_data<DataT>::get(getValue(I)));
-#if defined(__INTEL_PREVIEW_BREAKING_CHANGES)
-    return Ret.template as<vec<OpNotRet<T>, N>>();
-#else
-    return Ret;
-#endif // __INTEL_PREVIEW_BREAKING_CHANGES
-  }
-#else
-  template <typename T = DataT, int N = NumElements>
-  EnableIfUsingArray<vec<OpNotRet<T>, N>> operator!() const {
-    vec Ret{};
-    for (size_t I = 0; I < NumElements; ++I)
-      Ret.setValue(I, !vec_data<DataT>::get(getValue(I)));
-#if defined(__INTEL_PREVIEW_BREAKING_CHANGES)
-    return Ret.template as<vec<OpNotRet<T>, N>>();
-#else
-    return Ret;
-#endif // __INTEL_PREVIEW_BREAKING_CHANGES
-  }
-#endif
-
   // operator +
-  template <typename T = vec> EnableIfNotUsingArray<T> operator+() const {
-    return vec{+m_Data};
-  }
-
-  template <typename T = vec> EnableIfUsingArray<T> operator+() const {
-    vec Ret{};
-    for (size_t I = 0; I < NumElements; ++I)
-      Ret.setValue(I, vec_data<DataT>::get(+vec_data<DataT>::get(getValue(I))));
-    return Ret;
-  }
-
-  // operator -
-  template <typename T = vec> EnableIfNotUsingArray<T> operator-() const {
-    namespace oneapi = sycl::ext::oneapi;
-    if constexpr (IsBfloat16 && NumElements == 1) {
-      vec Ret{};
-      oneapi::bfloat16 v = oneapi::detail::bitsToBfloat16(m_Data);
-      oneapi::bfloat16 w = -v;
-      Ret.m_Data = oneapi::detail::bfloat16ToBits(w);
-    } else if constexpr (IsBfloat16) {
+  friend vec operator+(const vec &Lhs) {
+    if constexpr (IsUsingArrayOnDevice || IsUsingArrayOnHost) {
       vec Ret{};
-      for (size_t I = 0; I < NumElements; ++I) {
-        oneapi::bfloat16 v = oneapi::detail::bitsToBfloat16(m_Data[I]);
-        oneapi::bfloat16 w = -v;
-        Ret.m_Data[I] = oneapi::detail::bfloat16ToBits(w);
-      }
+      for (size_t I = 0; I < NumElements; ++I)
+        Ret.setValue(
+            I, vec_data<DataT>::get(+vec_data<DataT>::get(Lhs.getValue(I))));
       return Ret;
     } else {
-      vec Ret{-m_Data};
-      if constexpr (std::is_same_v<Type, bool>) {
-        Ret.ConvertToDataT();
-      }
-      return Ret;
+      return vec{+Lhs.m_Data};
     }
   }
 
-  template <typename T = vec> EnableIfUsingArray<T> operator-() const {
+  // operator -
+  friend vec operator-(const vec &Lhs) {
     namespace oneapi = sycl::ext::oneapi;
     vec Ret{};
     if constexpr (IsBfloat16 && NumElements == 1) {
-      oneapi::bfloat16 v = oneapi::detail::bitsToBfloat16(m_Data);
+      oneapi::bfloat16 v = oneapi::detail::bitsToBfloat16(Lhs.m_Data);
       oneapi::bfloat16 w = -v;
       Ret.m_Data = oneapi::detail::bfloat16ToBits(w);
     } else if constexpr (IsBfloat16) {
       for (size_t I = 0; I < NumElements; I++) {
-        oneapi::bfloat16 v = oneapi::detail::bitsToBfloat16(m_Data[I]);
+        oneapi::bfloat16 v = oneapi::detail::bitsToBfloat16(Lhs.m_Data[I]);
         oneapi::bfloat16 w = -v;
         Ret.m_Data[I] = oneapi::detail::bfloat16ToBits(w);
       }
-    } else {
+    } else if constexpr (IsUsingArrayOnDevice || IsUsingArrayOnHost) {
       for (size_t I = 0; I < NumElements; ++I)
-        Ret.setValue(I,
-                     vec_data<DataT>::get(-vec_data<DataT>::get(getValue(I))));
+        Ret.setValue(
+            I, vec_data<DataT>::get(-vec_data<DataT>::get(Lhs.getValue(I))));
+      return Ret;
+    } else {
+      Ret = vec{-Lhs.m_Data};
+      if constexpr (std::is_same_v<Type, bool>) {
+        Ret.ConvertToDataT();
+      }
+      return Ret;
     }
-    return Ret;
   }
 #endif // defined(__INTEL_PREVIEW_BREAKING_CHANGES)
 
@@ -1770,6 +1814,19 @@ class SwizzleOp {
 #ifdef __SYCL_OPASSIGN
 #error "Undefine __SYCL_OPASSIGN macro."
 #endif
+#if defined(__INTEL_PREVIEW_BREAKING_CHANGES)
+#define __SYCL_OPASSIGN(OPASSIGN, OP)                                          \
+  friend SwizzleOp &operator OPASSIGN(SwizzleOp & Lhs, const DataT & Rhs) {    \
+    Lhs.operatorHelper<OP>(vec_t(Rhs));                                        \
+    return Lhs;                                                                \
+  }                                                                            \
+  template <typename RhsOperation>                                             \
+  friend SwizzleOp &operator OPASSIGN(SwizzleOp & Lhs,                         \
+                                      const RhsOperation & Rhs) {              \
+    Lhs.operatorHelper<OP>(Rhs);                                               \
+    return Lhs;                                                                \
+  }
+#else // defined(__INTEL_PREVIEW_BREAKING_CHANGES)
 #define __SYCL_OPASSIGN(OPASSIGN, OP)                                          \
   SwizzleOp &operator OPASSIGN(const DataT & Rhs) {                            \
     operatorHelper<OP>(vec_t(Rhs));                                            \
@@ -1780,6 +1837,7 @@ class SwizzleOp {
     operatorHelper<OP>(Rhs);                                                   \
     return *this;                                                              \
   }
+#endif // defined(__INTEL_PREVIEW_BREAKING_CHANGES)
 
   __SYCL_OPASSIGN(+=, std::plus)
   __SYCL_OPASSIGN(-=, std::minus)
@@ -1796,6 +1854,18 @@ class SwizzleOp {
 #ifdef __SYCL_UOP
 #error "Undefine __SYCL_UOP macro"
 #endif
+#if defined(__INTEL_PREVIEW_BREAKING_CHANGES)
+#define __SYCL_UOP(UOP, OPASSIGN)                                              \
+  friend SwizzleOp &operator UOP(SwizzleOp & Rhs) {                            \
+    Rhs OPASSIGN static_cast<DataT>(1);                                        \
+    return Rhs;                                                                \
+  }                                                                            \
+  friend vec_t operator UOP(SwizzleOp &Lhs, int) {                             \
+    vec_t Ret = Lhs;                                                           \
+    Lhs OPASSIGN static_cast<DataT>(1);                                        \
+    return Ret;                                                                \
+  }
+#else // defined(__INTEL_PREVIEW_BREAKING_CHANGES)
 #define __SYCL_UOP(UOP, OPASSIGN)                                              \
   SwizzleOp &operator UOP() {                                                  \
     *this OPASSIGN static_cast<DataT>(1);                                      \
@@ -1806,11 +1876,36 @@ class SwizzleOp {
     *this OPASSIGN static_cast<DataT>(1);                                      \
     return Ret;                                                                \
   }
+#endif // defined(__INTEL_PREVIEW_BREAKING_CHANGES)
 
   __SYCL_UOP(++, +=)
   __SYCL_UOP(--, -=)
 #undef __SYCL_UOP
 
+#if defined(__INTEL_PREVIEW_BREAKING_CHANGES)
+  template <typename T = DataT>
+  friend typename std::enable_if_t<
+      std::is_same_v<T, DataT> && std::is_integral_v<vec_data_t<T>>, vec_t>
+  operator~(const SwizzleOp &Rhs) {
+    vec_t Tmp = Rhs;
+    return ~Tmp;
+  }
+
+  friend vec_rel_t operator!(const SwizzleOp &Rhs) {
+    vec_t Tmp = Rhs;
+    return !Tmp;
+  }
+
+  friend vec_t operator+(const SwizzleOp &Rhs) {
+    vec_t Tmp = Rhs;
+    return +Tmp;
+  }
+
+  friend vec_t operator-(const SwizzleOp &Rhs) {
+    vec_t Tmp = Rhs;
+    return -Tmp;
+  }
+#else  // defined(__INTEL_PREVIEW_BREAKING_CHANGES)
   template <typename T = DataT>
   typename std::enable_if_t<std::is_integral_v<vec_data_t<T>>, vec_t>
   operator~() {
@@ -1832,6 +1927,80 @@ class SwizzleOp {
     vec_t Tmp = *this;
     return -Tmp;
   }
+#endif // defined(__INTEL_PREVIEW_BREAKING_CHANGES)
+
+#if defined(__INTEL_PREVIEW_BREAKING_CHANGES)
+// scalar BINOP vec<>
+// scalar BINOP SwizzleOp
+// vec<> BINOP SwizzleOp
+#ifdef __SYCL_BINOP
+#error "Undefine __SYCL_BINOP macro"
+#endif
+#define __SYCL_BINOP(BINOP)                                                    \
+  friend vec_t operator BINOP(const DataT &Lhs, const SwizzleOp &Rhs) {        \
+    vec_t Tmp = Rhs;                                                           \
+    return Lhs BINOP Tmp;                                                      \
+  }                                                                            \
+  friend vec_t operator BINOP(const SwizzleOp &Lhs, const DataT &Rhs) {        \
+    vec_t Tmp = Lhs;                                                           \
+    return Tmp BINOP Rhs;                                                      \
+  }                                                                            \
+  friend vec_t operator BINOP(const vec_t &Lhs, const SwizzleOp &Rhs) {        \
+    vec_t Tmp = Rhs;                                                           \
+    return Lhs BINOP Tmp;                                                      \
+  }                                                                            \
+  friend vec_t operator BINOP(const SwizzleOp &Lhs, const vec_t &Rhs) {        \
+    vec_t Tmp = Lhs;                                                           \
+    return Tmp BINOP Rhs;                                                      \
+  }
+
+  __SYCL_BINOP(+)
+  __SYCL_BINOP(-)
+  __SYCL_BINOP(*)
+  __SYCL_BINOP(/)
+  __SYCL_BINOP(%)
+  __SYCL_BINOP(&)
+  __SYCL_BINOP(|)
+  __SYCL_BINOP(^)
+  __SYCL_BINOP(>>)
+  __SYCL_BINOP(<<)
+#undef __SYCL_BINOP
+
+// scalar RELLOGOP vec<>
+// scalar RELLOGOP SwizzleOp
+// vec<> RELLOGOP SwizzleOp
+#ifdef __SYCL_RELLOGOP
+#error "Undefine __SYCL_RELLOGOP macro"
+#endif
+#define __SYCL_RELLOGOP(RELLOGOP)                                              \
+  friend vec_rel_t operator RELLOGOP(const DataT &Lhs, const SwizzleOp &Rhs) { \
+    vec_t Tmp = Rhs;                                                           \
+    return Lhs RELLOGOP Tmp;                                                   \
+  }                                                                            \
+  friend vec_rel_t operator RELLOGOP(const SwizzleOp &Lhs, const DataT &Rhs) { \
+    vec_t Tmp = Lhs;                                                           \
+    return Tmp RELLOGOP Rhs;                                                   \
+  }                                                                            \
+  friend vec_rel_t operator RELLOGOP(const vec_t &Lhs, const SwizzleOp &Rhs) { \
+    vec_t Tmp = Rhs;                                                           \
+    return Lhs RELLOGOP Tmp;                                                   \
+  }                                                                            \
+  friend vec_rel_t operator RELLOGOP(const SwizzleOp &Lhs, const vec_t &Rhs) { \
+    vec_t Tmp = Lhs;                                                           \
+    return Tmp RELLOGOP Rhs;                                                   \
+  }
+
+  __SYCL_RELLOGOP(==)
+  __SYCL_RELLOGOP(!=)
+  __SYCL_RELLOGOP(>)
+  __SYCL_RELLOGOP(<)
+  __SYCL_RELLOGOP(>=)
+  __SYCL_RELLOGOP(<=)
+  // TODO: limit to integral types.
+  __SYCL_RELLOGOP(&&)
+  __SYCL_RELLOGOP(||)
+#undef __SYCL_RELLOGOP
+#endif // defined(__INTEL_PREVIEW_BREAKING_CHANGES)
 
   template <int IdxNum = getNumElements(),
             typename = EnableIfMultipleIndexes<IdxNum>>
@@ -2265,6 +2434,7 @@ class SwizzleOp {
 };
 } // namespace detail
 
+#if !defined(__INTEL_PREVIEW_BREAKING_CHANGES)
 // scalar BINOP vec<>
 // scalar BINOP SwizzleOp
 // vec<> BINOP SwizzleOp
@@ -2374,6 +2544,7 @@ __SYCL_RELLOGOP(<=)
 __SYCL_RELLOGOP(&&)
 __SYCL_RELLOGOP(||)
 #undef __SYCL_RELLOGOP
+#endif // !defined(__INTEL_PREVIEW_BREAKING_CHANGES)
 
 namespace detail {
 
diff --git a/sycl/include/sycl/usm.hpp b/sycl/include/sycl/usm.hpp
index f9ab634edc81a..ff941e865dff8 100644
--- a/sycl/include/sycl/usm.hpp
+++ b/sycl/include/sycl/usm.hpp
@@ -154,6 +154,13 @@ __SYCL_EXPORT void *aligned_alloc(
     const property_list &propList,
     const detail::code_location &CodeLoc = detail::code_location::current());
 
+///
+// Helper function used to determine if the Alignment argument is a power of 2
+///
+inline size_t is_not_power_of_two(size_t Alignment) {
+  return (Alignment & (Alignment - 1));
+}
+
 ///
 // Template forms
 ///
@@ -179,6 +186,9 @@ T *aligned_alloc_device(
     size_t Alignment, size_t Count, const device &Dev, const context &Ctxt,
     const property_list &PropList = {},
     const detail::code_location &CodeLoc = detail::code_location::current()) {
+  if (is_not_power_of_two(Alignment)) {
+    return nullptr;
+  }
   return static_cast<T *>(aligned_alloc_device(max(Alignment, alignof(T)),
                                                Count * sizeof(T), Dev, Ctxt,
                                                PropList, CodeLoc));
@@ -189,6 +199,9 @@ T *aligned_alloc_device(
     size_t Alignment, size_t Count, const queue &Q,
     const property_list &PropList = {},
     const detail::code_location &CodeLoc = detail::code_location::current()) {
+  if (is_not_power_of_two(Alignment)) {
+    return nullptr;
+  }
   return aligned_alloc_device<T>(Alignment, Count, Q.get_device(),
                                  Q.get_context(), PropList, CodeLoc);
 }
@@ -230,6 +243,9 @@ T *aligned_alloc_host(
     size_t Alignment, size_t Count, const context &Ctxt,
     const property_list &PropList = {},
     const detail::code_location &CodeLoc = detail::code_location::current()) {
+  if (is_not_power_of_two(Alignment)) {
+    return nullptr;
+  }
   return static_cast<T *>(aligned_alloc_host(std ::max(Alignment, alignof(T)),
                                              Count * sizeof(T), Ctxt, PropList,
                                              CodeLoc));
@@ -240,6 +256,9 @@ T *aligned_alloc_host(
     size_t Alignment, size_t Count, const queue &Q,
     const property_list &PropList = {},
     const detail::code_location &CodeLoc = detail::code_location::current()) {
+  if (is_not_power_of_two(Alignment)) {
+    return nullptr;
+  }
   return aligned_alloc_host<T>(Alignment, Count, Q.get_context(), PropList,
                                CodeLoc);
 }
@@ -249,6 +268,9 @@ T *aligned_alloc_shared(
     size_t Alignment, size_t Count, const device &Dev, const context &Ctxt,
     const property_list &PropList = {},
     const detail::code_location &CodeLoc = detail::code_location::current()) {
+  if (is_not_power_of_two(Alignment)) {
+    return nullptr;
+  }
   return static_cast<T *>(aligned_alloc_shared(max(Alignment, alignof(T)),
                                                Count * sizeof(T), Dev, Ctxt,
                                                PropList, CodeLoc));
@@ -259,6 +281,9 @@ T *aligned_alloc_shared(
     size_t Alignment, size_t Count, const queue &Q,
     const property_list &PropList = {},
     const detail::code_location &CodeLoc = detail::code_location::current()) {
+  if (is_not_power_of_two(Alignment)) {
+    return nullptr;
+  }
   return aligned_alloc_shared<T>(Alignment, Count, Q.get_device(),
                                  Q.get_context(), PropList, CodeLoc);
 }
@@ -286,6 +311,9 @@ T *aligned_alloc(
     size_t Alignment, size_t Count, const device &Dev, const context &Ctxt,
     usm::alloc Kind, const property_list &PropList = {},
     const detail::code_location &CodeLoc = detail::code_location::current()) {
+  if (is_not_power_of_two(Alignment)) {
+    return nullptr;
+  }
   return static_cast<T *>(aligned_alloc(max(Alignment, alignof(T)),
                                         Count * sizeof(T), Dev, Ctxt, Kind,
                                         PropList, CodeLoc));
@@ -296,6 +324,9 @@ T *aligned_alloc(
     size_t Alignment, size_t Count, const queue &Q, usm::alloc Kind,
     const property_list &PropList = {},
     const detail::code_location &CodeLoc = detail::code_location::current()) {
+  if (is_not_power_of_two(Alignment)) {
+    return nullptr;
+  }
   return aligned_alloc<T>(Alignment, Count, Q.get_device(), Q.get_context(),
                           Kind, PropList, CodeLoc);
 }
diff --git a/sycl/plugins/unified_runtime/CMakeLists.txt b/sycl/plugins/unified_runtime/CMakeLists.txt
index 6a91815bc890f..2ea2b2f37622a 100644
--- a/sycl/plugins/unified_runtime/CMakeLists.txt
+++ b/sycl/plugins/unified_runtime/CMakeLists.txt
@@ -57,13 +57,13 @@ if(SYCL_PI_UR_USE_FETCH_CONTENT)
   include(FetchContent)
 
   set(UNIFIED_RUNTIME_REPO "https://github.com/oneapi-src/unified-runtime.git")
-  # commit 1cd402ead34a54459a6acb85777fbec105a178a0
-  # Merge: 5b89ee8b c8e150c5
+  # commit 9babc4d092a92c1036791d26ef328e5eeaf19803
+  # Merge: 3be8f205 90498ec5
   # Author: aarongreig <aaron.greig@codeplay.com>
-  # Date:   Tue Feb 6 14:48:55 2024 +0000
-  #     Merge pull request #1218 from Bensuo/maxime/imm-cmd-list-support
-  #     [EXP][CMDBUF] L0 Immediate command-list support
-  set(UNIFIED_RUNTIME_TAG 1cd402ead34a54459a6acb85777fbec105a178a0)
+  # Date:   Thu Feb 8 15:44:54 2024 +0000
+  #     Merge pull request #1321 from pbalcer/adapter-compute-constructor
+  #     [L0] move adapter init into its constructor from urAdapterGet
+  set(UNIFIED_RUNTIME_TAG 9babc4d092a92c1036791d26ef328e5eeaf19803)
   
   if(SYCL_PI_UR_OVERRIDE_FETCH_CONTENT_REPO)
     set(UNIFIED_RUNTIME_REPO "${SYCL_PI_UR_OVERRIDE_FETCH_CONTENT_REPO}")
diff --git a/sycl/source/builtins/common_functions.cpp b/sycl/source/builtins/common_functions.cpp
index 09742649ca24d..30d0645f2b517 100644
--- a/sycl/source/builtins/common_functions.cpp
+++ b/sycl/source/builtins/common_functions.cpp
@@ -63,16 +63,8 @@ BUILTIN_COMMON(TWO_ARGS, max,
 BUILTIN_COMMON(TWO_ARGS, min,
                [](auto x, auto y) -> decltype(x) { return (y < x ? y : x); })
 
-// clamp is implemented for INTEGER_TYPES as well, so expand/inline
-// BUILTIN_COMMON manually.
-HOST_IMPL(clamp, [](auto x, auto y, auto z) -> decltype(x) {
-  using ElemTy = detail::get_elem_type_t<decltype(x)>;
-  if constexpr (std::is_integral_v<ElemTy>) {
-    return std::min(std::max(x, y), z);
-  } else {
-    return std::fmin(std::fmax(x, y), z);
-  }
+BUILTIN_COMMON(THREE_ARGS, clamp, [](auto x, auto y, auto z) -> decltype(x) {
+  return std::fmin(std::fmax(x, y), z);
 })
-EXPORT_SCALAR_AND_VEC_1_16(THREE_ARGS, clamp, INTEGER_TYPES, FP_TYPES)
 } // namespace _V1
 } // namespace sycl
diff --git a/sycl/source/builtins/host_helper_macros.hpp b/sycl/source/builtins/host_helper_macros.hpp
index 484b0bc95fb8b..41aac2148db71 100644
--- a/sycl/source/builtins/host_helper_macros.hpp
+++ b/sycl/source/builtins/host_helper_macros.hpp
@@ -56,6 +56,9 @@
 #define EXPORT_VEC(NUM_ARGS, NAME, TYPE, VL)                                   \
   EXPORT_VEC_NS(NUM_ARGS, NAME, sycl, TYPE, VL)
 
+#define EXPORT_VEC_1_16_IMPL(NUM_ARGS, NAME, NS, TYPE)                         \
+  FOR_VEC_1_16(EXPORT_VEC_NS, NUM_ARGS, NAME, NS, TYPE)
+
 #define EXPORT_SCALAR_AND_VEC_1_16_IMPL(NUM_ARGS, NAME, NS, TYPE)              \
   EXPORT_SCALAR_NS(NUM_ARGS, NAME, NS, TYPE)                                   \
   FOR_VEC_1_16(EXPORT_VEC_NS, NUM_ARGS, NAME, NS, TYPE)
@@ -69,8 +72,12 @@
 
 #define EXPORT_SCALAR_AND_VEC_1_16_NS(NUM_ARGS, NAME, NS, ...)                 \
   FOR_EACH3(EXPORT_SCALAR_AND_VEC_1_16_IMPL, NUM_ARGS, NAME, NS, __VA_ARGS__)
+#define EXPORT_VEC_1_16_NS(NUM_ARGS, NAME, NS, ...)                            \
+  FOR_EACH3(EXPORT_VEC_1_16_IMPL, NUM_ARGS, NAME, NS, __VA_ARGS__)
 #define EXPORT_SCALAR_AND_VEC_1_16(NUM_ARGS, NAME, ...)                        \
   EXPORT_SCALAR_AND_VEC_1_16_NS(NUM_ARGS, NAME, sycl, __VA_ARGS__)
+#define EXPORT_VEC_1_16(NUM_ARGS, NAME, ...)                                   \
+  EXPORT_VEC_1_16_NS(NUM_ARGS, NAME, sycl, __VA_ARGS__)
 
 #define EXPORT_SCALAR_AND_VEC_2_4(NUM_ARGS, NAME, ...)                         \
   FOR_EACH2(EXPORT_SCALAR_AND_VEC_2_4_IMPL, NUM_ARGS, NAME, __VA_ARGS__)
diff --git a/sycl/source/builtins/integer_functions.cpp b/sycl/source/builtins/integer_functions.cpp
index 26c4dd9a5788f..cd92b2180df73 100644
--- a/sycl/source/builtins/integer_functions.cpp
+++ b/sycl/source/builtins/integer_functions.cpp
@@ -76,7 +76,8 @@ namespace sycl {
 inline namespace _V1 {
 #define BUILTIN_GENINT(NUM_ARGS, NAME, IMPL)                                   \
   HOST_IMPL(NAME, IMPL)                                                        \
-  EXPORT_SCALAR_AND_VEC_1_16(NUM_ARGS, NAME, INTEGER_TYPES)
+  FOR_EACH2(EXPORT_SCALAR, NUM_ARGS, NAME, INTEGER_TYPES)                      \
+  EXPORT_VEC_1_16(NUM_ARGS, NAME, FIXED_WIDTH_INTEGER_TYPES)
 #define BUILTIN_GENINT_SU(NUM_ARGS, NAME, IMPL)                                \
   BUILTIN_GENINT(NUM_ARGS, NAME, IMPL)
 
@@ -214,6 +215,10 @@ BUILTIN_GENINT_SU(TWO_ARGS, max,
 BUILTIN_GENINT_SU(TWO_ARGS, min,
                   [](auto x, auto y) -> decltype(x) { return y < x ? y : x; })
 
+BUILTIN_GENINT_SU(THREE_ARGS, clamp, [](auto x, auto y, auto z) -> decltype(x) {
+  return std::min(std::max(x, y), z);
+})
+
 template <typename T> static inline constexpr T __clz_impl(T x, T m, T n = 0) {
   return (x & m) ? n : __clz_impl(x, T(m >> 1), ++n);
 }
diff --git a/sycl/source/builtins/relational_functions.cpp b/sycl/source/builtins/relational_functions.cpp
index b54c55e283e5e..b8b7795f6fb79 100644
--- a/sycl/source/builtins/relational_functions.cpp
+++ b/sycl/source/builtins/relational_functions.cpp
@@ -103,6 +103,7 @@ HOST_IMPL(bitselect, [](auto x, auto y, auto z) {
   assert((ures & std::numeric_limits<utype>::max()) == ures);
   return bit_cast<T0>(static_cast<utype>(ures));
 })
-EXPORT_SCALAR_AND_VEC_1_16(THREE_ARGS, bitselect, INTEGER_TYPES, FP_TYPES)
+FOR_EACH2(EXPORT_SCALAR, THREE_ARGS, bitselect, INTEGER_TYPES, FP_TYPES)
+EXPORT_VEC_1_16(THREE_ARGS, bitselect, FIXED_WIDTH_INTEGER_TYPES, FP_TYPES)
 } // namespace _V1
 } // namespace sycl
diff --git a/sycl/source/detail/allowlist.cpp b/sycl/source/detail/allowlist.cpp
index 881a014c4831f..83309ec9f2d92 100644
--- a/sycl/source/detail/allowlist.cpp
+++ b/sycl/source/detail/allowlist.cpp
@@ -166,7 +166,8 @@ AllowListParsedT parseAllowList(const std::string &AllowListRaw) {
         // valid. E.g., for BackendName key, the allowed values are only ones
         // described in SyclBeMap
         ValidateEnumValues(BackendNameKeyName, getSyclBeMap());
-        ValidateEnumValues(DeviceTypeKeyName, getSyclDeviceTypeMap());
+        ValidateEnumValues(DeviceTypeKeyName,
+                           getSyclDeviceTypeMap(true /*Enable 'acc'*/));
 
         if (Key == DeviceVendorIdKeyName) {
           // DeviceVendorId should have hex format
@@ -380,7 +381,8 @@ void applyAllowList(std::vector<sycl::detail::pi::PiDevice> &PiDevices,
         Device, PI_DEVICE_INFO_TYPE, sizeof(sycl::detail::pi::PiDeviceType),
         &PiDevType, nullptr);
     sycl::info::device_type DeviceType = pi::cast<info::device_type>(PiDevType);
-    for (const auto &SyclDeviceType : getSyclDeviceTypeMap()) {
+    for (const auto &SyclDeviceType :
+         getSyclDeviceTypeMap(true /*Enable 'acc'*/)) {
       if (SyclDeviceType.second == DeviceType) {
         const auto &DeviceTypeValue = SyclDeviceType.first;
         DeviceDesc[DeviceTypeKeyName] = DeviceTypeValue;
diff --git a/sycl/source/detail/config.cpp b/sycl/source/detail/config.cpp
index f7760aa227168..7ae96d42e220d 100644
--- a/sycl/source/detail/config.cpp
+++ b/sycl/source/detail/config.cpp
@@ -165,14 +165,16 @@ void dumpConfig() {
 // TODO: host device type will be removed once sycl_ext_oneapi_filter_selector
 // is removed.
 const std::array<std::pair<std::string, info::device_type>, 6> &
-getSyclDeviceTypeMap() {
+getSyclDeviceTypeMap(bool supportAcc) {
   static const std::array<std::pair<std::string, info::device_type>, 6>
-      SyclDeviceTypeMap = {{{"host", info::device_type::host},
-                            {"cpu", info::device_type::cpu},
-                            {"gpu", info::device_type::gpu},
-                            {"acc", info::device_type::accelerator},
-                            {"fpga", info::device_type::accelerator},
-                            {"*", info::device_type::all}}};
+      SyclDeviceTypeMap = {
+          {{"host", info::device_type::host},
+           {"cpu", info::device_type::cpu},
+           {"gpu", info::device_type::gpu},
+           /* Duplicate entries are fine as this map is one-directional.*/
+           {supportAcc ? "acc" : "fpga", info::device_type::accelerator},
+           {"fpga", info::device_type::accelerator},
+           {"*", info::device_type::all}}};
   return SyclDeviceTypeMap;
 }
 
diff --git a/sycl/source/detail/config.hpp b/sycl/source/detail/config.hpp
index 8f048e0f95f60..1079f32caa388 100644
--- a/sycl/source/detail/config.hpp
+++ b/sycl/source/detail/config.hpp
@@ -232,8 +232,10 @@ template <> class SYCLConfig<SYCL_PARALLEL_FOR_RANGE_ROUNDING_PARAMS> {
 };
 
 // Array is used by SYCL_DEVICE_ALLOWLIST and ONEAPI_DEVICE_SELECTOR.
+// The 'supportAcc' parameter is used by SYCL_DEVICE_ALLOWLIST which,
+// unlike ONEAPI_DEVICE_SELECTOR, also accepts 'acc' as a valid device type.
 const std::array<std::pair<std::string, info::device_type>, 6> &
-getSyclDeviceTypeMap();
+getSyclDeviceTypeMap(bool supportAcc = false);
 
 // Array is used by SYCL_DEVICE_FILTER and SYCL_DEVICE_ALLOWLIST and
 // ONEAPI_DEVICE_SELECTOR
@@ -514,7 +516,7 @@ template <> class SYCLConfig<SYCL_REDUCTION_PREFERRED_WORKGROUP_SIZE> {
       return Result;
 
     std::string ValueStr{ValueRaw};
-    auto DeviceTypeMap = getSyclDeviceTypeMap();
+    auto DeviceTypeMap = getSyclDeviceTypeMap(true /*Enable 'acc'*/);
 
     // Iterate over all configurations.
     size_t Start = 0, End = 0;
diff --git a/sycl/source/detail/device_filter.cpp b/sycl/source/detail/device_filter.cpp
index 311ebeaa174b8..eb3d0f83ed26e 100644
--- a/sycl/source/detail/device_filter.cpp
+++ b/sycl/source/detail/device_filter.cpp
@@ -93,9 +93,13 @@ static void Parse_ODS_Device(ods_target &Target,
   std::string_view TopDeviceStr = DeviceSubTuple[0];
 
   // Handle explicit device type (e.g. 'gpu').
-  auto DeviceTypeMap =
-      getSyclDeviceTypeMap(); // <-- std::array<std::pair<std::string,
-                              // info::device::type>>
+  auto DeviceTypeMap = getSyclDeviceTypeMap(
+#ifndef __INTEL_PREVIEW_BREAKING_CHANGES
+      true /*Enable 'acc'*/
+#endif
+  ); // <-- std::array<std::pair<std::string,
+     // info::device::type>>
+
   auto It =
       std::find_if(std::begin(DeviceTypeMap), std::end(DeviceTypeMap),
                    [&](auto DtPair) { return TopDeviceStr == DtPair.first; });
@@ -262,7 +266,11 @@ Parse_ONEAPI_DEVICE_SELECTOR(const std::string &envString) {
 std::ostream &operator<<(std::ostream &Out, const ods_target &Target) {
   Out << Target.Backend;
   if (Target.DeviceType) {
-    auto DeviceTypeMap = getSyclDeviceTypeMap();
+    auto DeviceTypeMap = getSyclDeviceTypeMap(
+#ifndef __INTEL_PREVIEW_BREAKING_CHANGES
+        true /*Enable 'acc'*/
+#endif
+    );
     auto Match = std::find_if(
         DeviceTypeMap.begin(), DeviceTypeMap.end(),
         [&](auto Pair) { return (Pair.second == Target.DeviceType); });
@@ -335,11 +343,12 @@ device_filter::device_filter(const std::string &FilterString) {
   if (TripleValueID >= Tokens.size()) {
     DeviceType = info::device_type::all;
   } else {
-    auto Iter = std::find_if(std::begin(getSyclDeviceTypeMap()),
-                             std::end(getSyclDeviceTypeMap()), FindElement);
+    auto Iter = std::find_if(
+        std::begin(getSyclDeviceTypeMap(true /*Enable 'acc'*/)),
+        std::end(getSyclDeviceTypeMap(true /*Enable 'acc'*/)), FindElement);
     // If no match is found, set device_type 'all',
     // which actually means 'any device_type' will be a match.
-    if (Iter == getSyclDeviceTypeMap().end())
+    if (Iter == getSyclDeviceTypeMap(true /*Enable 'acc'*/).end())
       DeviceType = info::device_type::all;
     else {
       DeviceType = Iter->second;
diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp
index ba50f0562ff56..a24905d4214da 100644
--- a/sycl/source/detail/queue_impl.cpp
+++ b/sycl/source/detail/queue_impl.cpp
@@ -342,7 +342,7 @@ event queue_impl::submitMemOpHelper(const std::shared_ptr<queue_impl> &Self,
     // handler rather than by-passing the scheduler.
     if (!MGraph.lock() &&
         areEventsSafeForSchedulerBypass(ExpandedDepEvents, MContext)) {
-      if (MHasDiscardEventsSupport) {
+      if (MSupportsDiscardingPiEvents) {
         MemOpFunc(MemOpArgs..., getPIEvents(ExpandedDepEvents),
                   /*PiEvent*/ nullptr, /*EventImplPtr*/ nullptr);
         return createDiscardedEvent();
diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp
index ddd6a71d7db80..0fe4242cc9472 100644
--- a/sycl/source/detail/queue_impl.hpp
+++ b/sycl/source/detail/queue_impl.hpp
@@ -115,8 +115,8 @@ class queue_impl {
         MDiscardEvents(
             has_property<ext::oneapi::property::queue::discard_events>()),
         MIsProfilingEnabled(has_property<property::queue::enable_profiling>()),
-        MHasDiscardEventsSupport(MDiscardEvents &&
-                                 (MHostQueue ? true : MIsInorder)),
+        MSupportsDiscardingPiEvents(MDiscardEvents &&
+                                    (MHostQueue ? true : MIsInorder)),
         MQueueID{
             MNextAvailableQueueID.fetch_add(1, std::memory_order_relaxed)} {
     if (has_property<property::queue::enable_profiling>()) {
@@ -292,8 +292,8 @@ class queue_impl {
         MDiscardEvents(
             has_property<ext::oneapi::property::queue::discard_events>()),
         MIsProfilingEnabled(has_property<property::queue::enable_profiling>()),
-        MHasDiscardEventsSupport(MDiscardEvents &&
-                                 (MHostQueue ? true : MIsInorder)),
+        MSupportsDiscardingPiEvents(MDiscardEvents &&
+                                    (MHostQueue ? true : MIsInorder)),
         MQueueID{
             MNextAvailableQueueID.fetch_add(1, std::memory_order_relaxed)} {
     queue_impl_interop(PiQueue);
@@ -317,8 +317,8 @@ class queue_impl {
         MDiscardEvents(
             has_property<ext::oneapi::property::queue::discard_events>()),
         MIsProfilingEnabled(has_property<property::queue::enable_profiling>()),
-        MHasDiscardEventsSupport(MDiscardEvents &&
-                                 (MHostQueue ? true : MIsInorder)) {
+        MSupportsDiscardingPiEvents(MDiscardEvents &&
+                                    (MHostQueue ? true : MIsInorder)) {
     queue_impl_interop(PiQueue);
   }
 
@@ -374,7 +374,9 @@ class queue_impl {
   bool is_host() const { return MHostQueue; }
 
   /// \return true if this queue has discard_events support.
-  bool has_discard_events_support() const { return MHasDiscardEventsSupport; }
+  bool supportsDiscardingPiEvents() const {
+    return MSupportsDiscardingPiEvents;
+  }
 
   bool isInOrder() const { return MIsInorder; }
 
@@ -970,12 +972,11 @@ class queue_impl {
   const bool MIsProfilingEnabled;
 
 protected:
-  // This flag says if we can discard events based on a queue "setup" which will
-  // be common for all operations submitted to the queue. This is a must
-  // condition for discarding, but even if it's true, in some cases, we won't be
-  // able to discard events, because the final decision is made right before the
-  // operation itself.
-  const bool MHasDiscardEventsSupport;
+  // Indicates whether the queue supports discarding PI events for tasks
+  // submitted to it. This condition is necessary but not sufficient, PI events
+  // should be discarded only if they also don't represent potential implicit
+  // dependencies for future tasks in other queues.
+  const bool MSupportsDiscardingPiEvents;
 
   // Command graph which is associated with this queue for the purposes of
   // recording commands to it.
diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp
index 23b6eec33886e..955adae8423dc 100644
--- a/sycl/source/detail/scheduler/commands.cpp
+++ b/sycl/source/detail/scheduler/commands.cpp
@@ -2704,7 +2704,7 @@ pi_int32 ExecCGCommand::enqueueImpCommandBuffer() {
   }
 
   sycl::detail::pi::PiEvent *Event =
-      (MQueue->has_discard_events_support() &&
+      (MQueue->supportsDiscardingPiEvents() &&
        MCommandGroup->getRequirements().size() == 0)
           ? nullptr
           : &MEvent->getHandleRef();
@@ -2851,11 +2851,11 @@ pi_int32 ExecCGCommand::enqueueImpQueue() {
   auto RawEvents = getPiEvents(EventImpls);
   flushCrossQueueDeps(EventImpls, getWorkerQueue());
 
-  bool DiscardEvent = (MQueue->has_discard_events_support() &&
-                       MCommandGroup->getRequirements().size() == 0);
+  bool DiscardPiEvent = (MQueue->supportsDiscardingPiEvents() &&
+                         MCommandGroup->getRequirements().size() == 0);
   sycl::detail::pi::PiEvent *Event =
-      DiscardEvent ? nullptr : &MEvent->getHandleRef();
-  detail::EventImplPtr EventImpl = DiscardEvent ? nullptr : MEvent;
+      DiscardPiEvent ? nullptr : &MEvent->getHandleRef();
+  detail::EventImplPtr EventImpl = DiscardPiEvent ? nullptr : MEvent;
 
   switch (MCommandGroup->getType()) {
 
diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp
index 02ffef951d1b5..70681a7504358 100644
--- a/sycl/source/handler.cpp
+++ b/sycl/source/handler.cpp
@@ -292,7 +292,7 @@ event handler::finalize() {
       };
 
       bool DiscardEvent = false;
-      if (MQueue->has_discard_events_support()) {
+      if (MQueue->supportsDiscardingPiEvents()) {
         // Kernel only uses assert if it's non interop one
         bool KernelUsesAssert =
             !(MKernel && MKernel->isInterop()) &&
diff --git a/sycl/test-e2e/Annotated_arg_ptr/annotated_arg.cpp b/sycl/test-e2e/Annotated_arg_ptr/annotated_arg.cpp
index 8fad623752873..ea572bfc4c5d7 100644
--- a/sycl/test-e2e/Annotated_arg_ptr/annotated_arg.cpp
+++ b/sycl/test-e2e/Annotated_arg_ptr/annotated_arg.cpp
@@ -1,6 +1,6 @@
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
-//
+// REQUIRES: aspect-usm_shared_allocations
 
 #include "common.hpp"
 
diff --git a/sycl/test-e2e/Annotated_arg_ptr/annotated_ptr.cpp b/sycl/test-e2e/Annotated_arg_ptr/annotated_ptr.cpp
index 7e4a35d09b994..c478fd96afb34 100644
--- a/sycl/test-e2e/Annotated_arg_ptr/annotated_ptr.cpp
+++ b/sycl/test-e2e/Annotated_arg_ptr/annotated_ptr.cpp
@@ -1,6 +1,6 @@
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
-//
+// REQUIRES: aspect-usm_shared_allocations
 
 #include "common.hpp"
 
diff --git a/sycl/test-e2e/Annotated_usm/annotated_usm_kind.cpp b/sycl/test-e2e/Annotated_usm/annotated_usm_kind.cpp
index 812a0bb115737..8be1db398a1a1 100644
--- a/sycl/test-e2e/Annotated_usm/annotated_usm_kind.cpp
+++ b/sycl/test-e2e/Annotated_usm/annotated_usm_kind.cpp
@@ -124,26 +124,27 @@ template <typename T> void testUsmKind(sycl::queue &q) {
           [&]() { return TAnnotated(dev, Ctx, properties{usm_kind_host}); },
           [&]() { return ATHost(1, q); }, [&]() { return ATHost(1, Ctx); },
           [&]() { return ATAnnotated(1, dev, Ctx, alloc::host); }});
-
-  CheckUsmKindAll(
-      alloc::shared,
-      std::tuple{
-          [&]() { return MShared(q); }, [&]() { return MShared(dev, Ctx); },
-          [&]() { return MAnnotated(dev, Ctx, alloc::shared); },
-          [&]() { return MAnnotated(dev, Ctx, properties{usm_kind_shared}); },
-          [&]() { return AShared(1, q); },
-          [&]() { return AShared(1, dev, Ctx); },
-          [&]() { return AAnnotated(1, dev, Ctx, alloc::shared); },
-          [&]() { return TShared(q); }, [&]() { return TShared(dev, Ctx); },
-          [&]() { return TAnnotated(dev, Ctx, alloc::shared); },
-          [&]() { return TAnnotated(dev, Ctx, properties{usm_kind_shared}); },
-          [&]() { return ATShared(1, q); },
-          [&]() { return ATShared(1, dev, Ctx); },
-          [&]() { return ATAnnotated(1, dev, Ctx, alloc::shared); }});
+  if (dev.has(sycl::aspect::usm_shared_allocations)) {
+    CheckUsmKindAll(
+        alloc::shared,
+        std::tuple{
+            [&]() { return MShared(q); }, [&]() { return MShared(dev, Ctx); },
+            [&]() { return MAnnotated(dev, Ctx, alloc::shared); },
+            [&]() { return MAnnotated(dev, Ctx, properties{usm_kind_shared}); },
+            [&]() { return AShared(1, q); },
+            [&]() { return AShared(1, dev, Ctx); },
+            [&]() { return AAnnotated(1, dev, Ctx, alloc::shared); },
+            [&]() { return TShared(q); }, [&]() { return TShared(dev, Ctx); },
+            [&]() { return TAnnotated(dev, Ctx, alloc::shared); },
+            [&]() { return TAnnotated(dev, Ctx, properties{usm_kind_shared}); },
+            [&]() { return ATShared(1, q); },
+            [&]() { return ATShared(1, dev, Ctx); },
+            [&]() { return ATAnnotated(1, dev, Ctx, alloc::shared); }});
+  }
 }
 
 int main() {
   sycl::queue q;
   testUsmKind<int>(q);
   return 0;
-}
\ No newline at end of file
+}
diff --git a/sycl/test-e2e/Basic/group_local_memory.cpp b/sycl/test-e2e/Basic/group_local_memory.cpp
index 8185378071c75..b5bb26917ec0d 100644
--- a/sycl/test-e2e/Basic/group_local_memory.cpp
+++ b/sycl/test-e2e/Basic/group_local_memory.cpp
@@ -7,17 +7,21 @@ constexpr int N = 5;
 
 int main() {
   sycl::queue q;
-  int *ptr = sycl::malloc_shared<int>(N, q);
-  q.parallel_for(sycl::nd_range<1>{1, 1}, [=](sycl::nd_item<1> it) {
-     auto g = it.get_group();
-     auto mem = sycl::ext::oneapi::group_local_memory<int[N]>(g, 1, 2, 3, 4, 5);
-     auto ref = *mem;
-     for (int i = 0; i < N; ++i) {
-       ptr[i] = ref[i];
-     }
-   }).wait();
+  sycl::buffer<int> buf{sycl::range{N}};
+  q.submit([&](sycl::handler &h) {
+    sycl::accessor acc{buf, h};
+    h.parallel_for(sycl::nd_range<1>{1, 1}, [=](sycl::nd_item<1> it) {
+      auto g = it.get_group();
+      auto mem =
+          sycl::ext::oneapi::group_local_memory<int[N]>(g, 1, 2, 3, 4, 5);
+      auto ref = *mem;
+      for (int i = 0; i < N; ++i) {
+        acc[i] = ref[i];
+      }
+    });
+  });
+  sycl::host_accessor result{buf};
   for (int i = 0; i < N; ++i) {
-    assert(ptr[i] == (i + 1));
+    assert(result[i] == (i + 1));
   }
-  sycl::free(ptr, q);
 }
diff --git a/sycl/test-e2e/Basic/large-range.cpp b/sycl/test-e2e/Basic/large-range.cpp
index 354276065a739..f2c38cbb5dd8d 100644
--- a/sycl/test-e2e/Basic/large-range.cpp
+++ b/sycl/test-e2e/Basic/large-range.cpp
@@ -26,15 +26,18 @@ void check_sum(std::string_view desc, const ContainerT &data, size_t N) {
 template <typename RangeT>
 void test_regular(std::string_view desc, queue &q, size_t B, RangeT range) {
   auto N = range.size();
-  std::vector accumulators_v(B, 0, usm_allocator<int, usm::alloc::shared>(q));
-  auto *accumulators = accumulators_v.data();
-
-  q.parallel_for(range, [=](auto it) {
-     atomic_ref<int, memory_order::relaxed, memory_scope::device> ref(
-         accumulators[it.get_linear_id() % B]);
-     ++ref;
-   }).wait();
-
+  std::vector accumulators_v(B, 0);
+  {
+    sycl::buffer accumulator_buf{accumulators_v};
+    q.submit([&](sycl::handler &h) {
+      sycl::accessor accumulators{accumulator_buf, h};
+      h.parallel_for(range, [=](auto it) {
+        atomic_ref<int, memory_order::relaxed, memory_scope::device> ref(
+            accumulators[it.get_linear_id() % B]);
+        ++ref;
+      });
+    });
+  } // destruction of accumulator_buf here writes back data to accumulators_v
   check_sum(desc, accumulators_v, N);
 }
 
@@ -42,18 +45,19 @@ template <typename RangeT>
 void test_spec_constant(std::string_view desc, queue &q, size_t B,
                         RangeT range) {
   auto N = range.size();
-  std::vector accumulators_v(B, 0, usm_allocator<int, usm::alloc::shared>(q));
-  auto *accumulators = accumulators_v.data();
-
-  q.submit([&](handler &cgh) {
-     cgh.set_specialization_constant<C>(2);
-     cgh.parallel_for(range, [=](auto it, kernel_handler h) {
-       atomic_ref<int, memory_order::relaxed, memory_scope::device> ref(
-           accumulators[it.get_linear_id() % B]);
-       ref += h.get_specialization_constant<C>();
-     });
-   }).wait();
-
+  std::vector accumulators_v(B, 0);
+  {
+    sycl::buffer accumulators_buf{accumulators_v};
+    q.submit([&](handler &cgh) {
+      sycl::accessor accumulators{accumulators_buf, cgh};
+      cgh.set_specialization_constant<C>(2);
+      cgh.parallel_for(range, [=](auto it, kernel_handler h) {
+        atomic_ref<int, memory_order::relaxed, memory_scope::device> ref(
+            accumulators[it.get_linear_id() % B]);
+        ref += h.get_specialization_constant<C>();
+      });
+    });
+  } // destruction of accumulators_buf here writes data back to accumulators_v
   check_sum(desc, accumulators_v, N * 2);
 }
 
diff --git a/sycl/test-e2e/Basic/span.cpp b/sycl/test-e2e/Basic/span.cpp
index 43d16ff4d16d6..ec341d39d0eac 100644
--- a/sycl/test-e2e/Basic/span.cpp
+++ b/sycl/test-e2e/Basic/span.cpp
@@ -3,7 +3,7 @@
 //
 // Fails to release USM pointer on HIP for NVIDIA
 // XFAIL: hip_nvidia
-
+// REQUIRES: aspect-usm_shared_allocations
 #include <numeric>
 #include <sycl/sycl.hpp>
 
diff --git a/sycl/test-e2e/Basic/vector/vec_binary_scalar_order.cpp b/sycl/test-e2e/Basic/vector/vec_binary_scalar_order.cpp
new file mode 100644
index 0000000000000..158b52ab5f27e
--- /dev/null
+++ b/sycl/test-e2e/Basic/vector/vec_binary_scalar_order.cpp
@@ -0,0 +1,150 @@
+// REQUIRES: preview-breaking-changes-supported
+// RUN: %{build} -fpreview-breaking-changes -o %t.out
+// RUN: %{run} %t.out
+
+// This test currently fails on AMD HIP due to an unresolved memcmp function.
+// XFAIL: hip_amd
+
+// Checks scalar/vec operator ordering.
+
+#include <sycl.hpp>
+
+template <typename T>
+using rel_t = std::conditional_t<
+    sizeof(T) == 1, int8_t,
+    std::conditional_t<
+        sizeof(T) == 2, int16_t,
+        std::conditional_t<sizeof(T) == 4, int32_t,
+                           std::conditional_t<sizeof(T) == 8, int64_t, void>>>>;
+
+template <bool IsRelOp, typename T1, int N, typename T2>
+bool CheckResult(sycl::vec<T1, N> V, T2 Ref) {
+  if constexpr (IsRelOp) {
+    // Check that all elements have the same boolean representation as the
+    // scalar.
+    for (size_t I = 0; I < N; ++I)
+      if (static_cast<bool>(V[I]) != static_cast<bool>(Ref))
+        return false;
+    return true;
+  } else {
+    // Check that all elements are equal to the scalar.
+    for (size_t I = 0; I < N; ++I)
+      if (V[I] != Ref)
+        return false;
+    return true;
+  }
+}
+
+#define CHECK(Q, C, T, N, IS_RELOP, OP)                                        \
+  {                                                                            \
+    using VecT = sycl::vec<T, N>;                                              \
+    using ResT = sycl::vec<std::conditional_t<IS_RELOP, rel_t<T>, T>, N>;      \
+    constexpr T RefVal = 2;                                                    \
+    VecT InVec{static_cast<T>(RefVal)};                                        \
+    {                                                                          \
+      VecT OutVecsDevice[2];                                                   \
+      T OutRefsDevice[2];                                                      \
+      {                                                                        \
+        sycl::buffer<VecT, 1> OutVecsBuff{OutVecsDevice, 2};                   \
+        sycl::buffer<T, 1> OutRefsBuff{OutRefsDevice, 2};                      \
+        Q.submit([&](sycl::handler &CGH) {                                     \
+          sycl::accessor OutVecsAcc{OutVecsBuff, CGH, sycl::read_write};       \
+          sycl::accessor OutRefsAcc{OutRefsBuff, CGH, sycl::read_write};       \
+          CGH.single_task([=]() {                                              \
+            auto OutVec1 = InVec OP RefVal;                                    \
+            auto OutVec2 = RefVal OP InVec;                                    \
+            static_assert(std::is_same_v<decltype(OutVec1), ResT>);            \
+            static_assert(std::is_same_v<decltype(OutVec2), ResT>);            \
+            OutVecsAcc[0] = OutVec1;                                           \
+            OutVecsAcc[1] = OutVec2;                                           \
+            OutRefsAcc[0] = RefVal OP RefVal;                                  \
+            OutRefsAcc[1] = RefVal OP RefVal;                                  \
+          });                                                                  \
+        });                                                                    \
+      }                                                                        \
+      if (!CheckResult<IS_RELOP>(OutVecsDevice[0], OutRefsDevice[0])) {        \
+        std::cout << ("Check of vector " #OP                                   \
+                      " scalar from device failed for " #T " and " #N)         \
+                  << std::endl;                                                \
+        ++C;                                                                   \
+      }                                                                        \
+      if (!CheckResult<IS_RELOP>(OutVecsDevice[1], OutRefsDevice[1])) {        \
+        std::cout << ("Check of scalar " #OP                                   \
+                      " vector from device failed for " #T " and " #N)         \
+                  << std::endl;                                                \
+        ++C;                                                                   \
+      }                                                                        \
+    }                                                                          \
+    {                                                                          \
+      auto OutVec1 = InVec OP RefVal;                                          \
+      auto OutVec2 = RefVal OP InVec;                                          \
+      static_assert(std::is_same_v<decltype(OutVec1), ResT>);                  \
+      static_assert(std::is_same_v<decltype(OutVec2), ResT>);                  \
+      if (!CheckResult<IS_RELOP>(OutVec1, RefVal OP RefVal)) {                 \
+        std::cout << ("Check of vector " #OP                                   \
+                      " scalar from host failed for " #T " and " #N)           \
+                  << std::endl;                                                \
+        ++C;                                                                   \
+      }                                                                        \
+      if (!CheckResult<IS_RELOP>(OutVec2, RefVal OP RefVal)) {                 \
+        std::cout << ("Check of scalar " #OP                                   \
+                      " vector from host failed for " #T " and " #N)           \
+                  << std::endl;                                                \
+        ++C;                                                                   \
+      }                                                                        \
+    }                                                                          \
+  }
+
+#define CHECK_SIZES(Q, C, T, IS_RELOP, OP)                                     \
+  CHECK(Q, C, T, 1, IS_RELOP, OP)                                              \
+  CHECK(Q, C, T, 2, IS_RELOP, OP)                                              \
+  CHECK(Q, C, T, 4, IS_RELOP, OP)                                              \
+  CHECK(Q, C, T, 8, IS_RELOP, OP)                                              \
+  CHECK(Q, C, T, 16, IS_RELOP, OP)
+
+// NOTE: For the sake of compile-time we pick only a few operators per category.
+#define CHECK_SIZES_AND_COMMON_OPS(Q, C, T)                                    \
+  CHECK_SIZES(Q, Failures, T, false, *)                                        \
+  CHECK_SIZES(Q, Failures, T, true, &&)                                        \
+  CHECK_SIZES(Q, Failures, T, true, ==)                                        \
+  CHECK_SIZES(Q, Failures, T, true, <)                                         \
+  CHECK_SIZES(Q, Failures, T, true, >=)
+#define CHECK_SIZES_AND_INT_ONLY_OPS(Q, C, T)                                  \
+  CHECK_SIZES(Q, Failures, T, false, %)                                        \
+  CHECK_SIZES(Q, Failures, T, false, >>)                                       \
+  CHECK_SIZES(Q, Failures, T, false, ^)
+
+int main() {
+  sycl::queue Q;
+  int Failures = 0;
+
+  // Check operators on types with requirements if they are supported.
+  if (Q.get_device().has(sycl::aspect::fp16)) {
+    CHECK_SIZES_AND_COMMON_OPS(Q, Failures, sycl::half);
+  }
+  if (Q.get_device().has(sycl::aspect::fp64)) {
+    CHECK_SIZES_AND_COMMON_OPS(Q, Failures, double);
+  }
+
+  // Check all operators without requirements.
+  CHECK_SIZES_AND_COMMON_OPS(Q, Failures, float);
+  CHECK_SIZES_AND_COMMON_OPS(Q, Failures, int8_t);
+  CHECK_SIZES_AND_COMMON_OPS(Q, Failures, int16_t);
+  CHECK_SIZES_AND_COMMON_OPS(Q, Failures, int32_t);
+  CHECK_SIZES_AND_COMMON_OPS(Q, Failures, int64_t);
+  CHECK_SIZES_AND_COMMON_OPS(Q, Failures, uint8_t);
+  CHECK_SIZES_AND_COMMON_OPS(Q, Failures, uint16_t);
+  CHECK_SIZES_AND_COMMON_OPS(Q, Failures, uint32_t);
+  CHECK_SIZES_AND_COMMON_OPS(Q, Failures, uint64_t);
+
+  // Check integer only operators.
+  CHECK_SIZES_AND_INT_ONLY_OPS(Q, Failures, int8_t);
+  CHECK_SIZES_AND_INT_ONLY_OPS(Q, Failures, int16_t);
+  CHECK_SIZES_AND_INT_ONLY_OPS(Q, Failures, int32_t);
+  CHECK_SIZES_AND_INT_ONLY_OPS(Q, Failures, int64_t);
+  CHECK_SIZES_AND_INT_ONLY_OPS(Q, Failures, uint8_t);
+  CHECK_SIZES_AND_INT_ONLY_OPS(Q, Failures, uint16_t);
+  CHECK_SIZES_AND_INT_ONLY_OPS(Q, Failures, uint32_t);
+  CHECK_SIZES_AND_INT_ONLY_OPS(Q, Failures, uint64_t);
+  return Failures;
+}
diff --git a/sycl/test-e2e/Basic/wrapped_usm_pointers.cpp b/sycl/test-e2e/Basic/wrapped_usm_pointers.cpp
index 7b81493db6dce..d2fabb6f7e967 100644
--- a/sycl/test-e2e/Basic/wrapped_usm_pointers.cpp
+++ b/sycl/test-e2e/Basic/wrapped_usm_pointers.cpp
@@ -1,6 +1,6 @@
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
-
+// REQUIRES: aspect-usm_shared_allocations
 //==---------- wrapped_usm_pointer.cpp - test pointers in struct ---------==//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
diff --git a/sycl/test-e2e/Complex/sycl_complex_math_test.cpp b/sycl/test-e2e/Complex/sycl_complex_math_test.cpp
index 9bb5122ffc7f7..1178bdf0b5ffe 100644
--- a/sycl/test-e2e/Complex/sycl_complex_math_test.cpp
+++ b/sycl/test-e2e/Complex/sycl_complex_math_test.cpp
@@ -13,31 +13,26 @@
     bool operator()(sycl::queue &Q, cmplx<T> init,                             \
                     cmplx<T> ref = cmplx<T>(0, 0), bool use_ref = false) {     \
       bool pass = true;                                                        \
-                                                                               \
       auto std_in = init_std_complex(init.re, init.im);                        \
       experimental::complex<T> cplx_input{init.re, init.im};                   \
-                                                                               \
-      auto *cplx_out = sycl::malloc_shared<experimental::complex<T>>(1, Q);    \
-                                                                               \
+      sycl::buffer<experimental::complex<T>> cplx_out_buf{sycl::range{1}};     \
       /*Get std::complex output*/                                              \
       std::complex<T> std_out{ref.re, ref.im};                                 \
       if (!use_ref)                                                            \
         std_out = std::math_func(std_in);                                      \
-                                                                               \
       /*Check cplx::complex output from device*/                               \
-      Q.single_task([=]() {                                                    \
-         cplx_out[0] = experimental::math_func<T>(cplx_input);                 \
-       }).wait();                                                              \
-                                                                               \
-      pass &= check_results(cplx_out[0], std_out, /*is_device*/ true);         \
+      Q.submit([&](sycl::handler &h) {                                         \
+        sycl::accessor cplx_out{cplx_out_buf, h};                              \
+        h.single_task(                                                         \
+            [=]() { cplx_out[0] = experimental::math_func<T>(cplx_input); });  \
+      });                                                                      \
+      sycl::host_accessor cplx_out_acc{cplx_out_buf};                          \
+      pass &= check_results(cplx_out_acc[0], std_out, /*is_device*/ true);     \
                                                                                \
       /*Check cplx::complex output from host*/                                 \
-      cplx_out[0] = experimental::math_func<T>(cplx_input);                    \
-                                                                               \
-      pass &= check_results(cplx_out[0], std_out, /*is_device*/ false);        \
-                                                                               \
-      sycl::free(cplx_out, Q);                                                 \
+      cplx_out_acc[0] = experimental::math_func<T>(cplx_input);                \
                                                                                \
+      pass &= check_results(cplx_out_acc[0], std_out, /*is_device*/ false);    \
       return pass;                                                             \
     }                                                                          \
   };
@@ -73,8 +68,7 @@ TEST_MATH_OP_TYPE(tanh)
                                                                                \
       auto std_in = init_std_complex(init.re, init.im);                        \
       experimental::complex<T> cplx_input{init.re, init.im};                   \
-                                                                               \
-      auto *cplx_out = sycl::malloc_shared<T>(1, Q);                           \
+      sycl::buffer<T> cplx_out_buf{sycl::range{1}};                            \
                                                                                \
       /*Get std::complex output*/                                              \
       T std_out = ref.re;                                                      \
@@ -82,19 +76,18 @@ TEST_MATH_OP_TYPE(tanh)
         std_out = std::math_func(std_in);                                      \
                                                                                \
       /*Check cplx::complex output from device*/                               \
-      Q.single_task([=]() {                                                    \
-         cplx_out[0] = experimental::math_func<T>(cplx_input);                 \
-       }).wait();                                                              \
-                                                                               \
-      pass &= check_results(cplx_out[0], std_out, /*is_device*/ true);         \
+      Q.submit([&](sycl::handler &h) {                                         \
+        sycl::accessor cplx_out{cplx_out_buf, h};                              \
+        h.single_task(                                                         \
+            [=]() { cplx_out[0] = experimental::math_func<T>(cplx_input); });  \
+      });                                                                      \
+      sycl::host_accessor cplx_out_acc{cplx_out_buf};                          \
+      pass &= check_results(cplx_out_acc[0], std_out, /*is_device*/ true);     \
                                                                                \
       /*Check cplx::complex output from host*/                                 \
-      cplx_out[0] = experimental::math_func<T>(cplx_input);                    \
-                                                                               \
-      pass &= check_results(cplx_out[0], std_out, /*is_device*/ false);        \
-                                                                               \
-      sycl::free(cplx_out, Q);                                                 \
+      cplx_out_acc[0] = experimental::math_func<T>(cplx_input);                \
                                                                                \
+      pass &= check_results(cplx_out_acc[0], std_out, /*is_device*/ false);    \
       return pass;                                                             \
     }                                                                          \
   };
@@ -121,23 +114,21 @@ TEST_MATH_OP_TYPE(imag)
       std::complex<T> std_out = ref;                                           \
       if (!use_ref)                                                            \
         std_out = std::math_func(std_in);                                      \
-                                                                               \
-      auto *cplx_out = sycl::malloc_shared<experimental::complex<T>>(1, Q);    \
-                                                                               \
+      sycl::buffer<experimental::complex<T>> cplx_out_buf{sycl::range{1}};     \
       /*Check cplx::complex output from device*/                               \
-      Q.single_task([=]() {                                                    \
-         cplx_out[0] = experimental::math_func<X>(std_in);                     \
-       }).wait();                                                              \
+      Q.submit([&](sycl::handler &h) {                                         \
+        sycl::accessor cplx_out{cplx_out_buf, h};                              \
+        h.single_task(                                                         \
+            [=]() { cplx_out[0] = experimental::math_func<X>(std_in); });      \
+      });                                                                      \
+      sycl::host_accessor cplx_out_acc{cplx_out_buf};                          \
                                                                                \
-      pass &= check_results(cplx_out[0], std_out, /*is_device*/ true);         \
+      pass &= check_results(cplx_out_acc[0], std_out, /*is_device*/ true);     \
                                                                                \
       /*Check cplx::complex output from host*/                                 \
-      cplx_out[0] = experimental::math_func<X>(std_in);                        \
-                                                                               \
-      pass &= check_results(cplx_out[0], std_out, /*is_device*/ false);        \
-                                                                               \
-      sycl::free(cplx_out, Q);                                                 \
+      cplx_out_acc[0] = experimental::math_func<X>(std_in);                    \
                                                                                \
+      pass &= check_results(cplx_out_acc[0], std_out, /*is_device*/ false);    \
       return pass;                                                             \
     }                                                                          \
   };
@@ -161,23 +152,21 @@ TEST_MATH_OP_TYPE(proj)
       T std_out = ref;                                                         \
       if (!use_ref)                                                            \
         std_out = std::math_func(std_in);                                      \
-                                                                               \
-      auto *cplx_out = sycl::malloc_shared<T>(1, Q);                           \
-                                                                               \
+      sycl::buffer<T> cplx_out_buf{sycl::range{1}};                            \
       /*Check cplx::complex output from device*/                               \
-      Q.single_task([=]() {                                                    \
-         cplx_out[0] = experimental::math_func<X>(init);                       \
-       }).wait();                                                              \
+      Q.submit([&](sycl::handler &h) {                                         \
+        sycl::accessor cplx_out{cplx_out_buf, h};                              \
+        h.single_task(                                                         \
+            [=]() { cplx_out[0] = experimental::math_func<X>(std_in); });      \
+      });                                                                      \
+      sycl::host_accessor cplx_out_acc{cplx_out_buf};                          \
                                                                                \
-      pass &= check_results(cplx_out[0], std_out, /*is_device*/ true);         \
+      pass &= check_results(cplx_out_acc[0], std_out, /*is_device*/ true);     \
                                                                                \
       /*Check cplx::complex output from host*/                                 \
-      cplx_out[0] = experimental::math_func<X>(init);                          \
-                                                                               \
-      pass &= check_results(cplx_out[0], std_out, /*is_device*/ false);        \
-                                                                               \
-      sycl::free(cplx_out, Q);                                                 \
+      cplx_out_acc[0] = experimental::math_func<X>(init);                      \
                                                                                \
+      pass &= check_results(cplx_out_acc[0], std_out, /*is_device*/ false);    \
       return pass;                                                             \
     }                                                                          \
   };
@@ -197,26 +186,25 @@ template <typename T> struct test_polar {
                   bool use_ref = false) {
     bool pass = true;
 
-    auto *cplx_out = sycl::malloc_shared<experimental::complex<T>>(1, Q);
-
+    sycl::buffer<experimental::complex<T>> cplx_out_buf{sycl::range(1)};
     /*Get std::complex output*/
     std::complex<T> std_out{ref.re, ref.im};
     if (!use_ref)
       std_out = std::polar(init.re, init.im);
 
     /*Check cplx::complex output from device*/
-    Q.single_task([=]() {
-       cplx_out[0] = experimental::polar<T>(init.re, init.im);
-     }).wait();
-
-    pass &= check_results(cplx_out[0], std_out, /*is_device*/ true);
+    Q.submit([&](sycl::handler &h) {
+      sycl::accessor cplx_out{cplx_out_buf, h};
+      h.single_task(
+          [=]() { cplx_out[0] = experimental::polar<T>(init.re, init.im); });
+    });
+    sycl::host_accessor cplx_out_acc{cplx_out_buf};
+    pass &= check_results(cplx_out_acc[0], std_out, /*is_device*/ true);
 
     /*Check cplx::complex output from host*/
-    cplx_out[0] = experimental::polar<T>(init.re, init.im);
-
-    pass &= check_results(cplx_out[0], std_out, /*is_device*/ false);
+    cplx_out_acc[0] = experimental::polar<T>(init.re, init.im);
 
-    sycl::free(cplx_out, Q);
+    pass &= check_results(cplx_out_acc[0], std_out, /*is_device*/ false);
 
     return pass;
   }
diff --git a/sycl/test-e2e/Complex/sycl_complex_operator_test.cpp b/sycl/test-e2e/Complex/sycl_complex_operator_test.cpp
index ed68332f60bdd..8894f5d965462 100644
--- a/sycl/test-e2e/Complex/sycl_complex_operator_test.cpp
+++ b/sycl/test-e2e/Complex/sycl_complex_operator_test.cpp
@@ -15,23 +15,19 @@
       experimental::complex<T> cplx_input1{init_re1, init_im1};                \
       experimental::complex<T> cplx_input2{init_re2, init_im2};                \
                                                                                \
-      auto *cplx_out = sycl::malloc_shared<experimental::complex<T>>(1, Q);    \
-                                                                               \
+      sycl::buffer<experimental::complex<T>> cplx_out_buf{sycl::range{1}};     \
       std::complex<T> std_out;                                                 \
       std_out = std_in1 op std_in2;                                            \
+      Q.submit([&](sycl::handler &h) {                                         \
+        sycl::accessor cplx_out{cplx_out_buf, h};                              \
+        h.single_task([=]() { cplx_out[0] = cplx_input1 op cplx_input2; });    \
+      });                                                                      \
+      sycl::host_accessor cplx_out_acc{cplx_out_buf};                          \
+      pass &= check_results(cplx_out_acc[0], std_out, /*is_device*/ true);     \
                                                                                \
-      Q.single_task([=]() {                                                    \
-         cplx_out[0] = cplx_input1 op cplx_input2;                             \
-       }).wait();                                                              \
-                                                                               \
-      pass &= check_results(cplx_out[0], std_out, /*is_device*/ true);         \
-                                                                               \
-      cplx_out[0] = cplx_input1 op cplx_input2;                                \
-                                                                               \
-      pass &= check_results(cplx_out[0], std_out, /*is_device*/ false);        \
-                                                                               \
-      sycl::free(cplx_out, Q);                                                 \
+      cplx_out_acc[0] = cplx_input1 op cplx_input2;                            \
                                                                                \
+      pass &= check_results(cplx_out_acc[0], std_out, /*is_device*/ false);    \
       return pass;                                                             \
     }                                                                          \
   };
@@ -53,28 +49,32 @@ test_op(test_div, /);
       experimental::complex<T> cplx_input{init_re1, init_im1};                 \
                                                                                \
       auto std_inout = init_std_complex(init_re2, init_im2);                   \
-      auto *cplx_inout = sycl::malloc_shared<experimental::complex<T>>(1, Q);  \
-      cplx_inout[0].real(init_re2);                                            \
-      cplx_inout[0].imag(init_im2);                                            \
-                                                                               \
+      experimental::complex<T> cplx;                                           \
+      cplx.real(init_re2);                                                     \
+      cplx.imag(init_im2);                                                     \
+      sycl::buffer<experimental::complex<T>> cplx_out_buf{&cplx,               \
+                                                          sycl::range{1}};     \
       std_inout op_assign std_in;                                              \
+      Q.submit([&](sycl::handler &h) {                                         \
+        sycl::accessor cplx_out{cplx_out_buf, h};                              \
+        h.single_task([=]() { cplx_out[0] op_assign cplx_input; });            \
+      });                                                                      \
+      sycl::host_accessor cplx_out_acc{cplx_out_buf};                          \
                                                                                \
-      Q.single_task([=]() { cplx_inout[0] op_assign cplx_input; }).wait();     \
+      pass &=                                                                  \
+          check_results(cplx_out_acc[0],                                       \
+                        std::complex<T>(std_inout.real(), std_inout.imag()),   \
+                        /*is_device*/ true);                                   \
                                                                                \
-      pass &= check_results(                                                   \
-          cplx_inout[0], std::complex<T>(std_inout.real(), std_inout.imag()),  \
-          /*is_device*/ true);                                                 \
+      cplx_out_acc[0].real(init_re2);                                          \
+      cplx_out_acc[0].imag(init_im2);                                          \
                                                                                \
-      cplx_inout[0].real(init_re2);                                            \
-      cplx_inout[0].imag(init_im2);                                            \
+      cplx_out_acc[0] op_assign cplx_input;                                    \
                                                                                \
-      cplx_inout[0] op_assign cplx_input;                                      \
-                                                                               \
-      pass &= check_results(                                                   \
-          cplx_inout[0], std::complex<T>(std_inout.real(), std_inout.imag()),  \
-          /*is_device*/ false);                                                \
-                                                                               \
-      sycl::free(cplx_inout, Q);                                               \
+      pass &=                                                                  \
+          check_results(cplx_out_acc[0],                                       \
+                        std::complex<T>(std_inout.real(), std_inout.imag()),   \
+                        /*is_device*/ false);                                  \
                                                                                \
       return pass;                                                             \
     }                                                                          \
@@ -99,19 +99,18 @@ test_op_assign(test_div_assign, /=);
       experimental::complex<T> cplx_input{init_re1, init_im1};                 \
                                                                                \
       std::complex<T> std_out{};                                               \
-      auto *cplx_out = sycl::malloc_shared<experimental::complex<T>>(1, Q);    \
-                                                                               \
+      sycl::buffer<experimental::complex<T>> cplx_out_buf{sycl::range{1}};     \
       std_out = op std_in;                                                     \
+      Q.submit([&](sycl::handler &h) {                                         \
+        sycl::accessor cplx_out{cplx_out_buf, h};                              \
+        h.single_task([=]() { cplx_out[0] = op cplx_input; });                 \
+      });                                                                      \
+      sycl::host_accessor cplx_out_acc{cplx_out_buf};                          \
+      pass &= check_results(cplx_out_acc[0], std_out, /*is_device*/ true);     \
                                                                                \
-      Q.single_task([=]() { cplx_out[0] = op cplx_input; }).wait();            \
-                                                                               \
-      pass &= check_results(cplx_out[0], std_out, /*is_device*/ true);         \
-                                                                               \
-      cplx_out[0] = op cplx_input;                                             \
-                                                                               \
-      pass &= check_results(cplx_out[0], std_out, /*is_device*/ false);        \
+      cplx_out_acc[0] = op cplx_input;                                         \
                                                                                \
-      sycl::free(cplx_out, Q);                                                 \
+      pass &= check_results(cplx_out_acc[0], std_out, /*is_device*/ false);    \
                                                                                \
       return pass;                                                             \
     }                                                                          \
diff --git a/sycl/test-e2e/Complex/sycl_complex_pow_test.cpp b/sycl/test-e2e/Complex/sycl_complex_pow_test.cpp
index c8555687eb49b..91d58a4cdeebb 100644
--- a/sycl/test-e2e/Complex/sycl_complex_pow_test.cpp
+++ b/sycl/test-e2e/Complex/sycl_complex_pow_test.cpp
@@ -51,26 +51,26 @@ template <typename T> struct test_pow_cplx_cplx {
     experimental::complex<T> cplx_input1{init1.re, init1.im};
     experimental::complex<T> cplx_input2{init2.re, init2.im};
 
-    auto *cplx_out = sycl::malloc_shared<experimental::complex<T>>(1, Q);
-
+    sycl::buffer<experimental::complex<T>> cplx_out_buf{sycl::range{1}};
     // Get std::complex output
     std::complex<T> std_out{ref.re, ref.im};
     if (!use_ref)
       std_out = std::pow(std_in1, std_in2);
 
     // Check cplx::complex output from device
-    Q.single_task([=]() {
-       cplx_out[0] = experimental::pow<T>(cplx_input1, cplx_input2);
-     }).wait();
-
-    pass &= check_results(cplx_out[0], std_out, /*is_device*/ true);
+    Q.submit([&](sycl::handler &h) {
+      sycl::accessor cplx_out{cplx_out_buf, h};
+      h.single_task([=]() {
+        cplx_out[0] = experimental::pow<T>(cplx_input1, cplx_input2);
+      });
+    });
+    sycl::host_accessor cplx_out_acc{cplx_out_buf};
+    pass &= check_results(cplx_out_acc[0], std_out, /*is_device*/ true);
 
     // Check cplx::complex output from host
-    cplx_out[0] = experimental::pow<T>(cplx_input1, cplx_input2);
-
-    pass &= check_results(cplx_out[0], std_out, /*is_device*/ false);
+    cplx_out_acc[0] = experimental::pow<T>(cplx_input1, cplx_input2);
 
-    sycl::free(cplx_out, Q);
+    pass &= check_results(cplx_out_acc[0], std_out, /*is_device*/ false);
 
     return pass;
   }
@@ -87,26 +87,25 @@ template <typename T> struct test_pow_cplx_deci {
     experimental::complex<T> cplx_input{init1.re, init1.im};
     T deci_input = init2.re;
 
-    auto *cplx_out = sycl::malloc_shared<experimental::complex<T>>(1, Q);
-
+    sycl::buffer<experimental::complex<T>> cplx_out_buf{sycl::range{1}};
     // Get std::complex output
     std::complex<T> std_out{ref.re, ref.im};
     if (!use_ref)
       std_out = std::pow(std_in, std_deci_in);
 
     // Check cplx::complex output from device
-    Q.single_task([=]() {
-       cplx_out[0] = experimental::pow(cplx_input, deci_input);
-     }).wait();
-
-    pass &= check_results(cplx_out[0], std_out, /*is_device*/ true);
+    Q.submit([&](sycl::handler &h) {
+      sycl::accessor cplx_out{cplx_out_buf, h};
+      h.single_task(
+          [=]() { cplx_out[0] = experimental::pow(cplx_input, deci_input); });
+    });
+    sycl::host_accessor cplx_out_acc{cplx_out_buf};
+    pass &= check_results(cplx_out_acc[0], std_out, /*is_device*/ true);
 
     // Check cplx::complex output from host
-    cplx_out[0] = experimental::pow(cplx_input, deci_input);
+    cplx_out_acc[0] = experimental::pow(cplx_input, deci_input);
 
-    pass &= check_results(cplx_out[0], std_out, /*is_device*/ false);
-
-    sycl::free(cplx_out, Q);
+    pass &= check_results(cplx_out_acc[0], std_out, /*is_device*/ false);
 
     return pass;
   }
@@ -123,26 +122,24 @@ template <typename T> struct test_pow_deci_cplx {
     experimental::complex<T> cplx_input{init2.re, init2.im};
     T deci_input = init1.re;
 
-    auto *cplx_out = sycl::malloc_shared<experimental::complex<T>>(1, Q);
-
+    sycl::buffer<experimental::complex<T>> cplx_out_buf{sycl::range{1}};
     // Get std::complex output
     std::complex<T> std_out{ref.re, ref.im};
     if (!use_ref)
       std_out = std::pow(std_deci_in, std_in);
 
     // Check cplx::complex output from device
-    Q.single_task([=]() {
-       cplx_out[0] = experimental::pow(deci_input, cplx_input);
-     }).wait();
-
-    pass &= check_results(cplx_out[0], std_out, /*is_device*/ true);
-
+    Q.submit([&](sycl::handler &h) {
+      sycl::accessor cplx_out{cplx_out_buf, h};
+      h.single_task(
+          [=]() { cplx_out[0] = experimental::pow(deci_input, cplx_input); });
+    });
+    sycl::host_accessor cplx_out_acc{cplx_out_buf};
+    pass &= check_results(cplx_out_acc[0], std_out, /*is_device*/ true);
     // Check cplx::complex output from host
-    cplx_out[0] = experimental::pow(deci_input, cplx_input);
-
-    pass &= check_results(cplx_out[0], std_out, /*is_device*/ false);
+    cplx_out_acc[0] = experimental::pow(deci_input, cplx_input);
 
-    sycl::free(cplx_out, Q);
+    pass &= check_results(cplx_out_acc[0], std_out, /*is_device*/ false);
 
     return pass;
   }
diff --git a/sycl/test-e2e/Complex/sycl_complex_stream_test.cpp b/sycl/test-e2e/Complex/sycl_complex_stream_test.cpp
index b9cb8c519366d..d8d645f6ac7dc 100644
--- a/sycl/test-e2e/Complex/sycl_complex_stream_test.cpp
+++ b/sycl/test-e2e/Complex/sycl_complex_stream_test.cpp
@@ -7,17 +7,15 @@
 
 template <typename T> struct test_sycl_stream_operator {
   bool operator()(sycl::queue &Q, cmplx<T> init) {
-    auto *cplx_out = sycl::malloc_shared<experimental::complex<T>>(1, Q);
-    cplx_out[0] = experimental::complex<T>(init.re, init.im);
-
+    experimental::complex<T> cplx(init.re, init.im);
+    sycl::buffer<experimental::complex<T>> cplx_out_buf{&cplx, sycl::range{1}};
     Q.submit([&](sycl::handler &CGH) {
+       sycl::accessor cplx_out{cplx_out_buf, CGH};
        sycl::stream Out(512, 20, CGH);
        CGH.parallel_for<>(sycl::range<1>(1), [=](sycl::id<1> idx) {
          Out << cplx_out[idx] << sycl::endl;
        });
      }).wait();
-
-    sycl::free(cplx_out, Q);
     return true;
   }
 };
diff --git a/sycl/test-e2e/ESIMD/aot_mixed.cpp b/sycl/test-e2e/ESIMD/aot_mixed.cpp
index 32c10969ee51b..5204d413e69ad 100644
--- a/sycl/test-e2e/ESIMD/aot_mixed.cpp
+++ b/sycl/test-e2e/ESIMD/aot_mixed.cpp
@@ -7,9 +7,9 @@
 //===----------------------------------------------------------------------===//
 // TODO: Enable on other GPUs once internal ticket is fixed
 // REQUIRES: ocloc && gpu-intel-gen12
-// RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend=spir64_gen -Xs "-device tgllp" -o %t.sycl.out -DENABLE_SYCL=0 %s
+// RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend=spir64_gen "-device tgllp" -o %t.sycl.out -DENABLE_SYCL=0 %s
 // RUN: %{run} %t.sycl.out
-// RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend=spir64_gen -Xs "-device tgllp" -o %t.out %s
+// RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend=spir64_gen "-device tgllp" -o %t.out %s
 // RUN: %{run} %t.out
 
 // This test checks the following ESIMD ahead-of-time compilation scenarios:
diff --git a/sycl/test-e2e/ESIMD/api/ballot.cpp b/sycl/test-e2e/ESIMD/api/ballot.cpp
index a636bcee147e0..4f87e562972db 100644
--- a/sycl/test-e2e/ESIMD/api/ballot.cpp
+++ b/sycl/test-e2e/ESIMD/api/ballot.cpp
@@ -71,15 +71,13 @@ template <class T, int N> bool test(queue &Q) {
 template <class T> bool test(queue &Q) {
   bool Pass = true;
 
-  // TODO: uncomment calls below once simd<...>.copy_from() starts supporting
-  // sizes other than 8, 16 and 32.
-  //  Pass &= test<T, 4>(Q);
+  Pass &= test<T, 4>(Q);
   Pass &= test<T, 8>(Q);
-  //  Pass &= test<T, 12>(Q);
+  Pass &= test<T, 12>(Q);
   Pass &= test<T, 16>(Q);
-  //  Pass &= test<T, 20>(Q);
-  //  Pass &= test<T, 24>(Q);
-  //  Pass &= test<T, 28>(Q);
+  Pass &= test<T, 20>(Q);
+  Pass &= test<T, 24>(Q);
+  Pass &= test<T, 28>(Q);
   Pass &= test<T, 32>(Q);
 
   return Pass;
diff --git a/sycl/test-e2e/ESIMD/dpas/dpas_common.hpp b/sycl/test-e2e/ESIMD/dpas/dpas_common.hpp
index 3d50a0915afe1..4c9e528646545 100644
--- a/sycl/test-e2e/ESIMD/dpas/dpas_common.hpp
+++ b/sycl/test-e2e/ESIMD/dpas/dpas_common.hpp
@@ -94,7 +94,6 @@ template <dpas_argument_type T> struct DpasNaturalOperandType {
   static constexpr bool is_bf16 = T == dpas_argument_type::bf16;
   static constexpr bool is_tf32 = T == dpas_argument_type::tf32;
 
-  // TODO: support tf32 here.
   using type = std::conditional_t<
       is_sint, signed char,
       std::conditional_t<
@@ -149,7 +148,7 @@ void writeToHorizontallyPackedMatrix(void *VVec, int Row, int Col,
   ElemT *Vec = reinterpret_cast<ElemT *>(VVec);
 
   // 1. Find and read the target 'unsigned int' element.
-  // THe unpacked matrix has dimensions: NumRows*NumCols
+  // The unpacked matrix dimensions are NumRows*NumCols.
   constexpr int ElemsInElemT = sizeof(ElemT) * 8 / ElemBitSize;
   int UnpackedLinearIndex = Row * NumCols + Col;
   int PackedLinearIndex = UnpackedLinearIndex / ElemsInElemT;
@@ -160,7 +159,6 @@ void writeToHorizontallyPackedMatrix(void *VVec, int Row, int Col,
   } else {
     ElemT TargetElem = Vec[PackedLinearIndex];
     // TargetElem has 2 or more elements in it. Need to extract one.
-    // TODO: for now assume that is the case only for 2 or 4-bit integers.
     assert((ElemBitSize == 2 || ElemBitSize == 4) && "Unexpected element type");
 
     unsigned int Offset = (UnpackedLinearIndex % ElemsInElemT) * ElemBitSize;
@@ -196,7 +194,6 @@ ReadT readFromHorizontallyPackedMatrix(void *VVec, int Row, int Col) {
     return static_cast<ReadT>(TargetElem);
   } else {
     // TargetElem has 2 or more elements in it. Need to extract one.
-    // TODO: for now assume that is the case only for 2 or 4-bit integers.
     assert((ElemBitSize == 2 || ElemBitSize == 4) && "Unexpected element type");
     unsigned int Offset = (UnpackedLinearIndex % ElemsInElemT) * ElemBitSize;
     unsigned int Mask = (static_cast<uint64_t>(1) << ElemBitSize) - 1;
diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp b/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp
index 755c436f0b329..8ea1fcf4a08ad 100644
--- a/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp
+++ b/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp
@@ -230,3 +230,222 @@ template <typename T, TestFeatures Features> bool testUSM(queue Q) {
 
   return Passed;
 }
+
+template <typename T, uint16_t N, uint16_t VS, bool UseMask, bool UseProperties,
+          typename ScatterPropertiesT>
+bool testSLM(queue Q, uint32_t MaskStride,
+             ScatterPropertiesT ScatterProperties) {
+  constexpr uint32_t Groups = 8;
+  constexpr uint32_t Threads = 1;
+  constexpr size_t Size = Groups * Threads * N;
+  static_assert(VS > 0 && N % VS == 0,
+                "Incorrect VS parameter. N must be divisible by VS.");
+  constexpr int NOffsets = N / VS;
+  using Tuint = sycl::_V1::ext::intel::esimd::detail::uint_type_t<sizeof(T)>;
+
+  std::cout << "SLM case: T=" << esimd_test::type_name<T>() << ",N=" << N
+            << ", VS=" << VS << ",UseMask=" << UseMask
+            << ",UseProperties=" << UseProperties << std::endl;
+
+  sycl::range<1> GlobalRange{Groups};
+  sycl::range<1> LocalRange{Threads};
+  sycl::nd_range<1> Range{GlobalRange * LocalRange, LocalRange};
+
+  T *Out = static_cast<T *>(sycl::malloc_shared(Size * sizeof(T), Q));
+  for (size_t i = 0; i < Size; i++)
+    Out[i] = i;
+
+  try {
+    Q.submit([&](handler &cgh) {
+       cgh.parallel_for(Range, [=](sycl::nd_item<1> ndi) SYCL_ESIMD_KERNEL {
+         ScatterPropertiesT Props{};
+         uint16_t GlobalID = ndi.get_global_id(0);
+         uint16_t LocalID = ndi.get_local_id(0);
+         uint32_t GlobalElemOffset = GlobalID * N;
+         uint32_t LocalElemOffset = LocalID * N;
+
+         constexpr uint32_t SLMSize = (Threads * N + 8) * sizeof(T);
+         slm_init<SLMSize>();
+
+         if (LocalID == 0) {
+           for (int I = 0; I < Threads * N; I += 8) {
+             simd<T, 8> InVec(Out + GlobalElemOffset + I);
+             simd<uint32_t, 8> Offsets(I * sizeof(T), sizeof(T));
+             slm_scatter<T>(Offsets, InVec);
+           }
+         }
+         barrier();
+
+         simd<uint32_t, NOffsets> ByteOffsets(LocalElemOffset * sizeof(T),
+                                              VS * sizeof(T));
+         auto ByteOffsetsView = ByteOffsets.template select<NOffsets, 1>();
+
+         simd<T, N> Vals = slm_gather<T, N, VS>(ByteOffsets, Props);
+         Vals *= 2;
+
+         auto ValsView = Vals.template select<N, 1>();
+         simd_mask<NOffsets> Pred = 0;
+         for (int I = 0; I < NOffsets; I++)
+           Pred[I] = (I % MaskStride == 0) ? 1 : 0;
+         if constexpr (VS > 1) { // VS > 1 requires specifying <T, N, VS>
+           if constexpr (UseMask) {
+             if constexpr (UseProperties) {
+               if (GlobalID % 4 == 0)
+                 slm_scatter<T, N, VS>(ByteOffsets, Vals, Pred, Props);
+               else if (GlobalID % 4 == 1)
+                 slm_scatter<T, N, VS>(ByteOffsetsView, Vals, Pred, Props);
+               else if (GlobalID % 4 == 2)
+                 slm_scatter<T, N, VS>(ByteOffsets, ValsView, Pred, Props);
+               else if (GlobalID % 4 == 3)
+                 slm_scatter<T, N, VS>(ByteOffsetsView, ValsView, Pred, Props);
+             } else { // UseProperties == false
+               if (GlobalID % 4 == 0)
+                 slm_scatter<T, N, VS>(ByteOffsets, Vals, Pred);
+               else if (GlobalID % 4 == 1)
+                 slm_scatter<T, N, VS>(ByteOffsetsView, Vals, Pred);
+               else if (GlobalID % 4 == 2)
+                 slm_scatter<T, N, VS>(ByteOffsets, ValsView, Pred);
+               else if (GlobalID % 4 == 3)
+                 slm_scatter<T, N, VS>(ByteOffsetsView, ValsView, Pred);
+             }
+           } else { // UseMask == false
+             if constexpr (UseProperties) {
+               if (GlobalID % 4 == 0)
+                 slm_scatter<T, N, VS>(ByteOffsets, Vals, Props);
+               else if (GlobalID % 4 == 1)
+                 slm_scatter<T, N, VS>(ByteOffsetsView, Vals, Props);
+               else if (GlobalID % 4 == 2)
+                 slm_scatter<T, N, VS>(ByteOffsets, ValsView, Props);
+               else if (GlobalID % 4 == 3)
+                 slm_scatter<T, N, VS>(ByteOffsetsView, ValsView, Props);
+             } else { // UseProperties == false
+               if (GlobalID % 4 == 0)
+                 slm_scatter<T, N, VS>(ByteOffsets, Vals);
+               else if (GlobalID % 4 == 1)
+                 slm_scatter<T, N, VS>(ByteOffsetsView, Vals);
+               else if (GlobalID % 4 == 2)
+                 slm_scatter<T, N, VS>(ByteOffsets, ValsView);
+               else if (GlobalID % 4 == 3)
+                 slm_scatter<T, N, VS>(ByteOffsetsView, ValsView);
+             }
+           }
+         } else { // VS == 1
+           if constexpr (UseMask) {
+             if constexpr (UseProperties) {
+               if (GlobalID % 4 == 0)
+                 slm_scatter(ByteOffsets, Vals, Pred, Props);
+               else if (GlobalID % 4 == 1)
+                 slm_scatter(ByteOffsetsView, Vals, Pred, Props);
+               else if (GlobalID % 4 == 2)
+                 slm_scatter<T, N>(ByteOffsets, ValsView, Pred, Props);
+               else if (GlobalID % 4 == 3)
+                 slm_scatter<T, N>(ByteOffsetsView, ValsView, Pred, Props);
+             } else { // UseProperties == false
+               if (GlobalID % 4 == 0)
+                 slm_scatter(ByteOffsets, Vals, Pred);
+               else if (GlobalID % 4 == 1)
+                 slm_scatter<T, N>(ByteOffsetsView, Vals, Pred);
+               else if (GlobalID % 4 == 2)
+                 slm_scatter<T, N>(ByteOffsets, ValsView, Pred);
+               else if (GlobalID % 4 == 3)
+                 slm_scatter<T, N>(ByteOffsetsView, ValsView, Pred);
+             }
+           } else { // UseMask == false
+             if constexpr (UseProperties) {
+               if (GlobalID % 4 == 0)
+                 slm_scatter(ByteOffsets, Vals, Props);
+               else if (GlobalID % 4 == 1)
+                 slm_scatter(ByteOffsetsView, Vals, Props);
+               else if (GlobalID % 4 == 2)
+                 slm_scatter<T, N>(ByteOffsets, ValsView, Props);
+               else if (GlobalID % 4 == 3)
+                 slm_scatter<T, N>(ByteOffsetsView, ValsView, Props);
+             } else { // UseProperties == false
+               if (GlobalID % 4 == 0)
+                 slm_scatter(ByteOffsets, Vals);
+               else if (GlobalID % 4 == 1)
+                 slm_scatter(ByteOffsetsView, Vals);
+               else if (GlobalID % 4 == 2)
+                 slm_scatter<T, N>(ByteOffsets, ValsView);
+               else if (GlobalID % 4 == 3)
+                 slm_scatter<T, N>(ByteOffsetsView, ValsView);
+             }
+           }
+         }
+         barrier();
+         if (LocalID == 0) {
+           for (int I = 0; I < Threads * N; I++) {
+             simd<uint32_t, 1> Offsets(I * sizeof(T), sizeof(T));
+             simd<T, 1> OutVec = slm_gather<T>(Offsets);
+             OutVec.copy_to(Out + GlobalElemOffset + I);
+           }
+         }
+       });
+     }).wait();
+  } catch (sycl::exception const &e) {
+    std::cout << "SYCL exception caught: " << e.what() << '\n';
+    sycl::free(Out, Q);
+    return false;
+  }
+
+  bool Passed = verify(Out, N, Size, VS, MaskStride, UseMask);
+
+  sycl::free(Out, Q);
+
+  return Passed;
+}
+
+template <typename T, TestFeatures Features> bool testSLM(queue Q) {
+  constexpr bool CheckMask = true;
+  constexpr bool CheckProperties = true;
+  properties EmptyProps;
+  properties AlignElemProps{alignment<sizeof(T)>};
+
+  bool Passed = true;
+
+  // Test scatter() that is available on Gen12 and PVC.
+  Passed &= testSLM<T, 1, 1, !CheckMask, CheckProperties>(Q, 2, EmptyProps);
+  Passed &= testSLM<T, 2, 1, !CheckMask, CheckProperties>(Q, 1, EmptyProps);
+  Passed &= testSLM<T, 4, 1, !CheckMask, CheckProperties>(Q, 2, EmptyProps);
+  Passed &= testSLM<T, 8, 1, !CheckMask, CheckProperties>(Q, 2, EmptyProps);
+  Passed &= testSLM<T, 16, 1, !CheckMask, CheckProperties>(Q, 2, EmptyProps);
+  Passed &= testSLM<T, 32, 1, !CheckMask, CheckProperties>(Q, 2, EmptyProps);
+  Passed &= testSLM<T, 2, 1, CheckMask, CheckProperties>(Q, 1, EmptyProps);
+  Passed &= testSLM<T, 4, 1, CheckMask, CheckProperties>(Q, 2, EmptyProps);
+  Passed &= testSLM<T, 8, 1, CheckMask, CheckProperties>(Q, 2, EmptyProps);
+  Passed &= testSLM<T, 16, 1, CheckMask, CheckProperties>(Q, 2, EmptyProps);
+  Passed &= testSLM<T, 32, 1, CheckMask, CheckProperties>(Q, 2, EmptyProps);
+
+  // // Test scatter() without passing compile-time properties argument.
+  Passed &= testSLM<T, 16, 1, !CheckMask, !CheckProperties>(Q, 2, EmptyProps);
+  Passed &= testSLM<T, 32, 1, !CheckMask, !CheckProperties>(Q, 2, EmptyProps);
+  Passed &= testSLM<T, 16, 1, CheckMask, !CheckProperties>(Q, 2, EmptyProps);
+  Passed &= testSLM<T, 32, 1, CheckMask, !CheckProperties>(Q, 2, EmptyProps);
+
+  if constexpr (Features == TestFeatures::PVC ||
+                Features == TestFeatures::DG2) {
+    properties LSCProps{alignment<sizeof(T)>};
+    Passed &= testSLM<T, 1, 1, !CheckMask, CheckProperties>(Q, 2, LSCProps);
+    Passed &= testSLM<T, 2, 1, CheckMask, CheckProperties>(Q, 2, LSCProps);
+    Passed &= testSLM<T, 4, 1, CheckMask, CheckProperties>(Q, 2, LSCProps);
+    Passed &= testSLM<T, 8, 1, CheckMask, CheckProperties>(Q, 2, LSCProps);
+
+    Passed &= testSLM<T, 32, 1, CheckMask, CheckProperties>(Q, 2, LSCProps);
+
+    // Check VS > 1. GPU supports only dwords and qwords in this mode.
+    if constexpr (sizeof(T) >= 4) {
+      // TODO: This test case causes flaky fail. Enable it after the issue
+      // in GPU driver is fixed.
+      // Passed &=
+      //     testSLM<T, 16, 2, CheckMask, CheckProperties>(Q, 2, AlignElemProps)
+      Passed &=
+          testSLM<T, 32, 2, !CheckMask, CheckProperties>(Q, 2, AlignElemProps);
+      Passed &=
+          testSLM<T, 32, 2, CheckMask, CheckProperties>(Q, 2, AlignElemProps);
+      Passed &=
+          testSLM<T, 32, 2, CheckMask, !CheckProperties>(Q, 2, AlignElemProps);
+    }
+  } // TestPVCFeatures
+
+  return Passed;
+}
diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/scatter_usm.cpp b/sycl/test-e2e/ESIMD/unified_memory_api/scatter_usm.cpp
index 929d3c6fc04f7..ff331a421ccef 100644
--- a/sycl/test-e2e/ESIMD/unified_memory_api/scatter_usm.cpp
+++ b/sycl/test-e2e/ESIMD/unified_memory_api/scatter_usm.cpp
@@ -5,13 +5,13 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===------------------------------------------------------------------===//
-// RUN: %{build} -fsycl-device-code-split=per_kernel -o %t.out
+// RUN: %{build} -fsycl-device-code-split=per_kernel -D__ESIMD_GATHER_SCATTER_LLVM_IR -o %t.out
 // RUN: %{run} %t.out
 
 // The test verifies esimd::scatter() functions accepting USM pointer
 // and optional compile-time esimd::properties.
-// The scatter() calls in this test do not use cache-hint
-// properties to not impose using DG2/PVC features.
+// The scatter() calls in this test do not use cache-hint properties
+// or VS > 1 (number of stores per offset) to not impose using PVC features.
 
 #include "Inputs/scatter.hpp"
 
diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/scatter_usm_legacy.cpp b/sycl/test-e2e/ESIMD/unified_memory_api/scatter_usm_legacy.cpp
new file mode 100644
index 0000000000000..08ac29ba6b605
--- /dev/null
+++ b/sycl/test-e2e/ESIMD/unified_memory_api/scatter_usm_legacy.cpp
@@ -0,0 +1,21 @@
+//==------- scatter_usm_legacy.cpp - DPC++ ESIMD on-device test -----------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// Use per-kernel compilation to have more information about failing cases.
+// RUN: %{build} -fsycl-device-code-split=per_kernel -o %t.out
+// RUN: %{run} %t.out
+
+// The test verifies esimd::scatter() functions accepting USM pointer
+// and optional compile-time esimd::properties.
+// The scatter() calls in this test do not use cache-hint properties
+// or VS > 1 (number of stores per offset) to not impose using PVC features.
+//
+// TODO: Remove this test when GPU driver issue with llvm.masked.scatter is
+// resolved and ESIMD starts using llvm.masked.scatter by default.
+// "-D__ESIMD_GATHER_SCATTER_LLVM_IR" is not used here.
+
+#include "scatter_usm.cpp"
diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/slm_scatter.cpp b/sycl/test-e2e/ESIMD/unified_memory_api/slm_scatter.cpp
new file mode 100644
index 0000000000000..ffa0a718e7689
--- /dev/null
+++ b/sycl/test-e2e/ESIMD/unified_memory_api/slm_scatter.cpp
@@ -0,0 +1,33 @@
+//==------- slm_scatter.cpp - DPC++ ESIMD on-device test ---------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===------------------------------------------------------------------===//
+// RUN: %{build} -fsycl-device-code-split=per_kernel -D__ESIMD_GATHER_SCATTER_LLVM_IR -o %t.out
+// RUN: %{run} %t.out
+
+// The test verifies esimd::slm_scatter() functions accepting  optional
+// compile-time esimd::properties. The slm_scatter() calls in this test do not
+// use VS > 1 (number of stores per offset) to not impose using PVC features.
+
+#include "Inputs/scatter.hpp"
+
+int main() {
+  auto Q = queue{gpu_selector_v};
+  esimd_test::printTestLabel(Q);
+
+  constexpr auto TestFeatures = TestFeatures::Generic;
+  bool Passed = true;
+
+  Passed &= testSLM<int8_t, TestFeatures>(Q);
+  Passed &= testSLM<int16_t, TestFeatures>(Q);
+  if (Q.get_device().has(sycl::aspect::fp16))
+    Passed &= testSLM<sycl::half, TestFeatures>(Q);
+  Passed &= testSLM<uint32_t, TestFeatures>(Q);
+  Passed &= testSLM<float, TestFeatures>(Q);
+
+  std::cout << (Passed ? "Passed\n" : "FAILED\n");
+  return Passed ? 0 : 1;
+}
diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/slm_scatter_dg2_pvc.cpp b/sycl/test-e2e/ESIMD/unified_memory_api/slm_scatter_dg2_pvc.cpp
new file mode 100644
index 0000000000000..4ebe20e376cf1
--- /dev/null
+++ b/sycl/test-e2e/ESIMD/unified_memory_api/slm_scatter_dg2_pvc.cpp
@@ -0,0 +1,37 @@
+//==------- slm_scatter_dg2_pvc.cpp - DPC++ ESIMD on-device test--------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===------------------------------------------------------------------===//
+// REQUIRES: gpu-intel-pvc || gpu-intel-dg2
+// RUN: %{build} -fsycl-device-code-split=per_kernel -o %t.out
+// RUN: %{run} %t.out
+
+// The test verifies esimd::slm_scatter() functions accepting
+// optional compile-time esimd::properties.
+// The slm_scatter() calls in this test use DG2 or PVC features.
+
+#include "Inputs/scatter.hpp"
+
+int main() {
+  auto Q = queue{gpu_selector_v};
+  esimd_test::printTestLabel(Q);
+
+  constexpr auto TestFeatures = TestFeatures::PVC;
+  bool Passed = true;
+
+  Passed &= testSLM<int8_t, TestFeatures>(Q);
+  Passed &= testSLM<int16_t, TestFeatures>(Q);
+  if (Q.get_device().has(sycl::aspect::fp16))
+    Passed &= testSLM<sycl::half, TestFeatures>(Q);
+  Passed &= testSLM<uint32_t, TestFeatures>(Q);
+  Passed &= testSLM<float, TestFeatures>(Q);
+  Passed &= testSLM<uint64_t, TestFeatures>(Q);
+  if (Q.get_device().has(sycl::aspect::fp64))
+    Passed &= testSLM<double, TestFeatures>(Q);
+
+  std::cout << (Passed ? "Passed\n" : "FAILED\n");
+  return Passed ? 0 : 1;
+}
diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/slm_scatter_legacy.cpp b/sycl/test-e2e/ESIMD/unified_memory_api/slm_scatter_legacy.cpp
new file mode 100644
index 0000000000000..0aca53311ef6b
--- /dev/null
+++ b/sycl/test-e2e/ESIMD/unified_memory_api/slm_scatter_legacy.cpp
@@ -0,0 +1,20 @@
+//==------- slm_scatter_legacy.cpp - DPC++ ESIMD on-device test -----------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// Use per-kernel compilation to have more information about failing cases.
+// RUN: %{build} -fsycl-device-code-split=per_kernel -o %t.out
+// RUN: %{run} %t.out
+
+// The test verifies esimd::slm_scatter() functions accepting  optional
+// compile-time esimd::properties. The slm_scatter() calls in this test do not
+// use VS > 1 (number of stores per offset) to not impose using PVC features.
+//
+// TODO: Remove this test when GPU driver issue with llvm.masked.scatter is
+// resolved and ESIMD starts using llvm.masked.scatter by default.
+// "-D__ESIMD_GATHER_SCATTER_LLVM_IR" is not used here.
+
+#include "slm_scatter.cpp"
diff --git a/sycl/test-e2e/Graph/Explicit/add_nodes_after_finalize.cpp b/sycl/test-e2e/Graph/Explicit/add_nodes_after_finalize.cpp
index cf17e3b6c3dba..b39f3b4fc889b 100644
--- a/sycl/test-e2e/Graph/Explicit/add_nodes_after_finalize.cpp
+++ b/sycl/test-e2e/Graph/Explicit/add_nodes_after_finalize.cpp
@@ -5,6 +5,10 @@
 // Extra run to check for immediate-command-list in Level Zero
 // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
 //
+// Post-commit test failed
+// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479
+// Temporarily disable USM based tests while investigating the bug.
+// UNSUPPORTED: gpu-intel-dg2
 
 #define GRAPH_E2E_EXPLICIT
 
diff --git a/sycl/test-e2e/Graph/Explicit/basic_usm.cpp b/sycl/test-e2e/Graph/Explicit/basic_usm.cpp
index 6e76ec42c079e..00940dcd6d68c 100644
--- a/sycl/test-e2e/Graph/Explicit/basic_usm.cpp
+++ b/sycl/test-e2e/Graph/Explicit/basic_usm.cpp
@@ -5,6 +5,10 @@
 // Extra run to check for immediate-command-list in Level Zero
 // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
 //
+// Post-commit test failed
+// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479
+// Temporarily disable the tests while investigating the bug.
+// UNSUPPORTED: gpu-intel-dg2
 
 #define GRAPH_E2E_EXPLICIT
 
diff --git a/sycl/test-e2e/Graph/Explicit/basic_usm_host.cpp b/sycl/test-e2e/Graph/Explicit/basic_usm_host.cpp
index 040ce8c30c38b..fd8fcec4b89d2 100644
--- a/sycl/test-e2e/Graph/Explicit/basic_usm_host.cpp
+++ b/sycl/test-e2e/Graph/Explicit/basic_usm_host.cpp
@@ -5,6 +5,10 @@
 // Extra run to check for immediate-command-list in Level Zero
 // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
 //
+// Post-commit test failed
+// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479
+// Temporarily disable the tests while investigating the bug.
+// UNSUPPORTED: gpu-intel-dg2
 
 #define GRAPH_E2E_EXPLICIT
 
diff --git a/sycl/test-e2e/Graph/Explicit/basic_usm_mixed.cpp b/sycl/test-e2e/Graph/Explicit/basic_usm_mixed.cpp
index bb12ff6006004..5855fc25ed4cc 100644
--- a/sycl/test-e2e/Graph/Explicit/basic_usm_mixed.cpp
+++ b/sycl/test-e2e/Graph/Explicit/basic_usm_mixed.cpp
@@ -5,6 +5,10 @@
 // Extra run to check for immediate-command-list in Level Zero
 // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
 //
+// Post-commit test failed
+// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479
+// Temporarily disable the tests while investigating the bug.
+// UNSUPPORTED: gpu-intel-dg2
 
 #define GRAPH_E2E_EXPLICIT
 
diff --git a/sycl/test-e2e/Graph/Explicit/basic_usm_shared.cpp b/sycl/test-e2e/Graph/Explicit/basic_usm_shared.cpp
index f989a27a1ca39..3be8febbe008e 100644
--- a/sycl/test-e2e/Graph/Explicit/basic_usm_shared.cpp
+++ b/sycl/test-e2e/Graph/Explicit/basic_usm_shared.cpp
@@ -5,6 +5,10 @@
 // Extra run to check for immediate-command-list in Level Zero
 // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
 //
+// Post-commit test failed
+// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479
+// Temporarily disable the tests while investigating the bug.
+// UNSUPPORTED: gpu-intel-dg2
 
 #define GRAPH_E2E_EXPLICIT
 
diff --git a/sycl/test-e2e/Graph/Explicit/empty_node.cpp b/sycl/test-e2e/Graph/Explicit/empty_node.cpp
index 7f7501175899c..301c53aa69795 100644
--- a/sycl/test-e2e/Graph/Explicit/empty_node.cpp
+++ b/sycl/test-e2e/Graph/Explicit/empty_node.cpp
@@ -5,6 +5,10 @@
 // Extra run to check for immediate-command-list in Level Zero
 // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
 //
+// Post-commit test failed
+// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479
+// Temporarily disable USM based tests while investigating the bug.
+// UNSUPPORTED: gpu-intel-dg2
 
 #define GRAPH_E2E_EXPLICIT
 
diff --git a/sycl/test-e2e/Graph/Explicit/host_task.cpp b/sycl/test-e2e/Graph/Explicit/host_task.cpp
index 167219bf4b8ef..62bcd167e484f 100644
--- a/sycl/test-e2e/Graph/Explicit/host_task.cpp
+++ b/sycl/test-e2e/Graph/Explicit/host_task.cpp
@@ -5,6 +5,10 @@
 // Extra run to check for immediate-command-list in Level Zero
 // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
 //
+// Post-commit test failed
+// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479
+// Temporarily disable USM based tests while investigating the bug.
+// UNSUPPORTED: gpu-intel-dg2
 
 #define GRAPH_E2E_EXPLICIT
 
diff --git a/sycl/test-e2e/Graph/Explicit/host_task2.cpp b/sycl/test-e2e/Graph/Explicit/host_task2.cpp
index f7d59f48aac09..78be02a7bc991 100644
--- a/sycl/test-e2e/Graph/Explicit/host_task2.cpp
+++ b/sycl/test-e2e/Graph/Explicit/host_task2.cpp
@@ -5,6 +5,10 @@
 // Extra run to check for immediate-command-list in Level Zero
 // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
 //
+// Post-commit test failed
+// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479
+// Temporarily disable USM based tests while investigating the bug.
+// UNSUPPORTED: gpu-intel-dg2
 
 #define GRAPH_E2E_EXPLICIT
 
diff --git a/sycl/test-e2e/Graph/Explicit/host_task_last.cpp b/sycl/test-e2e/Graph/Explicit/host_task_last.cpp
index aed2916284cc1..26dcf148d6f7d 100644
--- a/sycl/test-e2e/Graph/Explicit/host_task_last.cpp
+++ b/sycl/test-e2e/Graph/Explicit/host_task_last.cpp
@@ -5,6 +5,10 @@
 // Extra run to check for immediate-command-list in Level Zero
 // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
 //
+// Post-commit test failed
+// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479
+// Temporarily disable USM based tests while investigating the bug.
+// UNSUPPORTED: gpu-intel-dg2
 
 #define GRAPH_E2E_EXPLICIT
 
diff --git a/sycl/test-e2e/Graph/Explicit/host_task_multiple_deps.cpp b/sycl/test-e2e/Graph/Explicit/host_task_multiple_deps.cpp
index 3d03b7f6013ad..32c7157d0958c 100644
--- a/sycl/test-e2e/Graph/Explicit/host_task_multiple_deps.cpp
+++ b/sycl/test-e2e/Graph/Explicit/host_task_multiple_deps.cpp
@@ -5,6 +5,10 @@
 // Extra run to check for immediate-command-list in Level Zero
 // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
 //
+// Post-commit test failed
+// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479
+// Temporarily disable USM based tests while investigating the bug.
+// UNSUPPORTED: gpu-intel-dg2
 
 #define GRAPH_E2E_EXPLICIT
 
diff --git a/sycl/test-e2e/Graph/Explicit/host_task_single.cpp b/sycl/test-e2e/Graph/Explicit/host_task_single.cpp
index 6fc8744e04c18..b2236b9a0d91b 100644
--- a/sycl/test-e2e/Graph/Explicit/host_task_single.cpp
+++ b/sycl/test-e2e/Graph/Explicit/host_task_single.cpp
@@ -5,6 +5,10 @@
 // Extra run to check for immediate-command-list in Level Zero
 // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
 //
+// Post-commit test failed
+// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479
+// Temporarily disable USM based tests while investigating the bug.
+// UNSUPPORTED: gpu-intel-dg2
 
 #define GRAPH_E2E_EXPLICIT
 
diff --git a/sycl/test-e2e/Graph/Explicit/host_task_successive.cpp b/sycl/test-e2e/Graph/Explicit/host_task_successive.cpp
index da26b8b182296..2147d075715af 100644
--- a/sycl/test-e2e/Graph/Explicit/host_task_successive.cpp
+++ b/sycl/test-e2e/Graph/Explicit/host_task_successive.cpp
@@ -5,6 +5,10 @@
 // Extra run to check for immediate-command-list in Level Zero
 // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
 //
+// Post-commit test failed
+// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479
+// Temporarily disable USM based tests while investigating the bug.
+// UNSUPPORTED: gpu-intel-dg2
 
 #define GRAPH_E2E_EXPLICIT
 
diff --git a/sycl/test-e2e/Graph/Explicit/memadvise.cpp b/sycl/test-e2e/Graph/Explicit/memadvise.cpp
index 11e0d6296290f..1a8313478ddb5 100644
--- a/sycl/test-e2e/Graph/Explicit/memadvise.cpp
+++ b/sycl/test-e2e/Graph/Explicit/memadvise.cpp
@@ -2,7 +2,10 @@
 // RUN: %if linux && (level_zero || cuda) %{ env SYCL_PI_TRACE=2 %{run} %t.out 2>&1 FileCheck %s %} %else %{ %{run} %t.out %}
 
 // Mem advise command not supported for OpenCL
-// UNSUPPORTED: opencl
+// Post-commit test failed
+// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479
+// Temporarily disable USM based tests while investigating the bug.
+// UNSUPPORTED: opencl, gpu-intel-dg2
 
 // Since Mem advise is only a memory hint that doesn't
 // impact results but only performances, we verify
diff --git a/sycl/test-e2e/Graph/Explicit/multiple_exec_graphs.cpp b/sycl/test-e2e/Graph/Explicit/multiple_exec_graphs.cpp
index a83775b8ecb8a..e9c76ad01113b 100644
--- a/sycl/test-e2e/Graph/Explicit/multiple_exec_graphs.cpp
+++ b/sycl/test-e2e/Graph/Explicit/multiple_exec_graphs.cpp
@@ -5,6 +5,10 @@
 // Extra run to check for immediate-command-list in Level Zero
 // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
 //
+// Post-commit test failed
+// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479
+// Temporarily disable USM based tests while investigating the bug.
+// UNSUPPORTED: gpu-intel-dg2
 
 #define GRAPH_E2E_EXPLICIT
 
diff --git a/sycl/test-e2e/Graph/Explicit/prefetch.cpp b/sycl/test-e2e/Graph/Explicit/prefetch.cpp
index 8e0e6e15c292d..4b670ebc89357 100644
--- a/sycl/test-e2e/Graph/Explicit/prefetch.cpp
+++ b/sycl/test-e2e/Graph/Explicit/prefetch.cpp
@@ -2,7 +2,10 @@
 // RUN: %if linux && (level_zero || cuda) %{ env SYCL_PI_TRACE=2 %{run} %t.out 2>&1 FileCheck %s %} %else %{ %{run} %t.out %}
 
 // prefetch command not supported for OpenCL
-// UNSUPPORTED: opencl
+// Post-commit test failed
+// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479
+// Temporarily disable USM based tests while investigating the bug.
+// UNSUPPORTED: opencl, gpu-intel-dg2
 
 // Since Prefetch is only a memory hint that doesn't
 // impact results but only performances, we verify
diff --git a/sycl/test-e2e/Graph/Explicit/queue_constructor_usm.cpp b/sycl/test-e2e/Graph/Explicit/queue_constructor_usm.cpp
index a4f3710eea9f0..0760b5805e6d9 100644
--- a/sycl/test-e2e/Graph/Explicit/queue_constructor_usm.cpp
+++ b/sycl/test-e2e/Graph/Explicit/queue_constructor_usm.cpp
@@ -4,6 +4,11 @@
 // RUN: %if level_zero %{env UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s %}
 //
 // CHECK-NOT: LEAK
+//
+// Post-commit test failed
+// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479
+// Temporarily disable the tests while investigating the bug.
+// UNSUPPORTED: gpu-intel-dg2
 
 #define GRAPH_E2E_EXPLICIT
 
diff --git a/sycl/test-e2e/Graph/Explicit/queue_shortcuts.cpp b/sycl/test-e2e/Graph/Explicit/queue_shortcuts.cpp
index a2589cda54320..85d3b106bc3cd 100644
--- a/sycl/test-e2e/Graph/Explicit/queue_shortcuts.cpp
+++ b/sycl/test-e2e/Graph/Explicit/queue_shortcuts.cpp
@@ -5,6 +5,10 @@
 // Extra run to check for immediate-command-list in Level Zero
 // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
 //
+// Post-commit test failed
+// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479
+// Temporarily disable USM based tests while investigating the bug.
+// UNSUPPORTED: gpu-intel-dg2
 
 #define GRAPH_E2E_EXPLICIT
 
diff --git a/sycl/test-e2e/Graph/Explicit/repeated_exec.cpp b/sycl/test-e2e/Graph/Explicit/repeated_exec.cpp
index 8b1f6ba3d4455..935f431b09332 100644
--- a/sycl/test-e2e/Graph/Explicit/repeated_exec.cpp
+++ b/sycl/test-e2e/Graph/Explicit/repeated_exec.cpp
@@ -5,6 +5,10 @@
 // Extra run to check for immediate-command-list in Level Zero
 // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
 //
+// Post-commit test failed
+// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479
+// Temporarily disable USM based tests while investigating the bug.
+// UNSUPPORTED: gpu-intel-dg2
 
 #define GRAPH_E2E_EXPLICIT
 
diff --git a/sycl/test-e2e/Graph/Explicit/sub_graph.cpp b/sycl/test-e2e/Graph/Explicit/sub_graph.cpp
index 830cc5a42a668..c60aa2f63e34f 100644
--- a/sycl/test-e2e/Graph/Explicit/sub_graph.cpp
+++ b/sycl/test-e2e/Graph/Explicit/sub_graph.cpp
@@ -5,6 +5,10 @@
 // Extra run to check for immediate-command-list in Level Zero
 // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
 //
+// Post-commit test failed
+// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479
+// Temporarily disable the tests while investigating the bug.
+// UNSUPPORTED: gpu-intel-dg2
 
 #define GRAPH_E2E_EXPLICIT
 
diff --git a/sycl/test-e2e/Graph/Explicit/sub_graph_execute_without_parent.cpp b/sycl/test-e2e/Graph/Explicit/sub_graph_execute_without_parent.cpp
index bc3cb1c8f352d..772c07599300f 100644
--- a/sycl/test-e2e/Graph/Explicit/sub_graph_execute_without_parent.cpp
+++ b/sycl/test-e2e/Graph/Explicit/sub_graph_execute_without_parent.cpp
@@ -5,6 +5,10 @@
 // Extra run to check for immediate-command-list in Level Zero
 // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
 //
+// Post-commit test failed
+// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479
+// Temporarily disable USM based tests while investigating the bug.
+// UNSUPPORTED: gpu-intel-dg2
 
 #define GRAPH_E2E_EXPLICIT
 
diff --git a/sycl/test-e2e/Graph/Explicit/sub_graph_multiple_submission.cpp b/sycl/test-e2e/Graph/Explicit/sub_graph_multiple_submission.cpp
index 92add086ee20d..17cdde9b87956 100644
--- a/sycl/test-e2e/Graph/Explicit/sub_graph_multiple_submission.cpp
+++ b/sycl/test-e2e/Graph/Explicit/sub_graph_multiple_submission.cpp
@@ -5,6 +5,10 @@
 // Extra run to check for immediate-command-list in Level Zero
 // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
 //
+// Post-commit test failed
+// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479
+// Temporarily disable USM based tests while investigating the bug.
+// UNSUPPORTED: gpu-intel-dg2
 
 #define GRAPH_E2E_EXPLICIT
 
diff --git a/sycl/test-e2e/Graph/Explicit/sub_graph_nested.cpp b/sycl/test-e2e/Graph/Explicit/sub_graph_nested.cpp
index 05ecfbd982a01..7fd72c79722d4 100644
--- a/sycl/test-e2e/Graph/Explicit/sub_graph_nested.cpp
+++ b/sycl/test-e2e/Graph/Explicit/sub_graph_nested.cpp
@@ -5,6 +5,10 @@
 // Extra run to check for immediate-command-list in Level Zero
 // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
 //
+// Post-commit test failed
+// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479
+// Temporarily disable USM based tests while investigating the bug.
+// UNSUPPORTED: gpu-intel-dg2
 
 #define GRAPH_E2E_EXPLICIT
 
diff --git a/sycl/test-e2e/Graph/Explicit/sub_graph_two_parent_graphs.cpp b/sycl/test-e2e/Graph/Explicit/sub_graph_two_parent_graphs.cpp
index 8fa8e0c334de2..d43e7e221e68c 100644
--- a/sycl/test-e2e/Graph/Explicit/sub_graph_two_parent_graphs.cpp
+++ b/sycl/test-e2e/Graph/Explicit/sub_graph_two_parent_graphs.cpp
@@ -5,6 +5,10 @@
 // Extra run to check for immediate-command-list in Level Zero
 // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
 //
+// Post-commit test failed
+// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479
+// Temporarily disable USM based tests while investigating the bug.
+// UNSUPPORTED: gpu-intel-dg2
 
 #define GRAPH_E2E_EXPLICIT
 
diff --git a/sycl/test-e2e/Graph/Explicit/usm_copy.cpp b/sycl/test-e2e/Graph/Explicit/usm_copy.cpp
index 323d3e35935ac..6b60911d4627e 100644
--- a/sycl/test-e2e/Graph/Explicit/usm_copy.cpp
+++ b/sycl/test-e2e/Graph/Explicit/usm_copy.cpp
@@ -7,7 +7,10 @@
 //
 //
 // USM copy command not supported for OpenCL
-// UNSUPPORTED: opencl
+// Post-commit test failed
+// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479
+// Temporarily disable USM based tests while investigating the bug.
+// UNSUPPORTED: opencl, gpu-intel-dg2
 
 #define GRAPH_E2E_EXPLICIT
 
diff --git a/sycl/test-e2e/Graph/Explicit/usm_fill.cpp b/sycl/test-e2e/Graph/Explicit/usm_fill.cpp
index 86076768f5d32..966809cf5a8d3 100644
--- a/sycl/test-e2e/Graph/Explicit/usm_fill.cpp
+++ b/sycl/test-e2e/Graph/Explicit/usm_fill.cpp
@@ -5,6 +5,10 @@
 // Extra run to check for immediate-command-list in Level Zero
 // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
 //
+// Post-commit test failed
+// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479
+// Temporarily disable USM based tests while investigating the bug.
+// UNSUPPORTED: gpu-intel-dg2
 
 #define GRAPH_E2E_EXPLICIT
 
diff --git a/sycl/test-e2e/Graph/Explicit/usm_fill_shared.cpp b/sycl/test-e2e/Graph/Explicit/usm_fill_shared.cpp
index e0100d42d7267..53feb52cad798 100644
--- a/sycl/test-e2e/Graph/Explicit/usm_fill_shared.cpp
+++ b/sycl/test-e2e/Graph/Explicit/usm_fill_shared.cpp
@@ -5,6 +5,10 @@
 // Extra run to check for immediate-command-list in Level Zero
 // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
 //
+// Post-commit test failed
+// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479
+// Temporarily disable USM based tests while investigating the bug.
+// UNSUPPORTED: gpu-intel-dg2
 
 #define GRAPH_E2E_EXPLICIT
 
diff --git a/sycl/test-e2e/Graph/Explicit/usm_memset.cpp b/sycl/test-e2e/Graph/Explicit/usm_memset.cpp
index e9bb9ebab0fa9..50821fd8d008b 100644
--- a/sycl/test-e2e/Graph/Explicit/usm_memset.cpp
+++ b/sycl/test-e2e/Graph/Explicit/usm_memset.cpp
@@ -6,7 +6,10 @@
 // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
 //
 // USM memset command not supported for OpenCL
-// UNSUPPORTED: opencl
+// Post-commit test failed
+// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479
+// Temporarily disable USM based tests while investigating the bug.
+// UNSUPPORTED: opencl, gpu-intel-dg2
 
 #define GRAPH_E2E_EXPLICIT
 
diff --git a/sycl/test-e2e/Graph/RecordReplay/add_nodes_after_finalize.cpp b/sycl/test-e2e/Graph/RecordReplay/add_nodes_after_finalize.cpp
index ba81666277e44..c0729ebb3a015 100644
--- a/sycl/test-e2e/Graph/RecordReplay/add_nodes_after_finalize.cpp
+++ b/sycl/test-e2e/Graph/RecordReplay/add_nodes_after_finalize.cpp
@@ -7,7 +7,10 @@
 //
 //
 // Temporarily disabled until failure is addressed.
-// UNSUPPORTED: windows
+// Post-commit test failed
+// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479
+// Temporarily disable USM based tests while investigating the bug.
+// UNSUPPORTED: windows, gpu-intel-dg2
 
 #define GRAPH_E2E_RECORD_REPLAY
 
diff --git a/sycl/test-e2e/Graph/RecordReplay/after_use.cpp b/sycl/test-e2e/Graph/RecordReplay/after_use.cpp
index 767ca8b425ab7..4a87a09e0fbce 100644
--- a/sycl/test-e2e/Graph/RecordReplay/after_use.cpp
+++ b/sycl/test-e2e/Graph/RecordReplay/after_use.cpp
@@ -5,6 +5,10 @@
 // Extra run to check for immediate-command-list in Level Zero
 // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
 //
+// Post-commit test failed
+// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479
+// Temporarily disable the tests while investigating the bug.
+// UNSUPPORTED: gpu-intel-dg2
 
 // This test attempts recording a set of kernels after they have already been
 // executed once before.
diff --git a/sycl/test-e2e/Graph/RecordReplay/barrier_with_work.cpp b/sycl/test-e2e/Graph/RecordReplay/barrier_with_work.cpp
index c12038ba46185..951a016994261 100644
--- a/sycl/test-e2e/Graph/RecordReplay/barrier_with_work.cpp
+++ b/sycl/test-e2e/Graph/RecordReplay/barrier_with_work.cpp
@@ -5,6 +5,10 @@
 // Extra run to check for immediate-command-list in Level Zero
 // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
 //
+// Post-commit test failed
+// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479
+// Temporarily disable the tests while investigating the bug.
+// UNSUPPORTED: gpu-intel-dg2
 
 #include "../graph_common.hpp"
 
diff --git a/sycl/test-e2e/Graph/RecordReplay/basic_usm.cpp b/sycl/test-e2e/Graph/RecordReplay/basic_usm.cpp
index ffddf70716250..bcdfb702b78c7 100644
--- a/sycl/test-e2e/Graph/RecordReplay/basic_usm.cpp
+++ b/sycl/test-e2e/Graph/RecordReplay/basic_usm.cpp
@@ -5,6 +5,10 @@
 // Extra run to check for immediate-command-list in Level Zero
 // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
 //
+// Post-commit test failed
+// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479
+// Temporarily disable the tests while investigating the bug.
+// UNSUPPORTED: gpu-intel-dg2
 
 #define GRAPH_E2E_RECORD_REPLAY
 
diff --git a/sycl/test-e2e/Graph/RecordReplay/basic_usm_host.cpp b/sycl/test-e2e/Graph/RecordReplay/basic_usm_host.cpp
index a90931934f0ce..b71fd0a690220 100644
--- a/sycl/test-e2e/Graph/RecordReplay/basic_usm_host.cpp
+++ b/sycl/test-e2e/Graph/RecordReplay/basic_usm_host.cpp
@@ -5,6 +5,10 @@
 // Extra run to check for immediate-command-list in Level Zero
 // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
 //
+// Post-commit test failed
+// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479
+// Temporarily disable the tests while investigating the bug.
+// UNSUPPORTED: gpu-intel-dg2
 
 #define GRAPH_E2E_RECORD_REPLAY
 
diff --git a/sycl/test-e2e/Graph/RecordReplay/basic_usm_mixed.cpp b/sycl/test-e2e/Graph/RecordReplay/basic_usm_mixed.cpp
index 410850f5ff7a8..74bc18a7b3496 100644
--- a/sycl/test-e2e/Graph/RecordReplay/basic_usm_mixed.cpp
+++ b/sycl/test-e2e/Graph/RecordReplay/basic_usm_mixed.cpp
@@ -5,6 +5,10 @@
 // Extra run to check for immediate-command-list in Level Zero
 // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
 //
+// Post-commit test failed
+// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479
+// Temporarily disable the tests while investigating the bug.
+// UNSUPPORTED: gpu-intel-dg2
 
 #define GRAPH_E2E_RECORD_REPLAY
 
diff --git a/sycl/test-e2e/Graph/RecordReplay/basic_usm_shared.cpp b/sycl/test-e2e/Graph/RecordReplay/basic_usm_shared.cpp
index 2f19c2706380b..51e4597354473 100644
--- a/sycl/test-e2e/Graph/RecordReplay/basic_usm_shared.cpp
+++ b/sycl/test-e2e/Graph/RecordReplay/basic_usm_shared.cpp
@@ -5,6 +5,10 @@
 // Extra run to check for immediate-command-list in Level Zero
 // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
 //
+// Post-commit test failed
+// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479
+// Temporarily disable the tests while investigating the bug.
+// UNSUPPORTED: gpu-intel-dg2
 
 #define GRAPH_E2E_RECORD_REPLAY
 
diff --git a/sycl/test-e2e/Graph/RecordReplay/empty_node.cpp b/sycl/test-e2e/Graph/RecordReplay/empty_node.cpp
index 70128db712f78..2155b226e2fc3 100644
--- a/sycl/test-e2e/Graph/RecordReplay/empty_node.cpp
+++ b/sycl/test-e2e/Graph/RecordReplay/empty_node.cpp
@@ -5,6 +5,10 @@
 // Extra run to check for immediate-command-list in Level Zero
 // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
 //
+// Post-commit test failed
+// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479
+// Temporarily disable USM based tests while investigating the bug.
+// UNSUPPORTED: gpu-intel-dg2
 
 #define GRAPH_E2E_RECORD_REPLAY
 
diff --git a/sycl/test-e2e/Graph/RecordReplay/host_task.cpp b/sycl/test-e2e/Graph/RecordReplay/host_task.cpp
index 582a3fb3f14ff..4f4d50fe3b3f3 100644
--- a/sycl/test-e2e/Graph/RecordReplay/host_task.cpp
+++ b/sycl/test-e2e/Graph/RecordReplay/host_task.cpp
@@ -5,6 +5,10 @@
 // Extra run to check for immediate-command-list in Level Zero
 // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
 //
+// Post-commit test failed
+// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479
+// Temporarily disable USM based tests while investigating the bug.
+// UNSUPPORTED: gpu-intel-dg2
 
 #define GRAPH_E2E_RECORD_REPLAY
 
diff --git a/sycl/test-e2e/Graph/RecordReplay/host_task2.cpp b/sycl/test-e2e/Graph/RecordReplay/host_task2.cpp
index f947a13ffd63d..cd867eb4caadd 100644
--- a/sycl/test-e2e/Graph/RecordReplay/host_task2.cpp
+++ b/sycl/test-e2e/Graph/RecordReplay/host_task2.cpp
@@ -5,6 +5,10 @@
 // Extra run to check for immediate-command-list in Level Zero
 // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
 //
+// Post-commit test failed
+// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479
+// Temporarily disable USM based tests while investigating the bug.
+// UNSUPPORTED: gpu-intel-dg2
 
 #define GRAPH_E2E_RECORD_REPLAY
 
diff --git a/sycl/test-e2e/Graph/RecordReplay/host_task_in_order.cpp b/sycl/test-e2e/Graph/RecordReplay/host_task_in_order.cpp
index e40fe3a0e963c..c3037c78d8eaf 100644
--- a/sycl/test-e2e/Graph/RecordReplay/host_task_in_order.cpp
+++ b/sycl/test-e2e/Graph/RecordReplay/host_task_in_order.cpp
@@ -5,6 +5,10 @@
 // Extra run to check for immediate-command-list in Level Zero
 // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
 //
+// Post-commit test failed
+// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479
+// Temporarily disable the tests while investigating the bug.
+// UNSUPPORTED: gpu-intel-dg2
 
 // This test uses a host_task when adding a command_graph node to an
 // in-order queue.
diff --git a/sycl/test-e2e/Graph/RecordReplay/host_task_last.cpp b/sycl/test-e2e/Graph/RecordReplay/host_task_last.cpp
index c8dfa02c63ea7..64d625197f089 100644
--- a/sycl/test-e2e/Graph/RecordReplay/host_task_last.cpp
+++ b/sycl/test-e2e/Graph/RecordReplay/host_task_last.cpp
@@ -5,6 +5,10 @@
 // Extra run to check for immediate-command-list in Level Zero
 // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
 //
+// Post-commit test failed
+// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479
+// Temporarily disable USM based tests while investigating the bug.
+// UNSUPPORTED: gpu-intel-dg2
 
 #define GRAPH_E2E_RECORD_REPLAY
 
diff --git a/sycl/test-e2e/Graph/RecordReplay/host_task_multiple_deps.cpp b/sycl/test-e2e/Graph/RecordReplay/host_task_multiple_deps.cpp
index 7e312df58d092..a8ff9b53637cd 100644
--- a/sycl/test-e2e/Graph/RecordReplay/host_task_multiple_deps.cpp
+++ b/sycl/test-e2e/Graph/RecordReplay/host_task_multiple_deps.cpp
@@ -5,6 +5,10 @@
 // Extra run to check for immediate-command-list in Level Zero
 // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
 //
+// Post-commit test failed
+// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479
+// Temporarily disable USM based tests while investigating the bug.
+// UNSUPPORTED: gpu-intel-dg2
 
 #define GRAPH_E2E_RECORD_REPLAY
 
diff --git a/sycl/test-e2e/Graph/RecordReplay/host_task_single.cpp b/sycl/test-e2e/Graph/RecordReplay/host_task_single.cpp
index 6d86cef96c862..4e2730592ef95 100644
--- a/sycl/test-e2e/Graph/RecordReplay/host_task_single.cpp
+++ b/sycl/test-e2e/Graph/RecordReplay/host_task_single.cpp
@@ -5,6 +5,10 @@
 // Extra run to check for immediate-command-list in Level Zero
 // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
 //
+// Post-commit test failed
+// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479
+// Temporarily disable USM based tests while investigating the bug.
+// UNSUPPORTED: gpu-intel-dg2
 
 #define GRAPH_E2E_RECORD_REPLAY
 
diff --git a/sycl/test-e2e/Graph/RecordReplay/host_task_successive.cpp b/sycl/test-e2e/Graph/RecordReplay/host_task_successive.cpp
index 51da588ab9444..e5570cb61a2d3 100644
--- a/sycl/test-e2e/Graph/RecordReplay/host_task_successive.cpp
+++ b/sycl/test-e2e/Graph/RecordReplay/host_task_successive.cpp
@@ -5,6 +5,10 @@
 // Extra run to check for immediate-command-list in Level Zero
 // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
 //
+// Post-commit test failed
+// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479
+// Temporarily disable USM based tests while investigating the bug.
+// UNSUPPORTED: gpu-intel-dg2
 
 #define GRAPH_E2E_RECORD_REPLAY
 
diff --git a/sycl/test-e2e/Graph/RecordReplay/memadvise.cpp b/sycl/test-e2e/Graph/RecordReplay/memadvise.cpp
index a5b87f8143832..1f7989f9a0361 100644
--- a/sycl/test-e2e/Graph/RecordReplay/memadvise.cpp
+++ b/sycl/test-e2e/Graph/RecordReplay/memadvise.cpp
@@ -2,7 +2,10 @@
 // RUN: %if linux && (level_zero || cuda) %{ env SYCL_PI_TRACE=2 %{run} %t.out 2>&1 FileCheck %s %} %else %{ %{run} %t.out %}
 
 // Mem advise command not supported for OpenCL
-// UNSUPPORTED: opencl
+// Post-commit test failed
+// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479
+// Temporarily disable USM based tests while investigating the bug.
+// UNSUPPORTED: opencl, gpu-intel-dg2
 
 // Since Mem advise is only a memory hint that doesn't
 // impact results but only performances, we verify
diff --git a/sycl/test-e2e/Graph/RecordReplay/multiple_exec_graphs.cpp b/sycl/test-e2e/Graph/RecordReplay/multiple_exec_graphs.cpp
index 67b5335de4383..0709d25e225bd 100644
--- a/sycl/test-e2e/Graph/RecordReplay/multiple_exec_graphs.cpp
+++ b/sycl/test-e2e/Graph/RecordReplay/multiple_exec_graphs.cpp
@@ -5,6 +5,10 @@
 // Extra run to check for immediate-command-list in Level Zero
 // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
 //
+// Post-commit test failed
+// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479
+// Temporarily disable USM based tests while investigating the bug.
+// UNSUPPORTED: gpu-intel-dg2
 
 #define GRAPH_E2E_RECORD_REPLAY
 
diff --git a/sycl/test-e2e/Graph/RecordReplay/prefetch.cpp b/sycl/test-e2e/Graph/RecordReplay/prefetch.cpp
index dca297f7772b3..a5a18c08c6b66 100644
--- a/sycl/test-e2e/Graph/RecordReplay/prefetch.cpp
+++ b/sycl/test-e2e/Graph/RecordReplay/prefetch.cpp
@@ -2,7 +2,10 @@
 // RUN: %if linux && (level_zero || cuda) %{ env SYCL_PI_TRACE=2 %{run} %t.out 2>&1 FileCheck %s %} %else %{ %{run} %t.out %}
 
 // prefetch command not supported for OpenCL
-// UNSUPPORTED: opencl
+// Post-commit test failed
+// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479
+// Temporarily disable USM based tests while investigating the bug.
+// UNSUPPORTED: opencl, gpu-intel-dg2
 
 // Since Prefetch is only a memory hint that doesn't
 // impact results but only performances, we verify
diff --git a/sycl/test-e2e/Graph/RecordReplay/queue_constructor_usm.cpp b/sycl/test-e2e/Graph/RecordReplay/queue_constructor_usm.cpp
index aa81923251cb6..a88e462461a04 100644
--- a/sycl/test-e2e/Graph/RecordReplay/queue_constructor_usm.cpp
+++ b/sycl/test-e2e/Graph/RecordReplay/queue_constructor_usm.cpp
@@ -4,6 +4,11 @@
 // RUN: %if level_zero %{env UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s %}
 //
 // CHECK-NOT: LEAK
+//
+// Post-commit test failed
+// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479
+// Temporarily disable the tests while investigating the bug.
+// UNSUPPORTED: gpu-intel-dg2
 
 #define GRAPH_E2E_RECORD_REPLAY
 
diff --git a/sycl/test-e2e/Graph/RecordReplay/queue_shortcuts.cpp b/sycl/test-e2e/Graph/RecordReplay/queue_shortcuts.cpp
index 99a28bd745853..d5e9520417a71 100644
--- a/sycl/test-e2e/Graph/RecordReplay/queue_shortcuts.cpp
+++ b/sycl/test-e2e/Graph/RecordReplay/queue_shortcuts.cpp
@@ -5,6 +5,10 @@
 // Extra run to check for immediate-command-list in Level Zero
 // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
 //
+// Post-commit test failed
+// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479
+// Temporarily disable USM based tests while investigating the bug.
+// UNSUPPORTED: gpu-intel-dg2
 
 #define GRAPH_E2E_RECORD_REPLAY
 
diff --git a/sycl/test-e2e/Graph/RecordReplay/repeated_exec.cpp b/sycl/test-e2e/Graph/RecordReplay/repeated_exec.cpp
index badf7293f49cb..9e2416541e091 100644
--- a/sycl/test-e2e/Graph/RecordReplay/repeated_exec.cpp
+++ b/sycl/test-e2e/Graph/RecordReplay/repeated_exec.cpp
@@ -5,6 +5,10 @@
 // Extra run to check for immediate-command-list in Level Zero
 // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
 //
+// Post-commit test failed
+// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479
+// Temporarily disable USM based tests while investigating the bug.
+// UNSUPPORTED: gpu-intel-dg2
 
 #define GRAPH_E2E_RECORD_REPLAY
 
diff --git a/sycl/test-e2e/Graph/RecordReplay/sub_graph.cpp b/sycl/test-e2e/Graph/RecordReplay/sub_graph.cpp
index 2096a7c7a21f7..37e29e16edf40 100644
--- a/sycl/test-e2e/Graph/RecordReplay/sub_graph.cpp
+++ b/sycl/test-e2e/Graph/RecordReplay/sub_graph.cpp
@@ -5,6 +5,10 @@
 // Extra run to check for immediate-command-list in Level Zero
 // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
 //
+// Post-commit test failed
+// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479
+// Temporarily disable the tests while investigating the bug.
+// UNSUPPORTED: gpu-intel-dg2
 
 #define GRAPH_E2E_RECORD_REPLAY
 
diff --git a/sycl/test-e2e/Graph/RecordReplay/sub_graph_execute_without_parent.cpp b/sycl/test-e2e/Graph/RecordReplay/sub_graph_execute_without_parent.cpp
index f1aeef51f1c31..b7442d24c6f1d 100644
--- a/sycl/test-e2e/Graph/RecordReplay/sub_graph_execute_without_parent.cpp
+++ b/sycl/test-e2e/Graph/RecordReplay/sub_graph_execute_without_parent.cpp
@@ -5,6 +5,10 @@
 // Extra run to check for immediate-command-list in Level Zero
 // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
 //
+// Post-commit test failed
+// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479
+// Temporarily disable USM based tests while investigating the bug.
+// UNSUPPORTED: gpu-intel-dg2
 
 #define GRAPH_E2E_RECORD_REPLAY
 
diff --git a/sycl/test-e2e/Graph/RecordReplay/sub_graph_multiple_submission.cpp b/sycl/test-e2e/Graph/RecordReplay/sub_graph_multiple_submission.cpp
index d26f99a34c290..57b4c1cd3abdd 100644
--- a/sycl/test-e2e/Graph/RecordReplay/sub_graph_multiple_submission.cpp
+++ b/sycl/test-e2e/Graph/RecordReplay/sub_graph_multiple_submission.cpp
@@ -5,6 +5,10 @@
 // Extra run to check for immediate-command-list in Level Zero
 // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
 //
+// Post-commit test failed
+// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479
+// Temporarily disable USM based tests while investigating the bug.
+// UNSUPPORTED: gpu-intel-dg2
 
 #define GRAPH_E2E_RECORD_REPLAY
 
diff --git a/sycl/test-e2e/Graph/RecordReplay/sub_graph_nested.cpp b/sycl/test-e2e/Graph/RecordReplay/sub_graph_nested.cpp
index f2b7ca35a77d7..e78794733dc42 100644
--- a/sycl/test-e2e/Graph/RecordReplay/sub_graph_nested.cpp
+++ b/sycl/test-e2e/Graph/RecordReplay/sub_graph_nested.cpp
@@ -5,6 +5,10 @@
 // Extra run to check for immediate-command-list in Level Zero
 // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
 //
+// Post-commit test failed
+// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479
+// Temporarily disable USM based tests while investigating the bug.
+// UNSUPPORTED: gpu-intel-dg2
 
 #define GRAPH_E2E_RECORD_REPLAY
 
diff --git a/sycl/test-e2e/Graph/RecordReplay/sub_graph_two_parent_graphs.cpp b/sycl/test-e2e/Graph/RecordReplay/sub_graph_two_parent_graphs.cpp
index a1d0373290cc6..8c3c5f0980c55 100644
--- a/sycl/test-e2e/Graph/RecordReplay/sub_graph_two_parent_graphs.cpp
+++ b/sycl/test-e2e/Graph/RecordReplay/sub_graph_two_parent_graphs.cpp
@@ -5,6 +5,10 @@
 // Extra run to check for immediate-command-list in Level Zero
 // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
 //
+// Post-commit test failed
+// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479
+// Temporarily disable USM based tests while investigating the bug.
+// UNSUPPORTED: gpu-intel-dg2
 
 #define GRAPH_E2E_RECORD_REPLAY
 
diff --git a/sycl/test-e2e/Graph/RecordReplay/usm_copy.cpp b/sycl/test-e2e/Graph/RecordReplay/usm_copy.cpp
index 09486c4ebc8da..1fdf104310623 100644
--- a/sycl/test-e2e/Graph/RecordReplay/usm_copy.cpp
+++ b/sycl/test-e2e/Graph/RecordReplay/usm_copy.cpp
@@ -5,9 +5,11 @@
 // Extra run to check for immediate-command-list in Level Zero
 // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
 //
-//
 // USM copy command not supported for OpenCL
-// UNSUPPORTED: opencl
+// Post-commit test failed
+// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479
+// Temporarily disable the tests while investigating the bug.
+// UNSUPPORTED: opencl, gpu-intel-dg2
 
 #define GRAPH_E2E_RECORD_REPLAY
 
diff --git a/sycl/test-e2e/Graph/RecordReplay/usm_fill.cpp b/sycl/test-e2e/Graph/RecordReplay/usm_fill.cpp
index 8e89ecc693ed7..66a9589d9e242 100644
--- a/sycl/test-e2e/Graph/RecordReplay/usm_fill.cpp
+++ b/sycl/test-e2e/Graph/RecordReplay/usm_fill.cpp
@@ -5,6 +5,10 @@
 // Extra run to check for immediate-command-list in Level Zero
 // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
 //
+// Post-commit test failed
+// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479
+// Temporarily disable USM based tests while investigating the bug.
+// UNSUPPORTED: gpu-intel-dg2
 
 #define GRAPH_E2E_RECORD_REPLAY
 
diff --git a/sycl/test-e2e/Graph/RecordReplay/usm_fill_shared.cpp b/sycl/test-e2e/Graph/RecordReplay/usm_fill_shared.cpp
index 401cdc7aa430f..ed959ca5a400b 100644
--- a/sycl/test-e2e/Graph/RecordReplay/usm_fill_shared.cpp
+++ b/sycl/test-e2e/Graph/RecordReplay/usm_fill_shared.cpp
@@ -5,6 +5,10 @@
 // Extra run to check for immediate-command-list in Level Zero
 // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
 //
+// Post-commit test failed
+// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479
+// Temporarily disable USM based tests while investigating the bug.
+// UNSUPPORTED: gpu-intel-dg2
 
 #define GRAPH_E2E_RECORD_REPLAY
 
diff --git a/sycl/test-e2e/Graph/RecordReplay/usm_memset.cpp b/sycl/test-e2e/Graph/RecordReplay/usm_memset.cpp
index 137b3df19cf02..86c533110f26b 100644
--- a/sycl/test-e2e/Graph/RecordReplay/usm_memset.cpp
+++ b/sycl/test-e2e/Graph/RecordReplay/usm_memset.cpp
@@ -6,7 +6,10 @@
 // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
 
 // USM memset command not supported for OpenCL
-// UNSUPPORTED: opencl
+// Post-commit test failed
+// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479
+// Temporarily disable USM based tests while investigating the bug.
+// UNSUPPORTED: opencl, gpu-intel-dg2
 
 #define GRAPH_E2E_RECORD_REPLAY
 
diff --git a/sycl/test-e2e/Graph/empty_graph.cpp b/sycl/test-e2e/Graph/empty_graph.cpp
index ff4984603d908..c574d65e9357f 100644
--- a/sycl/test-e2e/Graph/empty_graph.cpp
+++ b/sycl/test-e2e/Graph/empty_graph.cpp
@@ -5,6 +5,10 @@
 // Extra run to check for immediate-command-list in Level Zero
 // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
 //
+// Post-commit test failed
+// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479
+// Temporarily disable USM based tests while investigating the bug.
+// UNSUPPORTED: gpu-intel-dg2
 
 // Tests the ability to finalize and submit a command graph which doesn't
 // contain any nodes.
diff --git a/sycl/test-e2e/InlineAsm/Negative/asm_bad_opcode.cpp b/sycl/test-e2e/InlineAsm/Negative/asm_bad_opcode.cpp
index 935df8cab7afa..a16b903c09a84 100644
--- a/sycl/test-e2e/InlineAsm/Negative/asm_bad_opcode.cpp
+++ b/sycl/test-e2e/InlineAsm/Negative/asm_bad_opcode.cpp
@@ -1,6 +1,6 @@
 // UNSUPPORTED: cuda || hip
 // UNSUPPORTED: ze_debug
-// REQUIRES: gpu,linux
+// REQUIRES: gpu,linux,sg-16
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
@@ -13,7 +13,7 @@ struct KernelFunctor {
   void operator()(sycl::handler &cgh) {
     cgh.parallel_for<KernelFunctor>(
         sycl::range<1>{16},
-        [=](sycl::id<1> wiID) [[intel::reqd_sub_group_size(16)]] {
+        [=](sycl::id<1> wiID) [[sycl::reqd_sub_group_size(16)]] {
 #if defined(__SYCL_DEVICE_ONLY__)
           asm volatile(".decl tmp1 v_type=G type=d num_elts=16 align=GRF\n"
                        ".decl tmp2 v_type=G type=d num_elts=16 align=GRF\n"
@@ -25,6 +25,6 @@ struct KernelFunctor {
 
 int main() {
   KernelFunctor f;
-  launchInlineASMTest(f, /* sg size */ true, /* exception expected */ true);
+  launchInlineASMTest(f, {16}, /* exception expected */ true);
   return 0;
 }
diff --git a/sycl/test-e2e/InlineAsm/Negative/asm_bad_operand_syntax.cpp b/sycl/test-e2e/InlineAsm/Negative/asm_bad_operand_syntax.cpp
index cf21d2cd84714..ea7434dd31498 100644
--- a/sycl/test-e2e/InlineAsm/Negative/asm_bad_operand_syntax.cpp
+++ b/sycl/test-e2e/InlineAsm/Negative/asm_bad_operand_syntax.cpp
@@ -1,6 +1,6 @@
 // UNSUPPORTED: cuda || hip
 // UNSUPPORTED: ze_debug
-// REQUIRES: gpu,linux
+// REQUIRES: gpu,linux,sg-16
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
@@ -13,7 +13,7 @@ struct KernelFunctor {
   void operator()(sycl::handler &cgh) {
     cgh.parallel_for<KernelFunctor>(
         sycl::range<1>{16},
-        [=](sycl::id<1> wiID) [[intel::reqd_sub_group_size(16)]] {
+        [=](sycl::id<1> wiID) [[sycl::reqd_sub_group_size(16)]] {
 #if defined(__SYCL_DEVICE_ONLY__)
           asm volatile(".decl tmp1 v_type=G type=d num_elts=16 align=GRF\n"
                        ".decl tmp2 v_type=G type=d num_elts=16 align=GRF\n"
@@ -25,6 +25,6 @@ struct KernelFunctor {
 
 int main() {
   KernelFunctor f;
-  launchInlineASMTest(f, /* sg size */ true, /* exception expected */ true);
+  launchInlineASMTest(f, {16}, /* exception expected */ true);
   return 0;
 }
diff --git a/sycl/test-e2e/InlineAsm/Negative/asm_duplicate_label.cpp b/sycl/test-e2e/InlineAsm/Negative/asm_duplicate_label.cpp
index 50f63b777d647..58978e203d8a8 100644
--- a/sycl/test-e2e/InlineAsm/Negative/asm_duplicate_label.cpp
+++ b/sycl/test-e2e/InlineAsm/Negative/asm_duplicate_label.cpp
@@ -1,6 +1,6 @@
 // UNSUPPORTED: cuda || hip
 // UNSUPPORTED: ze_debug
-// REQUIRES: gpu,linux
+// REQUIRES: gpu,linux,sg-16
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
@@ -13,7 +13,7 @@ struct KernelFunctor {
   void operator()(sycl::handler &cgh) {
     cgh.parallel_for<KernelFunctor>(
         sycl::range<1>{16},
-        [=](sycl::id<1> wiID) [[intel::reqd_sub_group_size(16)]] {
+        [=](sycl::id<1> wiID) [[sycl::reqd_sub_group_size(16)]] {
 #if defined(__SYCL_DEVICE_ONLY__)
           asm volatile(".decl tmp1 v_type=G type=d num_elts=16 align=GRF\n"
                        ".decl tmp2 v_type=G type=d num_elts=16 align=GRF\n"
@@ -25,6 +25,6 @@ struct KernelFunctor {
 
 int main() {
   KernelFunctor f;
-  launchInlineASMTest(f, /* sg size */ true, /* exception expected */ true);
+  launchInlineASMTest(f, {16}, /* exception expected */ true);
   return 0;
 }
diff --git a/sycl/test-e2e/InlineAsm/Negative/asm_illegal_exec_size.cpp b/sycl/test-e2e/InlineAsm/Negative/asm_illegal_exec_size.cpp
index 053c48e53e353..98d9b1ff1520d 100644
--- a/sycl/test-e2e/InlineAsm/Negative/asm_illegal_exec_size.cpp
+++ b/sycl/test-e2e/InlineAsm/Negative/asm_illegal_exec_size.cpp
@@ -1,6 +1,6 @@
 // UNSUPPORTED: cuda || hip
 // UNSUPPORTED: ze_debug
-// REQUIRES: gpu,linux
+// REQUIRES: gpu,linux,sg-16
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
@@ -13,7 +13,7 @@ struct KernelFunctor {
   void operator()(sycl::handler &cgh) {
     cgh.parallel_for<KernelFunctor>(
         sycl::range<1>{16},
-        [=](sycl::id<1> wiID) [[intel::reqd_sub_group_size(16)]] {
+        [=](sycl::id<1> wiID) [[sycl::reqd_sub_group_size(16)]] {
 #if defined(__SYCL_DEVICE_ONLY__)
           asm volatile(".decl tmp1 v_type=G type=d num_elts=16 align=GRF\n"
                        ".decl tmp2 v_type=G type=d num_elts=16 align=GRF\n"
@@ -25,6 +25,6 @@ struct KernelFunctor {
 
 int main() {
   KernelFunctor f;
-  launchInlineASMTest(f, /* sg size */ true, /* exception expected */ true);
+  launchInlineASMTest(f, {16}, /* exception expected */ true);
   return 0;
 }
diff --git a/sycl/test-e2e/InlineAsm/Negative/asm_missing_label.cpp b/sycl/test-e2e/InlineAsm/Negative/asm_missing_label.cpp
index 4754d32a57b77..47f0fd98311ae 100644
--- a/sycl/test-e2e/InlineAsm/Negative/asm_missing_label.cpp
+++ b/sycl/test-e2e/InlineAsm/Negative/asm_missing_label.cpp
@@ -1,6 +1,6 @@
 // UNSUPPORTED: cuda || hip
 // UNSUPPORTED: ze_debug
-// REQUIRES: gpu,linux
+// REQUIRES: gpu,linux,sg-16
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
@@ -13,7 +13,7 @@ struct KernelFunctor {
   void operator()(sycl::handler &cgh) {
     cgh.parallel_for<KernelFunctor>(
         sycl::range<1>{16},
-        [=](sycl::id<1> wiID) [[intel::reqd_sub_group_size(16)]] {
+        [=](sycl::id<1> wiID) [[sycl::reqd_sub_group_size(16)]] {
 #if defined(__SYCL_DEVICE_ONLY__)
           asm volatile(".decl tmp1 v_type=G type=d num_elts=16 align=GRF\n"
                        ".decl tmp2 v_type=G type=d num_elts=16 align=GRF\n"
@@ -25,6 +25,6 @@ struct KernelFunctor {
 
 int main() {
   KernelFunctor f;
-  launchInlineASMTest(f, /* sg size */ true, /* exception expected */ true);
+  launchInlineASMTest(f, {16}, /* exception expected */ true);
   return 0;
 }
diff --git a/sycl/test-e2e/InlineAsm/Negative/asm_missing_region.cpp b/sycl/test-e2e/InlineAsm/Negative/asm_missing_region.cpp
index cf7ab9f6d8a6e..395eb4af68c03 100644
--- a/sycl/test-e2e/InlineAsm/Negative/asm_missing_region.cpp
+++ b/sycl/test-e2e/InlineAsm/Negative/asm_missing_region.cpp
@@ -1,6 +1,6 @@
 // UNSUPPORTED: cuda || hip
 // UNSUPPORTED: ze_debug
-// REQUIRES: gpu,linux
+// REQUIRES: gpu,linux,sg-16
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
@@ -13,7 +13,7 @@ struct KernelFunctor {
   void operator()(sycl::handler &cgh) {
     cgh.parallel_for<KernelFunctor>(
         sycl::range<1>{16},
-        [=](sycl::id<1> wiID) [[intel::reqd_sub_group_size(16)]] {
+        [=](sycl::id<1> wiID) [[sycl::reqd_sub_group_size(16)]] {
 #if defined(__SYCL_DEVICE_ONLY__)
           asm volatile(".decl tmp1 v_type=G type=d num_elts=16 align=GRF\n"
                        ".decl tmp2 v_type=G type=d num_elts=16 align=GRF\n"
@@ -25,6 +25,6 @@ struct KernelFunctor {
 
 int main() {
   KernelFunctor f;
-  launchInlineASMTest(f, /* sg size */ true, /* exception expected */ true);
+  launchInlineASMTest(f, {16}, /* exception expected */ true);
   return 0;
 }
diff --git a/sycl/test-e2e/InlineAsm/Negative/asm_simple.cpp b/sycl/test-e2e/InlineAsm/Negative/asm_simple.cpp
index 4a24d6dbd441e..e36a15cf1cbf4 100644
--- a/sycl/test-e2e/InlineAsm/Negative/asm_simple.cpp
+++ b/sycl/test-e2e/InlineAsm/Negative/asm_simple.cpp
@@ -1,6 +1,6 @@
 // UNSUPPORTED: cuda || hip
 // UNSUPPORTED: ze_debug
-// REQUIRES: gpu,linux
+// REQUIRES: gpu,linux,sg-16
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
@@ -13,7 +13,7 @@ struct KernelFunctor {
   void operator()(sycl::handler &cgh) {
     cgh.parallel_for<KernelFunctor>(
         sycl::range<1>{16},
-        [=](sycl::id<1> wiID) [[intel::reqd_sub_group_size(16)]] {
+        [=](sycl::id<1> wiID) [[sycl::reqd_sub_group_size(16)]] {
 #if defined(__SYCL_DEVICE_ONLY__)
           asm volatile(".decl tmp1 v_type=G type=d num_elts=16 align=GRF\n"
                        ".decl tmp2 v_type=G type=d num_elts=16 align=GRF\n"
@@ -25,6 +25,6 @@ struct KernelFunctor {
 
 int main() {
   KernelFunctor f;
-  launchInlineASMTest(f, /* sg size */ true, /* exception expected */ true);
+  launchInlineASMTest(f, {16}, /* exception expected */ true);
   return 0;
 }
diff --git a/sycl/test-e2e/InlineAsm/Negative/asm_undefined_decl.cpp b/sycl/test-e2e/InlineAsm/Negative/asm_undefined_decl.cpp
index ec6f1026d9bea..53b64cf2ba2fd 100644
--- a/sycl/test-e2e/InlineAsm/Negative/asm_undefined_decl.cpp
+++ b/sycl/test-e2e/InlineAsm/Negative/asm_undefined_decl.cpp
@@ -1,6 +1,6 @@
 // UNSUPPORTED: cuda || hip
 // UNSUPPORTED: ze_debug
-// REQUIRES: gpu,linux
+// REQUIRES: gpu,linux,sg-16
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
@@ -13,7 +13,7 @@ struct KernelFunctor {
   void operator()(sycl::handler &cgh) {
     cgh.parallel_for<KernelFunctor>(
         sycl::range<1>{16},
-        [=](sycl::id<1> wiID) [[intel::reqd_sub_group_size(16)]] {
+        [=](sycl::id<1> wiID) [[sycl::reqd_sub_group_size(16)]] {
 #if defined(__SYCL_DEVICE_ONLY__)
           asm volatile(".decl tmp1 v_type=G type=d num_elts=16 align=GRF\n"
                        ".decl tmp2 v_type=G type=d num_elts=16 align=GRF\n"
@@ -25,6 +25,6 @@ struct KernelFunctor {
 
 int main() {
   KernelFunctor f;
-  launchInlineASMTest(f, /* sg size */ true, /* exception expected */ true);
+  launchInlineASMTest(f, {16}, /* exception expected */ true);
   return 0;
 }
diff --git a/sycl/test-e2e/InlineAsm/Negative/asm_undefined_pred.cpp b/sycl/test-e2e/InlineAsm/Negative/asm_undefined_pred.cpp
index b76357962f0fc..0ce42082fe2ef 100644
--- a/sycl/test-e2e/InlineAsm/Negative/asm_undefined_pred.cpp
+++ b/sycl/test-e2e/InlineAsm/Negative/asm_undefined_pred.cpp
@@ -1,6 +1,6 @@
 // UNSUPPORTED: cuda || hip
 // UNSUPPORTED: ze_debug
-// REQUIRES: gpu,linux
+// REQUIRES: gpu,linux,sg-16
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
@@ -13,7 +13,7 @@ struct KernelFunctor {
   void operator()(sycl::handler &cgh) {
     cgh.parallel_for<KernelFunctor>(
         sycl::range<1>{16},
-        [=](sycl::id<1> wiID) [[intel::reqd_sub_group_size(16)]] {
+        [=](sycl::id<1> wiID) [[sycl::reqd_sub_group_size(16)]] {
 #if defined(__SYCL_DEVICE_ONLY__)
           asm volatile(".decl tmp1 v_type=G type=d num_elts=16 align=GRF\n"
                        ".decl tmp2 v_type=G type=d num_elts=16 align=GRF\n"
@@ -25,6 +25,6 @@ struct KernelFunctor {
 
 int main() {
   KernelFunctor f;
-  launchInlineASMTest(f, /* sg size */ true, /* exception expected */ true);
+  launchInlineASMTest(f, {16}, /* exception expected */ true);
   return 0;
 }
diff --git a/sycl/test-e2e/InlineAsm/Negative/asm_wrong_declare.cpp b/sycl/test-e2e/InlineAsm/Negative/asm_wrong_declare.cpp
index 3b2d8aa1353ee..86151cdc7cab7 100644
--- a/sycl/test-e2e/InlineAsm/Negative/asm_wrong_declare.cpp
+++ b/sycl/test-e2e/InlineAsm/Negative/asm_wrong_declare.cpp
@@ -1,6 +1,6 @@
 // UNSUPPORTED: cuda || hip
 // UNSUPPORTED: ze_debug
-// REQUIRES: gpu,linux
+// REQUIRES: gpu,linux,sg-16
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
@@ -13,7 +13,7 @@ struct KernelFunctor {
   void operator()(sycl::handler &cgh) {
     cgh.parallel_for<KernelFunctor>(
         sycl::range<1>{16},
-        [=](sycl::id<1> wiID) [[intel::reqd_sub_group_size(16)]] {
+        [=](sycl::id<1> wiID) [[sycl::reqd_sub_group_size(16)]] {
 #if defined(__SYCL_DEVICE_ONLY__)
           asm volatile(".decl tmp1 v_type=G type=d num_elts=16 align=GRF\n"
                        ".decl tmp2 v_type=G type=d num_elts=16 align=GRF\n"
@@ -25,6 +25,6 @@ struct KernelFunctor {
 
 int main() {
   KernelFunctor f;
-  launchInlineASMTest(f, /* sg size */ true, /* exception expected */ true);
+  launchInlineASMTest(f, {16}, /* exception expected */ true);
   return 0;
 }
diff --git a/sycl/test-e2e/InlineAsm/asm_16_empty.cpp b/sycl/test-e2e/InlineAsm/asm_16_empty.cpp
index 71ca270f231cd..78321716777ac 100644
--- a/sycl/test-e2e/InlineAsm/asm_16_empty.cpp
+++ b/sycl/test-e2e/InlineAsm/asm_16_empty.cpp
@@ -1,5 +1,5 @@
 // UNSUPPORTED: cuda || hip_nvidia
-// REQUIRES: gpu,linux
+// REQUIRES: gpu,linux,sg-16
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
@@ -19,7 +19,7 @@ template <typename T = dataType> struct KernelFunctor : WithOutputBuffer<T> {
             cgh);
     cgh.parallel_for<KernelFunctor<T>>(
         sycl::range<1>{this->getOutputBufferSize()},
-        [=](sycl::id<1> wiID) [[intel::reqd_sub_group_size(16)]] {
+        [=](sycl::id<1> wiID) [[sycl::reqd_sub_group_size(16)]] {
           C[wiID] = 43;
 #if defined(__SYCL_DEVICE_ONLY__)
           asm volatile("");
@@ -30,7 +30,7 @@ template <typename T = dataType> struct KernelFunctor : WithOutputBuffer<T> {
 
 int main() {
   KernelFunctor<> f(DEFAULT_PROBLEM_SIZE);
-  if (!launchInlineASMTest(f))
+  if (!launchInlineASMTest(f, {16}))
     return 0;
 
   if (verify_all_the_same(f.getOutputBufferData(), 43))
diff --git a/sycl/test-e2e/InlineAsm/asm_16_matrix_mult.cpp b/sycl/test-e2e/InlineAsm/asm_16_matrix_mult.cpp
index c28616ee70750..00c3eb3830f38 100644
--- a/sycl/test-e2e/InlineAsm/asm_16_matrix_mult.cpp
+++ b/sycl/test-e2e/InlineAsm/asm_16_matrix_mult.cpp
@@ -1,5 +1,5 @@
 // UNSUPPORTED: cuda, hip
-// REQUIRES: gpu,linux
+// REQUIRES: gpu,linux,sg-16
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
@@ -19,7 +19,7 @@ template <typename T = dataType> struct KernelFunctor : WithOutputBuffer<T> {
             cgh);
     cgh.parallel_for<KernelFunctor<T>>(
         sycl::range<1>{this->getOutputBufferSize()},
-        [=](sycl::id<1> wiID) [[intel::reqd_sub_group_size(16)]] {
+        [=](sycl::id<1> wiID) [[sycl::reqd_sub_group_size(16)]] {
           volatile int output = 0;
 #if defined(__SYCL_DEVICE_ONLY__)
           asm volatile("mov (M1,16) %0(0,0)<1> 0x7:d" : "=rw"(output));
@@ -33,7 +33,7 @@ template <typename T = dataType> struct KernelFunctor : WithOutputBuffer<T> {
 
 int main() {
   KernelFunctor<> f(DEFAULT_PROBLEM_SIZE);
-  if (!launchInlineASMTest(f))
+  if (!launchInlineASMTest(f, {16}))
     return 0;
 
   if (verify_all_the_same(f.getOutputBufferData(), 7))
diff --git a/sycl/test-e2e/InlineAsm/asm_16_no_input_int.cpp b/sycl/test-e2e/InlineAsm/asm_16_no_input_int.cpp
index c28616ee70750..00c3eb3830f38 100644
--- a/sycl/test-e2e/InlineAsm/asm_16_no_input_int.cpp
+++ b/sycl/test-e2e/InlineAsm/asm_16_no_input_int.cpp
@@ -1,5 +1,5 @@
 // UNSUPPORTED: cuda, hip
-// REQUIRES: gpu,linux
+// REQUIRES: gpu,linux,sg-16
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
@@ -19,7 +19,7 @@ template <typename T = dataType> struct KernelFunctor : WithOutputBuffer<T> {
             cgh);
     cgh.parallel_for<KernelFunctor<T>>(
         sycl::range<1>{this->getOutputBufferSize()},
-        [=](sycl::id<1> wiID) [[intel::reqd_sub_group_size(16)]] {
+        [=](sycl::id<1> wiID) [[sycl::reqd_sub_group_size(16)]] {
           volatile int output = 0;
 #if defined(__SYCL_DEVICE_ONLY__)
           asm volatile("mov (M1,16) %0(0,0)<1> 0x7:d" : "=rw"(output));
@@ -33,7 +33,7 @@ template <typename T = dataType> struct KernelFunctor : WithOutputBuffer<T> {
 
 int main() {
   KernelFunctor<> f(DEFAULT_PROBLEM_SIZE);
-  if (!launchInlineASMTest(f))
+  if (!launchInlineASMTest(f, {16}))
     return 0;
 
   if (verify_all_the_same(f.getOutputBufferData(), 7))
diff --git a/sycl/test-e2e/InlineAsm/asm_16_no_opts.cpp b/sycl/test-e2e/InlineAsm/asm_16_no_opts.cpp
index 11d647cdfbe1d..812e30a7c9f56 100644
--- a/sycl/test-e2e/InlineAsm/asm_16_no_opts.cpp
+++ b/sycl/test-e2e/InlineAsm/asm_16_no_opts.cpp
@@ -1,5 +1,5 @@
 // UNSUPPORTED: cuda, hip
-// REQUIRES: gpu,linux
+// REQUIRES: gpu,linux,sg-16
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
@@ -19,7 +19,7 @@ template <typename T = dataType> struct KernelFunctor : WithOutputBuffer<T> {
             cgh);
     cgh.parallel_for<KernelFunctor<T>>(
         sycl::range<1>{this->getOutputBufferSize()},
-        [=](sycl::id<1> wiID) [[intel::reqd_sub_group_size(16)]] {
+        [=](sycl::id<1> wiID) [[sycl::reqd_sub_group_size(16)]] {
           for (int i = 0; i < 10; ++i) {
 #if defined(__SYCL_DEVICE_ONLY__)
             asm("fence_sw");
@@ -35,7 +35,7 @@ template <typename T = dataType> struct KernelFunctor : WithOutputBuffer<T> {
 
 int main() {
   KernelFunctor<> f(DEFAULT_PROBLEM_SIZE);
-  if (!launchInlineASMTest(f))
+  if (!launchInlineASMTest(f, {16}))
     return 0;
 
   if (verify_all_the_same(f.getOutputBufferData(), 45))
diff --git a/sycl/test-e2e/InlineAsm/asm_8_empty.cpp b/sycl/test-e2e/InlineAsm/asm_8_empty.cpp
index d43b9cd83ac81..26d0952225901 100644
--- a/sycl/test-e2e/InlineAsm/asm_8_empty.cpp
+++ b/sycl/test-e2e/InlineAsm/asm_8_empty.cpp
@@ -1,5 +1,5 @@
 // UNSUPPORTED: cuda || hip_nvidia
-// REQUIRES: gpu,linux
+// REQUIRES: gpu,linux,sg-8
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
@@ -19,7 +19,7 @@ template <typename T = dataType> struct KernelFunctor : WithOutputBuffer<T> {
             cgh);
     cgh.parallel_for<KernelFunctor<T>>(
         sycl::range<1>{this->getOutputBufferSize()},
-        [=](sycl::id<1> wiID) [[intel::reqd_sub_group_size(8)]] {
+        [=](sycl::id<1> wiID) [[sycl::reqd_sub_group_size(8)]] {
           C[wiID] = 43;
 #if defined(__SYCL_DEVICE_ONLY__)
           asm volatile("");
@@ -30,7 +30,7 @@ template <typename T = dataType> struct KernelFunctor : WithOutputBuffer<T> {
 
 int main() {
   KernelFunctor<> f(DEFAULT_PROBLEM_SIZE);
-  if (!launchInlineASMTest(f, true, false, {8}))
+  if (!launchInlineASMTest(f, {8}))
     return 0;
 
   if (verify_all_the_same(f.getOutputBufferData(), 43))
diff --git a/sycl/test-e2e/InlineAsm/asm_8_no_input_int.cpp b/sycl/test-e2e/InlineAsm/asm_8_no_input_int.cpp
index ab15d58164e3f..e671acc825e7b 100644
--- a/sycl/test-e2e/InlineAsm/asm_8_no_input_int.cpp
+++ b/sycl/test-e2e/InlineAsm/asm_8_no_input_int.cpp
@@ -1,5 +1,5 @@
 // UNSUPPORTED: cuda, hip
-// REQUIRES: gpu,linux
+// REQUIRES: gpu,linux,sg-8
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
@@ -19,7 +19,7 @@ template <typename T = dataType> struct KernelFunctor : WithOutputBuffer<T> {
             cgh);
     cgh.parallel_for<KernelFunctor<T>>(
         sycl::range<1>{this->getOutputBufferSize()},
-        [=](sycl::id<1> wiID) [[intel::reqd_sub_group_size(8)]] {
+        [=](sycl::id<1> wiID) [[sycl::reqd_sub_group_size(8)]] {
           volatile int output = 0;
 #if defined(__SYCL_DEVICE_ONLY__)
           asm volatile("mov (M1,8) %0(0,0)<1> 0x7:d" : "=rw"(output));
@@ -33,7 +33,7 @@ template <typename T = dataType> struct KernelFunctor : WithOutputBuffer<T> {
 
 int main() {
   KernelFunctor<> f(DEFAULT_PROBLEM_SIZE);
-  if (!launchInlineASMTest(f, true, false, {8}))
+  if (!launchInlineASMTest(f, {8}))
     return 0;
 
   if (verify_all_the_same(f.getOutputBufferData(), 7))
diff --git a/sycl/test-e2e/InlineAsm/asm_arbitrary_ops_order.cpp b/sycl/test-e2e/InlineAsm/asm_arbitrary_ops_order.cpp
index 401dfbcacb63c..739feed41d6c9 100644
--- a/sycl/test-e2e/InlineAsm/asm_arbitrary_ops_order.cpp
+++ b/sycl/test-e2e/InlineAsm/asm_arbitrary_ops_order.cpp
@@ -1,5 +1,5 @@
 // UNSUPPORTED: cuda, hip
-// REQUIRES: gpu,linux
+// REQUIRES: gpu,linux,sg-16
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
@@ -33,7 +33,7 @@ struct KernelFunctor : WithInputBuffers<T, 3>, WithOutputBuffer<T> {
 
     cgh.parallel_for<KernelFunctor<T>>(
         sycl::range<1>{this->getOutputBufferSize()},
-        [=](sycl::id<1> wiID) [[intel::reqd_sub_group_size(16)]] {
+        [=](sycl::id<1> wiID) [[sycl::reqd_sub_group_size(16)]] {
 #if defined(__SYCL_DEVICE_ONLY__)
           asm("mad (M1, 16) %0(0, 0)<1> %1(0, 0)<1;1,0> %2(0, 0)<1;1,0> %3(0, "
               "0)<1;1,0>"
@@ -56,7 +56,7 @@ int main() {
   }
 
   KernelFunctor<> f(inputA, inputB, inputC);
-  if (!launchInlineASMTest(f))
+  if (!launchInlineASMTest(f, {16}))
     return 0;
 
   auto &D = f.getOutputBufferData();
diff --git a/sycl/test-e2e/InlineAsm/asm_decl_in_scope.cpp b/sycl/test-e2e/InlineAsm/asm_decl_in_scope.cpp
index dd7cbc6739816..60d200d1c99df 100644
--- a/sycl/test-e2e/InlineAsm/asm_decl_in_scope.cpp
+++ b/sycl/test-e2e/InlineAsm/asm_decl_in_scope.cpp
@@ -1,5 +1,5 @@
 // UNSUPPORTED: cuda, hip
-// REQUIRES: gpu,linux
+// REQUIRES: gpu,linux,sg-16
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
@@ -29,7 +29,7 @@ struct KernelFunctor : WithInputBuffers<T, 2>, WithOutputBuffer<T> {
 
     cgh.parallel_for<KernelFunctor<T>>(
         sycl::range<1>{this->getOutputBufferSize()},
-        [=](sycl::id<1> wiID) [[intel::reqd_sub_group_size(16)]] {
+        [=](sycl::id<1> wiID) [[sycl::reqd_sub_group_size(16)]] {
     // declaration of temp within and outside the scope
 #if defined(__SYCL_DEVICE_ONLY__)
           asm("{\n"
@@ -59,7 +59,7 @@ int main() {
   }
 
   KernelFunctor<> f(inputA, inputB);
-  if (!launchInlineASMTest(f))
+  if (!launchInlineASMTest(f, {16}))
     return 0;
 
   auto &C = f.getOutputBufferData();
diff --git a/sycl/test-e2e/InlineAsm/asm_float_add.cpp b/sycl/test-e2e/InlineAsm/asm_float_add.cpp
index bd4298b0ddad3..b6374c96d2010 100644
--- a/sycl/test-e2e/InlineAsm/asm_float_add.cpp
+++ b/sycl/test-e2e/InlineAsm/asm_float_add.cpp
@@ -1,5 +1,5 @@
 // UNSUPPORTED: cuda, hip
-// REQUIRES: gpu,linux
+// REQUIRES: gpu,linux,sg-16
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
@@ -30,7 +30,7 @@ struct KernelFunctor : WithInputBuffers<T, 2>, WithOutputBuffer<T> {
 
     cgh.parallel_for<KernelFunctor<T>>(
         sycl::range<1>{this->getOutputBufferSize()},
-        [=](sycl::id<1> wiID) [[intel::reqd_sub_group_size(16)]] {
+        [=](sycl::id<1> wiID) [[sycl::reqd_sub_group_size(16)]] {
 #if defined(__SYCL_DEVICE_ONLY__)
           asm("add (M1, 16) %0(0, 0)<1> %1(0, 0)<1;1,0> %2(0, 0)<1;1,0>"
               : "=rw"(C[wiID])
@@ -51,7 +51,7 @@ int main() {
   }
 
   KernelFunctor<> f(inputA, inputB);
-  if (!launchInlineASMTest(f))
+  if (!launchInlineASMTest(f, {16}))
     return 0;
 
   auto &C = f.getOutputBufferData();
diff --git a/sycl/test-e2e/InlineAsm/asm_float_imm_arg.cpp b/sycl/test-e2e/InlineAsm/asm_float_imm_arg.cpp
index 51cfa81149118..948d983554c4e 100644
--- a/sycl/test-e2e/InlineAsm/asm_float_imm_arg.cpp
+++ b/sycl/test-e2e/InlineAsm/asm_float_imm_arg.cpp
@@ -1,5 +1,5 @@
 // UNSUPPORTED: cuda, hip
-// REQUIRES: gpu,linux
+// REQUIRES: gpu,linux,sg-16
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
@@ -27,7 +27,7 @@ struct KernelFunctor : WithInputBuffers<T, 1>, WithOutputBuffer<T> {
 
     cgh.parallel_for<KernelFunctor<T>>(
         sycl::range<1>{this->getOutputBufferSize()},
-        [=](sycl::id<1> wiID) [[intel::reqd_sub_group_size(16)]] {
+        [=](sycl::id<1> wiID) [[sycl::reqd_sub_group_size(16)]] {
 #if defined(__SYCL_DEVICE_ONLY__)
           asm("mul (M1, 16) %0(0, 0)<1> %1(0, 0)<1;1,0> %2"
               : "=rw"(B[wiID])
@@ -45,7 +45,7 @@ int main() {
     input[i] = (float)1 / std::pow(2, i);
 
   KernelFunctor<> f(input);
-  if (!launchInlineASMTest(f))
+  if (!launchInlineASMTest(f, {16}))
     return 0;
 
   auto &B = f.getOutputBufferData();
diff --git a/sycl/test-e2e/InlineAsm/asm_float_neg.cpp b/sycl/test-e2e/InlineAsm/asm_float_neg.cpp
index 39176ab355c96..43ad56e41222d 100644
--- a/sycl/test-e2e/InlineAsm/asm_float_neg.cpp
+++ b/sycl/test-e2e/InlineAsm/asm_float_neg.cpp
@@ -1,5 +1,5 @@
 // UNSUPPORTED: cuda, hip
-// REQUIRES: gpu,linux
+// REQUIRES: gpu,linux,sg-16
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
@@ -25,7 +25,7 @@ struct KernelFunctor : WithInputBuffers<T, 1>, WithOutputBuffer<T> {
 
     cgh.parallel_for<KernelFunctor<T>>(
         sycl::range<1>{this->getOutputBufferSize()},
-        [=](sycl::id<1> wiID) [[intel::reqd_sub_group_size(16)]] {
+        [=](sycl::id<1> wiID) [[sycl::reqd_sub_group_size(16)]] {
 #if defined(__SYCL_DEVICE_ONLY__)
           asm("mov (M1, 16) %0(0, 0)<1> (-)%1(0, 0)<1;1,0>"
               : "=rw"(B[wiID])
@@ -45,7 +45,7 @@ int main() {
     input[i] = 1.0 / i;
 
   KernelFunctor<> f(input);
-  if (!launchInlineASMTest(f))
+  if (!launchInlineASMTest(f, {16}))
     return 0;
 
   auto &R = f.getOutputBufferData();
diff --git a/sycl/test-e2e/InlineAsm/asm_if.cpp b/sycl/test-e2e/InlineAsm/asm_if.cpp
index 9cee76efee2e0..679980a62aaf7 100644
--- a/sycl/test-e2e/InlineAsm/asm_if.cpp
+++ b/sycl/test-e2e/InlineAsm/asm_if.cpp
@@ -1,5 +1,5 @@
 // UNSUPPORTED: cuda, hip
-// REQUIRES: gpu,linux
+// REQUIRES: gpu,linux,sg-16
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
@@ -18,7 +18,7 @@ template <typename T = DataType> struct KernelFunctor : WithOutputBuffer<T> {
     bool switchField = false;
     CGH.parallel_for<KernelFunctor<T>>(
         sycl::range<1>{this->getOutputBufferSize()},
-        [=](sycl::id<1> wiID) [[intel::reqd_sub_group_size(16)]] {
+        [=](sycl::id<1> wiID) [[sycl::reqd_sub_group_size(16)]] {
           int Output = 0;
 #if defined(__SYCL_DEVICE_ONLY__)
           asm volatile("{\n"
@@ -42,7 +42,7 @@ template <typename T = DataType> struct KernelFunctor : WithOutputBuffer<T> {
 
 int main() {
   KernelFunctor<> Functor(DEFAULT_PROBLEM_SIZE);
-  if (!launchInlineASMTest(Functor))
+  if (!launchInlineASMTest(Functor, {16}))
     return 0;
 
   if (verify_all_the_same(Functor.getOutputBufferData(), 7))
diff --git a/sycl/test-e2e/InlineAsm/asm_imm_arg.cpp b/sycl/test-e2e/InlineAsm/asm_imm_arg.cpp
index 48415e1445af0..d2a53f14691da 100644
--- a/sycl/test-e2e/InlineAsm/asm_imm_arg.cpp
+++ b/sycl/test-e2e/InlineAsm/asm_imm_arg.cpp
@@ -1,5 +1,5 @@
 // UNSUPPORTED: cuda, hip
-// REQUIRES: gpu,linux
+// REQUIRES: gpu,linux,sg-16
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
@@ -26,7 +26,7 @@ struct KernelFunctor : WithInputBuffers<T, 1>, WithOutputBuffer<T> {
 
     cgh.parallel_for<KernelFunctor<T>>(
         sycl::range<1>{this->getOutputBufferSize()},
-        [=](sycl::id<1> wiID) [[intel::reqd_sub_group_size(16)]] {
+        [=](sycl::id<1> wiID) [[sycl::reqd_sub_group_size(16)]] {
 #if defined(__SYCL_DEVICE_ONLY__)
           asm("add (M1, 16) %0(0, 0)<1> %1(0, 0)<1;1,0> %2"
               : "=rw"(B[wiID])
@@ -44,7 +44,7 @@ int main() {
     input[i] = i;
 
   KernelFunctor<> f(input);
-  if (!launchInlineASMTest(f))
+  if (!launchInlineASMTest(f, {16}))
     return 0;
 
   auto &B = f.getOutputBufferData();
diff --git a/sycl/test-e2e/InlineAsm/asm_loop.cpp b/sycl/test-e2e/InlineAsm/asm_loop.cpp
index 0c1cc06568d6a..56518dffbf4e3 100644
--- a/sycl/test-e2e/InlineAsm/asm_loop.cpp
+++ b/sycl/test-e2e/InlineAsm/asm_loop.cpp
@@ -1,5 +1,5 @@
 // UNSUPPORTED: cuda, hip
-// REQUIRES: gpu,linux
+// REQUIRES: gpu,linux,sg-16
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
@@ -29,7 +29,7 @@ struct KernelFunctor : WithInputBuffers<T, 2>, WithOutputBuffer<T> {
             CGH);
     CGH.parallel_for<KernelFunctor<T>>(
         sycl::range<1>{this->getOutputBufferSize()},
-        [=](sycl::id<1> wiID) [[intel::reqd_sub_group_size(16)]] {
+        [=](sycl::id<1> wiID) [[sycl::reqd_sub_group_size(16)]] {
 #if defined(__SYCL_DEVICE_ONLY__)
           asm volatile("{\n"
                        ".decl P1 v_type=P num_elts=16\n"
@@ -67,7 +67,7 @@ int main() {
   }
 
   KernelFunctor<> Functor(InputA, InputB);
-  if (!launchInlineASMTest(Functor))
+  if (!launchInlineASMTest(Functor, {16}))
     return 0;
 
   auto &C = Functor.getOutputBufferData();
diff --git a/sycl/test-e2e/InlineAsm/asm_mul.cpp b/sycl/test-e2e/InlineAsm/asm_mul.cpp
index 3f2b91c53cf59..b038a79f8abb1 100644
--- a/sycl/test-e2e/InlineAsm/asm_mul.cpp
+++ b/sycl/test-e2e/InlineAsm/asm_mul.cpp
@@ -1,5 +1,5 @@
 // UNSUPPORTED: cuda, hip
-// REQUIRES: gpu,linux
+// REQUIRES: gpu,linux,sg-16
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
@@ -28,7 +28,7 @@ struct KernelFunctor : WithInputBuffers<T, 2>, WithOutputBuffer<T> {
 
     cgh.parallel_for<KernelFunctor<T>>(
         sycl::range<1>{this->getOutputBufferSize()},
-        [=](sycl::id<1> wiID) [[intel::reqd_sub_group_size(16)]] {
+        [=](sycl::id<1> wiID) [[sycl::reqd_sub_group_size(16)]] {
 #if defined(__SYCL_DEVICE_ONLY__)
           asm("mul (M1, 16) %0(0, 0)<1> %1(0, 0)<1;1,0> %2(0, 0)<1;1,0>"
               : "=rw"(C[wiID])
@@ -49,7 +49,7 @@ int main() {
   }
 
   KernelFunctor<> f(inputA, inputB);
-  if (!launchInlineASMTest(f))
+  if (!launchInlineASMTest(f, {16}))
     return 0;
 
   auto &C = f.getOutputBufferData();
diff --git a/sycl/test-e2e/InlineAsm/asm_multiple_instructions.cpp b/sycl/test-e2e/InlineAsm/asm_multiple_instructions.cpp
index f51f5f58501ca..cdcf08b0f809d 100644
--- a/sycl/test-e2e/InlineAsm/asm_multiple_instructions.cpp
+++ b/sycl/test-e2e/InlineAsm/asm_multiple_instructions.cpp
@@ -1,5 +1,5 @@
 // UNSUPPORTED: cuda || hip
-// REQUIRES: gpu,linux
+// REQUIRES: gpu,linux,sg-16
 // RUN: %{build} -DTO_PASS -o %t.out.pass
 // RUN: %{run} %t.out.pass
 // RUN: %{build} -o %t.out
@@ -36,7 +36,7 @@ struct KernelFunctor : WithInputBuffers<T, 3>, WithOutputBuffer<T> {
 
     cgh.parallel_for<KernelFunctor<T>>(
         sycl::range<1>{this->getOutputBufferSize()},
-        [=](sycl::id<1> wiID) [[intel::reqd_sub_group_size(16)]] {
+        [=](sycl::id<1> wiID) [[sycl::reqd_sub_group_size(16)]] {
 #if defined(TO_PASS)
           // The code below passing verification
           volatile int output = -1;
@@ -85,7 +85,7 @@ int main() {
   }
 
   KernelFunctor<> f(inputA, inputB, inputC);
-  if (!launchInlineASMTest(f))
+  if (!launchInlineASMTest(f, {16}))
     return 0;
 
   if (verify_all_the_same(f.getOutputBufferData(),
diff --git a/sycl/test-e2e/InlineAsm/asm_no_operands.cpp b/sycl/test-e2e/InlineAsm/asm_no_operands.cpp
index 5037fd68af41c..5b770bb5baa0d 100644
--- a/sycl/test-e2e/InlineAsm/asm_no_operands.cpp
+++ b/sycl/test-e2e/InlineAsm/asm_no_operands.cpp
@@ -1,5 +1,5 @@
 // UNSUPPORTED: cuda, hip
-// REQUIRES: gpu,linux
+// REQUIRES: gpu,linux,sg-16
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
@@ -12,10 +12,7 @@ int main() {
   sycl::queue Queue;
   sycl::device Device = Queue.get_device();
 
-  auto Vec = Device.get_info<sycl::info::device::extensions>();
-  if (!isInlineASMSupported(Device) ||
-      std::find(Vec.begin(), Vec.end(), "cl_intel_required_subgroup_size") ==
-          std::end(Vec)) {
+  if (!isInlineASMSupported(Device)) {
     std::cout << "Skipping test\n";
     return 0;
   }
@@ -25,12 +22,13 @@ int main() {
   // Submitting command group(work) to queue
   Queue.submit([&](sycl::handler &cgh) {
     // Executing kernel
-    cgh.parallel_for<no_operands_kernel>(
-        NumOfWorkItems,
-        [=](sycl::id<1> WIid) [[intel::reqd_sub_group_size(16)]] {
+    // clang-format off
+    cgh.parallel_for<no_operands_kernel>(NumOfWorkItems,
+        [=](sycl::id<1> WIid) [[sycl::reqd_sub_group_size(16)]] {
 #if defined(__SYCL_DEVICE_ONLY__)
-          asm("barrier");
+      asm("barrier");
 #endif
-        });
+    });
+    // clang-format on
   });
 }
diff --git a/sycl/test-e2e/InlineAsm/asm_no_output.cpp b/sycl/test-e2e/InlineAsm/asm_no_output.cpp
index a6d44b35a63dd..0622f4f8edb38 100644
--- a/sycl/test-e2e/InlineAsm/asm_no_output.cpp
+++ b/sycl/test-e2e/InlineAsm/asm_no_output.cpp
@@ -1,5 +1,5 @@
 // UNSUPPORTED: cuda || hip
-// REQUIRES: gpu,linux
+// REQUIRES: gpu,linux,sg-16
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
@@ -19,7 +19,7 @@ template <typename T = dataType> struct KernelFunctor : WithOutputBuffer<T> {
             cgh);
     cgh.parallel_for<KernelFunctor<T>>(
         sycl::range<1>{this->getOutputBufferSize()},
-        [=](sycl::id<1> wiID) [[intel::reqd_sub_group_size(16)]] {
+        [=](sycl::id<1> wiID) [[sycl::reqd_sub_group_size(16)]] {
           volatile int local_var = 47;
           local_var += C[0];
 #if defined(__SYCL_DEVICE_ONLY__)
@@ -37,7 +37,7 @@ template <typename T = dataType> struct KernelFunctor : WithOutputBuffer<T> {
 
 int main() {
   KernelFunctor<> f(DEFAULT_PROBLEM_SIZE);
-  if (!launchInlineASMTest(f))
+  if (!launchInlineASMTest(f, {16}))
     return 0;
 
   if (verify_all_the_same(f.getOutputBufferData(), 0))
diff --git a/sycl/test-e2e/InlineAsm/asm_plus_mod.cpp b/sycl/test-e2e/InlineAsm/asm_plus_mod.cpp
index 10157938517bf..db2659d8e7077 100644
--- a/sycl/test-e2e/InlineAsm/asm_plus_mod.cpp
+++ b/sycl/test-e2e/InlineAsm/asm_plus_mod.cpp
@@ -1,5 +1,5 @@
 // UNSUPPORTED: cuda || hip
-// REQUIRES: gpu,linux
+// REQUIRES: gpu,linux,sg-16
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
@@ -25,7 +25,7 @@ struct KernelFunctor : WithInputBuffers<T, 1>, WithOutputBuffer<T> {
 
     cgh.parallel_for<KernelFunctor<T>>(
         sycl::range<1>{this->getOutputBufferSize()},
-        [=](sycl::id<1> wiID) [[intel::reqd_sub_group_size(16)]] {
+        [=](sycl::id<1> wiID) [[sycl::reqd_sub_group_size(16)]] {
 #if defined(__SYCL_DEVICE_ONLY__)
           asm("add (M1, 16) %0(0, 0)<1> %0(0, 0)<1;1,0> %1(0, 0)<1;1,0>"
               : "+rw"(B[wiID])
@@ -47,7 +47,7 @@ int main() {
   }
 
   KernelFunctor<> f(inputA, inputB);
-  if (!launchInlineASMTest(f))
+  if (!launchInlineASMTest(f, {16}))
     return 0;
 
   auto &B = f.getOutputBufferData();
diff --git a/sycl/test-e2e/InlineAsm/asm_switch.cpp b/sycl/test-e2e/InlineAsm/asm_switch.cpp
index e77a7e6edc531..ccd81fd34af45 100644
--- a/sycl/test-e2e/InlineAsm/asm_switch.cpp
+++ b/sycl/test-e2e/InlineAsm/asm_switch.cpp
@@ -1,5 +1,5 @@
 // UNSUPPORTED: cuda || hip
-// REQUIRES: gpu,linux
+// REQUIRES: gpu,linux,sg-16
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
@@ -18,7 +18,7 @@ template <typename T = DataType> struct KernelFunctor : WithOutputBuffer<T> {
     int switchField = 2;
     CGH.parallel_for<KernelFunctor<T>>(
         sycl::range<1>{this->getOutputBufferSize()},
-        [=](sycl::id<1> wiID) [[intel::reqd_sub_group_size(16)]] {
+        [=](sycl::id<1> wiID) [[sycl::reqd_sub_group_size(16)]] {
           int Output = 0;
 #if defined(__SYCL_DEVICE_ONLY__)
           asm volatile("{\n"
@@ -62,7 +62,7 @@ template <typename T = DataType> struct KernelFunctor : WithOutputBuffer<T> {
 
 int main() {
   KernelFunctor<> Functor(DEFAULT_PROBLEM_SIZE);
-  if (!launchInlineASMTest(Functor))
+  if (!launchInlineASMTest(Functor, {16}))
     return 0;
 
   if (verify_all_the_same(Functor.getOutputBufferData(), 7))
diff --git a/sycl/test-e2e/InlineAsm/include/asmhelper.h b/sycl/test-e2e/InlineAsm/include/asmhelper.h
index f96d056dcfec7..abc7d24668691 100644
--- a/sycl/test-e2e/InlineAsm/include/asmhelper.h
+++ b/sycl/test-e2e/InlineAsm/include/asmhelper.h
@@ -91,8 +91,8 @@ auto exception_handler = [](sycl::exception_list exceptions) {
 };
 
 template <typename F>
-bool launchInlineASMTestImpl(F &f, bool requires_particular_sg_size = true,
-                             std::vector<int> RequiredSGSizes = {}) {
+bool launchInlineASMTestImpl(F &f,
+                             const std::vector<int> &RequiredSGSizes = {}) {
   sycl::queue deviceQueue(sycl::gpu_selector_v, exception_handler);
   sycl::device device = deviceQueue.get_device();
 
@@ -101,22 +101,19 @@ bool launchInlineASMTestImpl(F &f, bool requires_particular_sg_size = true,
     return false;
   }
 
-  auto Vec = device.get_info<sycl::info::device::extensions>();
-  if (requires_particular_sg_size &&
-      std::find(Vec.begin(), Vec.end(), "cl_intel_required_subgroup_size") ==
-          std::end(Vec)) {
-    std::cout << "Skipping test\n";
-    return false;
-  }
-
-  auto sg_sizes = device.get_info<sycl::info::device::sub_group_sizes>();
-  if (std::any_of(RequiredSGSizes.begin(), RequiredSGSizes.end(),
-                  [&](size_t RequiredSGSize) {
-                    return std::find(sg_sizes.begin(), sg_sizes.end(),
-                                     RequiredSGSize) == sg_sizes.end();
-                  })) {
-    std::cout << "Skipping test\n";
-    return false;
+  if (!RequiredSGSizes.empty()) {
+    auto supported_sg_sizes =
+        device.get_info<sycl::info::device::sub_group_sizes>();
+    if (std::any_of(RequiredSGSizes.begin(), RequiredSGSizes.end(),
+                    [&](size_t RequiredSGSize) {
+                      return std::find(supported_sg_sizes.begin(),
+                                       supported_sg_sizes.end(),
+                                       RequiredSGSize) ==
+                             supported_sg_sizes.end();
+                    })) {
+      std::cout << "Skipping test\n";
+      return false;
+    }
   }
 
   deviceQueue.submit(f).wait_and_throw();
@@ -128,13 +125,11 @@ bool launchInlineASMTestImpl(F &f, bool requires_particular_sg_size = true,
 ///
 /// \returns false if test wasn't launched (i.e.was skipped) and true otherwise
 template <typename F>
-bool launchInlineASMTest(F &f, bool requires_particular_sg_size = true,
-                         bool exception_expected = false,
-                         std::vector<int> RequiredSGSizes = {}) {
+bool launchInlineASMTest(F &f, const std::vector<int> &RequiredSGSizes = {},
+                         bool exception_expected = false) {
   bool result = false;
   try {
-    result = launchInlineASMTestImpl(f, requires_particular_sg_size,
-                                     RequiredSGSizes);
+    result = launchInlineASMTestImpl(f, RequiredSGSizes);
   } catch (sycl::exception &e) {
     std::string what = e.what();
     if (exception_expected &&
diff --git a/sycl/test-e2e/InlineAsm/letter_example.cpp b/sycl/test-e2e/InlineAsm/letter_example.cpp
index 393c362c33fe0..c09c6d9f75ae4 100644
--- a/sycl/test-e2e/InlineAsm/letter_example.cpp
+++ b/sycl/test-e2e/InlineAsm/letter_example.cpp
@@ -1,5 +1,5 @@
 // UNSUPPORTED: cuda, hip
-// REQUIRES: gpu,linux
+// REQUIRES: gpu,linux,sg-16,aspect-usm_shared_allocations
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
@@ -15,10 +15,7 @@ int main() {
   sycl::queue q;
   sycl::device Device = q.get_device();
 
-  auto Vec = Device.get_info<sycl::info::device::extensions>();
-  if (!isInlineASMSupported(Device) ||
-      std::find(Vec.begin(), Vec.end(), "cl_intel_required_subgroup_size") ==
-          std::end(Vec)) {
+  if (!isInlineASMSupported(Device)) {
     std::cout << "Skipping test\n";
     return 0;
   }
@@ -31,7 +28,7 @@ int main() {
   q.submit([&](sycl::handler &cgh) {
      cgh.parallel_for<kernel_name>(
          sycl::range<1>(problem_size),
-         [=](sycl::id<1> idx) [[intel::reqd_sub_group_size(16)]] {
+         [=](sycl::id<1> idx) [[sycl::reqd_sub_group_size(16)]] {
 #if defined(__SYCL_DEVICE_ONLY__)
            int i = idx[0];
            asm volatile("{\n.decl V52 v_type=G type=d num_elts=16 align=GRF\n"
diff --git a/sycl/test-e2e/InlineAsm/malloc_shared_32.cpp b/sycl/test-e2e/InlineAsm/malloc_shared_32.cpp
index be2ee81836261..c541ac4d17838 100644
--- a/sycl/test-e2e/InlineAsm/malloc_shared_32.cpp
+++ b/sycl/test-e2e/InlineAsm/malloc_shared_32.cpp
@@ -1,5 +1,5 @@
 // UNSUPPORTED: cuda, hip
-// REQUIRES: gpu,linux
+// REQUIRES: gpu,linux,sg-32,aspect-usm_shared_allocations
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
@@ -16,10 +16,7 @@ int main() {
 
   sycl::device Device = q.get_device();
 
-  auto Vec = Device.get_info<sycl::info::device::extensions>();
-  if (!isInlineASMSupported(Device) ||
-      std::find(Vec.begin(), Vec.end(), "cl_intel_required_subgroup_size") ==
-          std::end(Vec)) {
+  if (!isInlineASMSupported(Device)) {
     std::cout << "Skipping test\n";
     return 0;
   }
@@ -40,7 +37,7 @@ int main() {
   q.submit([&](sycl::handler &cgh) {
      cgh.parallel_for<kernel_name>(
          sycl::range<1>(problem_size),
-         [=](sycl::id<1> idx) [[intel::reqd_sub_group_size(32)]] {
+         [=](sycl::id<1> idx) [[sycl::reqd_sub_group_size(32)]] {
            int i = idx[0];
 #if defined(__SYCL_DEVICE_ONLY__)
            asm volatile(R"a(
diff --git a/sycl/test-e2e/InlineAsm/malloc_shared_in_out_dif.cpp b/sycl/test-e2e/InlineAsm/malloc_shared_in_out_dif.cpp
index 0470bb2431e9a..5ea597d5d1064 100644
--- a/sycl/test-e2e/InlineAsm/malloc_shared_in_out_dif.cpp
+++ b/sycl/test-e2e/InlineAsm/malloc_shared_in_out_dif.cpp
@@ -1,5 +1,5 @@
 // UNSUPPORTED: cuda, hip
-// REQUIRES: gpu,linux
+// REQUIRES: gpu,linux,sg-16,aspect-usm_shared_allocations
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
@@ -16,10 +16,7 @@ int main() {
 
   sycl::device Device = q.get_device();
 
-  auto Vec = Device.get_info<sycl::info::device::extensions>();
-  if (!isInlineASMSupported(Device) ||
-      std::find(Vec.begin(), Vec.end(), "cl_intel_required_subgroup_size") ==
-          std::end(Vec)) {
+  if (!isInlineASMSupported(Device)) {
     std::cout << "Skipping test\n";
     return 0;
   }
@@ -37,7 +34,7 @@ int main() {
   q.submit([&](sycl::handler &cgh) {
      cgh.parallel_for<kernel_name>(
          sycl::range<1>(problem_size),
-         [=](sycl::id<1> idx) [[intel::reqd_sub_group_size(16)]] {
+         [=](sycl::id<1> idx) [[sycl::reqd_sub_group_size(16)]] {
            int i = idx[0];
            volatile int tmp = a[i];
            tmp += 1;
diff --git a/sycl/test-e2e/InlineAsm/malloc_shared_no_input.cpp b/sycl/test-e2e/InlineAsm/malloc_shared_no_input.cpp
index 76ad3f6e95260..3da4e4c7e7e71 100644
--- a/sycl/test-e2e/InlineAsm/malloc_shared_no_input.cpp
+++ b/sycl/test-e2e/InlineAsm/malloc_shared_no_input.cpp
@@ -1,5 +1,5 @@
 // UNSUPPORTED: cuda, hip
-// REQUIRES: gpu,linux
+// REQUIRES: gpu,linux,sg-16,aspect-usm_shared_allocations
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
@@ -15,10 +15,7 @@ int main() {
   sycl::queue q;
   sycl::device Device = q.get_device();
 
-  auto Vec = Device.get_info<sycl::info::device::extensions>();
-  if (!isInlineASMSupported(Device) ||
-      std::find(Vec.begin(), Vec.end(), "cl_intel_required_subgroup_size") ==
-          std::end(Vec)) {
+  if (!isInlineASMSupported(Device)) {
     std::cout << "Skipping test\n";
     return 0;
   }
@@ -31,7 +28,7 @@ int main() {
   q.submit([&](sycl::handler &cgh) {
      cgh.parallel_for<kernel_name>(
          sycl::range<1>(problem_size),
-         [=](sycl::id<1> idx) [[intel::reqd_sub_group_size(16)]] {
+         [=](sycl::id<1> idx) [[sycl::reqd_sub_group_size(16)]] {
            int i = idx[0];
 #if defined(__SYCL_DEVICE_ONLY__)
            asm volatile("mov (M1, 16) %0(0,0)<1> 0x7:d" : "=rw"(a[i]));
diff --git a/sycl/test-e2e/USM/align.cpp b/sycl/test-e2e/USM/align.cpp
new file mode 100755
index 0000000000000..9659ec78b9a88
--- /dev/null
+++ b/sycl/test-e2e/USM/align.cpp
@@ -0,0 +1,102 @@
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+
+// UNSUPPORTED: gpu
+
+// E2E tests for annotated USM allocation functions with alignment arguments
+// that are not powers of 2. Note this test does not work on gpu because some
+// tests expect non-templated aligned_alloc_xxx functions to return nullptr,
+// e.g. when the alignment argument is not a power of 2, while they fail to do
+// so when run on gpu. This maybe because the gpu runtime has different
+// behavior. Therefore, GPU is unsupported until issue #12638 gets resolved.
+
+#include <sycl/sycl.hpp>
+
+#include <complex>
+#include <numeric>
+
+using namespace sycl;
+using namespace ext::oneapi::experimental;
+using namespace ext::intel::experimental;
+using alloc = usm::alloc;
+
+template <typename T> void testAlign(sycl::queue &q, unsigned align) {
+  const sycl::context &Ctx = q.get_context();
+  auto dev = q.get_device();
+
+  constexpr int N = 10;
+  assert(align > 0 || (align & (align - 1)) == 0);
+
+  auto ADevice = [&](size_t align, auto... args) {
+    return aligned_alloc_device(align, N, args...);
+  };
+  auto AHost = [&](size_t align, auto... args) {
+    return aligned_alloc_host(align, N, args...);
+  };
+  auto AShared = [&](size_t align, auto... args) {
+    return aligned_alloc_shared(align, N, args...);
+  };
+  auto AAnnotated = [&](size_t align, auto... args) {
+    return aligned_alloc(align, N, args...);
+  };
+
+  auto ATDevice = [&](size_t align, auto... args) {
+    return aligned_alloc_device<T>(align, N, args...);
+  };
+  auto ATHost = [&](size_t align, auto... args) {
+    return aligned_alloc_host<T>(align, N, args...);
+  };
+  auto ATShared = [&](size_t align, auto... args) {
+    return aligned_alloc_shared<T>(align, N, args...);
+  };
+  auto ATAnnotated = [&](size_t align, auto... args) {
+    return aligned_alloc<T>(align, N, args...);
+  };
+
+  // Test cases that are expected to return null
+  auto check_null = [&q](auto AllocFn, int Line, int Case) {
+    decltype(AllocFn()) Ptr = AllocFn();
+    if (Ptr != nullptr) {
+      free(Ptr, q);
+      std::cout << "Failed at line " << Line << ", case " << Case << std::endl;
+      assert(false && "The return is not null!");
+    }
+  };
+
+  auto CheckNullAll = [&](auto Funcs, int Line = __builtin_LINE()) {
+    std::apply(
+        [&](auto... Fs) {
+          int Case = 0;
+          (void)std::initializer_list<int>{
+              (check_null(Fs, Line, Case++), 0)...};
+        },
+        Funcs);
+  };
+
+  CheckNullAll(std::tuple{
+      // Case: aligned_alloc_xxx with no alignment property, and the alignment
+      // argument is not a power of 2, the result is nullptr
+      [&]() { return ADevice(3, q); }, [&]() { return ADevice(5, dev, Ctx); },
+      [&]() { return AHost(7, q); }, [&]() { return AHost(9, Ctx); },
+      [&]() { return AShared(114, q); },
+      [&]() { return AShared(1023, dev, Ctx); },
+      [&]() { return AAnnotated(15, q, alloc::device); },
+      [&]() { return AAnnotated(17, dev, Ctx, alloc::host); }
+      // Case: aligned_alloc_xxx<T> with no alignment property, and the
+      // alignment argument is not a power of 2, the result is nullptr
+      ,
+      [&]() { return ATDevice(3, q); }, [&]() { return ATDevice(5, dev, Ctx); },
+      [&]() { return ATHost(7, q); }, [&]() { return ATHost(9, Ctx); },
+      [&]() { return ATShared(1919, q); },
+      [&]() { return ATShared(11, dev, Ctx); },
+      [&]() { return ATAnnotated(15, q, alloc::device); },
+      [&]() { return ATAnnotated(17, dev, Ctx, alloc::host); }});
+}
+
+int main() {
+  sycl::queue q;
+  testAlign<char>(q, 4);
+  testAlign<int>(q, 128);
+  testAlign<std::complex<double>>(q, 4);
+  return 0;
+}
diff --git a/sycl/test-e2e/lit.cfg.py b/sycl/test-e2e/lit.cfg.py
index a25f775edc9e1..ef44dc972797b 100644
--- a/sycl/test-e2e/lit.cfg.py
+++ b/sycl/test-e2e/lit.cfg.py
@@ -669,7 +669,7 @@
     features.update(sg_size_features)
 
     be, dev = sycl_device.split(":")
-    features.add(dev.replace("acc", "accelerator"))
+    features.add(dev.replace("fpga", "accelerator"))
     # Use short names for LIT rules.
     features.add(be)
 
diff --git a/sycl/test/basic_tests/builtins/builtin_unit_tests.cpp b/sycl/test/basic_tests/builtins/builtin_unit_tests.cpp
new file mode 100644
index 0000000000000..d241a90568fc3
--- /dev/null
+++ b/sycl/test/basic_tests/builtins/builtin_unit_tests.cpp
@@ -0,0 +1,133 @@
+// RUN: %clangxx -fsycl -fpreview-breaking-changes -fsyntax-only %s -Xclang -verify
+// REQUIRES: preview-breaking-changes-supported
+
+#include <sycl/sycl.hpp>
+
+using namespace sycl;
+using namespace sycl::detail;
+
+namespace builtin_same_shape_v_tests {
+using swizzle1 = decltype(std::declval<vec<float, 2>>().swizzle<0>());
+using swizzle2 = decltype(std::declval<vec<float, 2>>().swizzle<0, 0>());
+using swizzle3 = decltype(std::declval<vec<float, 2>>().swizzle<0, 0, 1>());
+
+static_assert(builtin_same_shape_v<float>);
+static_assert(builtin_same_shape_v<int, float>);
+static_assert(builtin_same_shape_v<marray<int, 2>>);
+static_assert(builtin_same_shape_v<marray<int, 2>, marray<float, 2>>);
+static_assert(builtin_same_shape_v<vec<int, 2>>);
+static_assert(builtin_same_shape_v<vec<int, 2>, vec<float, 2>>);
+static_assert(builtin_same_shape_v<vec<int, 2>, swizzle2>);
+
+static_assert(!builtin_same_shape_v<float, marray<float, 1>>);
+static_assert(!builtin_same_shape_v<float, vec<float, 1>>);
+static_assert(!builtin_same_shape_v<marray<float, 1>, vec<float, 1>>);
+static_assert(!builtin_same_shape_v<float, swizzle1>);
+static_assert(!builtin_same_shape_v<marray<float, 1>, swizzle1>);
+static_assert(!builtin_same_shape_v<swizzle2, swizzle1>);
+} // namespace builtin_same_shape_v_tests
+
+namespace builtin_marray_impl_tests {
+// Integer functions/relational bitselect only accept fixed-width integer
+// element types for vector/swizzle elements. Make sure that our marray->vec
+// delegator can handle that.
+
+auto foo(char x) { return x; }
+auto foo(signed char x) { return x; }
+auto foo(unsigned char x) { return x; }
+auto foo(vec<int8_t, 2> x) { return x; }
+auto foo(vec<uint8_t, 2> x) { return x; }
+
+auto test() {
+  marray<char, 2> x;
+  marray<signed char, 2> y;
+  marray<unsigned char, 2> z;
+  auto TestOne = [](auto x) {
+    std::ignore = builtin_marray_impl([](auto x) { return foo(x); }, x);
+  };
+  TestOne(x);
+  TestOne(y);
+  TestOne(z);
+}
+} // namespace builtin_marray_impl_tests
+
+namespace builtin_enable_integer_tests {
+using swizzle1 = decltype(std::declval<vec<int8_t, 2>>().swizzle<0>());
+using swizzle2 = decltype(std::declval<vec<int8_t, 2>>().swizzle<0, 0>());
+template <typename... Ts> void ignore() {}
+
+void test() {
+  // clang-format off
+  ignore<builtin_enable_integer_t<char>,
+         builtin_enable_integer_t<signed char>,
+         builtin_enable_integer_t<unsigned char>>();
+  // clang-format on
+
+  ignore<builtin_enable_integer_t<vec<int8_t, 2>>,
+         builtin_enable_integer_t<vec<uint8_t, 2>>>();
+
+  ignore<builtin_enable_integer_t<char, char>>();
+  ignore<builtin_enable_integer_t<vec<int8_t, 2>, vec<int8_t, 2>>>();
+  ignore<builtin_enable_integer_t<vec<int8_t, 2>, swizzle2>>();
+  ignore<builtin_enable_integer_t<swizzle2, swizzle2>>();
+
+  {
+    // Only one of char/signed char maps onto int8_t. The other type isn't a
+    // valid vector element type for integer builtins.
+
+    static_assert(std::is_signed_v<char>);
+
+    // clang-format off
+    // expected-error-re@*:* {{no type named 'type' in 'sycl::detail::builtin_enable<sycl::detail::default_ret_type, sycl::detail::integer_elem_type, sycl::detail::any_shape, sycl::detail::same_elem_type, sycl::vec<{{.*}}, 2>>'}}
+    // expected-note@+1 {{in instantiation of template type alias 'builtin_enable_integer_t' requested here}}
+    ignore<builtin_enable_integer_t<vec<signed char, 2>>, builtin_enable_integer_t<vec<char, 2>>>();
+    // clang-format on
+  }
+
+  // expected-error@*:* {{no type named 'type' in 'sycl::detail::builtin_enable<sycl::detail::default_ret_type, sycl::detail::integer_elem_type, sycl::detail::any_shape, sycl::detail::same_elem_type, char, signed char>'}}
+  // expected-note@+1 {{in instantiation of template type alias 'builtin_enable_integer_t' requested here}}
+  ignore<builtin_enable_integer_t<char, signed char>>();
+}
+} // namespace builtin_enable_integer_tests
+
+namespace builtin_enable_bitselect_tests {
+// Essentially the same as builtin_enable_integer_t + FP types support.
+using swizzle1 = decltype(std::declval<vec<int8_t, 2>>().swizzle<0>());
+using swizzle2 = decltype(std::declval<vec<int8_t, 2>>().swizzle<0, 0>());
+template <typename... Ts> void ignore() {}
+
+void test() {
+  // clang-format off
+  ignore<builtin_enable_bitselect_t<char>,
+         builtin_enable_bitselect_t<signed char>,
+         builtin_enable_bitselect_t<unsigned char>,
+         builtin_enable_bitselect_t<float>>();
+  // clang-format on
+
+  ignore<builtin_enable_bitselect_t<vec<int8_t, 2>>,
+         builtin_enable_bitselect_t<vec<uint8_t, 2>>,
+         builtin_enable_bitselect_t<vec<float, 2>>>();
+
+  ignore<builtin_enable_bitselect_t<char, char>>();
+  ignore<builtin_enable_bitselect_t<vec<int8_t, 2>, vec<int8_t, 2>>>();
+  ignore<builtin_enable_bitselect_t<vec<int8_t, 2>, swizzle2>>();
+  ignore<builtin_enable_bitselect_t<swizzle2, swizzle2>>();
+
+  {
+    // Only one of char/signed char maps onto int8_t. The other type isn't a
+    // valid vector element type for integer builtins.
+
+    static_assert(std::is_signed_v<char>);
+
+    // clang-format off
+    // expected-error-re@*:* {{no type named 'type' in 'sycl::detail::builtin_enable<sycl::detail::default_ret_type, sycl::detail::bitselect_elem_type, sycl::detail::any_shape, sycl::detail::same_elem_type, sycl::vec<{{.*}}, 2>>'}}
+    // expected-note@+1 {{in instantiation of template type alias 'builtin_enable_bitselect_t' requested here}}
+    ignore<builtin_enable_bitselect_t<vec<signed char, 2>>, builtin_enable_bitselect_t<vec<char, 2>>>();
+    // clang-format on
+  }
+
+  // expected-error@*:* {{no type named 'type' in 'sycl::detail::builtin_enable<sycl::detail::default_ret_type, sycl::detail::bitselect_elem_type, sycl::detail::any_shape, sycl::detail::same_elem_type, char, signed char>'}}
+  // expected-note@+1 {{in instantiation of template type alias 'builtin_enable_bitselect_t' requested here}}
+  ignore<builtin_enable_bitselect_t<char, signed char>>();
+}
+} // namespace builtin_enable_bitselect_tests
diff --git a/sycl/test/basic_tests/builtins_implicitly_convertible_args.cpp b/sycl/test/basic_tests/builtins/builtins_implicitly_convertible_args.cpp
similarity index 100%
rename from sycl/test/basic_tests/builtins_implicitly_convertible_args.cpp
rename to sycl/test/basic_tests/builtins/builtins_implicitly_convertible_args.cpp
diff --git a/sycl/test/basic_tests/builtins_templates.cpp b/sycl/test/basic_tests/builtins/builtins_templates.cpp
similarity index 100%
rename from sycl/test/basic_tests/builtins_templates.cpp
rename to sycl/test/basic_tests/builtins/builtins_templates.cpp
diff --git a/sycl/test/basic_tests/relational_builtins.cpp b/sycl/test/basic_tests/builtins/relational_builtins.cpp
similarity index 96%
rename from sycl/test/basic_tests/relational_builtins.cpp
rename to sycl/test/basic_tests/builtins/relational_builtins.cpp
index ab9f3aa5c4326..eabd7e26c35cb 100644
--- a/sycl/test/basic_tests/relational_builtins.cpp
+++ b/sycl/test/basic_tests/builtins/relational_builtins.cpp
@@ -4,7 +4,7 @@
 // NOTE: Compile the test fully to ensure the library exports the right host
 // symbols.
 
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 
 // Some helper macros to verify return type of the builtins. To be used like
 // this
@@ -19,15 +19,15 @@ template <class... Args> struct CheckHelper {
   template <class F> static auto call(F f) { return f(Args()...); }
 };
 
-#define CHECK(EXPECTED, FUNC, ...)                            \
+#define CHECK(EXPECTED, FUNC, ...)                                             \
   {                                                                            \
     auto ret = CheckHelper<__VA_ARGS__>::call(                                 \
-        [](auto... args) { return cl::sycl::FUNC(args...); });                 \
-    static_assert(std::is_same_v<decltype(ret), EXPECTED>);                \
+        [](auto... args) { return sycl::FUNC(args...); });                     \
+    static_assert(std::is_same_v<decltype(ret), EXPECTED>);                    \
   }
 
 void foo() {
-  using namespace cl::sycl;
+  using namespace sycl;
   using boolm = marray<bool, 2>;
 
   using int16v = vec<int16_t, 2>;
@@ -247,9 +247,9 @@ void foo() {
 }
 
 int main() {
-  cl::sycl::queue q;
+  sycl::queue q;
   foo(); // Verify host.
-  q.submit([&](cl::sycl::handler &cgh) {
+  q.submit([&](sycl::handler &cgh) {
     cgh.single_task<class test>([]() {
       foo(); // verify device
     });
diff --git a/sycl/test/basic_tests/types.cpp b/sycl/test/basic_tests/types.cpp
index 6aab1e433c7a7..14a1070567274 100644
--- a/sycl/test/basic_tests/types.cpp
+++ b/sycl/test/basic_tests/types.cpp
@@ -134,12 +134,13 @@ template <typename T, typename Expected> inline void checkVecNotReturnType() {
   using Vector = sycl::vec<T, N>;
 #if defined(__INTEL_PREVIEW_BREAKING_CHANGES)
   using ExpectedVector = sycl::vec<Expected, N>;
+  using OpNotResult = decltype(operator!(std::declval<Vector>()));
 #else
   using ExpectedVector = sycl::vec<T, N>;
-#endif
   using OpNotResult = decltype(std::declval<Vector>().operator!());
+#endif
   static_assert(std::is_same_v<OpNotResult, ExpectedVector>,
-                "Incorrect vec::operator! return type");
+                "Incorrect operator! return type");
 }
 
 // the math built-in testing ensures that the vec binary ops get tested,
diff --git a/sycl/test/esimd/memory_properties.cpp b/sycl/test/esimd/memory_properties.cpp
index 3f653adb5e476..8305bd9b83b18 100644
--- a/sycl/test/esimd/memory_properties.cpp
+++ b/sycl/test/esimd/memory_properties.cpp
@@ -1224,7 +1224,7 @@ test_gather_scatter(AccType &acc, LocalAccType &local_acc, float *ptrf,
   acc_res = gather<float, 32>(local_acc, ioffset_n32, 0);
   acc_res = gather<float, 32>(local_acc, ioffset_n32, 0, mask_n32);
 
-  // CHECK-COUNT-4: call void @llvm.genx.svm.scatter.v32i1.v32i64.v32f32(<32 x i1> {{[^)]+}}, i32 0, <32 x i64> {{[^)]+}}, <32 x float> {{[^)]+}})
+  // CHECK-COUNT-4: call void @llvm.masked.scatter.v32f32.v32p4(<32 x float> {{[^)]+}}, <32 x ptr addrspace(4)> {{[^)]+}}, i32 4, <32 x i1> {{[^)]+}})
   scatter(ptrf, ioffset_n32, usm, mask_n32);
 
   scatter(ptrf, ioffset_n32, usm);
@@ -1281,6 +1281,14 @@ test_gather_scatter(AccType &acc, LocalAccType &local_acc, float *ptrf,
   scatter<float, 32, 2>(ptrf, ioffset_n16_view, usm_view, mask_n16);
 
   scatter<float, 32, 2>(ptrf, ioffset_n16_view, usm_view);
+
+  simd<uint32_t, 10> ioffset_n10(byte_offset32, 8);
+  simd<float, 10> usm_n10;
+
+  // Check special case to verify that for cases when N is not power of 2 llvm
+  // intrinsic is used
+  // CHECK-COUNT-1: call void @llvm.masked.scatter.v10f32.v10p4(<10 x float> {{[^)]+}}, <10 x ptr addrspace(4)> {{[^)]+}}, i32 4, <10 x i1> {{[^)]+}})
+  scatter(ptrf, ioffset_n10, usm_n10);
 }
 
 // CHECK-LABEL: define {{.*}} @_Z23test_slm_gather_scatter{{.*}}
@@ -1302,6 +1310,7 @@ test_slm_gather_scatter(int byte_offset32) {
   simd<float, 32> slm;
   simd<float, 32> pass_thru;
   auto pass_thru_view = pass_thru.select<32, 1>();
+  auto slm_view = slm.select<32, 1>();
 
   // Test SLM gather using this plan:
   // 1) slm_gather(offsets): offsets is simd or simd_view
@@ -1373,4 +1382,66 @@ test_slm_gather_scatter(int byte_offset32) {
                                  props_align4);
   slm = slm_gather<float, 32, 2>(ioffset_n16_view, mask_n16, pass_thru_view,
                                  props_align4);
+
+  // Test SLM scatter using this plan:
+  // 1) slm_scatter(offsets, vals): offsets/vals is simd or simd_view
+  // 2) slm_scatter(offsets, vals, mask): offsets/vals is simd or simd_view
+  // 3) slm_scatter(...): same as (1), (2) above, but with VS > 1.
+
+  // 1) slm_scatter(offsets): offsets is simd or simd_view
+  // CHECK-COUNT-4: call void @llvm.masked.scatter.v32f32.v32p3(<32 x float> {{[^)]+}}, <32 x ptr addrspace(3)> {{[^)]+}}, i32 4, <32 x i1> {{[^)]+}})
+  slm_scatter<float>(ioffset_n32, slm);
+  slm_scatter<float, 32>(ioffset_n32_view, slm);
+  slm_scatter<float, 32>(ioffset_n32, slm_view);
+  slm_scatter<float, 32>(ioffset_n32_view, slm_view);
+
+  // CHECK-COUNT-4: call void @llvm.masked.scatter.v32f32.v32p3(<32 x float> {{[^)]+}}, <32 x ptr addrspace(3)> {{[^)]+}}, i32 8, <32 x i1> {{[^)]+}})
+  slm_scatter<float>(ioffset_n32, slm, props_align8);
+  slm_scatter<float, 32>(ioffset_n32_view, slm, props_align8);
+  slm_scatter<float, 32>(ioffset_n32, slm_view, props_align8);
+  slm_scatter<float, 32>(ioffset_n32_view, slm_view, props_align8);
+
+  // 2) slm_gather(offsets, mask): offsets is simd or simd_view
+  // CHECK-COUNT-4: call void @llvm.masked.scatter.v32f32.v32p3(<32 x float> {{[^)]+}}, <32 x ptr addrspace(3)> {{[^)]+}}, i32 4, <32 x i1> {{[^)]+}})
+  slm_scatter<float>(ioffset_n32, slm, mask_n32);
+  slm_scatter<float, 32>(ioffset_n32_view, slm, mask_n32);
+  slm_scatter<float, 32>(ioffset_n32, slm_view, mask_n32);
+  slm_scatter<float, 32>(ioffset_n32_view, slm_view, mask_n32);
+
+  // CHECK-COUNT-4: call void @llvm.masked.scatter.v32f32.v32p3(<32 x float> {{[^)]+}}, <32 x ptr addrspace(3)> {{[^)]+}}, i32 8, <32 x i1> {{[^)]+}})
+  slm_scatter<float>(ioffset_n32, slm, mask_n32, props_align8);
+  slm_scatter<float, 32>(ioffset_n32_view, slm, mask_n32, props_align8);
+  slm_scatter<float, 32>(ioffset_n32, slm_view, mask_n32, props_align8);
+  slm_scatter<float, 32>(ioffset_n32_view, slm_view, mask_n32, props_align8);
+
+  // 4) slm_gather(...): same as (1), (2), above, but with VS > 1.
+  // CHECK-COUNT-16: call void @llvm.genx.lsc.store.slm.v16i1.v16i32.v32i32(<16 x i1> {{[^)]+}}, i8 4, i8 0, i8 0, i16 1, i32 0, i8 3, i8 2, i8 1, i8 0, <16 x i32> {{[^)]+}}, <32 x i32>{{[^)]+}}, i32 0)
+  // 4a) check VS > 1. no 'mask' operand first.
+  slm_scatter<float, 32, 2>(ioffset_n16, slm);
+  slm_scatter<float, 32, 2>(ioffset_n16_view, slm);
+  slm_scatter<float, 32, 2>(ioffset_n16, slm_view);
+  slm_scatter<float, 32, 2>(ioffset_n16_view, slm_view);
+
+  slm_scatter<float, 32, 2>(ioffset_n16, slm, props_align4);
+  slm_scatter<float, 32, 2>(ioffset_n16_view, slm, props_align4);
+  slm_scatter<float, 32, 2>(ioffset_n16, slm_view, props_align4);
+  slm_scatter<float, 32, 2>(ioffset_n16_view, slm_view, props_align4);
+
+  // 4b) check VS > 1. Pass the 'mask' operand this time.
+  slm_scatter<float, 32, 2>(ioffset_n16, slm, mask_n16);
+  slm_scatter<float, 32, 2>(ioffset_n16_view, slm, mask_n16);
+  slm_scatter<float, 32, 2>(ioffset_n16, slm_view, mask_n16);
+  slm_scatter<float, 32, 2>(ioffset_n16_view, slm_view, mask_n16);
+
+  slm_scatter<float, 32, 2>(ioffset_n16, slm, mask_n16, props_align4);
+  slm_scatter<float, 32, 2>(ioffset_n16_view, slm, mask_n16, props_align4);
+  slm_scatter<float, 32, 2>(ioffset_n16, slm_view, mask_n16, props_align4);
+  slm_scatter<float, 32, 2>(ioffset_n16_view, slm_view, mask_n16, props_align4);
+
+  simd<uint32_t, 10> ioffset_n10(byte_offset32, 8);
+  simd<float, 10> usm_n10;
+  // Check special case to verify that for cases when N is not power of 2 llvm
+  // intrinsic is used
+  // CHECK-COUNT-1: call void @llvm.masked.scatter.v10f32.v10p3(<10 x float> {{[^)]+}}, <10 x ptr addrspace(3)> {{[^)]+}}, i32 4, <10 x i1> {{[^)]+}})
+  slm_scatter(ioffset_n10, usm_n10);
 }
diff --git a/sycl/unittests/allowlist/ParseAllowList.cpp b/sycl/unittests/allowlist/ParseAllowList.cpp
index faecab30aaeaf..40fbceb76616e 100644
--- a/sycl/unittests/allowlist/ParseAllowList.cpp
+++ b/sycl/unittests/allowlist/ParseAllowList.cpp
@@ -178,7 +178,8 @@ TEST(ParseAllowListTests, CheckAllValidBackendNameValuesAreProcessed) {
 
 TEST(ParseAllowListTests, CheckAllValidDeviceTypeValuesAreProcessed) {
   std::string AllowList;
-  for (const auto &SyclDeviceType : sycl::detail::getSyclDeviceTypeMap()) {
+  for (const auto &SyclDeviceType :
+       sycl::detail::getSyclDeviceTypeMap(true /*Enable 'acc'*/)) {
     if (!AllowList.empty())
       AllowList += "|";
     AllowList += "DeviceType:" + SyclDeviceType.first;