diff --git a/.github/workflows/sycl_aws.yml b/.github/workflows/sycl-aws.yml similarity index 100% rename from .github/workflows/sycl_aws.yml rename to .github/workflows/sycl-aws.yml diff --git a/.github/workflows/sycl_containers.yaml b/.github/workflows/sycl-containers.yaml similarity index 98% rename from .github/workflows/sycl_containers.yaml rename to .github/workflows/sycl-containers.yaml index 011d45ee9fee5..bb2e086929847 100644 --- a/.github/workflows/sycl_containers.yaml +++ b/.github/workflows/sycl-containers.yaml @@ -12,14 +12,14 @@ on: - 'devops/dependencies.json' - 'devops/scripts/install_drivers.sh' - 'devops/scripts/install_build_tools.sh' - - '.github/workflows/sycl_containers.yaml' + - '.github/workflows/sycl-containers.yaml' pull_request: paths: - 'devops/containers/**' - 'devops/dependencies.json' - 'devops/scripts/install_drivers.sh' - 'devops/scripts/install_build_tools.sh' - - '.github/workflows/sycl_containers.yaml' + - '.github/workflows/sycl-containers.yaml' jobs: base_image_ubuntu2204: diff --git a/.github/workflows/sycl_detect_changes.yml b/.github/workflows/sycl-detect-changes.yml similarity index 100% rename from .github/workflows/sycl_detect_changes.yml rename to .github/workflows/sycl-detect-changes.yml diff --git a/.github/workflows/gh_pages.yml b/.github/workflows/sycl-docs.yml similarity index 97% rename from .github/workflows/gh_pages.yml rename to .github/workflows/sycl-docs.yml index d0d4a130d6f78..73062642535cf 100644 --- a/.github/workflows/gh_pages.yml +++ b/.github/workflows/sycl-docs.yml @@ -7,7 +7,7 @@ on: branches: - sycl paths: - - '.github/workflows/gh_pages.yml' + - '.github/workflows/sycl-docs.yml' - 'clang/docs/**' - 'sycl/doc/**' diff --git a/.github/workflows/sycl_linux_build.yml b/.github/workflows/sycl-linux-build.yml similarity index 100% rename from .github/workflows/sycl_linux_build.yml rename to .github/workflows/sycl-linux-build.yml diff --git a/.github/workflows/sycl_linux_matrix_e2e_on_nightly.yml b/.github/workflows/sycl-linux-matrix-e2e-on-nightly.yml similarity index 94% rename from .github/workflows/sycl_linux_matrix_e2e_on_nightly.yml rename to .github/workflows/sycl-linux-matrix-e2e-on-nightly.yml index 50dc1ca700035..06a5f13674768 100644 --- a/.github/workflows/sycl_linux_matrix_e2e_on_nightly.yml +++ b/.github/workflows/sycl-linux-matrix-e2e-on-nightly.yml @@ -48,7 +48,7 @@ jobs: image: ghcr.io/intel/llvm/sycl_ubuntu2204_nightly:build image_options: -u 1001 --gpus all --cap-add SYS_ADMIN target_devices: ext_oneapi_cuda:gpu - uses: ./.github/workflows/sycl_linux_run_tests.yml + uses: ./.github/workflows/sycl-linux-run-tests.yml with: name: ${{ matrix.name }} runner: ${{ matrix. 
runner }} @@ -62,7 +62,7 @@ jobs: aws_start: name: AWS Start - uses: ./.github/workflows/sycl_aws.yml + uses: ./.github/workflows/sycl-aws.yml secrets: inherit with: mode: start @@ -71,7 +71,7 @@ jobs: linux_e2e_on_nightly_aws: name: '[AWS][CUDA] E2E on Nightly' needs: [aws_start] - uses: ./.github/workflows/sycl_linux_run_tests.yml + uses: ./.github/workflows/sycl-linux-run-tests.yml with: name: CUDA runner: '["aws-cuda_${{ github.run_id }}-${{ github.run_attempt }}"]' @@ -86,7 +86,7 @@ jobs: name: AWS Stop needs: [aws_start, linux_e2e_on_nightly_aws] if: always() - uses: ./.github/workflows/sycl_aws.yml + uses: ./.github/workflows/sycl-aws.yml secrets: inherit with: mode: stop diff --git a/.github/workflows/sycl_linux_precommit_aws.yml b/.github/workflows/sycl-linux-precommit-aws.yml similarity index 98% rename from .github/workflows/sycl_linux_precommit_aws.yml rename to .github/workflows/sycl-linux-precommit-aws.yml index 3fce40107d90e..8ff68e725e447 100644 --- a/.github/workflows/sycl_linux_precommit_aws.yml +++ b/.github/workflows/sycl-linux-precommit-aws.yml @@ -54,7 +54,7 @@ jobs: e2e-cuda: needs: [aws-start] - uses: ./.github/workflows/sycl_linux_run_tests.yml + uses: ./.github/workflows/sycl-linux-run-tests.yml with: name: CUDA E2E runner: '["aws_cuda-${{ github.event.workflow_run.id }}-${{ github.event.workflow_run.run_attempt }}"]' diff --git a/.github/workflows/sycl_linux_precommit.yml b/.github/workflows/sycl-linux-precommit.yml similarity index 94% rename from .github/workflows/sycl_linux_precommit.yml rename to .github/workflows/sycl-linux-precommit.yml index 8df5237d63d04..eb3f30aebcfe6 100644 --- a/.github/workflows/sycl_linux_precommit.yml +++ b/.github/workflows/sycl-linux-precommit.yml @@ -18,8 +18,8 @@ on: - 'clang/docs/**' - '**.md' - '**.rst' - - '.github/workflows/sycl_windows_*.yml' - - '.github/workflows/sycl_macos_*.yml' + - '.github/workflows/sycl-windows-*.yml' + - '.github/workflows/sycl-macos-*.yml' - 'devops/containers/**' - 'devops/actions/build_container/**' @@ -30,12 +30,12 @@ concurrency: jobs: detect_changes: - uses: ./.github/workflows/sycl_detect_changes.yml + uses: ./.github/workflows/sycl-detect-changes.yml build: needs: [detect_changes] if: always() && success() - uses: ./.github/workflows/sycl_linux_build.yml + uses: ./.github/workflows/sycl-linux-build.yml with: build_ref: ${{ github.sha }} merge_ref: '' @@ -75,7 +75,7 @@ jobs: install_drivers: ${{ contains(needs.detect_changes.outputs.filters, 'drivers') }} extra_lit_opts: --param matrix-xmx8=True --param gpu-intel-dg2=True env: '{"LIT_FILTER":"Matrix/"}' - uses: ./.github/workflows/sycl_linux_run_tests.yml + uses: ./.github/workflows/sycl-linux-run-tests.yml with: name: ${{ matrix.name }} runner: ${{ matrix. runner }} @@ -111,7 +111,7 @@ jobs: runner: '["Linux", "gen12"]' - name: Perf tests on Intel Arc A-Series Graphics system runner: '["Linux", "arc"]' - uses: ./.github/workflows/sycl_linux_run_tests.yml + uses: ./.github/workflows/sycl-linux-run-tests.yml with: name: ${{ matrix.name }} runner: ${{ matrix. 
runner }} diff --git a/.github/workflows/sycl_linux_run_tests.yml b/.github/workflows/sycl-linux-run-tests.yml similarity index 100% rename from .github/workflows/sycl_linux_run_tests.yml rename to .github/workflows/sycl-linux-run-tests.yml diff --git a/.github/workflows/sycl_macos_build_and_test.yml b/.github/workflows/sycl-macos-build-and-test.yml similarity index 100% rename from .github/workflows/sycl_macos_build_and_test.yml rename to .github/workflows/sycl-macos-build-and-test.yml diff --git a/.github/workflows/sycl_nightly.yml b/.github/workflows/sycl-nightly.yml similarity index 96% rename from .github/workflows/sycl_nightly.yml rename to .github/workflows/sycl-nightly.yml index e802af89a29b4..28df4a8b8de13 100644 --- a/.github/workflows/sycl_nightly.yml +++ b/.github/workflows/sycl-nightly.yml @@ -8,7 +8,7 @@ on: jobs: ubuntu2204_build: if: github.repository == 'intel/llvm' - uses: ./.github/workflows/sycl_linux_build.yml + uses: ./.github/workflows/sycl-linux-build.yml secrets: inherit with: build_cache_root: "/__w/" @@ -59,7 +59,7 @@ jobs: image: ghcr.io/intel/llvm/ubuntu2204_build:latest image_options: -u 1001 --gpus all --cap-add SYS_ADMIN target_devices: ext_oneapi_cuda:gpu - uses: ./.github/workflows/sycl_linux_run_tests.yml + uses: ./.github/workflows/sycl-linux-run-tests.yml with: name: ${{ matrix.name }} runner: ${{ matrix.runner }} @@ -74,7 +74,7 @@ jobs: sycl_toolchain_decompress_command: ${{ needs.ubuntu2204_build.outputs.artifact_decompress_command }} build-win: - uses: ./.github/workflows/sycl_windows_build.yml + uses: ./.github/workflows/sycl-windows-build.yml if: github.repository == 'intel/llvm' with: retention-days: 90 @@ -89,7 +89,7 @@ jobs: always() && !cancelled() && needs.build.outputs.build_conclusion == 'success' - uses: ./.github/workflows/sycl_windows_run_tests.yml + uses: ./.github/workflows/sycl-windows-run-tests.yml with: name: Intel GEN12 Graphics with Level Zero runner: '["Windows","gen12"]' diff --git a/.github/workflows/sycl_post_commit.yml b/.github/workflows/sycl-post-commit.yml similarity index 88% rename from .github/workflows/sycl_post_commit.yml rename to .github/workflows/sycl-post-commit.yml index 76dd3ac88e3d2..00205a52b03e1 100644 --- a/.github/workflows/sycl_post_commit.yml +++ b/.github/workflows/sycl-post-commit.yml @@ -12,10 +12,10 @@ on: - sycl - sycl-devops-pr/** paths: - - .github/workflows/sycl_post_commit.yml - - .github/workflows/sycl_linux_build.yml - - .github/workflows/sycl_linux_run_tests.yml - - .github/workflows/sycl_macos_build_and_test.yml + - .github/workflows/sycl-post-commit.yml + - .github/workflows/sycl-linux-build.yml + - .github/workflows/sycl-linux-run-tests.yml + - .github/workflows/sycl-macos-build-and-test.yml - ./devops/actions/cleanup - ./devops/actions/cached_checkout @@ -23,7 +23,7 @@ jobs: build-lin: name: Linux (Self build + shared libraries + no-assertions) if: github.repository == 'intel/llvm' - uses: ./.github/workflows/sycl_linux_build.yml + uses: ./.github/workflows/sycl-linux-build.yml with: build_cache_root: "/__w/llvm" build_cache_suffix: sprod_shared @@ -63,7 +63,7 @@ jobs: env: '{"LIT_FILTER":"PerformanceTests/"}' extra_lit_opts: -a -j 1 --param enable-perf-tests=True target_devices: all - uses: ./.github/workflows/sycl_linux_run_tests.yml + uses: ./.github/workflows/sycl-linux-run-tests.yml with: name: ${{ matrix.name }} runner: ${{ matrix. 
runner }} @@ -87,7 +87,7 @@ jobs: always() && success() && github.repository == 'intel/llvm' - uses: ./.github/workflows/sycl_windows_build.yml + uses: ./.github/workflows/sycl-windows-build.yml e2e-win: needs: build-win @@ -96,7 +96,7 @@ jobs: always() && !cancelled() && needs.build-win.outputs.build_conclusion == 'success' - uses: ./.github/workflows/sycl_windows_run_tests.yml + uses: ./.github/workflows/sycl-windows-run-tests.yml with: name: Intel GEN12 Graphics with Level Zero runner: '["Windows","gen12"]' @@ -105,4 +105,4 @@ jobs: macos_default: name: macOS if: github.repository == 'intel/llvm' - uses: ./.github/workflows/sycl_macos_build_and_test.yml + uses: ./.github/workflows/sycl-macos-build-and-test.yml diff --git a/.github/workflows/sycl_stale_issues.yml b/.github/workflows/sycl-stale-issues.yml similarity index 100% rename from .github/workflows/sycl_stale_issues.yml rename to .github/workflows/sycl-stale-issues.yml diff --git a/.github/workflows/sync-main.yml b/.github/workflows/sycl-sync-main.yml similarity index 100% rename from .github/workflows/sync-main.yml rename to .github/workflows/sycl-sync-main.yml diff --git a/.github/workflows/sycl_update_gpu_driver.yml b/.github/workflows/sycl-update-gpu-driver.yml similarity index 100% rename from .github/workflows/sycl_update_gpu_driver.yml rename to .github/workflows/sycl-update-gpu-driver.yml diff --git a/.github/workflows/sycl_windows_build.yml b/.github/workflows/sycl-windows-build.yml similarity index 100% rename from .github/workflows/sycl_windows_build.yml rename to .github/workflows/sycl-windows-build.yml diff --git a/.github/workflows/sycl_windows_precommit.yml b/.github/workflows/sycl-windows-precommit.yml similarity index 80% rename from .github/workflows/sycl_windows_precommit.yml rename to .github/workflows/sycl-windows-precommit.yml index 2b7883895d97c..3b96b463560a9 100644 --- a/.github/workflows/sycl_windows_precommit.yml +++ b/.github/workflows/sycl-windows-precommit.yml @@ -16,9 +16,9 @@ on: - 'clang/docs/**' - '**.md' - '**.rst' - - '.github/workflows/sycl_linux_*.yml' - - '.github/workflows/sycl_precommit_aws.yml' - - '.github/workflows/sycl_macos_*.yml' + - '.github/workflows/sycl-linux-*.yml' + - '.github/workflows/sycl-precommit-aws.yml' + - '.github/workflows/sycl-macos-*.yml' - 'devops/containers/**' - 'devops/actions/build_container/**' @@ -32,14 +32,14 @@ concurrency: jobs: detect_changes: - uses: ./.github/workflows/sycl_detect_changes.yml + uses: ./.github/workflows/sycl-detect-changes.yml build: needs: [detect_changes] if: | always() && success() && github.repository == 'intel/llvm' - uses: ./.github/workflows/sycl_windows_build.yml + uses: ./.github/workflows/sycl-windows-build.yml with: changes: ${{ needs.detect_changes.outputs.filters }} @@ -50,7 +50,7 @@ jobs: always() && !cancelled() && needs.build.outputs.build_conclusion == 'success' - uses: ./.github/workflows/sycl_windows_run_tests.yml + uses: ./.github/workflows/sycl-windows-run-tests.yml with: name: Intel GEN12 Graphics with Level Zero runner: '["Windows","gen12"]' diff --git a/.github/workflows/sycl_windows_run_tests.yml b/.github/workflows/sycl-windows-run-tests.yml similarity index 100% rename from .github/workflows/sycl_windows_run_tests.yml rename to .github/workflows/sycl-windows-run-tests.yml diff --git a/README.md b/README.md index bf72abab3e686..8ab5cda1621bb 100644 --- a/README.md +++ b/README.md @@ -10,8 +10,8 @@ For general contribution process see [CONTRIBUTING.md](./CONTRIBUTING.md) [![oneAPI 
logo](https://spec.oneapi.io/oneapi-logo-white-scaled.jpg)](https://www.oneapi.io/) -[![SYCL Post Commit](https://github.com/intel/llvm/actions/workflows/sycl_post_commit.yml/badge.svg?branch=sycl)](https://github.com/intel/llvm/actions/workflows/sycl_post_commit.yml) -[![Generate Doxygen documentation](https://github.com/intel/llvm/actions/workflows/gh_pages.yml/badge.svg?branch=sycl)](https://github.com/intel/llvm/actions/workflows/gh_pages.yml) +[![SYCL Post Commit](https://github.com/intel/llvm/actions/workflows/sycl-post-commit.yml/badge.svg?branch=sycl)](https://github.com/intel/llvm/actions/workflows/sycl-post-commit.yml) +[![Generate Doxygen documentation](https://github.com/intel/llvm/actions/workflows/sycl-docs.yml/badge.svg?branch=sycl)](https://github.com/intel/llvm/actions/workflows/sycl-docs.yml) The DPC++ is a LLVM-based compiler project that implements compiler and runtime support for the SYCL\* language. The project is hosted in the diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 2d0f05f6ca35b..e32047292ab8a 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -8404,6 +8404,9 @@ def : CLFlag<"O1">, Alias<_SLASH_O>, AliasArgs<["1"]>, HelpText<"Optimize for size (like /Og /Os /Oy /Ob2 /GF /Gy)">; def : CLFlag<"O2">, Alias<_SLASH_O>, AliasArgs<["2"]>, HelpText<"Optimize for speed (like /Og /Oi /Ot /Oy /Ob2 /GF /Gy)">; +def : CLFlag<"O3">, Alias<_SLASH_O>, AliasArgs<["3"]>, + HelpText<"Optimize for maximum speed and enable more aggressive optimizations" + " that may not improve performance on some programs">; def : CLFlag<"Ob0">, Alias<_SLASH_O>, AliasArgs<["b0"]>, HelpText<"Disable function inlining">; def : CLFlag<"Ob1">, Alias<_SLASH_O>, AliasArgs<["b1"]>, diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index ca1752b24b062..0a90073cf50d0 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -23271,21 +23271,11 @@ static bool hasFuncNameRequestedFPAccuracy(StringRef Name, return (FuncMapIt != LangOpts.FPAccuracyFuncMap.end()); } -llvm::CallInst *CodeGenFunction::EmitFPBuiltinIndirectCall( +llvm::CallInst *CodeGenFunction::MaybeEmitFPBuiltinofFD( llvm::FunctionType *IRFuncTy, const SmallVectorImpl &IRArgs, - llvm::Value *FnPtr, const FunctionDecl *FD) { - llvm::Function *Func; + llvm::Value *FnPtr, StringRef Name, unsigned FDBuiltinID) { unsigned FPAccuracyIntrinsicID = 0; - StringRef Name; - if (CurrentBuiltinID == 0) { - // Even if the current function doesn't have a clang builtin, create - // an 'fpbuiltin-max-error' attribute for it; unless it's marked with - // an NoBuiltin attribute. - if (FD->hasAttr() || - !FD->getNameInfo().getName().isIdentifier()) - return nullptr; - - Name = FD->getName(); + if (FDBuiltinID == 0) { FPAccuracyIntrinsicID = llvm::StringSwitch(Name) .Case("fadd", llvm::Intrinsic::fpbuiltin_fadd) @@ -23300,9 +23290,7 @@ llvm::CallInst *CodeGenFunction::EmitFPBuiltinIndirectCall( } else { // The function has a clang builtin. Create an attribute for it // only if it has an fpbuiltin intrinsic. 
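Note on the CGBuiltin.cpp hunk above: when the callee carries no Clang builtin ID, the renamed MaybeEmitFPBuiltinofFD maps the bare function name to an fpbuiltin intrinsic ID. A minimal sketch of that lookup, simplified from the patch (the helper name is illustrative only, not code from this change):

```cpp
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/IR/Intrinsics.h"

// Illustrative helper, not part of the patch: a result of 0 means "no
// fpbuiltin mapping", in which case the call is emitted normally and gets
// no 'fpbuiltin-max-error' attribute.
static unsigned mapNameToFPBuiltinIntrinsic(llvm::StringRef Name) {
  return llvm::StringSwitch<unsigned>(Name)
      .Case("fadd", llvm::Intrinsic::fpbuiltin_fadd)
      .Case("fdiv", llvm::Intrinsic::fpbuiltin_fdiv)
      .Case("cos", llvm::Intrinsic::fpbuiltin_cos)
      .Case("exp", llvm::Intrinsic::fpbuiltin_exp)
      .Default(0);
}
```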
- unsigned BuiltinID = getCurrentBuiltinID(); - Name = CGM.getContext().BuiltinInfo.getName(BuiltinID); - switch (BuiltinID) { + switch (FDBuiltinID) { default: // If the function has a clang builtin but doesn't have an // fpbuiltin, it will be generated with no 'fpbuiltin-max-error' @@ -23384,7 +23372,8 @@ llvm::CallInst *CodeGenFunction::EmitFPBuiltinIndirectCall( const LangOptions &LangOpts = getLangOpts(); if (hasFuncNameRequestedFPAccuracy(Name, LangOpts) || !LangOpts.FPAccuracyVal.empty()) { - Func = CGM.getIntrinsic(FPAccuracyIntrinsicID, IRArgs[0]->getType()); + llvm::Function *Func = + CGM.getIntrinsic(FPAccuracyIntrinsicID, IRArgs[0]->getType()); return CreateBuiltinCallWithAttr(*this, Name, Func, ArrayRef(IRArgs), FPAccuracyIntrinsicID); } diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp index d55023875f2fc..442059fb03789 100644 --- a/clang/lib/CodeGen/CGCall.cpp +++ b/clang/lib/CodeGen/CGCall.cpp @@ -5707,8 +5707,9 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo, if (!getLangOpts().FPAccuracyFuncMap.empty() || !getLangOpts().FPAccuracyVal.empty()) { const auto *FD = dyn_cast_if_present(TargetDecl); - if (FD) { - CI = EmitFPBuiltinIndirectCall(IRFuncTy, IRCallArgs, CalleePtr, FD); + if (FD && FD->getNameInfo().getName().isIdentifier()) { + CI = MaybeEmitFPBuiltinofFD(IRFuncTy, IRCallArgs, CalleePtr, + FD->getName(), FD->getBuiltinID()); if (CI) return RValue::get(CI); } diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h index 3ddd05cb53d8e..04c3a715da205 100644 --- a/clang/lib/CodeGen/CodeGenFunction.h +++ b/clang/lib/CodeGen/CodeGenFunction.h @@ -4434,9 +4434,10 @@ class CodeGenFunction : public CodeGenTypeCache { RValue EmitIntelSYCLPtrAnnotationBuiltin(const CallExpr *E); llvm::CallInst * - EmitFPBuiltinIndirectCall(llvm::FunctionType *IRFuncTy, - const SmallVectorImpl &IRArgs, - llvm::Value *FnPtr, const FunctionDecl *FD); + MaybeEmitFPBuiltinofFD(llvm::FunctionType *IRFuncTy, + const SmallVectorImpl &IRArgs, + llvm::Value *FnPtr, StringRef Name, + unsigned FDBuiltinID); enum class MSVCIntrin; llvm::Value *EmitMSVCBuiltinExpr(MSVCIntrin BuiltinID, const CallExpr *E); diff --git a/clang/lib/CodeGen/CodeGenTypes.cpp b/clang/lib/CodeGen/CodeGenTypes.cpp index 05fc203797f25..7642d3d79a666 100644 --- a/clang/lib/CodeGen/CodeGenTypes.cpp +++ b/clang/lib/CodeGen/CodeGenTypes.cpp @@ -342,8 +342,8 @@ llvm::Type *CodeGenTypes::ConvertSYCLJointMatrixINTELType(RecordDecl *RD) { if (CompTy->isStructTy()) { StringRef LlvmTyName = CompTy->getStructName(); // Emit half/int16/float for sycl[::*]::{half,bfloat16,tf32} - if (LlvmTyName.startswith("class.sycl::") || - LlvmTyName.startswith("class.__sycl_internal::")) + if (LlvmTyName.starts_with("class.sycl::") || + LlvmTyName.starts_with("class.__sycl_internal::")) LlvmTyName = LlvmTyName.rsplit("::").second; if (LlvmTyName == "half") { CompTy = llvm::Type::getHalfTy(getLLVMContext()); diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp index 21a563ce99c45..6265280877e9f 100644 --- a/clang/lib/Driver/Driver.cpp +++ b/clang/lib/Driver/Driver.cpp @@ -3518,7 +3518,7 @@ getLinkerArgs(Compilation &C, DerivedArgList &Args, bool IncludeObj = false) { // manner than the OpenMP processing. 
We should try and refactor this // to use the OpenMP flow (adding -l to the llvm-link step) auto resolveStaticLib = [&](StringRef LibName, bool IsStatic) -> bool { - if (!LibName.startswith("-l")) + if (!LibName.starts_with("-l")) return false; for (auto &LPath : LibPaths) { if (!IsStatic) { @@ -3663,7 +3663,7 @@ static bool IsSYCLDeviceLibObj(std::string ObjFilePath, bool isMSVCEnv) { StringRef ObjFileName = llvm::sys::path::filename(ObjFilePath); StringRef ObjSuffix = isMSVCEnv ? ".obj" : ".o"; bool Ret = - (ObjFileName.startswith("libsycl-") && ObjFileName.endswith(ObjSuffix)) + (ObjFileName.starts_with("libsycl-") && ObjFileName.ends_with(ObjSuffix)) ? true : false; return Ret; diff --git a/clang/lib/Driver/ToolChains/Linux.cpp b/clang/lib/Driver/ToolChains/Linux.cpp index ae33f79198692..01887706d8757 100644 --- a/clang/lib/Driver/ToolChains/Linux.cpp +++ b/clang/lib/Driver/ToolChains/Linux.cpp @@ -344,7 +344,7 @@ Linux::Linux(const Driver &D, const llvm::Triple &Triple, const ArgList &Args) // The deprecated -DLLVM_ENABLE_PROJECTS=libcxx configuration installs // libc++.so in D.Dir+"/../lib/". Detect this path. // TODO Remove once LLVM_ENABLE_PROJECTS=libcxx is unsupported. - if (StringRef(D.Dir).startswith(SysRoot) && + if (StringRef(D.Dir).starts_with(SysRoot) && (Args.hasArg(options::OPT_fsycl) || D.getVFS().exists(D.Dir + "/../lib/libsycl.so"))) addPathIfExists(D, D.Dir + "/../lib", Paths); diff --git a/clang/lib/Driver/ToolChains/MSVC.cpp b/clang/lib/Driver/ToolChains/MSVC.cpp index c1b912db596e8..52f811621ba82 100644 --- a/clang/lib/Driver/ToolChains/MSVC.cpp +++ b/clang/lib/Driver/ToolChains/MSVC.cpp @@ -930,6 +930,7 @@ static void TranslateOptArg(Arg *A, llvm::opt::DerivedArgList &DAL, break; case '1': case '2': + case '3': case 'x': case 'd': // Ignore /O[12xd] flags that aren't the last one on the command line. @@ -946,11 +947,14 @@ static void TranslateOptArg(Arg *A, llvm::opt::DerivedArgList &DAL, } else if (OptChar == '2' || OptChar == 'x') { DAL.AddFlagArg(A, Opts.getOption(options::OPT_fbuiltin)); DAL.AddJoinedArg(A, Opts.getOption(options::OPT_O), "2"); + } else if (OptChar == '3') { + DAL.AddFlagArg(A, Opts.getOption(options::OPT_fbuiltin)); + DAL.AddJoinedArg(A, Opts.getOption(options::OPT_O), "3"); } if (SupportsForcingFramePointer && !DAL.hasArgNoClaim(options::OPT_fno_omit_frame_pointer)) DAL.AddFlagArg(A, Opts.getOption(options::OPT_fomit_frame_pointer)); - if (OptChar == '1' || OptChar == '2') + if (OptChar == '1' || OptChar == '2' || OptChar == '3') DAL.AddFlagArg(A, Opts.getOption(options::OPT_ffunction_sections)); } break; @@ -1070,7 +1074,8 @@ MSVCToolChain::TranslateArgs(const llvm::opt::DerivedArgList &Args, // OptChar does not expand; it's an argument to the previous char. 
continue; } - if (OptChar == '1' || OptChar == '2' || OptChar == 'x' || OptChar == 'd') + if (OptChar == '1' || OptChar == '2' || OptChar == 'x' || + OptChar == 'd' || OptChar == '3') ExpandChar = OptStr.data() + I; } } diff --git a/clang/lib/Driver/ToolChains/SYCL.cpp b/clang/lib/Driver/ToolChains/SYCL.cpp index 32ad99ba384b0..ca68b97858027 100644 --- a/clang/lib/Driver/ToolChains/SYCL.cpp +++ b/clang/lib/Driver/ToolChains/SYCL.cpp @@ -401,8 +401,9 @@ const char *SYCL::Linker::constructLLVMLinkCommand( LibPostfix = ".cubin"; } StringRef LibSyclPrefix("libsycl-"); - if (!InputFilename.startswith(LibSyclPrefix) || - !InputFilename.endswith(LibPostfix) || (InputFilename.count('-') < 2)) + if (!InputFilename.starts_with(LibSyclPrefix) || + !InputFilename.ends_with(LibPostfix) || + (InputFilename.count('-') < 2)) return false; // Skip the prefix "libsycl-" std::string PureLibName = @@ -419,7 +420,7 @@ const char *SYCL::Linker::constructLLVMLinkCommand( PureLibName.substr(0, FinalDashPos) + PureLibName.substr(DotPos); } for (const auto &L : SYCLDeviceLibList) { - if (StringRef(PureLibName).startswith(L)) + if (StringRef(PureLibName).starts_with(L)) return true; } return false; @@ -1354,7 +1355,7 @@ void SYCLToolChain::AddImpliedTargetArgs(const llvm::Triple &Triple, for (auto *A : Args) { if (!A->getOption().matches(options::OPT_Xsycl_backend_EQ)) continue; - if (StringRef(A->getValue()).startswith("intel_gpu")) + if (StringRef(A->getValue()).starts_with("intel_gpu")) TargArgs.push_back(A->getValue(1)); } if (llvm::find_if(TargArgs, [&](auto Cur) { diff --git a/clang/lib/Driver/ToolChains/SYCL.h b/clang/lib/Driver/ToolChains/SYCL.h index 8e1c97dbbb546..2e1a30be76eda 100644 --- a/clang/lib/Driver/ToolChains/SYCL.h +++ b/clang/lib/Driver/ToolChains/SYCL.h @@ -125,7 +125,7 @@ constexpr char AmdGPU[] = "amd_gpu_"; template std::optional isGPUTarget(StringRef Target) { // Handle target specifications that resemble '(intel, nvidia, amd)_gpu_*' // here. - if (Target.startswith(GPUArh)) { + if (Target.starts_with(GPUArh)) { return resolveGenDevice(Target); } return std::nullopt; diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp index dcd567d32b698..2c5f907b4797e 100644 --- a/clang/lib/Sema/SemaExpr.cpp +++ b/clang/lib/Sema/SemaExpr.cpp @@ -282,11 +282,11 @@ bool Sema::DiagnoseUseOfDecl(NamedDecl *D, ArrayRef Locs, .Default(false); }; if ((getEmissionReason(FDecl) == Sema::DeviceDiagnosticReason::Sycl) && - Id && !Id->getName().startswith("__spirv_") && - !Id->getName().startswith("__sycl_") && - !Id->getName().startswith("__devicelib_ConvertBF16ToFINTEL") && - !Id->getName().startswith("__devicelib_ConvertFToBF16INTEL") && - !Id->getName().startswith("__assert_fail") && + Id && !Id->getName().starts_with("__spirv_") && + !Id->getName().starts_with("__sycl_") && + !Id->getName().starts_with("__devicelib_ConvertBF16ToFINTEL") && + !Id->getName().starts_with("__devicelib_ConvertFToBF16INTEL") && + !Id->getName().starts_with("__assert_fail") && !isMsvcMathFn(Id->getName())) { SYCLDiagIfDeviceCode( *Locs.begin(), diag::err_sycl_device_function_is_called_from_esimd, diff --git a/clang/lib/Sema/SemaSYCL.cpp b/clang/lib/Sema/SemaSYCL.cpp index e2a8d10ded1ab..d0f8ddbe7193c 100644 --- a/clang/lib/Sema/SemaSYCL.cpp +++ b/clang/lib/Sema/SemaSYCL.cpp @@ -4295,7 +4295,7 @@ static void CheckSYCL2020SubGroupSizes(Sema &S, FunctionDecl *SYCLKernel, // No need to validate __spirv routines here since they // are mapped to the equivalent SPIRV operations. 
const IdentifierInfo *II = FD->getIdentifier(); - if (II && II->getName().startswith("__spirv_")) + if (II && II->getName().starts_with("__spirv_")) return; // Else we need to figure out why they don't match. diff --git a/clang/test/CodeGen/fp-accuracy.c b/clang/test/CodeGen/fp-accuracy.c index 74de6a1f72c80..3fdde4443b8c8 100644 --- a/clang/test/CodeGen/fp-accuracy.c +++ b/clang/test/CodeGen/fp-accuracy.c @@ -177,7 +177,7 @@ double rsqrt(double); // CHECK-F3: call double @llvm.fpbuiltin.atanh.f64(double {{.*}}) #[[ATTR_F3_HIGH]] // CHECK-F3: call double @llvm.fpbuiltin.cos.f64(double {{.*}}) #[[ATTR_F3_HIGH]] // CHECK-F3: call double @llvm.fpbuiltin.cosh.f64(double {{.*}}) #[[ATTR_F3_HIGH]] -// CHECk-F3: call double @llvm.fpbuiltin.erf.f64(double {{.*}}) #[[ATTR_F3_HIGH]] +// CHECK-F3: call double @llvm.fpbuiltin.erf.f64(double {{.*}}) #[[ATTR_F3_HIGH]] // CHECK-F3: call double @llvm.fpbuiltin.erfc.f64(double {{.*}}) #[[ATTR_F3_HIGH]] // CHECK-F3: call double @llvm.fpbuiltin.exp.f64(double {{.*}}) #[[ATTR_F3_HIGH]] // CHECK-F3: call double @llvm.fpbuiltin.exp10.f64(double {{.*}}) #[[ATTR_F3_HIGH]] @@ -203,6 +203,19 @@ double rsqrt(double); // CHECK-F3: call double @llvm.fpbuiltin.tan.f64(double {{.*}}) #[[ATTR_F3_LOW:[0-9]+]] // CHECK-F3: call double @llvm.fpbuiltin.tanh.f64(double {{.*}}) #[[ATTR_F3_HIGH]] +// CHECK-F3-LABEL: define dso_local void @f2 +// CHECK-F3: call float @llvm.fpbuiltin.cos.f32(float {{.*}}) #[[ATTR_F3_HIGH]] +// CHECK-F3: call float @llvm.fpbuiltin.sin.f32(float {{.*}}) #[[ATTR_F3_HIGH]] +// CHECK-F3: call double @llvm.fpbuiltin.tan.f64(double {{.*}}) #[[ATTR_F3_LOW]] +// CHECK-F3: call double @llvm.fpbuiltin.log10.f64(double {{.*}}) #[[ATTR_F3_MEDIUM]] +// CHECK-F3: call void @llvm.fpbuiltin.sincos.f64(double {{.*}}, ptr {{.*}}, ptr {{.*}}) #[[ATTR_F3_MEDIUM]] + +// CHECK-F3-LABEL: define dso_local float @fake_exp10 + +// CHECK-F3-LABEL: define dso_local void @f4 +// CHECK-F3: call double @llvm.fpbuiltin.exp.f64(double {{.*}}) #[[ATTR_F3_HIGH]] +// CHECK-F3: call double @llvm.fpbuiltin.cos.f64(double {{.*}}) #[[ATTR_F3_HIGH]] + // CHECK-F3: attributes #[[ATTR_F3_HIGH]] = {{.*}}"fpbuiltin-max-error"="1.0" // CHECK-F3: attributes #[[ATTR_F3_MEDIUM]] = {{.*}}"fpbuiltin-max-error"="4.0" // CHECK-F3: attributes #[[ATTR_F3_LOW]] = {{.*}}"fpbuiltin-max-error"="67108864.0" @@ -414,7 +427,7 @@ void f1(float a, float b) { // CHECK-F2: call void @llvm.fpbuiltin.sincos.f64(double {{.*}}, ptr {{.*}}, ptr {{.*}}) #[[ATTR_F2_MEDIUM]] // CHECK-F2: call float @tanf(float {{.*}}) // -// CHECK-LABEL-F4: define dso_local void @f2 +// CHECK-F4-LABEL: define dso_local void @f2 // CHECK-F4: call float @llvm.fpbuiltin.cos.f32(float {{.*}}) #[[ATTR_F4_MEDIUM]] // CHECK-F4: call float @llvm.fpbuiltin.sin.f32(float {{.*}}) #[[ATTR_F4_MEDIUM]] // CHECK-F4: call double @llvm.fpbuiltin.tan.f64(double {{.*}}) #[[ATTR_F4_MEDIUM]] @@ -422,6 +435,17 @@ void f1(float a, float b) { // CHECK-F4: call void @llvm.fpbuiltin.sincos.f64(double {{.*}}, ptr {{.*}}, ptr {{.*}}) #[[ATTR_F4_MEDIUM]] // CHECK-F4: call float @tanf(float {{.*}}) // +// CHECK-F4-LABEL: define dso_local float @fake_exp10 + +// CHECK-F4-LABEL: define dso_local void @f3 +// CHECK-F4: call float @fake_exp10(float {{.*}}) + +// CHECK-F4-LABEL: define dso_local void @f4 +// CHECK-F4: call double @llvm.fpbuiltin.exp.f64(double {{.*}}) #[[ATTR_F4_MEDIUM]] +// CHECK-F4: call double @llvm.fpbuiltin.cos.f64(double {{.*}}) #[[ATTR_F4_MEDIUM]] + +// CHECK-F4: attributes #[[ATTR_F4_MEDIUM]] = {{.*}}"fpbuiltin-max-error"="4.0" + // 
CHECK-F5-LABEL: define dso_local void @f2 // CHECK-F5: call float @llvm.cos.f32(float {{.*}}) // CHECK-F5: call float @llvm.sin.f32(float {{.*}}) @@ -430,6 +454,15 @@ void f1(float a, float b) { // CHECK-F5: call i32 (double, ptr, ptr, ...) @sincos(double {{.*}}, ptr {{.*}}, ptr {{.*}}) // CHECK-F5: call float @tanf(float {{.*}}) // +// CHECK-F5-LABEL: define dso_local float @fake_exp10 + +// CHECK-F5-LABEL: define dso_local void @f3 +// CHECK-F5: call float @fake_exp10(float {{.*}}) + +// CHECK-F5-LABEL: define dso_local void @f4 +// CHECK-F5: call double @llvm.exp.f64(double {{.*}}) +// CHECK-F5: call double @llvm.fpbuiltin.cos.f64(double {{.*}}) #[[ATTR_F5_MEDIUM]] + // CHECK-F5: attributes #[[ATTR_F5_MEDIUM]] = {{.*}}"fpbuiltin-max-error"="4.0" // CHECK-F5: attributes #[[ATTR_F5_HIGH]] = {{.*}}"fpbuiltin-max-error"="1.0" // @@ -441,6 +474,15 @@ void f1(float a, float b) { // CHECK-F6: call void @llvm.fpbuiltin.sincos.f64(double {{.*}}, ptr {{.*}}, ptr {{.*}}) #[[ATTR_F6_MEDIUM]] // CHECK-F6: call float @tanf(float {{.*}}) #[[ATTR8:[0-9]+]] // +// CHECK-F6-LABEL: define dso_local float @fake_exp10 +// +// CHECK-F6-LABEL: define dso_local void @f3 +// CHECK-F6: call float @fake_exp10(float {{.*}}) + +// CHECK-F6-LABEL: define dso_local void @f4 +// CHECK-F6: call double @llvm.fpbuiltin.exp.f64(double {{.*}}) #[[ATTR_F6_MEDIUM]] +// CHECK-F6: call double @llvm.fpbuiltin.cos.f64(double {{.*}}) #[[ATTR_F6_MEDIUM]] + // CHECK-F6: attributes #[[ATTR_F6_MEDIUM]] = {{.*}}"fpbuiltin-max-error"="4.0" // CHECK-F6: attributes #[[ATTR_F6_HIGH]] = {{.*}}"fpbuiltin-max-error"="1.0" // @@ -454,12 +496,36 @@ void f1(float a, float b) { // CHECK-LABEL: define dso_local void @f3 // CHECK: call float @fake_exp10(float {{.*}}) + +// CHECK-LABEL: define dso_local void @f4 +// CHECK: call double @llvm.fpbuiltin.exp.f64(double {{.*}}) #[[ATTR_HIGH]] +// CHECK: call double @llvm.fpbuiltin.cos.f64(double {{.*}}) #[[ATTR_HIGH]] + +// CHECK-F1-LABEL: define dso_local void @f3 // CHECK-F1: call float @fake_exp10(float {{.*}}) + +// CHECK-F1-LABEL: define dso_local void @f4 +// CHECK-F1: call double @llvm.exp.f64(double {{.*}}) +// CHECK-F1: call double @llvm.fpbuiltin.cos.f64(double {{.*}}) #[[ATTR_F1_HIGH]] + +// CHECK-F2-LABEL: define dso_local float @fake_exp10 + +// CHECK-F2-LABEL: define dso_local void @f3 // CHECK-F2: call float @fake_exp10(float {{.*}}) +// CHECK-F2-LABEL: define dso_local void @f4 +// CHECK-F2: call double @llvm.fpbuiltin.exp.f64(double {{.*}}) #[[ATTR_F2_MEDIUM]] +// CHECK-F2: call double @llvm.fpbuiltin.cos.f64(double {{.*}}) #[[ATTR_F2_CUDA]] + +// CHECK-SPIR-LABEL: define dso_local spir_func float @fake_exp10 +// // CHECK-SPIR-LABEL: define dso_local spir_func void @f3 // CHECK-SPIR: call spir_func float @fake_exp10(float {{.*}}) +// CHECK-SPIR-LABEL: define dso_local spir_func void @f4 +// CHECK-SPIR: call double @llvm.fpbuiltin.exp.f64(double {{.*}}) #[[ATTR_SYCL5]] +// CHECK-SPIR: call double @llvm.fpbuiltin.cos.f64(double {{.*}}) #[[ATTR_SYCL1]] + // CHECK: attributes #[[ATTR_HIGH]] = {{.*}}"fpbuiltin-max-error"="1.0" // CHECK-F1: attributes #[[ATTR_F1_HIGH]] = {{.*}}"fpbuiltin-max-error"="1.0" @@ -526,6 +592,10 @@ void f1(float a, float b) { // CHECK-DEFAULT-LABEL: define dso_local void @f3 // CHECK-DEFAULT: call float @fake_exp10(float {{.*}}) +// CHECK-DEFAULT-LABEL: define dso_local void @f4 +// CHECK-DEFAULT: call double @llvm.exp.f64(double {{.*}}) +// CHECK-DEFAULT: call double @llvm.cos.f64(double {{.*}}) + void f2(float a, float b) { float sin = 0.f, cos = 0.f; @@ -541,3 
+611,16 @@ float fake_exp10(float a) __attribute__((no_builtin)){} void f3(float a, float b) { a = fake_exp10(b); } + +#define sz 2 +double in[sz]; +double out[sz]; + +double getInput(int i) { + return in[i]; +} + +void f4() { + for (int i = 0; i < sz; i++) + out[i] = cos(exp(getInput(i))); +} diff --git a/clang/test/Driver/Xarch.c b/clang/test/Driver/Xarch.c index f7693fb689d58..ec58211d6fba0 100644 --- a/clang/test/Driver/Xarch.c +++ b/clang/test/Driver/Xarch.c @@ -1,10 +1,10 @@ -// RUN: %clang -target i386-apple-darwin11 -m32 -Xarch_i386 -O3 %s -S -### 2>&1 | FileCheck -check-prefix=O3ONCE %s -// O3ONCE: "-O3" -// O3ONCE-NOT: "-O3" +// RUN: %clang -target i386-apple-darwin11 -m32 -Xarch_i386 -O5 %s -S -### 2>&1 | FileCheck -check-prefix=O5ONCE %s +// O5ONCE: "-O5" +// O5ONCE-NOT: "-O5" -// RUN: %clang -target i386-apple-darwin11 -m64 -Xarch_i386 -O3 %s -S -### 2>&1 | FileCheck -check-prefix=O3NONE %s -// O3NONE-NOT: "-O3" -// O3NONE: argument unused during compilation: '-Xarch_i386 -O3' +// RUN: %clang -target i386-apple-darwin11 -m64 -Xarch_i386 -O5 %s -S -### 2>&1 | FileCheck -check-prefix=O5NONE %s +// O5NONE-NOT: "-O5" +// O5NONE: argument unused during compilation: '-Xarch_i386 -O5' // RUN: not %clang -target i386-apple-darwin11 -m32 -Xarch_i386 -o -Xarch_i386 -S %s -S -Xarch_i386 -o 2>&1 | FileCheck -check-prefix=INVALID %s // INVALID: error: invalid Xarch argument: '-Xarch_i386 -o' diff --git a/clang/test/Driver/cl-options.c b/clang/test/Driver/cl-options.c index 5b6dfe308a76e..2f60637f7c010 100644 --- a/clang/test/Driver/cl-options.c +++ b/clang/test/Driver/cl-options.c @@ -207,6 +207,10 @@ // RUN: %clang_cl --target=i686-pc-win32 -Werror -Wno-msvc-not-found /O2 /O2 -### -- %s 2>&1 | FileCheck -check-prefix=O2O2 %s // O2O2: "-O2" +// RUN: %clang_cl --target=i686-pc-win32 -Werror -Wno-msvc-not-found /O3 -### -- %s 2>&1 | FileCheck -check-prefix=O3 %s +// O3: -mframe-pointer=none +// O3: -O3 + // RUN: %clang_cl /Zs -Werror /Oy -- %s 2>&1 // RUN: %clang_cl --target=i686-pc-win32 -Werror -Wno-msvc-not-found /Oy- -### -- %s 2>&1 | FileCheck -check-prefix=Oy_ %s diff --git a/clang/tools/clang-offload-deps/ClangOffloadDeps.cpp b/clang/tools/clang-offload-deps/ClangOffloadDeps.cpp index c9fa2f87a072f..5dbf044675106 100644 --- a/clang/tools/clang-offload-deps/ClangOffloadDeps.cpp +++ b/clang/tools/clang-offload-deps/ClangOffloadDeps.cpp @@ -173,7 +173,7 @@ int main(int argc, const char **argv) { // possibly reusing ClangOffloadBundler's 'OffloadTargetInfo'. 
for (const std::string &Target : Targets) { std::string Prefix = Target + "."; - if (Symbol.startswith(Prefix)) + if (Symbol.starts_with(Prefix)) Target2Symbols[Target].insert( Symbol.substr(Prefix.size(), Len - Prefix.size())); } diff --git a/llvm/lib/SYCLLowerIR/ESIMD/ESIMDUtils.cpp b/llvm/lib/SYCLLowerIR/ESIMD/ESIMDUtils.cpp index e104ab115e970..480ba1944ebba 100644 --- a/llvm/lib/SYCLLowerIR/ESIMD/ESIMDUtils.cpp +++ b/llvm/lib/SYCLLowerIR/ESIMD/ESIMDUtils.cpp @@ -34,13 +34,13 @@ constexpr char SLM_ALLOCATOR_DTOR_SUFFIX[] = "EED2Ev"; bool isSlmAllocatorConstructor(const Function &F) { auto Name = F.getName(); return Name.starts_with(SLM_ALLOCATOR_CTOR_DTOR_PREFIX) && - Name.endswith(SLM_ALLOCATOR_CTOR_SUFFIX); + Name.ends_with(SLM_ALLOCATOR_CTOR_SUFFIX); } bool isSlmAllocatorDestructor(const Function &F) { auto Name = F.getName(); return Name.starts_with(SLM_ALLOCATOR_CTOR_DTOR_PREFIX) && - Name.endswith(SLM_ALLOCATOR_DTOR_SUFFIX); + Name.ends_with(SLM_ALLOCATOR_DTOR_SUFFIX); } bool isSlmInit(const Function &F) { diff --git a/llvm/lib/SYCLLowerIR/ESIMD/LowerESIMD.cpp b/llvm/lib/SYCLLowerIR/ESIMD/LowerESIMD.cpp index 9a3aa8437a590..6ec1102f402ba 100644 --- a/llvm/lib/SYCLLowerIR/ESIMD/LowerESIMD.cpp +++ b/llvm/lib/SYCLLowerIR/ESIMD/LowerESIMD.cpp @@ -1596,15 +1596,15 @@ SmallPtrSet collectGenXVolatileTypes(Module &M) { // TODO FIXME relying on type name in LLVM IR is fragile, needs rework if (!GTy || !GTy->getName() .rtrim(".0123456789") - .endswith("sycl::_V1::ext::intel::esimd::simd")) + .ends_with("sycl::_V1::ext::intel::esimd::simd")) continue; assert(GTy->getNumContainedTypes() == 1); auto VTy = GTy->getContainedType(0); if ((GTy = dyn_cast(VTy))) { - assert( - GTy->getName() - .rtrim(".0123456789") - .endswith("sycl::_V1::ext::intel::esimd::detail::simd_obj_impl")); + assert(GTy->getName() + .rtrim(".0123456789") + .ends_with( + "sycl::_V1::ext::intel::esimd::detail::simd_obj_impl")); VTy = GTy->getContainedType(0); } assert(VTy->isVectorTy()); diff --git a/llvm/lib/SYCLLowerIR/PrepareSYCLNativeCPU.cpp b/llvm/lib/SYCLLowerIR/PrepareSYCLNativeCPU.cpp index 264a8b4bc817a..2624e26116823 100644 --- a/llvm/lib/SYCLLowerIR/PrepareSYCLNativeCPU.cpp +++ b/llvm/lib/SYCLLowerIR/PrepareSYCLNativeCPU.cpp @@ -283,7 +283,7 @@ static Function *addGetFunc(Module &M, StringRef Name, Type *StateType) { static Function *addReplaceFunc(Module &M, StringRef Name, Type *StateType) { Function *Res; const char GetPrefix[] = "__dpcpp_nativecpu_get"; - if (Name.startswith(GetPrefix)) { + if (Name.starts_with(GetPrefix)) { Res = addGetFunc(M, Name, StateType); } else if (Name == NativeCPUSetLocalId) { Res = addSetLocalIdFunc(M, Name, StateType); diff --git a/llvm/lib/Support/PropertySetIO.cpp b/llvm/lib/Support/PropertySetIO.cpp index 2a3586ba49339..ffb6394913fc9 100644 --- a/llvm/lib/Support/PropertySetIO.cpp +++ b/llvm/lib/Support/PropertySetIO.cpp @@ -37,7 +37,7 @@ PropertySetRegistry::read(const MemoryBuffer *Buf) { for (line_iterator LI(*Buf); !LI.is_at_end(); LI++) { // see if this line starts a new property set - if (LI->startswith("[")) { + if (LI->starts_with("[")) { // yes - parse the category (property name) auto EndPos = LI->rfind(']'); if (EndPos == StringRef::npos) diff --git a/llvm/lib/Support/SimpleTable.cpp b/llvm/lib/Support/SimpleTable.cpp index 434e7d7cb7670..ec47391450100 100644 --- a/llvm/lib/Support/SimpleTable.cpp +++ b/llvm/lib/Support/SimpleTable.cpp @@ -214,8 +214,8 @@ Expected SimpleTable::read(MemoryBuffer *Buf, return std::make_unique(); UPtrTy Res; - if 
(LI->startswith(COL_TITLE_LINE_OPEN)) { - if (!LI->endswith(COL_TITLE_LINE_CLOSE)) + if (LI->starts_with(COL_TITLE_LINE_OPEN)) { + if (!LI->ends_with(COL_TITLE_LINE_CLOSE)) return createStringError(errc::invalid_argument, "malformed title line"); // column titles present StringRef L = LI->substr(1, LI->size() - 2); // trim '[' and ']' diff --git a/llvm/lib/TargetParser/Triple.cpp b/llvm/lib/TargetParser/Triple.cpp index f0c4cd875f6fe..1e923ebae21f9 100644 --- a/llvm/lib/TargetParser/Triple.cpp +++ b/llvm/lib/TargetParser/Triple.cpp @@ -726,7 +726,7 @@ static Triple::SubArchType parseSubArch(StringRef SubArchName) { (SubArchName.ends_with("r6el") || SubArchName.ends_with("r6"))) return Triple::MipsSubArch_r6; - if (SubArchName.startswith("spir")) { + if (SubArchName.starts_with("spir")) { StringRef SA(SubArchName); if (SA.consume_front("spir64_") || SA.consume_front("spir_")) { if (SA == "fpga") diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp index 1891e10679a0f..f37f1fdcfc2e9 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp @@ -838,7 +838,7 @@ Instruction *InstCombinerImpl::visitTrunc(TruncInst &Trunc) { // // extractelement <8 x i32> (bitcast <4 x i64> %X to <8 x i32>), i32 0 // ``` // can't be lowered by SPIR-V translator to "standard" format. - if (StringRef(Trunc.getModule()->getTargetTriple()).startswith("spir")) + if (StringRef(Trunc.getModule()->getTargetTriple()).starts_with("spir")) return nullptr; // Whenever an element is extracted from a vector, and then truncated, diff --git a/llvm/lib/Transforms/Instrumentation/SPIRITTAnnotations.cpp b/llvm/lib/Transforms/Instrumentation/SPIRITTAnnotations.cpp index b8dd7cda8883c..105643062aa90 100644 --- a/llvm/lib/Transforms/Instrumentation/SPIRITTAnnotations.cpp +++ b/llvm/lib/Transforms/Instrumentation/SPIRITTAnnotations.cpp @@ -243,7 +243,7 @@ bool insertAtomicInstrumentationCall(Module &M, StringRef Name, PreservedAnalyses SPIRITTAnnotationsPass::run(Module &M, ModuleAnalysisManager &MAM) { - assert(StringRef(M.getTargetTriple()).startswith("spir")); + assert(StringRef(M.getTargetTriple()).starts_with("spir")); bool IRModified = false; std::vector SPIRVCrossWGInstuctions = { SPIRV_CONTROL_BARRIER, SPIRV_GROUP_ALL, SPIRV_GROUP_ANY, @@ -299,7 +299,7 @@ PreservedAnalyses SPIRITTAnnotationsPass::run(Module &M, if (std::any_of(SPIRVCrossWGInstuctions.begin(), SPIRVCrossWGInstuctions.end(), [&CalleeName](StringRef Name) { - return CalleeName.startswith(Name); + return CalleeName.starts_with(Name); })) { Instruction *InstAfterBarrier = CI->getNextNode(); const DebugLoc &DL = CI->getDebugLoc(); @@ -307,7 +307,7 @@ PreservedAnalyses SPIRITTAnnotationsPass::run(Module &M, insertSimpleInstrumentationCall(M, ITT_ANNOTATION_WI_RESUME, InstAfterBarrier, DL); IRModified = true; - } else if (CalleeName.startswith(SPIRV_ATOMIC_INST)) { + } else if (CalleeName.starts_with(SPIRV_ATOMIC_INST)) { Instruction *InstAfterAtomic = CI->getNextNode(); IRModified |= insertAtomicInstrumentationCall( M, ITT_ANNOTATION_ATOMIC_START, CI, CI, CalleeName); diff --git a/llvm/tools/sycl-post-link/ModuleSplitter.cpp b/llvm/tools/sycl-post-link/ModuleSplitter.cpp index a31ad613f9918..87cbf42da2df2 100644 --- a/llvm/tools/sycl-post-link/ModuleSplitter.cpp +++ b/llvm/tools/sycl-post-link/ModuleSplitter.cpp @@ -79,7 +79,7 @@ bool isSpirvSyclBuiltin(StringRef FName) { // now skip the digits FName = FName.drop_while([](char C) { 
return std::isdigit(C); }); - return FName.startswith("__spirv_") || FName.startswith("__sycl_"); + return FName.starts_with("__spirv_") || FName.starts_with("__sycl_"); } // Return true if the function is a ESIMD builtin @@ -91,12 +91,12 @@ bool isESIMDBuiltin(StringRef FName) { // now skip the digits FName = FName.drop_while([](char C) { return std::isdigit(C); }); - return FName.startswith("__esimd_"); + return FName.starts_with("__esimd_"); } // Return true if the function name starts with "__builtin_" bool isGenericBuiltin(StringRef FName) { - return FName.startswith("__builtin_"); + return FName.starts_with("__builtin_"); } bool isKernel(const Function &F) { diff --git a/llvm/tools/sycl-post-link/SYCLDeviceLibReqMask.cpp b/llvm/tools/sycl-post-link/SYCLDeviceLibReqMask.cpp index 6ca581d8caf14..1b6cdefb9a541 100644 --- a/llvm/tools/sycl-post-link/SYCLDeviceLibReqMask.cpp +++ b/llvm/tools/sycl-post-link/SYCLDeviceLibReqMask.cpp @@ -696,7 +696,7 @@ uint32_t llvm::getSYCLDeviceLibReqMask(const Module &M) { return 0; uint32_t ReqMask = 0; for (const Function &SF : M) { - if (SF.getName().startswith(DEVICELIB_FUNC_PREFIX) && SF.isDeclaration()) { + if (SF.getName().starts_with(DEVICELIB_FUNC_PREFIX) && SF.isDeclaration()) { assert(SF.getCallingConv() == CallingConv::SPIR_FUNC); uint32_t DeviceLibBits = getDeviceLibBits(SF.getName().str()); ReqMask |= DeviceLibBits; diff --git a/llvm/tools/sycl-post-link/SpecConstants.cpp b/llvm/tools/sycl-post-link/SpecConstants.cpp index dc5f1b2fd3f2c..66c2cc5ef1818 100644 --- a/llvm/tools/sycl-post-link/SpecConstants.cpp +++ b/llvm/tools/sycl-post-link/SpecConstants.cpp @@ -801,8 +801,8 @@ PreservedAnalyses SpecConstantsPass::run(Module &M, if (!F.isDeclaration()) continue; - if (!F.getName().startswith(SYCL_GET_SCALAR_2020_SPEC_CONST_VAL) && - !F.getName().startswith(SYCL_GET_COMPOSITE_2020_SPEC_CONST_VAL)) + if (!F.getName().starts_with(SYCL_GET_SCALAR_2020_SPEC_CONST_VAL) && + !F.getName().starts_with(SYCL_GET_COMPOSITE_2020_SPEC_CONST_VAL)) continue; SmallVector SCIntrCalls; @@ -1014,8 +1014,8 @@ bool SpecConstantsPass::collectSpecConstantDefaultValuesMetadata( bool llvm::checkModuleContainsSpecConsts(const Module &M) { for (const Function &F : M.functions()) { - if (F.getName().startswith(SYCL_GET_SCALAR_2020_SPEC_CONST_VAL) || - F.getName().startswith(SYCL_GET_COMPOSITE_2020_SPEC_CONST_VAL)) + if (F.getName().starts_with(SYCL_GET_SCALAR_2020_SPEC_CONST_VAL) || + F.getName().starts_with(SYCL_GET_COMPOSITE_2020_SPEC_CONST_VAL)) return true; } diff --git a/llvm/tools/sycl-post-link/sycl-post-link.cpp b/llvm/tools/sycl-post-link/sycl-post-link.cpp index 69f43b8b486ea..50cb225f6cb18 100644 --- a/llvm/tools/sycl-post-link/sycl-post-link.cpp +++ b/llvm/tools/sycl-post-link/sycl-post-link.cpp @@ -340,7 +340,7 @@ std::string makeResultFileName(Twine Ext, int I, StringRef Suffix) { : sys::path::parent_path(OutputFilename); const StringRef Sep = sys::path::get_separator(); std::string Dir = Dir0.str(); - if (!Dir0.empty() && !Dir0.endswith(Sep)) + if (!Dir0.empty() && !Dir0.ends_with(Sep)) Dir += Sep.str(); return Dir + sys::path::stem(OutputFilename).str() + Suffix.str() + "_" + std::to_string(I) + Ext.str(); diff --git a/sycl-fusion/common/lib/NDRangesHelper.cpp b/sycl-fusion/common/lib/NDRangesHelper.cpp index 96f26d96a4ea5..7c418022ef4ee 100644 --- a/sycl-fusion/common/lib/NDRangesHelper.cpp +++ b/sycl-fusion/common/lib/NDRangesHelper.cpp @@ -170,6 +170,18 @@ jit_compiler::FusedNDRange::get(ArrayRef NDRanges) { "Cannot fuse kernels whose fusion would " 
"yield non-uniform work-group sizes"); } + + // Work-items in the same work-group in the original ND-ranges must be in + // the same work-group in the fused one. + if (LocalSize && any_of(NDRanges, [&Fused](const NDRange &NDR) { + return NDR.hasSpecificLocalSize() && requireIDRemapping(Fused, NDR); + })) { + return createStringError( + inconvertibleErrorCode(), + "Cannot fuse kernels when any of the fused kernels with a specific " + "local size has different global sizes in dimensions [2, N) or " + "different number of dimensions"); + } } return FusedNDRange{Fused, IsHeterogeneousList, NDRanges}; diff --git a/sycl/doc/design/KernelFusionJIT.md b/sycl/doc/design/KernelFusionJIT.md index b83a8a26eeae0..224a1984d2902 100644 --- a/sycl/doc/design/KernelFusionJIT.md +++ b/sycl/doc/design/KernelFusionJIT.md @@ -212,7 +212,8 @@ These restrictions can be simplified to: - No two local sizes specified by the nd-ranges will be different; - No global id remapping is needed ([see](#work-item-remapping)) or all input offsets are 0; -- All the fused nd-ranges must have the same offset. +- All the fused nd-ranges must have the same offset; +- No global id remapping is needed for kernels specifying a local size. As we can see, there is no restrictions in the number of dimensions or global sizes of the input nd-ranges. diff --git a/sycl/include/sycl/detail/pi.h b/sycl/include/sycl/detail/pi.h index 4d0da9995908a..31c6a86f3300d 100644 --- a/sycl/include/sycl/detail/pi.h +++ b/sycl/include/sycl/detail/pi.h @@ -149,9 +149,11 @@ // 14.40 Add HIP _pi_mem_advice alises to match the PI_MEM_ADVICE_CUDA* ones. // 14.41 Added piextCommandBufferMemBufferFill & piextCommandBufferFillUSM // 14.42 Added piextCommandBufferPrefetchUSM and piextCommandBufferAdviseUSM +// 15.43 Changed the signature of piextMemGetNativeHandle to also take a +// pi_device -#define _PI_H_VERSION_MAJOR 14 -#define _PI_H_VERSION_MINOR 42 +#define _PI_H_VERSION_MAJOR 15 +#define _PI_H_VERSION_MINOR 43 #define _PI_STRING_HELPER(a) #a #define _PI_CONCAT(a, b) _PI_STRING_HELPER(a.b) @@ -1424,8 +1426,9 @@ __SYCL_EXPORT pi_result piMemBufferPartition( /// Gets the native handle of a PI mem object. /// /// \param mem is the PI mem to get the native handle of. +/// \param dev is the PI device that the native allocation will be resident on /// \param nativeHandle is the native handle of mem. -__SYCL_EXPORT pi_result piextMemGetNativeHandle(pi_mem mem, +__SYCL_EXPORT pi_result piextMemGetNativeHandle(pi_mem mem, pi_device dev, pi_native_handle *nativeHandle); /// Creates PI mem object from a native handle. diff --git a/sycl/include/sycl/ext/intel/esimd/memory.hpp b/sycl/include/sycl/ext/intel/esimd/memory.hpp index 7af684ee99cfb..5d8f13ce82619 100644 --- a/sycl/include/sycl/ext/intel/esimd/memory.hpp +++ b/sycl/include/sycl/ext/intel/esimd/memory.hpp @@ -179,6 +179,45 @@ __ESIMD_API simd gather_impl(const T *p, simd offsets, return lsc_format_ret(Result); } +/// USM pointer scatter. +/// Supported platforms: DG2, PVC +/// VISA instruction: lsc_store.ugm +/// +/// Scatters elements to specific address. +/// +/// @tparam T is element type. +/// @tparam NElts is the number of elements to store per address. +/// @tparam DS is the data size. +/// @tparam L1H is L1 cache hint. +/// @tparam L2H is L2 cache hint. +/// @tparam N is the number of channels (platform dependent). +/// @param p is the base pointer. +/// @param offsets is the zero-based offsets in bytes. +/// @param vals is values to store. +/// @param pred is predicates. 
+/// +template +__ESIMD_API void scatter_impl(T *p, simd offsets, + simd vals, simd_mask pred) { + static_assert(std::is_integral_v, "Unsupported offset type"); + check_lsc_vector_size(); + check_lsc_data_size(); + check_cache_hint(); + constexpr uint16_t AddressScale = 1; + constexpr int ImmOffset = 0; + constexpr lsc_data_size EDS = expand_data_size(finalize_data_size()); + constexpr lsc_vector_size VS = to_lsc_vector_size(); + constexpr lsc_data_order Transposed = lsc_data_order::nontranspose; + using MsgT = typename lsc_expand_type::type; + simd addrs = reinterpret_cast(p); + addrs += convert(offsets); + simd Tmp = lsc_format_input(vals); + __esimd_lsc_store_stateless(pred.data(), addrs.data(), + Tmp.data()); +} + // Returns true iff it is Ok to use llvm.masked.gather and llvm.masked.scatter. // By default (without use specifying __ESIMD_GATHER_SCATTER_LLVM_IR) it is // not used because of an issue in GPU driver, which does not recognize @@ -616,44 +655,203 @@ gather(const Tx *p, Toffset offset, simd_mask mask = 1) { return gather(p, simd(offset), mask); } +/// template +/// void scatter(T *p, simd byte_offsets, simd vals, +/// simd_mask mask, PropertyListT props = {}); // (usm-sc-1) + +/// template +/// void scatter(T *p, simd byte_offsets, simd vals, +/// PropertyListT props = {}); // (usm-sc-2) + +/// The next two functions are similar to usm-sc-{1,2} with the 'byte_offsets' +/// parameter represerented as 'simd_view'. + +/// template +/// void scatter(T *p, OffsetSimdViewT byte_offsets, simd vals, +/// simd_mask mask, PropertyListT props = {}); // (usm-sc-3) + +/// template +/// void scatter(T *p, OffsetSimdViewT byte_offsets, simd vals, +/// PropertyListT props = {}); // (usm-sc-4) + +/// template +/// void scatter(T *p, simd byte_offsets, simd vals, +/// simd_mask mask, PropertyListT props = {}); // (usm-sc-1) +/// /// Writes ("scatters") elements of the input vector to different memory /// locations. Each memory location is base address plus an offset - a /// value of the corresponding element in the input offset vector. Access to /// any element's memory location can be disabled via the input mask. -/// @tparam Tx Element type, must be of size 4 or less. -/// @tparam N Number of elements to write; can be \c 1, \c 2, \c 4, \c 8, \c 16 -/// or \c 32. +/// @tparam T Element type. +/// @tparam N Number of elements to write. +/// @tparam VS Vector size. It can also be read as the number of writes per each +/// address. The parameter 'N' must be divisible by 'VS'. (VS > 1) is supported +/// only on DG2 and PVC and only for 4- and 8-byte element vectors. /// @param p The base address. -/// @param offsets A vector of 32-bit or 64-bit offsets in bytes. For each lane -/// \c i, ((byte*)p + offsets[i]) must be element size aligned. +/// @param byte_offsets the vector of 32-bit or 64-bit offsets in bytes. +/// For each i, ((byte*)p + byte_offsets[i]) must be element size aligned. +/// If the alignment property is not passed, then it is assumed that each +/// accessed address is aligned by element-size. /// @param vals The vector to scatter. -/// @param mask The access mask, defaults to all 1s. +/// @param mask The access mask. +/// @param props The optional compile-time properties. Only 'alignment' +/// and cache hint properties are used. 
+template +__ESIMD_API std::enable_if_t< + ext::oneapi::experimental::is_property_list_v> +scatter(T *p, simd byte_offsets, simd vals, + simd_mask mask, PropertyListT props = {}) { + static_assert(std::is_integral_v, "Unsupported offset type"); + static_assert(N / VS >= 1 && N % VS == 0, "N must be divisible by VS"); + + constexpr size_t Alignment = + detail::getPropertyValue(sizeof(T)); + static_assert(Alignment >= sizeof(T), + "scatter() requires at least element-size alignment"); + constexpr auto L1Hint = + detail::getPropertyValue( + cache_hint::none); + constexpr auto L2Hint = + detail::getPropertyValue( + cache_hint::none); + + // Use LSC lowering if L1/L2 or VS > 1. + if constexpr (L1Hint != cache_hint::none || L2Hint != cache_hint::none || + VS > 1 || !__ESIMD_DNS::isPowerOf2(N, 32)) { + static_assert(VS == 1 || sizeof(T) >= 4, + "VS > 1 is supprted only for 4- and 8-byte elements"); + return detail::scatter_impl(p, byte_offsets, vals, mask); + } else { + using Tx = detail::__raw_t; + simd byte_offsets_i = convert(byte_offsets); + simd addrs(reinterpret_cast(p)); + addrs = addrs + byte_offsets_i; + if constexpr (sizeof(T) == 1) { + simd D = __esimd_wrregion( + D.data(), vals.data(), 0); + __esimd_svm_scatter(), + detail::ElemsPerAddrEncoding<1>()>( + addrs.data(), D.data(), mask.data()); + } else if constexpr (sizeof(T) == 2) { + simd D = __esimd_wrregion( + D.data(), vals.data(), 0); + __esimd_svm_scatter(), + detail::ElemsPerAddrEncoding<2>()>( + addrs.data(), D.data(), mask.data()); + } else + __esimd_svm_scatter(), + detail::ElemsPerAddrEncoding<1>()>( + addrs.data(), vals.data(), mask.data()); + } +} + +// template +// void scatter(T *p, simd byte_offsets, simd vals, +// PropertyListT props = {}); // (usm-sc-2) /// -template -__ESIMD_API void scatter(Tx *p, simd offsets, simd vals, - simd_mask mask = 1) { - using T = detail::__raw_t; - static_assert(std::is_integral_v, "Unsupported offset type"); - static_assert(detail::isPowerOf2(N, 32), "Unsupported value of N"); - simd offsets_i = convert(offsets); - simd addrs(reinterpret_cast(p)); - addrs = addrs + offsets_i; - if constexpr (sizeof(T) == 1) { - simd D; - D = __esimd_wrregion(D.data(), vals.data(), 0); - __esimd_svm_scatter(), - detail::ElemsPerAddrEncoding<1>()>( - addrs.data(), D.data(), mask.data()); - } else if constexpr (sizeof(T) == 2) { - simd D; - D = __esimd_wrregion(D.data(), vals.data(), 0); - __esimd_svm_scatter(), - detail::ElemsPerAddrEncoding<2>()>( - addrs.data(), D.data(), mask.data()); - } else - __esimd_svm_scatter(), - detail::ElemsPerAddrEncoding<1>()>( - addrs.data(), vals.data(), mask.data()); +/// Writes ("scatters") elements of the input vector to different memory +/// locations. Each memory location is base address plus an offset - a +/// value of the corresponding element in the input offset vector. +/// @tparam T Element type. +/// @tparam N Number of elements to write. +/// @tparam VS Vector size. It can also be read as the number of writes per each +/// address. The parameter 'N' must be divisible by 'VS'. (VS > 1) is supported +/// only on DG2 and PVC and only for 4- and 8-byte element vectors. +/// @param p The base address. +/// @param byte_offsets the vector of 32-bit or 64-bit offsets in bytes. +/// For each i, ((byte*)p + byte_offsets[i]) must be element size aligned. +/// If the alignment property is not passed, then it is assumed that each +/// accessed address is aligned by element-size. +/// @param vals The vector to scatter. +/// @param props The optional compile-time properties. 
Only 'alignment' +/// and cache hint properties are used. +template +__ESIMD_API std::enable_if_t< + ext::oneapi::experimental::is_property_list_v> +scatter(T *p, simd byte_offsets, simd vals, + PropertyListT props = {}) { + simd_mask Mask = 1; + scatter(p, byte_offsets, vals, Mask, props); +} + +// template +// void scatter(T *p, OffsetSimdViewT byte_offsets, simd vals, +// simd_mask mask, PropertyListT props = {}); // (usm-sc-3) +/// +/// Writes ("scatters") elements of the input vector to different memory +/// locations. Each memory location is base address plus an offset - a +/// value of the corresponding element in the input offset vector. Access to +/// any element's memory location can be disabled via the input mask. +/// @tparam T Element type. +/// @tparam N Number of elements to write. +/// @tparam VS Vector size. It can also be read as the number of writes per each +/// address. The parameter 'N' must be divisible by 'VS'. (VS > 1) is supported +/// only on DG2 and PVC and only for 4- and 8-byte element vectors. +/// @param p The base address. +/// @param byte_offsets the vector of 32-bit or 64-bit offsets in bytes +/// represented as a 'simd_view' object. +/// For each i, ((byte*)p + byte_offsets[i]) must be element size aligned. +/// If the alignment property is not passed, then it is assumed that each +/// accessed address is aligned by element-size. +/// @param vals The vector to scatter. +/// @param mask The access mask. +/// @param props The optional compile-time properties. Only 'alignment' +/// and cache hint properties are used. +template +__ESIMD_API std::enable_if_t< + detail::is_simd_view_type_v && + ext::oneapi::experimental::is_property_list_v> +scatter(T *p, OffsetSimdViewT byte_offsets, simd vals, + simd_mask mask, PropertyListT props = {}) { + scatter(p, byte_offsets.read(), vals, mask, props); +} + +/// template +/// void scatter(T *p, OffsetSimdViewT byte_offsets, simd vals, +/// PropertyListT props = {}); // (usm-sc-4) +/// +/// Writes ("scatters") elements of the input vector to different memory +/// locations. Each memory location is base address plus an offset - a +/// value of the corresponding element in the input offset vector. +/// @tparam T Element type. +/// @tparam N Number of elements to write. +/// @tparam VS Vector size. It can also be read as the number of writes per each +/// address. The parameter 'N' must be divisible by 'VS'. (VS > 1) is supported +/// only on DG2 and PVC and only for 4- and 8-byte element vectors. +/// @param p The base address. +/// @param byte_offsets the vector of 32-bit or 64-bit offsets in bytes +/// represented as a 'simd_view' object. +/// For each i, ((byte*)p + byte_offsets[i]) must be element size aligned. +/// If the alignment property is not passed, then it is assumed that each +/// accessed address is aligned by element-size. +/// @param vals The vector to scatter. +/// @param props The optional compile-time properties. Only 'alignment' +/// and cache hint properties are used. 
+template +__ESIMD_API std::enable_if_t< + detail::is_simd_view_type_v && + ext::oneapi::experimental::is_property_list_v> +scatter(T *p, OffsetSimdViewT byte_offsets, simd vals, + PropertyListT props = {}) { + simd_mask Mask = 1; + scatter(p, byte_offsets.read(), vals, Mask, props); } /// A variation of \c scatter API with \c offsets represented as \c simd_view @@ -671,7 +869,7 @@ __ESIMD_API void scatter(Tx *p, simd offsets, simd vals, template __ESIMD_API void scatter(Tx *p, simd_view offsets, simd vals, simd_mask mask = 1) { - scatter(p, offsets.read(), vals, mask); + scatter(p, offsets.read(), vals, mask); } /// A variation of \c scatter API with \c offsets represented as scalar. @@ -688,7 +886,7 @@ __ESIMD_API void scatter(Tx *p, simd_view offsets, template __ESIMD_API std::enable_if_t && N == 1> scatter(Tx *p, Toffset offset, simd vals, simd_mask mask = 1) { - scatter(p, simd(offset), vals, mask); + scatter(p, simd(offset), vals, mask); } namespace detail { diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/common.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/common.hpp index 774154dfc0f97..a71981de54bec 100644 --- a/sycl/include/sycl/ext/intel/experimental/esimd/common.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd/common.hpp @@ -102,16 +102,6 @@ template struct lsc_expand_type { using type = __ESIMD_DNS::lsc_expand_type::type; }; -template struct lsc_bitcast_type { -public: - using type = std::conditional_t< - sizeof(T) == 1, uint8_t, - std::conditional_t< - sizeof(T) == 2, uint16_t, - std::conditional_t>>>; -}; - } // namespace detail /// L1 or L3 cache hint kinds. diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp index 069898f14e747..7fb0ae833e793 100644 --- a/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp @@ -1434,7 +1434,7 @@ __ESIMD_API void lsc_slm_scatter(__ESIMD_NS::simd offsets, constexpr detail::lsc_data_order _Transposed = detail::lsc_data_order::nontranspose; using MsgT = typename detail::lsc_expand_type::type; - using CstT = typename detail::lsc_bitcast_type::type; + using CstT = __ESIMD_DNS::uint_type_t; __ESIMD_NS::simd Tmp = vals.template bit_cast_view(); __esimd_lsc_store_slm( @@ -1487,25 +1487,8 @@ template offsets, __ESIMD_NS::simd vals, __ESIMD_NS::simd_mask pred = 1) { - static_assert(std::is_integral_v, "Unsupported offset type"); - detail::check_lsc_vector_size(); - detail::check_lsc_data_size(); - detail::check_lsc_cache_hint(); - constexpr uint16_t _AddressScale = 1; - constexpr int _ImmOffset = 0; - constexpr lsc_data_size _DS = - detail::expand_data_size(detail::finalize_data_size()); - constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size(); - constexpr detail::lsc_data_order _Transposed = - detail::lsc_data_order::nontranspose; - using MsgT = typename detail::lsc_expand_type::type; - using _CstT = typename detail::lsc_bitcast_type::type; - __ESIMD_NS::simd Tmp = vals.template bit_cast_view<_CstT>(); - __ESIMD_NS::simd addrs = reinterpret_cast(p); - addrs += convert(offsets); - __esimd_lsc_store_stateless(pred.data(), addrs.data(), - Tmp.data()); + __ESIMD_DNS::scatter_impl(p, offsets, + vals, pred); } template ::type; - using _CstT = typename detail::lsc_bitcast_type::type; + using _CstT = __ESIMD_DNS::uint_type_t; __ESIMD_NS::simd Tmp = vals.template bit_cast_view<_CstT>(); auto si = __ESIMD_NS::get_surface_index(acc); __esimd_lsc_store_bti { Rest...); } 
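Note on the new USM scatter overloads documented above: the short sketch below illustrates how they are meant to be called. It is not taken from the patch; the function name, the pointer, the offsets, and the sizes are illustrative, and the alignment-property spelling and namespaces are assumptions that mirror the tests added later in this patch.

#include <sycl/ext/intel/esimd.hpp>

using namespace sycl::ext::intel::esimd;

// Illustrative ESIMD device function; 'Out' is assumed to be a USM pointer.
SYCL_EXTERNAL void scatter_usage_sketch(float *Out) SYCL_ESIMD_FUNCTION {
  simd<uint32_t, 8> ByteOffsets(0, sizeof(float)); // 0, 4, 8, ... bytes
  simd<float, 8> Vals = 1.0f;
  simd_mask<8> Mask = 1;

  // (usm-sc-1): explicit mask, default element-size alignment.
  scatter(Out, ByteOffsets, Vals, Mask);

  // (usm-sc-2): no mask; alignment passed via compile-time properties
  // (property spelling assumed, mirroring the new tests in this patch).
  sycl::ext::oneapi::experimental::properties Props{alignment<4>};
  scatter(Out, ByteOffsets, Vals, Props);

  // VS > 1: 16 values stored through 8 addresses, two consecutive elements
  // per address; only 4- and 8-byte element types qualify (float does).
  simd<float, 16> Vals2 = 2.0f;
  scatter<float, 16, 2>(Out, ByteOffsets, Vals2, Mask);
}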
+#ifndef __INTEL_PREVIEW_BREAKING_CHANGES buffer &getAssertHappenedBuffer(); +#endif event memcpyToDeviceGlobal(void *DeviceGlobalPtr, const void *Src, bool IsDeviceImageScope, size_t NumBytes, @@ -3019,9 +3021,7 @@ class AssertInfoCopier; */ event submitAssertCapture(queue &Self, event &Event, queue *SecondaryQueue, const detail::code_location &CodeLoc) { - using AHBufT = buffer; - - AHBufT &Buffer = Self.getAssertHappenedBuffer(); + buffer Buffer{1}; event CopierEv, CheckerEv, PostCheckerEv; auto CopierCGF = [&](handler &CGH) { diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index 4b8163b03efbd..de715de0835fd 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -17,7 +17,12 @@ #include // Forward declarations -void enableCUDATracing(); +struct cuda_tracing_context_t_; + +void enableCUDATracing(cuda_tracing_context_t_ *ctx); +void disableCUDATracing(cuda_tracing_context_t_ *ctx); +cuda_tracing_context_t_ *createCUDATracingContext(); +void freeCUDATracingContext(cuda_tracing_context_t_ *Ctx); //-- PI API implementation extern "C" { @@ -228,8 +233,9 @@ pi_result piMemImageCreate(pi_context Context, pi_mem_flags Flags, HostPtr, RetImage); } -pi_result piextMemGetNativeHandle(pi_mem Mem, pi_native_handle *NativeHandle) { - return pi2ur::piextMemGetNativeHandle(Mem, NativeHandle); +pi_result piextMemGetNativeHandle(pi_mem Mem, pi_device Dev, + pi_native_handle *NativeHandle) { + return pi2ur::piextMemGetNativeHandle(Mem, Dev, NativeHandle); } pi_result piextMemCreateWithNativeHandle(pi_native_handle NativeHandle, @@ -1237,7 +1243,8 @@ pi_result piPluginInit(pi_plugin *PluginInit) { std::memset(&(PluginInit->PiFunctionTable), 0, sizeof(PluginInit->PiFunctionTable)); - enableCUDATracing(); + cuda_tracing_context_t_ *Ctx = createCUDATracingContext(); + enableCUDATracing(Ctx); // Forward calls to CUDA RT. 
#define _PI_API(api) \ diff --git a/sycl/plugins/hip/pi_hip.cpp b/sycl/plugins/hip/pi_hip.cpp index 266c72a3b3587..126ada92348f6 100644 --- a/sycl/plugins/hip/pi_hip.cpp +++ b/sycl/plugins/hip/pi_hip.cpp @@ -236,8 +236,9 @@ pi_result piMemImageCreate(pi_context Context, pi_mem_flags Flags, HostPtr, RetImage); } -pi_result piextMemGetNativeHandle(pi_mem Mem, pi_native_handle *NativeHandle) { - return pi2ur::piextMemGetNativeHandle(Mem, NativeHandle); +pi_result piextMemGetNativeHandle(pi_mem Mem, pi_device Dev, + pi_native_handle *NativeHandle) { + return pi2ur::piextMemGetNativeHandle(Mem, Dev, NativeHandle); } pi_result piextMemCreateWithNativeHandle(pi_native_handle NativeHandle, diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index 79e047850af88..0fc36a231be6c 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -243,8 +243,9 @@ pi_result piMemImageCreate(pi_context Context, pi_mem_flags Flags, HostPtr, RetImage); } -pi_result piextMemGetNativeHandle(pi_mem Mem, pi_native_handle *NativeHandle) { - return pi2ur::piextMemGetNativeHandle(Mem, NativeHandle); +pi_result piextMemGetNativeHandle(pi_mem Mem, pi_device Dev, + pi_native_handle *NativeHandle) { + return pi2ur::piextMemGetNativeHandle(Mem, Dev, NativeHandle); } pi_result piextMemCreateWithNativeHandle(pi_native_handle NativeHandle, diff --git a/sycl/plugins/native_cpu/pi_native_cpu.cpp b/sycl/plugins/native_cpu/pi_native_cpu.cpp index c7e71f9791d35..48ce104a94e90 100644 --- a/sycl/plugins/native_cpu/pi_native_cpu.cpp +++ b/sycl/plugins/native_cpu/pi_native_cpu.cpp @@ -240,8 +240,9 @@ pi_result piMemImageCreate(pi_context Context, pi_mem_flags Flags, HostPtr, RetImage); } -pi_result piextMemGetNativeHandle(pi_mem Mem, pi_native_handle *NativeHandle) { - return pi2ur::piextMemGetNativeHandle(Mem, NativeHandle); +pi_result piextMemGetNativeHandle(pi_mem Mem, pi_device Dev, + pi_native_handle *NativeHandle) { + return pi2ur::piextMemGetNativeHandle(Mem, Dev, NativeHandle); } pi_result piextMemCreateWithNativeHandle(pi_native_handle NativeHandle, diff --git a/sycl/plugins/opencl/pi_opencl.cpp b/sycl/plugins/opencl/pi_opencl.cpp index 3e7f3aea4dfed..c09be92f89406 100644 --- a/sycl/plugins/opencl/pi_opencl.cpp +++ b/sycl/plugins/opencl/pi_opencl.cpp @@ -222,8 +222,9 @@ pi_result piMemImageCreate(pi_context Context, pi_mem_flags Flags, HostPtr, RetImage); } -pi_result piextMemGetNativeHandle(pi_mem Mem, pi_native_handle *NativeHandle) { - return pi2ur::piextMemGetNativeHandle(Mem, NativeHandle); +pi_result piextMemGetNativeHandle(pi_mem Mem, pi_device Dev, + pi_native_handle *NativeHandle) { + return pi2ur::piextMemGetNativeHandle(Mem, Dev, NativeHandle); } pi_result piextMemCreateWithNativeHandle(pi_native_handle NativeHandle, diff --git a/sycl/plugins/unified_runtime/pi2ur.hpp b/sycl/plugins/unified_runtime/pi2ur.hpp index df841b786bfb1..c19c93a6af53a 100644 --- a/sycl/plugins/unified_runtime/pi2ur.hpp +++ b/sycl/plugins/unified_runtime/pi2ur.hpp @@ -3081,13 +3081,14 @@ inline pi_result piMemBufferPartition(pi_mem Buffer, pi_mem_flags Flags, return PI_SUCCESS; } -inline pi_result piextMemGetNativeHandle(pi_mem Mem, +inline pi_result piextMemGetNativeHandle(pi_mem Mem, pi_device Dev, pi_native_handle *NativeHandle) { PI_ASSERT(Mem, PI_ERROR_INVALID_MEM_OBJECT); ur_mem_handle_t UrMem = reinterpret_cast(Mem); + ur_device_handle_t UrDev = reinterpret_cast(Dev); ur_native_handle_t NativeMem{}; - HANDLE_ERRORS(urMemGetNativeHandle(UrMem, 
&NativeMem)); + HANDLE_ERRORS(urMemGetNativeHandle(UrMem, UrDev, &NativeMem)); *NativeHandle = reinterpret_cast(NativeMem); diff --git a/sycl/plugins/unified_runtime/pi_unified_runtime.cpp b/sycl/plugins/unified_runtime/pi_unified_runtime.cpp index adbeb652bf613..b9742b8697fa8 100644 --- a/sycl/plugins/unified_runtime/pi_unified_runtime.cpp +++ b/sycl/plugins/unified_runtime/pi_unified_runtime.cpp @@ -235,9 +235,9 @@ __SYCL_EXPORT pi_result piMemBufferPartition( BufferCreateInfo, RetMem); } -__SYCL_EXPORT pi_result -piextMemGetNativeHandle(pi_mem Mem, pi_native_handle *NativeHandle) { - return pi2ur::piextMemGetNativeHandle(Mem, NativeHandle); +__SYCL_EXPORT pi_result piextMemGetNativeHandle( + pi_mem Mem, pi_device Dev, pi_native_handle *NativeHandle) { + return pi2ur::piextMemGetNativeHandle(Mem, Dev, NativeHandle); } __SYCL_EXPORT pi_result diff --git a/sycl/source/detail/buffer_impl.cpp b/sycl/source/detail/buffer_impl.cpp index 04c055465a9cf..835c732a40bf9 100644 --- a/sycl/source/detail/buffer_impl.cpp +++ b/sycl/source/detail/buffer_impl.cpp @@ -84,7 +84,11 @@ buffer_impl::getNativeVector(backend BackendName) const { } pi_native_handle Handle; - Plugin->call(NativeMem, &Handle); + // When doing buffer interop we don't know what device the memory should be + // resident on, so pass nullptr for Device param. Buffer interop may not be + // supported by all backends. + Plugin->call(NativeMem, /*Dev*/ nullptr, + &Handle); Handles.push_back(Handle); } diff --git a/sycl/source/detail/memory_manager.cpp b/sycl/source/detail/memory_manager.cpp index 0daa53587ed4d..e59fb94a09f65 100644 --- a/sycl/source/detail/memory_manager.cpp +++ b/sycl/source/detail/memory_manager.cpp @@ -144,7 +144,11 @@ void memBufferCreateHelper(const PluginPtr &Plugin, pi_context Ctx, // Always use call_nocheck here, because call may throw an exception, // and this lambda will be called from destructor, which in combination // rewards us with UB. - Plugin->call_nocheck(*RetMem, &Ptr); + // When doing buffer interop we don't know what device the memory should + // be resident on, so pass nullptr for Device param. Buffer interop may + // not be supported by all backends. + Plugin->call_nocheck( + *RetMem, /*Dev*/ nullptr, &Ptr); emitMemAllocEndTrace(MemObjID, (uintptr_t)(Ptr), Size, 0 /* guard zone */, CorrID); }}; @@ -167,7 +171,11 @@ void memReleaseHelper(const PluginPtr &Plugin, pi_mem Mem) { // Do not make unnecessary PI calls without instrumentation enabled if (xptiTraceEnabled()) { pi_native_handle PtrHandle = 0; - Plugin->call(Mem, &PtrHandle); + // When doing buffer interop we don't know what device the memory should be + // resident on, so pass nullptr for Device param. Buffer interop may not be + // supported by all backends. 
+ Plugin->call(Mem, /*Dev*/ nullptr, + &PtrHandle); Ptr = (uintptr_t)(PtrHandle); } #endif diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index 810f991d6667f..ddd6a71d7db80 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -108,7 +108,9 @@ class queue_impl { const async_handler &AsyncHandler, const property_list &PropList) : MDevice(Device), MContext(Context), MAsyncHandler(AsyncHandler), MPropList(PropList), MHostQueue(MDevice->is_host()), +#ifndef __INTEL_PREVIEW_BREAKING_CHANGES MAssertHappenedBuffer(range<1>{1}), +#endif MIsInorder(has_property()), MDiscardEvents( has_property()), @@ -283,7 +285,9 @@ class queue_impl { queue_impl(sycl::detail::pi::PiQueue PiQueue, const ContextImplPtr &Context, const async_handler &AsyncHandler) : MContext(Context), MAsyncHandler(AsyncHandler), MHostQueue(false), +#ifndef __INTEL_PREVIEW_BREAKING_CHANGES MAssertHappenedBuffer(range<1>{1}), +#endif MIsInorder(has_property()), MDiscardEvents( has_property()), @@ -305,7 +309,10 @@ class queue_impl { queue_impl(sycl::detail::pi::PiQueue PiQueue, const ContextImplPtr &Context, const async_handler &AsyncHandler, const property_list &PropList) : MContext(Context), MAsyncHandler(AsyncHandler), MPropList(PropList), - MHostQueue(false), MAssertHappenedBuffer(range<1>{1}), + MHostQueue(false), +#ifndef __INTEL_PREVIEW_BREAKING_CHANGES + MAssertHappenedBuffer(range<1>{1}), +#endif MIsInorder(has_property()), MDiscardEvents( has_property()), @@ -673,9 +680,11 @@ class queue_impl { /// \return a native handle. pi_native_handle getNative(int32_t &NativeHandleDesc) const; +#ifndef __INTEL_PREVIEW_BREAKING_CHANGES buffer &getAssertHappenedBuffer() { return MAssertHappenedBuffer; } +#endif void registerStreamServiceEvent(const EventImplPtr &Event) { std::lock_guard Lock(MMutex); @@ -918,8 +927,10 @@ class queue_impl { /// need to emulate it with multiple native in-order queues. 
bool MEmulateOOO = false; +#ifndef __INTEL_PREVIEW_BREAKING_CHANGES // Buffer to store assert failure descriptor buffer MAssertHappenedBuffer; +#endif // This event is employed for enhanced dependency tracking with in-order queue // Access to the event should be guarded with MMutex diff --git a/sycl/source/interop_handle.cpp b/sycl/source/interop_handle.cpp index 0b0ab39199370..cd479493bbae3 100644 --- a/sycl/source/interop_handle.cpp +++ b/sycl/source/interop_handle.cpp @@ -34,8 +34,8 @@ pi_native_handle interop_handle::getNativeMem(detail::Requirement *Req) const { auto Plugin = MQueue->getPlugin(); pi_native_handle Handle; - Plugin->call(Iter->second, - &Handle); + Plugin->call( + Iter->second, MDevice->getHandleRef(), &Handle); return Handle; } diff --git a/sycl/source/queue.cpp b/sycl/source/queue.cpp index 47b6b29b89bab..1b877a31da4e0 100644 --- a/sycl/source/queue.cpp +++ b/sycl/source/queue.cpp @@ -298,9 +298,11 @@ pi_native_handle queue::getNative(int32_t &NativeHandleDesc) const { return impl->getNative(NativeHandleDesc); } +#ifndef __INTEL_PREVIEW_BREAKING_CHANGES buffer &queue::getAssertHappenedBuffer() { return impl->getAssertHappenedBuffer(); } +#endif event queue::memcpyToDeviceGlobal(void *DeviceGlobalPtr, const void *Src, bool IsDeviceImageScope, size_t NumBytes, diff --git a/sycl/test-e2e/Assert/check_resource_leak.cpp b/sycl/test-e2e/Assert/check_resource_leak.cpp new file mode 100644 index 0000000000000..252d2ed9e0c49 --- /dev/null +++ b/sycl/test-e2e/Assert/check_resource_leak.cpp @@ -0,0 +1,32 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out + +// Device globals aren't supported on opencl:gpu yet. +// UNSUPPORTED: opencl && gpu + +// TODO: Fails at JIT compilation for some reason. +// UNSUPPORTED: hip +#define SYCL_FALLBACK_ASSERT 1 + +#include + +// DeviceGlobalUSMMem::~DeviceGlobalUSMMem() has asserts to ensure some +// resources have been cleaned up when it's executed. Those asserts used to fail +// when "AssertHappened" buffer used in fallback implementation of the device +// assert was a data member of the queue_impl. +sycl::ext::oneapi::experimental::device_global dg; + +int main() { + sycl::queue q; + q.submit([&](sycl::handler &cgh) { + sycl::range<1> R{16}; + cgh.parallel_for(sycl::nd_range<1>{R, R}, [=](sycl::nd_item<1> ndi) { + if (ndi.get_global_linear_id() == 0) + dg.get() = 42; + auto sg = sycl::ext::oneapi::experimental::this_sub_group(); + auto active = sycl::ext::oneapi::group_ballot(sg, 1); + }); + }).wait(); + + return 0; +} diff --git a/sycl/test-e2e/ESIMD/lsc/local_accessor_atomic_smoke_cmpxchg.cpp b/sycl/test-e2e/ESIMD/lsc/local_accessor_atomic_smoke_cmpxchg.cpp index 77f6a6c4122d9..83bb92eca5440 100644 --- a/sycl/test-e2e/ESIMD/lsc/local_accessor_atomic_smoke_cmpxchg.cpp +++ b/sycl/test-e2e/ESIMD/lsc/local_accessor_atomic_smoke_cmpxchg.cpp @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// // This test checks local accessor cmpxchg atomic operations. 
//===----------------------------------------------------------------------===// -// REQUIRES: gpu-intel-pvc || gpu-intel-dg2 +// REQUIRES: gpu-intel-pvc // RUN: %{build} -o %t.out // RUN: %{run} %t.out // diff --git a/sycl/test-e2e/ESIMD/lsc/lsc_slm_atomic_smoke.cpp b/sycl/test-e2e/ESIMD/lsc/lsc_slm_atomic_smoke.cpp index 25bce1b2b2244..4fde6446bdff1 100644 --- a/sycl/test-e2e/ESIMD/lsc/lsc_slm_atomic_smoke.cpp +++ b/sycl/test-e2e/ESIMD/lsc/lsc_slm_atomic_smoke.cpp @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// // This test checks LSC SLM atomic operations. //===----------------------------------------------------------------------===// -// REQUIRES: gpu-intel-pvc || gpu-intel-dg2 +// REQUIRES: gpu-intel-pvc // RUN: %{build} -o %t.out // RUN: %{run} %t.out diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp b/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp new file mode 100644 index 0000000000000..270ecc4e698af --- /dev/null +++ b/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp @@ -0,0 +1,227 @@ +//==------- scatter.hpp - DPC++ ESIMD on-device test ----------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-------------------------------------------------------------------===// + +#include "common.hpp" + +using namespace sycl; +using namespace sycl::ext::intel::esimd; + +template +bool verify(const T *Out, int N, int Size, int VS, uint32_t MaskStride, + bool UseMask) { + using Tuint = esimd_test::uint_type_t; + int NumErrors = 0; + int NOffsets = N / VS; + for (uint32_t I = 0; I < Size; I += N) { // Verify by 1 vector at once + for (int VSI = 0; VSI < VS; VSI++) { + for (int OffsetI = 0; OffsetI < NOffsets; OffsetI++) { + size_t OutIndex = I + VSI * NOffsets + OffsetI; + bool IsMaskSet = UseMask ? ((OutIndex / VS) % MaskStride == 0) : true; + Tuint Expected = sycl::bit_cast((T)OutIndex); + if (!UseMask || IsMaskSet) + Expected = sycl::bit_cast((T)(OutIndex * 2)); + Tuint Computed = sycl::bit_cast(Out[OutIndex]); + if (Computed != Expected && ++NumErrors < 16) { + std::cout << "Out[" << OutIndex << "] = " << std::to_string(Computed) + << " vs " << std::to_string(Expected) << std::endl; + } + } + } + } + return NumErrors == 0; +} + +template +bool testUSM(queue Q, uint32_t MaskStride, + ScatterPropertiesT ScatterProperties) { + uint32_t Groups = 8; + uint32_t Threads = 16; + size_t Size = Groups * Threads * N; + static_assert(VS > 0 && N % VS == 0, + "Incorrect VS parameter. 
N must be divisible by VS."); + constexpr int NOffsets = N / VS; + using Tuint = sycl::_V1::ext::intel::esimd::detail::uint_type_t; + + std::cout << "USM case: T=" << esimd_test::type_name() << ",N=" << N + << ", VS=" << VS << ",UseMask=" << UseMask + << ",UseProperties=" << UseProperties << std::endl; + + sycl::range<1> GlobalRange{Groups}; + sycl::range<1> LocalRange{Threads}; + sycl::nd_range<1> Range{GlobalRange * LocalRange, LocalRange}; + + T *Out = static_cast(sycl::malloc_shared(Size * sizeof(T), Q)); + for (size_t i = 0; i < Size; i++) + Out[i] = i; + + try { + Q.submit([&](handler &cgh) { + cgh.parallel_for(Range, [=](sycl::nd_item<1> ndi) SYCL_ESIMD_KERNEL { + ScatterPropertiesT Props{}; + uint16_t GlobalID = ndi.get_global_id(0); + simd ByteOffsets(GlobalID * N * sizeof(T), + VS * sizeof(T)); + auto ByteOffsetsView = ByteOffsets.template select(); + simd Vals = gather(Out, ByteOffsets); + Vals *= 2; + auto ValsView = Vals.template select(); + simd_mask Pred = 0; + for (int I = 0; I < NOffsets; I++) + Pred[I] = (I % MaskStride == 0) ? 1 : 0; + if constexpr (VS > 1) { // VS > 1 requires specifying + if constexpr (UseMask) { + if constexpr (UseProperties) { + if (GlobalID % 4 == 0) + scatter(Out, ByteOffsets, Vals, Pred, Props); + else if (GlobalID % 4 == 1) + scatter(Out, ByteOffsetsView, Vals, Pred, Props); + else if (GlobalID % 4 == 2) + scatter(Out, ByteOffsets, ValsView, Pred, Props); + else if (GlobalID % 4 == 3) + scatter(Out, ByteOffsetsView, ValsView, Pred, Props); + } else { // UseProperties == false + if (GlobalID % 4 == 0) + scatter(Out, ByteOffsets, Vals, Pred); + else if (GlobalID % 4 == 1) + scatter(Out, ByteOffsetsView, Vals, Pred); + else if (GlobalID % 4 == 2) + scatter(Out, ByteOffsets, ValsView, Pred); + else if (GlobalID % 4 == 3) + scatter(Out, ByteOffsetsView, ValsView, Pred); + } + } else { // UseMask == false + if constexpr (UseProperties) { + if (GlobalID % 4 == 0) + scatter(Out, ByteOffsets, Vals, Props); + else if (GlobalID % 4 == 1) + scatter(Out, ByteOffsetsView, Vals, Props); + else if (GlobalID % 4 == 2) + scatter(Out, ByteOffsets, ValsView, Props); + else if (GlobalID % 4 == 3) + scatter(Out, ByteOffsetsView, ValsView, Props); + } else { // UseProperties == false + if (GlobalID % 4 == 0) + scatter(Out, ByteOffsets, Vals); + else if (GlobalID % 4 == 1) + scatter(Out, ByteOffsetsView, Vals); + else if (GlobalID % 4 == 2) + scatter(Out, ByteOffsets, ValsView); + else if (GlobalID % 4 == 3) + scatter(Out, ByteOffsetsView, ValsView); + } + } + } else { // VS == 1 + if constexpr (UseMask) { + if constexpr (UseProperties) { + if (GlobalID % 4 == 0) + scatter(Out, ByteOffsets, Vals, Pred, Props); + else if (GlobalID % 4 == 1) + scatter(Out, ByteOffsetsView, Vals, Pred, Props); + else if (GlobalID % 4 == 2) + scatter(Out, ByteOffsets, ValsView, Pred, Props); + else if (GlobalID % 4 == 3) + scatter(Out, ByteOffsetsView, ValsView, Pred, Props); + } else { // UseProperties == false + if (GlobalID % 4 == 0) + scatter(Out, ByteOffsets, Vals, Pred); + else if (GlobalID % 4 == 1) + scatter(Out, ByteOffsetsView, Vals, Pred); + else if (GlobalID % 4 == 2) + scatter(Out, ByteOffsets, ValsView, Pred); + else if (GlobalID % 4 == 3) + scatter(Out, ByteOffsetsView, ValsView, Pred); + } + } else { // UseMask == false + if constexpr (UseProperties) { + if (GlobalID % 4 == 0) + scatter(Out, ByteOffsets, Vals, Props); + else if (GlobalID % 4 == 1) + scatter(Out, ByteOffsetsView, Vals, Props); + else if (GlobalID % 4 == 2) + scatter(Out, ByteOffsets, ValsView, Props); + else 
if (GlobalID % 4 == 3) + scatter(Out, ByteOffsetsView, ValsView, Props); + } else { // UseProperties == false + if (GlobalID % 4 == 0) + scatter(Out, ByteOffsets, Vals); + else if (GlobalID % 4 == 1) + scatter(Out, ByteOffsetsView, Vals); + else if (GlobalID % 4 == 2) + scatter(Out, ByteOffsets, ValsView); + else if (GlobalID % 4 == 3) + scatter(Out, ByteOffsetsView, ValsView); + } + } + } + }); + }).wait(); + } catch (sycl::exception const &e) { + std::cout << "SYCL exception caught: " << e.what() << '\n'; + sycl::free(Out, Q); + return false; + } + + bool Passed = verify(Out, N, Size, VS, MaskStride, UseMask); + + sycl::free(Out, Q); + + return Passed; +} + +template bool testUSM(queue Q) { + constexpr bool CheckMask = true; + constexpr bool CheckProperties = true; + properties EmptyProps; + properties AlignElemProps{alignment}; + + bool Passed = true; + + // // Test scatter() that is available on Gen12 and PVC. + Passed &= testUSM(Q, 2, EmptyProps); + Passed &= testUSM(Q, 1, EmptyProps); + Passed &= testUSM(Q, 2, EmptyProps); + Passed &= testUSM(Q, 2, EmptyProps); + Passed &= testUSM(Q, 2, EmptyProps); + Passed &= testUSM(Q, 2, EmptyProps); + + Passed &= testUSM(Q, 2, EmptyProps); + + // // Test scatter() without passing compile-time properties argument. + Passed &= testUSM(Q, 2, EmptyProps); + Passed &= testUSM(Q, 2, EmptyProps); + + if constexpr (Features == TestFeatures::PVC || + Features == TestFeatures::DG2) { + properties LSCProps{cache_hint_L1, + cache_hint_L2, + alignment}; + Passed &= testUSM(Q, 2, LSCProps); + Passed &= testUSM(Q, 2, LSCProps); + Passed &= testUSM(Q, 2, LSCProps); + Passed &= testUSM(Q, 2, LSCProps); + + Passed &= testUSM(Q, 2, LSCProps); + + // Check VS > 1. GPU supports only dwords and qwords in this mode. + if constexpr (sizeof(T) >= 4) { + // TODO: This test case causes flaky fail. Enable it after the issue + // in GPU driver is fixed. + // Passed &= + // testUSM(Q, 2, AlignElemProps) + Passed &= + testUSM(Q, 2, AlignElemProps); + Passed &= + testUSM(Q, 2, AlignElemProps); + Passed &= + testUSM(Q, 2, AlignElemProps); + } + } // TestPVCFeatures + + return Passed; +} diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/scatter_usm.cpp b/sycl/test-e2e/ESIMD/unified_memory_api/scatter_usm.cpp new file mode 100644 index 0000000000000..929d3c6fc04f7 --- /dev/null +++ b/sycl/test-e2e/ESIMD/unified_memory_api/scatter_usm.cpp @@ -0,0 +1,37 @@ +//==------- scatter_usm.cpp - DPC++ ESIMD on-device test ---------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===------------------------------------------------------------------===// +// RUN: %{build} -fsycl-device-code-split=per_kernel -o %t.out +// RUN: %{run} %t.out + +// The test verifies esimd::scatter() functions accepting USM pointer +// and optional compile-time esimd::properties. +// The scatter() calls in this test do not use cache-hint +// properties to not impose using DG2/PVC features. 
+ +#include "Inputs/scatter.hpp" + +int main() { + auto Q = queue{gpu_selector_v}; + esimd_test::printTestLabel(Q); + + constexpr auto TestFeatures = TestFeatures::Generic; + bool Passed = true; + + Passed &= testUSM(Q); + Passed &= testUSM(Q); + if (Q.get_device().has(sycl::aspect::fp16)) + Passed &= testUSM(Q); + Passed &= testUSM(Q); + Passed &= testUSM(Q); + Passed &= testUSM(Q); + if (Q.get_device().has(sycl::aspect::fp64)) + Passed &= testUSM(Q); + + std::cout << (Passed ? "Passed\n" : "FAILED\n"); + return Passed ? 0 : 1; +} diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/scatter_usm_dg2_pvc.cpp b/sycl/test-e2e/ESIMD/unified_memory_api/scatter_usm_dg2_pvc.cpp new file mode 100644 index 0000000000000..aa466795e9b06 --- /dev/null +++ b/sycl/test-e2e/ESIMD/unified_memory_api/scatter_usm_dg2_pvc.cpp @@ -0,0 +1,38 @@ +//==------- scatter_usm_dg2_pvc.cpp - DPC++ ESIMD on-device test--------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===------------------------------------------------------------------===// +// REQUIRES: gpu-intel-pvc || gpu-intel-dg2 +// RUN: %{build} -fsycl-device-code-split=per_kernel -o %t.out +// RUN: %{run} %t.out + +// The test verifies esimd::scatter() functions accepting USM pointer +// and optional compile-time esimd::properties. +// The scatter() calls in this test uses cache-hint +// properties and requires DG2 or PVC. + +#include "Inputs/scatter.hpp" + +int main() { + auto Q = queue{gpu_selector_v}; + esimd_test::printTestLabel(Q); + + constexpr auto TestFeatures = TestFeatures::PVC; + bool Passed = true; + + Passed &= testUSM(Q); + Passed &= testUSM(Q); + if (Q.get_device().has(sycl::aspect::fp16)) + Passed &= testUSM(Q); + Passed &= testUSM(Q); + Passed &= testUSM(Q); + Passed &= testUSM(Q); + if (Q.get_device().has(sycl::aspect::fp64)) + Passed &= testUSM(Q); + + std::cout << (Passed ? "Passed\n" : "FAILED\n"); + return Passed ? 0 : 1; +} diff --git a/sycl/test-e2e/HostInteropTask/interop-task-hip.cpp b/sycl/test-e2e/HostInteropTask/interop-task-hip.cpp new file mode 100644 index 0000000000000..1f98b2da24c33 --- /dev/null +++ b/sycl/test-e2e/HostInteropTask/interop-task-hip.cpp @@ -0,0 +1,136 @@ +// FIXME: the rocm include path and link path are highly platform dependent, +// we should set this with some variable instead. 
+// RUN: %{build} -o %t.out -I/opt/rocm/include -L/opt/rocm/lib -lamdhip64 +// RUN: %{run} %t.out +// REQUIRES: hip + +#include +#include + +#define __HIP_PLATFORM_AMD__ + +#include + +using namespace sycl; +using namespace sycl::access; + +static constexpr size_t BUFFER_SIZE = 1024; + +template class Modifier; + +template class Init; + +template +void checkBufferValues(BufferT Buffer, ValueT Value) { + auto Acc = Buffer.get_host_access(); + for (size_t Idx = 0; Idx < Acc.get_count(); ++Idx) { + if (Acc[Idx] != Value) { + std::cerr << "buffer[" << Idx << "] = " << Acc[Idx] + << ", expected val = " << Value << '\n'; + exit(1); + } + } +} + +template +void copy(buffer &Src, buffer &Dst, queue &Q) { + Q.submit([&](handler &CGH) { + auto SrcA = Src.template get_access(CGH); + auto DstA = Dst.template get_access(CGH); + + auto Func = [=](interop_handle IH) { + auto HipStream = IH.get_native_queue(); + auto SrcMem = IH.get_native_mem(SrcA); + auto DstMem = IH.get_native_mem(DstA); + + if (hipMemcpyWithStream(DstMem, SrcMem, sizeof(DataT) * SrcA.get_count(), + hipMemcpyDefault, HipStream) != hipSuccess) { + throw; + } + + if (hipStreamSynchronize(HipStream) != hipSuccess) { + throw; + } + + if (Q.get_backend() != IH.get_backend()) + throw; + }; + CGH.host_task(Func); + }); +} + +template void modify(buffer &B, queue &Q) { + Q.submit([&](handler &CGH) { + auto Acc = B.template get_access(CGH); + + auto Kernel = [=](item<1> Id) { Acc[Id] += 1; }; + + CGH.parallel_for>(Acc.get_count(), Kernel); + }); +} + +template +void init(buffer &B1, buffer &B2, queue &Q) { + Q.submit([&](handler &CGH) { + auto Acc1 = B1.template get_access(CGH); + auto Acc2 = B2.template get_access(CGH); + + CGH.parallel_for>(BUFFER_SIZE, [=](item<1> Id) { + Acc1[Id] = B1Init; + Acc2[Id] = B2Init; + }); + }); +} + +// Check that a single host-interop-task with a buffer will work. +void test_ht_buffer(queue &Q) { + buffer Buffer{BUFFER_SIZE}; + + Q.submit([&](handler &CGH) { + auto Acc = Buffer.get_access(CGH); + auto Func = [=](interop_handle IH) { /*A no-op */ }; + CGH.host_task(Func); + }); +} + +// A test that uses HIP interop to copy data from buffer A to buffer B, by +// getting HIP ptrs and calling the hipMemcpyWithStream. Then run a SYCL +// kernel that modifies the data in place for B, e.g. increment one, then copy +// back to buffer A. Run it on a loop, to ensure the dependencies and the +// reference counting of the objects is not leaked. +void test_ht_kernel_dependencies(queue &Q) { + static constexpr int COUNT = 4; + buffer Buffer1{BUFFER_SIZE}; + buffer Buffer2{BUFFER_SIZE}; + + // Init the buffer with a'priori invalid data. + init(Buffer1, Buffer2, Q); + + // Repeat a couple of times. 
+ for (size_t Idx = 0; Idx < COUNT; ++Idx) { + copy(Buffer1, Buffer2, Q); + modify(Buffer2, Q); + copy(Buffer2, Buffer1, Q); + } + + checkBufferValues(Buffer1, COUNT - 1); + checkBufferValues(Buffer2, COUNT - 1); +} + +void tests(queue &Q) { + test_ht_buffer(Q); + test_ht_kernel_dependencies(Q); +} + +int main() { + queue Q([](sycl::exception_list ExceptionList) { + if (ExceptionList.size() != 1) { + std::cerr << "Should be one exception in exception list" << std::endl; + std::abort(); + } + std::rethrow_exception(*ExceptionList.begin()); + }); + tests(Q); + std::cout << "Test PASSED" << std::endl; + return 0; +} diff --git a/sycl/test-e2e/KernelFusion/abort_fusion.cpp b/sycl/test-e2e/KernelFusion/abort_fusion.cpp index 709befa514915..930fcc12eff86 100644 --- a/sycl/test-e2e/KernelFusion/abort_fusion.cpp +++ b/sycl/test-e2e/KernelFusion/abort_fusion.cpp @@ -15,14 +15,19 @@ enum class Internalization { None, Local, Private }; template size_t getSize(Range r); -template <> size_t getSize(range<1> r) { return r.size(); } -template <> size_t getSize(nd_range<1> r) { +template size_t getSize(range r) { + return r.size(); +} +template size_t getSize(nd_range r) { return r.get_global_range().size(); } template void performFusion(queue &q, Range1 R1, Range2 R2) { + using IndexTy1 = item; + using IndexTy2 = item; + int in[dataSize], tmp[dataSize], out[dataSize]; for (size_t i = 0; i < dataSize; ++i) { @@ -43,15 +48,19 @@ void performFusion(queue &q, Range1 R1, Range2 R2) { q.submit([&](handler &cgh) { auto accIn = bIn.get_access(cgh); auto accTmp = bTmp.get_access(cgh); - cgh.parallel_for( - R1, [=](item<1> i) { accTmp[i] = accIn[i] + 5; }); + cgh.parallel_for(R1, [=](IndexTy1 i) { + size_t j = i.get_linear_id(); + accTmp[j] = accIn[j] + 5; + }); }); q.submit([&](handler &cgh) { auto accTmp = bTmp.get_access(cgh); auto accOut = bOut.get_access(cgh); - cgh.parallel_for( - R2, [=](id<1> i) { accOut[i] = accTmp[i] * 2; }); + cgh.parallel_for(R2, [=](IndexTy2 i) { + size_t j = i.get_linear_id(); + accOut[j] = accTmp[j] * 2; + }); }); fw.complete_fusion({ext::codeplay::experimental::property::no_barriers{}}); @@ -117,5 +126,17 @@ int main() { // CHECK-NEXT: Cannot fuse kernels whose fusion would yield non-uniform work-group sizes // CHECK: COMPUTATION OK + // Scenario: Fusing two kernels that may lead to synchronization issues as two + // work-items in the same work-group may not be in the same work-group in the + // fused ND-range. + performFusion( + q, nd_range<2>{range<2>{2, 2}, range<2>{2, 2}}, + nd_range<2>{range<2>{4, 4}, range<2>{2, 2}}); + // CHECK: ERROR: JIT compilation for kernel fusion failed with message: + // CHECK-NEXT: Illegal ND-range combination + // CHECK-NEXT: Detailed information: + // CHECK-NEXT: Cannot fuse kernels when any of the fused kernels with a specific local size has different global sizes in dimensions [2, N) or different number of dimensions + // CHECK: COMPUTATION OK + return 0; } diff --git a/sycl/test-e2e/Matrix/joint_matrix_gemm_cuda.hpp b/sycl/test-e2e/Matrix/joint_matrix_gemm_cuda.hpp index fe5b110864e6b..9fd4f184692be 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_gemm_cuda.hpp +++ b/sycl/test-e2e/Matrix/joint_matrix_gemm_cuda.hpp @@ -11,7 +11,7 @@ using namespace sycl::ext::oneapi; using namespace sycl::ext::oneapi::experimental::matrix; constexpr float bf16_eps = 0.00390625; -// Example usage of Nvidia matrix multiply. +// Example usage of joint_matrix matrix multiply. 
// Optimizations such as memory paddings for avoiding bank conflicts are not // included in this test which aids clarity for what is going on. This example // forms a "Big matrix" corresponding to a single "TILE" using cuda example @@ -30,37 +30,47 @@ constexpr float bf16_eps = 0.00390625; constexpr int N_THREADS_PER_MATRIX_OP = 32; // number of submatrices per row of accumulator ("C", "D") matrices. -constexpr int SUB_TILES_M = 3; +constexpr int SUB_TILES_M = 2; // number of submatrices per col of accumulator matrices. constexpr int SUB_TILES_N = 2; // number of submatrices per col of "A"/per row of "B", matrices. -constexpr int SUB_TILES_K = 1; +constexpr int SUB_TILES_K = 2; -template +template class TypeHelper; -template -using KernelName = class TypeHelper; +template +using KernelName = + class TypeHelper; -template +template Tc matrix_ref_mn(const int &m, const int &n, Tm *A, Tm *B, Tc *C) { Tc res = C[m * Big_N + n]; - if constexpr (std::is_same::value) { - for (int k = 0; k < Big_K; k++) - res += A[m * Big_K + k] * B[k * Big_N + n]; - } else { - for (int k = 0; k < Big_K; k++) - res += - static_cast(A[m * Big_K + k]) * static_cast(B[k * Big_N + n]); + for (int k = 0; k < Big_K; k++) { + auto index_a = + layout_A == layout::row_major ? m * Big_K + k : m + k * Big_M; + auto index_b = + layout_B == layout::row_major ? k * Big_N + n : k + n * Big_K; + + if constexpr (std::is_same::value) { + res += A[index_a] * B[index_b]; + } else { + res += static_cast(A[index_a]) * static_cast(B[index_b]); + } } return res; } -template > +template < + typename Tm, typename Tc, typename Td, size_t Sub_Tiles_M, + size_t Sub_Tiles_K, size_t Sub_Tiles_N, size_t M, size_t K, size_t N, + layout layout_A = layout::row_major, layout layout_B = layout::row_major, + layout layout_C = layout::row_major, typename T3 = std::remove_const_t> void test(queue &q) { // total number of M dimension matrix elements for the "Big matrix". constexpr auto Big_M = Sub_Tiles_M * M; @@ -97,7 +107,8 @@ void test(queue &q) { accessor accA(bufA, cgh); - cgh.parallel_for>( + cgh.parallel_for>( range<1>(Big_M * Big_K), [=](item<1> item) { auto i = item.get_linear_id(); accA[i] = 0.1f * (i % 10); @@ -107,7 +118,8 @@ void test(queue &q) { accessor accB(bufB, cgh); - cgh.parallel_for>( + cgh.parallel_for>( range<1>(Big_K * Big_N), [=](item<1> item) { auto i = item.get_linear_id(); accB[i] = 0.1f * (i % 10); @@ -130,7 +142,8 @@ void test(queue &q) { range<2> GlobalRange = {Sub_Tiles_M, Sub_Tiles_N * N_THREADS_PER_MATRIX_OP}; - cgh.parallel_for>( + cgh.parallel_for< + KernelName>( nd_range<2>(GlobalRange, LocalRange), [=](nd_item<2> item) { sycl::sub_group sg = item.get_sub_group(); // row id of current submatrix of BIG C matrix @@ -138,33 +151,46 @@ void test(queue &q) { // column id of current submatrix of BIG C matrix const auto n = item.get_group().get_group_id()[1]; - joint_matrix - sub_a; - joint_matrix - sub_b; + joint_matrix sub_a; + joint_matrix sub_b; joint_matrix, use::accumulator, M, N> sub_c; joint_matrix sub_d; + auto stride_C = layout_C == layout::row_major ? Big_N : Big_M; + auto load_stride_C = layout_C == layout::row_major + ? (m * M) * Big_N + n * N + : (m * M) + n * N * Big_M; joint_matrix_load( sg, sub_c, accC.template get_multi_ptr() + - (m * M) * Big_N + n * N, - Big_N, layout::row_major); + load_stride_C, + stride_C, layout_C); + + auto stride_A = layout_A == layout::row_major ? Big_K : Big_M; + auto stride_B = layout_B == layout::row_major ? 
Big_N : Big_K; + // k = row/col id of current submatrix of BIG A/B matrices for (int k = 0; k < Sub_Tiles_K; k++) { + auto load_stride_A = layout_A == layout::row_major + ? (k * K) + (m * M * Big_K) + : (k * K * Big_M) + (m * M); + auto load_stride_B = layout_B == layout::row_major + ? (k * K * Big_N) + (n * N) + : (k * K) + (n * N * Big_K); + joint_matrix_load( sg, sub_a, accA.template get_multi_ptr() + - (k * K) + (m * M * Big_K), - Big_K); + load_stride_A, + stride_A); joint_matrix_load( sg, sub_b, accB.template get_multi_ptr() + - (k * K * Big_N) + (n * N), - Big_N); + load_stride_B, + stride_B); // round values to correct precision if using tf32 if constexpr (std::is_same::value) { @@ -174,12 +200,13 @@ void test(queue &q) { } joint_matrix_mad(sg, sub_d, sub_a, sub_b, sub_c); + joint_matrix_copy(sg, sub_d, sub_c); } joint_matrix_store( sg, sub_d, accD.template get_multi_ptr() + - (m * M) * Big_N + n * N, - Big_N, layout::row_major); + load_stride_C, + stride_C, layout_C); }); }); q.wait(); @@ -187,14 +214,18 @@ void test(queue &q) { for (int m = 0; m < Big_M; m++) { for (int n = 0; n < Big_N; n++) { + auto index_D = + layout_C == layout::row_major ? m * Big_N + n : m + n * Big_M; if constexpr (std::is_same, bfloat16>::value) { - auto res_device = matrix_ref_mn(m, n, A, B, C); - assert(fabs(2 * (D[m * Big_N + n] - res_device)) / - (D[m * Big_N + n] + res_device) < + auto res_device = + matrix_ref_mn(m, n, A, B, + C); + assert(fabs(2 * (D[index_D] - res_device)) / (D[index_D] + res_device) < bf16_eps * 2); } else { - assert( - (D[m * Big_N + n] == matrix_ref_mn(m, n, A, B, C))); + assert((D[index_D] == + matrix_ref_mn(m, n, A, + B, C))); } } } diff --git a/sycl/test-e2e/Matrix/joint_matrix_tensorcores_sm70.cpp b/sycl/test-e2e/Matrix/joint_matrix_tensorcores_sm70.cpp index f28372b6277dc..a558600ad390c 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_tensorcores_sm70.cpp +++ b/sycl/test-e2e/Matrix/joint_matrix_tensorcores_sm70.cpp @@ -80,12 +80,23 @@ int main() { test(Q); + // test different layout combinations for one case + + test(Q); + test(Q); + test(Q); + test(Q); + + // joint_matrix_apply tests + auto apply_add = [](auto &x) { x = x + 2; }; float D[MATRIX_M][MATRIX_N]; big_matrix MD_f((float *)&D); - // joint_matrix_apply tests - matrix_verify_lambda(Q, MD_f, 0.0, apply_add); } diff --git a/sycl/test-e2e/Matrix/joint_matrix_tensorcores_sm72.cpp b/sycl/test-e2e/Matrix/joint_matrix_tensorcores_sm72.cpp index cea15392408cc..1dea8c879b5eb 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_tensorcores_sm72.cpp +++ b/sycl/test-e2e/Matrix/joint_matrix_tensorcores_sm72.cpp @@ -50,13 +50,28 @@ int main() { test(Q); + // test different layout combinations for one case + + test(Q); + test(Q); + test(Q); + test(Q); + + // joint_matrix_apply tests + auto apply_add = [](auto &x) { x = x + 2; }; int32_t D_i[MATRIX_M][MATRIX_N]; big_matrix MD_i((int32_t *)&D_i); - // joint_matrix_apply tests - matrix_verify_lambda(Q, MD_i, 0, apply_add); matrix_verify_lambda(Q, MD_i, 0, apply_add); } diff --git a/sycl/test-e2e/Matrix/joint_matrix_tensorcores_sm80.cpp b/sycl/test-e2e/Matrix/joint_matrix_tensorcores_sm80.cpp index 2a0731d9b988e..ca823161b6197 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_tensorcores_sm80.cpp +++ b/sycl/test-e2e/Matrix/joint_matrix_tensorcores_sm80.cpp @@ -43,9 +43,28 @@ int main() { // A/B tf32 test(Q); test(Q); + 16, 8, 16, layout::row_major, layout::row_major, layout::row_major, + precision::tf32>(Q); + + // test different layout combinations for one case + + test(Q); + test(Q); + 
test(Q); + test(Q); + + // joint_matrix_apply tests float D[MATRIX_M][MATRIX_N]; big_matrix MD_f((float *)&D); @@ -54,7 +73,6 @@ int main() { big_matrix MD_d((double *)&D_d); auto apply_add = [](auto &x) { x = x + 2; }; - // joint_matrix_apply tests matrix_verify_lambda(Q, MD_f, 0.0, apply_add); matrix_verify_lambda(Q, MD_d, -60.0, apply_add); diff --git a/sycl/test-e2e/USM/usm_pooling.cpp b/sycl/test-e2e/USM/usm_pooling.cpp index 4a9d16ec5a34e..2f2d4009dce1e 100644 --- a/sycl/test-e2e/USM/usm_pooling.cpp +++ b/sycl/test-e2e/USM/usm_pooling.cpp @@ -1,6 +1,9 @@ // REQUIRES: level_zero // RUN: %{build} -o %t.out +// https://github.com/intel/llvm/issues/12397 +// UNSUPPORTED: gpu-intel-dg2 + // Allocate 2 items of 2MB. Free 2. Allocate 3 more of 2MB. // With no pooling: 1,2,3,4,5 allocs lead to ZE call. diff --git a/sycl/test/check_device_code/fp-accuracy.cpp b/sycl/test/check_device_code/fp-accuracy.cpp new file mode 100644 index 0000000000000..f5a42c2bbc436 --- /dev/null +++ b/sycl/test/check_device_code/fp-accuracy.cpp @@ -0,0 +1,29 @@ +// DEFINE: %{common_opts} = -fsycl -fsycl-device-only -fno-math-errno \ +// DEFINE: -ffp-accuracy=high -S -emit-llvm -o - %s + +// RUN: %clangxx %{common_opts} | FileCheck %s + +// RUN: %clangxx %{common_opts} -ffp-accuracy=low:exp \ +// RUN: | FileCheck %s --check-prefix=CHECK-LOW-EXP + +#include + +SYCL_EXTERNAL auto foo(double x) { + using namespace sycl; + return cos(exp(log(x))); +} + +// CHECK-LABEL: define {{.*}}food +// CHECK: tail call double @llvm.fpbuiltin.log.f64(double {{.*}}) #[[ATTR_HIGH:[0-9]+]] +// CHECK: tail call double @llvm.fpbuiltin.exp.f64(double {{.*}}) #[[ATTR_HIGH]] +// CHECK: tail call double @llvm.fpbuiltin.cos.f64(double {{.*}}) #[[ATTR_HIGH]] + +// CHECK: attributes #[[ATTR_HIGH]] = {{.*}}"fpbuiltin-max-error"="1.0" + +// CHECK-LOW-EXP-LABEL: define {{.*}}food +// CHECK-LOW-EXP: tail call double @llvm.fpbuiltin.log.f64(double {{.*}}) #[[ATTR_F1_HIGH:[0-9]+]] +// CHECK-LOW-EXP: tail call double @llvm.fpbuiltin.exp.f64(double {{.*}}) #[[ATTR_F1_LOW:[0-9]+]] +// CHECK-LOW-EXP: tail call double @llvm.fpbuiltin.cos.f64(double {{.*}}) #[[ATTR_F1_HIGH]] + +// CHECK-F1: attributes #[[ATTR_F1_HIGH]] = {{.*}}"fpbuiltin-max-error"="1.0" +// CHECK-F1: attributes #[[ATTR_F1_LOW]] = {{.*}}"fpbuiltin-max-error"="67108864.0" diff --git a/sycl/test/esimd/memory_properties.cpp b/sycl/test/esimd/memory_properties.cpp index 737d7b4fabfad..31dbc3e889f4b 100644 --- a/sycl/test/esimd/memory_properties.cpp +++ b/sycl/test/esimd/memory_properties.cpp @@ -969,6 +969,8 @@ test_gather_scatter(AccType &acc, float *ptrf, int byte_offset32, simd pass_thru; auto pass_thru_view = pass_thru.select<32, 1>(); + auto usm_view = usm.select<32, 1>(); + // Test USM and ACC gather using this plan: // 1) gather(usm, offsets): offsets is simd or simd_view // 2) gather(usm, offsets, mask): offsets is simd or simd_view @@ -1151,6 +1153,64 @@ test_gather_scatter(AccType &acc, float *ptrf, int byte_offset32, props_align4); acc_res = gather(acc, ioffset_n16_view, mask_n16, pass_thru_view, props_align4); + + // CHECK-COUNT-4: call void @llvm.genx.svm.scatter.v32i1.v32i64.v32f32(<32 x i1> {{[^)]+}}, i32 0, <32 x i64> {{[^)]+}}, <32 x float> {{[^)]+}}) + scatter(ptrf, ioffset_n32, usm, mask_n32); + + scatter(ptrf, ioffset_n32, usm); + + scatter(ptrf, ioffset_n32, usm, mask_n32, props_align4); + + scatter(ptrf, ioffset_n32, usm, props_align4); + + // CHECK-COUNT-8: call void @llvm.genx.lsc.store.stateless.v32i1.v32i64.v32i32(<32 x i1> {{[^)]+}}, i8 4, i8 1, i8 1, i16 1, i32 0, 
i8 3, i8 1, i8 1, i8 0, <32 x i64> {{[^)]+}}, <32 x i32> {{[^)]+}}, i32 0) + scatter(ptrf, ioffset_n32, usm, mask_n32, props_cache_load); + scatter(ptrf, ioffset_n32, usm, props_cache_load); + + scatter(ptrf, ioffset_n32_view, usm, mask_n32, props_cache_load); + scatter(ptrf, ioffset_n32_view, usm, props_cache_load); + + scatter(ptrf, ioffset_n32, usm_view, mask_n32, props_cache_load); + scatter(ptrf, ioffset_n32, usm_view, props_cache_load); + + scatter(ptrf, ioffset_n32_view, usm_view, mask_n32, + props_cache_load); + scatter(ptrf, ioffset_n32_view, usm_view, props_cache_load); + + // VS > 1 + // CHECK-COUNT-8: call void @llvm.genx.lsc.store.stateless.v16i1.v16i64.v32i32(<16 x i1> {{[^)]+}}, i8 4, i8 1, i8 1, i16 1, i32 0, i8 3, i8 2, i8 1, i8 0, <16 x i64> {{[^)]+}}, <32 x i32> {{[^)]+}}, i32 0) + scatter(ptrf, ioffset_n16, usm, mask_n16, props_cache_load); + + scatter(ptrf, ioffset_n16, usm, props_cache_load); + + scatter(ptrf, ioffset_n16_view, usm, mask_n16, + props_cache_load); + scatter(ptrf, ioffset_n16_view, usm, props_cache_load); + + scatter(ptrf, ioffset_n16, usm_view, mask_n16, + props_cache_load); + scatter(ptrf, ioffset_n16, usm_view, props_cache_load); + + scatter(ptrf, ioffset_n16_view, usm_view, mask_n16, + props_cache_load); + scatter(ptrf, ioffset_n16_view, usm_view, props_cache_load); + + // CHECK-COUNT-8: call void @llvm.genx.lsc.store.stateless.v16i1.v16i64.v32i32(<16 x i1> {{[^)]+}}, i8 4, i8 0, i8 0, i16 1, i32 0, i8 3, i8 2, i8 1, i8 0, <16 x i64> {{[^)]+}}, <32 x i32> {{[^)]+}}, i32 0) + scatter(ptrf, ioffset_n16, usm, mask_n16); + + scatter(ptrf, ioffset_n16, usm); + + scatter(ptrf, ioffset_n16_view, usm, mask_n16); + + scatter(ptrf, ioffset_n16_view, usm); + + scatter(ptrf, ioffset_n16, usm_view, mask_n16); + + scatter(ptrf, ioffset_n16, usm_view); + + scatter(ptrf, ioffset_n16_view, usm_view, mask_n16); + + scatter(ptrf, ioffset_n16_view, usm_view); } // CHECK-LABEL: define {{.*}} @_Z23test_slm_gather_scatter{{.*}} diff --git a/sycl/unittests/helpers/PiMockPlugin.hpp b/sycl/unittests/helpers/PiMockPlugin.hpp index c1f0a58f82274..31eac5598f588 100644 --- a/sycl/unittests/helpers/PiMockPlugin.hpp +++ b/sycl/unittests/helpers/PiMockPlugin.hpp @@ -613,7 +613,7 @@ mock_piMemBufferPartition(pi_mem buffer, pi_mem_flags flags, return PI_SUCCESS; } -inline pi_result mock_piextMemGetNativeHandle(pi_mem mem, +inline pi_result mock_piextMemGetNativeHandle(pi_mem mem, pi_device dev, pi_native_handle *nativeHandle) { *nativeHandle = reinterpret_cast(mem); return PI_SUCCESS;
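Closing note on the PI interface change threaded through the plugins, the pi2ur layer, and the mock above: piextMemGetNativeHandle now also takes the pi_device the memory should be resident on. The fragment below is only an illustrative sketch, assuming the PI declarations from <sycl/detail/pi.h>; the helper name is hypothetical and not part of the patch.

#include <sycl/detail/pi.h>

// Resolve the backend-native handle behind a PI memory object using the new
// three-parameter signature. Passing Dev == nullptr mirrors the runtime's
// buffer-interop path, where the resident device is unknown; as the comments
// in this patch note, not every backend supports that case.
pi_result getNativeMemHandle(pi_mem Mem, pi_device Dev,
                             pi_native_handle *Handle) {
  return piextMemGetNativeHandle(Mem, Dev, Handle);
}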