From cf829e065ebfeeaa87d15cb907ee57f8568af7d8 Mon Sep 17 00:00:00 2001
From: JackAKirk <jack.kirk@codeplay.com>
Date: Fri, 2 Feb 2024 12:15:25 +0000
Subject: [PATCH 1/9] [syclcompat][CUDA] FIX UB in test / seq_cst requires
 sm_70 on CUDA (#12575)

Fix UB in test by using a single thread task.

A shared USM variable was being simultaneously written to by multiple
threads without using atomics. AFAIK this is generally not a well
defined program, and was leading to invalid values.

seq_cst also requires sm_70 on CUDA, so the compilation invocation is
updated to reflect this.
The CI device is >= sm_70 so it can use seq_cst. However this test did
not compile for >=sm_70.

Signed-off-by: JackAKirk <jack.kirk@codeplay.com>
---
 sycl/test-e2e/syclcompat/atomic/atomic_class.cpp          | 6 +++---
 sycl/test-e2e/syclcompat/atomic/atomic_memory_acq_rel.cpp | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)
diff --git a/sycl/test-e2e/syclcompat/atomic/atomic_class.cpp b/sycl/test-e2e/syclcompat/atomic/atomic_class.cpp
index 3c6b95c1eb4af..d004a37bcc610 100644
--- a/sycl/test-e2e/syclcompat/atomic/atomic_class.cpp
+++ b/sycl/test-e2e/syclcompat/atomic/atomic_class.cpp
@@ -32,7 +32,7 @@
 
 // UNSUPPORTED: hip || (windows && level_zero)
 
-// RUN: %clangxx -std=c++20 -fsycl -fsycl-targets=%{sycl_triple} %s -o %t.out
+// RUN: %clangxx -std=c++20 -fsycl -fsycl-targets=%{sycl_triple} %if any-device-is-cuda %{ -Xsycl-target-backend --cuda-gpu-arch=sm_70 %} %s -o %t.out
 // RUN: %{run} %t.out
 
 #include <sycl/sycl.hpp>
@@ -41,8 +41,8 @@
 #include "../common.hpp"
 #include "atomic_fixt.hpp"
 
-constexpr size_t numBlocks = 64;
-constexpr size_t numThreads = 256;
+constexpr size_t numBlocks = 1;
+constexpr size_t numThreads = 1;
 constexpr size_t numData = 6;
 
 template <typename T, typename AtomicType>
diff --git a/sycl/test-e2e/syclcompat/atomic/atomic_memory_acq_rel.cpp b/sycl/test-e2e/syclcompat/atomic/atomic_memory_acq_rel.cpp
index 6b31bcc626ee0..4ccc67fbff53e 100644
--- a/sycl/test-e2e/syclcompat/atomic/atomic_memory_acq_rel.cpp
+++ b/sycl/test-e2e/syclcompat/atomic/atomic_memory_acq_rel.cpp
@@ -32,7 +32,7 @@
 
 // UNSUPPORTED: hip
 
-// RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple} %s -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple} %if any-device-is-cuda %{ -Xsycl-target-backend --cuda-gpu-arch=sm_70 %} %s -o %t.out
 // RUN: %{run} %t.out
 
 #include <iostream>

From d3d6e789cd420e88648580c160785bf15200bcc8 Mon Sep 17 00:00:00 2001
From: aelovikov-intel <andrei.elovikov@intel.com>
Date: Fri, 2 Feb 2024 07:36:49 -0800
Subject: [PATCH 2/9] [SYCL][E2E] Disable
 assert_in_multiple_tus[_one_ndebug].cpp on cuda (#12584)

See https://github.com/intel/llvm/issues/8832, the test is flaky in
post-commit/nightly.
---
 sycl/test-e2e/Assert/assert_in_multiple_tus.cpp            | 3 +++
 sycl/test-e2e/Assert/assert_in_multiple_tus_one_ndebug.cpp | 3 +++
 2 files changed, 6 insertions(+)

diff --git a/sycl/test-e2e/Assert/assert_in_multiple_tus.cpp b/sycl/test-e2e/Assert/assert_in_multiple_tus.cpp
index a1d40ad0489a1..062f5952236ad 100644
--- a/sycl/test-e2e/Assert/assert_in_multiple_tus.cpp
+++ b/sycl/test-e2e/Assert/assert_in_multiple_tus.cpp
@@ -3,6 +3,9 @@
 // https://github.com/intel/llvm/issues/7634
 // UNSUPPORTED: hip
 //
+// https://github.com/intel/llvm/issues/8832
+// UNSUPPORTED: cuda
+//
 // FIXME: Remove XFAIL one intel/llvm#11364 is resolved
 // XFAIL: (opencl && gpu)
 
diff --git a/sycl/test-e2e/Assert/assert_in_multiple_tus_one_ndebug.cpp b/sycl/test-e2e/Assert/assert_in_multiple_tus_one_ndebug.cpp
index e5db927f33e11..1bf2ecdc98418 100644
--- a/sycl/test-e2e/Assert/assert_in_multiple_tus_one_ndebug.cpp
+++ b/sycl/test-e2e/Assert/assert_in_multiple_tus_one_ndebug.cpp
@@ -3,6 +3,9 @@
 // https://github.com/intel/llvm/issues/7634
 // UNSUPPORTED: hip
 //
+// https://github.com/intel/llvm/issues/8832
+// UNSUPPORTED: cuda
+//
 // FIXME: Remove XFAIL one intel/llvm#11364 is resolved
 // XFAIL: (opencl && gpu)
 

From 153ccbe4c38fc7392c2aff06d82c3f6666f57248 Mon Sep 17 00:00:00 2001
From: aelovikov-intel <andrei.elovikov@intel.com>
Date: Fri, 2 Feb 2024 12:12:26 -0800
Subject: [PATCH 3/9] [NFCI][SYCL] Refactor getBinaryImageFormat (#12586)

A future PR will add support for magic numbers other than four bytes.
Refactor the code to make those future changes easier to review.
---
 sycl/source/detail/pi.cpp | 70 ++++++++++++++++++---------------------
 1 file changed, 32 insertions(+), 38 deletions(-)

diff --git a/sycl/source/detail/pi.cpp b/sycl/source/detail/pi.cpp
index 92bccc2cea2b2..4dbeaccc9baf2 100644
--- a/sycl/source/detail/pi.cpp
+++ b/sycl/source/detail/pi.cpp
@@ -685,45 +685,39 @@ static uint16_t getELFHeaderType(const unsigned char *ImgData, size_t ImgSize) {
 sycl::detail::pi::PiDeviceBinaryType
 getBinaryImageFormat(const unsigned char *ImgData, size_t ImgSize) {
   // Top-level magic numbers for the recognized binary image formats.
-  struct {
-    sycl::detail::pi::PiDeviceBinaryType Fmt;
-    const uint32_t Magic;
-  } Fmts[] = {{PI_DEVICE_BINARY_TYPE_SPIRV, 0x07230203},
-              {PI_DEVICE_BINARY_TYPE_LLVMIR_BITCODE, 0xDEC04342},
-              // 'I', 'N', 'T', 'C' ; Intel native
-              {PI_DEVICE_BINARY_TYPE_NATIVE, 0x43544E49}};
-
-  if (ImgSize >= sizeof(Fmts[0].Magic)) {
-    std::remove_const_t<decltype(Fmts[0].Magic)> Hdr = 0;
-    std::copy(ImgData, ImgData + sizeof(Hdr), reinterpret_cast<char *>(&Hdr));
-
-    // Check headers for direct formats.
-    for (const auto &Fmt : Fmts) {
-      if (Hdr == Fmt.Magic)
-        return Fmt.Fmt;
-    }
-
-    // ELF e_type for recognized binary image formats.
-    struct {
-      sycl::detail::pi::PiDeviceBinaryType Fmt;
-      const uint16_t Magic;
-    } ELFFmts[] = {{PI_DEVICE_BINARY_TYPE_NATIVE, 0xFF04},  // OpenCL executable
-                   {PI_DEVICE_BINARY_TYPE_NATIVE, 0xFF12}}; // ZEBIN executable
-
-    // ELF files need to be parsed separately. The header type ends after 18
-    // bytes.
-    if (Hdr == 0x464c457F && ImgSize >= 18) {
-      uint16_t HdrType = getELFHeaderType(ImgData, ImgSize);
-      for (const auto &ELFFmt : ELFFmts) {
-        if (HdrType == ELFFmt.Magic)
-          return ELFFmt.Fmt;
-      }
-      // Newer ZEBIN format does not have a special header type, but can instead
-      // be identified by having a required .ze_info section.
-      if (checkELFSectionPresent(".ze_info", ImgData, ImgSize))
-        return PI_DEVICE_BINARY_TYPE_NATIVE;
-    }
+  auto MatchMagicNumber = [&](auto Number) {
+    return ImgSize >= sizeof(Number) &&
+           std::memcmp(ImgData, &Number, sizeof(Number)) == 0;
+  };
+
+  if (MatchMagicNumber(uint32_t{0x07230203}))
+    return PI_DEVICE_BINARY_TYPE_SPIRV;
+
+  if (MatchMagicNumber(uint32_t{0xDEC04342}))
+    return PI_DEVICE_BINARY_TYPE_LLVMIR_BITCODE;
+
+  if (MatchMagicNumber(uint32_t{0x43544E49}))
+    // 'I', 'N', 'T', 'C' ; Intel native
+    return PI_DEVICE_BINARY_TYPE_LLVMIR_BITCODE;
+
+  // Check for ELF format, size requirements include data we'll read in case of
+  // succesful match.
+  if (ImgSize >= 18 && MatchMagicNumber(uint32_t{0x464c457F})) {
+    uint16_t ELFHdrType = getELFHeaderType(ImgData, ImgSize);
+    if (ELFHdrType == 0xFF04)
+      // OpenCL executable.
+      return PI_DEVICE_BINARY_TYPE_NATIVE;
+
+    if (ELFHdrType == 0xFF12)
+      // ZEBIN executable.
+      return PI_DEVICE_BINARY_TYPE_NATIVE;
+
+    // Newer ZEBIN format does not have a special header type, but can instead
+    // be identified by having a required .ze_info section.
+    if (checkELFSectionPresent(".ze_info", ImgData, ImgSize))
+      return PI_DEVICE_BINARY_TYPE_NATIVE;
   }
+
   return PI_DEVICE_BINARY_TYPE_NONE;
 }
 

From f331ba2063eb80311818760eee74b58473ecb107 Mon Sep 17 00:00:00 2001
From: Yang Zhao <yang2.zhao@intel.com>
Date: Mon, 5 Feb 2024 16:04:19 +0800
Subject: [PATCH 4/9] [DeviceSanitizer] Support CPU Device & Static Local
 Memory (#12248)

UR: https://github.com/oneapi-src/unified-runtime/pull/1210

---------

Co-authored-by: Maosu Zhao <maosu.zhao@intel.com>
---
 libdevice/atomic.hpp                          |  21 +
 libdevice/cmake/modules/SYCLLibdevice.cmake   |   6 +-
 libdevice/include/device-sanitizer-report.hpp |  58 ++
 libdevice/include/sanitizer_device_utils.hpp  |  49 ++
 libdevice/include/spir_global_var.hpp         |  26 +
 libdevice/sanitizer_utils.cpp                 | 527 +++++++++++++++++-
 libdevice/spirv_vars.h                        |   1 +
 .../Instrumentation/AddressSanitizer.cpp      | 245 +++++++-
 .../Transforms/Scalar/LoopIdiomRecognize.cpp  |   4 +
 .../Instrumentation/AddressSanitizer/spir.ll  | 156 ++++++
 llvm/tools/sycl-post-link/sycl-post-link.cpp  |   5 +-
 sycl/plugins/unified_runtime/CMakeLists.txt   |   7 +
 .../out-of-bounds/USM/parallel_for_char.cpp   |  40 ++
 .../out-of-bounds/USM/parallel_for_double.cpp |  40 ++
 .../out-of-bounds/USM/parallel_for_func.cpp   |  42 ++
 .../out-of-bounds/USM/parallel_for_int.cpp    |  40 ++
 .../out-of-bounds/USM/parallel_for_short.cpp  |  40 ++
 .../out-of-bounds/local/local-overflow-1.cpp  |  29 +
 sycl/test-e2e/lit.cfg.py                      |   4 +
 19 files changed, 1304 insertions(+), 36 deletions(-)
 create mode 100644 libdevice/include/device-sanitizer-report.hpp
 create mode 100644 libdevice/include/sanitizer_device_utils.hpp
 create mode 100644 libdevice/include/spir_global_var.hpp
 create mode 100644 llvm/test/Instrumentation/AddressSanitizer/spir.ll
 create mode 100644 sycl/test-e2e/AddressSanitizer/out-of-bounds/USM/parallel_for_char.cpp
 create mode 100644 sycl/test-e2e/AddressSanitizer/out-of-bounds/USM/parallel_for_double.cpp
 create mode 100644 sycl/test-e2e/AddressSanitizer/out-of-bounds/USM/parallel_for_func.cpp
 create mode 100644 sycl/test-e2e/AddressSanitizer/out-of-bounds/USM/parallel_for_int.cpp
 create mode 100644 sycl/test-e2e/AddressSanitizer/out-of-bounds/USM/parallel_for_short.cpp
 create mode 100644 sycl/test-e2e/AddressSanitizer/out-of-bounds/local/local-overflow-1.cpp

diff --git a/libdevice/atomic.hpp b/libdevice/atomic.hpp
index 429792f94eb1d..3b6d1cf71f441 100644
--- a/libdevice/atomic.hpp
+++ b/libdevice/atomic.hpp
@@ -62,6 +62,11 @@ __spirv_AtomicCompareExchange(int SPIR_GLOBAL *, __spv::Scope::Flag,
                               __spv::MemorySemanticsMask::Flag,
                               __spv::MemorySemanticsMask::Flag, int, int);
 
+extern DEVICE_EXTERNAL int
+__spirv_AtomicCompareExchange(int *, __spv::Scope::Flag,
+                              __spv::MemorySemanticsMask::Flag,
+                              __spv::MemorySemanticsMask::Flag, int, int);
+
 extern DEVICE_EXTERNAL int __spirv_AtomicLoad(const int SPIR_GLOBAL *,
                                               __spv::Scope::Flag,
                                               __spv::MemorySemanticsMask::Flag);
@@ -70,6 +75,10 @@ extern DEVICE_EXTERNAL void
 __spirv_AtomicStore(int SPIR_GLOBAL *, __spv::Scope::Flag,
                     __spv::MemorySemanticsMask::Flag, int);
 
+extern DEVICE_EXTERNAL void
+__spirv_AtomicStore(int *, __spv::Scope::Flag, __spv::MemorySemanticsMask::Flag,
+                    int);
+
 /// Atomically set the value in *Ptr with Desired if and only if it is Expected
 /// Return the value which already was in *Ptr
 static inline int atomicCompareAndSet(SPIR_GLOBAL int *Ptr, int Desired,
@@ -80,6 +89,13 @@ static inline int atomicCompareAndSet(SPIR_GLOBAL int *Ptr, int Desired,
       __spv::MemorySemanticsMask::SequentiallyConsistent, Desired, Expected);
 }
 
+static inline int atomicCompareAndSet(int *Ptr, int Desired, int Expected) {
+  return __spirv_AtomicCompareExchange(
+      Ptr, __spv::Scope::Device,
+      __spv::MemorySemanticsMask::SequentiallyConsistent,
+      __spv::MemorySemanticsMask::SequentiallyConsistent, Desired, Expected);
+}
+
 static inline int atomicLoad(SPIR_GLOBAL int *Ptr) {
   return __spirv_AtomicLoad(Ptr, __spv::Scope::Device,
                             __spv::MemorySemanticsMask::SequentiallyConsistent);
@@ -90,4 +106,9 @@ static inline void atomicStore(SPIR_GLOBAL int *Ptr, int V) {
                       __spv::MemorySemanticsMask::SequentiallyConsistent, V);
 }
 
+static inline void atomicStore(int *Ptr, int V) {
+  __spirv_AtomicStore(Ptr, __spv::Scope::Device,
+                      __spv::MemorySemanticsMask::SequentiallyConsistent, V);
+}
+
 #endif // __SPIR__
diff --git a/libdevice/cmake/modules/SYCLLibdevice.cmake b/libdevice/cmake/modules/SYCLLibdevice.cmake
index f8477c7b5d2d0..1d2e1b4de64f5 100644
--- a/libdevice/cmake/modules/SYCLLibdevice.cmake
+++ b/libdevice/cmake/modules/SYCLLibdevice.cmake
@@ -109,7 +109,7 @@ set(imf_obj_deps device_imf.hpp imf_half.hpp imf_bf16.hpp imf_rounding_op.hpp im
 set(itt_obj_deps device_itt.h spirv_vars.h device.h sycl-compiler)
 set(bfloat16_obj_deps sycl-headers sycl-compiler)
 if (NOT MSVC)
-  set(sanitizer_obj_deps device.h sycl-compiler)
+  set(sanitizer_obj_deps device.h atomic.hpp spirv_vars.h include/sanitizer_device_utils.hpp include/spir_global_var.hpp sycl-compiler)
 endif()
 
 add_devicelib_obj(libsycl-itt-stubs SRC itt_stubs.cpp DEP ${itt_obj_deps})
@@ -126,9 +126,9 @@ add_devicelib_obj(libsycl-imf-fp64 SRC imf_wrapper_fp64.cpp DEP ${imf_obj_deps})
 add_devicelib_obj(libsycl-imf-bf16 SRC imf_wrapper_bf16.cpp DEP ${imf_obj_deps})
 add_devicelib_obj(libsycl-bfloat16 SRC bfloat16_wrapper.cpp DEP ${cmath_obj_deps} )
 if(MSVC)
-add_devicelib_obj(libsycl-msvc-math SRC msvc_math.cpp DEP ${cmath_obj_deps})
+  add_devicelib_obj(libsycl-msvc-math SRC msvc_math.cpp DEP ${cmath_obj_deps})
 else()
-add_devicelib_obj(libsycl-sanitizer SRC sanitizer_utils.cpp DEP ${sanitizer_obj_deps})
+  add_devicelib_obj(libsycl-sanitizer SRC sanitizer_utils.cpp DEP ${sanitizer_obj_deps} EXTRA_ARGS -fno-sycl-instrument-device-code)
 endif()
 
 add_fallback_devicelib(libsycl-fallback-cassert SRC fallback-cassert.cpp DEP ${crt_obj_deps} EXTRA_ARGS -fno-sycl-instrument-device-code)
diff --git a/libdevice/include/device-sanitizer-report.hpp b/libdevice/include/device-sanitizer-report.hpp
new file mode 100644
index 0000000000000..477fac6d4a5d4
--- /dev/null
+++ b/libdevice/include/device-sanitizer-report.hpp
@@ -0,0 +1,58 @@
+//==-- device-sanitizer-report.hpp - Structure and declaration for assert
+// support --==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#pragma once
+
+// Treat this header as system one to workaround frontend's restriction
+#pragma clang system_header
+
+#include <cinttypes>
+
+enum class DeviceSanitizerErrorType : int32_t {
+  UNKNOWN,
+  OUT_OF_BOUND,
+  MISALIGNED,
+  USE_AFTER_FREE,
+  OUT_OF_SHADOW_BOUND,
+};
+
+enum class DeviceSanitizerMemoryType : int32_t {
+  UNKNOWN,
+  USM_DEVICE,
+  USM_HOST,
+  USM_SHARED,
+  LOCAL,
+  PRIVATE,
+  MEM_BUFFER,
+};
+
+// NOTE Layout of this structure should be aligned with the one in
+// sycl/include/sycl/detail/device_sanitizer_report.hpp
+struct DeviceSanitizerReport {
+  int Flag = 0;
+
+  char File[256 + 1] = "";
+  char Func[256 + 1] = "";
+
+  int32_t Line = 0;
+
+  uint64_t GID0 = 0;
+  uint64_t GID1 = 0;
+  uint64_t GID2 = 0;
+
+  uint64_t LID0 = 0;
+  uint64_t LID1 = 0;
+  uint64_t LID2 = 0;
+
+  bool IsWrite = false;
+  uint32_t AccessSize = 0;
+  DeviceSanitizerMemoryType MemoryType = DeviceSanitizerMemoryType::UNKNOWN;
+  DeviceSanitizerErrorType ErrorType = DeviceSanitizerErrorType::UNKNOWN;
+
+  bool IsRecover = false;
+};
diff --git a/libdevice/include/sanitizer_device_utils.hpp b/libdevice/include/sanitizer_device_utils.hpp
new file mode 100644
index 0000000000000..ab472d2139abc
--- /dev/null
+++ b/libdevice/include/sanitizer_device_utils.hpp
@@ -0,0 +1,49 @@
+//==-- sanitizer_device_utils.hpp - Declaration for sanitizer global var ---==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#pragma once
+
+#include "spir_global_var.hpp"
+#include <cstdint>
+
+// Treat this header as system one to workaround frontend's restriction
+#pragma clang system_header
+
+template <typename T>
+class
+#ifdef __SYCL_DEVICE_ONLY__
+    [[__sycl_detail__::global_variable_allowed, __sycl_detail__::device_global,
+      __sycl_detail__::add_ir_attributes_global_variable(
+          "sycl-device-global-size", "sycl-device-image-scope", sizeof(T),
+          nullptr)]]
+#endif
+    DeviceGlobal {
+public:
+  DeviceGlobal() = default;
+  DeviceGlobal(const DeviceGlobal &) = delete;
+  DeviceGlobal(const DeviceGlobal &&) = delete;
+  DeviceGlobal &operator=(const DeviceGlobal &) = delete;
+  DeviceGlobal &operator=(const DeviceGlobal &&) = delete;
+
+  DeviceGlobal &operator=(const T newValue) noexcept {
+    val = newValue;
+    return *this;
+  }
+
+  operator T &() noexcept { return val; }
+
+  operator const T &() const noexcept { return val; }
+
+  T &get() noexcept { return val; }
+
+  const T &get() const noexcept { return val; }
+
+private:
+  T val;
+};
+
+enum DeviceType : uintptr_t { UNKNOWN, CPU, GPU_PVC, GPU_DG2 };
diff --git a/libdevice/include/spir_global_var.hpp b/libdevice/include/spir_global_var.hpp
new file mode 100644
index 0000000000000..4bf1cb8daefae
--- /dev/null
+++ b/libdevice/include/spir_global_var.hpp
@@ -0,0 +1,26 @@
+//==- spir_global_var.hpp - Declaration for device global variable support -==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+// Treat this header as system one to workaround frontend's restriction
+#pragma clang system_header
+
+#ifndef SPIR_GLOBAL_VAR
+#ifdef __SYCL_DEVICE_ONLY__
+#define SPIR_GLOBAL_VAR __attribute__((sycl_global_var))
+#else
+#warning "SPIR_GLOBAL_VAR not defined in host mode. Defining as empty macro."
+#define SPIR_GLOBAL_VAR
+#endif
+#endif
+
+#define __SYCL_GLOBAL__ __attribute__((opencl_global))
+#define __SYCL_LOCAL__ __attribute__((opencl_local))
+#define __SYCL_PRIVATE__ __attribute__((opencl_private))
+#define __SYCL_CONSTANT__ __attribute__((opencl_constant))
diff --git a/libdevice/sanitizer_utils.cpp b/libdevice/sanitizer_utils.cpp
index c0d4cbba112c3..3e4fd11ec811c 100644
--- a/libdevice/sanitizer_utils.cpp
+++ b/libdevice/sanitizer_utils.cpp
@@ -6,22 +6,525 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "atomic.hpp"
 #include "device.h"
+#include "spirv_vars.h"
+
+#include "include/device-sanitizer-report.hpp"
+#include "include/sanitizer_device_utils.hpp"
+
+#include <cstddef>
 #include <cstdint>
 
-using uptr = uint64_t;
+using uptr = uintptr_t;
+using s8 = char;
+using u8 = unsigned char;
+using s16 = short;
+using u16 = unsigned short;
+
+#define ASAN_SHADOW_SCALE 3
+#define ASAN_SHADOW_GRANULARITY (1ULL << ASAN_SHADOW_SCALE)
+
+DeviceGlobal<uptr> __AsanShadowMemoryGlobalStart;
+DeviceGlobal<uptr> __AsanShadowMemoryGlobalEnd;
+DeviceGlobal<uptr> __AsanShadowMemoryLocalStart;
+DeviceGlobal<uptr> __AsanShadowMemoryLocalEnd;
+
+DeviceGlobal<DeviceSanitizerReport> __DeviceSanitizerReportMem;
+
+DeviceGlobal<DeviceType> __DeviceType;
+
 #if defined(__SPIR__)
-// TODO: add real implementation in __asan_load_n.
-DEVICE_EXTERN_C_NOINLINE
-void __asan_load_n(uptr addr, unsigned n) {
-  (void)addr;
-  (void)n;
-  return;
+
+#ifdef __SYCL_DEVICE_ONLY__
+extern SYCL_EXTERNAL int
+__spirv_ocl_printf(const __SYCL_CONSTANT__ char *Format, ...);
+
+extern SYCL_EXTERNAL __SYCL_GLOBAL__ void *
+__spirv_GenericCastToPtrExplicit_ToGlobal(void *, int);
+extern SYCL_EXTERNAL __SYCL_LOCAL__ void *
+__spirv_GenericCastToPtrExplicit_ToLocal(void *, int);
+extern SYCL_EXTERNAL __SYCL_PRIVATE__ void *
+__spirv_GenericCastToPtrExplicit_ToPrivate(void *, int);
+#endif
+
+// These magic values are written to shadow for better error
+// reporting.
+const int kUsmDeviceRedzoneMagic = (char)0x81;
+const int kUsmHostRedzoneMagic = (char)0x82;
+const int kUsmSharedRedzoneMagic = (char)0x83;
+const int kMemBufferRedzoneMagic = (char)0x84;
+
+const int kUsmDeviceDeallocatedMagic = (char)0x91;
+const int kUsmHostDeallocatedMagic = (char)0x92;
+const int kUsmSharedDeallocatedMagic = (char)0x93;
+
+const int kSharedLocalRedzoneMagic = (char)0xa1;
+
+// Same with Asan Stack
+const int kPrivateLeftRedzoneMagic = (char)0xf1;
+const int kPrivateMidRedzoneMagic = (char)0xf2;
+const int kPrivateRightRedzoneMagic = (char)0xf3;
+
+static const __SYCL_CONSTANT__ char __asan_shadow_value_start[] =
+    "%p(%d) -> %p:";
+static const __SYCL_CONSTANT__ char __asan_shadow_value[] = " %02X";
+static const __SYCL_CONSTANT__ char __asan_current_shadow_value[] = ">%02X";
+static const __SYCL_CONSTANT__ char __newline[] = "\n";
+
+static const __SYCL_CONSTANT__ char __global_shadow_out_of_bound[] =
+    "ERROR: Global shadow memory out-of-bound (ptr: %p -> %p, base: %p)\n";
+static const __SYCL_CONSTANT__ char __local_shadow_out_of_bound[] =
+    "ERROR: Local shadow memory out-of-bound (ptr: %p -> %p, wg: %d, base: "
+    "%p)\n";
+
+static const __SYCL_CONSTANT__ char __unsupport_device_type[] =
+    "ERROR: Unsupport device type: %d\n";
+
+#define ASAN_REPORT_NONE 0
+#define ASAN_REPORT_START 1
+#define ASAN_REPORT_FINISH 2
+
+#define AS_PRIVATE 0
+#define AS_GLOBAL 1
+#define AS_CONSTANT 2
+#define AS_LOCAL 3
+#define AS_GENERIC 4
+
+namespace {
+
+__SYCL_GLOBAL__ void *ToGlobal(void *ptr) {
+  return __spirv_GenericCastToPtrExplicit_ToGlobal(ptr, 5);
+}
+__SYCL_LOCAL__ void *ToLocal(void *ptr) {
+  return __spirv_GenericCastToPtrExplicit_ToLocal(ptr, 4);
+}
+__SYCL_PRIVATE__ void *ToPrivate(void *ptr) {
+  return __spirv_GenericCastToPtrExplicit_ToPrivate(ptr, 7);
+}
+
+inline uptr MemToShadow_CPU(uptr addr, int32_t as) {
+  return __AsanShadowMemoryGlobalStart + (addr >> 3);
+}
+
+inline uptr MemToShadow_DG2(uptr addr, int32_t as) {
+  uptr shadow_ptr = 0;
+  if (addr & (~0xffffffffffff)) {
+    shadow_ptr =
+        (((addr & 0xffffffffffff) >> 3) + __AsanShadowMemoryGlobalStart) |
+        (~0xffffffffffff);
+  } else {
+    shadow_ptr = (addr >> 3) + __AsanShadowMemoryGlobalStart;
+  }
+
+  if (shadow_ptr > __AsanShadowMemoryGlobalEnd) {
+    __spirv_ocl_printf(__global_shadow_out_of_bound, addr, shadow_ptr);
+  }
+
+  return shadow_ptr;
+}
+
+inline uptr MemToShadow_PVC(uptr addr, int32_t as) {
+  uptr shadow_ptr = 0;
+
+  if (as == AS_GENERIC) {
+    if ((shadow_ptr = (uptr)ToGlobal((void *)addr))) {
+      as = AS_GLOBAL;
+    } else if ((shadow_ptr = (uptr)ToPrivate((void *)addr))) {
+      as = AS_PRIVATE;
+    } else if ((shadow_ptr = (uptr)ToLocal((void *)addr))) {
+      as = AS_LOCAL;
+    } else {
+      return 0;
+    }
+  }
+
+  if (as == AS_PRIVATE) {            // private
+  } else if (as == AS_GLOBAL) {      // global
+    if (addr & 0xFF00000000000000) { // Device USM
+      shadow_ptr = __AsanShadowMemoryGlobalStart + 0x200000000000 +
+                   ((addr & 0xFFFFFFFFFFFF) >> 3);
+    } else { // Only consider 47bit VA
+      shadow_ptr =
+          __AsanShadowMemoryGlobalStart + ((addr & 0x7FFFFFFFFFFF) >> 3);
+    }
+
+    if (shadow_ptr > __AsanShadowMemoryGlobalEnd) {
+      __spirv_ocl_printf(__global_shadow_out_of_bound, addr, shadow_ptr,
+                         (uptr)__AsanShadowMemoryGlobalStart);
+      shadow_ptr = 0;
+    }
+  } else if (as == AS_CONSTANT) { // constant
+  } else if (as == AS_LOCAL) {    // local
+    // The size of SLM is 128KB on PVC
+    constexpr unsigned slm_size = 128 * 1024;
+    const auto wg_lid =
+        __spirv_BuiltInWorkgroupId.x * __spirv_BuiltInNumWorkgroups.y *
+            __spirv_BuiltInNumWorkgroups.z +
+        __spirv_BuiltInWorkgroupId.y * __spirv_BuiltInNumWorkgroups.z +
+        __spirv_BuiltInWorkgroupId.z;
+
+    shadow_ptr = __AsanShadowMemoryLocalStart + ((wg_lid * slm_size) >> 3) +
+                 ((addr & (slm_size - 1)) >> 3);
+
+    if (shadow_ptr > __AsanShadowMemoryLocalEnd) {
+      __spirv_ocl_printf(__local_shadow_out_of_bound, addr, shadow_ptr, wg_lid,
+                         (uptr)__AsanShadowMemoryLocalStart);
+      shadow_ptr = 0;
+    }
+  }
+
+  return shadow_ptr;
+}
+
+inline uptr MemToShadow(uptr addr, int32_t as) {
+  uptr shadow_ptr = 0;
+
+  if (__DeviceType == DeviceType::CPU) {
+    shadow_ptr = MemToShadow_CPU(addr, as);
+  } else if (__DeviceType == DeviceType::GPU_PVC) {
+    shadow_ptr = MemToShadow_PVC(addr, as);
+  } else {
+    __spirv_ocl_printf(__unsupport_device_type, (int)__DeviceType);
+    return shadow_ptr;
+  }
+
+  return shadow_ptr;
+}
+
+inline constexpr uptr RoundUpTo(uptr size, uptr boundary) {
+  return (size + boundary - 1) & ~(boundary - 1);
+}
+
+inline constexpr uptr RoundDownTo(uptr x, uptr boundary) {
+  return x & ~(boundary - 1);
+}
+
+bool MemIsZero(const char *beg, uptr size) {
+  const char *end = beg + size;
+  uptr *aligned_beg = (uptr *)RoundUpTo((uptr)beg, sizeof(uptr));
+  uptr *aligned_end = (uptr *)RoundDownTo((uptr)end, sizeof(uptr));
+  uptr all = 0;
+  // Prologue.
+  for (const char *mem = beg; mem < (char *)aligned_beg && mem < end; mem++)
+    all |= *mem;
+  // Aligned loop.
+  for (; aligned_beg < aligned_end; aligned_beg++)
+    all |= *aligned_beg;
+  // Epilogue.
+  if ((char *)aligned_end >= beg) {
+    for (const char *mem = (char *)aligned_end; mem < end; mem++)
+      all |= *mem;
+  }
+  return all == 0;
 }
 
-DEVICE_EXTERN_C_NOINLINE
-void __asan_load4(uptr addr) { __asan_load_n(addr, 4); }
+void print_shadow_memory(uptr addr, int32_t as) {
+  uptr shadow_address = MemToShadow(addr, as);
+  uptr p = shadow_address & (~0xf);
+  __spirv_ocl_printf(__asan_shadow_value_start, addr, as, p);
+  for (int i = 0; i < 0xf; ++i) {
+    u8 shadow_value = *(u8 *)(p + i);
+    if (p + i == shadow_address) {
+      __spirv_ocl_printf(__asan_current_shadow_value, shadow_value);
+    } else {
+      __spirv_ocl_printf(__asan_shadow_value, shadow_value);
+    }
+  }
+  __spirv_ocl_printf(__newline);
+}
+
+} // namespace
+
+bool __asan_region_is_value(uptr addr, int32_t as, std::size_t size,
+                            char value) {
+  if (size == 0)
+    return true;
+  while (size--) {
+    char *shadow = (char *)MemToShadow(addr, as);
+    if (*shadow != value) {
+      return false;
+    }
+    ++addr;
+  }
+  return true;
+}
+
+static void __asan_internal_report_save(
+    uptr ptr, int32_t as, const char __SYCL_CONSTANT__ *file, int32_t line,
+    const char __SYCL_CONSTANT__ *func, bool is_write, uint32_t access_size,
+    DeviceSanitizerMemoryType memory_type, DeviceSanitizerErrorType error_type,
+    bool is_recover = false) {
+
+  const int Expected = ASAN_REPORT_NONE;
+  int Desired = ASAN_REPORT_START;
+  if (atomicCompareAndSet(&__DeviceSanitizerReportMem.get().Flag, Desired,
+                          Expected) == Expected) {
+
+    int FileLength = 0;
+    int FuncLength = 0;
+
+    if (file)
+      for (auto *C = file; *C != '\0'; ++C, ++FileLength)
+        ;
+    if (func)
+      for (auto *C = func; *C != '\0'; ++C, ++FuncLength)
+        ;
+
+    int MaxFileIdx = sizeof(__DeviceSanitizerReportMem.get().File) - 1;
+    int MaxFuncIdx = sizeof(__DeviceSanitizerReportMem.get().Func) - 1;
+
+    if (FileLength < MaxFileIdx)
+      MaxFileIdx = FileLength;
+    if (FuncLength < MaxFuncIdx)
+      MaxFuncIdx = FuncLength;
+
+    for (int Idx = 0; Idx < MaxFileIdx; ++Idx)
+      __DeviceSanitizerReportMem.get().File[Idx] = file[Idx];
+    __DeviceSanitizerReportMem.get().File[MaxFileIdx] = '\0';
+
+    for (int Idx = 0; Idx < MaxFuncIdx; ++Idx)
+      __DeviceSanitizerReportMem.get().Func[Idx] = func[Idx];
+    __DeviceSanitizerReportMem.get().Func[MaxFuncIdx] = '\0';
+
+    __DeviceSanitizerReportMem.get().Line = line;
+    __DeviceSanitizerReportMem.get().GID0 = __spirv_GlobalInvocationId_x();
+    __DeviceSanitizerReportMem.get().GID1 = __spirv_GlobalInvocationId_y();
+    __DeviceSanitizerReportMem.get().GID2 = __spirv_GlobalInvocationId_z();
+    __DeviceSanitizerReportMem.get().LID0 = __spirv_LocalInvocationId_x();
+    __DeviceSanitizerReportMem.get().LID1 = __spirv_LocalInvocationId_y();
+    __DeviceSanitizerReportMem.get().LID2 = __spirv_LocalInvocationId_z();
+
+    __DeviceSanitizerReportMem.get().IsWrite = is_write;
+    __DeviceSanitizerReportMem.get().AccessSize = access_size;
+    __DeviceSanitizerReportMem.get().ErrorType = error_type;
+    __DeviceSanitizerReportMem.get().MemoryType = memory_type;
+    __DeviceSanitizerReportMem.get().IsRecover = is_recover;
+
+    // Show we've done copying
+    atomicStore(&__DeviceSanitizerReportMem.get().Flag, ASAN_REPORT_FINISH);
+  }
+}
+
+///
+/// ASAN Error Reporters
+///
+
+void __asan_report_access_error(uptr addr, int32_t as, size_t size,
+                                bool is_write, uptr poisoned_addr,
+                                const char __SYCL_CONSTANT__ *file,
+                                int32_t line,
+                                const char __SYCL_CONSTANT__ *func,
+                                bool is_recover = false) {
+  // Check Error Type
+  s8 *shadow_address = (s8 *)MemToShadow(poisoned_addr, as);
+  int shadow_value = *shadow_address;
+  if (shadow_value > 0) {
+    shadow_value = *(shadow_address + 1);
+  }
+  // FIXME: check if shadow_address out-of-bound
+
+  DeviceSanitizerMemoryType memory_type;
+  DeviceSanitizerErrorType error_type;
+
+  switch (shadow_value) {
+  case kUsmDeviceRedzoneMagic:
+    memory_type = DeviceSanitizerMemoryType::USM_DEVICE;
+    error_type = DeviceSanitizerErrorType::OUT_OF_BOUND;
+    break;
+  case kUsmHostRedzoneMagic:
+    memory_type = DeviceSanitizerMemoryType::USM_HOST;
+    error_type = DeviceSanitizerErrorType::OUT_OF_BOUND;
+    break;
+  case kUsmSharedRedzoneMagic:
+    memory_type = DeviceSanitizerMemoryType::USM_SHARED;
+    error_type = DeviceSanitizerErrorType::OUT_OF_BOUND;
+    break;
+  case kUsmDeviceDeallocatedMagic:
+    memory_type = DeviceSanitizerMemoryType::USM_DEVICE;
+    error_type = DeviceSanitizerErrorType::USE_AFTER_FREE;
+    break;
+  case kUsmHostDeallocatedMagic:
+    memory_type = DeviceSanitizerMemoryType::USM_HOST;
+    error_type = DeviceSanitizerErrorType::USE_AFTER_FREE;
+    break;
+  case kUsmSharedDeallocatedMagic:
+    memory_type = DeviceSanitizerMemoryType::USM_SHARED;
+    error_type = DeviceSanitizerErrorType::USE_AFTER_FREE;
+    break;
+  case kPrivateLeftRedzoneMagic:
+  case kPrivateMidRedzoneMagic:
+  case kPrivateRightRedzoneMagic:
+    memory_type = DeviceSanitizerMemoryType::PRIVATE;
+    error_type = DeviceSanitizerErrorType::OUT_OF_BOUND;
+    break;
+  case kMemBufferRedzoneMagic:
+    memory_type = DeviceSanitizerMemoryType::MEM_BUFFER;
+    error_type = DeviceSanitizerErrorType::OUT_OF_BOUND;
+    break;
+  case kSharedLocalRedzoneMagic:
+    memory_type = DeviceSanitizerMemoryType::LOCAL;
+    error_type = DeviceSanitizerErrorType::OUT_OF_BOUND;
+    break;
+  default:
+    memory_type = DeviceSanitizerMemoryType::UNKNOWN;
+    error_type = DeviceSanitizerErrorType::UNKNOWN;
+  }
+
+  __asan_internal_report_save(addr, as, file, line, func, is_write, size,
+                              memory_type, error_type, is_recover);
+}
+
+///
+/// Check if memory is poisoned
+///
+
+// NOTE: size < 8
+inline int __asan_address_is_poisoned(uptr a, int32_t as, size_t size) {
+  auto *shadow_address = (s8 *)MemToShadow(a, as);
+  if (shadow_address) {
+    auto shadow_value = *shadow_address;
+    if (shadow_value) {
+      s8 last_accessed_byte = (a & (ASAN_SHADOW_GRANULARITY - 1)) + size - 1;
+      return (last_accessed_byte >= shadow_value);
+    }
+  }
+  return false;
+}
+
+// NOTE: size = 1
+inline int __asan_address_is_poisoned(uptr a, int32_t as) {
+  return __asan_address_is_poisoned(a, as, 1);
+}
+
+inline uptr __asan_region_is_poisoned(uptr beg, int32_t as, size_t size) {
+  if (!size)
+    return 0;
+
+  uptr end = beg + size;
+  uptr aligned_b = RoundUpTo(beg, ASAN_SHADOW_GRANULARITY);
+  uptr aligned_e = RoundDownTo(end, ASAN_SHADOW_GRANULARITY);
+
+  uptr shadow_beg = MemToShadow(aligned_b, as);
+  if (!shadow_beg) {
+    return 0;
+  }
+  uptr shadow_end = MemToShadow(aligned_e, as);
+  if (!shadow_end) {
+    return 0;
+  }
+
+  // First check the first and the last application bytes,
+  // then check the ASAN_SHADOW_GRANULARITY-aligned region by calling
+  // MemIsZero on the corresponding shadow.
+  if (!__asan_address_is_poisoned(beg, as) &&
+      !__asan_address_is_poisoned(end - 1, as) &&
+      (shadow_end <= shadow_beg ||
+       MemIsZero((const char *)shadow_beg, shadow_end - shadow_beg)))
+    return 0;
+
+  // The fast check failed, so we have a poisoned byte somewhere.
+  // Find it slowly.
+  for (; beg < end; beg++)
+    if (__asan_address_is_poisoned(beg, as))
+      return beg;
+
+  return 0;
+}
+
+///
+/// ASAN Load/Store Report Built-ins
+///
+
+#define ASAN_REPORT_ERROR(type, is_write, size)                                \
+  DEVICE_EXTERN_C_NOINLINE void __asan_##type##size(                           \
+      uptr addr, int32_t as, const char __SYCL_CONSTANT__ *file, int32_t line, \
+      const char __SYCL_CONSTANT__ *func) {                                    \
+    if (__asan_address_is_poisoned(addr, as, size)) {                          \
+      __asan_report_access_error(addr, as, size, is_write, addr, file, line,   \
+                                 func);                                        \
+    }                                                                          \
+  }                                                                            \
+  DEVICE_EXTERN_C_NOINLINE void __asan_##type##size##_noabort(                 \
+      uptr addr, int32_t as, const char __SYCL_CONSTANT__ *file, int32_t line, \
+      const char __SYCL_CONSTANT__ *func) {                                    \
+    if (__asan_address_is_poisoned(addr, as, size)) {                          \
+      __asan_report_access_error(addr, as, size, is_write, addr, file, line,   \
+                                 func, true);                                  \
+    }                                                                          \
+  }
+
+ASAN_REPORT_ERROR(load, false, 1)
+ASAN_REPORT_ERROR(load, false, 2)
+ASAN_REPORT_ERROR(load, false, 4)
+ASAN_REPORT_ERROR(store, true, 1)
+ASAN_REPORT_ERROR(store, true, 2)
+ASAN_REPORT_ERROR(store, true, 4)
+
+#define ASAN_REPORT_ERROR_BYTE(type, is_write, size)                           \
+  DEVICE_EXTERN_C_NOINLINE void __asan_##type##size(                           \
+      uptr addr, int32_t as, const char __SYCL_CONSTANT__ *file, int32_t line, \
+      const char __SYCL_CONSTANT__ *func) {                                    \
+    u##size *shadow_address = (u##size *)MemToShadow(addr, as);                \
+    if (shadow_address && *shadow_address) {                                   \
+      __asan_report_access_error(addr, as, size, is_write, addr, file, line,   \
+                                 func);                                        \
+    }                                                                          \
+  }                                                                            \
+  DEVICE_EXTERN_C_NOINLINE void __asan_##type##size##_noabort(                 \
+      uptr addr, int32_t as, const char __SYCL_CONSTANT__ *file, int32_t line, \
+      const char __SYCL_CONSTANT__ *func) {                                    \
+    u##size *shadow_address = (u##size *)MemToShadow(addr, as);                \
+    if (shadow_address && *shadow_address) {                                   \
+      __asan_report_access_error(addr, as, size, is_write, addr, file, line,   \
+                                 func, true);                                  \
+    }                                                                          \
+  }
+
+ASAN_REPORT_ERROR_BYTE(load, false, 8)
+ASAN_REPORT_ERROR_BYTE(load, false, 16)
+ASAN_REPORT_ERROR_BYTE(store, true, 8)
+ASAN_REPORT_ERROR_BYTE(store, true, 16)
+
+#define ASAN_REPORT_ERROR_N(type, is_write)                                    \
+  DEVICE_EXTERN_C_NOINLINE void __asan_##type##N(                              \
+      uptr addr, size_t size, int32_t as, const char __SYCL_CONSTANT__ *file,  \
+      int32_t line, const char __SYCL_CONSTANT__ *func) {                      \
+    if (auto poisoned_addr = __asan_region_is_poisoned(addr, as, size)) {      \
+      __asan_report_access_error(addr, as, size, is_write, poisoned_addr,      \
+                                 file, line, func);                            \
+    }                                                                          \
+  }                                                                            \
+  DEVICE_EXTERN_C_NOINLINE void __asan_##type##N_noabort(                      \
+      uptr addr, size_t size, int32_t as, const char __SYCL_CONSTANT__ *file,  \
+      int32_t line, const char __SYCL_CONSTANT__ *func) {                      \
+    if (auto poisoned_addr = __asan_region_is_poisoned(addr, as, size)) {      \
+      __asan_report_access_error(addr, as, size, is_write, poisoned_addr,      \
+                                 file, line, func, true);                      \
+    }                                                                          \
+  }
+
+ASAN_REPORT_ERROR_N(load, false)
+ASAN_REPORT_ERROR_N(store, true)
+
+DEVICE_EXTERN_C_NOINLINE void
+__asan_set_shadow_local_memory(uptr ptr, size_t size,
+                               size_t size_with_redzone) {
+  uptr aligned_size = RoundUpTo(size, ASAN_SHADOW_GRANULARITY);
+
+  {
+    auto shadow_address = MemToShadow(ptr + aligned_size, AS_LOCAL);
+    auto count = (size_with_redzone - aligned_size) / ASAN_SHADOW_GRANULARITY;
+    for (size_t i = 0; i < count; ++i) {
+      ((u8 *)shadow_address)[i] = kSharedLocalRedzoneMagic;
+    }
+  }
+
+  if (size != aligned_size) {
+    auto user_end = ptr + size - 1;
+    auto *shadow_end = (s8 *)MemToShadow(user_end, AS_LOCAL);
+    *shadow_end = user_end - RoundDownTo(user_end, ASAN_SHADOW_GRANULARITY);
+  }
+}
 
-DEVICE_EXTERN_C_NOINLINE
-void __asan_load8(uptr addr) { __asan_load_n(addr, 8); }
-#endif // __SPIR__
+#endif
diff --git a/libdevice/spirv_vars.h b/libdevice/spirv_vars.h
index 640f0dbf3cd9e..0387c322965ce 100644
--- a/libdevice/spirv_vars.h
+++ b/libdevice/spirv_vars.h
@@ -21,6 +21,7 @@ typedef size_t size_t_vec __attribute__((ext_vector_type(3)));
 __SPIRV_VAR_QUALIFIERS size_t_vec __spirv_BuiltInGlobalInvocationId;
 __SPIRV_VAR_QUALIFIERS size_t __spirv_BuiltInGlobalLinearId;
 __SPIRV_VAR_QUALIFIERS size_t_vec __spirv_BuiltInLocalInvocationId;
+__SPIRV_VAR_QUALIFIERS size_t_vec __spirv_BuiltInNumWorkgroups;
 __SPIRV_VAR_QUALIFIERS size_t_vec __spirv_BuiltInWorkgroupId;
 __SPIRV_VAR_QUALIFIERS size_t_vec __spirv_BuiltInWorkgroupSize;
 
diff --git a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
index 8a2864a078731..22080d30e6c57 100644
--- a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
@@ -727,6 +727,13 @@ struct AddressSanitizer {
   bool maybeInsertAsanInitAtFunctionEntry(Function &F);
   bool maybeInsertDynamicShadowAtFunctionEntry(Function &F);
   void markEscapedLocalAllocas(Function &F);
+  void instrumentSyclAllocateLocalMemory(CallInst *CI);
+
+  GlobalVariable *GetOrCreateGlobalString(Module &M, StringRef Name,
+                                          StringRef Value,
+                                          unsigned AddressSpace);
+  void AppendDebugInfoToArgs(Instruction *InsertBefore, Value *Addr,
+                             SmallVectorImpl<Value *> &Args);
 
 private:
   friend struct FunctionStackPoisoner;
@@ -768,7 +775,10 @@ struct AddressSanitizer {
   ShadowMapping Mapping;
   FunctionCallee AsanHandleNoReturnFunc;
   FunctionCallee AsanPtrCmpFunction, AsanPtrSubFunction;
+  FunctionCallee AsanSetShadowDeviceLocalFunc;
   Constant *AsanShadowGlobal;
+  Constant *AsanShadowDevicePrivate;
+  StringMap<GlobalVariable *> GlobalStringMap;
 
   // These arrays is indexed by AccessIsWrite, Experiment and log2(AccessSize).
   FunctionCallee AsanErrorCallback[2][2][kNumberOfAccessSizes];
@@ -1234,7 +1244,78 @@ static bool isUnsupportedAMDGPUAddrspace(Value *Addr) {
   return false;
 }
 
+static bool isUnsupportedSPIRAccess(Value *Addr, Function *Func) {
+  Type *PtrTy = cast<PointerType>(Addr->getType()->getScalarType());
+  // Private address space: skip kernel arguments
+  if (PtrTy->getPointerAddressSpace() == 0) {
+    return Func->getCallingConv() == CallingConv::SPIR_KERNEL &&
+           isa<Argument>(Addr);
+  }
+
+  // All the rest address spaces: skip SPIR-V built-in varibles
+  auto *OrigValue = Addr->stripPointerCasts();
+  return OrigValue->getName().starts_with("__spirv_BuiltIn");
+}
+
+GlobalVariable *AddressSanitizer::GetOrCreateGlobalString(
+    Module &M, StringRef Name, StringRef Value, unsigned AddressSpace) {
+  GlobalVariable *StringGV = nullptr;
+  if (GlobalStringMap.find(Value.str()) != GlobalStringMap.end())
+    return GlobalStringMap.at(Value.str());
+
+  auto *Ty = ArrayType::get(Type::getInt8Ty(M.getContext()), Value.size() + 1);
+  StringGV = new GlobalVariable(
+      M, Ty, true, GlobalValue::InternalLinkage,
+      ConstantDataArray::getString(M.getContext(), Value), Name, nullptr,
+      GlobalValue::NotThreadLocal, AddressSpace);
+  GlobalStringMap[Value.str()] = StringGV;
+
+  return StringGV;
+}
+
+void AddressSanitizer::AppendDebugInfoToArgs(Instruction *InsertBefore,
+                                             Value *Addr,
+                                             SmallVectorImpl<Value *> &Args) {
+  auto *M = InsertBefore->getModule();
+  auto &C = InsertBefore->getContext();
+  auto &Loc = InsertBefore->getDebugLoc();
+
+  // SPIR constant address space
+  constexpr unsigned ConstantAS = 2;
+  PointerType *ConstASPtrTy = Type::getInt8Ty(C)->getPointerTo(ConstantAS);
+
+  // Address Space
+  Type *PtrTy = cast<PointerType>(Addr->getType()->getScalarType());
+  Args.push_back(
+      ConstantInt::get(Type::getInt32Ty(C), PtrTy->getPointerAddressSpace()));
+
+  // File & Line
+  if (Loc) {
+    StringRef FileName = Loc->getFilename();
+    auto *FileNameGV =
+        GetOrCreateGlobalString(*M, "__asan_file", FileName, ConstantAS);
+    Args.push_back(ConstantExpr::getPointerCast(FileNameGV, ConstASPtrTy));
+    Args.push_back(ConstantInt::get(Type::getInt32Ty(C), Loc.getLine()));
+  } else {
+    Args.push_back(ConstantPointerNull::get(ConstASPtrTy));
+    Args.push_back(ConstantInt::get(Type::getInt32Ty(C), 0));
+  }
+
+  // Function
+  auto FuncName = InsertBefore->getFunction()->getName();
+  auto *FuncNameGV = GetOrCreateGlobalString(*M, "__asan_func",
+                                             demangle(FuncName), ConstantAS);
+  Args.push_back(ConstantExpr::getPointerCast(FuncNameGV, ConstASPtrTy));
+}
+
 Value *AddressSanitizer::memToShadow(Value *Shadow, IRBuilder<> &IRB) {
+  if (TargetTriple.isSPIR()) {
+    // ((Shadow & 0xffffffff) >> 3) + __AsanShadowMemoryPrivateStart;
+    Shadow = IRB.CreateAnd(Shadow, ConstantInt::get(IntptrTy, 0xffffffff));
+    Shadow = IRB.CreateLShr(Shadow, Mapping.Scale);
+    Value *ShadowBase = IRB.CreateLoad(IntptrTy, AsanShadowDevicePrivate);
+    return IRB.CreateAdd(Shadow, ShadowBase);
+  }
   // Shadow >> scale
   Shadow = IRB.CreateLShr(Shadow, Mapping.Scale);
   if (Mapping.Offset == 0) return Shadow;
@@ -1250,6 +1331,53 @@ Value *AddressSanitizer::memToShadow(Value *Shadow, IRBuilder<> &IRB) {
     return IRB.CreateAdd(Shadow, ShadowBase);
 }
 
+static uint64_t getSizeAndRedzoneSizeForLocal(uint64_t Size,
+                                              uint64_t Granularity,
+                                              uint64_t Alignment) {
+  uint64_t Res = 0;
+  if (Size <= 4)
+    Res = 16;
+  else if (Size <= 16)
+    Res = 32;
+  else if (Size <= 128)
+    Res = Size + 32;
+  else if (Size <= 512)
+    Res = Size + 64;
+  else if (Size <= 4096)
+    Res = Size + 128;
+  else
+    Res = Size + 256;
+  return alignTo(std::max(Res, 2 * Granularity), Alignment);
+}
+
+// Instument __sycl_allocateLocalMemory
+void AddressSanitizer::instrumentSyclAllocateLocalMemory(CallInst *CI) {
+  InstrumentationIRBuilder IRB(CI->getNextNode());
+  auto *Size = cast<ConstantInt>(CI->getArgOperand(0));
+  auto *Alignment = cast<ConstantInt>(CI->getArgOperand(1));
+
+  const auto Granularity = 1 << Mapping.Scale;
+  // The base address of local memory needs to align to granularity
+  const auto Align = alignTo(Alignment->getZExtValue(), Granularity);
+
+  auto *SizeWithRedZone = ConstantInt::get(
+      IntptrTy, getSizeAndRedzoneSizeForLocal(Size->getZExtValue(), Granularity,
+                                              Alignment->getZExtValue()));
+
+  auto *NewCI =
+      IRB.CreateCall(CI->getCalledFunction(),
+                     {SizeWithRedZone, ConstantInt::get(IntptrTy, Align)});
+
+  /// __asan_set_shadow_local_memory(uptr beg, size_t size, size_t
+  /// size_with_redzone)
+  IRB.CreateCall(
+      AsanSetShadowDeviceLocalFunc,
+      {IRB.CreatePointerCast(NewCI, IntptrTy), Size, SizeWithRedZone});
+
+  CI->replaceAllUsesWith(NewCI);
+  CI->eraseFromParent();
+}
+
 // Instrument memset/memmove/memcpy
 void AddressSanitizer::instrumentMemIntrinsic(MemIntrinsic *MI) {
   InstrumentationIRBuilder IRB(MI);
@@ -1295,11 +1423,18 @@ bool AddressSanitizer::isInterestingAlloca(const AllocaInst &AI) {
 }
 
 bool AddressSanitizer::ignoreAccess(Instruction *Inst, Value *Ptr) {
-  // Instrument accesses from different address spaces only for AMDGPU.
-  Type *PtrTy = cast<PointerType>(Ptr->getType()->getScalarType());
-  if (PtrTy->getPointerAddressSpace() != 0 &&
-      !(TargetTriple.isAMDGPU() && !isUnsupportedAMDGPUAddrspace(Ptr)))
-    return true;
+  // SPIR has its own rules to filter the instrument accesses
+  if (TargetTriple.isSPIR()) {
+    if (isUnsupportedSPIRAccess(Ptr, Inst->getFunction()))
+      return true;
+  } else {
+    // Instrument accesses from different address spaces only for AMDGPU.
+    Type *PtrTy = cast<PointerType>(Ptr->getType()->getScalarType());
+    if (PtrTy->getPointerAddressSpace() != 0 &&
+        !(TargetTriple.isAMDGPU() && !isUnsupportedAMDGPUAddrspace(Ptr))) {
+      return true;
+    }
+  }
 
   // Ignore swifterror addresses.
   // swifterror memory addresses are mem2reg promoted by instruction
@@ -1778,12 +1913,21 @@ void AddressSanitizer::instrumentAddress(Instruction *OrigIns,
 
   Value *AddrLong = IRB.CreatePointerCast(Addr, IntptrTy);
   if (UseCalls) {
-    if (Exp == 0)
-      IRB.CreateCall(AsanMemoryAccessCallback[IsWrite][0][AccessSizeIndex],
-                     AddrLong);
-    else
+    if (Exp == 0) {
+      if (TargetTriple.isSPIR()) {
+        SmallVector<Value *, 5> Args;
+        Args.push_back(AddrLong);
+        AppendDebugInfoToArgs(InsertBefore, Addr, Args);
+        IRB.CreateCall(AsanMemoryAccessCallback[IsWrite][0][AccessSizeIndex],
+                       Args);
+      } else {
+        IRB.CreateCall(AsanMemoryAccessCallback[IsWrite][0][AccessSizeIndex],
+                       AddrLong);
+      }
+    } else {
       IRB.CreateCall(AsanMemoryAccessCallback[IsWrite][1][AccessSizeIndex],
                      {AddrLong, ConstantInt::get(IRB.getInt32Ty(), Exp)});
+    }
     return;
   }
 
@@ -1849,10 +1993,18 @@ void AddressSanitizer::instrumentUnusualSizeOrAlignment(
 
   Value *AddrLong = IRB.CreatePointerCast(Addr, IntptrTy);
   if (UseCalls) {
-    if (Exp == 0)
-      IRB.CreateCall(AsanMemoryAccessCallbackSized[IsWrite][0],
-                     {AddrLong, Size});
-    else
+    if (Exp == 0) {
+      if (TargetTriple.isSPIR()) {
+        SmallVector<Value *, 6> Args;
+        Args.push_back(AddrLong);
+        Args.push_back(Size);
+        AppendDebugInfoToArgs(InsertBefore, Addr, Args);
+        IRB.CreateCall(AsanMemoryAccessCallbackSized[IsWrite][0], Args);
+      } else {
+        IRB.CreateCall(AsanMemoryAccessCallbackSized[IsWrite][0],
+                       {AddrLong, Size});
+      }
+    } else
       IRB.CreateCall(AsanMemoryAccessCallbackSized[IsWrite][1],
                      {AddrLong, Size, ConstantInt::get(IRB.getInt32Ty(), Exp)});
   } else {
@@ -2667,6 +2819,12 @@ bool ModuleAddressSanitizer::instrumentModule(Module &M) {
     }
   }
 
+  // SPIR kernel needn't AsanCtorFunction & AsanDtorFunction
+  if (TargetTriple.isSPIR()) {
+    AsanCtorFunction = nullptr;
+    AsanDtorFunction = nullptr;
+  }
+
   const uint64_t Priority = GetCtorAndDtorPriority(TargetTriple);
 
   // Put the constructor and destructor in comdat if both
@@ -2714,6 +2872,23 @@ void AddressSanitizer::initializeCallbacks(Module &M, const TargetLibraryInfo *T
           AL1 = AL1.addParamAttribute(*C, 1, AK);
         }
       }
+
+      // Extend __asan_load/store arguments: unsigned int address_space, char*
+      // file, unsigned int line, char* func
+      if (TargetTriple.isSPIR()) {
+        constexpr unsigned ConstantAS = 2;
+        auto *Int8PtrTy = Type::getInt8Ty(*C)->getPointerTo(ConstantAS);
+
+        Args1.push_back(Type::getInt32Ty(*C)); // address_space
+        Args1.push_back(Int8PtrTy);            // file
+        Args1.push_back(Type::getInt32Ty(*C)); // line
+        Args1.push_back(Int8PtrTy);            // func
+
+        Args2.push_back(Type::getInt32Ty(*C)); // address_space
+        Args2.push_back(Int8PtrTy);            // file
+        Args2.push_back(Type::getInt32Ty(*C)); // line
+        Args2.push_back(Int8PtrTy);            // func
+      }
       AsanErrorCallbackSized[AccessIsWrite][Exp] = M.getOrInsertFunction(
           kAsanReportErrorTemplate + ExpStr + TypeStr + "_n" + EndingStr,
           FunctionType::get(IRB.getVoidTy(), Args2, false), AL2);
@@ -2761,6 +2936,20 @@ void AddressSanitizer::initializeCallbacks(Module &M, const TargetLibraryInfo *T
     AsanShadowGlobal = M.getOrInsertGlobal("__asan_shadow",
                                            ArrayType::get(IRB.getInt8Ty(), 0));
 
+  if (TargetTriple.isSPIR()) {
+    AsanShadowDevicePrivate =
+        M.getOrInsertGlobal("__AsanShadowMemoryPrivateStart", IntptrTy, [&] {
+          return new GlobalVariable(M, IntptrTy, true,
+                                    GlobalVariable::ExternalLinkage, nullptr,
+                                    "__AsanShadowMemoryPrivateStart", nullptr,
+                                    GlobalVariable::NotThreadLocal, 1);
+        });
+
+    AsanSetShadowDeviceLocalFunc =
+        M.getOrInsertFunction("__asan_set_shadow_local_memory", IRB.getVoidTy(),
+                              IntptrTy, IntptrTy, IntptrTy);
+  }
+
   AMDGPUAddressShared =
       M.getOrInsertFunction(kAMDGPUAddressSharedName, IRB.getInt1Ty(), PtrTy);
   AMDGPUAddressPrivate =
@@ -2856,6 +3045,8 @@ bool AddressSanitizer::instrumentFunction(Function &F,
   if (F.getLinkage() == GlobalValue::AvailableExternallyLinkage) return false;
   if (!ClDebugFunc.empty() && ClDebugFunc == F.getName()) return false;
   if (F.getName().starts_with("__asan_")) return false;
+  if (F.getName().contains("__sycl_service_kernel__"))
+    return false;
 
   bool FunctionModified = false;
 
@@ -2891,6 +3082,7 @@ bool AddressSanitizer::instrumentFunction(Function &F,
   SmallVector<Instruction *, 8> NoReturnCalls;
   SmallVector<BasicBlock *, 16> AllBlocks;
   SmallVector<Instruction *, 16> PointerComparisonsOrSubtracts;
+  SmallVector<CallInst *, 8> SyclAllocateLocalMemoryCalls;
 
   // Fill the set of memory operations to instrument.
   for (auto &BB : F) {
@@ -2939,8 +3131,16 @@ bool AddressSanitizer::instrumentFunction(Function &F,
           if (CB->doesNotReturn())
             NoReturnCalls.push_back(CB);
         }
-        if (CallInst *CI = dyn_cast<CallInst>(&Inst))
-          maybeMarkSanitizerLibraryCallNoBuiltin(CI, TLI);
+        if (CallInst *CI = dyn_cast<CallInst>(&Inst)) {
+          if (TargetTriple.isSPIR() && CI->getCalledFunction() &&
+              CI->getCalledFunction()->getCallingConv() ==
+                  llvm::CallingConv::SPIR_FUNC &&
+              CI->getCalledFunction()->getName() ==
+                  "__sycl_allocateLocalMemory")
+            SyclAllocateLocalMemoryCalls.push_back(CI);
+          else
+            maybeMarkSanitizerLibraryCallNoBuiltin(CI, TLI);
+        }
       }
       if (NumInsnsPerBB >= ClMaxInsnsToInstrumentPerBB) break;
     }
@@ -2962,10 +3162,17 @@ bool AddressSanitizer::instrumentFunction(Function &F,
                     F.getParent()->getDataLayout());
     FunctionModified = true;
   }
-  for (auto *Inst : IntrinToInstrument) {
-    if (!suppressInstrumentationSiteForDebug(NumInstrumented))
-      instrumentMemIntrinsic(Inst);
-    FunctionModified = true;
+  if (TargetTriple.isSPIR()) {
+    for (auto *CI : SyclAllocateLocalMemoryCalls) {
+      instrumentSyclAllocateLocalMemory(CI);
+      FunctionModified = true;
+    }
+  } else {
+    for (auto *Inst : IntrinToInstrument) {
+      if (!suppressInstrumentationSiteForDebug(NumInstrumented))
+        instrumentMemIntrinsic(Inst);
+      FunctionModified = true;
+    }
   }
 
   FunctionStackPoisoner FSP(F, *this);
diff --git a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
index 3721564890ddb..956a2e940e96b 100644
--- a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -294,6 +294,10 @@ bool LoopIdiomRecognize::runOnLoop(Loop *L) {
   if (Name == "memset" || Name == "memcpy")
     return false;
 
+  // Prevent from asan interception in kernel
+  if (Name == "__asan_set_shadow_local_memory")
+    return false;
+
   // Determine if code size heuristics need to be applied.
   ApplyCodeSizeHeuristics =
       L->getHeader()->getParent()->hasOptSize() && UseLIRCodeSizeHeurs;
diff --git a/llvm/test/Instrumentation/AddressSanitizer/spir.ll b/llvm/test/Instrumentation/AddressSanitizer/spir.ll
new file mode 100644
index 0000000000000..cee6a67605c5d
--- /dev/null
+++ b/llvm/test/Instrumentation/AddressSanitizer/spir.ll
@@ -0,0 +1,156 @@
+; RUN: opt < %s -passes=asan -asan-instrumentation-with-call-threshold=0 -asan-stack=0 -asan-globals=0 -S | FileCheck %s
+
+; ModuleID = 'spir.cpp'
+source_filename = "spir.cpp"
+target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64"
+target triple = "spir64-unknown-unknown"
+
+%"class.sycl::_V1::range" = type { %"class.sycl::_V1::detail::array" }
+%"class.sycl::_V1::detail::array" = type { [1 x i64] }
+%"class.sycl::_V1::id" = type { %"class.sycl::_V1::detail::array" }
+
+$_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E10FillBuffer = comdat any
+
+@__spirv_BuiltInLocalInvocationId = external dso_local local_unnamed_addr addrspace(1) constant <3 x i64>, align 32
+; CHECK: __AsanShadowMemoryPrivateStart
+
+; Function Attrs: convergent mustprogress norecurse nounwind sanitize_address uwtable
+define weak_odr dso_local spir_kernel void @_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E10FillBuffer(ptr addrspace(1) noundef align 8 %_arg_Sum, ptr addrspace(1) noundef align 8 %_arg_Accessor, ptr noundef byval(%"class.sycl::_V1::range") align 8 %_arg_Accessor1, ptr noundef byval(%"class.sycl::_V1::range") align 8 %_arg_Accessor2, ptr noundef byval(%"class.sycl::_V1::id") align 8 %_arg_Accessor3) local_unnamed_addr #0 comdat !srcloc !65 !kernel_arg_buffer_location !66 !kernel_arg_runtime_aligned !67 !kernel_arg_exclusive_ptr !67 !sycl_fixed_targets !68 {
+entry:
+  %0 = load i64, ptr %_arg_Accessor3, align 8
+  %1 = load i64, ptr addrspace(1) @__spirv_BuiltInLocalInvocationId, align 32, !noalias !68
+  %call.i10 = tail call spir_func ptr addrspace(3) @__sycl_allocateLocalMemory(i64 noundef 16, i64 noundef 4) #5, !noalias !69
+  ; CHECK: __asan_set_shadow_local_memory
+  %cmp.i = icmp eq i64 %1, 0
+  br i1 %cmp.i, label %if.then.i, label %_ZN4sycl3_V13ext6oneapi18group_local_memoryIA4_iNS0_5groupILi1EEEJEEENSt9enable_ifIXaasr3stdE27is_trivially_destructible_vIT_Esr4sycl6detail8is_groupIT0_EE5valueENS0_9multi_ptrIS8_LNS0_6access13address_spaceE3ELNSB_9decoratedE2EEEE4typeES9_DpOT1_.exit
+
+if.then.i:                                        ; preds = %entry
+  call void @llvm.memset.p3.i64(ptr addrspace(3) align 4 %call.i10, i8 0, i64 16, i1 false), !noalias !69
+  br label %_ZN4sycl3_V13ext6oneapi18group_local_memoryIA4_iNS0_5groupILi1EEEJEEENSt9enable_ifIXaasr3stdE27is_trivially_destructible_vIT_Esr4sycl6detail8is_groupIT0_EE5valueENS0_9multi_ptrIS8_LNS0_6access13address_spaceE3ELNSB_9decoratedE2EEEE4typeES9_DpOT1_.exit
+
+_ZN4sycl3_V13ext6oneapi18group_local_memoryIA4_iNS0_5groupILi1EEEJEEENSt9enable_ifIXaasr3stdE27is_trivially_destructible_vIT_Esr4sycl6detail8is_groupIT0_EE5valueENS0_9multi_ptrIS8_LNS0_6access13address_spaceE3ELNSB_9decoratedE2EEEE4typeES9_DpOT1_.exit: ; preds = %entry, %if.then.i
+  %add.ptr.i = getelementptr inbounds i64, ptr addrspace(1) %_arg_Accessor, i64 %0
+  tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 2, i32 noundef 2, i32 noundef 272) #5, !noalias !69
+  %cmp.i14 = icmp ult i64 %1, 2147483648
+  tail call void @llvm.assume(i1 %cmp.i14)
+  %arrayidx.i16 = getelementptr inbounds i64, ptr addrspace(1) %add.ptr.i, i64 %1
+  %2 = load i64, ptr addrspace(1) %arrayidx.i16, align 8, !tbaa !72
+  ; CHECK: __asan_load8
+  %arrayidx.i = getelementptr inbounds [4 x i32], ptr addrspace(3) %call.i10, i64 0, i64 %1
+  %3 = load i32, ptr addrspace(3) %arrayidx.i, align 4, !tbaa !76
+  ; CHECK: __asan_load4
+  %conv.i = sext i32 %3 to i64
+  %add.i = add i64 %2, %conv.i
+  %4 = load i64, ptr addrspace(1) %_arg_Sum, align 8, !tbaa !72
+  ; CHECK: __asan_load8
+  %add5.i = add i64 %4, %add.i
+  store i64 %add5.i, ptr addrspace(1) %_arg_Sum, align 8, !tbaa !72
+  ret void
+}
+
+; Function Attrs: mustprogress nocallback nofree nounwind willreturn memory(argmem: write)
+declare void @llvm.memset.p4.i64(ptr addrspace(4) nocapture writeonly %0, i8 %1, i64 %2, i1 immarg %3) #1
+
+; Function Attrs: convergent nounwind
+declare dso_local spir_func ptr addrspace(3) @__sycl_allocateLocalMemory(i64 noundef %0, i64 noundef %1) local_unnamed_addr #2
+
+; Function Attrs: convergent nounwind
+declare dso_local spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef %0, i32 noundef %1, i32 noundef %2) local_unnamed_addr #2
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write)
+declare void @llvm.assume(i1 noundef %0) #3
+
+; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: write)
+declare void @llvm.memset.p3.i64(ptr addrspace(3) nocapture writeonly %0, i8 %1, i64 %2, i1 immarg %3) #4
+
+attributes #0 = { convergent mustprogress norecurse nounwind sanitize_address uwtable "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "sycl-module-id"="test.cpp" "uniform-work-group-size"="true" }
+attributes #1 = { mustprogress nocallback nofree nounwind willreturn memory(argmem: write) }
+attributes #2 = { convergent nounwind "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
+attributes #3 = { mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) }
+attributes #4 = { nocallback nofree nounwind willreturn memory(argmem: write) }
+attributes #5 = { convergent nounwind }
+
+!llvm.module.flags = !{!0, !1, !2}
+!opencl.spir.version = !{!3}
+!spirv.Source = !{!4}
+!sycl_aspects = !{!5, !6, !7, !8, !9, !10, !11, !12, !13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27, !28, !29, !30, !31, !32, !33, !34, !35, !36, !37, !38, !39, !40, !41, !42, !43, !44, !45, !46, !47, !48, !49, !50, !51, !52, !53, !54, !55, !56, !57, !58, !59, !60, !61, !62, !63}
+!llvm.ident = !{!64}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 7, !"uwtable", i32 2}
+!2 = !{i32 7, !"frame-pointer", i32 2}
+!3 = !{i32 1, i32 2}
+!4 = !{i32 4, i32 100000}
+!5 = !{!"cpu", i32 1}
+!6 = !{!"gpu", i32 2}
+!7 = !{!"accelerator", i32 3}
+!8 = !{!"custom", i32 4}
+!9 = !{!"fp16", i32 5}
+!10 = !{!"fp64", i32 6}
+!11 = !{!"image", i32 9}
+!12 = !{!"online_compiler", i32 10}
+!13 = !{!"online_linker", i32 11}
+!14 = !{!"queue_profiling", i32 12}
+!15 = !{!"usm_device_allocations", i32 13}
+!16 = !{!"usm_host_allocations", i32 14}
+!17 = !{!"usm_shared_allocations", i32 15}
+!18 = !{!"usm_system_allocations", i32 17}
+!19 = !{!"ext_intel_pci_address", i32 18}
+!20 = !{!"ext_intel_gpu_eu_count", i32 19}
+!21 = !{!"ext_intel_gpu_eu_simd_width", i32 20}
+!22 = !{!"ext_intel_gpu_slices", i32 21}
+!23 = !{!"ext_intel_gpu_subslices_per_slice", i32 22}
+!24 = !{!"ext_intel_gpu_eu_count_per_subslice", i32 23}
+!25 = !{!"ext_intel_max_mem_bandwidth", i32 24}
+!26 = !{!"ext_intel_mem_channel", i32 25}
+!27 = !{!"usm_atomic_host_allocations", i32 26}
+!28 = !{!"usm_atomic_shared_allocations", i32 27}
+!29 = !{!"atomic64", i32 28}
+!30 = !{!"ext_intel_device_info_uuid", i32 29}
+!31 = !{!"ext_oneapi_srgb", i32 30}
+!32 = !{!"ext_oneapi_native_assert", i32 31}
+!33 = !{!"host_debuggable", i32 32}
+!34 = !{!"ext_intel_gpu_hw_threads_per_eu", i32 33}
+!35 = !{!"ext_oneapi_cuda_async_barrier", i32 34}
+!36 = !{!"ext_oneapi_bfloat16_math_functions", i32 35}
+!37 = !{!"ext_intel_free_memory", i32 36}
+!38 = !{!"ext_intel_device_id", i32 37}
+!39 = !{!"ext_intel_memory_clock_rate", i32 38}
+!40 = !{!"ext_intel_memory_bus_width", i32 39}
+!41 = !{!"emulated", i32 40}
+!42 = !{!"ext_intel_legacy_image", i32 41}
+!43 = !{!"ext_oneapi_bindless_images", i32 42}
+!44 = !{!"ext_oneapi_bindless_images_shared_usm", i32 43}
+!45 = !{!"ext_oneapi_bindless_images_1d_usm", i32 44}
+!46 = !{!"ext_oneapi_bindless_images_2d_usm", i32 45}
+!47 = !{!"ext_oneapi_interop_memory_import", i32 46}
+!48 = !{!"ext_oneapi_interop_memory_export", i32 47}
+!49 = !{!"ext_oneapi_interop_semaphore_import", i32 48}
+!50 = !{!"ext_oneapi_interop_semaphore_export", i32 49}
+!51 = !{!"ext_oneapi_mipmap", i32 50}
+!52 = !{!"ext_oneapi_mipmap_anisotropy", i32 51}
+!53 = !{!"ext_oneapi_mipmap_level_reference", i32 52}
+!54 = !{!"ext_intel_esimd", i32 53}
+!55 = !{!"ext_oneapi_ballot_group", i32 54}
+!56 = !{!"ext_oneapi_fixed_size_group", i32 55}
+!57 = !{!"ext_oneapi_opportunistic_group", i32 56}
+!58 = !{!"ext_oneapi_tangle_group", i32 57}
+!59 = !{!"int64_base_atomics", i32 7}
+!60 = !{!"int64_extended_atomics", i32 8}
+!61 = !{!"usm_system_allocator", i32 17}
+!62 = !{!"usm_restricted_shared_allocations", i32 16}
+!63 = !{!"host", i32 0}
+!64 = !{!"clang version 18.0.0git (https://github.com/intel/llvm.git caecf6b928648a83c8ceb84988231cb246c4365e)"}
+!65 = !{i32 419}
+!66 = !{i32 -1, i32 -1, i32 -1, i32 -1, i32 -1}
+!67 = !{i1 false, i1 true, i1 false, i1 false, i1 false}
+!68 = !{}
+!69 = !{!70}
+!70 = distinct !{!70, !71, !"_ZN4sycl3_V13ext6oneapi18group_local_memoryIA4_iNS0_5groupILi1EEEJEEENSt9enable_ifIXaasr3stdE27is_trivially_destructible_vIT_Esr4sycl6detail8is_groupIT0_EE5valueENS0_9multi_ptrIS8_LNS0_6access13address_spaceE3ELNSB_9decoratedE2EEEE4typeES9_DpOT1_: %agg.result"}
+!71 = distinct !{!71, !"_ZN4sycl3_V13ext6oneapi18group_local_memoryIA4_iNS0_5groupILi1EEEJEEENSt9enable_ifIXaasr3stdE27is_trivially_destructible_vIT_Esr4sycl6detail8is_groupIT0_EE5valueENS0_9multi_ptrIS8_LNS0_6access13address_spaceE3ELNSB_9decoratedE2EEEE4typeES9_DpOT1_"}
+!72 = !{!73, !73, i64 0}
+!73 = !{!"long", !74, i64 0}
+!74 = !{!"omnipotent char", !75, i64 0}
+!75 = !{!"Simple C++ TBAA"}
+!76 = !{!77, !77, i64 0}
+!77 = !{!"int", !74, i64 0}
diff --git a/llvm/tools/sycl-post-link/sycl-post-link.cpp b/llvm/tools/sycl-post-link/sycl-post-link.cpp
index 50cb225f6cb18..85728be8aabf0 100644
--- a/llvm/tools/sycl-post-link/sycl-post-link.cpp
+++ b/llvm/tools/sycl-post-link/sycl-post-link.cpp
@@ -313,8 +313,9 @@ std::vector<StringRef> getKernelNamesUsingAssert(const Module &M) {
 }
 
 bool isModuleUsingAsan(const Module &M) {
-  auto *AsanInitFunction = M.getFunction("__asan_init");
-  return AsanInitFunction;
+  return llvm::any_of(M.functions(), [](const Function &F) {
+    return F.getName().starts_with("__asan_");
+  });
 }
 
 // Gets reqd_work_group_size information for function Func.
diff --git a/sycl/plugins/unified_runtime/CMakeLists.txt b/sycl/plugins/unified_runtime/CMakeLists.txt
index 2aa0423e1e663..def19a4151124 100644
--- a/sycl/plugins/unified_runtime/CMakeLists.txt
+++ b/sycl/plugins/unified_runtime/CMakeLists.txt
@@ -202,6 +202,13 @@ endif()
 
 if ("opencl" IN_LIST SYCL_ENABLE_PLUGINS)
     add_dependencies(sycl-runtime-libraries ur_adapter_opencl)
+
+    # Install the UR adapters too
+    install(TARGETS ur_adapter_opencl
+      LIBRARY DESTINATION "lib${LLVM_LIBDIR_SUFFIX}" COMPONENT level-zero-sycl-dev
+      ARCHIVE DESTINATION "lib${LLVM_LIBDIR_SUFFIX}" COMPONENT level-zero-sycl-dev
+      RUNTIME DESTINATION "bin" COMPONENT level-zero-sycl-dev
+    )
 endif()
 
 if ("native_cpu" IN_LIST SYCL_ENABLE_PLUGINS)
diff --git a/sycl/test-e2e/AddressSanitizer/out-of-bounds/USM/parallel_for_char.cpp b/sycl/test-e2e/AddressSanitizer/out-of-bounds/USM/parallel_for_char.cpp
new file mode 100644
index 0000000000000..84e28fd6e0883
--- /dev/null
+++ b/sycl/test-e2e/AddressSanitizer/out-of-bounds/USM/parallel_for_char.cpp
@@ -0,0 +1,40 @@
+// REQUIRES: linux, cpu
+// RUN: %{build} %device_sanitizer_flags -DMALLOC_DEVICE -O1 -g -o %t
+// RUN: env SYCL_PREFER_UR=1 ONEAPI_DEVICE_SELECTOR=opencl:cpu %{run-unfiltered-devices} not %t &> %t.txt ; FileCheck --check-prefixes CHECK,CHECK-DEVICE --input-file %t.txt %s
+// RUN: %{build} %device_sanitizer_flags -DMALLOC_DEVICE -O2 -g -o %t
+// RUN: env SYCL_PREFER_UR=1 ONEAPI_DEVICE_SELECTOR=opencl:cpu %{run-unfiltered-devices} not %t &> %t.txt ; FileCheck --check-prefixes CHECK,CHECK-DEVICE --input-file %t.txt %s
+// RUN: %{build} %device_sanitizer_flags -DMALLOC_HOST -O2 -g -o %t
+// RUN: env SYCL_PREFER_UR=1 ONEAPI_DEVICE_SELECTOR=opencl:cpu %{run-unfiltered-devices} not %t &> %t.txt ; FileCheck --check-prefixes CHECK,CHECK-HOST --input-file %t.txt %s
+// RUN: %{build} %device_sanitizer_flags -DMALLOC_SHARED -O2 -g -o %t
+// RUN: env SYCL_PREFER_UR=1 ONEAPI_DEVICE_SELECTOR=opencl:cpu %{run-unfiltered-devices} not %t &> %t.txt ; FileCheck --check-prefixes CHECK,CHECK-SHARED --input-file %t.txt %s
+#include <sycl/sycl.hpp>
+
+int main() {
+  sycl::queue Q;
+  constexpr std::size_t N = 12345;
+#if defined(MALLOC_HOST)
+  auto *array = sycl::malloc_host<char>(N, Q);
+#elif defined(MALLOC_SHARED)
+  auto *array = sycl::malloc_shared<char>(N, Q);
+#elif defined(MALLOC_DEVICE)
+  auto *array = sycl::malloc_device<char>(N, Q);
+#elif defined(MALLOC_SYSTEM)
+  auto *array = new char[N];
+#else
+#error "Must provide malloc type to run the test"
+#endif
+
+  Q.submit([&](sycl::handler &h) {
+    h.parallel_for<class MyKernelR_4>(
+        sycl::nd_range<1>(N + 1, 1),
+        [=](sycl::nd_item<1> item) { ++array[item.get_global_id(0)]; });
+  });
+  Q.wait();
+  // CHECK-DEVICE: ERROR: DeviceSanitizer: out-of-bounds-access on USM Device Memory
+  // CHECK-HOST:   ERROR: DeviceSanitizer: out-of-bounds-access on USM Host Memory
+  // CHECK-SHARED: ERROR: DeviceSanitizer: out-of-bounds-access on USM Shared Memory
+  // CHECK: {{READ of size 1 at kernel <.*MyKernelR_4> LID\(0, 0, 0\) GID\(12345, 0, 0\)}}
+  // CHECK: {{  #0 .* .*parallel_for_char.cpp:}}[[@LINE-7]]
+
+  return 0;
+}
diff --git a/sycl/test-e2e/AddressSanitizer/out-of-bounds/USM/parallel_for_double.cpp b/sycl/test-e2e/AddressSanitizer/out-of-bounds/USM/parallel_for_double.cpp
new file mode 100644
index 0000000000000..6644d6ba81f06
--- /dev/null
+++ b/sycl/test-e2e/AddressSanitizer/out-of-bounds/USM/parallel_for_double.cpp
@@ -0,0 +1,40 @@
+// REQUIRES: linux, cpu, aspect-fp64
+// RUN: %{build} %device_sanitizer_flags -DMALLOC_DEVICE -O1 -g -o %t
+// RUN: env SYCL_PREFER_UR=1 ONEAPI_DEVICE_SELECTOR=opencl:cpu %{run-unfiltered-devices} not %t &> %t.txt ; FileCheck --check-prefixes CHECK,CHECK-DEVICE --input-file %t.txt %s
+// RUN: %{build} %device_sanitizer_flags -DMALLOC_DEVICE -O2 -g -o %t
+// RUN: env SYCL_PREFER_UR=1 ONEAPI_DEVICE_SELECTOR=opencl:cpu %{run-unfiltered-devices} not %t &> %t.txt ; FileCheck --check-prefixes CHECK,CHECK-DEVICE --input-file %t.txt %s
+// RUN: %{build} %device_sanitizer_flags -DMALLOC_HOST -O2 -g -o %t
+// RUN: env SYCL_PREFER_UR=1 ONEAPI_DEVICE_SELECTOR=opencl:cpu %{run-unfiltered-devices} not %t &> %t.txt ; FileCheck --check-prefixes CHECK,CHECK-HOST --input-file %t.txt %s
+// RUN: %{build} %device_sanitizer_flags -DMALLOC_SHARED -O2 -g -o %t
+// RUN: env SYCL_PREFER_UR=1 ONEAPI_DEVICE_SELECTOR=opencl:cpu %{run-unfiltered-devices} not %t &> %t.txt ; FileCheck --check-prefixes CHECK,CHECK-SHARED --input-file %t.txt %s
+#include <sycl/sycl.hpp>
+
+int main() {
+  sycl::queue Q;
+  constexpr std::size_t N = 123456;
+#if defined(MALLOC_HOST)
+  auto *array = sycl::malloc_host<double>(N, Q);
+#elif defined(MALLOC_SHARED)
+  auto *array = sycl::malloc_shared<double>(N, Q);
+#elif defined(MALLOC_DEVICE)
+  auto *array = sycl::malloc_device<double>(N, Q);
+#elif defined(MALLOC_SYSTEM)
+  auto *array = new double[N];
+#else
+#error "Must provide malloc type to run the test"
+#endif
+
+  Q.submit([&](sycl::handler &h) {
+    h.parallel_for<class MyKernelR_4>(
+        sycl::nd_range<1>(N + 1, 1),
+        [=](sycl::nd_item<1> item) { ++array[item.get_global_id(0)]; });
+  });
+  Q.wait();
+  // CHECK-DEVICE: ERROR: DeviceSanitizer: out-of-bounds-access on USM Device Memory
+  // CHECK-HOST:   ERROR: DeviceSanitizer: out-of-bounds-access on USM Host Memory
+  // CHECK-SHARED: ERROR: DeviceSanitizer: out-of-bounds-access on USM Shared Memory
+  // CHECK: {{READ of size 8 at kernel <.*MyKernelR_4> LID\(0, 0, 0\) GID\(123456, 0, 0\)}}
+  // CHECK: {{  #0 .* .*parallel_for_double.cpp:}}[[@LINE-7]]
+
+  return 0;
+}
diff --git a/sycl/test-e2e/AddressSanitizer/out-of-bounds/USM/parallel_for_func.cpp b/sycl/test-e2e/AddressSanitizer/out-of-bounds/USM/parallel_for_func.cpp
new file mode 100644
index 0000000000000..631effebe4515
--- /dev/null
+++ b/sycl/test-e2e/AddressSanitizer/out-of-bounds/USM/parallel_for_func.cpp
@@ -0,0 +1,42 @@
+// REQUIRES: linux, cpu
+// RUN: %{build} %device_sanitizer_flags -DMALLOC_DEVICE -O1 -g -o %t
+// RUN: env SYCL_PREFER_UR=1 ONEAPI_DEVICE_SELECTOR=opencl:cpu %{run-unfiltered-devices} not %t &> %t.txt ; FileCheck --check-prefixes CHECK,CHECK-DEVICE --input-file %t.txt %s
+// RUN: %{build} %device_sanitizer_flags -DMALLOC_DEVICE -O2 -g -o %t
+// RUN: env SYCL_PREFER_UR=1 ONEAPI_DEVICE_SELECTOR=opencl:cpu %{run-unfiltered-devices} not %t &> %t.txt ; FileCheck --check-prefixes CHECK,CHECK-DEVICE --input-file %t.txt %s
+// RUN: %{build} %device_sanitizer_flags -DMALLOC_HOST -O2 -g -o %t
+// RUN: env SYCL_PREFER_UR=1 ONEAPI_DEVICE_SELECTOR=opencl:cpu %{run-unfiltered-devices} not %t &> %t.txt ; FileCheck --check-prefixes CHECK,CHECK-HOST --input-file %t.txt %s
+// RUN: %{build} %device_sanitizer_flags -DMALLOC_SHARED -O2 -g -o %t
+// RUN: env SYCL_PREFER_UR=1 ONEAPI_DEVICE_SELECTOR=opencl:cpu %{run-unfiltered-devices} not %t &> %t.txt ; FileCheck --check-prefixes CHECK,CHECK-SHARED --input-file %t.txt %s
+#include <sycl/sycl.hpp>
+
+__attribute__((noinline)) void foo(int *array, size_t i) { array[i] = 1; }
+// CHECK-DEVICE: ERROR: DeviceSanitizer: out-of-bounds-access on USM Device Memory
+// CHECK-HOST:   ERROR: DeviceSanitizer: out-of-bounds-access on USM Host Memory
+// CHECK-SHARED: ERROR: DeviceSanitizer: out-of-bounds-access on USM Shared Memory
+// CHECK: {{WRITE of size 4 at kernel <.*MyKernel> LID\(0, 0, 0\) GID\(123, 0, 0\)}}
+// CHECK: {{  #0 foo\(int\*, unsigned long\) .*parallel_for_func.cpp:}}[[@LINE-5]]
+
+int main() {
+  sycl::queue Q;
+  constexpr std::size_t N = 123;
+#if defined(MALLOC_HOST)
+  auto *array = sycl::malloc_host<int>(N, Q);
+#elif defined(MALLOC_SHARED)
+  auto *array = sycl::malloc_shared<int>(N, Q);
+#elif defined(MALLOC_DEVICE)
+  auto *array = sycl::malloc_device<int>(N, Q);
+#elif defined(MALLOC_SYSTEM)
+  auto *array = new int[N];
+#else
+#error "Must provide malloc type to run the test"
+#endif
+
+  Q.submit([&](sycl::handler &h) {
+    h.parallel_for<class MyKernel>(
+        sycl::nd_range<1>(N + 1, 1),
+        [=](sycl::nd_item<1> item) { foo(array, item.get_global_id(0)); });
+  });
+  Q.wait();
+
+  return 0;
+}
diff --git a/sycl/test-e2e/AddressSanitizer/out-of-bounds/USM/parallel_for_int.cpp b/sycl/test-e2e/AddressSanitizer/out-of-bounds/USM/parallel_for_int.cpp
new file mode 100644
index 0000000000000..5fef6ec395bfc
--- /dev/null
+++ b/sycl/test-e2e/AddressSanitizer/out-of-bounds/USM/parallel_for_int.cpp
@@ -0,0 +1,40 @@
+// REQUIRES: linux, cpu
+// RUN: %{build} %device_sanitizer_flags -DMALLOC_DEVICE -O1 -g -o %t
+// RUN: env SYCL_PREFER_UR=1 ONEAPI_DEVICE_SELECTOR=opencl:cpu %{run-unfiltered-devices} not %t &> %t.txt ; FileCheck --check-prefixes CHECK,CHECK-DEVICE --input-file %t.txt %s
+// RUN: %{build} %device_sanitizer_flags -DMALLOC_DEVICE -O2 -g -o %t
+// RUN: env SYCL_PREFER_UR=1 ONEAPI_DEVICE_SELECTOR=opencl:cpu %{run-unfiltered-devices} not %t &> %t.txt ; FileCheck --check-prefixes CHECK,CHECK-DEVICE --input-file %t.txt %s
+// RUN: %{build} %device_sanitizer_flags -DMALLOC_HOST -O2 -g -o %t
+// RUN: env SYCL_PREFER_UR=1 ONEAPI_DEVICE_SELECTOR=opencl:cpu %{run-unfiltered-devices} not %t &> %t.txt ; FileCheck --check-prefixes CHECK,CHECK-HOST --input-file %t.txt %s
+// RUN: %{build} %device_sanitizer_flags -DMALLOC_SHARED -O2 -g -o %t
+// RUN: env SYCL_PREFER_UR=1 ONEAPI_DEVICE_SELECTOR=opencl:cpu %{run-unfiltered-devices} not %t &> %t.txt ; FileCheck --check-prefixes CHECK,CHECK-SHARED --input-file %t.txt %s
+#include <sycl/sycl.hpp>
+
+int main() {
+  sycl::queue Q;
+  constexpr std::size_t N = 1234567;
+#if defined(MALLOC_HOST)
+  auto *array = sycl::malloc_host<int>(N, Q);
+#elif defined(MALLOC_SHARED)
+  auto *array = sycl::malloc_shared<int>(N, Q);
+#elif defined(MALLOC_DEVICE)
+  auto *array = sycl::malloc_device<int>(N, Q);
+#elif defined(MALLOC_SYSTEM)
+  auto *array = new int[N];
+#else
+#error "Must provide malloc type to run the test"
+#endif
+
+  Q.submit([&](sycl::handler &h) {
+    h.parallel_for<class MyKernelR_4>(
+        sycl::nd_range<1>(N + 1, 1),
+        [=](sycl::nd_item<1> item) { ++array[item.get_global_id(0)]; });
+  });
+  Q.wait();
+  // CHECK-DEVICE: ERROR: DeviceSanitizer: out-of-bounds-access on USM Device Memory
+  // CHECK-HOST:   ERROR: DeviceSanitizer: out-of-bounds-access on USM Host Memory
+  // CHECK-SHARED: ERROR: DeviceSanitizer: out-of-bounds-access on USM Shared Memory
+  // CHECK: {{READ of size 4 at kernel <.*MyKernelR_4> LID\(0, 0, 0\) GID\(1234567, 0, 0\)}}
+  // CHECK: {{  #0 .* .*parallel_for_int.cpp:}}[[@LINE-7]]
+
+  return 0;
+}
diff --git a/sycl/test-e2e/AddressSanitizer/out-of-bounds/USM/parallel_for_short.cpp b/sycl/test-e2e/AddressSanitizer/out-of-bounds/USM/parallel_for_short.cpp
new file mode 100644
index 0000000000000..85f81619f858f
--- /dev/null
+++ b/sycl/test-e2e/AddressSanitizer/out-of-bounds/USM/parallel_for_short.cpp
@@ -0,0 +1,40 @@
+// REQUIRES: linux, cpu
+// RUN: %{build} %device_sanitizer_flags -DMALLOC_DEVICE -O1 -g -o %t
+// RUN: env SYCL_PREFER_UR=1 ONEAPI_DEVICE_SELECTOR=opencl:cpu %{run-unfiltered-devices} not %t &> %t.txt ; FileCheck --check-prefixes CHECK,CHECK-DEVICE --input-file %t.txt %s
+// RUN: %{build} %device_sanitizer_flags -DMALLOC_DEVICE -O2 -g -o %t
+// RUN: env SYCL_PREFER_UR=1 ONEAPI_DEVICE_SELECTOR=opencl:cpu %{run-unfiltered-devices} not %t &> %t.txt ; FileCheck --check-prefixes CHECK,CHECK-DEVICE --input-file %t.txt %s
+// RUN: %{build} %device_sanitizer_flags -DMALLOC_HOST -O2 -g -o %t
+// RUN: env SYCL_PREFER_UR=1 ONEAPI_DEVICE_SELECTOR=opencl:cpu %{run-unfiltered-devices} not %t &> %t.txt ; FileCheck --check-prefixes CHECK,CHECK-HOST --input-file %t.txt %s
+// RUN: %{build} %device_sanitizer_flags -DMALLOC_SHARED -O2 -g -o %t
+// RUN: env SYCL_PREFER_UR=1 ONEAPI_DEVICE_SELECTOR=opencl:cpu %{run-unfiltered-devices} not %t &> %t.txt ; FileCheck --check-prefixes CHECK,CHECK-SHARED --input-file %t.txt %s
+#include <sycl/sycl.hpp>
+
+int main() {
+  sycl::queue Q;
+  constexpr std::size_t N = 123456789;
+#if defined(MALLOC_HOST)
+  auto *array = sycl::malloc_host<short>(N, Q);
+#elif defined(MALLOC_SHARED)
+  auto *array = sycl::malloc_shared<short>(N, Q);
+#elif defined(MALLOC_DEVICE)
+  auto *array = sycl::malloc_device<short>(N, Q);
+#elif defined(MALLOC_SYSTEM)
+  auto *array = new short[N];
+#else
+#error "Must provide malloc type to run the test"
+#endif
+
+  Q.submit([&](sycl::handler &h) {
+    h.parallel_for<class MyKernelR_4>(
+        sycl::nd_range<1>(N + 1, 1),
+        [=](sycl::nd_item<1> item) { ++array[item.get_global_id(0)]; });
+  });
+  Q.wait();
+  // CHECK-DEVICE: ERROR: DeviceSanitizer: out-of-bounds-access on USM Device Memory
+  // CHECK-HOST:   ERROR: DeviceSanitizer: out-of-bounds-access on USM Host Memory
+  // CHECK-SHARED: ERROR: DeviceSanitizer: out-of-bounds-access on USM Shared Memory
+  // CHECK: {{READ of size 2 at kernel <.*MyKernelR_4> LID\(0, 0, 0\) GID\(123456789, 0, 0\)}}
+  // CHECK: {{  #0 .* .*parallel_for_short.cpp:}}[[@LINE-7]]
+
+  return 0;
+}
diff --git a/sycl/test-e2e/AddressSanitizer/out-of-bounds/local/local-overflow-1.cpp b/sycl/test-e2e/AddressSanitizer/out-of-bounds/local/local-overflow-1.cpp
new file mode 100644
index 0000000000000..b8278b209aa6a
--- /dev/null
+++ b/sycl/test-e2e/AddressSanitizer/out-of-bounds/local/local-overflow-1.cpp
@@ -0,0 +1,29 @@
+// REQUIRES: linux, cpu
+// RUN: %{build} %device_sanitizer_flags -g -o %t.out
+// RUN: env SYCL_PREFER_UR=1 ONEAPI_DEVICE_SELECTOR=opencl:cpu %{run-unfiltered-devices} not %t.out 2>&1 | FileCheck %s
+#include <sycl/sycl.hpp>
+
+constexpr std::size_t N = 16;
+constexpr std::size_t group_size = 8;
+
+int main() {
+  sycl::queue Q;
+  auto *data = sycl::malloc_host<int>(1, Q);
+
+  Q.submit([&](sycl::handler &h) {
+    h.parallel_for<class MyKernel>(
+        sycl::nd_range<1>(N, group_size), [=](sycl::nd_item<1> item) {
+          sycl::multi_ptr<int[N], sycl::access::address_space::local_space>
+              ptr = sycl::ext::oneapi::group_local_memory<int[N]>(
+                  item.get_group());
+          auto &ref = *ptr;
+          ref[item.get_local_linear_id() * 2 + 4] = 42;
+          // CHECK: ERROR: DeviceSanitizer: out-of-bounds-access on Local Memory
+          // CHECK: {{WRITE of size 4 at kernel <.*MyKernel> LID\(6, 0, 0\) GID\(.*, 0, 0\)}}
+          // CHECK: {{  #0 .* .*local-overflow-1.cpp:}}[[@LINE-3]]
+        });
+  });
+
+  Q.wait();
+  return 0;
+}
diff --git a/sycl/test-e2e/lit.cfg.py b/sycl/test-e2e/lit.cfg.py
index 8c35fdfa2b65b..e1b2b4a1ef28d 100644
--- a/sycl/test-e2e/lit.cfg.py
+++ b/sycl/test-e2e/lit.cfg.py
@@ -686,3 +686,7 @@
     lit_config.maxIndividualTestTime = 600
 except ImportError:
     pass
+
+config.substitutions.append(
+    ("%device_sanitizer_flags", "-Xsycl-target-frontend -fsanitize=address")
+)

From e94b24718e60a7fa03ca1abbde4f7e37bbd0557d Mon Sep 17 00:00:00 2001
From: Udit Agarwal <16324601+uditagarwal97@users.noreply.github.com>
Date: Mon, 5 Feb 2024 00:30:09 -0800
Subject: [PATCH 5/9] [SYCL] Replace `acc` with 'fpga' in tests using
 ONEAPI_DEVICE_SELECTOR (#12551)

As per the ONEAPI_DEVICE_SELECTOR
[documentation](https://github.com/intel/llvm/blob/sycl/sycl/doc/EnvironmentVariables.md#oneapi_device_selector),
the device type can only be cpu, gpu, or fpga (or any combination of
those). Currently, 'acc' is also accepted by ONEAPI_DEVICE_SELECTOR as a
valid device type, which is incorrect.

This PR modifies existing test cases to use 'fpga' instead of 'acc' in
ONEAPI_DEVICE_SELECTOR.
In future, we will completely drop 'acc' from device type in
ONEAPI_DEVICE_SELECTOR.

Partially addresses: https://github.com/intel/llvm/issues/12387
---
 sycl/test-e2e/FilterSelector/filter_list_cpu_gpu_acc.cpp  | 8 ++++----
 sycl/test-e2e/FilterSelector/select_device.cpp            | 4 ++--
 sycl/test-e2e/FilterSelector/select_device_acc.cpp        | 2 +-
 .../is_compatible/is_compatible_several_targets.cpp       | 2 +-
 sycl/test-e2e/Sampler/basic-rw.cpp                        | 4 ++--
 5 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/sycl/test-e2e/FilterSelector/filter_list_cpu_gpu_acc.cpp b/sycl/test-e2e/FilterSelector/filter_list_cpu_gpu_acc.cpp
index ab3154380fe07..5e9022054cb33 100644
--- a/sycl/test-e2e/FilterSelector/filter_list_cpu_gpu_acc.cpp
+++ b/sycl/test-e2e/FilterSelector/filter_list_cpu_gpu_acc.cpp
@@ -10,14 +10,14 @@
 
 // RUN: %clangxx -fsycl %S/Inputs/filter_list_queries.cpp -o %t.out
 
-// RUN: env ONEAPI_DEVICE_SELECTOR="*:acc" %{run-unfiltered-devices} %t.out | FileCheck %s --check-prefixes=CHECK-ACC-ONLY
+// RUN: env ONEAPI_DEVICE_SELECTOR="*:fpga" %{run-unfiltered-devices} %t.out | FileCheck %s --check-prefixes=CHECK-ACC-ONLY
 // RUN: env ONEAPI_DEVICE_SELECTOR="*:gpu" %{run-unfiltered-devices} %t.out | FileCheck %s --check-prefixes=CHECK-GPU-ONLY
 // RUN: env ONEAPI_DEVICE_SELECTOR="*:cpu" %{run-unfiltered-devices} %t.out | FileCheck %s --check-prefixes=CHECK-CPU-ONLY
 //
-// RUN: env ONEAPI_DEVICE_SELECTOR="*:acc,gpu" %{run-unfiltered-devices} %t.out | FileCheck %s --check-prefixes=CHECK-ACC-GPU
-// RUN: env ONEAPI_DEVICE_SELECTOR="*:acc,cpu" %{run-unfiltered-devices} %t.out | FileCheck %s --check-prefixes=CHECK-ACC-CPU
+// RUN: env ONEAPI_DEVICE_SELECTOR="*:fpga,gpu" %{run-unfiltered-devices} %t.out | FileCheck %s --check-prefixes=CHECK-ACC-GPU
+// RUN: env ONEAPI_DEVICE_SELECTOR="*:fpga,cpu" %{run-unfiltered-devices} %t.out | FileCheck %s --check-prefixes=CHECK-ACC-CPU
 //
-// RUN: env ONEAPI_DEVICE_SELECTOR="*:cpu,acc,gpu" %{run-unfiltered-devices} %t.out | FileCheck %s --check-prefixes=CHECK-ACC-GPU-CPU
+// RUN: env ONEAPI_DEVICE_SELECTOR="*:cpu,fpga,gpu" %{run-unfiltered-devices} %t.out | FileCheck %s --check-prefixes=CHECK-ACC-GPU-CPU
 //
 // CHECK-ACC-ONLY: Device: acc
 // CHECK-ACC-ONLY-NOT: Device: cpu
diff --git a/sycl/test-e2e/FilterSelector/select_device.cpp b/sycl/test-e2e/FilterSelector/select_device.cpp
index 7516545ab59fd..ddb725b0523a4 100644
--- a/sycl/test-e2e/FilterSelector/select_device.cpp
+++ b/sycl/test-e2e/FilterSelector/select_device.cpp
@@ -4,7 +4,7 @@
 // RUN: env ONEAPI_DEVICE_SELECTOR=level_zero:gpu %{run-unfiltered-devices} %t.out
 // RUN: env ONEAPI_DEVICE_SELECTOR=opencl:gpu %{run-unfiltered-devices} %t.out
 // RUN: env ONEAPI_DEVICE_SELECTOR='*:cpu;level_zero:gpu' %{run-unfiltered-devices} %t.out
-// RUN: env ONEAPI_DEVICE_SELECTOR=opencl:acc %{run-unfiltered-devices} %t.out
+// RUN: env ONEAPI_DEVICE_SELECTOR=opencl:fpga %{run-unfiltered-devices} %t.out
 //
 // Checks if only specified device types can be acquired from select_device
 // when ONEAPI_DEVICE_SELECTOR is set
@@ -45,7 +45,7 @@ int main() {
     device d = cs.select_device();
   }
   if (!envVal || forcedPIs == "*" ||
-      forcedPIs.find("acc") != std::string::npos) {
+      forcedPIs.find("fpga") != std::string::npos) {
     accelerator_selector as;
     device d = as.select_device();
   }
diff --git a/sycl/test-e2e/FilterSelector/select_device_acc.cpp b/sycl/test-e2e/FilterSelector/select_device_acc.cpp
index 38d97652f8441..f5b94b535bbd4 100644
--- a/sycl/test-e2e/FilterSelector/select_device_acc.cpp
+++ b/sycl/test-e2e/FilterSelector/select_device_acc.cpp
@@ -1,5 +1,5 @@
 // RUN: %{build} -o %t.out
-// RUN: env ONEAPI_DEVICE_SELECTOR='*:acc' %{run-unfiltered-devices} %t.out
+// RUN: env ONEAPI_DEVICE_SELECTOR='*:fpga' %{run-unfiltered-devices} %t.out
 //
 // Checks if only specified device types can be acquired from select_device
 // when ONEAPI_DEVICE_SELECTOR is set
diff --git a/sycl/test-e2e/OptionalKernelFeatures/is_compatible/is_compatible_several_targets.cpp b/sycl/test-e2e/OptionalKernelFeatures/is_compatible/is_compatible_several_targets.cpp
index 6dcc4690880d6..042874b90290c 100644
--- a/sycl/test-e2e/OptionalKernelFeatures/is_compatible/is_compatible_several_targets.cpp
+++ b/sycl/test-e2e/OptionalKernelFeatures/is_compatible/is_compatible_several_targets.cpp
@@ -3,6 +3,6 @@
 // RUN: %clangxx -fsycl -fsycl-targets=spir64_fpga,spir64_gen -Xsycl-target-backend "-device *" %S/Inputs/is_compatible_with_env.cpp -o %t.out
 
 // RUN: env ONEAPI_DEVICE_SELECTOR=opencl:cpu %{run} not %t.out
-// RUN: env ONEAPI_DEVICE_SELECTOR=opencl:acc %{run} %t.out
+// RUN: env ONEAPI_DEVICE_SELECTOR=opencl:fpga %{run} %t.out
 // RUN: env ONEAPI_DEVICE_SELECTOR=opencl:gpu %{run} %t.out
 // RUN: env ONEAPI_DEVICE_SELECTOR=level_zero:gpu %{run} %t.out
diff --git a/sycl/test-e2e/Sampler/basic-rw.cpp b/sycl/test-e2e/Sampler/basic-rw.cpp
index 4e5aae87de067..f73ce7e391ea0 100644
--- a/sycl/test-e2e/Sampler/basic-rw.cpp
+++ b/sycl/test-e2e/Sampler/basic-rw.cpp
@@ -15,8 +15,8 @@
     ONEAPI_DEVICE_SELECTOR=level_zero:gpu ./binx.bin
     ONEAPI_DEVICE_SELECTOR=opencl:cpu ./binx.bin
 
-    ONEAPI_DEVICE_SELECTOR=opecl:acc ../binx.bin    <--  does not support image
-   operations at this time.
+    ONEAPI_DEVICE_SELECTOR=opencl:fpga ../binx.bin    <--  does not support
+   image operations at this time.
 
 */
 

From 015deb19a7db1dde8f8376b3b7d5e94c7642ca93 Mon Sep 17 00:00:00 2001
From: Dmitry Vodopyanov <dmitry.vodopyanov@intel.com>
Date: Mon, 5 Feb 2024 13:06:22 +0100
Subject: [PATCH 6/9] [SYCL][Doc][NFC] Rename 'fallback' to 'generic' in two
 specs (#12544)

The change was made based on a comment
https://github.com/intel/llvm/pull/12259#pullrequestreview-1843169980
---
 .../experimental/sycl_ext_oneapi_device_architecture.asciidoc | 4 ++--
 .../extensions/proposed/sycl_ext_oneapi_device_if.asciidoc    | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/sycl/doc/extensions/experimental/sycl_ext_oneapi_device_architecture.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_oneapi_device_architecture.asciidoc
index dadccc099e016..16a950af3151a 100644
--- a/sycl/doc/extensions/experimental/sycl_ext_oneapi_device_architecture.asciidoc
+++ b/sycl/doc/extensions/experimental/sycl_ext_oneapi_device_architecture.asciidoc
@@ -973,7 +973,7 @@ int main() {
     syclex::if_architecture_is<syclex::architecture::intel_gpu_pvc>([&]{
       // Code for PVC
     }).otherwise([&]{
-      // Fallback code
+      // Generic code
     });
   });
 
@@ -987,7 +987,7 @@ int main() {
                                        syclex::architecture::amd_gpu_gfx1013>([&]{
       // Code for AMD devices between gfx1010 and gfx1013 (inclusive)
     }).otherwise([&]{
-      // Fallback code
+      // Generic code
     });
   });
 }
diff --git a/sycl/doc/extensions/proposed/sycl_ext_oneapi_device_if.asciidoc b/sycl/doc/extensions/proposed/sycl_ext_oneapi_device_if.asciidoc
index 8d2b31b3ecd32..73c9c5a9ac9bc 100644
--- a/sycl/doc/extensions/proposed/sycl_ext_oneapi_device_if.asciidoc
+++ b/sycl/doc/extensions/proposed/sycl_ext_oneapi_device_if.asciidoc
@@ -152,7 +152,7 @@ executes the `if_device_has` function has **all** of the aspects listed in this
 pack.  If the condition is `true`, the implementation calls `fn`.  Otherwise,
 the function `fn` is potentially discarded as described below.
 
-=== Fallback code
+=== Generic code
 
 The value returned by `if_device_has` is an object _F_ of an unspecified type,
 which provides the following member functions:
@@ -200,7 +200,7 @@ void frob() {
   }).else_if_device_has<aspect::bar>([] {
     // code that uses features tied to "bar" aspect
   }).otherwise([] {
-    // fallback code that works on all devices
+    // generic code that works on all devices
   });
 }
 ```

From b781e6ca1fae2fe0b7bbf5be406119c0f1261257 Mon Sep 17 00:00:00 2001
From: Nicolas Miller <nicolas.miller@codeplay.com>
Date: Mon, 5 Feb 2024 13:25:35 +0000
Subject: [PATCH 7/9] [SYCL][CUDA][HIP] Remove CUDA and HIP PI unit tests
 (#12459)

These tests are not currently running and are covered in other test
suites:

* `test_primary_context.cpp`
  * Deprecated feature, covered in `test-e2e/Basic/context.cpp`
* `test_commands.cpp`
  * Covered by UR CTS
* `test_sampler_properties.cpp`
* Covered by UR CTS:
https://github.com/oneapi-src/unified-runtime/tree/main/test/conformance/sampler
* `PlatformTest.cpp`
* Covered by UR CTS:
https://github.com/oneapi-src/unified-runtime/blob/main/test/conformance/platform/urPlatformGetInfo.cpp
* `test_device.cpp`
* Covered by UR CTS:
https://github.com/oneapi-src/unified-runtime/blob/main/test/conformance/device/urDeviceGetInfo.cpp
* `EnqueueMemTest.cpp`
* Covered by UR CTS:
https://github.com/oneapi-src/unified-runtime/blob/main/test/conformance/enqueue/urEnqueueMemBufferFill.cpp
* `test_mem_obj.cpp`
  * Moved to UR CTS
* `test_contexts.cpp`
*
https://github.com/oneapi-src/unified-runtime/blob/main/test/adapters/cuda/context_tests.cpp
* `test_kernels.cpp`
*
https://github.com/oneapi-src/unified-runtime/blob/main/test/adapters/cuda/kernel_tests.cpp
* `test_base_objects.cpp`
  * Basic tests mostly covered in UR
* `test_interop_get_native.cpp`
  * Mostly covered in UR tests and E2E tests

After this both the CUDA and HIP directories could be removed. There are
two PI tests remaining, one with regards to xpti handling of PI call
arguments, and one regarding OpenCL interop ownership.
---
 sycl/unittests/pi/CMakeLists.txt              |  10 -
 sycl/unittests/pi/EnqueueMemTest.cpp          | 152 ------
 sycl/unittests/pi/PlatformTest.cpp            | 113 -----
 sycl/unittests/pi/cuda/CMakeLists.txt         |  32 --
 sycl/unittests/pi/cuda/CudaUtils.hpp          |  20 -
 sycl/unittests/pi/cuda/test_base_objects.cpp  | 139 ------
 sycl/unittests/pi/cuda/test_commands.cpp      | 145 ------
 sycl/unittests/pi/cuda/test_contexts.cpp      | 250 ----------
 sycl/unittests/pi/cuda/test_device.cpp        | 111 -----
 .../pi/cuda/test_interop_get_native.cpp       | 137 -----
 sycl/unittests/pi/cuda/test_kernels.cpp       | 466 ------------------
 sycl/unittests/pi/cuda/test_mem_obj.cpp       | 207 --------
 .../pi/cuda/test_primary_context.cpp          |  94 ----
 .../pi/cuda/test_sampler_properties.cpp       | 135 -----
 sycl/unittests/pi/hip/CMakeLists.txt          |  42 --
 sycl/unittests/pi/hip/HipUtils.hpp            |  20 -
 sycl/unittests/pi/hip/test_base_objects.cpp   | 141 ------
 sycl/unittests/pi/hip/test_commands.cpp       | 145 ------
 sycl/unittests/pi/hip/test_contexts.cpp       | 250 ----------
 sycl/unittests/pi/hip/test_device.cpp         | 111 -----
 .../pi/hip/test_interop_get_native.cpp        | 127 -----
 sycl/unittests/pi/hip/test_kernels.cpp        |  79 ---
 sycl/unittests/pi/hip/test_mem_obj.cpp        | 206 --------
 .../unittests/pi/hip/test_primary_context.cpp |  85 ----
 .../pi/hip/test_sampler_properties.cpp        | 135 -----
 25 files changed, 3352 deletions(-)
 delete mode 100644 sycl/unittests/pi/EnqueueMemTest.cpp
 delete mode 100644 sycl/unittests/pi/PlatformTest.cpp
 delete mode 100644 sycl/unittests/pi/cuda/CMakeLists.txt
 delete mode 100644 sycl/unittests/pi/cuda/CudaUtils.hpp
 delete mode 100644 sycl/unittests/pi/cuda/test_base_objects.cpp
 delete mode 100644 sycl/unittests/pi/cuda/test_commands.cpp
 delete mode 100644 sycl/unittests/pi/cuda/test_contexts.cpp
 delete mode 100644 sycl/unittests/pi/cuda/test_device.cpp
 delete mode 100644 sycl/unittests/pi/cuda/test_interop_get_native.cpp
 delete mode 100644 sycl/unittests/pi/cuda/test_kernels.cpp
 delete mode 100644 sycl/unittests/pi/cuda/test_mem_obj.cpp
 delete mode 100644 sycl/unittests/pi/cuda/test_primary_context.cpp
 delete mode 100644 sycl/unittests/pi/cuda/test_sampler_properties.cpp
 delete mode 100644 sycl/unittests/pi/hip/CMakeLists.txt
 delete mode 100644 sycl/unittests/pi/hip/HipUtils.hpp
 delete mode 100644 sycl/unittests/pi/hip/test_base_objects.cpp
 delete mode 100644 sycl/unittests/pi/hip/test_commands.cpp
 delete mode 100644 sycl/unittests/pi/hip/test_contexts.cpp
 delete mode 100644 sycl/unittests/pi/hip/test_device.cpp
 delete mode 100644 sycl/unittests/pi/hip/test_interop_get_native.cpp
 delete mode 100644 sycl/unittests/pi/hip/test_kernels.cpp
 delete mode 100644 sycl/unittests/pi/hip/test_mem_obj.cpp
 delete mode 100644 sycl/unittests/pi/hip/test_primary_context.cpp
 delete mode 100644 sycl/unittests/pi/hip/test_sampler_properties.cpp

diff --git a/sycl/unittests/pi/CMakeLists.txt b/sycl/unittests/pi/CMakeLists.txt
index 0c78c9a634010..861fc41069c7e 100644
--- a/sycl/unittests/pi/CMakeLists.txt
+++ b/sycl/unittests/pi/CMakeLists.txt
@@ -1,9 +1,7 @@
 set(CMAKE_CXX_EXTENSIONS OFF)
 
 add_sycl_unittest(PiTests OBJECT
-  EnqueueMemTest.cpp
   PiMock.cpp
-  PlatformTest.cpp
   PiUtility.cpp
   pi_arguments_handler.cpp
   piInteropRetain.cpp
@@ -13,11 +11,3 @@ add_dependencies(PiTests sycl)
 target_include_directories(PiTests PRIVATE SYSTEM ${sycl_inc_dir})
 target_include_directories(PiTests PRIVATE ${sycl_src_dir}/../tools/xpti_helpers)
 
-if("cuda" IN_LIST SYCL_ENABLE_PLUGINS)
-    add_subdirectory(cuda)
-endif()
-
-if("hip" IN_LIST SYCL_ENABLE_PLUGINS)
-    add_subdirectory(hip)
-endif()
-
diff --git a/sycl/unittests/pi/EnqueueMemTest.cpp b/sycl/unittests/pi/EnqueueMemTest.cpp
deleted file mode 100644
index d6439654f7bbb..0000000000000
--- a/sycl/unittests/pi/EnqueueMemTest.cpp
+++ /dev/null
@@ -1,152 +0,0 @@
-//==---- EnqueueMemTest.cpp --- PI unit tests ------------------------------==//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "TestGetPlugin.hpp"
-#include <detail/plugin.hpp>
-#include <gtest/gtest.h>
-#include <sycl/detail/pi.hpp>
-
-using namespace sycl;
-
-namespace {
-class EnqueueMemTest : public testing::TestWithParam<detail::PluginPtr> {
-protected:
-  constexpr static size_t _numElementsX = 8;
-  constexpr static size_t _numElementsY = 4;
-
-  pi_device _device = nullptr;
-  pi_context _context = nullptr;
-  pi_queue _queue = nullptr;
-  pi_mem _mem = nullptr;
-
-  EnqueueMemTest() = default;
-
-  ~EnqueueMemTest() = default;
-
-  void SetUp() override {
-
-    const detail::PluginPtr &plugin = GetParam();
-
-    pi_platform platform = nullptr;
-    ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piPlatformsGet>(
-                  1, &platform, nullptr)),
-              PI_SUCCESS);
-
-    ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piDevicesGet>(
-                  platform, PI_DEVICE_TYPE_DEFAULT, 1, &_device, nullptr)),
-              PI_SUCCESS);
-
-    pi_result result = PI_ERROR_INVALID_VALUE;
-    result = plugin->call_nocheck<detail::PiApiKind::piContextCreate>(
-        nullptr, 1u, &_device, nullptr, nullptr, &_context);
-    ASSERT_EQ(result, PI_SUCCESS);
-
-    ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piQueueCreate>(
-                  _context, _device, 0, &_queue)),
-              PI_SUCCESS);
-
-    ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piMemBufferCreate>(
-                  _context, PI_MEM_FLAGS_ACCESS_RW,
-                  _numElementsX * _numElementsY * sizeof(pi_int32), nullptr,
-                  &_mem, nullptr)),
-              PI_SUCCESS);
-  }
-
-  void TearDown() override {
-
-    const detail::PluginPtr &plugin = GetParam();
-
-    ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piMemRelease>(_mem)),
-              PI_SUCCESS);
-    ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piQueueRelease>(_queue)),
-              PI_SUCCESS);
-    ASSERT_EQ(
-        (plugin->call_nocheck<detail::PiApiKind::piContextRelease>(_context)),
-        PI_SUCCESS);
-  }
-
-  template <typename T> void TestBufferFill(const T &pattern) {
-
-    const detail::PluginPtr &plugin = GetParam();
-
-    T inValues[_numElementsX] = {};
-
-    for (size_t i = 0; i < _numElementsX; ++i) {
-      ASSERT_NE(pattern, inValues[i]);
-    }
-
-    pi_event event;
-    ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piEnqueueMemBufferWrite>(
-                  _queue, _mem, PI_TRUE, 0, _numElementsX * sizeof(T), inValues,
-                  0, nullptr, &event)),
-              PI_SUCCESS);
-
-    ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piEnqueueMemBufferFill>(
-                  _queue, _mem, &pattern, sizeof(T), 0, sizeof(inValues), 0,
-                  nullptr, &event)),
-              PI_SUCCESS);
-    ASSERT_EQ(
-        (plugin->call_nocheck<detail::PiApiKind::piEventsWait>(1, &event)),
-        PI_SUCCESS);
-
-    T outValues[_numElementsX] = {};
-    ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piEnqueueMemBufferRead>(
-                  _queue, _mem, PI_TRUE, 0, _numElementsX * sizeof(T),
-                  outValues, 0, nullptr, &event)),
-              PI_SUCCESS);
-
-    for (size_t i = 0; i < _numElementsX; ++i) {
-      ASSERT_EQ(pattern, outValues[i]);
-    }
-  }
-};
-
-INSTANTIATE_TEST_SUITE_P(
-    EnqueueMemTestImpl, EnqueueMemTest,
-    testing::ValuesIn(pi::initializeAndRemoveInvalid()),
-    [](const testing::TestParamInfo<EnqueueMemTest::ParamType> &info) {
-      return pi::GetBackendString(info.param);
-    });
-
-template <typename T> struct vec4 {
-  T x, y, z, w;
-
-  bool operator==(const vec4 &rhs) const {
-    return x == rhs.x && y == rhs.y && z == rhs.z && w == rhs.w;
-  }
-
-  bool operator!=(const vec4 &rhs) const { return !(*this == rhs); }
-};
-
-template <typename T> struct vec2 {
-  T x, y;
-
-  bool operator==(const vec2 &rhs) const { return x == rhs.x && y == rhs.y; }
-
-  bool operator!=(const vec2 &rhs) const { return !(*this == rhs); }
-};
-
-TEST_P(EnqueueMemTest, piEnqueueMemBufferFill) {
-
-  TestBufferFill(float{1});
-  TestBufferFill(vec2<float>{1, 2});
-  TestBufferFill(vec4<float>{1, 2, 3, 4});
-
-  TestBufferFill(uint8_t{1});
-  TestBufferFill(vec2<uint8_t>{1, 2});
-  TestBufferFill(vec4<uint8_t>{1, 2, 3, 4});
-
-  TestBufferFill(uint16_t{1});
-  TestBufferFill(vec2<uint16_t>{1, 2});
-  TestBufferFill(vec4<uint16_t>{1, 2, 3, 4});
-
-  TestBufferFill(uint32_t{1});
-  TestBufferFill(vec2<uint32_t>{1, 2});
-  TestBufferFill(vec4<uint32_t>{1, 2, 3, 4});
-}
-} // namespace
diff --git a/sycl/unittests/pi/PlatformTest.cpp b/sycl/unittests/pi/PlatformTest.cpp
deleted file mode 100644
index 61834dbb14fff..0000000000000
--- a/sycl/unittests/pi/PlatformTest.cpp
+++ /dev/null
@@ -1,113 +0,0 @@
-//==---- PlatformTest.cpp --- PI unit tests --------------------------------==//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "TestGetPlugin.hpp"
-#include <detail/plugin.hpp>
-#include <gtest/gtest.h>
-#include <sycl/detail/pi.hpp>
-#include <sycl/sycl.hpp>
-#include <vector>
-
-namespace {
-
-using namespace sycl;
-
-class PlatformTest : public testing::TestWithParam<detail::PluginPtr> {
-protected:
-  std::vector<pi_platform> _platforms;
-  PlatformTest() : _platforms{} {};
-
-  ~PlatformTest() override = default;
-
-  void SetUp() override {
-
-    const detail::PluginPtr &plugin = GetParam();
-
-    ASSERT_NO_FATAL_FAILURE(Test::SetUp());
-
-    const static char *platform_count_key = "PiPlatformCount";
-
-    pi_uint32 platform_count = 0u;
-
-    // Initialize the logged number of platforms before the following assertion.
-    RecordProperty(platform_count_key, platform_count);
-
-    // TODO: Change the test to check this for all plugins present.
-    // Currently, it is only checking for the first plugin attached.
-    ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piPlatformsGet>(
-                  0, nullptr, &platform_count)),
-              PI_SUCCESS);
-
-    // Overwrite previous log value with queried number of platforms.
-    RecordProperty(platform_count_key, platform_count);
-
-    if (platform_count == 0u) {
-      std::cout << "WARNING: piPlatformsGet does not find any PI platforms.\n";
-
-      // Do not call into OpenCL below as a platform count of 0 might fail with
-      // OpenCL implementations if the platforms pointer is not `nullptr`.
-      return;
-    }
-
-    _platforms.resize(platform_count, nullptr);
-
-    ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piPlatformsGet>(
-                  _platforms.size(), _platforms.data(), nullptr)),
-              PI_SUCCESS);
-  }
-};
-
-INSTANTIATE_TEST_SUITE_P(
-    PlatformTestImpl, PlatformTest,
-    testing::ValuesIn(pi::initializeAndRemoveInvalid()),
-    [](const testing::TestParamInfo<PlatformTest::ParamType> &info) {
-      return pi::GetBackendString(info.param);
-    });
-
-TEST_P(PlatformTest, piPlatformsGet) {
-  // The PlatformTest::SetUp method is called to prepare for this test case
-  // implicitly tests the calls to `piPlatformsGet`.
-}
-
-TEST_P(PlatformTest, piPlatformGetInfo) {
-
-  const detail::PluginPtr &plugin = GetParam();
-
-  auto get_info_test = [&](pi_platform platform, _pi_platform_info info) {
-    size_t reported_string_length = 0;
-    EXPECT_EQ((plugin->call_nocheck<detail::PiApiKind::piPlatformGetInfo>(
-                  platform, info, 0u, nullptr, &reported_string_length)),
-              PI_SUCCESS);
-
-    // Create a larger result string to catch overwrites.
-    std::vector<char> param_value(reported_string_length * 2u, '\0');
-    EXPECT_EQ(
-        (plugin->call_nocheck<detail::PiApiKind::piPlatformGetInfo>(
-            platform, info, param_value.size(), param_value.data(), nullptr)),
-        PI_SUCCESS)
-        << "piPlatformGetInfo for " << detail::pi::platformInfoToString(info)
-        << " failed.\n";
-
-    const auto returned_string_length = strlen(param_value.data()) + 1;
-
-    EXPECT_EQ(returned_string_length, reported_string_length)
-        << "Returned string length " << returned_string_length
-        << " does not equal reported string length " << reported_string_length
-        << ".\n";
-  };
-
-  for (const auto &platform : _platforms) {
-    get_info_test(platform, PI_PLATFORM_INFO_NAME);
-    get_info_test(platform, PI_PLATFORM_INFO_VENDOR);
-    get_info_test(platform, PI_PLATFORM_INFO_PROFILE);
-    get_info_test(platform, PI_PLATFORM_INFO_VERSION);
-    get_info_test(platform, PI_PLATFORM_INFO_EXTENSIONS);
-    get_info_test(platform, PI_EXT_PLATFORM_INFO_BACKEND);
-  }
-}
-} // namespace
diff --git a/sycl/unittests/pi/cuda/CMakeLists.txt b/sycl/unittests/pi/cuda/CMakeLists.txt
deleted file mode 100644
index 7808340cc4302..0000000000000
--- a/sycl/unittests/pi/cuda/CMakeLists.txt
+++ /dev/null
@@ -1,32 +0,0 @@
-add_sycl_unittest(PiCudaTests OBJECT
-  test_base_objects.cpp
-  test_commands.cpp
-  test_contexts.cpp
-  test_device.cpp
-  test_interop_get_native.cpp
-  test_kernels.cpp
-  test_mem_obj.cpp
-  test_primary_context.cpp
-  test_sampler_properties.cpp
-)
-
-add_dependencies(PiCudaTests sycl)
-
-target_compile_definitions(PiCudaTests
-  PRIVATE
-    GTEST_HAS_COMBINE=1)
-
-target_include_directories(PiCudaTests
-  PRIVATE
-    "../"
-    "${sycl_inc_dir}/sycl/detail/"
-    "${sycl_inc_dir}"
-    "${sycl_plugin_dir}/cuda/"
-    "${sycl_plugin_dir}/unified_runtime/"
-)
-
-target_link_libraries(PiCudaTests
-  PRIVATE
-    cudadrv
-    UnifiedRuntime-Headers
-)
diff --git a/sycl/unittests/pi/cuda/CudaUtils.hpp b/sycl/unittests/pi/cuda/CudaUtils.hpp
deleted file mode 100644
index f7cb8b40492d3..0000000000000
--- a/sycl/unittests/pi/cuda/CudaUtils.hpp
+++ /dev/null
@@ -1,20 +0,0 @@
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-
-#pragma once
-
-#include <cuda.h>
-
-namespace pi {
-
-// utility function to clear the CUDA context stack
-inline void clearCudaContext() {
-  CUcontext ctxt = nullptr;
-  do {
-    cuCtxSetCurrent(nullptr);
-    cuCtxGetCurrent(&ctxt);
-  } while (ctxt != nullptr);
-}
-
-} // namespace pi
diff --git a/sycl/unittests/pi/cuda/test_base_objects.cpp b/sycl/unittests/pi/cuda/test_base_objects.cpp
deleted file mode 100644
index d0799a08cfff3..0000000000000
--- a/sycl/unittests/pi/cuda/test_base_objects.cpp
+++ /dev/null
@@ -1,139 +0,0 @@
-//==---- test_base_objects.cpp --- PI unit tests ---------------------------==//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "gtest/gtest.h"
-
-#include <cuda.h>
-
-#include "TestGetPlugin.hpp"
-#include <detail/plugin.hpp>
-#include <pi_cuda.hpp>
-#include <sycl/detail/cuda_definitions.hpp>
-#include <sycl/detail/pi.hpp>
-#include <sycl/sycl.hpp>
-
-#include <thread>
-
-const unsigned int LATEST_KNOWN_CUDA_DRIVER_API_VERSION = 3020u;
-
-using namespace sycl;
-
-class CudaBaseObjectsTest : public ::testing::Test {
-protected:
-  std::optional<detail::PluginPtr> &plugin =
-      pi::initializeAndGet(backend::ext_oneapi_cuda);
-
-  void SetUp() override {
-    // skip the tests if the CUDA backend is not available
-    if (!plugin.has_value()) {
-      GTEST_SKIP();
-    }
-  }
-
-  CudaBaseObjectsTest() = default;
-
-  ~CudaBaseObjectsTest() = default;
-};
-
-TEST_F(CudaBaseObjectsTest, piContextCreate) {
-  pi_uint32 numPlatforms = 0;
-  pi_platform platform = nullptr;
-  pi_device device;
-  ASSERT_EQ(plugin->hasBackend(backend::ext_oneapi_cuda), PI_SUCCESS);
-
-  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piPlatformsGet>(
-                0, nullptr, &numPlatforms)),
-            PI_SUCCESS)
-      << "piPlatformsGet failed.\n";
-
-  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piPlatformsGet>(
-                numPlatforms, &platform, nullptr)),
-            PI_SUCCESS)
-      << "piPlatformsGet failed.\n";
-
-  ASSERT_GE(numPlatforms, 1u);
-  ASSERT_NE(platform, nullptr);
-
-  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piDevicesGet>(
-                platform, PI_DEVICE_TYPE_GPU, 1, &device, nullptr)),
-            PI_SUCCESS)
-      << "piDevicesGet failed.\n";
-
-  pi_context ctxt = nullptr;
-  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piContextCreate>(
-                nullptr, 1, &device, nullptr, nullptr, &ctxt)),
-            PI_SUCCESS)
-      << "piContextCreate failed.\n";
-
-  EXPECT_NE(ctxt, nullptr);
-  EXPECT_EQ(ctxt->get_device(), device);
-
-  // Retrieve the cuCtxt to check information is correct
-  CUcontext cudaContext = ctxt->get();
-  unsigned int version = 0;
-  cuCtxGetApiVersion(cudaContext, &version);
-  EXPECT_EQ(version, LATEST_KNOWN_CUDA_DRIVER_API_VERSION);
-
-  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piContextRelease>(ctxt)),
-            PI_SUCCESS);
-}
-
-TEST_F(CudaBaseObjectsTest, piContextCreateChildThread) {
-  pi_uint32 numPlatforms = 0;
-  pi_platform platform;
-  pi_device device;
-
-  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piPlatformsGet>(
-                0, nullptr, &numPlatforms)),
-            PI_SUCCESS)
-      << "piPlatformsGet failed.\n";
-
-  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piPlatformsGet>(
-                numPlatforms, &platform, nullptr)),
-            PI_SUCCESS)
-      << "piPlatformsGet failed.\n";
-
-  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piDevicesGet>(
-                platform, PI_DEVICE_TYPE_GPU, 1, &device, nullptr)),
-            PI_SUCCESS);
-
-  pi_context ctxt;
-  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piContextCreate>(
-                nullptr, 1, &device, nullptr, nullptr, &ctxt)),
-            PI_SUCCESS);
-  EXPECT_NE(ctxt, nullptr);
-
-  // Retrieve the cuCtxt to check information is correct
-  auto checkValue = [=]() {
-    CUcontext cudaContext = ctxt->get();
-    unsigned int version = 0;
-    auto cuErr = cuCtxGetApiVersion(cudaContext, &version);
-    EXPECT_EQ(cuErr, CUDA_SUCCESS);
-    EXPECT_EQ(version, LATEST_KNOWN_CUDA_DRIVER_API_VERSION);
-
-    // The current context is different from the current thread
-    CUcontext current;
-    cuErr = cuCtxGetCurrent(&current);
-    EXPECT_EQ(cuErr, CUDA_SUCCESS);
-    EXPECT_NE(cudaContext, current);
-
-    // Set the context from PI API as the current one
-    cuErr = cuCtxPushCurrent(cudaContext);
-    EXPECT_EQ(cuErr, CUDA_SUCCESS);
-
-    cuErr = cuCtxGetCurrent(&current);
-    EXPECT_EQ(cuErr, CUDA_SUCCESS);
-    EXPECT_EQ(cudaContext, current);
-  };
-  auto callContextFromOtherThread = std::thread(checkValue);
-
-  callContextFromOtherThread.join();
-
-  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piContextRelease>(ctxt)),
-            PI_SUCCESS);
-}
diff --git a/sycl/unittests/pi/cuda/test_commands.cpp b/sycl/unittests/pi/cuda/test_commands.cpp
deleted file mode 100644
index 6c794fe51c899..0000000000000
--- a/sycl/unittests/pi/cuda/test_commands.cpp
+++ /dev/null
@@ -1,145 +0,0 @@
-//==---- test_commands.cpp --- PI unit tests -------------------------------==//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "gtest/gtest.h"
-
-#include <cuda.h>
-
-#include "CudaUtils.hpp"
-#include "TestGetPlugin.hpp"
-#include <detail/plugin.hpp>
-#include <pi_cuda.hpp>
-#include <sycl/detail/pi.hpp>
-#include <sycl/sycl.hpp>
-
-using namespace sycl;
-
-struct CudaCommandsTest : public ::testing::Test {
-
-protected:
-  std::optional<detail::PluginPtr> &plugin =
-      pi::initializeAndGet(backend::ext_oneapi_cuda);
-
-  pi_platform platform_;
-  pi_device device_;
-  pi_context context_;
-  pi_queue queue_;
-
-  void SetUp() override {
-    // skip the tests if the CUDA backend is not available
-    if (!plugin.has_value()) {
-      GTEST_SKIP();
-    }
-
-    pi::clearCudaContext();
-    pi_uint32 numPlatforms = 0;
-    ASSERT_EQ(plugin->hasBackend(backend::ext_oneapi_cuda), PI_SUCCESS);
-
-    ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piPlatformsGet>(
-                  0, nullptr, &numPlatforms)),
-              PI_SUCCESS)
-        << "piPlatformsGet failed.\n";
-
-    ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piPlatformsGet>(
-                  numPlatforms, &platform_, nullptr)),
-              PI_SUCCESS)
-        << "piPlatformsGet failed.\n";
-
-    ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piDevicesGet>(
-                  platform_, PI_DEVICE_TYPE_GPU, 1, &device_, nullptr)),
-              PI_SUCCESS);
-    ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piContextCreate>(
-                  nullptr, 1, &device_, nullptr, nullptr, &context_)),
-              PI_SUCCESS);
-    ASSERT_NE(context_, nullptr);
-
-    ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piQueueCreate>(
-                  context_, device_, 0, &queue_)),
-              PI_SUCCESS);
-    ASSERT_NE(queue_, nullptr);
-    auto tmpCtxt = queue_->get_context();
-    ASSERT_EQ(tmpCtxt, context_);
-  }
-
-  void TearDown() override {
-    if (plugin.has_value()) {
-      plugin->call<detail::PiApiKind::piQueueRelease>(queue_);
-      plugin->call<detail::PiApiKind::piContextRelease>(context_);
-    }
-  }
-
-  CudaCommandsTest() = default;
-
-  ~CudaCommandsTest() = default;
-};
-
-TEST_F(CudaCommandsTest, PIEnqueueReadBufferBlocking) {
-  constexpr const size_t memSize = 10u;
-  constexpr const size_t bytes = memSize * sizeof(int);
-  const int data[memSize] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
-  int output[memSize] = {};
-
-  pi_mem memObj;
-  ASSERT_EQ(
-      (plugin->call_nocheck<detail::PiApiKind::piMemBufferCreate>(
-          context_, PI_MEM_FLAGS_ACCESS_RW, bytes, nullptr, &memObj, nullptr)),
-      PI_SUCCESS);
-
-  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piEnqueueMemBufferWrite>(
-                queue_, memObj, true, 0, bytes, data, 0, nullptr, nullptr)),
-            PI_SUCCESS);
-
-  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piEnqueueMemBufferRead>(
-                queue_, memObj, true, 0, bytes, output, 0, nullptr, nullptr)),
-            PI_SUCCESS);
-
-  bool isSame =
-      std::equal(std::begin(output), std::end(output), std::begin(data));
-  EXPECT_TRUE(isSame);
-  if (!isSame) {
-    std::for_each(std::begin(output), std::end(output),
-                  [](int &elem) { std::cout << elem << ","; });
-    std::cout << std::endl;
-  }
-}
-
-TEST_F(CudaCommandsTest, PIEnqueueReadBufferNonBlocking) {
-  constexpr const size_t memSize = 10u;
-  constexpr const size_t bytes = memSize * sizeof(int);
-  const int data[memSize] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
-  int output[memSize] = {};
-
-  pi_mem memObj;
-  ASSERT_EQ(
-      (plugin->call_nocheck<detail::PiApiKind::piMemBufferCreate>(
-          context_, PI_MEM_FLAGS_ACCESS_RW, bytes, nullptr, &memObj, nullptr)),
-      PI_SUCCESS);
-
-  pi_event cpIn, cpOut;
-  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piEnqueueMemBufferWrite>(
-                queue_, memObj, false, 0, bytes, data, 0, nullptr, &cpIn)),
-            PI_SUCCESS);
-  ASSERT_NE(cpIn, nullptr);
-
-  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piEnqueueMemBufferRead>(
-                queue_, memObj, false, 0, bytes, output, 0, nullptr, &cpOut)),
-            PI_SUCCESS);
-  ASSERT_NE(cpOut, nullptr);
-
-  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piEventsWait>(1, &cpOut)),
-            PI_SUCCESS);
-
-  bool isSame =
-      std::equal(std::begin(output), std::end(output), std::begin(data));
-  EXPECT_TRUE(isSame);
-  if (!isSame) {
-    std::for_each(std::begin(output), std::end(output),
-                  [](int &elem) { std::cout << elem << ","; });
-    std::cout << std::endl;
-  }
-}
diff --git a/sycl/unittests/pi/cuda/test_contexts.cpp b/sycl/unittests/pi/cuda/test_contexts.cpp
deleted file mode 100644
index 7113537ebf147..0000000000000
--- a/sycl/unittests/pi/cuda/test_contexts.cpp
+++ /dev/null
@@ -1,250 +0,0 @@
-//==---- test_contexts.cpp --- PI unit tests -------------------------------==//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "gtest/gtest.h"
-
-#include <condition_variable>
-#include <mutex>
-#include <thread>
-
-#include <cuda.h>
-
-#include "CudaUtils.hpp"
-#include "TestGetPlugin.hpp"
-#include <detail/plugin.hpp>
-#include <pi_cuda.hpp>
-#include <sycl/detail/pi.hpp>
-#include <sycl/sycl.hpp>
-
-using namespace sycl;
-
-struct CudaContextsTest : public ::testing::Test {
-
-protected:
-  std::optional<detail::PluginPtr> &plugin =
-      pi::initializeAndGet(backend::ext_oneapi_cuda);
-
-  pi_platform platform_;
-  pi_device device_;
-
-  void SetUp() override {
-    // skip the tests if the CUDA backend is not available
-    if (!plugin.has_value()) {
-      GTEST_SKIP();
-    }
-
-    pi_uint32 numPlatforms = 0;
-    ASSERT_EQ(plugin->hasBackend(backend::ext_oneapi_cuda), PI_SUCCESS);
-
-    ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piPlatformsGet>(
-                  0, nullptr, &numPlatforms)),
-              PI_SUCCESS)
-        << "piPlatformsGet failed.\n";
-
-    ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piPlatformsGet>(
-                  numPlatforms, &platform_, nullptr)),
-              PI_SUCCESS)
-        << "piPlatformsGet failed.\n";
-
-    ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piDevicesGet>(
-                  platform_, PI_DEVICE_TYPE_GPU, 1, &device_, nullptr)),
-              PI_SUCCESS);
-  }
-
-  void TearDown() override {}
-
-  CudaContextsTest() = default;
-
-  ~CudaContextsTest() = default;
-};
-
-TEST_F(CudaContextsTest, ContextLifetime) {
-  // start with no active context
-  pi::clearCudaContext();
-
-  // create a context
-  pi_context context;
-  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piContextCreate>(
-                nullptr, 1, &device_, nullptr, nullptr, &context)),
-            PI_SUCCESS);
-  ASSERT_NE(context, nullptr);
-
-  // create a queue from the context, this should use the ScopedContext
-  pi_queue queue;
-  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piQueueCreate>(
-                context, device_, 0, &queue)),
-            PI_SUCCESS);
-  ASSERT_NE(queue, nullptr);
-
-  // ensure the queue has the correct context
-  ASSERT_EQ(context, queue->get_context());
-
-  // check that the context is now the active CUDA context
-  CUcontext cudaCtxt = nullptr;
-  cuCtxGetCurrent(&cudaCtxt);
-  ASSERT_EQ(cudaCtxt, context->get());
-
-  plugin->call<detail::PiApiKind::piQueueRelease>(queue);
-  plugin->call<detail::PiApiKind::piContextRelease>(context);
-
-  // check that the context was cleaned up properly by the destructor
-  cuCtxGetCurrent(&cudaCtxt);
-  ASSERT_EQ(cudaCtxt, nullptr);
-}
-
-TEST_F(CudaContextsTest, ContextLifetimeExisting) {
-  // start by setting up a CUDA context on the thread
-  CUcontext original;
-  cuCtxCreate(&original, CU_CTX_MAP_HOST, device_->get());
-
-  // ensure the CUDA context is active
-  CUcontext current = nullptr;
-  cuCtxGetCurrent(&current);
-  ASSERT_EQ(original, current);
-
-  // create a PI context
-  pi_context context;
-  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piContextCreate>(
-                nullptr, 1, &device_, nullptr, nullptr, &context)),
-            PI_SUCCESS);
-  ASSERT_NE(context, nullptr);
-
-  // create a queue from the context, this should use the ScopedContext
-  pi_queue queue;
-  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piQueueCreate>(
-                context, device_, 0, &queue)),
-            PI_SUCCESS);
-  ASSERT_NE(queue, nullptr);
-
-  // ensure the queue has the correct context
-  ASSERT_EQ(context, queue->get_context());
-
-  // check that the context is now the active CUDA context
-  cuCtxGetCurrent(&current);
-  ASSERT_EQ(current, context->get());
-
-  plugin->call<detail::PiApiKind::piQueueRelease>(queue);
-  plugin->call<detail::PiApiKind::piContextRelease>(context);
-
-  // check that the context was cleaned up, the old context will be restored
-  // automatically by cuCtxDestroy in piContextRelease, as it was pushed on the
-  // stack bu cuCtxCreate
-  cuCtxGetCurrent(&current);
-  ASSERT_EQ(current, original);
-
-  // release original context
-  cuCtxDestroy(original);
-}
-
-// In some cases (for host_task), the SYCL runtime may call PI API functions
-// from threads of the thread pool, this can cause issues because with the CUDA
-// plugin these functions will set an active CUDA context on these threads, but
-// never clean it up, as it will only get cleaned up in the main thread.
-//
-// So the following test aims to reproduce the scenario where there is a
-// dangling deleted context in a separate thread and seeing if the PI calls are
-// still able to work correctly in that thread.
-TEST_F(CudaContextsTest, ContextThread) {
-  // start with no active context
-  pi::clearCudaContext();
-
-  // create two PI contexts
-  pi_context context1;
-  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piContextCreate>(
-                nullptr, 1, &device_, nullptr, nullptr, &context1)),
-            PI_SUCCESS);
-  ASSERT_NE(context1, nullptr);
-
-  pi_context context2;
-  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piContextCreate>(
-                nullptr, 1, &device_, nullptr, nullptr, &context2)),
-            PI_SUCCESS);
-  ASSERT_NE(context2, nullptr);
-
-  // setup synchronization variables between the main thread and the testing
-  // thread
-  std::mutex m;
-  std::condition_variable cv;
-  bool released = false;
-  bool thread_done = false;
-
-  // create a testing thread that will create a queue with the first context,
-  // release the queue, then wait for the main thread to release the first
-  // context, and then create and release another queue with the second context
-  // this time
-  auto test_thread = std::thread([&] {
-    CUcontext current = nullptr;
-
-    // create a queue with the first context
-    pi_queue queue;
-    ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piQueueCreate>(
-                  context1, device_, 0, &queue)),
-              PI_SUCCESS);
-    ASSERT_NE(queue, nullptr);
-
-    // ensure the queue has the correct context
-    ASSERT_EQ(context1, queue->get_context());
-
-    // check that the first context is now the active CUDA context
-    cuCtxGetCurrent(&current);
-    ASSERT_EQ(current, context1->get());
-
-    plugin->call<detail::PiApiKind::piQueueRelease>(queue);
-
-    // mark the first set of processing as done and notify the main thread
-    std::unique_lock<std::mutex> lock(m);
-    thread_done = true;
-    lock.unlock();
-    cv.notify_one();
-
-    // wait for the main thread to release the first context
-    lock.lock();
-    cv.wait(lock, [&] { return released; });
-
-    // check that the first context is still active, this is because deleting a
-    // context only cleans up the current thread
-    cuCtxGetCurrent(&current);
-    ASSERT_EQ(current, context1->get());
-
-    // create a queue with the second context
-    ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piQueueCreate>(
-                  context2, device_, 0, &queue)),
-              PI_SUCCESS);
-    ASSERT_NE(queue, nullptr);
-
-    // ensure the queue has the correct context
-    ASSERT_EQ(context2, queue->get_context());
-
-    // check that the second context is now the active CUDA context
-    cuCtxGetCurrent(&current);
-    ASSERT_EQ(current, context2->get());
-
-    plugin->call<detail::PiApiKind::piQueueRelease>(queue);
-  });
-
-  // wait for the thread to be done with the first queue to release the first
-  // context
-  std::unique_lock<std::mutex> lock(m);
-  cv.wait(lock, [&] { return thread_done; });
-  plugin->call<detail::PiApiKind::piContextRelease>(context1);
-
-  // notify the other thread that the context was released
-  released = true;
-  lock.unlock();
-  cv.notify_one();
-
-  // wait for the thread to finish
-  test_thread.join();
-
-  plugin->call<detail::PiApiKind::piContextRelease>(context2);
-
-  // check that there is no context set on the main thread
-  CUcontext current = nullptr;
-  cuCtxGetCurrent(&current);
-  ASSERT_EQ(current, nullptr);
-}
diff --git a/sycl/unittests/pi/cuda/test_device.cpp b/sycl/unittests/pi/cuda/test_device.cpp
deleted file mode 100644
index 9ddb62ca1a512..0000000000000
--- a/sycl/unittests/pi/cuda/test_device.cpp
+++ /dev/null
@@ -1,111 +0,0 @@
-//==---- test_device.cpp --- PI unit tests ---------------------------------==//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "gtest/gtest.h"
-
-#include <cuda.h>
-
-#include "TestGetPlugin.hpp"
-#include <detail/plugin.hpp>
-#include <pi_cuda.hpp>
-#include <sycl/detail/pi.hpp>
-#include <sycl/sycl.hpp>
-
-using namespace sycl;
-
-struct CudaDeviceTests : public ::testing::Test {
-
-protected:
-  std::optional<detail::PluginPtr> &plugin =
-      pi::initializeAndGet(backend::ext_oneapi_cuda);
-
-  pi_platform platform_;
-  pi_device device_;
-  pi_context context_;
-
-  void SetUp() override {
-    // skip the tests if the CUDA backend is not available
-    if (!plugin.has_value()) {
-      GTEST_SKIP();
-    }
-
-    pi_uint32 numPlatforms = 0;
-    ASSERT_EQ(plugin->hasBackend(backend::ext_oneapi_cuda), PI_SUCCESS);
-
-    ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piPlatformsGet>(
-                  0, nullptr, &numPlatforms)),
-              PI_SUCCESS)
-        << "piPlatformsGet failed.\n";
-
-    ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piPlatformsGet>(
-                  numPlatforms, &platform_, nullptr)),
-              PI_SUCCESS)
-        << "piPlatformsGet failed.\n";
-
-    ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piDevicesGet>(
-                  platform_, PI_DEVICE_TYPE_GPU, 1, &device_, nullptr)),
-              PI_SUCCESS);
-    ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piContextCreate>(
-                  nullptr, 1, &device_, nullptr, nullptr, &context_)),
-              PI_SUCCESS);
-    EXPECT_NE(context_, nullptr);
-  }
-
-  void TearDown() override {
-    if (plugin.has_value()) {
-      plugin->call<detail::PiApiKind::piDeviceRelease>(device_);
-      plugin->call<detail::PiApiKind::piContextRelease>(context_);
-    }
-  }
-
-  CudaDeviceTests() = default;
-  ~CudaDeviceTests() = default;
-};
-
-TEST_F(CudaDeviceTests, PIDeviceGetInfoSimple) {
-
-  size_t return_size = 0;
-  pi_device_type device_type;
-  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piDeviceGetInfo>(
-                device_, PI_DEVICE_INFO_TYPE, sizeof(pi_device_type),
-                &device_type, &return_size)),
-            PI_SUCCESS);
-  EXPECT_EQ(return_size, sizeof(pi_device_type));
-  EXPECT_EQ(
-      device_type,
-      PI_DEVICE_TYPE_GPU); // backend pre-defined value, device must be a GPU
-
-  pi_device parent_device = nullptr;
-  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piDeviceGetInfo>(
-                device_, PI_DEVICE_INFO_PARENT_DEVICE, sizeof(pi_device),
-                &parent_device, &return_size)),
-            PI_SUCCESS);
-  EXPECT_EQ(return_size, sizeof(pi_device));
-  EXPECT_EQ(parent_device,
-            nullptr); // backend pre-set value, device cannot have a parent
-
-  pi_platform platform = nullptr;
-  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piDeviceGetInfo>(
-                device_, PI_DEVICE_INFO_PLATFORM, sizeof(pi_platform),
-                &platform, &return_size)),
-            PI_SUCCESS);
-  EXPECT_EQ(return_size, sizeof(pi_platform));
-  EXPECT_EQ(platform, platform_); // test fixture device was created from the
-                                  // test fixture platform
-
-  cl_device_partition_property device_partition_property = -1;
-  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piDeviceGetInfo>(
-                device_, PI_DEVICE_INFO_PARTITION_TYPE,
-                sizeof(cl_device_partition_property),
-                &device_partition_property, &return_size)),
-            PI_SUCCESS);
-  EXPECT_EQ(device_partition_property,
-            0); // PI CUDA backend will not support device partitioning, this
-                // function should just return 0.
-  EXPECT_EQ(return_size, sizeof(cl_device_partition_property));
-}
diff --git a/sycl/unittests/pi/cuda/test_interop_get_native.cpp b/sycl/unittests/pi/cuda/test_interop_get_native.cpp
deleted file mode 100644
index 903d44043cda6..0000000000000
--- a/sycl/unittests/pi/cuda/test_interop_get_native.cpp
+++ /dev/null
@@ -1,137 +0,0 @@
-//==------- test_interop_get_native.cpp - SYCL CUDA get_native tests -------==//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "gtest/gtest.h"
-
-#include <cuda.h>
-
-#include "TestGetPlatforms.hpp"
-#include <sycl/sycl.hpp>
-
-#include <iostream>
-
-using namespace sycl;
-
-struct CudaInteropGetNativeTests : public ::testing::TestWithParam<platform> {
-
-protected:
-  std::unique_ptr<queue> syclQueue_;
-  device syclDevice_;
-
-  void SetUp() override {
-    syclDevice_ = GetParam().get_devices()[0];
-    syclQueue_ = std::unique_ptr<queue>{new queue{syclDevice_}};
-  }
-
-  void TearDown() override { syclQueue_.reset(); }
-};
-
-TEST_P(CudaInteropGetNativeTests, getNativeDevice) {
-  CUdevice cudaDevice = get_native<backend::ext_oneapi_cuda>(syclDevice_);
-  char cudaDeviceName[2] = {0, 0};
-  CUresult result = cuDeviceGetName(cudaDeviceName, 2, cudaDevice);
-  ASSERT_EQ(result, CUDA_SUCCESS);
-  ASSERT_NE(cudaDeviceName[0], 0);
-}
-
-TEST_P(CudaInteropGetNativeTests, getNativeContext) {
-  CUcontext cudaContext =
-      get_native<backend::ext_oneapi_cuda>(syclQueue_->get_context());
-  ASSERT_NE(cudaContext, nullptr);
-}
-
-TEST_P(CudaInteropGetNativeTests, getNativeQueue) {
-  CUstream cudaStream = get_native<backend::ext_oneapi_cuda>(*syclQueue_);
-  ASSERT_NE(cudaStream, nullptr);
-
-  CUcontext streamContext = nullptr;
-  CUresult result = cuStreamGetCtx(cudaStream, &streamContext);
-  ASSERT_EQ(result, CUDA_SUCCESS);
-
-  CUcontext cudaContext =
-      get_native<backend::ext_oneapi_cuda>(syclQueue_->get_context());
-  ASSERT_EQ(streamContext, cudaContext);
-}
-
-TEST_P(CudaInteropGetNativeTests, interopTaskGetMem) {
-  buffer<int, 1> syclBuffer(range<1>{1});
-  syclQueue_->submit([&](handler &cgh) {
-    auto syclAccessor = syclBuffer.get_access<access::mode::read>(cgh);
-    cgh.host_task([=](interop_handle ih) {
-      CUdeviceptr cudaPtr =
-          ih.get_native_mem<backend::ext_oneapi_cuda>(syclAccessor);
-      CUdeviceptr cudaPtrBase;
-      size_t cudaPtrSize = 0;
-      CUcontext cudaContext =
-          get_native<backend::ext_oneapi_cuda>(syclQueue_->get_context());
-      ASSERT_EQ(CUDA_SUCCESS, cuCtxPushCurrent(cudaContext));
-      ASSERT_EQ(CUDA_SUCCESS,
-                cuMemGetAddressRange(&cudaPtrBase, &cudaPtrSize, cudaPtr));
-      ASSERT_EQ(CUDA_SUCCESS, cuCtxPopCurrent(nullptr));
-      ASSERT_EQ(sizeof(int), cudaPtrSize);
-    });
-  });
-}
-
-TEST_P(CudaInteropGetNativeTests, interopTaskGetQueue) {
-  CUstream cudaStream = get_native<backend::ext_oneapi_cuda>(*syclQueue_);
-  syclQueue_->submit([&](handler &cgh) {
-    cgh.host_task([=](interop_handle ih) {
-      CUstream cudaInteropStream =
-          ih.get_native_queue<backend::ext_oneapi_cuda>();
-      ASSERT_EQ(cudaInteropStream, cudaStream);
-    });
-  });
-}
-
-TEST_P(CudaInteropGetNativeTests, hostTaskGetNativeMem) {
-  buffer<int, 1> syclBuffer(range<1>{1});
-  syclQueue_->submit([&](handler &cgh) {
-    auto syclAccessor = syclBuffer.get_access<access::mode::read>(cgh);
-    cgh.host_task([=](interop_handle ih) {
-      CUdeviceptr cudaPtr =
-          ih.get_native_mem<backend::ext_oneapi_cuda>(syclAccessor);
-      CUdeviceptr cudaPtrBase;
-      size_t cudaPtrSize = 0;
-      CUcontext cudaContext =
-          get_native<backend::ext_oneapi_cuda>(syclQueue_->get_context());
-      ASSERT_EQ(CUDA_SUCCESS, cuCtxPushCurrent(cudaContext));
-      ASSERT_EQ(CUDA_SUCCESS,
-                cuMemGetAddressRange(&cudaPtrBase, &cudaPtrSize, cudaPtr));
-      ASSERT_EQ(CUDA_SUCCESS, cuCtxPopCurrent(nullptr));
-      ASSERT_EQ(sizeof(int), cudaPtrSize);
-    });
-  });
-}
-
-TEST_P(CudaInteropGetNativeTests, hostTaskGetNativeQueue) {
-  CUstream cudaStream = get_native<backend::ext_oneapi_cuda>(*syclQueue_);
-  syclQueue_->submit([&](handler &cgh) {
-    cgh.host_task([=](interop_handle ih) {
-      CUstream cudaInteropStream =
-          ih.get_native_queue<backend::ext_oneapi_cuda>();
-      ASSERT_EQ(cudaInteropStream, cudaStream);
-    });
-  });
-}
-
-TEST_P(CudaInteropGetNativeTests, hostTaskGetNativeContext) {
-  CUcontext cudaContext =
-      get_native<backend::ext_oneapi_cuda>(syclQueue_->get_context());
-  syclQueue_->submit([&](handler &cgh) {
-    cgh.host_task([=](interop_handle ih) {
-      CUcontext cudaInteropContext =
-          ih.get_native_context<backend::ext_oneapi_cuda>();
-      ASSERT_EQ(cudaInteropContext, cudaContext);
-    });
-  });
-}
-
-INSTANTIATE_TEST_SUITE_P(
-    OnCudaPlatform, CudaInteropGetNativeTests,
-    ::testing::ValuesIn(pi::getPlatformsWithName("CUDA BACKEND")));
diff --git a/sycl/unittests/pi/cuda/test_kernels.cpp b/sycl/unittests/pi/cuda/test_kernels.cpp
deleted file mode 100644
index 736e266b6566e..0000000000000
--- a/sycl/unittests/pi/cuda/test_kernels.cpp
+++ /dev/null
@@ -1,466 +0,0 @@
-//==---- test_kernels.cpp --- PI unit tests --------------------------------==//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "gtest/gtest.h"
-
-#include <cuda.h>
-
-#include "TestGetPlugin.hpp"
-#include <detail/plugin.hpp>
-#include <pi_cuda.hpp>
-#include <sycl/detail/pi.hpp>
-#include <sycl/sycl.hpp>
-
-// PI CUDA kernels carry an additional argument for the implicit global offset.
-#define NUM_IMPLICIT_ARGS 1
-
-using namespace sycl;
-
-struct CudaKernelsTest : public ::testing::Test {
-
-protected:
-  std::optional<detail::PluginPtr> &plugin =
-      pi::initializeAndGet(backend::ext_oneapi_cuda);
-  pi_platform platform_;
-  pi_device device_;
-  pi_context context_;
-  pi_queue queue_;
-
-  void SetUp() override {
-    // skip the tests if the CUDA backend is not available
-    if (!plugin.has_value()) {
-      GTEST_SKIP();
-    }
-
-    pi_uint32 numPlatforms = 0;
-    ASSERT_EQ(plugin->hasBackend(backend::ext_oneapi_cuda), PI_SUCCESS);
-
-    ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piPlatformsGet>(
-                  0, nullptr, &numPlatforms)),
-              PI_SUCCESS)
-        << "piPlatformsGet failed.\n";
-
-    ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piPlatformsGet>(
-                  numPlatforms, &platform_, nullptr)),
-              PI_SUCCESS)
-        << "piPlatformsGet failed.\n";
-
-    ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piDevicesGet>(
-                  platform_, PI_DEVICE_TYPE_GPU, 1, &device_, nullptr)),
-              PI_SUCCESS);
-    ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piContextCreate>(
-                  nullptr, 1, &device_, nullptr, nullptr, &context_)),
-              PI_SUCCESS);
-    ASSERT_NE(context_, nullptr);
-
-    ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piQueueCreate>(
-                  context_, device_, 0, &queue_)),
-              PI_SUCCESS);
-    ASSERT_NE(queue_, nullptr);
-    ASSERT_EQ(queue_->get_context(), context_);
-  }
-
-  void TearDown() override {
-    if (plugin.has_value()) {
-      plugin->call<detail::PiApiKind::piDeviceRelease>(device_);
-      plugin->call<detail::PiApiKind::piQueueRelease>(queue_);
-      plugin->call<detail::PiApiKind::piContextRelease>(context_);
-    }
-  }
-
-  CudaKernelsTest() = default;
-
-  ~CudaKernelsTest() = default;
-};
-
-const char *ptxSource = "\n\
-.version 3.2\n\
-.target sm_20\n\
-.address_size 64\n\
-.visible .entry _Z8myKernelPi(\n\
-	.param .u64 _Z8myKernelPi_param_0\n\
-)\n\
-{\n\
-	.reg .s32 	%r<5>;\n\
-	.reg .s64 	%rd<5>;\n\
-	ld.param.u64 	%rd1, [_Z8myKernelPi_param_0];\n\
-	cvta.to.global.u64 	%rd2, %rd1;\n\
-	.loc 1 3 1\n\
-	mov.u32 	%r1, %ntid.x;\n\
-	mov.u32 	%r2, %ctaid.x;\n\
-	mov.u32 	%r3, %tid.x;\n\
-	mad.lo.s32 	%r4, %r1, %r2, %r3;\n\
-	mul.wide.s32 	%rd3, %r4, 4;\n\
-	add.s64 	%rd4, %rd2, %rd3;\n\
-	.loc 1 4 1\n\
-	st.global.u32 	[%rd4], %r4;\n\
-	.loc 1 5 2\n\
-	ret;\n\
-    ret;\
-\n\
-}\
-\n\
-";
-
-const char *twoParams = "\n\
-.version 3.2\n\
-.target sm_20\n\
-.address_size 64\n\
-.visible .entry twoParamKernel(\n\
-	.param .u64 twoParamKernel_param_0,\n\
-  .param .u64 twoParamKernel_param_1\n\
-)\n\
-{\n\
-  ret;\
-  \n\
-}\n\
-";
-
-const char *threeParamsTwoLocal = "\n\
-.version 3.2\n\
-.target sm_20\n\
-.address_size 64\n\
-.visible .entry twoParamKernelLocal(\n\
-	.param .u64 twoParamKernel_param_0,\n\
-  .param .u32 twoParamKernel_param_1,\n\
-  .param .u32 twoParamKernel_param_2\n\
-)\n\
-{\n\
-  ret;\
-  \n\
-}\n\
-";
-
-TEST_F(CudaKernelsTest, PICreateProgramAndKernel) {
-
-  pi_program prog;
-  pi_int32 binary_status = PI_SUCCESS;
-  ASSERT_EQ(
-      (plugin->call_nocheck<detail::PiApiKind::piProgramCreateWithBinary>(
-          context_, 1, &device_, nullptr, (const unsigned char **)&ptxSource, 0,
-          nullptr, &binary_status, &prog)),
-      PI_SUCCESS);
-
-  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piProgramBuild>(
-                prog, 1, &device_, "", nullptr, nullptr)),
-            PI_SUCCESS);
-
-  pi_kernel kern;
-  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piKernelCreate>(
-                prog, "_Z8myKernelPi", &kern)),
-            PI_SUCCESS);
-  ASSERT_NE(kern, nullptr);
-}
-
-TEST_F(CudaKernelsTest, PICreateProgramAndKernelWithMetadata) {
-
-  std::vector<uint32_t> reqdWorkGroupSizeMD;
-  reqdWorkGroupSizeMD.reserve(5);
-  // 64-bit representing bit size
-  reqdWorkGroupSizeMD.push_back(96);
-  reqdWorkGroupSizeMD.push_back(0);
-  // reqd_work_group_size x
-  reqdWorkGroupSizeMD.push_back(8);
-  // reqd_work_group_size y
-  reqdWorkGroupSizeMD.push_back(16);
-  // reqd_work_group_size z
-  reqdWorkGroupSizeMD.push_back(32);
-
-  const char *reqdWorkGroupSizeMDConstName =
-      "_Z8myKernelPi@reqd_work_group_size";
-  std::vector<char> reqdWorkGroupSizeMDName(
-      reqdWorkGroupSizeMDConstName,
-      reqdWorkGroupSizeMDConstName + strlen(reqdWorkGroupSizeMDConstName) + 1);
-  _pi_device_binary_property_struct reqdWorkGroupSizeMDProp = {
-      reqdWorkGroupSizeMDName.data(), reqdWorkGroupSizeMD.data(),
-      pi_property_type::PI_PROPERTY_TYPE_BYTE_ARRAY,
-      sizeof(std::uint64_t) + sizeof(std::uint32_t) * 3};
-  pi_device_binary_property reqdWorkGroupSizeMDPropPointer =
-      &reqdWorkGroupSizeMDProp;
-
-  pi_program prog;
-  pi_int32 binary_status = PI_SUCCESS;
-  ASSERT_EQ(
-      (plugin->call_nocheck<detail::PiApiKind::piProgramCreateWithBinary>(
-          context_, 1, &device_, nullptr, (const unsigned char **)&ptxSource, 1,
-          &reqdWorkGroupSizeMDPropPointer, &binary_status, &prog)),
-      PI_SUCCESS);
-
-  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piProgramBuild>(
-                prog, 1, &device_, "", nullptr, nullptr)),
-            PI_SUCCESS);
-
-  pi_kernel kern;
-  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piKernelCreate>(
-                prog, "_Z8myKernelPi", &kern)),
-            PI_SUCCESS);
-  ASSERT_NE(kern, nullptr);
-
-  size_t compileWGSize[3] = {0};
-  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piKernelGetGroupInfo>(
-                kern, device_, PI_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE,
-                sizeof(size_t) * 3, compileWGSize, nullptr)),
-            PI_SUCCESS);
-  for (int i = 0; i < 3; ++i) {
-    ASSERT_EQ(compileWGSize[i], reqdWorkGroupSizeMD[i + 2]);
-  }
-}
-
-TEST_F(CudaKernelsTest, PIKernelArgumentSimple) {
-
-  pi_program prog;
-  /// NOTE: `binary_status` currently unsused in the CUDA backend but in case we
-  /// use it at some point in the future, pass it anyway and check the result.
-  /// Same goes for all the other tests in this file.
-  pi_int32 binary_status = PI_SUCCESS;
-  ASSERT_EQ(
-      (plugin->call_nocheck<detail::PiApiKind::piProgramCreateWithBinary>(
-          context_, 1, &device_, nullptr, (const unsigned char **)&ptxSource, 0,
-          nullptr, &binary_status, &prog)),
-      PI_SUCCESS);
-  ASSERT_EQ(binary_status, PI_SUCCESS);
-
-  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piProgramBuild>(
-                prog, 1, &device_, "", nullptr, nullptr)),
-            PI_SUCCESS);
-
-  pi_kernel kern;
-  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piKernelCreate>(
-                prog, "_Z8myKernelPi", &kern)),
-            PI_SUCCESS);
-
-  int number = 10;
-  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piKernelSetArg>(
-                kern, 0, sizeof(int), &number)),
-            PI_SUCCESS);
-  const auto &kernArgs = kern->get_arg_indices();
-  ASSERT_EQ(kernArgs.size(), (size_t)1 + NUM_IMPLICIT_ARGS);
-  int storedValue = *(static_cast<const int *>(kernArgs[0]));
-  ASSERT_EQ(storedValue, number);
-}
-
-TEST_F(CudaKernelsTest, PIKernelArgumentSetTwice) {
-
-  pi_program prog;
-  pi_int32 binary_status = PI_SUCCESS;
-  ASSERT_EQ(
-      (plugin->call_nocheck<detail::PiApiKind::piProgramCreateWithBinary>(
-          context_, 1, &device_, nullptr, (const unsigned char **)&ptxSource, 0,
-          nullptr, &binary_status, &prog)),
-      PI_SUCCESS);
-  ASSERT_EQ(binary_status, PI_SUCCESS);
-
-  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piProgramBuild>(
-                prog, 1, &device_, "", nullptr, nullptr)),
-            PI_SUCCESS);
-
-  pi_kernel kern;
-  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piKernelCreate>(
-                prog, "_Z8myKernelPi", &kern)),
-            PI_SUCCESS);
-
-  int number = 10;
-  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piKernelSetArg>(
-                kern, 0, sizeof(int), &number)),
-            PI_SUCCESS);
-  const auto &kernArgs = kern->get_arg_indices();
-  ASSERT_GT(kernArgs.size(), (size_t)0 + NUM_IMPLICIT_ARGS);
-  int storedValue = *(static_cast<const int *>(kernArgs[0]));
-  ASSERT_EQ(storedValue, number);
-
-  int otherNumber = 934;
-  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piKernelSetArg>(
-                kern, 0, sizeof(int), &otherNumber)),
-            PI_SUCCESS);
-  const auto &kernArgs2 = kern->get_arg_indices();
-  ASSERT_EQ(kernArgs2.size(), (size_t)1 + NUM_IMPLICIT_ARGS);
-  storedValue = *(static_cast<const int *>(kernArgs2[0]));
-  ASSERT_EQ(storedValue, otherNumber);
-}
-
-TEST_F(CudaKernelsTest, PIKernelSetMemObj) {
-
-  pi_program prog;
-  pi_int32 binary_status = PI_SUCCESS;
-  ASSERT_EQ(
-      (plugin->call_nocheck<detail::PiApiKind::piProgramCreateWithBinary>(
-          context_, 1, &device_, nullptr, (const unsigned char **)&ptxSource, 0,
-          nullptr, &binary_status, &prog)),
-      PI_SUCCESS);
-  ASSERT_EQ(binary_status, PI_SUCCESS);
-
-  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piProgramBuild>(
-                prog, 1, &device_, "", nullptr, nullptr)),
-            PI_SUCCESS);
-
-  pi_kernel kern;
-  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piKernelCreate>(
-                prog, "_Z8myKernelPi", &kern)),
-            PI_SUCCESS);
-
-  size_t memSize = 1024u;
-  pi_mem memObj;
-  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piMemBufferCreate>(
-                context_, PI_MEM_FLAGS_ACCESS_RW, memSize, nullptr, &memObj,
-                nullptr)),
-            PI_SUCCESS);
-
-  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piKernelSetArg>(
-                kern, 0, sizeof(pi_mem), &memObj)),
-            PI_SUCCESS);
-  const auto &kernArgs = kern->get_arg_indices();
-  ASSERT_EQ(kernArgs.size(), (size_t)1 + NUM_IMPLICIT_ARGS);
-  pi_mem storedValue = *(static_cast<pi_mem *>(kernArgs[0]));
-  ASSERT_EQ(storedValue, memObj);
-}
-
-TEST_F(CudaKernelsTest, PIkerneldispatch) {
-
-  pi_program prog;
-  pi_int32 binary_status = PI_SUCCESS;
-  ASSERT_EQ(
-      (plugin->call_nocheck<detail::PiApiKind::piProgramCreateWithBinary>(
-          context_, 1, &device_, nullptr, (const unsigned char **)&ptxSource, 0,
-          nullptr, &binary_status, &prog)),
-      PI_SUCCESS);
-  ASSERT_EQ(binary_status, PI_SUCCESS);
-
-  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piProgramBuild>(
-                prog, 1, &device_, "", nullptr, nullptr)),
-            PI_SUCCESS);
-
-  pi_kernel kern;
-  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piKernelCreate>(
-                prog, "_Z8myKernelPi", &kern)),
-            PI_SUCCESS);
-
-  size_t memSize = 1024u;
-  pi_mem memObj;
-  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piMemBufferCreate>(
-                context_, PI_MEM_FLAGS_ACCESS_RW, memSize, nullptr, &memObj,
-                nullptr)),
-            PI_SUCCESS);
-
-  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piextKernelSetArgMemObj>(
-                kern, 0, nullptr, &memObj)),
-            PI_SUCCESS);
-
-  size_t workDim = 1;
-  size_t globalWorkOffset[] = {0};
-  size_t globalWorkSize[] = {1};
-  size_t localWorkSize[] = {1};
-  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piEnqueueKernelLaunch>(
-                queue_, kern, workDim, globalWorkOffset, globalWorkSize,
-                localWorkSize, 0, nullptr, nullptr)),
-            PI_SUCCESS);
-
-  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piMemRelease>(memObj)),
-            PI_SUCCESS);
-}
-
-TEST_F(CudaKernelsTest, PIkerneldispatchTwo) {
-
-  pi_program prog;
-  pi_int32 binary_status = PI_SUCCESS;
-  ASSERT_EQ(
-      (plugin->call_nocheck<detail::PiApiKind::piProgramCreateWithBinary>(
-          context_, 1, &device_, nullptr, (const unsigned char **)&twoParams, 0,
-          nullptr, &binary_status, &prog)),
-      PI_SUCCESS);
-  ASSERT_EQ(binary_status, PI_SUCCESS);
-
-  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piProgramBuild>(
-                prog, 1, &device_, "", nullptr, nullptr)),
-            PI_SUCCESS);
-
-  pi_kernel kern;
-  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piKernelCreate>(
-                prog, "twoParamKernel", &kern)),
-            PI_SUCCESS);
-
-  size_t memSize = 1024u;
-  pi_mem memObj;
-  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piMemBufferCreate>(
-                context_, PI_MEM_FLAGS_ACCESS_RW, memSize, nullptr, &memObj,
-                nullptr)),
-            PI_SUCCESS);
-
-  pi_mem memObj2;
-  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piMemBufferCreate>(
-                context_, PI_MEM_FLAGS_ACCESS_RW, memSize, nullptr, &memObj2,
-                nullptr)),
-            PI_SUCCESS);
-
-  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piextKernelSetArgMemObj>(
-                kern, 0, nullptr, &memObj)),
-            PI_SUCCESS);
-
-  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piextKernelSetArgMemObj>(
-                kern, 1, nullptr, &memObj2)),
-            PI_SUCCESS);
-
-  size_t workDim = 1;
-  size_t globalWorkOffset[] = {0};
-  size_t globalWorkSize[] = {1};
-  size_t localWorkSize[] = {1};
-  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piEnqueueKernelLaunch>(
-                queue_, kern, workDim, globalWorkOffset, globalWorkSize,
-                localWorkSize, 0, nullptr, nullptr)),
-            PI_SUCCESS);
-
-  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piMemRelease>(memObj)),
-            PI_SUCCESS);
-  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piMemRelease>(memObj2)),
-            PI_SUCCESS);
-}
-
-TEST_F(CudaKernelsTest, PIKernelArgumentSetTwiceOneLocal) {
-
-  pi_program prog;
-  pi_int32 binary_status = PI_SUCCESS;
-  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piProgramCreateWithBinary>(
-                context_, 1, &device_, nullptr,
-                (const unsigned char **)&threeParamsTwoLocal, 0, nullptr,
-                &binary_status, &prog)),
-            PI_SUCCESS);
-  ASSERT_EQ(binary_status, PI_SUCCESS);
-
-  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piProgramBuild>(
-                prog, 1, &device_, "", nullptr, nullptr)),
-            PI_SUCCESS);
-
-  pi_kernel kern;
-  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piKernelCreate>(
-                prog, "twoParamKernelLocal", &kern)),
-            PI_SUCCESS);
-
-  int number = 10;
-  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piKernelSetArg>(
-                kern, 0, sizeof(int), &number)),
-            PI_SUCCESS);
-  const auto &kernArgs = kern->get_arg_indices();
-  ASSERT_GT(kernArgs.size(), (size_t)0 + NUM_IMPLICIT_ARGS);
-  int storedValue = *(static_cast<const int *>(kernArgs[0]));
-  ASSERT_EQ(storedValue, number);
-
-  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piKernelSetArg>(
-                kern, 1, sizeof(int), nullptr)),
-            PI_SUCCESS);
-  const auto &kernArgs2 = kern->get_arg_indices();
-  ASSERT_EQ(kernArgs2.size(), (size_t)2 + NUM_IMPLICIT_ARGS);
-  storedValue = *(static_cast<const int *>(kernArgs2[1]));
-  ASSERT_EQ(storedValue, 0);
-
-  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piKernelSetArg>(
-                kern, 2, sizeof(int), nullptr)),
-            PI_SUCCESS);
-  const auto &kernArgs3 = kern->get_arg_indices();
-  ASSERT_EQ(kernArgs3.size(), (size_t)3 + NUM_IMPLICIT_ARGS);
-  storedValue = *(static_cast<const int *>(kernArgs3[2]));
-  ASSERT_EQ(storedValue, static_cast<int>(sizeof(int)));
-}
diff --git a/sycl/unittests/pi/cuda/test_mem_obj.cpp b/sycl/unittests/pi/cuda/test_mem_obj.cpp
deleted file mode 100644
index b0693ff30830c..0000000000000
--- a/sycl/unittests/pi/cuda/test_mem_obj.cpp
+++ /dev/null
@@ -1,207 +0,0 @@
-//==---- test_mem_obj.cpp --- PI unit tests --------------------------------==//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "gtest/gtest.h"
-
-#include <cuda.h>
-
-#include "CudaUtils.hpp"
-#include "TestGetPlugin.hpp"
-#include <detail/plugin.hpp>
-#include <pi_cuda.hpp>
-#include <sycl/detail/cuda_definitions.hpp>
-#include <sycl/detail/pi.hpp>
-#include <sycl/sycl.hpp>
-
-using namespace sycl;
-
-struct CudaTestMemObj : public ::testing::Test {
-
-protected:
-  std::optional<detail::PluginPtr> &plugin =
-      pi::initializeAndGet(backend::ext_oneapi_cuda);
-
-  pi_platform platform_;
-  pi_device device_;
-  pi_context context_;
-
-  void SetUp() override {
-    // skip the tests if the CUDA backend is not available
-    if (!plugin.has_value()) {
-      GTEST_SKIP();
-    }
-
-    pi::clearCudaContext();
-    pi_uint32 numPlatforms = 0;
-    ASSERT_EQ(plugin->hasBackend(backend::ext_oneapi_cuda), PI_SUCCESS);
-
-    ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piPlatformsGet>(
-                  0, nullptr, &numPlatforms)),
-              PI_SUCCESS)
-        << "piPlatformsGet failed.\n";
-
-    ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piPlatformsGet>(
-                  numPlatforms, &platform_, nullptr)),
-              PI_SUCCESS)
-        << "piPlatformsGet failed.\n";
-
-    ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piDevicesGet>(
-                  platform_, PI_DEVICE_TYPE_GPU, 1, &device_, nullptr)),
-              PI_SUCCESS);
-    ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piContextCreate>(
-                  nullptr, 1, &device_, nullptr, nullptr, &context_)),
-              PI_SUCCESS);
-    EXPECT_NE(context_, nullptr);
-  }
-
-  void TearDown() override {
-    if (plugin.has_value()) {
-      plugin->call<detail::PiApiKind::piDeviceRelease>(device_);
-      plugin->call<detail::PiApiKind::piContextRelease>(context_);
-    }
-  }
-
-  CudaTestMemObj() = default;
-
-  ~CudaTestMemObj() = default;
-};
-
-TEST_F(CudaTestMemObj, piMemBufferCreateSimple) {
-  const size_t memSize = 1024u;
-  pi_mem memObj;
-  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piMemBufferCreate>(
-                context_, PI_MEM_FLAGS_ACCESS_RW, memSize, nullptr, &memObj,
-                nullptr)),
-            PI_SUCCESS);
-
-  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piMemRelease>(memObj)),
-            PI_SUCCESS);
-}
-
-TEST_F(CudaTestMemObj, piMemBufferAllocHost) {
-  const size_t memSize = 1024u;
-  pi_mem memObj;
-  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piMemBufferCreate>(
-                context_, PI_MEM_FLAGS_ACCESS_RW | PI_MEM_FLAGS_HOST_PTR_ALLOC,
-                memSize, nullptr, &memObj, nullptr)),
-            PI_SUCCESS);
-
-  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piMemRelease>(memObj)),
-            PI_SUCCESS);
-}
-
-TEST_F(CudaTestMemObj, piMemBufferCreateNoActiveContext) {
-  const size_t memSize = 1024u;
-  // Context has been destroyed
-
-  CUcontext current = nullptr;
-
-  // pop CUDA contexts until there is not a cuda context bound to the thread
-  do {
-    CUcontext oldContext = nullptr;
-    auto cuErr = cuCtxPopCurrent(&oldContext);
-    EXPECT_EQ(cuErr, CUDA_SUCCESS);
-
-    // There should not be any active CUDA context
-    cuErr = cuCtxGetCurrent(&current);
-    ASSERT_EQ(cuErr, CUDA_SUCCESS);
-  } while (current != nullptr);
-
-  // The context object is passed, even if its not active it should be used
-  // to allocate the memory object
-  pi_mem memObj;
-  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piMemBufferCreate>(
-                context_, PI_MEM_FLAGS_ACCESS_RW, memSize, nullptr, &memObj,
-                nullptr)),
-            PI_SUCCESS);
-  ASSERT_NE(memObj, nullptr);
-
-  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piMemRelease>(memObj)),
-            PI_SUCCESS);
-}
-
-TEST_F(CudaTestMemObj, piMemBufferPinnedMappedRead) {
-  const size_t memSize = sizeof(int);
-  const int value = 20;
-
-  pi_queue queue;
-  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piQueueCreate>(
-                context_, device_, 0, &queue)),
-            PI_SUCCESS);
-  ASSERT_NE(queue, nullptr);
-  ASSERT_EQ(queue->get_context(), context_);
-
-  pi_mem memObj;
-  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piMemBufferCreate>(
-                context_, PI_MEM_FLAGS_ACCESS_RW | PI_MEM_FLAGS_HOST_PTR_ALLOC,
-                memSize, nullptr, &memObj, nullptr)),
-            PI_SUCCESS);
-
-  ASSERT_EQ(
-      (plugin->call_nocheck<detail::PiApiKind::piEnqueueMemBufferWrite>(
-          queue, memObj, true, 0, sizeof(int), &value, 0, nullptr, nullptr)),
-      PI_SUCCESS);
-
-  int *host_ptr = nullptr;
-  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piEnqueueMemBufferMap>(
-                queue, memObj, true, PI_MAP_READ, 0, sizeof(int), 0, nullptr,
-                nullptr, (void **)&host_ptr)),
-            PI_SUCCESS);
-
-  ASSERT_EQ(*host_ptr, value);
-
-  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piEnqueueMemUnmap>(
-                queue, memObj, host_ptr, 0, nullptr, nullptr)),
-            PI_SUCCESS);
-
-  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piMemRelease>(memObj)),
-            PI_SUCCESS);
-  plugin->call<detail::PiApiKind::piQueueRelease>(queue);
-}
-
-TEST_F(CudaTestMemObj, piMemBufferPinnedMappedWrite) {
-  const size_t memSize = sizeof(int);
-  const int value = 30;
-
-  pi_queue queue;
-  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piQueueCreate>(
-                context_, device_, 0, &queue)),
-            PI_SUCCESS);
-  ASSERT_NE(queue, nullptr);
-  ASSERT_EQ(queue->get_context(), context_);
-
-  pi_mem memObj;
-  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piMemBufferCreate>(
-                context_, PI_MEM_FLAGS_ACCESS_RW | PI_MEM_FLAGS_HOST_PTR_ALLOC,
-                memSize, nullptr, &memObj, nullptr)),
-            PI_SUCCESS);
-
-  int *host_ptr = nullptr;
-  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piEnqueueMemBufferMap>(
-                queue, memObj, true, PI_MAP_WRITE, 0, sizeof(int), 0, nullptr,
-                nullptr, (void **)&host_ptr)),
-            PI_SUCCESS);
-
-  *host_ptr = value;
-
-  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piEnqueueMemUnmap>(
-                queue, memObj, host_ptr, 0, nullptr, nullptr)),
-            PI_SUCCESS);
-
-  int read_value = 0;
-  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piEnqueueMemBufferRead>(
-                queue, memObj, true, 0, sizeof(int), &read_value, 0, nullptr,
-                nullptr)),
-            PI_SUCCESS);
-
-  ASSERT_EQ(read_value, value);
-
-  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piMemRelease>(memObj)),
-            PI_SUCCESS);
-  plugin->call<detail::PiApiKind::piQueueRelease>(queue);
-}
diff --git a/sycl/unittests/pi/cuda/test_primary_context.cpp b/sycl/unittests/pi/cuda/test_primary_context.cpp
deleted file mode 100644
index f9ce627d126ad..0000000000000
--- a/sycl/unittests/pi/cuda/test_primary_context.cpp
+++ /dev/null
@@ -1,94 +0,0 @@
-//==---------- pi_primary_context.cpp - PI unit tests ----------------------==//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "gtest/gtest.h"
-
-#include <cuda.h>
-
-#include "TestGetPlatforms.hpp"
-#include <pi_cuda.hpp>
-#include <sycl/sycl.hpp>
-
-#include <iostream>
-
-using namespace sycl;
-
-struct CudaPrimaryContextTests : public ::testing::TestWithParam<platform> {
-
-protected:
-  device deviceA_;
-  device deviceB_;
-
-  void SetUp() override {
-    std::vector<device> CudaDevices = GetParam().get_devices();
-
-    deviceA_ = CudaDevices[0];
-    deviceB_ = CudaDevices.size() > 1 ? CudaDevices[1] : deviceA_;
-  }
-
-  void TearDown() override {}
-};
-
-TEST_P(CudaPrimaryContextTests, piSingleContext) {
-  std::cout << "create single context" << std::endl;
-  context Context(
-      deviceA_, async_handler{},
-      {sycl::ext::oneapi::cuda::property::context::use_primary_context{}});
-
-  CUdevice CudaDevice = get_native<backend::ext_oneapi_cuda>(deviceA_);
-  CUcontext CudaContext = get_native<backend::ext_oneapi_cuda>(Context);
-
-  CUcontext PrimaryCudaContext;
-  cuDevicePrimaryCtxRetain(&PrimaryCudaContext, CudaDevice);
-
-  ASSERT_EQ(CudaContext, PrimaryCudaContext);
-
-  cuDevicePrimaryCtxRelease(CudaDevice);
-}
-
-TEST_P(CudaPrimaryContextTests, piMultiContextSingleDevice) {
-  std::cout << "create multiple contexts for one device" << std::endl;
-  context ContextA(
-      deviceA_, async_handler{},
-      {sycl::ext::oneapi::cuda::property::context::use_primary_context{}});
-  context ContextB(
-      deviceA_, async_handler{},
-      {sycl::ext::oneapi::cuda::property::context::use_primary_context{}});
-
-  CUcontext CudaContextA = get_native<backend::ext_oneapi_cuda>(ContextA);
-  CUcontext CudaContextB = get_native<backend::ext_oneapi_cuda>(ContextB);
-
-  ASSERT_EQ(CudaContextA, CudaContextB);
-}
-
-TEST_P(CudaPrimaryContextTests, piMultiContextMultiDevice) {
-  if (deviceA_ == deviceB_)
-    return;
-
-  CUdevice CudaDeviceA = get_native<backend::ext_oneapi_cuda>(deviceA_);
-  CUdevice CudaDeviceB = get_native<backend::ext_oneapi_cuda>(deviceB_);
-
-  ASSERT_NE(CudaDeviceA, CudaDeviceB);
-
-  std::cout << "create multiple contexts for multiple devices" << std::endl;
-  context ContextA(
-      deviceA_, async_handler{},
-      {sycl::ext::oneapi::cuda::property::context::use_primary_context{}});
-  context ContextB(
-      deviceB_, async_handler{},
-      {sycl::ext::oneapi::cuda::property::context::use_primary_context{}});
-
-  CUcontext CudaContextA = get_native<backend::ext_oneapi_cuda>(ContextA);
-  CUcontext CudaContextB = get_native<backend::ext_oneapi_cuda>(ContextB);
-
-  ASSERT_NE(CudaContextA, CudaContextB);
-}
-
-INSTANTIATE_TEST_SUITE_P(
-    OnCudaPlatform, CudaPrimaryContextTests,
-    ::testing::ValuesIn(pi::getPlatformsWithName("CUDA BACKEND")));
diff --git a/sycl/unittests/pi/cuda/test_sampler_properties.cpp b/sycl/unittests/pi/cuda/test_sampler_properties.cpp
deleted file mode 100644
index 793703d2bd1ca..0000000000000
--- a/sycl/unittests/pi/cuda/test_sampler_properties.cpp
+++ /dev/null
@@ -1,135 +0,0 @@
-//==---- PlatformTest.cpp --- PI unit tests --------------------------------==//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "gtest/gtest.h"
-
-#include "TestGetPlugin.hpp"
-#include <detail/plugin.hpp>
-#include <sycl/detail/pi.hpp>
-#include <sycl/sycl.hpp>
-
-#include <vector>
-
-namespace {
-
-using namespace sycl;
-
-class SamplerPropertiesTest
-    : public ::testing::TestWithParam<std::tuple<
-          pi_bool, pi_sampler_filter_mode, pi_sampler_addressing_mode>> {
-protected:
-  std::optional<detail::PluginPtr> &plugin =
-      pi::initializeAndGet(backend::ext_oneapi_cuda);
-
-  pi_platform platform_;
-  pi_device device_;
-  pi_context context_;
-  pi_sampler sampler_;
-
-  pi_bool normalizedCoords_;
-  pi_sampler_filter_mode filterMode_;
-  pi_sampler_addressing_mode addressMode_;
-
-  SamplerPropertiesTest() = default;
-
-  ~SamplerPropertiesTest() override = default;
-
-  void SetUp() override {
-    // skip the tests if the CUDA backend is not available
-    if (!plugin.has_value()) {
-      GTEST_SKIP();
-    }
-
-    std::tie(normalizedCoords_, filterMode_, addressMode_) = GetParam();
-
-    pi_uint32 numPlatforms = 0;
-    ASSERT_EQ(plugin->hasBackend(backend::ext_oneapi_cuda), PI_SUCCESS);
-
-    ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piPlatformsGet>(
-                  0, nullptr, &numPlatforms)),
-              PI_SUCCESS)
-        << "piPlatformsGet failed.\n";
-
-    ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piPlatformsGet>(
-                  numPlatforms, &platform_, nullptr)),
-              PI_SUCCESS)
-        << "piPlatformsGet failed.\n";
-
-    ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piDevicesGet>(
-                  platform_, PI_DEVICE_TYPE_GPU, 1, &device_, nullptr)),
-              PI_SUCCESS);
-    ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piContextCreate>(
-                  nullptr, 1, &device_, nullptr, nullptr, &context_)),
-              PI_SUCCESS);
-    EXPECT_NE(context_, nullptr);
-
-    pi_sampler_properties sampler_properties[] = {
-        PI_SAMPLER_PROPERTIES_NORMALIZED_COORDS,
-        static_cast<pi_sampler_properties>(normalizedCoords_),
-        PI_SAMPLER_PROPERTIES_ADDRESSING_MODE,
-        static_cast<pi_sampler_properties>(addressMode_),
-        PI_SAMPLER_PROPERTIES_FILTER_MODE,
-        static_cast<pi_sampler_properties>(filterMode_),
-        0};
-
-    ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piSamplerCreate>(
-                  context_, sampler_properties, &sampler_)),
-              PI_SUCCESS);
-  }
-
-  void TearDown() override {
-    if (plugin.has_value()) {
-      plugin->call<detail::PiApiKind::piSamplerRelease>(sampler_);
-      plugin->call<detail::PiApiKind::piDeviceRelease>(device_);
-      plugin->call<detail::PiApiKind::piContextRelease>(context_);
-    }
-  }
-};
-
-TEST_P(SamplerPropertiesTest, piCheckNormalizedCoords) {
-  pi_bool actualNormalizedCoords = !normalizedCoords_;
-
-  plugin->call<detail::PiApiKind::piSamplerGetInfo>(
-      sampler_, PI_SAMPLER_INFO_NORMALIZED_COORDS, sizeof(pi_bool),
-      &actualNormalizedCoords, nullptr);
-
-  ASSERT_EQ(actualNormalizedCoords, normalizedCoords_);
-}
-
-TEST_P(SamplerPropertiesTest, piCheckFilterMode) {
-  pi_sampler_filter_mode actualFilterMode;
-
-  plugin->call<detail::PiApiKind::piSamplerGetInfo>(
-      sampler_, PI_SAMPLER_INFO_FILTER_MODE, sizeof(pi_sampler_filter_mode),
-      &actualFilterMode, nullptr);
-
-  ASSERT_EQ(actualFilterMode, filterMode_);
-}
-
-TEST_P(SamplerPropertiesTest, piCheckAddressingMode) {
-  pi_sampler_addressing_mode actualAddressMode;
-
-  plugin->call<detail::PiApiKind::piSamplerGetInfo>(
-      sampler_, PI_SAMPLER_INFO_ADDRESSING_MODE,
-      sizeof(pi_sampler_addressing_mode), &actualAddressMode, nullptr);
-
-  ASSERT_EQ(actualAddressMode, addressMode_);
-}
-
-INSTANTIATE_TEST_SUITE_P(
-    SamplerPropertiesTestImpl, SamplerPropertiesTest,
-    ::testing::Combine(
-        ::testing::Values(PI_TRUE, PI_FALSE),
-        ::testing::Values(PI_SAMPLER_FILTER_MODE_LINEAR,
-                          PI_SAMPLER_FILTER_MODE_NEAREST),
-        ::testing::Values(PI_SAMPLER_ADDRESSING_MODE_CLAMP,
-                          PI_SAMPLER_ADDRESSING_MODE_CLAMP_TO_EDGE,
-                          PI_SAMPLER_ADDRESSING_MODE_NONE,
-                          PI_SAMPLER_ADDRESSING_MODE_MIRRORED_REPEAT,
-                          PI_SAMPLER_ADDRESSING_MODE_REPEAT)));
-} // namespace
diff --git a/sycl/unittests/pi/hip/CMakeLists.txt b/sycl/unittests/pi/hip/CMakeLists.txt
deleted file mode 100644
index eee75b0447551..0000000000000
--- a/sycl/unittests/pi/hip/CMakeLists.txt
+++ /dev/null
@@ -1,42 +0,0 @@
-add_sycl_unittest(PiHipTests OBJECT
-  test_base_objects.cpp
-  test_commands.cpp
-  test_contexts.cpp
-  test_device.cpp
-  test_interop_get_native.cpp
-  test_kernels.cpp
-  test_mem_obj.cpp
-  test_primary_context.cpp
-  test_sampler_properties.cpp
-)
-
-add_dependencies(PiHipTests sycl)
-
-target_compile_definitions(PiHipTests
-  PRIVATE
-    GTEST_HAS_COMBINE=1)
-
-target_include_directories(PiHipTests
-  PRIVATE
-    "../"
-    "${sycl_inc_dir}/sycl/detail/"
-    "${sycl_inc_dir}"
-    "${sycl_plugin_dir}/hip/"
-    "${sycl_plugin_dir}/unified_runtime/"
-)
-
-if("${SYCL_BUILD_PI_HIP_PLATFORM}" STREQUAL "AMD")
-  # Set HIP define to select AMD platform
-  target_compile_definitions(PiHipTests PRIVATE __HIP_PLATFORM_AMD__)
-elseif("${SYCL_BUILD_PI_HIP_PLATFORM}" STREQUAL "NVIDIA")
-  # Set HIP define to select NVIDIA platform
-  target_compile_definitions(PiHipTests PRIVATE __HIP_PLATFORM_NVIDIA__)
-else()
-  message(FATAL_ERROR "Unspecified PI HIP platform, please set SYCL_BUILD_PI_HIP_PLATFORM to 'AMD' or 'NVIDIA'")
-endif()
-
-target_link_libraries(PiHipTests
-  PRIVATE
-    rocmdrv
-    UnifiedRuntime-Headers
-)
diff --git a/sycl/unittests/pi/hip/HipUtils.hpp b/sycl/unittests/pi/hip/HipUtils.hpp
deleted file mode 100644
index c62bf7dfefb08..0000000000000
--- a/sycl/unittests/pi/hip/HipUtils.hpp
+++ /dev/null
@@ -1,20 +0,0 @@
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-
-#pragma once
-
-#include <hip/hip_runtime.h>
-
-namespace pi {
-
-// utility function to clear the HIP context stack
-inline void clearHipContext() {
-  hipCtx_t ctxt = nullptr;
-  do {
-    hipCtxSetCurrent(nullptr);
-    hipCtxGetCurrent(&ctxt);
-  } while (ctxt != nullptr);
-}
-
-} // namespace pi
diff --git a/sycl/unittests/pi/hip/test_base_objects.cpp b/sycl/unittests/pi/hip/test_base_objects.cpp
deleted file mode 100644
index 86458aa62c0b1..0000000000000
--- a/sycl/unittests/pi/hip/test_base_objects.cpp
+++ /dev/null
@@ -1,141 +0,0 @@
-//==---- test_base_objects.cpp --- PI unit tests ---------------------------==//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "gtest/gtest.h"
-
-#include <hip/hip_runtime.h>
-
-#include "TestGetPlugin.hpp"
-#include <detail/plugin.hpp>
-#include <pi_hip.hpp>
-#include <sycl/detail/pi.hpp>
-#include <sycl/sycl.hpp>
-
-#include <thread>
-
-// https://sep5.readthedocs.io/en/latest/ROCm_API_References/
-// HIP_API/Context-Management.html#_CPPv419hipCtxGetApiVersion8hipCtx_tPi
-const int HIP_DRIVER_API_VERSION = 4;
-
-using namespace sycl;
-
-class HipBaseObjectsTest : public ::testing::Test {
-protected:
-  std::optional<detail::PluginPtr> &plugin =
-      pi::initializeAndGet(backend::ext_oneapi_hip);
-
-  void SetUp() override {
-    // skip the tests if the HIP backend is not available
-    if (!plugin.has_value()) {
-      GTEST_SKIP();
-    }
-  }
-
-  HipBaseObjectsTest() = default;
-
-  ~HipBaseObjectsTest() = default;
-};
-
-TEST_F(HipBaseObjectsTest, piContextCreate) {
-  pi_uint32 numPlatforms = 0;
-  pi_platform platform = nullptr;
-  pi_device device;
-  ASSERT_EQ(plugin->hasBackend(backend::ext_oneapi_hip), PI_SUCCESS);
-
-  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piPlatformsGet>(
-                0, nullptr, &numPlatforms)),
-            PI_SUCCESS)
-      << "piPlatformsGet failed.\n";
-
-  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piPlatformsGet>(
-                numPlatforms, &platform, nullptr)),
-            PI_SUCCESS)
-      << "piPlatformsGet failed.\n";
-
-  ASSERT_GE(numPlatforms, 1u);
-  ASSERT_NE(platform, nullptr);
-
-  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piDevicesGet>(
-                platform, PI_DEVICE_TYPE_GPU, 1, &device, nullptr)),
-            PI_SUCCESS)
-      << "piDevicesGet failed.\n";
-
-  pi_context ctxt = nullptr;
-  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piContextCreate>(
-                nullptr, 1, &device, nullptr, nullptr, &ctxt)),
-            PI_SUCCESS)
-      << "piContextCreate failed.\n";
-
-  EXPECT_NE(ctxt, nullptr);
-  EXPECT_EQ(ctxt->get_device(), device);
-
-  // Retrieve the hipCtxt to check information is correct
-  hipCtx_t hipContext = ctxt->get();
-  int version = 0;
-  auto hipErr = hipCtxGetApiVersion(hipContext, &version);
-  EXPECT_EQ(hipErr, PI_SUCCESS);
-  EXPECT_EQ(version, HIP_DRIVER_API_VERSION);
-
-  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piContextRelease>(ctxt)),
-            PI_SUCCESS);
-}
-
-TEST_F(HipBaseObjectsTest, piContextCreateChildThread) {
-  pi_uint32 numPlatforms = 0;
-  pi_platform platform;
-  pi_device device;
-
-  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piPlatformsGet>(
-                0, nullptr, &numPlatforms)),
-            PI_SUCCESS)
-      << "piPlatformsGet failed.\n";
-
-  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piPlatformsGet>(
-                numPlatforms, &platform, nullptr)),
-            PI_SUCCESS)
-      << "piPlatformsGet failed.\n";
-
-  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piDevicesGet>(
-                platform, PI_DEVICE_TYPE_GPU, 1, &device, nullptr)),
-            PI_SUCCESS);
-
-  pi_context ctxt;
-  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piContextCreate>(
-                nullptr, 1, &device, nullptr, nullptr, &ctxt)),
-            PI_SUCCESS);
-  EXPECT_NE(ctxt, nullptr);
-
-  // Retrieve the cuCtxt to check information is correct
-  auto checkValue = [=]() {
-    hipCtx_t hipContext = ctxt->get();
-    int version = 0;
-    auto hipErr = hipCtxGetApiVersion(hipContext, &version);
-    EXPECT_EQ(hipErr, PI_SUCCESS);
-    EXPECT_EQ(version, HIP_DRIVER_API_VERSION);
-
-    // The current context is different from the current thread
-    hipCtx_t current;
-    hipErr = hipCtxGetCurrent(&current);
-    EXPECT_EQ(hipErr, PI_SUCCESS);
-    EXPECT_NE(hipContext, current);
-
-    // Set the context from PI API as the current one
-    hipErr = hipCtxPushCurrent(hipContext);
-    EXPECT_EQ(hipErr, PI_SUCCESS);
-
-    hipErr = hipCtxGetCurrent(&current);
-    EXPECT_EQ(hipErr, PI_SUCCESS);
-    EXPECT_EQ(hipContext, current);
-  };
-  auto callContextFromOtherThread = std::thread(checkValue);
-
-  callContextFromOtherThread.join();
-
-  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piContextRelease>(ctxt)),
-            PI_SUCCESS);
-}
diff --git a/sycl/unittests/pi/hip/test_commands.cpp b/sycl/unittests/pi/hip/test_commands.cpp
deleted file mode 100644
index d453e9b9da1ea..0000000000000
--- a/sycl/unittests/pi/hip/test_commands.cpp
+++ /dev/null
@@ -1,145 +0,0 @@
-//==---- test_commands.cpp --- PI unit tests -------------------------------==//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "gtest/gtest.h"
-
-#include <hip/hip_runtime.h>
-
-#include "HipUtils.hpp"
-#include "TestGetPlugin.hpp"
-#include <detail/plugin.hpp>
-#include <pi_hip.hpp>
-#include <sycl/detail/pi.hpp>
-#include <sycl/sycl.hpp>
-
-using namespace sycl;
-
-struct HipCommandsTest : public ::testing::Test {
-
-protected:
-  std::optional<detail::PluginPtr> &plugin =
-      pi::initializeAndGet(backend::ext_oneapi_hip);
-
-  pi_platform platform_;
-  pi_device device_;
-  pi_context context_;
-  pi_queue queue_;
-
-  void SetUp() override {
-    // skip the tests if the HIP backend is not available
-    if (!plugin.has_value()) {
-      GTEST_SKIP();
-    }
-
-    pi::clearHipContext();
-    pi_uint32 numPlatforms = 0;
-    ASSERT_EQ(plugin->hasBackend(backend::ext_oneapi_hip), PI_SUCCESS);
-
-    ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piPlatformsGet>(
-                  0, nullptr, &numPlatforms)),
-              PI_SUCCESS)
-        << "piPlatformsGet failed.\n";
-
-    ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piPlatformsGet>(
-                  numPlatforms, &platform_, nullptr)),
-              PI_SUCCESS)
-        << "piPlatformsGet failed.\n";
-
-    ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piDevicesGet>(
-                  platform_, PI_DEVICE_TYPE_GPU, 1, &device_, nullptr)),
-              PI_SUCCESS);
-    ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piContextCreate>(
-                  nullptr, 1, &device_, nullptr, nullptr, &context_)),
-              PI_SUCCESS);
-    ASSERT_NE(context_, nullptr);
-
-    ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piQueueCreate>(
-                  context_, device_, 0, &queue_)),
-              PI_SUCCESS);
-    ASSERT_NE(queue_, nullptr);
-    auto tmpCtxt = queue_->get_context();
-    ASSERT_EQ(tmpCtxt, context_);
-  }
-
-  void TearDown() override {
-    if (plugin.has_value()) {
-      plugin->call<detail::PiApiKind::piQueueRelease>(queue_);
-      plugin->call<detail::PiApiKind::piContextRelease>(context_);
-    }
-  }
-
-  HipCommandsTest() = default;
-
-  ~HipCommandsTest() = default;
-};
-
-TEST_F(HipCommandsTest, PIEnqueueReadBufferBlocking) {
-  constexpr const size_t memSize = 10u;
-  constexpr const size_t bytes = memSize * sizeof(int);
-  const int data[memSize] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
-  int output[memSize] = {};
-
-  pi_mem memObj;
-  ASSERT_EQ(
-      (plugin->call_nocheck<detail::PiApiKind::piMemBufferCreate>(
-          context_, PI_MEM_FLAGS_ACCESS_RW, bytes, nullptr, &memObj, nullptr)),
-      PI_SUCCESS);
-
-  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piEnqueueMemBufferWrite>(
-                queue_, memObj, true, 0, bytes, data, 0, nullptr, nullptr)),
-            PI_SUCCESS);
-
-  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piEnqueueMemBufferRead>(
-                queue_, memObj, true, 0, bytes, output, 0, nullptr, nullptr)),
-            PI_SUCCESS);
-
-  bool isSame =
-      std::equal(std::begin(output), std::end(output), std::begin(data));
-  EXPECT_TRUE(isSame);
-  if (!isSame) {
-    std::for_each(std::begin(output), std::end(output),
-                  [](int &elem) { std::cout << elem << ","; });
-    std::cout << std::endl;
-  }
-}
-
-TEST_F(HipCommandsTest, PIEnqueueReadBufferNonBlocking) {
-  constexpr const size_t memSize = 10u;
-  constexpr const size_t bytes = memSize * sizeof(int);
-  const int data[memSize] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
-  int output[memSize] = {};
-
-  pi_mem memObj;
-  ASSERT_EQ(
-      (plugin->call_nocheck<detail::PiApiKind::piMemBufferCreate>(
-          context_, PI_MEM_FLAGS_ACCESS_RW, bytes, nullptr, &memObj, nullptr)),
-      PI_SUCCESS);
-
-  pi_event cpIn, cpOut;
-  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piEnqueueMemBufferWrite>(
-                queue_, memObj, false, 0, bytes, data, 0, nullptr, &cpIn)),
-            PI_SUCCESS);
-  ASSERT_NE(cpIn, nullptr);
-
-  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piEnqueueMemBufferRead>(
-                queue_, memObj, false, 0, bytes, output, 0, nullptr, &cpOut)),
-            PI_SUCCESS);
-  ASSERT_NE(cpOut, nullptr);
-
-  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piEventsWait>(1, &cpOut)),
-            PI_SUCCESS);
-
-  bool isSame =
-      std::equal(std::begin(output), std::end(output), std::begin(data));
-  EXPECT_TRUE(isSame);
-  if (!isSame) {
-    std::for_each(std::begin(output), std::end(output),
-                  [](int &elem) { std::cout << elem << ","; });
-    std::cout << std::endl;
-  }
-}
diff --git a/sycl/unittests/pi/hip/test_contexts.cpp b/sycl/unittests/pi/hip/test_contexts.cpp
deleted file mode 100644
index fa20e101d00e2..0000000000000
--- a/sycl/unittests/pi/hip/test_contexts.cpp
+++ /dev/null
@@ -1,250 +0,0 @@
-//==---- test_contexts.cpp --- PI unit tests -------------------------------==//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "gtest/gtest.h"
-
-#include <condition_variable>
-#include <mutex>
-#include <thread>
-
-#include <hip/hip_runtime.h>
-
-#include "HipUtils.hpp"
-#include "TestGetPlugin.hpp"
-#include <detail/plugin.hpp>
-#include <pi_hip.hpp>
-#include <sycl/detail/pi.hpp>
-#include <sycl/sycl.hpp>
-
-using namespace sycl;
-
-struct HipContextsTest : public ::testing::Test {
-
-protected:
-  std::optional<detail::PluginPtr> &plugin =
-      pi::initializeAndGet(backend::ext_oneapi_hip);
-
-  pi_platform platform_;
-  pi_device device_;
-
-  void SetUp() override {
-    // skip the tests if the HIP backend is not available
-    if (!plugin.has_value()) {
-      GTEST_SKIP();
-    }
-
-    pi_uint32 numPlatforms = 0;
-    ASSERT_EQ(plugin->hasBackend(backend::ext_oneapi_hip), PI_SUCCESS);
-
-    ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piPlatformsGet>(
-                  0, nullptr, &numPlatforms)),
-              PI_SUCCESS)
-        << "piPlatformsGet failed.\n";
-
-    ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piPlatformsGet>(
-                  numPlatforms, &platform_, nullptr)),
-              PI_SUCCESS)
-        << "piPlatformsGet failed.\n";
-
-    ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piDevicesGet>(
-                  platform_, PI_DEVICE_TYPE_GPU, 1, &device_, nullptr)),
-              PI_SUCCESS);
-  }
-
-  void TearDown() override {}
-
-  HipContextsTest() = default;
-
-  ~HipContextsTest() = default;
-};
-
-TEST_F(HipContextsTest, ContextLifetime) {
-  // start with no active context
-  pi::clearHipContext();
-
-  // create a context
-  pi_context context;
-  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piContextCreate>(
-                nullptr, 1, &device_, nullptr, nullptr, &context)),
-            PI_SUCCESS);
-  ASSERT_NE(context, nullptr);
-
-  // create a queue from the context, this should use the ScopedContext
-  pi_queue queue;
-  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piQueueCreate>(
-                context, device_, 0, &queue)),
-            PI_SUCCESS);
-  ASSERT_NE(queue, nullptr);
-
-  // ensure the queue has the correct context
-  ASSERT_EQ(context, queue->get_context());
-
-  // check that the context is now the active HIP context
-  hipCtx_t hipCtxt = nullptr;
-  hipCtxGetCurrent(&hipCtxt);
-  ASSERT_EQ(hipCtxt, context->get());
-
-  plugin->call<detail::PiApiKind::piQueueRelease>(queue);
-  plugin->call<detail::PiApiKind::piContextRelease>(context);
-
-  // check that the context was cleaned up properly by the destructor
-  hipCtxGetCurrent(&hipCtxt);
-  ASSERT_EQ(hipCtxt, nullptr);
-}
-
-TEST_F(HipContextsTest, ContextLifetimeExisting) {
-  // start by setting up a HIP context on the thread
-  hipCtx_t original;
-  hipCtxCreate(&original, hipDeviceMapHost, device_->get());
-
-  // ensure the HIP context is active
-  hipCtx_t current = nullptr;
-  hipCtxGetCurrent(&current);
-  ASSERT_EQ(original, current);
-
-  // create a PI context
-  pi_context context;
-  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piContextCreate>(
-                nullptr, 1, &device_, nullptr, nullptr, &context)),
-            PI_SUCCESS);
-  ASSERT_NE(context, nullptr);
-
-  // create a queue from the context, this should use the ScopedContext
-  pi_queue queue;
-  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piQueueCreate>(
-                context, device_, 0, &queue)),
-            PI_SUCCESS);
-  ASSERT_NE(queue, nullptr);
-
-  // ensure the queue has the correct context
-  ASSERT_EQ(context, queue->get_context());
-
-  // check that the context is now the active HIP context
-  hipCtxGetCurrent(&current);
-  ASSERT_EQ(current, context->get());
-
-  plugin->call<detail::PiApiKind::piQueueRelease>(queue);
-  plugin->call<detail::PiApiKind::piContextRelease>(context);
-
-  // check that the context was cleaned up, the old context will be restored
-  // automatically by hipCtxDestroy in piContextRelease, as it was pushed on the
-  // stack bu hipCtxCreate
-  hipCtxGetCurrent(&current);
-  ASSERT_EQ(current, original);
-
-  // release original context
-  hipCtxDestroy(original);
-}
-
-// In some cases (for host_task), the SYCL runtime may call PI API functions
-// from threads of the thread pool, this can cause issues because with the HIP
-// plugin these functions will set an active HIP context on these threads, but
-// never clean it up, as it will only get cleaned up in the main thread.
-//
-// So the following test aims to reproduce the scenario where there is a
-// dangling deleted context in a separate thread and seeing if the PI calls are
-// still able to work correctly in that thread.
-TEST_F(HipContextsTest, ContextThread) {
-  // start with no active context
-  pi::clearHipContext();
-
-  // create two PI contexts
-  pi_context context1;
-  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piContextCreate>(
-                nullptr, 1, &device_, nullptr, nullptr, &context1)),
-            PI_SUCCESS);
-  ASSERT_NE(context1, nullptr);
-
-  pi_context context2;
-  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piContextCreate>(
-                nullptr, 1, &device_, nullptr, nullptr, &context2)),
-            PI_SUCCESS);
-  ASSERT_NE(context2, nullptr);
-
-  // setup synchronization variables between the main thread and the testing
-  // thread
-  std::mutex m;
-  std::condition_variable cv;
-  bool released = false;
-  bool thread_done = false;
-
-  // create a testing thread that will create a queue with the first context,
-  // release the queue, then wait for the main thread to release the first
-  // context, and then create and release another queue with the second context
-  // this time
-  auto test_thread = std::thread([&] {
-    hipCtx_t current = nullptr;
-
-    // create a queue with the first context
-    pi_queue queue;
-    ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piQueueCreate>(
-                  context1, device_, 0, &queue)),
-              PI_SUCCESS);
-    ASSERT_NE(queue, nullptr);
-
-    // ensure the queue has the correct context
-    ASSERT_EQ(context1, queue->get_context());
-
-    // check that the first context is now the active HIP context
-    hipCtxGetCurrent(&current);
-    ASSERT_EQ(current, context1->get());
-
-    plugin->call<detail::PiApiKind::piQueueRelease>(queue);
-
-    // mark the first set of processing as done and notify the main thread
-    std::unique_lock<std::mutex> lock(m);
-    thread_done = true;
-    lock.unlock();
-    cv.notify_one();
-
-    // wait for the main thread to release the first context
-    lock.lock();
-    cv.wait(lock, [&] { return released; });
-
-    // check that the first context is still active, this is because deleting a
-    // context only cleans up the current thread
-    hipCtxGetCurrent(&current);
-    ASSERT_EQ(current, context1->get());
-
-    // create a queue with the second context
-    ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piQueueCreate>(
-                  context2, device_, 0, &queue)),
-              PI_SUCCESS);
-    ASSERT_NE(queue, nullptr);
-
-    // ensure the queue has the correct context
-    ASSERT_EQ(context2, queue->get_context());
-
-    // check that the second context is now the active HIP context
-    hipCtxGetCurrent(&current);
-    ASSERT_EQ(current, context2->get());
-
-    plugin->call<detail::PiApiKind::piQueueRelease>(queue);
-  });
-
-  // wait for the thread to be done with the first queue to release the first
-  // context
-  std::unique_lock<std::mutex> lock(m);
-  cv.wait(lock, [&] { return thread_done; });
-  plugin->call<detail::PiApiKind::piContextRelease>(context1);
-
-  // notify the other thread that the context was released
-  released = true;
-  lock.unlock();
-  cv.notify_one();
-
-  // wait for the thread to finish
-  test_thread.join();
-
-  plugin->call<detail::PiApiKind::piContextRelease>(context2);
-
-  // check that there is no context set on the main thread
-  hipCtx_t current = nullptr;
-  hipCtxGetCurrent(&current);
-  ASSERT_EQ(current, nullptr);
-}
diff --git a/sycl/unittests/pi/hip/test_device.cpp b/sycl/unittests/pi/hip/test_device.cpp
deleted file mode 100644
index a2d61c6ee87fc..0000000000000
--- a/sycl/unittests/pi/hip/test_device.cpp
+++ /dev/null
@@ -1,111 +0,0 @@
-//==---- test_device.cpp --- PI unit tests ---------------------------------==//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "gtest/gtest.h"
-
-#include <hip/hip_runtime.h>
-
-#include "TestGetPlugin.hpp"
-#include <detail/plugin.hpp>
-#include <pi_hip.hpp>
-#include <sycl/detail/pi.hpp>
-#include <sycl/sycl.hpp>
-
-using namespace sycl;
-
-struct HipDeviceTests : public ::testing::Test {
-
-protected:
-  std::optional<detail::PluginPtr> &plugin =
-      pi::initializeAndGet(backend::ext_oneapi_hip);
-
-  pi_platform platform_;
-  pi_device device_;
-  pi_context context_;
-
-  void SetUp() override {
-    // skip the tests if the HIP backend is not available
-    if (!plugin.has_value()) {
-      GTEST_SKIP();
-    }
-
-    pi_uint32 numPlatforms = 0;
-    ASSERT_EQ(plugin->hasBackend(backend::ext_oneapi_hip), PI_SUCCESS);
-
-    ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piPlatformsGet>(
-                  0, nullptr, &numPlatforms)),
-              PI_SUCCESS)
-        << "piPlatformsGet failed.\n";
-
-    ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piPlatformsGet>(
-                  numPlatforms, &platform_, nullptr)),
-              PI_SUCCESS)
-        << "piPlatformsGet failed.\n";
-
-    ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piDevicesGet>(
-                  platform_, PI_DEVICE_TYPE_GPU, 1, &device_, nullptr)),
-              PI_SUCCESS);
-    ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piContextCreate>(
-                  nullptr, 1, &device_, nullptr, nullptr, &context_)),
-              PI_SUCCESS);
-    EXPECT_NE(context_, nullptr);
-  }
-
-  void TearDown() override {
-    if (plugin.has_value()) {
-      plugin->call<detail::PiApiKind::piDeviceRelease>(device_);
-      plugin->call<detail::PiApiKind::piContextRelease>(context_);
-    }
-  }
-
-  HipDeviceTests() = default;
-  ~HipDeviceTests() = default;
-};
-
-TEST_F(HipDeviceTests, PIDeviceGetInfoSimple) {
-
-  size_t return_size = 0;
-  pi_device_type device_type;
-  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piDeviceGetInfo>(
-                device_, PI_DEVICE_INFO_TYPE, sizeof(pi_device_type),
-                &device_type, &return_size)),
-            PI_SUCCESS);
-  EXPECT_EQ(return_size, sizeof(pi_device_type));
-  EXPECT_EQ(
-      device_type,
-      PI_DEVICE_TYPE_GPU); // backend pre-defined value, device must be a GPU
-
-  pi_device parent_device = nullptr;
-  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piDeviceGetInfo>(
-                device_, PI_DEVICE_INFO_PARENT_DEVICE, sizeof(pi_device),
-                &parent_device, &return_size)),
-            PI_SUCCESS);
-  EXPECT_EQ(return_size, sizeof(pi_device));
-  EXPECT_EQ(parent_device,
-            nullptr); // backend pre-set value, device cannot have a parent
-
-  pi_platform platform = nullptr;
-  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piDeviceGetInfo>(
-                device_, PI_DEVICE_INFO_PLATFORM, sizeof(pi_platform),
-                &platform, &return_size)),
-            PI_SUCCESS);
-  EXPECT_EQ(return_size, sizeof(pi_platform));
-  EXPECT_EQ(platform, platform_); // test fixture device was created from the
-                                  // test fixture platform
-
-  cl_device_partition_property device_partition_property = -1;
-  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piDeviceGetInfo>(
-                device_, PI_DEVICE_INFO_PARTITION_TYPE,
-                sizeof(cl_device_partition_property),
-                &device_partition_property, &return_size)),
-            PI_SUCCESS);
-  EXPECT_EQ(device_partition_property,
-            0); // PI HIP backend will not support device partitioning, this
-                // function should just return 0.
-  EXPECT_EQ(return_size, sizeof(cl_device_partition_property));
-}
diff --git a/sycl/unittests/pi/hip/test_interop_get_native.cpp b/sycl/unittests/pi/hip/test_interop_get_native.cpp
deleted file mode 100644
index 39e357430036b..0000000000000
--- a/sycl/unittests/pi/hip/test_interop_get_native.cpp
+++ /dev/null
@@ -1,127 +0,0 @@
-//==------- test_interop_get_native.cpp - SYCL HIP get_native tests --------==//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "gtest/gtest.h"
-
-#include <hip/hip_runtime.h>
-
-#include <sycl/sycl.hpp>
-
-#include <sycl/ext/oneapi/backend/hip.hpp>
-
-#include "TestGetPlatforms.hpp"
-
-#include <iostream>
-
-using namespace sycl;
-
-struct HipInteropGetNativeTests : public ::testing::TestWithParam<platform> {
-
-protected:
-  std::unique_ptr<queue> syclQueue_;
-  device syclDevice_;
-
-  void SetUp() override {
-    syclDevice_ = GetParam().get_devices()[0];
-    syclQueue_ = std::unique_ptr<queue>{new queue{syclDevice_}};
-  }
-
-  void TearDown() override { syclQueue_.reset(); }
-};
-
-TEST_P(HipInteropGetNativeTests, getNativeDevice) {
-  hipDevice_t hipDevice = get_native<backend::ext_oneapi_hip>(syclDevice_);
-  char hipDeviceName[2] = {0, 0};
-  hipError_t result = hipDeviceGetName(hipDeviceName, 2, hipDevice);
-  ASSERT_EQ(result, PI_SUCCESS);
-  ASSERT_NE(hipDeviceName[0], 0);
-}
-
-TEST_P(HipInteropGetNativeTests, getNativeContext) {
-  hipCtx_t hipContext =
-      get_native<backend::ext_oneapi_hip>(syclQueue_->get_context());
-  ASSERT_NE(hipContext, nullptr);
-}
-
-TEST_P(HipInteropGetNativeTests, interopTaskGetMem) {
-  buffer<int, 1> syclBuffer(range<1>{1});
-  syclQueue_->submit([&](handler &cgh) {
-    auto syclAccessor = syclBuffer.get_access<access::mode::read>(cgh);
-    cgh.host_task([=](interop_handle ih) {
-      hipDeviceptr_t hipPtr =
-          ih.get_native_mem<backend::ext_oneapi_hip>(syclAccessor);
-      hipDeviceptr_t hipPtrBase;
-      size_t hipPtrSize = 0;
-      hipCtx_t hipContext =
-          get_native<backend::ext_oneapi_hip>(syclQueue_->get_context());
-      ASSERT_EQ(PI_SUCCESS, hipCtxPushCurrent(hipContext));
-      ASSERT_EQ(PI_SUCCESS,
-                hipMemGetAddressRange(&hipPtrBase, &hipPtrSize, hipPtr));
-      ASSERT_EQ(PI_SUCCESS, hipCtxPopCurrent(nullptr));
-      ASSERT_EQ(sizeof(int), hipPtrSize);
-    });
-  });
-}
-
-TEST_P(HipInteropGetNativeTests, interopTaskGetQueue) {
-  hipStream_t hipStream = get_native<backend::ext_oneapi_hip>(*syclQueue_);
-  syclQueue_->submit([&](handler &cgh) {
-    cgh.host_task([=](interop_handle ih) {
-      hipStream_t hipInteropStream =
-          ih.get_native_queue<backend::ext_oneapi_hip>();
-      ASSERT_EQ(hipInteropStream, hipStream);
-    });
-  });
-}
-
-TEST_P(HipInteropGetNativeTests, hostTaskGetNativeMem) {
-  buffer<int, 1> syclBuffer(range<1>{1});
-  syclQueue_->submit([&](handler &cgh) {
-    auto syclAccessor = syclBuffer.get_access<access::mode::read>(cgh);
-    cgh.host_task([=](interop_handle ih) {
-      hipDeviceptr_t hipPtr =
-          ih.get_native_mem<backend::ext_oneapi_hip>(syclAccessor);
-      hipDeviceptr_t hipPtrBase;
-      size_t hipPtrSize = 0;
-      hipCtx_t hipContext =
-          get_native<backend::ext_oneapi_hip>(syclQueue_->get_context());
-      ASSERT_EQ(PI_SUCCESS, hipCtxPushCurrent(hipContext));
-      ASSERT_EQ(PI_SUCCESS,
-                hipMemGetAddressRange(&hipPtrBase, &hipPtrSize, hipPtr));
-      ASSERT_EQ(PI_SUCCESS, hipCtxPopCurrent(nullptr));
-      ASSERT_EQ(sizeof(int), hipPtrSize);
-    });
-  });
-}
-
-TEST_P(HipInteropGetNativeTests, hostTaskGetNativeQueue) {
-  hipStream_t hipStream = get_native<backend::ext_oneapi_hip>(*syclQueue_);
-  syclQueue_->submit([&](handler &cgh) {
-    cgh.host_task([=](interop_handle ih) {
-      hipStream_t hipInteropStream =
-          ih.get_native_queue<backend::ext_oneapi_hip>();
-      ASSERT_EQ(hipInteropStream, hipStream);
-    });
-  });
-}
-
-TEST_P(HipInteropGetNativeTests, hostTaskGetNativeContext) {
-  hipCtx_t hipContext =
-      get_native<backend::ext_oneapi_hip>(syclQueue_->get_context());
-  syclQueue_->submit([&](handler &cgh) {
-    cgh.host_task([=](interop_handle ih) {
-      hipCtx_t hipInteropContext =
-          ih.get_native_context<backend::ext_oneapi_hip>();
-      ASSERT_EQ(hipInteropContext, hipContext);
-    });
-  });
-}
-
-INSTANTIATE_TEST_SUITE_P(
-    OnHipPlatform, HipInteropGetNativeTests,
-    ::testing::ValuesIn(pi::getPlatformsWithName("HIP BACKEND")));
diff --git a/sycl/unittests/pi/hip/test_kernels.cpp b/sycl/unittests/pi/hip/test_kernels.cpp
deleted file mode 100644
index 51463a5e23c29..0000000000000
--- a/sycl/unittests/pi/hip/test_kernels.cpp
+++ /dev/null
@@ -1,79 +0,0 @@
-//==---- test_kernels.cpp --- PI unit tests --------------------------------==//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "gtest/gtest.h"
-
-#include <hip/hip_runtime.h>
-
-#include "TestGetPlugin.hpp"
-#include <detail/plugin.hpp>
-#include <pi_hip.hpp>
-#include <sycl/detail/pi.hpp>
-#include <sycl/sycl.hpp>
-
-// PI HIP kernels carry an additional argument for the implicit global offset.
-#define NUM_IMPLICIT_ARGS 1
-
-using namespace sycl;
-
-struct HipKernelsTest : public ::testing::Test {
-
-protected:
-  std::optional<detail::PluginPtr> &plugin =
-      pi::initializeAndGet(backend::ext_oneapi_hip);
-  pi_platform platform_;
-  pi_device device_;
-  pi_context context_;
-  pi_queue queue_;
-
-  void SetUp() override {
-    // skip the tests if the HIP backend is not available
-    if (!plugin.has_value()) {
-      GTEST_SKIP();
-    }
-
-    pi_uint32 numPlatforms = 0;
-    ASSERT_EQ(plugin->hasBackend(backend::ext_oneapi_hip), PI_SUCCESS);
-
-    ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piPlatformsGet>(
-                  0, nullptr, &numPlatforms)),
-              PI_SUCCESS)
-        << "piPlatformsGet failed.\n";
-
-    ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piPlatformsGet>(
-                  numPlatforms, &platform_, nullptr)),
-              PI_SUCCESS)
-        << "piPlatformsGet failed.\n";
-
-    ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piDevicesGet>(
-                  platform_, PI_DEVICE_TYPE_GPU, 1, &device_, nullptr)),
-              PI_SUCCESS);
-    ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piContextCreate>(
-                  nullptr, 1, &device_, nullptr, nullptr, &context_)),
-              PI_SUCCESS);
-    ASSERT_NE(context_, nullptr);
-
-    ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piQueueCreate>(
-                  context_, device_, 0, &queue_)),
-              PI_SUCCESS);
-    ASSERT_NE(queue_, nullptr);
-    ASSERT_EQ(queue_->get_context(), context_);
-  }
-
-  void TearDown() override {
-    if (plugin.has_value()) {
-      plugin->call<detail::PiApiKind::piDeviceRelease>(device_);
-      plugin->call<detail::PiApiKind::piQueueRelease>(queue_);
-      plugin->call<detail::PiApiKind::piContextRelease>(context_);
-    }
-  }
-
-  HipKernelsTest() = default;
-
-  ~HipKernelsTest() = default;
-};
diff --git a/sycl/unittests/pi/hip/test_mem_obj.cpp b/sycl/unittests/pi/hip/test_mem_obj.cpp
deleted file mode 100644
index 382a510bf97d9..0000000000000
--- a/sycl/unittests/pi/hip/test_mem_obj.cpp
+++ /dev/null
@@ -1,206 +0,0 @@
-//==---- test_mem_obj.cpp --- PI unit tests --------------------------------==//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "gtest/gtest.h"
-
-#include <hip/hip_runtime.h>
-
-#include "HipUtils.hpp"
-#include "TestGetPlugin.hpp"
-#include <detail/plugin.hpp>
-#include <pi_hip.hpp>
-#include <sycl/detail/pi.hpp>
-#include <sycl/sycl.hpp>
-
-using namespace sycl;
-
-struct HipTestMemObj : public ::testing::Test {
-
-protected:
-  std::optional<detail::PluginPtr> &plugin =
-      pi::initializeAndGet(backend::ext_oneapi_hip);
-
-  pi_platform platform_;
-  pi_device device_;
-  pi_context context_;
-
-  void SetUp() override {
-    // skip the tests if the HIP backend is not available
-    if (!plugin.has_value()) {
-      GTEST_SKIP();
-    }
-
-    pi::clearHipContext();
-    pi_uint32 numPlatforms = 0;
-    ASSERT_EQ(plugin->hasBackend(backend::ext_oneapi_hip), PI_SUCCESS);
-
-    ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piPlatformsGet>(
-                  0, nullptr, &numPlatforms)),
-              PI_SUCCESS)
-        << "piPlatformsGet failed.\n";
-
-    ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piPlatformsGet>(
-                  numPlatforms, &platform_, nullptr)),
-              PI_SUCCESS)
-        << "piPlatformsGet failed.\n";
-
-    ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piDevicesGet>(
-                  platform_, PI_DEVICE_TYPE_GPU, 1, &device_, nullptr)),
-              PI_SUCCESS);
-    ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piContextCreate>(
-                  nullptr, 1, &device_, nullptr, nullptr, &context_)),
-              PI_SUCCESS);
-    EXPECT_NE(context_, nullptr);
-  }
-
-  void TearDown() override {
-    if (plugin.has_value()) {
-      plugin->call<detail::PiApiKind::piDeviceRelease>(device_);
-      plugin->call<detail::PiApiKind::piContextRelease>(context_);
-    }
-  }
-
-  HipTestMemObj() = default;
-
-  ~HipTestMemObj() = default;
-};
-
-TEST_F(HipTestMemObj, piMemBufferCreateSimple) {
-  const size_t memSize = 1024u;
-  pi_mem memObj;
-  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piMemBufferCreate>(
-                context_, PI_MEM_FLAGS_ACCESS_RW, memSize, nullptr, &memObj,
-                nullptr)),
-            PI_SUCCESS);
-
-  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piMemRelease>(memObj)),
-            PI_SUCCESS);
-}
-
-TEST_F(HipTestMemObj, piMemBufferAllocHost) {
-  const size_t memSize = 1024u;
-  pi_mem memObj;
-  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piMemBufferCreate>(
-                context_, PI_MEM_FLAGS_ACCESS_RW | PI_MEM_FLAGS_HOST_PTR_ALLOC,
-                memSize, nullptr, &memObj, nullptr)),
-            PI_SUCCESS);
-
-  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piMemRelease>(memObj)),
-            PI_SUCCESS);
-}
-
-TEST_F(HipTestMemObj, piMemBufferCreateNoActiveContext) {
-  const size_t memSize = 1024u;
-  // Context has been destroyed
-
-  hipCtx_t current = nullptr;
-
-  // pop HIP contexts until there is not a HIP context bound to the thread
-  do {
-    hipCtx_t oldContext = nullptr;
-    auto hipErr = hipCtxPopCurrent(&oldContext);
-    EXPECT_EQ(hipErr, PI_SUCCESS);
-
-    // There should not be any active HIP context
-    hipErr = hipCtxGetCurrent(&current);
-    ASSERT_EQ(hipErr, PI_SUCCESS);
-  } while (current != nullptr);
-
-  // The context object is passed, even if its not active it should be used
-  // to allocate the memory object
-  pi_mem memObj;
-  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piMemBufferCreate>(
-                context_, PI_MEM_FLAGS_ACCESS_RW, memSize, nullptr, &memObj,
-                nullptr)),
-            PI_SUCCESS);
-  ASSERT_NE(memObj, nullptr);
-
-  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piMemRelease>(memObj)),
-            PI_SUCCESS);
-}
-
-TEST_F(HipTestMemObj, piMemBufferPinnedMappedRead) {
-  const size_t memSize = sizeof(int);
-  const int value = 20;
-
-  pi_queue queue;
-  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piQueueCreate>(
-                context_, device_, 0, &queue)),
-            PI_SUCCESS);
-  ASSERT_NE(queue, nullptr);
-  ASSERT_EQ(queue->get_context(), context_);
-
-  pi_mem memObj;
-  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piMemBufferCreate>(
-                context_, PI_MEM_FLAGS_ACCESS_RW | PI_MEM_FLAGS_HOST_PTR_ALLOC,
-                memSize, nullptr, &memObj, nullptr)),
-            PI_SUCCESS);
-
-  ASSERT_EQ(
-      (plugin->call_nocheck<detail::PiApiKind::piEnqueueMemBufferWrite>(
-          queue, memObj, true, 0, sizeof(int), &value, 0, nullptr, nullptr)),
-      PI_SUCCESS);
-
-  int *host_ptr = nullptr;
-  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piEnqueueMemBufferMap>(
-                queue, memObj, true, PI_MAP_READ, 0, sizeof(int), 0, nullptr,
-                nullptr, (void **)&host_ptr)),
-            PI_SUCCESS);
-
-  ASSERT_EQ(*host_ptr, value);
-
-  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piEnqueueMemUnmap>(
-                queue, memObj, host_ptr, 0, nullptr, nullptr)),
-            PI_SUCCESS);
-
-  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piMemRelease>(memObj)),
-            PI_SUCCESS);
-  plugin->call<detail::PiApiKind::piQueueRelease>(queue);
-}
-
-TEST_F(HipTestMemObj, piMemBufferPinnedMappedWrite) {
-  const size_t memSize = sizeof(int);
-  const int value = 30;
-
-  pi_queue queue;
-  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piQueueCreate>(
-                context_, device_, 0, &queue)),
-            PI_SUCCESS);
-  ASSERT_NE(queue, nullptr);
-  ASSERT_EQ(queue->get_context(), context_);
-
-  pi_mem memObj;
-  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piMemBufferCreate>(
-                context_, PI_MEM_FLAGS_ACCESS_RW | PI_MEM_FLAGS_HOST_PTR_ALLOC,
-                memSize, nullptr, &memObj, nullptr)),
-            PI_SUCCESS);
-
-  int *host_ptr = nullptr;
-  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piEnqueueMemBufferMap>(
-                queue, memObj, true, PI_MAP_WRITE, 0, sizeof(int), 0, nullptr,
-                nullptr, (void **)&host_ptr)),
-            PI_SUCCESS);
-
-  *host_ptr = value;
-
-  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piEnqueueMemUnmap>(
-                queue, memObj, host_ptr, 0, nullptr, nullptr)),
-            PI_SUCCESS);
-
-  int read_value = 0;
-  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piEnqueueMemBufferRead>(
-                queue, memObj, true, 0, sizeof(int), &read_value, 0, nullptr,
-                nullptr)),
-            PI_SUCCESS);
-
-  ASSERT_EQ(read_value, value);
-
-  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piMemRelease>(memObj)),
-            PI_SUCCESS);
-  plugin->call<detail::PiApiKind::piQueueRelease>(queue);
-}
diff --git a/sycl/unittests/pi/hip/test_primary_context.cpp b/sycl/unittests/pi/hip/test_primary_context.cpp
deleted file mode 100644
index 1ed80159d32d9..0000000000000
--- a/sycl/unittests/pi/hip/test_primary_context.cpp
+++ /dev/null
@@ -1,85 +0,0 @@
-//==---------- test_primary_context.cpp - PI unit tests --------------------==//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "gtest/gtest.h"
-
-#include <hip/hip_runtime.h>
-
-#include "TestGetPlatforms.hpp"
-#include <pi_hip.hpp>
-#include <sycl/ext/oneapi/backend/hip.hpp>
-#include <sycl/sycl.hpp>
-
-#include <iostream>
-
-using namespace sycl;
-
-struct HipPrimaryContextTests : public ::testing::TestWithParam<platform> {
-
-protected:
-  device deviceA_;
-  device deviceB_;
-
-  void SetUp() override {
-    std::vector<device> HipDevices = GetParam().get_devices();
-
-    deviceA_ = HipDevices[0];
-    deviceB_ = HipDevices.size() > 1 ? HipDevices[1] : deviceA_;
-  }
-
-  void TearDown() override {}
-};
-
-TEST_P(HipPrimaryContextTests, piSingleContext) {
-  std::cout << "create single context" << std::endl;
-  context Context(deviceA_, async_handler{});
-
-  hipDevice_t HipDevice = get_native<backend::ext_oneapi_hip>(deviceA_);
-  hipCtx_t HipContext = get_native<backend::ext_oneapi_hip>(Context);
-
-  hipCtx_t PrimaryHipContext;
-  hipDevicePrimaryCtxRetain(&PrimaryHipContext, HipDevice);
-
-  ASSERT_EQ(HipContext, PrimaryHipContext);
-
-  hipDevicePrimaryCtxRelease(HipDevice);
-}
-
-TEST_P(HipPrimaryContextTests, piMultiContextSingleDevice) {
-  std::cout << "create multiple contexts for one device" << std::endl;
-  context ContextA(deviceA_, async_handler{});
-  context ContextB(deviceA_, async_handler{});
-
-  hipCtx_t HipContextA = get_native<backend::ext_oneapi_hip>(ContextA);
-  hipCtx_t HipContextB = get_native<backend::ext_oneapi_hip>(ContextB);
-
-  ASSERT_EQ(HipContextA, HipContextB);
-}
-
-TEST_P(HipPrimaryContextTests, piMultiContextMultiDevice) {
-  if (deviceA_ == deviceB_)
-    return;
-
-  hipDevice_t HipDeviceA = get_native<backend::ext_oneapi_hip>(deviceA_);
-  hipDevice_t HipDeviceB = get_native<backend::ext_oneapi_hip>(deviceB_);
-
-  ASSERT_NE(HipDeviceA, HipDeviceB);
-
-  std::cout << "create multiple contexts for multiple devices" << std::endl;
-  context ContextA(deviceA_, async_handler{});
-  context ContextB(deviceB_, async_handler{});
-
-  hipCtx_t HipContextA = get_native<backend::ext_oneapi_hip>(ContextA);
-  hipCtx_t HipContextB = get_native<backend::ext_oneapi_hip>(ContextB);
-
-  ASSERT_NE(HipContextA, HipContextB);
-}
-
-INSTANTIATE_TEST_SUITE_P(
-    OnHipPlatform, HipPrimaryContextTests,
-    ::testing::ValuesIn(pi::getPlatformsWithName("HIP BACKEND")));
diff --git a/sycl/unittests/pi/hip/test_sampler_properties.cpp b/sycl/unittests/pi/hip/test_sampler_properties.cpp
deleted file mode 100644
index bed0bb9b053a4..0000000000000
--- a/sycl/unittests/pi/hip/test_sampler_properties.cpp
+++ /dev/null
@@ -1,135 +0,0 @@
-//==---- test_sampler_properties.cpp --- PI unit tests ---------------------==//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "gtest/gtest.h"
-
-#include "TestGetPlugin.hpp"
-#include <detail/plugin.hpp>
-#include <sycl/detail/pi.hpp>
-#include <sycl/sycl.hpp>
-
-#include <vector>
-
-namespace {
-
-using namespace sycl;
-
-class SamplerPropertiesTest
-    : public ::testing::TestWithParam<std::tuple<
-          pi_bool, pi_sampler_filter_mode, pi_sampler_addressing_mode>> {
-protected:
-  std::optional<detail::PluginPtr> &plugin =
-      pi::initializeAndGet(backend::ext_oneapi_hip);
-
-  pi_platform platform_;
-  pi_device device_;
-  pi_context context_;
-  pi_sampler sampler_;
-
-  pi_bool normalizedCoords_;
-  pi_sampler_filter_mode filterMode_;
-  pi_sampler_addressing_mode addressMode_;
-
-  SamplerPropertiesTest() = default;
-
-  ~SamplerPropertiesTest() override = default;
-
-  void SetUp() override {
-    // skip the tests if the HIP backend is not available
-    if (!plugin.has_value()) {
-      GTEST_SKIP();
-    }
-
-    std::tie(normalizedCoords_, filterMode_, addressMode_) = GetParam();
-
-    pi_uint32 numPlatforms = 0;
-    ASSERT_EQ(plugin->hasBackend(backend::ext_oneapi_hip), PI_SUCCESS);
-
-    ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piPlatformsGet>(
-                  0, nullptr, &numPlatforms)),
-              PI_SUCCESS)
-        << "piPlatformsGet failed.\n";
-
-    ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piPlatformsGet>(
-                  numPlatforms, &platform_, nullptr)),
-              PI_SUCCESS)
-        << "piPlatformsGet failed.\n";
-
-    ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piDevicesGet>(
-                  platform_, PI_DEVICE_TYPE_GPU, 1, &device_, nullptr)),
-              PI_SUCCESS);
-    ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piContextCreate>(
-                  nullptr, 1, &device_, nullptr, nullptr, &context_)),
-              PI_SUCCESS);
-    EXPECT_NE(context_, nullptr);
-
-    pi_sampler_properties sampler_properties[] = {
-        PI_SAMPLER_PROPERTIES_NORMALIZED_COORDS,
-        static_cast<pi_sampler_properties>(normalizedCoords_),
-        PI_SAMPLER_PROPERTIES_ADDRESSING_MODE,
-        static_cast<pi_sampler_properties>(addressMode_),
-        PI_SAMPLER_PROPERTIES_FILTER_MODE,
-        static_cast<pi_sampler_properties>(filterMode_),
-        0};
-
-    ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piSamplerCreate>(
-                  context_, sampler_properties, &sampler_)),
-              PI_SUCCESS);
-  }
-
-  void TearDown() override {
-    if (plugin.has_value()) {
-      plugin->call<detail::PiApiKind::piSamplerRelease>(sampler_);
-      plugin->call<detail::PiApiKind::piDeviceRelease>(device_);
-      plugin->call<detail::PiApiKind::piContextRelease>(context_);
-    }
-  }
-};
-
-TEST_P(SamplerPropertiesTest, piCheckNormalizedCoords) {
-  pi_bool actualNormalizedCoords = !normalizedCoords_;
-
-  plugin->call<detail::PiApiKind::piSamplerGetInfo>(
-      sampler_, PI_SAMPLER_INFO_NORMALIZED_COORDS, sizeof(pi_bool),
-      &actualNormalizedCoords, nullptr);
-
-  ASSERT_EQ(actualNormalizedCoords, normalizedCoords_);
-}
-
-TEST_P(SamplerPropertiesTest, piCheckFilterMode) {
-  pi_sampler_filter_mode actualFilterMode;
-
-  plugin->call<detail::PiApiKind::piSamplerGetInfo>(
-      sampler_, PI_SAMPLER_INFO_FILTER_MODE, sizeof(pi_sampler_filter_mode),
-      &actualFilterMode, nullptr);
-
-  ASSERT_EQ(actualFilterMode, filterMode_);
-}
-
-TEST_P(SamplerPropertiesTest, piCheckAddressingMode) {
-  pi_sampler_addressing_mode actualAddressMode;
-
-  plugin->call<detail::PiApiKind::piSamplerGetInfo>(
-      sampler_, PI_SAMPLER_INFO_ADDRESSING_MODE,
-      sizeof(pi_sampler_addressing_mode), &actualAddressMode, nullptr);
-
-  ASSERT_EQ(actualAddressMode, addressMode_);
-}
-
-INSTANTIATE_TEST_SUITE_P(
-    SamplerPropertiesTestImpl, SamplerPropertiesTest,
-    ::testing::Combine(
-        ::testing::Values(PI_TRUE, PI_FALSE),
-        ::testing::Values(PI_SAMPLER_FILTER_MODE_LINEAR,
-                          PI_SAMPLER_FILTER_MODE_NEAREST),
-        ::testing::Values(PI_SAMPLER_ADDRESSING_MODE_CLAMP,
-                          PI_SAMPLER_ADDRESSING_MODE_CLAMP_TO_EDGE,
-                          PI_SAMPLER_ADDRESSING_MODE_NONE,
-                          PI_SAMPLER_ADDRESSING_MODE_MIRRORED_REPEAT,
-                          PI_SAMPLER_ADDRESSING_MODE_REPEAT)));
-} // namespace

From ab86d0dbac549aaff2dda5dfb26bedb961832caa Mon Sep 17 00:00:00 2001
From: Georgi Mirazchiyski <georgi.mirazchiyski@codeplay.com>
Date: Mon, 5 Feb 2024 13:39:57 +0000
Subject: [PATCH 8/9] [SYCL][HIP] Add coarse-grained memory advice for HIP on
 AMD (#12394)

Enables and tests coarse grained memory access via the memadvise
implementation for HIP platforms on AMD hardware.

See related UR changes for the adapter implementation:
https://github.com/oneapi-src/unified-runtime/pull/1249

---------

Co-authored-by: Kenneth Benzie (Benie) <k.benzie@codeplay.com>
Co-authored-by: aarongreig <aarongreig01@gmail.com>
---
 sycl/include/sycl/detail/pi.h               |   5 +-
 sycl/plugins/unified_runtime/CMakeLists.txt |  14 +-
 sycl/plugins/unified_runtime/pi2ur.hpp      |   6 +
 sycl/test-e2e/USM/memadvise_flags.cpp       |   2 +
 sycl/test-e2e/USM/memory_coherency_hip.cpp  | 147 ++++++++++++++++++++
 5 files changed, 166 insertions(+), 8 deletions(-)
 create mode 100644 sycl/test-e2e/USM/memory_coherency_hip.cpp

diff --git a/sycl/include/sycl/detail/pi.h b/sycl/include/sycl/detail/pi.h
index 31c6a86f3300d..0c02a7705b51d 100644
--- a/sycl/include/sycl/detail/pi.h
+++ b/sycl/include/sycl/detail/pi.h
@@ -151,9 +151,10 @@
 // 14.42 Added piextCommandBufferPrefetchUSM and piextCommandBufferAdviseUSM
 // 15.43 Changed the signature of piextMemGetNativeHandle to also take a
 // pi_device
+// 15.44 Add coarse-grain memory advice flag for HIP.
 
 #define _PI_H_VERSION_MAJOR 15
-#define _PI_H_VERSION_MINOR 43
+#define _PI_H_VERSION_MINOR 44
 
 #define _PI_STRING_HELPER(a) #a
 #define _PI_CONCAT(a, b) _PI_STRING_HELPER(a.b)
@@ -582,6 +583,8 @@ typedef enum {
   PI_MEM_ADVICE_CUDA_UNSET_PREFERRED_LOCATION_HOST = 1 << 7,
   PI_MEM_ADVICE_CUDA_SET_ACCESSED_BY_HOST = 1 << 8,
   PI_MEM_ADVICE_CUDA_UNSET_ACCESSED_BY_HOST = 1 << 9,
+  PI_MEM_ADVICE_HIP_SET_COARSE_GRAINED = 1 << 10,
+  PI_MEM_ADVICE_HIP_UNSET_COARSE_GRAINED = 1 << 11,
   PI_MEM_ADVICE_UNKNOWN = 0x7FFFFFFF,
 } _pi_mem_advice;
 
diff --git a/sycl/plugins/unified_runtime/CMakeLists.txt b/sycl/plugins/unified_runtime/CMakeLists.txt
index def19a4151124..84571b19ef471 100644
--- a/sycl/plugins/unified_runtime/CMakeLists.txt
+++ b/sycl/plugins/unified_runtime/CMakeLists.txt
@@ -56,14 +56,14 @@ endif()
 if(SYCL_PI_UR_USE_FETCH_CONTENT)
   include(FetchContent)
 
-  set(UNIFIED_RUNTIME_REPO "https://github.com/oneapi-src/unified-runtime")
-  # commit 9363574db721d2388c7d76a10edb128764872352
-  # Merge: 553a6b82 5e513738
+  set(UNIFIED_RUNTIME_REPO "https://github.com/oneapi-src/unified-runtime.git")
+  # commit cd97e172cbbfc411fccb0b80e0fff6f9126574f4
+  # Merge: bd745d10 2a9ded6f
   # Author: Kenneth Benzie (Benie) <k.benzie@codeplay.com>
-  # Date:   Thu Feb 1 11:50:16 2024 +0000
-  #     Merge pull request #1302 from kbenzie/benie/cl-binary-type-intermediate
-  #     [CL] Handle INTERMEDIATE binary type
-  set(UNIFIED_RUNTIME_TAG 9363574db721d2388c7d76a10edb128764872352)
+  # Date:   Fri Feb 2 14:24:16 2024 +0000
+  #     Merge pull request #1249 from GeorgeWeb/georgi/hip_memadvise_coarse_grained
+  #     [HIP] Implement coarse-grained memory advice for the HIP adapter
+  set(UNIFIED_RUNTIME_TAG cd97e172cbbfc411fccb0b80e0fff6f9126574f4)
 
   if(SYCL_PI_UR_OVERRIDE_FETCH_CONTENT_REPO)
     set(UNIFIED_RUNTIME_REPO "${SYCL_PI_UR_OVERRIDE_FETCH_CONTENT_REPO}")
diff --git a/sycl/plugins/unified_runtime/pi2ur.hpp b/sycl/plugins/unified_runtime/pi2ur.hpp
index c19c93a6af53a..4f0c7a8a1f885 100644
--- a/sycl/plugins/unified_runtime/pi2ur.hpp
+++ b/sycl/plugins/unified_runtime/pi2ur.hpp
@@ -3375,6 +3375,12 @@ inline pi_result piextUSMEnqueueMemAdvise(pi_queue Queue, const void *Ptr,
   if (Advice & PI_MEM_ADVICE_CUDA_UNSET_ACCESSED_BY_HOST) {
     UrAdvice |= UR_USM_ADVICE_FLAG_CLEAR_ACCESSED_BY_HOST;
   }
+  if (Advice & PI_MEM_ADVICE_HIP_SET_COARSE_GRAINED) {
+    UrAdvice |= UR_USM_ADVICE_FLAG_SET_NON_COHERENT_MEMORY;
+  }
+  if (Advice & PI_MEM_ADVICE_HIP_UNSET_COARSE_GRAINED) {
+    UrAdvice |= UR_USM_ADVICE_FLAG_CLEAR_NON_COHERENT_MEMORY;
+  }
   if (Advice & PI_MEM_ADVICE_RESET) {
     UrAdvice |= UR_USM_ADVICE_FLAG_DEFAULT;
   }
diff --git a/sycl/test-e2e/USM/memadvise_flags.cpp b/sycl/test-e2e/USM/memadvise_flags.cpp
index 542ee7a1ea7a2..d7c28a71b5ca4 100755
--- a/sycl/test-e2e/USM/memadvise_flags.cpp
+++ b/sycl/test-e2e/USM/memadvise_flags.cpp
@@ -59,6 +59,8 @@ int main() {
     valid_advices.emplace_back(PI_MEM_ADVICE_HIP_UNSET_PREFERRED_LOCATION_HOST);
     valid_advices.emplace_back(PI_MEM_ADVICE_HIP_SET_ACCESSED_BY_HOST);
     valid_advices.emplace_back(PI_MEM_ADVICE_HIP_UNSET_ACCESSED_BY_HOST);
+    valid_advices.emplace_back(PI_MEM_ADVICE_HIP_SET_COARSE_GRAINED);
+    valid_advices.emplace_back(PI_MEM_ADVICE_HIP_UNSET_COARSE_GRAINED);
   } else {
     // Skip
     return 0;
diff --git a/sycl/test-e2e/USM/memory_coherency_hip.cpp b/sycl/test-e2e/USM/memory_coherency_hip.cpp
new file mode 100644
index 0000000000000..dec0182812c70
--- /dev/null
+++ b/sycl/test-e2e/USM/memory_coherency_hip.cpp
@@ -0,0 +1,147 @@
+// RUN: %{build} -o %t1.out
+// REQUIRES: hip_amd
+// RUN: %{run} %t1.out
+
+//==---- memory_coherency_hip.cpp  -----------------------------------------==//
+// USM coarse/fine grain memory coherency test for the HIP-AMD backend.
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <sycl/sycl.hpp>
+
+#include <chrono>
+#include <iostream>
+#include <thread>
+
+namespace kernels {
+class SquareKrnl final {
+  int *mPtr;
+
+public:
+  SquareKrnl(int *ptr) : mPtr{ptr} {}
+
+  void operator()(sycl::id<1>) const { *mPtr = (*mPtr) * (*mPtr); }
+};
+
+class CoherencyTestKrnl final {
+  int *mPtr;
+
+public:
+  CoherencyTestKrnl(int *ptr) : mPtr{ptr} {}
+
+  void operator()(sycl::id<1>) const {
+    auto atm = sycl::atomic_ref<int, sycl::memory_order::relaxed,
+                                sycl::memory_scope::device>(mPtr[0]);
+
+    // mPtr was initialized to 1 by the host, now set it to 2.
+    atm.fetch_add(1);
+
+    // spin until mPtr is 3, then change it to 4.
+    int expected{3};
+    int old = atm.load();
+    while (true) {
+      old = atm.load();
+      if (old == expected) {
+        if (atm.compare_exchange_strong(old, 4)) {
+          break;
+        }
+      }
+    }
+  }
+};
+} // namespace kernels
+
+int main() {
+  sycl::queue q{};
+  sycl::device dev = q.get_device();
+  sycl::context ctx = q.get_context();
+  if (!dev.get_info<sycl::info::device::usm_shared_allocations>()) {
+    std::cout << "Shared USM is not supported. Skipping test.\n";
+    return 0;
+  }
+
+  bool coherent{false};
+
+  int *ptr = sycl::malloc_shared<int>(1, q);
+
+  // Coherency test 1
+  //
+  // The following test validates if memory access is fine with memory allocated
+  // using malloc_managed() and COARSE_GRAINED advice set via mem_advise().
+  //
+  // Coarse grained memory is only guaranteed to be coherent outside of GPU
+  // kernels that modify it. Changes applied to coarse-grained memory by a GPU
+  // kernel are only visible to the rest of the system (CPU or other GPUs) when
+  // the kernel has completed. A GPU kernel is only guaranteed to see changes
+  // applied to coarse grained memory by the rest of the system (CPU or other
+  // GPUs) if those changes were made before the kernel launched.
+
+  // Hint to use coarse-grain memory.
+  q.mem_advise(ptr, sizeof(int), int{PI_MEM_ADVICE_HIP_SET_COARSE_GRAINED});
+
+  int init_val{9};
+  int expected{init_val * init_val};
+
+  *ptr = init_val;
+  q.parallel_for(sycl::range{1}, kernels::SquareKrnl{ptr});
+  // Synchronise the underlying stream.
+  q.wait();
+
+  // Check if caches are flushed correctly and same memory is between devices.
+  if (*ptr == expected) {
+    coherent = true;
+  } else {
+    std::cerr << "Coherency test failed. Value: " << *ptr
+              << " (expected: " << expected << ")\n";
+    coherent = false;
+  }
+
+  // Coherency test 2
+  //
+  // The following test validates if fine-grain behavior is observed or not with
+  // memory allocated using malloc_managed().
+  //
+  // Fine grained memory allows CPUs and GPUs to synchronize (via atomics) and
+  // coherently communicate with each other while the GPU kernel is running.
+
+  // Hint to use fine-grain memory.
+  q.mem_advise(ptr, sizeof(int), int{PI_MEM_ADVICE_HIP_UNSET_COARSE_GRAINED});
+
+  init_val = 1;
+  expected = 4;
+
+  *ptr = init_val;
+  q.parallel_for(sycl::range{1}, kernels::CoherencyTestKrnl{ptr});
+
+  // wait until ptr is 2 from the kernel (or 3 seconds), then increment to 3.
+  while (*ptr == 2) {
+    using std::chrono_literals::operator""s;
+    std::this_thread::sleep_for(3s);
+    break;
+  }
+  *ptr += 1;
+
+  // Synchronise the underlying stream.
+  q.wait();
+
+  // Check if caches are flushed correctly and same memory is between devices.
+  if (*ptr == expected) {
+    coherent &= true;
+  } else {
+    std::cerr << "Coherency test failed. Value: " << *ptr
+              << " (expected: " << expected << ")\n";
+    coherent = false;
+  }
+
+  // Cleanup
+  sycl::free(ptr, q);
+
+  // Check if all coherency tests passed.
+  assert(coherent);
+  // The above assert won't trigger with NDEBUG, so ensure the right exit code.
+  return coherent ? 0 : 1;
+}

From f7bdae8d04d1881d265602134e8f73ed326cd702 Mon Sep 17 00:00:00 2001
From: jinge90 <ge.jin@intel.com>
Date: Mon, 5 Feb 2024 22:54:18 +0800
Subject: [PATCH 9/9] [SYCL][Driver] Enhance -Xarch_device and -Xarch_host for
 sycl (#12478)

Add simple support to enhance -Xarch_device and -Xarch_host in sycl
offloading which can support multiple arguments in single '-Xarch_*' and
'-mllvm <opt>' is supported as well.

---------

Signed-off-by: jinge90 <ge.jin@intel.com>
---
 clang/lib/Driver/Compilation.cpp |  57 +++++++++++
 clang/test/Driver/sycl-xarch.cpp | 162 +++++++++++++++++++++++++++++++
 2 files changed, 219 insertions(+)
 create mode 100644 clang/test/Driver/sycl-xarch.cpp

diff --git a/clang/lib/Driver/Compilation.cpp b/clang/lib/Driver/Compilation.cpp
index e3c6c2d2ee34f..6c728f316fc89 100644
--- a/clang/lib/Driver/Compilation.cpp
+++ b/clang/lib/Driver/Compilation.cpp
@@ -60,6 +60,61 @@ Compilation::~Compilation() {
       delete Arg.second;
 }
 
+static void HandleXarchArgs(DerivedArgList *OffloadArgList, const Driver &D,
+                            bool IsDevice) {
+  if (!OffloadArgList)
+    return;
+
+  if (IsDevice && !OffloadArgList->hasArg(options::OPT_Xarch_device))
+    return;
+
+  if (!IsDevice && !OffloadArgList->hasArg(options::OPT_Xarch_host))
+    return;
+
+  bool NeedHandle = false;
+  std::vector<std::string> XarchValues;
+  XarchValues = IsDevice
+                    ? OffloadArgList->getAllArgValues(options::OPT_Xarch_device)
+                    : OffloadArgList->getAllArgValues(options::OPT_Xarch_host);
+  SmallVector<StringRef, 8> XarchValueRefs;
+  for (auto XarchV : XarchValues) {
+    if (XarchV.find(' ') != std::string::npos) {
+      NeedHandle = true;
+      StringRef XarchVRef(XarchV);
+      SmallVector<StringRef, 8> XarchVecs;
+      XarchVRef.trim().split(XarchVecs, ' ', -1, false);
+      size_t Index;
+      const size_t XSize = XarchVecs.size();
+      for (Index = 0; Index < XSize; ++Index) {
+        if (XarchVecs[Index].compare("-mllvm") == 0) {
+          if (Index < (XSize - 1)) {
+            XarchValueRefs.push_back(OffloadArgList->MakeArgStringRef(
+                (StringRef("-mllvm=") + XarchVecs[Index + 1]).str()));
+            Index++;
+            continue;
+          } else
+            D.Diag(clang::diag::err_drv_missing_argument) << "-mllvm" << 1;
+        } else
+          XarchValueRefs.push_back(
+              OffloadArgList->MakeArgStringRef(XarchVecs[Index]));
+      }
+    } else
+      XarchValueRefs.push_back(StringRef(XarchV));
+  }
+
+  if (NeedHandle) {
+    auto Xarch_OPT =
+        IsDevice ? options::OPT_Xarch_device : options::OPT_Xarch_host;
+    OffloadArgList->eraseArg(Xarch_OPT);
+    for (auto XarchV : XarchValueRefs) {
+      Arg *A = OffloadArgList->MakeSeparateArg(
+          nullptr, D.getOpts().getOption(Xarch_OPT), XarchV);
+      A->claim();
+      OffloadArgList->append(A);
+    }
+  }
+}
+
 const DerivedArgList &
 Compilation::getArgsForToolChain(const ToolChain *TC, StringRef BoundArch,
                                  Action::OffloadKind DeviceOffloadKind) {
@@ -82,9 +137,11 @@ Compilation::getArgsForToolChain(const ToolChain *TC, StringRef BoundArch,
 
     DerivedArgList *NewDAL = nullptr;
     if (!OffloadArgs) {
+      HandleXarchArgs(TranslatedArgs, getDriver(), false);
       NewDAL = TC->TranslateXarchArgs(*TranslatedArgs, BoundArch,
                                       DeviceOffloadKind, &AllocatedArgs);
     } else {
+      HandleXarchArgs(OffloadArgs, getDriver(), true);
       NewDAL = TC->TranslateXarchArgs(*OffloadArgs, BoundArch, DeviceOffloadKind,
                                       &AllocatedArgs);
       if (!NewDAL)
diff --git a/clang/test/Driver/sycl-xarch.cpp b/clang/test/Driver/sycl-xarch.cpp
new file mode 100644
index 0000000000000..200a8de643170
--- /dev/null
+++ b/clang/test/Driver/sycl-xarch.cpp
@@ -0,0 +1,162 @@
+///
+/// Perform several driver tests for SYCL -Xarch_device/host on Linux
+///
+
+// UNSUPPORTED: system-windows
+
+/// ###########################################################################
+
+/// test behavior of -Xarch_device with 1 option for SYCL compiler, the flag
+/// should be passed to device compilation only.
+// RUN: %clangxx -fsycl %s -Xarch_device -fsanitize=address -### 2>&1 \
+// RUN:   | FileCheck %s -check-prefix=SYCL_XARCH_DEVICE_OPTION
+// RUN: %clangxx -fsycl %s -Xarch_device -fsanitize=address -### 2>&1 \
+// RUN:   | FileCheck %s -check-prefix=SYCL_XARCH_DEVICE_ONLY
+// SYCL_XARCH_DEVICE_OPTION: clang{{.*}} "-fsycl-is-device"
+// SYCL_XARCH_DEVICE_OPTION-SAME: -fsanitize=address
+// SYCL_XARCH_DEVICE_OPTION-SAME: -fsanitize-address-use-after-return=never
+// SYCL_XARCH_DEVICE_OPTION-SAME: -fno-sanitize-address-use-after-scope
+// SYCL_XARCH_DEVICE_OPTION-SAME: "-mllvm" "-asan-instrumentation-with-call-threshold=0"
+// SYCL_XARCH_DEVICE_OPTION-SAME: "-mllvm" "-asan-stack=0"
+// SYCL_XARCH_DEVICE_OPTION-SAME: "-mllvm" "-asan-globals=0"
+// SYCL_XARCH_DEVICE_ONLY: llc{{.*}} "-filetype=obj"
+// SYCL_XARCH_DEVICE_ONLY-NOT: fsanitize=address
+
+/// test behavior of -Xarch_device with multiple options for SYCL compiler, the
+/// flags should be passed to device compilation only.
+// RUN: %clangxx -fsycl %s -Xarch_device "-fsanitize=address -DXARCH_DEVICE_TEST -mllvm -enable-merge-functions" -### 2>&1 \
+// RUN:   | FileCheck %s -check-prefix=SYCL_XARCH_DEVICE_OPTIONS1
+// RUN: %clangxx -fsycl %s -Xarch_device "-fsanitize=address -DXARCH_DEVICE_TEST -mllvm -enable-merge-functions" -### 2>&1 \
+// RUN:   | FileCheck %s -check-prefix=SYCL_XARCH_DEVICE_OPTIONS1
+// RUN: %clangxx -fsycl %s -Xarch_device "-fsanitize=address -DXARCH_DEVICE_TEST -mllvm -enable-merge-functions" -### 2>&1 \
+// RUN:   | FileCheck %s -check-prefix=SYCL_XARCH_DEVICE_OPTIONS2
+// RUN: %clangxx -fsycl %s -Xarch_device "-fsanitize=address -DXARCH_DEVICE_TEST -mllvm -enable-merge-functions" -### 2>&1 \
+// RUN:   | FileCheck %s -check-prefix=SYCL_XARCH_DEVICE_OPTIONS3
+// SYCL_XARCH_DEVICE_OPTIONS1: clang{{.*}} "-fsycl-is-device"
+// SYCL_XARCH_DEVICE_OPTIONS1-SAME: -fsanitize=address
+// SYCL_XARCH_DEVICE_OPTIONS1-SAME: -fsanitize-address-use-after-return=never
+// SYCL_XARCH_DEVICE_OPTIONS1-SAME: -fno-sanitize-address-use-after-scope
+// SYCL_XARCH_DEVICE_OPTIONS1-SAME: "-mllvm" "-asan-instrumentation-with-call-threshold=0"
+// SYCL_XARCH_DEVICE_OPTIONS1-SAME: "-mllvm" "-asan-stack=0"
+// SYCL_XARCH_DEVICE_OPTIONS1-SAME: "-mllvm" "-asan-globals=0"
+// SYCL_XARCH_DEVICE_OPTIONS2: clang{{.*}} "-fsycl-is-device"
+// SYCL_XARCH_DEVICE_OPTIONS2-SAME: XARCH_DEVICE_TEST
+// SYCL_XARCH_DEVICE_OPTIONS3: clang{{.*}} "-fsycl-is-device"
+// SYCL_XARCH_DEVICE_OPTIONS3-SAME: "-mllvm" "-enable-merge-functions"
+
+
+/// test behavior of -Xarch_host with 1 option for SYCL compiler, the flag
+/// should be passed to host compilation only.
+// RUN: %clangxx -fsycl %s -Xarch_host -fsanitize=address -### 2>&1 \
+// RUN:   | FileCheck %s -check-prefix=SYCL_XARCH_HOST_OPTION
+// RUN: %clangxx -fsycl %s -Xarch_host -fsanitize=address -### 2>&1 \
+// RUN:   | FileCheck %s -check-prefix=SYCL_XARCH_HOST_ONLY
+// SYCL_XARCH_HOST_OPTION: clang{{.*}} "-fsycl-is-host"
+// SYCL_XARCH_HOST_OPTION-SAME: -fsanitize=address
+// SYCL_XARCH_HOST_OPTION-SAME: -fsanitize-address-use-after-scope
+// SYCL_XARCH_HOST_OPTION-NEXT: libclang_rt.asan
+// SYCL_XARCH_HOST_ONLY: clang{{.*}} "-fsycl-is-device"
+// SYCL_XARCH_HOST_ONLY-NOT: -fsanitize=address
+// SYCL_XARCH_HOST_ONLY: clang{{.*}} "-fsycl-is-host"
+
+/// test behavior of -Xarch_host with multiple options for SYCL compiler, the
+/// flags should be passed to host compilation only.
+// RUN: %clangxx -fsycl %s -Xarch_host "-fsanitize=address -DXARCH_HOST_TEST -mllvm -enable-merge-functions" -### 2>&1 \
+// RUN:   | FileCheck %s -check-prefix=SYCL_XARCH_HOST_OPTIONS1
+// RUN: %clangxx -fsycl %s -Xarch_host "-fsanitize=address -DXARCH_HOST_TEST -mllvm -enable-merge-functions" -### 2>&1 \
+// RUN:   | FileCheck %s -check-prefix=SYCL_XARCH_HOST_OPTIONS2
+// RUN: %clangxx -fsycl %s -Xarch_host "-fsanitize=address -DXARCH_HOST_TEST -mllvm -enable-merge-functions" -### 2>&1 \
+// RUN:   | FileCheck %s -check-prefix=SYCL_XARCH_HOST_OPTIONS3
+// SYCL_XARCH_HOST_OPTIONS1: clang{{.*}} "-fsycl-is-host"
+// SYCL_XARCH_HOST_OPTIONS1-SAME: -fsanitize=address
+// SYCL_XARCH_HOST_OPTIONS1-SAME: -fsanitize-address-use-after-scope
+// SYCL_XARCH_HOST_OPTIONS2: clang{{.*}} "-fsycl-is-host"
+// SYCL_XARCH_HOST_OPTIONS2-SAME: XARCH_HOST_TEST
+// SYCL_XARCH_HOST_OPTIONS3: clang{{.*}} "-fsycl-is-host"
+// SYCL_XARCH_HOST_OPTIONS3-SAME: "-mllvm" "-enable-merge-functions"
+
+// test behavior of combination of -Xarch_device and -Xarch_device.
+// RUN: %clangxx -fsycl %s -Xarch_device "-fsanitize=address -mllvm -enable-merge-functions" \
+// RUN:   -Xarch_host "-fsanitize=memory -DUSE_XARCH_HOST -fno-builtin" -### 2>&1 \
+// RUN:   | FileCheck %s -check-prefix=SYCL_XARCH_COM_DEVICE_OPTIONS1
+// RUN: %clangxx -fsycl %s -Xarch_device "-fsanitize=address -mllvm -enable-merge-functions" \
+// RUN:   -Xarch_host "-fsanitize=memory -DUSE_XARCH_HOST -fno-builtin" -### 2>&1 \
+// RUN:   | FileCheck %s -check-prefix=SYCL_XARCH_COM_DEVICE_OPTIONS2
+// RUN: %clangxx -fsycl %s -Xarch_device "-fsanitize=address -mllvm -enable-merge-functions" \
+// RUN:   -Xarch_host "-fsanitize=memory -DUSE_XARCH_HOST -fno-builtin" -### 2>&1 \
+// RUN:   | FileCheck %s -check-prefix=SYCL_XARCH_COM_NO_DEVICE
+// RUN: %clangxx -fsycl %s -Xarch_device "-fsanitize=address -mllvm -enable-merge-functions" \
+// RUN:   -Xarch_host "-fsanitize=memory -DUSE_XARCH_HOST -fno-builtin" -### 2>&1 \
+// RUN:   | FileCheck %s -check-prefix=SYCL_XARCH_COM_HOST_OPTIONS1
+// RUN: %clangxx -fsycl %s -Xarch_device "-fsanitize=address -mllvm -enable-merge-functions" \
+// RUN:   -Xarch_host "-fsanitize=memory -DUSE_XARCH_HOST -fno-builtin" -### 2>&1 \
+// RUN:   | FileCheck %s -check-prefix=SYCL_XARCH_COM_HOST_OPTIONS2
+// RUN: %clangxx -fsycl %s -Xarch_device "-fsanitize=address -mllvm -enable-merge-functions" \
+// RUN:   -Xarch_host "-fsanitize=memory -DUSE_XARCH_HOST -fno-builtin" -### 2>&1 \
+// RUN:   | FileCheck %s -check-prefix=SYCL_XARCH_COM_HOST_OPTIONS3
+// RUN: %clangxx -fsycl %s -Xarch_device "-fsanitize=address -mllvm -enable-merge-functions" \
+// RUN:   -Xarch_host "-fsanitize=memory -DUSE_XARCH_HOST -fno-builtin" -### 2>&1 \
+// RUN:   | FileCheck %s -check-prefix=SYCL_XARCH_COM_NO_HOST
+// SYCL_XARCH_COM_DEVICE_OPTIONS1: clang{{.*}} "-fsycl-is-device"
+// SYCL_XARCH_COM_DEVICE_OPTIONS1-SAME: -fsanitize=address
+// SYCL_XARCH_COM_DEVICE_OPTIONS1-SAME: -fsanitize-address-use-after-return=never
+// SYCL_XARCH_COM_DEVICE_OPTIONS1-SAME: -fno-sanitize-address-use-after-scope
+// SYCL_XARCH_COM_DEVICE_OPTIONS1-SAME: "-mllvm" "-asan-instrumentation-with-call-threshold=0"
+// SYCL_XARCH_COM_DEVICE_OPTIONS1-SAME: "-mllvm" "-asan-stack=0"
+// SYCL_XARCH_COM_DEVICE_OPTIONS1-SAME: "-mllvm" "-asan-globals=0"
+// SYCL_XARCH_COM_DEVICE_OPTIONS2: clang{{.*}} "-fsycl-is-device"
+// SYCL_XARCH_COM_DEVICE_OPTIONS2-SAME: "-mllvm" "-enable-merge-functions"
+// SYCL_XARCH_COM_NO_DEVICE: clang{{.*}} "-fsycl-is-device"
+// SYCL_XARCH_COM_NO_DEVICE-NOT: USE_XARCH_HOST
+// SYCL_XARCH_COM_NO_DEVICE: clang{{.*}} "-fsycl-is-host"
+// SYCL_XARCH_COM_HOST_OPTIONS1: clang{{.*}} "-fsycl-is-host"
+// SYCL_XARCH_COM_HOST_OPTIONS1-SAME: -fsanitize=memory
+// SYCL_XARCH_COM_HOST_OPTIONS1-NEXT: libclang_rt.msan
+// SYCL_XARCH_COM_HOST_OPTIONS2: clang{{.*}} "-fsycl-is-host"
+// SYCL_XARCH_COM_HOST_OPTIONS2-SAME: USE_XARCH_HOST
+// SYCL_XARCH_COM_HOST_OPTIONS3: clang{{.*}} "-fsycl-is-host"
+// SYCL_XARCH_COM_HOST_OPTIONS3-SAME: -fno-builtin
+// SYCL_XARCH_COM_NO_HOST: clang{{.*}} "-fsycl-is-host"
+// SYCL_XARCH_COM_NO_HOST-NOT: "-mllvm" "-enable-merge-functions"
+
+
+// test behavior of multiple usage of -Xarch_host in single command line
+// RUN: %clangxx -fsycl %s -Xarch_host "-fsanitize=address -mllvm -enable-merge-functions" \
+// RUN:   -Xarch_host -DFOO -Xarch_host -DFOO1 -### 2>&1 \
+// RUN:   | FileCheck %s -check-prefix=SYCL_XARCH_HOST_MULTIPLE1
+// RUN: %clangxx -fsycl %s -Xarch_host "-fsanitize=address -mllvm -enable-merge-functions" \
+// RUN:   -Xarch_host -DFOO -Xarch_host -DFOO1 -### 2>&1 \
+// RUN:   | FileCheck %s -check-prefix=SYCL_XARCH_HOST_MULTIPLE2
+// RUN: %clangxx -fsycl %s -Xarch_host "-fsanitize=address -mllvm -enable-merge-functions" \
+// RUN:   -Xarch_host -DFOO -Xarch_host -DFOO1 -### 2>&1 \
+// RUN:   | FileCheck %s -check-prefix=SYCL_XARCH_HOST_MULTIPLE3
+// RUN: %clangxx -fsycl %s -Xarch_host "-fsanitize=address -mllvm -enable-merge-functions" \
+// RUN:   -Xarch_host -DFOO -Xarch_host -DFOO1 -### 2>&1 \
+// RUN:   | FileCheck %s -check-prefix=SYCL_XARCH_HOST_MULTIPLE4
+// RUN: %clangxx -fsycl %s -Xarch_host "-fsanitize=address -mllvm -enable-merge-functions" \
+// RUN:   -Xarch_host -DFOO -Xarch_host -DFOO1 -### 2>&1 \
+// RUN:   | FileCheck %s -check-prefix=SYCL_XARCH_NO_DEVICE_MULTIPLE1
+// RUN: %clangxx -fsycl %s -Xarch_host "-fsanitize=address -mllvm -enable-merge-functions" \
+// RUN:   -Xarch_host -DFOO -Xarch_host -DFOO1 -### 2>&1 \
+// RUN:   | FileCheck %s -check-prefix=SYCL_XARCH_NO_DEVICE_MULTIPLE2
+// RUN: %clangxx -fsycl %s -Xarch_host "-fsanitize=address -mllvm -enable-merge-functions" \
+// RUN:   -Xarch_host -DFOO -Xarch_host -DFOO1 -### 2>&1 \
+// RUN:   | FileCheck %s -check-prefix=SYCL_XARCH_NO_DEVICE_MULTIPLE3
+// SYCL_XARCH_HOST_MULTIPLE1: clang{{.*}} "-fsycl-is-host"
+// SYCL_XARCH_HOST_MULTIPLE1-SAME: -fsanitize=address
+// SYCL_XARCH_HOST_MULTIPLE1-NEXT: libclang_rt.asan
+// SYCL_XARCH_HOST_MULTIPLE2: clang{{.*}} "-fsycl-is-host"
+// SYCL_XARCH_HOST_MULTIPLE2-SAME: "-mllvm" "-enable-merge-functions"
+// SYCL_XARCH_HOST_MULTIPLE3: clang{{.*}} "-fsycl-is-host"
+// SYCL_XARCH_HOST_MULTIPLE3-SAME: "FOO"
+// SYCL_XARCH_HOST_MULTIPLE4: clang{{.*}} "-fsycl-is-host"
+// SYCL_XARCH_HOST_MULTIPLE4-SAME: "FOO1"
+// SYCL_XARCH_NO_DEVICE_MULTIPLE1: clang{{.*}} "-fsycl-is-device"
+// SYCL_XARCH_NO_DEVICE_MULTIPLE1-NOT: -fsanitize=address
+// SYCL_XARCH_NO_DEVICE_MULTIPLE1: llc{{.*}} "-filetype=obj"
+// SYCL_XARCH_NO_DEVICE_MULTIPLE2: clang{{.*}} "-fsycl-is-device"
+// SYCL_XARCH_NO_DEVICE_MULTIPLE2-NOT: "-mllvm" "-enable-merge-functions"
+// SYCL_XARCH_NO_DEVICE_MULTIPLE2: llc{{.*}} "-filetype=obj"
+// SYCL_XARCH_NO_DEVICE_MULTIPLE3: clang{{.*}} "-fsycl-is-device"
+// SYCL_XARCH_NO_DEVICE_MULTIPLE3-NOT: "FOO"
+// SYCL_XARCH_NO_DEVICE_MULTIPLE3: llc{{.*}} "-filetype=obj"