From 526f7e6b150e22c56a4c4312e606e17cb60a3bdc Mon Sep 17 00:00:00 2001 From: Hugh Delaney Date: Tue, 12 Dec 2023 14:08:27 +0000 Subject: [PATCH 01/19] Remove inline from helper --- source/adapters/hip/memory.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/source/adapters/hip/memory.cpp b/source/adapters/hip/memory.cpp index 68ded26263..6a220f53c4 100644 --- a/source/adapters/hip/memory.cpp +++ b/source/adapters/hip/memory.cpp @@ -458,9 +458,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemRetain(ur_mem_handle_t hMem) { return UR_RESULT_SUCCESS; } -inline ur_result_t -allocateMemObjOnDeviceIfNeeded(ur_mem_handle_t Mem, - const ur_device_handle_t hDevice) { +ur_result_t allocateMemObjOnDeviceIfNeeded(ur_mem_handle_t Mem, + const ur_device_handle_t hDevice) { ScopedContext Active(hDevice); ur_lock LockGuard(Mem->MemoryAllocationMutex); From 77b4c1a43080c44394d653d44f2264e88be30ca4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20=C5=BDu=C5=BEek?= Date: Wed, 20 Dec 2023 15:17:01 +0000 Subject: [PATCH 02/19] Werror fixes A bunch of fixes to get `-Werror` builds to pass, both on Linux and on Windows (`/WX`) * Disable verbose Windows warnings * `_CRT_SECURE_NO_WARNINGS` because of `std::getenv` * C4267 because of conversions from `size_t` to other integers * Define `WIN32_LEAN_AND_MEAN` and `NOMINMAX` on Windows * Gets rid of some errors and speeds up the build * Convert integer CUDA objects to std::uintptr_t before reinterpreting them to a pointer * Fixed "unused function" warning for `GetHipFormatPixelSize` * There was a lot of duplication, now a single function called `imageElementByteSize` * Mark some variables as potentially unused (only used in asserts) * Other minor fixes --- CMakeLists.txt | 3 -- cmake/helpers.cmake | 8 ++++- source/adapters/cuda/device.cpp | 3 +- source/adapters/cuda/image.cpp | 4 +-- source/adapters/cuda/program.cpp | 3 +- source/adapters/cuda/sampler.cpp | 2 +- source/adapters/hip/enqueue.cpp | 21 ++---------- 
source/adapters/hip/memory.cpp | 32 +++---------------- source/adapters/level_zero/CMakeLists.txt | 2 +- source/adapters/native_cpu/context.cpp | 8 ++--- source/common/ur_util.hpp | 2 -- .../cuda/urDeviceCreateWithNativeHandle.cpp | 4 +-- test/conformance/source/environment.cpp | 2 +- 13 files changed, 29 insertions(+), 65 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index fbf9947688..fcdf90f173 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -116,9 +116,6 @@ if(UR_ENABLE_TRACING) ) if (MSVC) set(TARGET_XPTI $,xpti,xptid>) - - # disable warning C4267: The compiler detected a conversion from size_t to a smaller type. - target_compile_options(xptifw PRIVATE /wd4267) else() set(TARGET_XPTI xpti) endif() diff --git a/cmake/helpers.cmake b/cmake/helpers.cmake index 35c4789432..74a634ed28 100644 --- a/cmake/helpers.cmake +++ b/cmake/helpers.cmake @@ -84,10 +84,16 @@ function(add_ur_target_compile_options name) /W3 /MD$<$:d> /GS + /DWIN32_LEAN_AND_MEAN + /DNOMINMAX ) if(UR_DEVELOPER_MODE) - target_compile_options(${name} PRIVATE /WX /GS) + # _CRT_SECURE_NO_WARNINGS used mainly because of getenv + # C4267: The compiler detected a conversion from size_t to a smaller type. 
+ target_compile_options(${name} PRIVATE + /WX /GS /D_CRT_SECURE_NO_WARNINGS /wd4267 + ) endif() endif() endfunction() diff --git a/source/adapters/cuda/device.cpp b/source/adapters/cuda/device.cpp index 0723cfe4e7..acea59e1a1 100644 --- a/source/adapters/cuda/device.cpp +++ b/source/adapters/cuda/device.cpp @@ -1101,7 +1101,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGet(ur_platform_handle_t hPlatform, UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetNativeHandle( ur_device_handle_t hDevice, ur_native_handle_t *phNativeHandle) { - *phNativeHandle = reinterpret_cast(hDevice->get()); + *phNativeHandle = reinterpret_cast( + static_cast(hDevice->get())); return UR_RESULT_SUCCESS; } diff --git a/source/adapters/cuda/image.cpp b/source/adapters/cuda/image.cpp index 1f336dd2d7..3168c008a3 100644 --- a/source/adapters/cuda/image.cpp +++ b/source/adapters/cuda/image.cpp @@ -146,7 +146,7 @@ urToCudaImageChannelFormat(ur_image_channel_type_t image_channel_type, std::make_pair(image_channel_type, num_channels)); cuda_format = cuda_format_and_size.first; pixel_size_bytes = cuda_format_and_size.second; - } catch (std::out_of_range &e) { + } catch (const std::out_of_range &) { return UR_RESULT_ERROR_IMAGE_FORMAT_NOT_SUPPORTED; } } @@ -276,7 +276,7 @@ ur_result_t urTextureCreate(ur_sampler_handle_t hSampler, ImageTexDesc.mipmapFilterMode = MipFilterMode; ImageTexDesc.maxMipmapLevelClamp = hSampler->MaxMipmapLevelClamp; ImageTexDesc.minMipmapLevelClamp = hSampler->MinMipmapLevelClamp; - ImageTexDesc.maxAnisotropy = hSampler->MaxAnisotropy; + ImageTexDesc.maxAnisotropy = static_cast(hSampler->MaxAnisotropy); // The address modes can interfere with other dimensionsenqueueEventsWait // e.g. 
1D texture sampling can be interfered with when setting other diff --git a/source/adapters/cuda/program.cpp b/source/adapters/cuda/program.cpp index 9b7959eb85..022fd258f7 100644 --- a/source/adapters/cuda/program.cpp +++ b/source/adapters/cuda/program.cpp @@ -141,7 +141,8 @@ ur_result_t ur_program_handle_t_::buildProgram(const char *BuildOptions) { getMaxRegistersJitOptionValue(this->BuildOptions, MaxRegs); if (Valid) { Options.push_back(CU_JIT_MAX_REGISTERS); - OptionVals.push_back(reinterpret_cast(MaxRegs)); + OptionVals.push_back( + reinterpret_cast(static_cast(MaxRegs))); } } diff --git a/source/adapters/cuda/sampler.cpp b/source/adapters/cuda/sampler.cpp index ce4283edd3..5ebccf516b 100644 --- a/source/adapters/cuda/sampler.cpp +++ b/source/adapters/cuda/sampler.cpp @@ -18,7 +18,7 @@ urSamplerCreate(ur_context_handle_t hContext, const ur_sampler_desc_t *pDesc, new ur_sampler_handle_t_(hContext)}; if (pDesc->stype == UR_STRUCTURE_TYPE_SAMPLER_DESC) { - Sampler->Props |= pDesc->normalizedCoords; + Sampler->Props |= static_cast(pDesc->normalizedCoords); Sampler->Props |= pDesc->filterMode << 1; Sampler->Props |= pDesc->addressingMode << 2; } else { diff --git a/source/adapters/hip/enqueue.cpp b/source/adapters/hip/enqueue.cpp index 5f7fffba35..c9c5af6453 100644 --- a/source/adapters/hip/enqueue.cpp +++ b/source/adapters/hip/enqueue.cpp @@ -15,26 +15,9 @@ #include "memory.hpp" #include "queue.hpp" -namespace { +extern size_t imageElementByteSize(hipArray_Format ArrayFormat); -static size_t imageElementByteSize(hipArray_Format ArrayFormat) { - switch (ArrayFormat) { - case HIP_AD_FORMAT_UNSIGNED_INT8: - case HIP_AD_FORMAT_SIGNED_INT8: - return 1; - case HIP_AD_FORMAT_UNSIGNED_INT16: - case HIP_AD_FORMAT_SIGNED_INT16: - case HIP_AD_FORMAT_HALF: - return 2; - case HIP_AD_FORMAT_UNSIGNED_INT32: - case HIP_AD_FORMAT_SIGNED_INT32: - case HIP_AD_FORMAT_FLOAT: - return 4; - default: - detail::ur::die("Invalid image format."); - } - return 0; -} +namespace { ur_result_t 
enqueueEventsWait(ur_queue_handle_t, hipStream_t Stream, uint32_t NumEventsInWaitList, diff --git a/source/adapters/hip/memory.cpp b/source/adapters/hip/memory.cpp index 68ded26263..aa3abbdaf4 100644 --- a/source/adapters/hip/memory.cpp +++ b/source/adapters/hip/memory.cpp @@ -13,10 +13,8 @@ #include #include -namespace { - -size_t GetHipFormatPixelSize(hipArray_Format Format) { - switch (Format) { +size_t imageElementByteSize(hipArray_Format ArrayFormat) { + switch (ArrayFormat) { case HIP_AD_FORMAT_UNSIGNED_INT8: case HIP_AD_FORMAT_SIGNED_INT8: return 1; @@ -31,10 +29,9 @@ size_t GetHipFormatPixelSize(hipArray_Format Format) { default: detail::ur::die("Invalid HIP format specifier"); } + return 0; } -} // namespace - /// Decreases the reference count of the Mem object. /// If this is zero, calls the relevant HIP Free function /// \return UR_RESULT_SUCCESS unless deallocation error @@ -245,7 +242,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemGetInfo(ur_mem_handle_t hMemory, UR_CHECK_ERROR( hipArray3DGetDescriptor(&ArrayDescriptor, Mem.getArray(Device))); const auto PixelSizeBytes = - GetHipFormatPixelSize(ArrayDescriptor.Format) * + imageElementByteSize(ArrayDescriptor.Format) * ArrayDescriptor.NumChannels; const auto ImageSizeBytes = PixelSizeBytes * @@ -405,25 +402,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemImageGetInfo(ur_mem_handle_t hMemory, } }; - const auto hipFormatToElementSize = - [](hipArray_Format HipFormat) -> size_t { - switch (HipFormat) { - case HIP_AD_FORMAT_UNSIGNED_INT8: - case HIP_AD_FORMAT_SIGNED_INT8: - return 1; - case HIP_AD_FORMAT_UNSIGNED_INT16: - case HIP_AD_FORMAT_SIGNED_INT16: - case HIP_AD_FORMAT_HALF: - return 2; - case HIP_AD_FORMAT_UNSIGNED_INT32: - case HIP_AD_FORMAT_SIGNED_INT32: - case HIP_AD_FORMAT_FLOAT: - return 4; - default: - detail::ur::die("Invalid Hip format specified."); - } - }; - switch (propName) { case UR_IMAGE_INFO_FORMAT: return ReturnValue(ur_image_format_t{UR_IMAGE_CHANNEL_ORDER_RGBA, @@ -435,7 +413,7 @@ 
UR_APIEXPORT ur_result_t UR_APICALL urMemImageGetInfo(ur_mem_handle_t hMemory, case UR_IMAGE_INFO_DEPTH: return ReturnValue(ArrayInfo.Depth); case UR_IMAGE_INFO_ELEMENT_SIZE: - return ReturnValue(hipFormatToElementSize(ArrayInfo.Format)); + return ReturnValue(imageElementByteSize(ArrayInfo.Format)); case UR_IMAGE_INFO_ROW_PITCH: case UR_IMAGE_INFO_SLICE_PITCH: return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; diff --git a/source/adapters/level_zero/CMakeLists.txt b/source/adapters/level_zero/CMakeLists.txt index 7203d5a238..250eaccab2 100644 --- a/source/adapters/level_zero/CMakeLists.txt +++ b/source/adapters/level_zero/CMakeLists.txt @@ -124,7 +124,7 @@ add_ur_adapter(${TARGET_NAME} # TODO: fix level_zero adapter conversion warnings target_compile_options(${TARGET_NAME} PRIVATE - $<$:/wd4267 /wd4805 /wd4244 /D_CRT_SECURE_NO_WARNINGS> + $<$:/wd4805 /wd4244> ) set_target_properties(${TARGET_NAME} PROPERTIES diff --git a/source/adapters/native_cpu/context.cpp b/source/adapters/native_cpu/context.cpp index 962525d1fc..c485725828 100644 --- a/source/adapters/native_cpu/context.cpp +++ b/source/adapters/native_cpu/context.cpp @@ -17,10 +17,10 @@ #include "common.hpp" #include "context.hpp" -UR_APIEXPORT ur_result_t UR_APICALL -urContextCreate(uint32_t DeviceCount, const ur_device_handle_t *phDevices, - const ur_context_properties_t *pProperties, - ur_context_handle_t *phContext) { +UR_APIEXPORT ur_result_t UR_APICALL urContextCreate( + [[maybe_unused]] uint32_t DeviceCount, const ur_device_handle_t *phDevices, + const ur_context_properties_t *pProperties, + ur_context_handle_t *phContext) { std::ignore = pProperties; assert(DeviceCount == 1); diff --git a/source/common/ur_util.hpp b/source/common/ur_util.hpp index 5a34aa6584..00fc29eddd 100644 --- a/source/common/ur_util.hpp +++ b/source/common/ur_util.hpp @@ -22,8 +22,6 @@ #include #ifdef _WIN32 -#define NOMINMAX - #include inline int ur_getpid(void) { return static_cast(GetCurrentProcessId()); } #else diff --git 
a/test/adapters/cuda/urDeviceCreateWithNativeHandle.cpp b/test/adapters/cuda/urDeviceCreateWithNativeHandle.cpp index 3b8ebc416b..dca7932606 100644 --- a/test/adapters/cuda/urDeviceCreateWithNativeHandle.cpp +++ b/test/adapters/cuda/urDeviceCreateWithNativeHandle.cpp @@ -15,8 +15,8 @@ TEST_F(urCudaDeviceCreateWithNativeHandle, Success) { CUdevice cudaDevice; ASSERT_SUCCESS_CUDA(cuDeviceGet(&cudaDevice, 0)); - ur_native_handle_t nativeCuda = - reinterpret_cast(cudaDevice); + ur_native_handle_t nativeCuda = reinterpret_cast( + static_cast(cudaDevice)); ur_device_handle_t urDevice; ASSERT_SUCCESS(urDeviceCreateWithNativeHandle(nativeCuda, platform, nullptr, &urDevice)); diff --git a/test/conformance/source/environment.cpp b/test/conformance/source/environment.cpp index 6c917914ed..11a8b501a8 100644 --- a/test/conformance/source/environment.cpp +++ b/test/conformance/source/environment.cpp @@ -281,7 +281,7 @@ DevicesEnvironment::DevicesEnvironment(int argc, char **argv) error = "urDeviceGet() failed to get devices."; return; } - for (u_long i = 0; i < count; i++) { + for (unsigned i = 0; i < count; i++) { size_t size; if (urDeviceGetInfo(devices[i], UR_DEVICE_INFO_NAME, 0, nullptr, &size)) { From 67c3779cbb13ca68f339519ff15f46faa4d71fbd Mon Sep 17 00:00:00 2001 From: Hugh Delaney Date: Tue, 12 Dec 2023 09:43:09 +0000 Subject: [PATCH 03/19] AMDGPU enable global variable read write --- source/adapters/hip/enqueue.cpp | 68 ++++++++++++++++++++++++++++++--- source/adapters/hip/program.cpp | 18 +++++++++ source/adapters/hip/program.hpp | 2 + 3 files changed, 82 insertions(+), 6 deletions(-) diff --git a/source/adapters/hip/enqueue.cpp b/source/adapters/hip/enqueue.cpp index 7875650b85..ff49e5506a 100644 --- a/source/adapters/hip/enqueue.cpp +++ b/source/adapters/hip/enqueue.cpp @@ -1545,15 +1545,71 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMMemcpy2D( } UR_APIEXPORT ur_result_t UR_APICALL urEnqueueDeviceGlobalVariableWrite( - ur_queue_handle_t, ur_program_handle_t, 
const char *, bool, size_t, size_t, - const void *, uint32_t, const ur_event_handle_t *, ur_event_handle_t *) { - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + ur_queue_handle_t hQueue, ur_program_handle_t hProgram, const char *name, + bool blockingWrite, size_t count, size_t offset, const void *pSrc, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + // Since HIP requires a the global variable to be referenced by name, we use + // metadata to find the correct name to access it by. + auto DeviceGlobalNameIt = hProgram->GlobalIDMD.find(name); + if (DeviceGlobalNameIt == hProgram->GlobalIDMD.end()) + return UR_RESULT_ERROR_INVALID_VALUE; + std::string DeviceGlobalName = DeviceGlobalNameIt->second; + + ur_result_t Result = UR_RESULT_SUCCESS; + try { + hipDeviceptr_t DeviceGlobal = 0; + size_t DeviceGlobalSize = 0; + UR_CHECK_ERROR(hipModuleGetGlobal(&DeviceGlobal, &DeviceGlobalSize, + hProgram->get(), + DeviceGlobalName.c_str())); + + if (offset + count > DeviceGlobalSize) + return UR_RESULT_ERROR_INVALID_VALUE; + + return urEnqueueUSMMemcpy( + hQueue, blockingWrite, + reinterpret_cast(reinterpret_cast(DeviceGlobal) + + offset), + pSrc, count, numEventsInWaitList, phEventWaitList, phEvent); + } catch (ur_result_t Err) { + Result = Err; + } + return Result; } UR_APIEXPORT ur_result_t UR_APICALL urEnqueueDeviceGlobalVariableRead( - ur_queue_handle_t, ur_program_handle_t, const char *, bool, size_t, size_t, - void *, uint32_t, const ur_event_handle_t *, ur_event_handle_t *) { - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + ur_queue_handle_t hQueue, ur_program_handle_t hProgram, const char *name, + bool blockingRead, size_t count, size_t offset, void *pDst, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + // Since HIP requires a the global variable to be referenced by name, we use + // metadata to find the correct name to access it by. 
+ auto DeviceGlobalNameIt = hProgram->GlobalIDMD.find(name); + if (DeviceGlobalNameIt == hProgram->GlobalIDMD.end()) + return UR_RESULT_ERROR_INVALID_VALUE; + std::string DeviceGlobalName = DeviceGlobalNameIt->second; + + ur_result_t Result = UR_RESULT_SUCCESS; + try { + hipDeviceptr_t DeviceGlobal = 0; + size_t DeviceGlobalSize = 0; + UR_CHECK_ERROR(hipModuleGetGlobal(&DeviceGlobal, &DeviceGlobalSize, + hProgram->get(), + DeviceGlobalName.c_str())); + + if (offset + count > DeviceGlobalSize) + return UR_RESULT_ERROR_INVALID_VALUE; + + return urEnqueueUSMMemcpy( + hQueue, blockingRead, pDst, + reinterpret_cast( + reinterpret_cast(DeviceGlobal) + offset), + count, numEventsInWaitList, phEventWaitList, phEvent); + } catch (ur_result_t Err) { + Result = Err; + } + return Result; } UR_APIEXPORT ur_result_t UR_APICALL urEnqueueReadHostPipe( diff --git a/source/adapters/hip/program.cpp b/source/adapters/hip/program.cpp index 9aa64151e0..81f1be1194 100644 --- a/source/adapters/hip/program.cpp +++ b/source/adapters/hip/program.cpp @@ -78,6 +78,15 @@ void getCoMgrBuildLog(const amd_comgr_data_set_t BuildDataSet, char *BuildLog, } // namespace #endif +std::pair +splitMetadataName(const std::string &metadataName) { + size_t splitPos = metadataName.rfind('@'); + if (splitPos == std::string::npos) + return std::make_pair(metadataName, std::string{}); + return std::make_pair(metadataName.substr(0, splitPos), + metadataName.substr(splitPos, metadataName.length())); +} + ur_result_t ur_program_handle_t_::setMetadata(const ur_program_metadata_t *Metadata, size_t Length) { @@ -85,10 +94,19 @@ ur_program_handle_t_::setMetadata(const ur_program_metadata_t *Metadata, const ur_program_metadata_t MetadataElement = Metadata[i]; std::string MetadataElementName{MetadataElement.pName}; + auto [Prefix, Tag] = splitMetadataName(MetadataElementName); + if (MetadataElementName == __SYCL_UR_PROGRAM_METADATA_TAG_NEED_FINALIZATION) { assert(MetadataElement.type == UR_PROGRAM_METADATA_TYPE_UINT32); 
IsRelocatable = MetadataElement.value.data32; + } else if (Tag == __SYCL_UR_PROGRAM_METADATA_GLOBAL_ID_MAPPING) { + const char *MetadataValPtr = + reinterpret_cast(MetadataElement.value.pData) + + sizeof(std::uint64_t); + const char *MetadataValPtrEnd = + MetadataValPtr + MetadataElement.size - sizeof(std::uint64_t); + GlobalIDMD[Prefix] = std::string{MetadataValPtr, MetadataValPtrEnd}; } } return UR_RESULT_SUCCESS; diff --git a/source/adapters/hip/program.hpp b/source/adapters/hip/program.hpp index 4b4e5ec878..dbdf9c55c6 100644 --- a/source/adapters/hip/program.hpp +++ b/source/adapters/hip/program.hpp @@ -29,6 +29,8 @@ struct ur_program_handle_t_ { // Metadata bool IsRelocatable = false; + std::unordered_map GlobalIDMD; + constexpr static size_t MAX_LOG_SIZE = 8192u; char ErrorLog[MAX_LOG_SIZE], InfoLog[MAX_LOG_SIZE]; From c9fba562feb5ea5acfbcc0dbbc5d8a15e9383a2b Mon Sep 17 00:00:00 2001 From: Hugh Delaney <46290137+hdelan@users.noreply.github.com> Date: Fri, 15 Dec 2023 10:07:36 +0000 Subject: [PATCH 04/19] Update source/adapters/hip/enqueue.cpp Co-authored-by: Jakub Chlanda --- source/adapters/hip/enqueue.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/adapters/hip/enqueue.cpp b/source/adapters/hip/enqueue.cpp index ff49e5506a..56dfd20948 100644 --- a/source/adapters/hip/enqueue.cpp +++ b/source/adapters/hip/enqueue.cpp @@ -1549,7 +1549,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueDeviceGlobalVariableWrite( bool blockingWrite, size_t count, size_t offset, const void *pSrc, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - // Since HIP requires a the global variable to be referenced by name, we use + // Since HIP requires the global variable to be referenced by name, we use // metadata to find the correct name to access it by. 
auto DeviceGlobalNameIt = hProgram->GlobalIDMD.find(name); if (DeviceGlobalNameIt == hProgram->GlobalIDMD.end()) From de02e990ea8876db0d54903c6b3e3af8ce88ce6d Mon Sep 17 00:00:00 2001 From: Hugh Delaney <46290137+hdelan@users.noreply.github.com> Date: Fri, 15 Dec 2023 10:07:58 +0000 Subject: [PATCH 05/19] Update source/adapters/hip/enqueue.cpp Co-authored-by: Jakub Chlanda --- source/adapters/hip/enqueue.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/adapters/hip/enqueue.cpp b/source/adapters/hip/enqueue.cpp index 56dfd20948..144191bd35 100644 --- a/source/adapters/hip/enqueue.cpp +++ b/source/adapters/hip/enqueue.cpp @@ -1583,7 +1583,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueDeviceGlobalVariableRead( bool blockingRead, size_t count, size_t offset, void *pDst, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - // Since HIP requires a the global variable to be referenced by name, we use + // Since HIP requires the global variable to be referenced by name, we use // metadata to find the correct name to access it by. 
auto DeviceGlobalNameIt = hProgram->GlobalIDMD.find(name); if (DeviceGlobalNameIt == hProgram->GlobalIDMD.end()) From 45d76b7817c9654a8ebbbd0a02744f7ceb753227 Mon Sep 17 00:00:00 2001 From: Hugh Delaney Date: Wed, 20 Dec 2023 11:42:37 +0000 Subject: [PATCH 06/19] Refactor read write funcs --- source/adapters/hip/enqueue.cpp | 73 +++++++++++++++------------------ 1 file changed, 34 insertions(+), 39 deletions(-) diff --git a/source/adapters/hip/enqueue.cpp b/source/adapters/hip/enqueue.cpp index 144191bd35..0e7e04fc45 100644 --- a/source/adapters/hip/enqueue.cpp +++ b/source/adapters/hip/enqueue.cpp @@ -1544,19 +1544,22 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMMemcpy2D( return Result; } -UR_APIEXPORT ur_result_t UR_APICALL urEnqueueDeviceGlobalVariableWrite( +namespace { + +enum class GlobalVariableCopy { Read, Write }; + +ur_result_t deviceGlobalCopyHelper( ur_queue_handle_t hQueue, ur_program_handle_t hProgram, const char *name, - bool blockingWrite, size_t count, size_t offset, const void *pSrc, + bool blocking, size_t count, size_t offset, void *ptr, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) { - // Since HIP requires the global variable to be referenced by name, we use + ur_event_handle_t *phEvent, GlobalVariableCopy CopyType) { + // Since HIP requires a the global variable to be referenced by name, we use // metadata to find the correct name to access it by. 
auto DeviceGlobalNameIt = hProgram->GlobalIDMD.find(name); if (DeviceGlobalNameIt == hProgram->GlobalIDMD.end()) return UR_RESULT_ERROR_INVALID_VALUE; std::string DeviceGlobalName = DeviceGlobalNameIt->second; - ur_result_t Result = UR_RESULT_SUCCESS; try { hipDeviceptr_t DeviceGlobal = 0; size_t DeviceGlobalSize = 0; @@ -1567,15 +1570,31 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueDeviceGlobalVariableWrite( if (offset + count > DeviceGlobalSize) return UR_RESULT_ERROR_INVALID_VALUE; - return urEnqueueUSMMemcpy( - hQueue, blockingWrite, - reinterpret_cast(reinterpret_cast(DeviceGlobal) + - offset), - pSrc, count, numEventsInWaitList, phEventWaitList, phEvent); + void *pSrc, *pDst; + if (CopyType == GlobalVariableCopy::Write) { + pSrc = ptr; + pDst = reinterpret_cast(DeviceGlobal) + offset; + } else { + pSrc = reinterpret_cast(DeviceGlobal) + offset; + pDst = ptr; + } + return urEnqueueUSMMemcpy(hQueue, blocking, pDst, pSrc, count, + numEventsInWaitList, phEventWaitList, phEvent); } catch (ur_result_t Err) { - Result = Err; + return Err; } - return Result; +} +} // namespace + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueDeviceGlobalVariableWrite( + ur_queue_handle_t hQueue, ur_program_handle_t hProgram, const char *name, + bool blockingWrite, size_t count, size_t offset, const void *pSrc, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + return deviceGlobalCopyHelper(hQueue, hProgram, name, blockingWrite, count, + offset, const_cast(pSrc), + numEventsInWaitList, phEventWaitList, phEvent, + GlobalVariableCopy::Write); } UR_APIEXPORT ur_result_t UR_APICALL urEnqueueDeviceGlobalVariableRead( @@ -1583,33 +1602,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueDeviceGlobalVariableRead( bool blockingRead, size_t count, size_t offset, void *pDst, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - // Since HIP requires the global variable to be referenced by 
name, we use - // metadata to find the correct name to access it by. - auto DeviceGlobalNameIt = hProgram->GlobalIDMD.find(name); - if (DeviceGlobalNameIt == hProgram->GlobalIDMD.end()) - return UR_RESULT_ERROR_INVALID_VALUE; - std::string DeviceGlobalName = DeviceGlobalNameIt->second; - - ur_result_t Result = UR_RESULT_SUCCESS; - try { - hipDeviceptr_t DeviceGlobal = 0; - size_t DeviceGlobalSize = 0; - UR_CHECK_ERROR(hipModuleGetGlobal(&DeviceGlobal, &DeviceGlobalSize, - hProgram->get(), - DeviceGlobalName.c_str())); - - if (offset + count > DeviceGlobalSize) - return UR_RESULT_ERROR_INVALID_VALUE; - - return urEnqueueUSMMemcpy( - hQueue, blockingRead, pDst, - reinterpret_cast( - reinterpret_cast(DeviceGlobal) + offset), - count, numEventsInWaitList, phEventWaitList, phEvent); - } catch (ur_result_t Err) { - Result = Err; - } - return Result; + return deviceGlobalCopyHelper( + hQueue, hProgram, name, blockingRead, count, offset, pDst, + numEventsInWaitList, phEventWaitList, phEvent, GlobalVariableCopy::Read); } UR_APIEXPORT ur_result_t UR_APICALL urEnqueueReadHostPipe( From 2e156f790c2e8473abba198f91b544c1db0ca526 Mon Sep 17 00:00:00 2001 From: Maxime France-Pillois Date: Fri, 3 Nov 2023 10:25:52 +0000 Subject: [PATCH 07/19] [EXP][CMDBUF] Add extra event to get CommandBuffer start time Adds an extra event in the first command list associated to the CommandBuffer execution to obtain the start time of the graph execution. 
--- source/adapters/level_zero/command_buffer.cpp | 19 +++++++++++++++++++ source/adapters/level_zero/event.cpp | 19 +++++++++++++++++++ 2 files changed, 38 insertions(+) diff --git a/source/adapters/level_zero/command_buffer.cpp b/source/adapters/level_zero/command_buffer.cpp index 4b811ab033..af5fa73077 100644 --- a/source/adapters/level_zero/command_buffer.cpp +++ b/source/adapters/level_zero/command_buffer.cpp @@ -941,6 +941,25 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp( ZE2UR_CALL(zeCommandListAppendBarrier, (SignalCommandList->first, RetEvent->ZeEvent, 1, &(CommandBuffer->SignalEvent->ZeEvent))); + + if ((Queue->Properties & UR_QUEUE_FLAG_PROFILING_ENABLE)) { + // We create an additional signal specific to the current execution of the + // CommandBuffer. This signal is needed for profiling the execution time + // of the CommandBuffer. It waits for the WaitEvent to be signaled + // which indicates the start of the CommandBuffer actual execution. + // This event is embedded into the Event return to the user to allow + // the profiling engine to retrieve it. 
+ ur_event_handle_t StartEvent{}; + UR_CALL(createEventAndAssociateQueue( + Queue, &StartEvent, UR_COMMAND_COMMAND_BUFFER_ENQUEUE_EXP, + WaitCommandList, false)); + + ZE2UR_CALL(zeCommandListAppendBarrier, + (WaitCommandList->first, StartEvent->ZeEvent, 1, + &(CommandBuffer->WaitEvent->ZeEvent))); + + RetEvent->CommandData = StartEvent; + } } // Execution our command-lists asynchronously diff --git a/source/adapters/level_zero/event.cpp b/source/adapters/level_zero/event.cpp index d8af1e674d..8dfef4f099 100644 --- a/source/adapters/level_zero/event.cpp +++ b/source/adapters/level_zero/event.cpp @@ -13,6 +13,7 @@ #include #include +#include "command_buffer.hpp" #include "common.hpp" #include "event.hpp" #include "ur_level_zero.hpp" @@ -454,6 +455,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventGetProfilingInfo( ///< bytes returned in propValue ) { std::shared_lock EventLock(Event->Mutex); + + // A Command-buffer consists of three command-lists. + // The start time should therefore be taken from an event associated + // to the first command-list. + if ((Event->CommandType == UR_COMMAND_COMMAND_BUFFER_ENQUEUE_EXP) && + (PropName == UR_PROFILING_INFO_COMMAND_START) && (Event->CommandData)) { + auto StartEvent = static_cast(Event->CommandData); + return urEventGetProfilingInfo(StartEvent, UR_PROFILING_INFO_COMMAND_END, + PropValueSize, PropValue, PropValueSizeRet); + } + if (Event->UrQueue && (Event->UrQueue->Properties & UR_QUEUE_FLAG_PROFILING_ENABLE) == 0) { return UR_RESULT_ERROR_PROFILING_INFO_NOT_AVAILABLE; @@ -763,6 +775,13 @@ ur_result_t urEventReleaseInternal(ur_event_handle_t Event) { return Res; Event->CommandData = nullptr; } + if (Event->CommandType == UR_COMMAND_COMMAND_BUFFER_ENQUEUE_EXP && + Event->CommandData) { + // Free the memory extra event allocated for profiling purposed. 
+ auto AssociateEvent = static_cast(Event->CommandData); + urEventRelease(AssociateEvent); + Event->CommandData = nullptr; + } if (Event->OwnNativeHandle) { if (DisableEventsCaching) { auto ZeResult = ZE_CALL_NOCHECK(zeEventDestroy, (Event->ZeEvent)); From e8b78408b0b2d6547b7aa07ed3599a54484d6a2e Mon Sep 17 00:00:00 2001 From: Maxime France-Pillois Date: Thu, 9 Nov 2023 16:44:33 +0000 Subject: [PATCH 08/19] Copy command-buffer event timestamps into a dedicated USM memory region. Get the command-buffer start and end timestamps from this memory. Move events reset from the middle command list to the first to allow the copy of the profiling info in the last command list and relax command list order. --- source/adapters/level_zero/command_buffer.cpp | 92 ++++++++++++------- source/adapters/level_zero/command_buffer.hpp | 5 + source/adapters/level_zero/event.cpp | 80 +++++++++++++--- 3 files changed, 131 insertions(+), 46 deletions(-) diff --git a/source/adapters/level_zero/command_buffer.cpp b/source/adapters/level_zero/command_buffer.cpp index af5fa73077..d63630c456 100644 --- a/source/adapters/level_zero/command_buffer.cpp +++ b/source/adapters/level_zero/command_buffer.cpp @@ -45,13 +45,13 @@ │ Prefix │ Commands added to UR command-buffer by UR user │ Suffix │ └──────────┴────────────────────────────────────────────────┴─────────┘ - ┌───────────────────┬──────────────────────────────┐ - Prefix │Reset signal event │ Barrier waiting on wait event│ - └───────────────────┴──────────────────────────────┘ + ┌───────────────────┬──────────────┐──────────────────────────────┐ + Prefix │Reset signal event │ Reset events │ Barrier waiting on wait event│ + └───────────────────┴──────────────┘──────────────────────────────┘ ┌─────────────────────────────────────────────┐──────────────┐ - Suffix │Barrier waiting on sync-point event, │ Reset events │ - │signalling the UR command-buffer signal event│ │ + Suffix │Barrier waiting on sync-point event, │ Query CMD │ + │signalling the 
UR command-buffer signal event│ Timestamps │ └─────────────────────────────────────────────┘──────────────┘ For a call to `urCommandBufferEnqueueExp` with an event_list `EL`, @@ -433,6 +433,10 @@ urCommandBufferCreateExp(ur_context_handle_t Context, ur_device_handle_t Device, ZeStruct ZeCommandListDesc; ZeCommandListDesc.commandQueueGroupOrdinal = QueueGroupOrdinal; + // Dependencies between commands are explicitly enforced by sync points when + // enqueuing. Consequently, relax the command ordering in the command list + // can enable the backend to further optimize the workload + ZeCommandListDesc.flags = ZE_COMMAND_LIST_FLAG_RELAXED_ORDERING; ze_command_list_handle_t ZeCommandList; // TODO We could optimize this by pooling both Level Zero command-lists and UR @@ -499,13 +503,6 @@ urCommandBufferFinalizeExp(ur_exp_command_buffer_handle_t CommandBuffer) { ZE2UR_CALL(zeCommandListAppendEventReset, (CommandBuffer->ZeCommandList, CommandBuffer->WaitEvent->ZeEvent)); - // Reset the L0 events we use for command-buffer internal sync-points to the - // non-signalled state - for (auto Event : WaitEventList) { - ZE2UR_CALL(zeCommandListAppendEventReset, - (CommandBuffer->ZeCommandList, Event)); - } - // Close the command list and have it ready for dispatch. ZE2UR_CALL(zeCommandListClose, (CommandBuffer->ZeCommandList)); return UR_RESULT_SUCCESS; @@ -899,14 +896,28 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp( // Create command-list to execute before `CommandListPtr` and will signal // when `EventWaitList` dependencies are complete. ur_command_list_ptr_t WaitCommandList{}; + UR_CALL(Queue->Context->getAvailableCommandList(Queue, WaitCommandList, false, + false)); + + // Create a list of events of all the events that compose the command buffer + // workload. + // This loop also resets the L0 events we use for command-buffer internal + // sync-points to the non-signalled state. + // This is required for multiple submissions. 
+ const size_t NumEvents = CommandBuffer->SyncPoints.size(); + std::vector WaitEventList{NumEvents}; + for (size_t i = 0; i < NumEvents; i++) { + auto ZeEvent = CommandBuffer->SyncPoints[i]->ZeEvent; + WaitEventList[i] = ZeEvent; + ZE2UR_CALL(zeCommandListAppendEventReset, + (WaitCommandList->first, ZeEvent)); + } + if (NumEventsInWaitList) { _ur_ze_event_list_t TmpWaitList; UR_CALL(TmpWaitList.createAndRetainUrZeEventList( NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine)); - UR_CALL(Queue->Context->getAvailableCommandList(Queue, WaitCommandList, - false, false)) - // Update the WaitList of the Wait Event // Events are appended to the WaitList if the WaitList is not empty if (CommandBuffer->WaitEvent->WaitList.isEmpty()) @@ -919,9 +930,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp( CommandBuffer->WaitEvent->WaitList.Length, CommandBuffer->WaitEvent->WaitList.ZeEventList)); } else { - UR_CALL(Queue->Context->getAvailableCommandList(Queue, WaitCommandList, - false, false)); - ZE2UR_CALL(zeCommandListAppendSignalEvent, (WaitCommandList->first, CommandBuffer->WaitEvent->ZeEvent)); } @@ -943,22 +951,38 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp( &(CommandBuffer->SignalEvent->ZeEvent))); if ((Queue->Properties & UR_QUEUE_FLAG_PROFILING_ENABLE)) { - // We create an additional signal specific to the current execution of the - // CommandBuffer. This signal is needed for profiling the execution time - // of the CommandBuffer. It waits for the WaitEvent to be signaled - // which indicates the start of the CommandBuffer actual execution. - // This event is embedded into the Event return to the user to allow - // the profiling engine to retrieve it. 
- ur_event_handle_t StartEvent{}; - UR_CALL(createEventAndAssociateQueue( - Queue, &StartEvent, UR_COMMAND_COMMAND_BUFFER_ENQUEUE_EXP, - WaitCommandList, false)); - - ZE2UR_CALL(zeCommandListAppendBarrier, - (WaitCommandList->first, StartEvent->ZeEvent, 1, - &(CommandBuffer->WaitEvent->ZeEvent))); - - RetEvent->CommandData = StartEvent; + // Multiple submissions of a command buffer implies that we need to save + // the event timestamps before resubmiting the command buffer. We + // therefore copy the these timestamps in a dedicated USM memory section + // before completing the command buffer execution, and then attach this + // memory to the event returned to users to allow to allow the profiling + // engine to recover these timestamps. + ur_usm_desc_t USMDesc{}; + ur_usm_device_desc_t UsmDeviceDesc{}; + UsmDeviceDesc.stype = UR_STRUCTURE_TYPE_USM_DEVICE_DESC; + ur_usm_host_desc_t UsmHostDesc{}; + UsmHostDesc.stype = UR_STRUCTURE_TYPE_USM_HOST_DESC; + UsmDeviceDesc.pNext = &UsmHostDesc; + USMDesc.pNext = &UsmDeviceDesc; + USMDesc.align = 4; // 4byte-aligned + + size_t Size = WaitEventList.size() * sizeof(ze_kernel_timestamp_result_t); + + struct command_buffer_profiling_t *Profiling = + new command_buffer_profiling_t(); + + Profiling->NumEvents = WaitEventList.size(); + + urUSMSharedAlloc(RetEvent->Context, CommandBuffer->Device, &USMDesc, + nullptr, Size, (void **)&Profiling->Timestamps); + + ZE2UR_CALL(zeCommandListAppendQueryKernelTimestamps, + (SignalCommandList->first, WaitEventList.size(), + WaitEventList.data(), Profiling->Timestamps, 0, + RetEvent->ZeEvent, 1, + &(CommandBuffer->SignalEvent->ZeEvent))); + + RetEvent->CommandData = static_cast(Profiling); } } diff --git a/source/adapters/level_zero/command_buffer.hpp b/source/adapters/level_zero/command_buffer.hpp index b18f1c3497..a43e9e4c52 100644 --- a/source/adapters/level_zero/command_buffer.hpp +++ b/source/adapters/level_zero/command_buffer.hpp @@ -19,6 +19,11 @@ #include "context.hpp" #include 
"queue.hpp" +struct command_buffer_profiling_t { + ur_exp_command_buffer_sync_point_t NumEvents; + ze_kernel_timestamp_result_t *Timestamps; +}; + struct ur_exp_command_buffer_handle_t_ : public _ur_object { ur_exp_command_buffer_handle_t_(ur_context_handle_t Context, ur_device_handle_t Device, diff --git a/source/adapters/level_zero/event.cpp b/source/adapters/level_zero/event.cpp index 8dfef4f099..5e9397e4e7 100644 --- a/source/adapters/level_zero/event.cpp +++ b/source/adapters/level_zero/event.cpp @@ -456,16 +456,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventGetProfilingInfo( ) { std::shared_lock EventLock(Event->Mutex); - // A Command-buffer consists of three command-lists. - // The start time should therefore be taken from an event associated - // to the first command-list. - if ((Event->CommandType == UR_COMMAND_COMMAND_BUFFER_ENQUEUE_EXP) && - (PropName == UR_PROFILING_INFO_COMMAND_START) && (Event->CommandData)) { - auto StartEvent = static_cast(Event->CommandData); - return urEventGetProfilingInfo(StartEvent, UR_PROFILING_INFO_COMMAND_END, - PropValueSize, PropValue, PropValueSizeRet); - } - if (Event->UrQueue && (Event->UrQueue->Properties & UR_QUEUE_FLAG_PROFILING_ENABLE) == 0) { return UR_RESULT_ERROR_PROFILING_INFO_NOT_AVAILABLE; @@ -482,6 +472,70 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventGetProfilingInfo( ze_kernel_timestamp_result_t tsResult; + // A Command-buffer consists of three command-lists for which only a single + // event is returned to users. The actual profiling information related to the + // command-buffer should therefore be extracted from graph events themselves. + // The timestamps of these events are saved in a memory region attached to + // event using CommandData field. The timings must therefore be recovered + // from this memory.
+ if (Event->CommandType == UR_COMMAND_COMMAND_BUFFER_ENQUEUE_EXP) { + if (Event->CommandData) { + struct command_buffer_profiling_t *ProfilingsPtr; + switch (PropName) { + case UR_PROFILING_INFO_COMMAND_START: { + ProfilingsPtr = static_cast( + Event->CommandData); + // Sync-point order does not necessarily match to the order of + // execution. We therefore look for the first command executed. + uint64_t MinStart = ProfilingsPtr->Timestamps->global.kernelStart; + for (uint64_t i = 1; i < ProfilingsPtr->NumEvents; i++) { + uint64_t Timestamp = ProfilingsPtr->Timestamps[i].global.kernelStart; + if (Timestamp < MinStart) { + MinStart = Timestamp; + } + } + uint64_t ContextStartTime = + (MinStart & TimestampMaxValue) * ZeTimerResolution; + return ReturnValue(ContextStartTime); + } + case UR_PROFILING_INFO_COMMAND_END: { + ProfilingsPtr = static_cast( + Event->CommandData); + // Sync-point order does not necessarily match to the order of + // execution. We therefore look for the last command executed. + uint64_t MaxEnd = ProfilingsPtr->Timestamps->global.kernelEnd; + uint64_t LastStart = ProfilingsPtr->Timestamps->global.kernelStart; + for (uint64_t i = 1; i < ProfilingsPtr->NumEvents; i++) { + uint64_t Timestamp = ProfilingsPtr->Timestamps[i].global.kernelEnd; + if (Timestamp > MaxEnd) { + MaxEnd = Timestamp; + LastStart = ProfilingsPtr->Timestamps[i].global.kernelStart; + } + } + uint64_t ContextStartTime = (LastStart & TimestampMaxValue); + uint64_t ContextEndTime = (MaxEnd & TimestampMaxValue); + + // + // Handle a possible wrap-around (the underlying HW counter is < + // 64-bit). Note, it will not report correct time if there were multiple + // wrap arounds, and the longer term plan is to enlarge the capacity of + // the HW timestamps. 
+ // + if (ContextEndTime <= ContextStartTime) { + ContextEndTime += TimestampMaxValue; + } + ContextEndTime *= ZeTimerResolution; + return ReturnValue(ContextEndTime); + } + default: + urPrint("urEventGetProfilingInfo: not supported ParamName\n"); + return UR_RESULT_ERROR_INVALID_VALUE; + } + } else { + return UR_RESULT_ERROR_PROFILING_INFO_NOT_AVAILABLE; + } + } + switch (PropName) { case UR_PROFILING_INFO_COMMAND_START: { ZE2UR_CALL(zeEventQueryKernelTimestamp, (Event->ZeEvent, &tsResult)); @@ -778,8 +832,10 @@ ur_result_t urEventReleaseInternal(ur_event_handle_t Event) { if (Event->CommandType == UR_COMMAND_COMMAND_BUFFER_ENQUEUE_EXP && Event->CommandData) { // Free the memory extra event allocated for profiling purposed. - auto AssociateEvent = static_cast(Event->CommandData); - urEventRelease(AssociateEvent); + struct command_buffer_profiling_t *ProfilingPtr = + static_cast(Event->CommandData); + urUSMFree(Event->Context, (void *)ProfilingPtr->Timestamps); + delete ProfilingPtr; Event->CommandData = nullptr; } if (Event->OwnNativeHandle) { From bd25d685cde0c23f66d1b4e8e499e3da7bdafddd Mon Sep 17 00:00:00 2001 From: Maxime France-Pillois Date: Fri, 10 Nov 2023 17:29:42 +0000 Subject: [PATCH 09/19] Changes USMShared memory allocation for host only allocation --- source/adapters/level_zero/command_buffer.cpp | 33 +++++++------------ source/adapters/level_zero/event.cpp | 22 ++++++------- 2 files changed, 23 insertions(+), 32 deletions(-) diff --git a/source/adapters/level_zero/command_buffer.cpp b/source/adapters/level_zero/command_buffer.cpp index d63630c456..7f3f514d9d 100644 --- a/source/adapters/level_zero/command_buffer.cpp +++ b/source/adapters/level_zero/command_buffer.cpp @@ -939,6 +939,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp( // Create a command-list to signal RetEvent on completion ur_command_list_ptr_t SignalCommandList{}; if (Event) { + ur_event_handle_t SyncEvent = CommandBuffer->SignalEvent; 
UR_CALL(Queue->Context->getAvailableCommandList(Queue, SignalCommandList, false, false)); @@ -946,10 +947,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp( UR_COMMAND_COMMAND_BUFFER_ENQUEUE_EXP, SignalCommandList, false)); - ZE2UR_CALL(zeCommandListAppendBarrier, - (SignalCommandList->first, RetEvent->ZeEvent, 1, - &(CommandBuffer->SignalEvent->ZeEvent))); - if ((Queue->Properties & UR_QUEUE_FLAG_PROFILING_ENABLE)) { // Multiple submissions of a command buffer implies that we need to save // the event timestamps before resubmiting the command buffer. We @@ -957,33 +954,27 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp( // before completing the command buffer execution, and then attach this // memory to the event returned to users to allow to allow the profiling // engine to recover these timestamps. - ur_usm_desc_t USMDesc{}; - ur_usm_device_desc_t UsmDeviceDesc{}; - UsmDeviceDesc.stype = UR_STRUCTURE_TYPE_USM_DEVICE_DESC; - ur_usm_host_desc_t UsmHostDesc{}; - UsmHostDesc.stype = UR_STRUCTURE_TYPE_USM_HOST_DESC; - UsmDeviceDesc.pNext = &UsmHostDesc; - USMDesc.pNext = &UsmDeviceDesc; - USMDesc.align = 4; // 4byte-aligned - - size_t Size = WaitEventList.size() * sizeof(ze_kernel_timestamp_result_t); + UR_CALL(createEventAndAssociateQueue( + Queue, &SyncEvent, UR_COMMAND_USM_MEMCPY, SignalCommandList, false)); - struct command_buffer_profiling_t *Profiling = - new command_buffer_profiling_t(); + command_buffer_profiling_t *Profiling = new command_buffer_profiling_t(); Profiling->NumEvents = WaitEventList.size(); - - urUSMSharedAlloc(RetEvent->Context, CommandBuffer->Device, &USMDesc, - nullptr, Size, (void **)&Profiling->Timestamps); + Profiling->Timestamps = + new ze_kernel_timestamp_result_t[Profiling->NumEvents]; ZE2UR_CALL(zeCommandListAppendQueryKernelTimestamps, (SignalCommandList->first, WaitEventList.size(), - WaitEventList.data(), Profiling->Timestamps, 0, - RetEvent->ZeEvent, 1, + WaitEventList.data(), (void 
*)Profiling->Timestamps, 0, + SyncEvent->ZeEvent, 1, &(CommandBuffer->SignalEvent->ZeEvent))); RetEvent->CommandData = static_cast(Profiling); } + + ZE2UR_CALL(zeCommandListAppendBarrier, + (SignalCommandList->first, RetEvent->ZeEvent, 1, + &(SyncEvent->ZeEvent))); } // Execution our command-lists asynchronously diff --git a/source/adapters/level_zero/event.cpp b/source/adapters/level_zero/event.cpp index 5e9397e4e7..2dc74ff5ac 100644 --- a/source/adapters/level_zero/event.cpp +++ b/source/adapters/level_zero/event.cpp @@ -480,14 +480,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventGetProfilingInfo( // from this memory. if (Event->CommandType == UR_COMMAND_COMMAND_BUFFER_ENQUEUE_EXP) { if (Event->CommandData) { - struct command_buffer_profiling_t *ProfilingsPtr; + command_buffer_profiling_t *ProfilingsPtr; switch (PropName) { case UR_PROFILING_INFO_COMMAND_START: { - ProfilingsPtr = static_cast( - Event->CommandData); + ProfilingsPtr = + static_cast(Event->CommandData); // Sync-point order does not necessarily match to the order of // execution. We therefore look for the first command executed. - uint64_t MinStart = ProfilingsPtr->Timestamps->global.kernelStart; + uint64_t MinStart = ProfilingsPtr->Timestamps[0].global.kernelStart; for (uint64_t i = 1; i < ProfilingsPtr->NumEvents; i++) { uint64_t Timestamp = ProfilingsPtr->Timestamps[i].global.kernelStart; if (Timestamp < MinStart) { @@ -499,12 +499,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventGetProfilingInfo( return ReturnValue(ContextStartTime); } case UR_PROFILING_INFO_COMMAND_END: { - ProfilingsPtr = static_cast( - Event->CommandData); + ProfilingsPtr = + static_cast(Event->CommandData); // Sync-point order does not necessarily match to the order of // execution. We therefore look for the last command executed. 
- uint64_t MaxEnd = ProfilingsPtr->Timestamps->global.kernelEnd; - uint64_t LastStart = ProfilingsPtr->Timestamps->global.kernelStart; + uint64_t MaxEnd = ProfilingsPtr->Timestamps[0].global.kernelEnd; + uint64_t LastStart = ProfilingsPtr->Timestamps[0].global.kernelStart; for (uint64_t i = 1; i < ProfilingsPtr->NumEvents; i++) { uint64_t Timestamp = ProfilingsPtr->Timestamps[i].global.kernelEnd; if (Timestamp > MaxEnd) { @@ -832,9 +832,9 @@ ur_result_t urEventReleaseInternal(ur_event_handle_t Event) { if (Event->CommandType == UR_COMMAND_COMMAND_BUFFER_ENQUEUE_EXP && Event->CommandData) { // Free the memory extra event allocated for profiling purposed. - struct command_buffer_profiling_t *ProfilingPtr = - static_cast(Event->CommandData); - urUSMFree(Event->Context, (void *)ProfilingPtr->Timestamps); + command_buffer_profiling_t *ProfilingPtr = + static_cast(Event->CommandData); + delete[] ProfilingPtr->Timestamps; delete ProfilingPtr; Event->CommandData = nullptr; } From 1db8fbf4f0c775719cdd2b42de58ddb978ea4e47 Mon Sep 17 00:00:00 2001 From: Maxime France-Pillois Date: Mon, 13 Nov 2023 12:02:23 +0000 Subject: [PATCH 10/19] Fixes event leak --- source/adapters/level_zero/command_buffer.cpp | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/source/adapters/level_zero/command_buffer.cpp b/source/adapters/level_zero/command_buffer.cpp index 7f3f514d9d..579be4447d 100644 --- a/source/adapters/level_zero/command_buffer.cpp +++ b/source/adapters/level_zero/command_buffer.cpp @@ -939,7 +939,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp( // Create a command-list to signal RetEvent on completion ur_command_list_ptr_t SignalCommandList{}; if (Event) { - ur_event_handle_t SyncEvent = CommandBuffer->SignalEvent; UR_CALL(Queue->Context->getAvailableCommandList(Queue, SignalCommandList, false, false)); @@ -954,9 +953,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp( // before completing the command buffer 
execution, and then attach this // memory to the event returned to users to allow to allow the profiling // engine to recover these timestamps. - UR_CALL(createEventAndAssociateQueue( - Queue, &SyncEvent, UR_COMMAND_USM_MEMCPY, SignalCommandList, false)); - command_buffer_profiling_t *Profiling = new command_buffer_profiling_t(); Profiling->NumEvents = WaitEventList.size(); @@ -966,15 +962,15 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp( ZE2UR_CALL(zeCommandListAppendQueryKernelTimestamps, (SignalCommandList->first, WaitEventList.size(), WaitEventList.data(), (void *)Profiling->Timestamps, 0, - SyncEvent->ZeEvent, 1, + RetEvent->ZeEvent, 1, &(CommandBuffer->SignalEvent->ZeEvent))); RetEvent->CommandData = static_cast(Profiling); + } else { + ZE2UR_CALL(zeCommandListAppendBarrier, + (SignalCommandList->first, RetEvent->ZeEvent, 1, + &(CommandBuffer->SignalEvent->ZeEvent))); } - - ZE2UR_CALL(zeCommandListAppendBarrier, - (SignalCommandList->first, RetEvent->ZeEvent, 1, - &(SyncEvent->ZeEvent))); } // Execution our command-lists asynchronously From 35b6a5eab04d57040eb0cf88866c8ef475f27bc9 Mon Sep 17 00:00:00 2001 From: Maxime France-Pillois Date: Wed, 15 Nov 2023 12:12:29 +0000 Subject: [PATCH 11/19] Moves wait-event reset from main CL to suffix CL --- source/adapters/level_zero/command_buffer.cpp | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/source/adapters/level_zero/command_buffer.cpp b/source/adapters/level_zero/command_buffer.cpp index 579be4447d..bbe49cb705 100644 --- a/source/adapters/level_zero/command_buffer.cpp +++ b/source/adapters/level_zero/command_buffer.cpp @@ -498,11 +498,6 @@ urCommandBufferFinalizeExp(ur_exp_command_buffer_handle_t CommandBuffer) { (CommandBuffer->ZeCommandList, CommandBuffer->SignalEvent->ZeEvent, NumEvents, WaitEventList.data())); - // Reset the wait-event for the UR command-buffer that is signalled when its - // submission dependencies have been satisfied. 
- ZE2UR_CALL(zeCommandListAppendEventReset, - (CommandBuffer->ZeCommandList, CommandBuffer->WaitEvent->ZeEvent)); - // Close the command list and have it ready for dispatch. ZE2UR_CALL(zeCommandListClose, (CommandBuffer->ZeCommandList)); return UR_RESULT_SUCCESS; @@ -938,10 +933,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp( ur_event_handle_t RetEvent{}; // Create a command-list to signal RetEvent on completion ur_command_list_ptr_t SignalCommandList{}; - if (Event) { - UR_CALL(Queue->Context->getAvailableCommandList(Queue, SignalCommandList, - false, false)); + UR_CALL(Queue->Context->getAvailableCommandList(Queue, SignalCommandList, + false, false)); + // Reset the wait-event for the UR command-buffer that is signalled when its + // submission dependencies have been satisfied. + ZE2UR_CALL(zeCommandListAppendEventReset, + (SignalCommandList->first, CommandBuffer->WaitEvent->ZeEvent)); + if (Event) { UR_CALL(createEventAndAssociateQueue(Queue, &RetEvent, UR_COMMAND_COMMAND_BUFFER_ENQUEUE_EXP, SignalCommandList, false)); @@ -968,8 +967,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp( RetEvent->CommandData = static_cast(Profiling); } else { ZE2UR_CALL(zeCommandListAppendBarrier, - (SignalCommandList->first, RetEvent->ZeEvent, 1, - &(CommandBuffer->SignalEvent->ZeEvent))); + (SignalCommandList->first, RetEvent->ZeEvent, 1, + &(CommandBuffer->SignalEvent->ZeEvent))); } } From e6e822ad3bfbc611e3f7921e305c07346cbe543b Mon Sep 17 00:00:00 2001 From: "Kenneth Benzie (Benie)" Date: Thu, 11 Jan 2024 16:13:34 +0000 Subject: [PATCH 12/19] [HIP] Fix maybe uninitialized warnings --- source/adapters/hip/enqueue.cpp | 16 ++++++++-------- source/adapters/hip/kernel.cpp | 2 +- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/source/adapters/hip/enqueue.cpp b/source/adapters/hip/enqueue.cpp index 68e3e665d2..a75d4ef9d1 100644 --- a/source/adapters/hip/enqueue.cpp +++ b/source/adapters/hip/enqueue.cpp @@ -1071,8 +1071,8 @@ 
UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageRead( hipArray *Array = std::get(hImage->Mem).getArray(Device); - hipArray_Format Format; - size_t NumChannels; + hipArray_Format Format{}; + size_t NumChannels{}; UR_CHECK_ERROR(getArrayDesc(Array, Format, NumChannels)); int ElementByteSize = imageElementByteSize(Format); @@ -1132,8 +1132,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageWrite( hipArray *Array = std::get(hImage->Mem).getArray(hQueue->getDevice()); - hipArray_Format Format; - size_t NumChannels; + hipArray_Format Format{}; + size_t NumChannels{}; UR_CHECK_ERROR(getArrayDesc(Array, Format, NumChannels)); int ElementByteSize = imageElementByteSize(Format); @@ -1195,14 +1195,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageCopy( hipArray *SrcArray = std::get(hImageSrc->Mem).getArray(hQueue->getDevice()); - hipArray_Format SrcFormat; - size_t SrcNumChannels; + hipArray_Format SrcFormat{}; + size_t SrcNumChannels{}; UR_CHECK_ERROR(getArrayDesc(SrcArray, SrcFormat, SrcNumChannels)); hipArray *DstArray = std::get(hImageDst->Mem).getArray(hQueue->getDevice()); - hipArray_Format DstFormat; - size_t DstNumChannels; + hipArray_Format DstFormat{}; + size_t DstNumChannels{}; UR_CHECK_ERROR(getArrayDesc(DstArray, DstFormat, DstNumChannels)); UR_ASSERT(SrcFormat == DstFormat, diff --git a/source/adapters/hip/kernel.cpp b/source/adapters/hip/kernel.cpp index e3eb37dc88..66b9fe4403 100644 --- a/source/adapters/hip/kernel.cpp +++ b/source/adapters/hip/kernel.cpp @@ -280,7 +280,7 @@ urKernelSetArgMemObj(ur_kernel_handle_t hKernel, uint32_t argIndex, hKernel->Args.addMemObjArg(argIndex, hArgValue, Properties->memoryAccess); if (hArgValue->isImage()) { auto array = std::get(hArgValue->Mem).getArray(Device); - hipArray_Format Format; + hipArray_Format Format{}; size_t NumChannels; UR_CHECK_ERROR(getArrayDesc(array, Format, NumChannels)); if (Format != HIP_AD_FORMAT_UNSIGNED_INT32 && From 0ebaca6ed1f988f0fc2d8138e1534323a78f9f65 Mon Sep 17 00:00:00 
2001 From: "Spruit, Neil R" Date: Thu, 11 Jan 2024 11:55:25 -0800 Subject: [PATCH 13/19] [L0] Only Override max allocation limits given env - Change the defaults from always allowing > 4GB allocations to making the user have to request > 4GB allocation support given the max allocation allowed on that system is less than 4GB. - This ensures performance is maintained on systems that dont handle > 4GB allocations natively and avoids breaking Ahead of Time (AOT) binaries that were built without > 4GB resource support. - By setting UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1 , the L0 Adapter will force the modules to be built with stateless or > 4GB support and will allow for the allocations to exceed the max single allocation size limit for that device. Signed-off-by: Spruit, Neil R --- source/adapters/level_zero/device.cpp | 26 +++++++++++--------------- source/adapters/level_zero/device.hpp | 2 +- source/adapters/level_zero/program.cpp | 4 ++-- source/adapters/level_zero/usm.cpp | 4 ++-- 4 files changed, 16 insertions(+), 20 deletions(-) diff --git a/source/adapters/level_zero/device.cpp b/source/adapters/level_zero/device.cpp index abdfd2e541..0b8e12c67a 100644 --- a/source/adapters/level_zero/device.cpp +++ b/source/adapters/level_zero/device.cpp @@ -10,6 +10,7 @@ #include "device.hpp" #include "ur_level_zero.hpp" +#include "ur_util.hpp" #include #include #include @@ -268,9 +269,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( return ReturnValue(uint32_t{64}); } case UR_DEVICE_INFO_MAX_MEM_ALLOC_SIZE: - // if not optimized for 32-bit access, return total memory size. - // otherwise, return only maximum allocatable size. - if (Device->useOptimized32bitAccess() == 0) { + // if the user wishes to allocate large allocations on a system that usually + // does not allow that allocation size, then we return the max global mem + // size as the limit. 
+ if (Device->useRelaxedAllocationLimits()) { return ReturnValue(uint64_t{calculateGlobalMemSize(Device)}); } else { return ReturnValue(uint64_t{Device->ZeDeviceProperties->maxMemAllocSize}); @@ -923,20 +925,14 @@ ur_device_handle_t_::useImmediateCommandLists() { } } -int32_t ur_device_handle_t_::useOptimized32bitAccess() { - static const int32_t Optimize32bitAccessMode = [this] { - // If device is Intel(R) Data Center GPU Max, - // use default provided by L0 driver. - // TODO: Use IP versioning to select based on range of devices - if (this->isPVC()) - return -1; - const char *UrRet = std::getenv("UR_L0_USE_OPTIMIZED_32BIT_ACCESS"); - if (!UrRet) - return 0; - return std::atoi(UrRet); +bool ur_device_handle_t_::useRelaxedAllocationLimits() { + static const bool EnableRelaxedAllocationLimits = [] { + auto UrRet = ur_getenv("UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS"); + const bool RetVal = UrRet ? std::stoi(*UrRet) : 0; + return RetVal; }(); - return Optimize32bitAccessMode; + return EnableRelaxedAllocationLimits; } ur_result_t ur_device_handle_t_::initialize(int SubSubDeviceOrdinal, diff --git a/source/adapters/level_zero/device.hpp b/source/adapters/level_zero/device.hpp index 3b91b70058..94480336c5 100644 --- a/source/adapters/level_zero/device.hpp +++ b/source/adapters/level_zero/device.hpp @@ -160,7 +160,7 @@ struct ur_device_handle_t_ : _ur_object { // provide support for only one, like for Intel(R) // Data Center GPU Max, for which L0 driver only // supports stateless. 
- int32_t useOptimized32bitAccess(); + bool useRelaxedAllocationLimits(); bool isSubDevice() { return RootDevice != nullptr; } diff --git a/source/adapters/level_zero/program.cpp b/source/adapters/level_zero/program.cpp index f118a5b9dd..bb2d964422 100644 --- a/source/adapters/level_zero/program.cpp +++ b/source/adapters/level_zero/program.cpp @@ -161,7 +161,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramBuildExp( ZeBuildOptions += pOptions; } - if (phDevices[0]->useOptimized32bitAccess() == 0) { + if (phDevices[0]->useRelaxedAllocationLimits()) { ZeBuildOptions += " -ze-opt-greater-than-4GB-buffer-required"; } @@ -256,7 +256,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramCompile( // ze-opt-greater-than-4GB-buffer-required to disable // stateful optimizations and be able to use larger than // 4GB allocations on these kernels. - if (Context->Devices[0]->useOptimized32bitAccess() == 0) { + if (Context->Devices[0]->useRelaxedAllocationLimits()) { Program->BuildFlags += " -ze-opt-greater-than-4GB-buffer-required"; } } diff --git a/source/adapters/level_zero/usm.cpp b/source/adapters/level_zero/usm.cpp index e4a00249a2..11245b5760 100644 --- a/source/adapters/level_zero/usm.cpp +++ b/source/adapters/level_zero/usm.cpp @@ -178,11 +178,11 @@ static ur_result_t USMDeviceAllocImpl(void **ResultPtr, ZeDesc.flags = 0; ZeDesc.ordinal = 0; - if (Device->useOptimized32bitAccess() == 0 && + ZeStruct RelaxedDesc; + if (Device->useRelaxedAllocationLimits() && (Size > Device->ZeDeviceProperties->maxMemAllocSize)) { // Tell Level-Zero to accept Size > maxMemAllocSize if // large allocations are used. 
- ZeStruct RelaxedDesc; RelaxedDesc.flags = ZE_RELAXED_ALLOCATION_LIMITS_EXP_FLAG_MAX_SIZE; ZeDesc.pNext = &RelaxedDesc; } From d45f09ea303aaf9b4013298382445849b910319a Mon Sep 17 00:00:00 2001 From: Wenju He Date: Sat, 25 Nov 2023 12:30:36 +0800 Subject: [PATCH 14/19] [NFC] Add utility function ur_unreachable to simplify code --- source/adapters/cuda/usm.cpp | 6 +----- source/adapters/hip/usm.cpp | 7 +------ source/adapters/native_cpu/kernel.cpp | 7 ++----- source/common/ur_util.hpp | 8 ++++++++ tools/urtrace/collector.cpp | 2 +- 5 files changed, 13 insertions(+), 17 deletions(-) diff --git a/source/adapters/cuda/usm.cpp b/source/adapters/cuda/usm.cpp index 54c9b68204..e844116312 100644 --- a/source/adapters/cuda/usm.cpp +++ b/source/adapters/cuda/usm.cpp @@ -227,11 +227,7 @@ urUSMGetMemAllocInfo(ur_context_handle_t hContext, const void *pMem, return ReturnValue(UR_USM_TYPE_HOST); } // should never get here -#ifdef _MSC_VER - __assume(0); -#else - __builtin_unreachable(); -#endif + ur_unreachable(); } case UR_USM_ALLOC_INFO_BASE_PTR: { #if CUDA_VERSION >= 10020 diff --git a/source/adapters/hip/usm.cpp b/source/adapters/hip/usm.cpp index abd8c2e97f..5d3d576d63 100644 --- a/source/adapters/hip/usm.cpp +++ b/source/adapters/hip/usm.cpp @@ -190,12 +190,7 @@ urUSMGetMemAllocInfo(ur_context_handle_t hContext, const void *pMem, return ReturnValue(UR_USM_TYPE_HOST); } // should never get here -#ifdef _MSC_VER - __assume(0); -#else - __builtin_unreachable(); -#endif - return ReturnValue(UR_USM_TYPE_UNKNOWN); + ur_unreachable(); } case UR_USM_ALLOC_INFO_DEVICE: { // get device index associated with this pointer diff --git a/source/adapters/native_cpu/kernel.cpp b/source/adapters/native_cpu/kernel.cpp index 2a7e765e41..a6fc670898 100644 --- a/source/adapters/native_cpu/kernel.cpp +++ b/source/adapters/native_cpu/kernel.cpp @@ -9,6 +9,7 @@ //===----------------------------------------------------------------------===// #include "ur_api.h" +#include "ur_util.hpp" #include 
"common.hpp" #include "kernel.hpp" @@ -172,11 +173,7 @@ urKernelGetSubGroupInfo(ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, return ReturnValue(0); } case UR_KERNEL_SUB_GROUP_INFO_FORCE_UINT32: { -#ifdef _MSC_VER - __assume(0); -#else - __builtin_unreachable(); -#endif + ur_unreachable(); } } DIE_NO_IMPLEMENTATION; diff --git a/source/common/ur_util.hpp b/source/common/ur_util.hpp index 5a34aa6584..aa67dc764b 100644 --- a/source/common/ur_util.hpp +++ b/source/common/ur_util.hpp @@ -298,4 +298,12 @@ inline ur_result_t exceptionToResult(std::exception_ptr eptr) { template inline constexpr bool ur_always_false_t = false; +[[noreturn]] inline void ur_unreachable() { +#ifdef _MSC_VER + __assume(0); +#else + __builtin_unreachable(); +#endif +} + #endif /* UR_UTIL_H */ diff --git a/tools/urtrace/collector.cpp b/tools/urtrace/collector.cpp index a78cb82d08..e7872dfbca 100644 --- a/tools/urtrace/collector.cpp +++ b/tools/urtrace/collector.cpp @@ -277,7 +277,7 @@ std::unique_ptr create_writer() { case OUTPUT_JSON: return std::make_unique(); default: - assert(0); /* unreachable */ + ur_unreachable(); } return nullptr; } From 5cde53733d093c4bf301c72e0892e71653c1c624 Mon Sep 17 00:00:00 2001 From: Wenju He Date: Mon, 27 Nov 2023 19:28:59 +0800 Subject: [PATCH 15/19] ur_unreachable -> ur::unreachable --- source/adapters/cuda/usm.cpp | 2 +- source/adapters/hip/usm.cpp | 2 +- source/adapters/native_cpu/kernel.cpp | 2 +- source/common/ur_util.hpp | 4 +++- tools/urtrace/collector.cpp | 2 +- 5 files changed, 7 insertions(+), 5 deletions(-) diff --git a/source/adapters/cuda/usm.cpp b/source/adapters/cuda/usm.cpp index e844116312..837b4de6c7 100644 --- a/source/adapters/cuda/usm.cpp +++ b/source/adapters/cuda/usm.cpp @@ -227,7 +227,7 @@ urUSMGetMemAllocInfo(ur_context_handle_t hContext, const void *pMem, return ReturnValue(UR_USM_TYPE_HOST); } // should never get here - ur_unreachable(); + ur::unreachable(); } case UR_USM_ALLOC_INFO_BASE_PTR: { #if CUDA_VERSION >= 10020 
diff --git a/source/adapters/hip/usm.cpp b/source/adapters/hip/usm.cpp index 5d3d576d63..e660c1aee0 100644 --- a/source/adapters/hip/usm.cpp +++ b/source/adapters/hip/usm.cpp @@ -190,7 +190,7 @@ urUSMGetMemAllocInfo(ur_context_handle_t hContext, const void *pMem, return ReturnValue(UR_USM_TYPE_HOST); } // should never get here - ur_unreachable(); + ur::unreachable(); } case UR_USM_ALLOC_INFO_DEVICE: { // get device index associated with this pointer diff --git a/source/adapters/native_cpu/kernel.cpp b/source/adapters/native_cpu/kernel.cpp index a6fc670898..5a7a286adc 100644 --- a/source/adapters/native_cpu/kernel.cpp +++ b/source/adapters/native_cpu/kernel.cpp @@ -173,7 +173,7 @@ urKernelGetSubGroupInfo(ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, return ReturnValue(0); } case UR_KERNEL_SUB_GROUP_INFO_FORCE_UINT32: { - ur_unreachable(); + ur::unreachable(); } } DIE_NO_IMPLEMENTATION; diff --git a/source/common/ur_util.hpp b/source/common/ur_util.hpp index aa67dc764b..51688cbe32 100644 --- a/source/common/ur_util.hpp +++ b/source/common/ur_util.hpp @@ -298,12 +298,14 @@ inline ur_result_t exceptionToResult(std::exception_ptr eptr) { template inline constexpr bool ur_always_false_t = false; -[[noreturn]] inline void ur_unreachable() { +namespace ur { +[[noreturn]] inline void unreachable() { #ifdef _MSC_VER __assume(0); #else __builtin_unreachable(); #endif } +} // namespace ur #endif /* UR_UTIL_H */ diff --git a/tools/urtrace/collector.cpp b/tools/urtrace/collector.cpp index e7872dfbca..766e7c9dfe 100644 --- a/tools/urtrace/collector.cpp +++ b/tools/urtrace/collector.cpp @@ -277,7 +277,7 @@ std::unique_ptr create_writer() { case OUTPUT_JSON: return std::make_unique(); default: - ur_unreachable(); + ur::unreachable(); } return nullptr; } From 5accf50955ccd807f09024f15ea7fe7d0a008093 Mon Sep 17 00:00:00 2001 From: Wenju He Date: Tue, 28 Nov 2023 08:13:08 +0800 Subject: [PATCH 16/19] include ur_util.hpp in cuda/usm.cpp hip/usm.cpp --- 
source/adapters/cuda/usm.cpp | 1 + source/adapters/hip/usm.cpp | 1 + 2 files changed, 2 insertions(+) diff --git a/source/adapters/cuda/usm.cpp b/source/adapters/cuda/usm.cpp index 837b4de6c7..8929fb7fa1 100644 --- a/source/adapters/cuda/usm.cpp +++ b/source/adapters/cuda/usm.cpp @@ -17,6 +17,7 @@ #include "event.hpp" #include "platform.hpp" #include "queue.hpp" +#include "ur_util.hpp" #include "usm.hpp" #include diff --git a/source/adapters/hip/usm.cpp b/source/adapters/hip/usm.cpp index e660c1aee0..9d084f7b4e 100644 --- a/source/adapters/hip/usm.cpp +++ b/source/adapters/hip/usm.cpp @@ -15,6 +15,7 @@ #include "context.hpp" #include "device.hpp" #include "platform.hpp" +#include "ur_util.hpp" #include "usm.hpp" /// USM: Implements USM Host allocations using HIP Pinned Memory From fe5c2237425e3482854b2ba07afa533bb589014e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=81ukasz=20Stolarczuk?= Date: Mon, 15 Jan 2024 15:11:04 +0100 Subject: [PATCH 17/19] [CI] Add Dockerfiles linter --- .github/workflows/hadolint.yml | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 .github/workflows/hadolint.yml diff --git a/.github/workflows/hadolint.yml b/.github/workflows/hadolint.yml new file mode 100644 index 0000000000..1674207f70 --- /dev/null +++ b/.github/workflows/hadolint.yml @@ -0,0 +1,32 @@ +# Runs linter for Docker files +name: Hadolint + +on: + workflow_dispatch: + push: + pull_request: + paths: + - '.github/docker/*Dockerfile' + - '.github/workflows/hadolint.yml' + +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} + cancel-in-progress: true + +permissions: + contents: read + +jobs: + linux: + name: Hadolint + runs-on: ubuntu-latest + + steps: + - name: Clone the git repo + uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 + + - name: Run Hadolint + uses: hadolint/hadolint-action@54c9adbab1582c2ef04b2016b760714a4bfde3cf # v3.1.0 + with: + recursive: true + dockerfile: 
".github/docker/*Dockerfile" From d08237c69e877a3615b4d80c096f8e8efd8a446a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=81ukasz=20Stolarczuk?= Date: Mon, 15 Jan 2024 16:31:32 +0100 Subject: [PATCH 18/19] [CI] Fix linter issues in Ubuntu Dockerfile --- .github/docker/ubuntu-22.04.Dockerfile | 17 ++++++++++------- .github/workflows/hadolint.yml | 2 ++ 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/.github/docker/ubuntu-22.04.Dockerfile b/.github/docker/ubuntu-22.04.Dockerfile index 38161f5b6e..e8e88caa72 100644 --- a/.github/docker/ubuntu-22.04.Dockerfile +++ b/.github/docker/ubuntu-22.04.Dockerfile @@ -1,4 +1,4 @@ -# Copyright (C) 2023 Intel Corporation +# Copyright (C) 2023-2024 Intel Corporation # Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. # See LICENSE.TXT # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception @@ -51,20 +51,23 @@ RUN apt-get update \ ${BASE_DEPS} \ ${UR_DEPS} \ ${MISC_DEPS} \ + && rm -rf /var/lib/apt/lists/* \ && apt-get clean all -RUN pip3 install ${UR_PYTHON_DEPS} +# pip package is pinned to a version, but it's probably improperly parsed here +# hadolint ignore=DL3013 +RUN pip3 install --no-cache-dir ${UR_PYTHON_DEPS} # Install DPC++ -COPY install_dpcpp.sh install_dpcpp.sh +COPY install_dpcpp.sh /opt/install_dpcpp.sh ENV DPCPP_PATH=/opt/dpcpp -RUN ./install_dpcpp.sh +RUN /opt/install_dpcpp.sh # Install libbacktrace -COPY install_libbacktrace.sh install_libbacktrace.sh -RUN ./install_libbacktrace.sh +COPY install_libbacktrace.sh /opt/install_libbacktrace.sh +RUN /opt/install_libbacktrace.sh # Add a new (non-root) 'user' ENV USER user ENV USERPASS pass -RUN useradd -m $USER -g sudo -p `mkpasswd $USERPASS` +RUN useradd -m "${USER}" -g sudo -p "$(mkpasswd ${USERPASS})" diff --git a/.github/workflows/hadolint.yml b/.github/workflows/hadolint.yml index 1674207f70..a1063aefe8 100644 --- a/.github/workflows/hadolint.yml +++ b/.github/workflows/hadolint.yml @@ -30,3 +30,5 @@ jobs: with: 
recursive: true dockerfile: ".github/docker/*Dockerfile" + # ignore pinning apt packages to versions + ignore: DL3008 From b51dec07139ccaf76765c83936a84bd75f3a54dc Mon Sep 17 00:00:00 2001 From: Yang Zhao Date: Wed, 17 Jan 2024 16:40:51 +0800 Subject: [PATCH 19/19] [UR][Layer] Add Sanitizer Layer (#1074) --- .gitignore | 4 + CMakeLists.txt | 5 + README.md | 1 + scripts/core/INTRO.rst | 15 + source/common/logger/ur_logger.hpp | 5 + source/common/logger/ur_logger_details.hpp | 8 + source/common/logger/ur_sinks.hpp | 2 +- source/loader/CMakeLists.txt | 17 + .../layers/sanitizer/asan_interceptor.cpp | 605 ++++++++++++++++++ .../layers/sanitizer/asan_interceptor.hpp | 141 ++++ source/loader/layers/sanitizer/common.hpp | 90 +++ .../sanitizer/device_sanitizer_report.hpp | 95 +++ source/loader/layers/sanitizer/ur_sanddi.cpp | 485 ++++++++++++++ .../layers/sanitizer/ur_sanitizer_layer.cpp | 30 + .../layers/sanitizer/ur_sanitizer_layer.hpp | 55 ++ source/loader/ur_lib.hpp | 8 +- 16 files changed, 1564 insertions(+), 2 deletions(-) create mode 100644 source/loader/layers/sanitizer/asan_interceptor.cpp create mode 100644 source/loader/layers/sanitizer/asan_interceptor.hpp create mode 100644 source/loader/layers/sanitizer/common.hpp create mode 100644 source/loader/layers/sanitizer/device_sanitizer_report.hpp create mode 100644 source/loader/layers/sanitizer/ur_sanddi.cpp create mode 100644 source/loader/layers/sanitizer/ur_sanitizer_layer.cpp create mode 100644 source/loader/layers/sanitizer/ur_sanitizer_layer.hpp diff --git a/.gitignore b/.gitignore index 85770fe15c..89736ad22a 100644 --- a/.gitignore +++ b/.gitignore @@ -86,3 +86,7 @@ out/ # External content */**/external + +# VS clangd +/.cache +/compile_commands.json diff --git a/CMakeLists.txt b/CMakeLists.txt index fcdf90f173..20e8da9d6c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -34,6 +34,7 @@ option(UR_USE_UBSAN "enable UndefinedBehaviorSanitizer" OFF) option(UR_USE_MSAN "enable MemorySanitizer" OFF) 
option(UR_USE_TSAN "enable ThreadSanitizer" OFF) option(UR_ENABLE_TRACING "enable api tracing through xpti" OFF) +option(UR_ENABLE_SANITIZER "enable device sanitizer" ON) option(UMF_BUILD_SHARED_LIBRARY "Build UMF as shared library" OFF) option(UMF_ENABLE_POOL_TRACKING "Build UMF with pool tracking" ON) option(UR_BUILD_ADAPTER_L0 "Build the Level-Zero adapter" OFF) @@ -121,6 +122,10 @@ if(UR_ENABLE_TRACING) endif() endif() +if(UR_ENABLE_SANITIZER) + add_compile_definitions(UR_ENABLE_SANITIZER) +endif() + if(UR_USE_ASAN) add_sanitizer_flag(address) endif() diff --git a/README.md b/README.md index 57536f237a..8bc58a92a2 100644 --- a/README.md +++ b/README.md @@ -126,6 +126,7 @@ List of options provided by CMake: | UR_USE_UBSAN | Enable UndefinedBehavior Sanitizer | ON/OFF | OFF | | UR_USE_MSAN | Enable MemorySanitizer (clang only) | ON/OFF | OFF | | UR_ENABLE_TRACING | Enable XPTI-based tracing layer | ON/OFF | OFF | +| UR_ENABLE_SANITIZER | Enable device sanitizer layer | ON/OFF | ON | | UR_CONFORMANCE_TARGET_TRIPLES | SYCL triples to build CTS device binaries for | Comma-separated list | spir64 | | UR_BUILD_ADAPTER_L0 | Build the Level-Zero adapter | ON/OFF | OFF | | UR_BUILD_ADAPTER_OPENCL | Build the OpenCL adapter | ON/OFF | OFF | diff --git a/scripts/core/INTRO.rst b/scripts/core/INTRO.rst index d557193ef0..ae94301d55 100644 --- a/scripts/core/INTRO.rst +++ b/scripts/core/INTRO.rst @@ -179,6 +179,15 @@ Unified Runtime loader implements tracing support through the `XPTI framework (args)...); } +template +inline void always(const char *format, Args &&...args) { + get_logger().always(format, std::forward(args)...); +} + inline void setLevel(logger::Level level) { get_logger().setLevel(level); } inline void setFlushLevel(logger::Level level) { diff --git a/source/common/logger/ur_logger_details.hpp b/source/common/logger/ur_logger_details.hpp index 4759a2fd24..6ff279ad1a 100644 --- a/source/common/logger/ur_logger_details.hpp +++ 
b/source/common/logger/ur_logger_details.hpp @@ -51,6 +51,14 @@ class Logger { log(logger::Level::ERR, format, std::forward(args)...); } + template + void always(const char *format, Args &&...args) { + if (sink) { + sink->log(logger::Level::QUIET, format, + std::forward(args)...); + } + } + template void log(logger::Level level, const char *format, Args &&...args) { if (level < this->level) { diff --git a/source/common/logger/ur_sinks.hpp b/source/common/logger/ur_sinks.hpp index cb8c751e4d..db30f3c8ed 100644 --- a/source/common/logger/ur_sinks.hpp +++ b/source/common/logger/ur_sinks.hpp @@ -22,7 +22,7 @@ class Sink { template void log(logger::Level level, const char *fmt, Args &&...args) { std::ostringstream buffer; - if (!skip_prefix) { + if (!skip_prefix && level != logger::Level::QUIET) { buffer << "<" << logger_name << ">" << "[" << level_to_str(level) << "]: "; } diff --git a/source/loader/CMakeLists.txt b/source/loader/CMakeLists.txt index d4f5bc73a5..b0c8bbcb86 100644 --- a/source/loader/CMakeLists.txt +++ b/source/loader/CMakeLists.txt @@ -101,6 +101,23 @@ if(UR_ENABLE_TRACING) ) endif() +if(UR_ENABLE_SANITIZER) + target_sources(ur_loader + PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/../ur/ur.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/asan_interceptor.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/asan_interceptor.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/common.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/device_sanitizer_report.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/ur_sanitizer_layer.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/ur_sanitizer_layer.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/ur_sanddi.cpp + ) + target_include_directories(ur_loader PRIVATE + "${CMAKE_CURRENT_SOURCE_DIR}/../" + ) +endif() + # link validation backtrace dependencies if(UNIX) diff --git a/source/loader/layers/sanitizer/asan_interceptor.cpp b/source/loader/layers/sanitizer/asan_interceptor.cpp new file mode 100644 index 
0000000000..394405c056 --- /dev/null +++ b/source/loader/layers/sanitizer/asan_interceptor.cpp @@ -0,0 +1,605 @@ +//===----------------------------------------------------------------------===// +/* + * + * Copyright (C) 2023 Intel Corporation + * + * Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. + * See LICENSE.TXT + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + * @file asan_interceptor.cpp + * + */ + +#include "asan_interceptor.hpp" +#include "device_sanitizer_report.hpp" +#include "ur_sanitizer_layer.hpp" + +namespace ur_sanitizer_layer { + +namespace { + +// These magic values are written to shadow for better error +// reporting. +constexpr int kUsmDeviceRedzoneMagic = (char)0x81; +constexpr int kUsmHostRedzoneMagic = (char)0x82; +constexpr int kUsmSharedRedzoneMagic = (char)0x83; +constexpr int kMemBufferRedzoneMagic = (char)0x84; + +constexpr auto kSPIR_AsanShadowMemoryGlobalStart = + "__AsanShadowMemoryGlobalStart"; +constexpr auto kSPIR_AsanShadowMemoryGlobalEnd = "__AsanShadowMemoryGlobalEnd"; + +constexpr auto kSPIR_DeviceSanitizerReportMem = "__DeviceSanitizerReportMem"; + +DeviceSanitizerReport SPIR_DeviceSanitizerReportMem; + +uptr MemToShadow_PVC(uptr USM_SHADOW_BASE, uptr UPtr) { + if (UPtr & 0xFF00000000000000ULL) { // Device USM + return USM_SHADOW_BASE + 0x200000000000ULL + + ((UPtr & 0xFFFFFFFFFFFFULL) >> 3); + } else { // Only consider 47bit VA + return USM_SHADOW_BASE + ((UPtr & 0x7FFFFFFFFFFFULL) >> 3); + } +} + +ur_context_handle_t getContext(ur_queue_handle_t Queue) { + ur_context_handle_t Context; + [[maybe_unused]] auto Result = context.urDdiTable.Queue.pfnGetInfo( + Queue, UR_QUEUE_INFO_CONTEXT, sizeof(ur_context_handle_t), &Context, + nullptr); + assert(Result == UR_RESULT_SUCCESS); + return Context; +} + +ur_device_handle_t getDevice(ur_queue_handle_t Queue) { + ur_device_handle_t Device; + [[maybe_unused]] auto Result = context.urDdiTable.Queue.pfnGetInfo( + Queue, 
UR_QUEUE_INFO_DEVICE, sizeof(ur_device_handle_t), &Device, + nullptr); + assert(Result == UR_RESULT_SUCCESS); + return Device; +} + +ur_program_handle_t getProgram(ur_kernel_handle_t Kernel) { + ur_program_handle_t Program; + [[maybe_unused]] auto Result = context.urDdiTable.Kernel.pfnGetInfo( + Kernel, UR_KERNEL_INFO_PROGRAM, sizeof(ur_program_handle_t), &Program, + nullptr); + assert(Result == UR_RESULT_SUCCESS); + return Program; +} + +} // namespace + +/// The memory chunk allocated from the underlying allocator looks like this: +/// L L L L L L U U U U U U R R +/// L -- left redzone words (0 or more bytes) +/// U -- user memory. +/// R -- right redzone (0 or more bytes) +/// +/// ref: "compiler-rt/lib/asan/asan_allocator.cpp" Allocator::Allocate +ur_result_t SanitizerInterceptor::allocateMemory( + ur_context_handle_t Context, ur_device_handle_t Device, + const ur_usm_desc_t *Properties, ur_usm_pool_handle_t Pool, size_t Size, + void **ResultPtr, USMMemoryType Type) { + auto Alignment = Properties->align; + assert(Alignment == 0 || IsPowerOfTwo(Alignment)); + + auto ContextInfo = getContextInfo(Context); + std::shared_ptr DeviceInfo; + if (Device) { + DeviceInfo = ContextInfo->getDeviceInfo(Device); + } + + if (Alignment == 0) { + Alignment = + DeviceInfo ? 
DeviceInfo->Alignment : ASAN_SHADOW_GRANULARITY; + } + + // Copy from LLVM compiler-rt/lib/asan + uptr RZLog = ComputeRZLog(Size); + uptr RZSize = RZLog2Size(RZLog); + uptr RoundedSize = RoundUpTo(Size, Alignment); + uptr NeededSize = RoundedSize + RZSize * 2; + + void *Allocated = nullptr; + + if (Type == USMMemoryType::DEVICE) { + UR_CALL(context.urDdiTable.USM.pfnDeviceAlloc( + Context, Device, Properties, Pool, NeededSize, &Allocated)); + } else if (Type == USMMemoryType::HOST) { + UR_CALL(context.urDdiTable.USM.pfnHostAlloc(Context, Properties, Pool, + NeededSize, &Allocated)); + } else if (Type == USMMemoryType::SHARE) { + UR_CALL(context.urDdiTable.USM.pfnSharedAlloc( + Context, Device, Properties, Pool, NeededSize, &Allocated)); + } else { + context.logger.error("Unsupport memory type"); + return UR_RESULT_ERROR_INVALID_ARGUMENT; + } + + // Copy from LLVM compiler-rt/lib/asan + uptr AllocBegin = reinterpret_cast(Allocated); + [[maybe_unused]] uptr AllocEnd = AllocBegin + NeededSize; + uptr UserBegin = AllocBegin + RZSize; + if (!IsAligned(UserBegin, Alignment)) { + UserBegin = RoundUpTo(UserBegin, Alignment); + } + uptr UserEnd = UserBegin + Size; + assert(UserEnd <= AllocEnd); + + *ResultPtr = reinterpret_cast(UserBegin); + + auto AllocInfo = std::make_shared( + USMAllocInfo{AllocBegin, UserBegin, UserEnd, NeededSize, Type}); + + // For updating shadow memory + if (DeviceInfo) { // device/shared USM + std::scoped_lock Guard(DeviceInfo->Mutex); + DeviceInfo->AllocInfos.emplace_back(AllocInfo); + } else { // host USM's AllocInfo needs to insert into all devices + for (auto &pair : ContextInfo->DeviceMap) { + auto DeviceInfo = pair.second; + std::scoped_lock Guard(DeviceInfo->Mutex); + DeviceInfo->AllocInfos.emplace_back(AllocInfo); + } + } + + // For memory release + { + std::scoped_lock Guard(ContextInfo->Mutex); + ContextInfo->AllocatedUSMMap[AllocBegin] = AllocInfo; + } + + context.logger.info( + "AllocInfos(AllocBegin={}, User={}-{}, NeededSize={}, 
Type={})", + (void *)AllocBegin, (void *)UserBegin, (void *)UserEnd, NeededSize, + Type); + + return UR_RESULT_SUCCESS; +} + +ur_result_t SanitizerInterceptor::releaseMemory(ur_context_handle_t Context, + void *Ptr) { + auto ContextInfo = getContextInfo(Context); + + std::shared_lock Guard(ContextInfo->Mutex); + + auto Addr = reinterpret_cast(Ptr); + // Find the last element is not greater than key + auto AllocInfoIt = ContextInfo->AllocatedUSMMap.upper_bound((uptr)Addr); + if (AllocInfoIt == ContextInfo->AllocatedUSMMap.begin()) { + context.logger.error( + "Can't find release pointer({}) in AllocatedAddressesMap", Ptr); + return UR_RESULT_ERROR_INVALID_ARGUMENT; + } + --AllocInfoIt; + auto &AllocInfo = AllocInfoIt->second; + + context.logger.debug("USMAllocInfo(AllocBegin={}, UserBegin={})", + AllocInfo->AllocBegin, AllocInfo->UserBegin); + + if (Addr != AllocInfo->UserBegin) { + context.logger.error("Releasing pointer({}) is not match to {}", Ptr, + AllocInfo->UserBegin); + return UR_RESULT_ERROR_INVALID_ARGUMENT; + } + + // TODO: Update shadow memory + return context.urDdiTable.USM.pfnFree(Context, + (void *)AllocInfo->AllocBegin); +} + +bool SanitizerInterceptor::preLaunchKernel(ur_kernel_handle_t Kernel, + ur_queue_handle_t Queue, + ur_event_handle_t &Event) { + prepareLaunch(Queue, Kernel); + + UR_CALL(updateShadowMemory(Queue)); + + // Return LastEvent in QueueInfo + auto Context = getContext(Queue); + auto ContextInfo = getContextInfo(Context); + auto QueueInfo = ContextInfo->getQueueInfo(Queue); + + std::scoped_lock Guard(QueueInfo->Mutex); + Event = QueueInfo->LastEvent; + QueueInfo->LastEvent = nullptr; + + return true; +} + +void SanitizerInterceptor::postLaunchKernel(ur_kernel_handle_t Kernel, + ur_queue_handle_t Queue, + ur_event_handle_t &Event) { + auto Program = getProgram(Kernel); + ur_event_handle_t ReadEvent{}; + + // If kernel has defined SPIR_DeviceSanitizerReportMem, then we try to read it + // to host, but it's okay that it isn't defined + 
auto Result = context.urDdiTable.Enqueue.pfnDeviceGlobalVariableRead( + Queue, Program, kSPIR_DeviceSanitizerReportMem, true, + sizeof(SPIR_DeviceSanitizerReportMem), 0, + &SPIR_DeviceSanitizerReportMem, 1, &Event, &ReadEvent); + + if (Result == UR_RESULT_SUCCESS) { + Event = ReadEvent; + + auto AH = &SPIR_DeviceSanitizerReportMem; + if (!AH->Flag) { + return; + } + + const char *File = AH->File[0] ? AH->File : ""; + const char *Func = AH->Func[0] ? AH->Func : ""; + + context.logger.always("\n====ERROR: DeviceSanitizer: {} on {}", + DeviceSanitizerFormat(AH->ErrorType), + DeviceSanitizerFormat(AH->MemoryType)); + context.logger.always( + "{} of size {} at kernel <{}> LID({}, {}, {}) GID({}, " + "{}, {})", + AH->IsWrite ? "WRITE" : "READ", AH->AccessSize, Func, AH->LID0, + AH->LID1, AH->LID2, AH->GID0, AH->GID1, AH->GID2); + context.logger.always(" #0 {} {}:{}", Func, File, AH->Line); + if (!AH->IsRecover) { + abort(); + } + } +} + +std::string SanitizerInterceptor::getKernelName(ur_kernel_handle_t Kernel) { + size_t KernelNameSize = 0; + [[maybe_unused]] auto Res = context.urDdiTable.Kernel.pfnGetInfo( + Kernel, UR_KERNEL_INFO_FUNCTION_NAME, 0, nullptr, &KernelNameSize); + assert(Res == UR_RESULT_SUCCESS); + + std::vector KernelNameBuf(KernelNameSize); + Res = context.urDdiTable.Kernel.pfnGetInfo( + Kernel, UR_KERNEL_INFO_FUNCTION_NAME, KernelNameSize, + KernelNameBuf.data(), nullptr); + assert(Res == UR_RESULT_SUCCESS); + + return std::string(KernelNameBuf.data(), KernelNameSize - 1); +} + +ur_result_t SanitizerInterceptor::allocShadowMemory( + ur_context_handle_t Context, std::shared_ptr &DeviceInfo) { + if (DeviceInfo->Type == DeviceType::CPU) { + context.logger.error("Unsupport device type"); + return UR_RESULT_ERROR_INVALID_ARGUMENT; + } else if (DeviceInfo->Type == DeviceType::GPU_PVC) { + /// SHADOW MEMORY MAPPING (PVC, with CPU 47bit) + /// Host/Shared USM : 0x0 ~ 0x0fff_ffff_ffff + /// ? 
: 0x1000_0000_0000 ~ 0x1fff_ffff_ffff + /// Device USM : 0x2000_0000_0000 ~ 0x3fff_ffff_ffff + constexpr size_t SHADOW_SIZE = 1ULL << 46; + + // TODO: Protect Bad Zone + auto Result = context.urDdiTable.VirtualMem.pfnReserve( + Context, nullptr, SHADOW_SIZE, (void **)&DeviceInfo->ShadowOffset); + if (Result != UR_RESULT_SUCCESS) { + context.logger.error("Failed to allocate shadow memory on PVC: {}", + Result); + return Result; + } + + DeviceInfo->ShadowOffsetEnd = DeviceInfo->ShadowOffset + SHADOW_SIZE; + } else { + context.logger.error("Unsupport device type"); + return UR_RESULT_ERROR_INVALID_ARGUMENT; + } + context.logger.info("ShadowMemory(Global): {} - {}", + (void *)DeviceInfo->ShadowOffset, + (void *)DeviceInfo->ShadowOffsetEnd); + return UR_RESULT_SUCCESS; +} + +ur_result_t SanitizerInterceptor::enqueueMemSetShadow( + ur_context_handle_t Context, ur_device_handle_t Device, + ur_queue_handle_t Queue, uptr Ptr, uptr Size, u8 Value, + ur_event_handle_t DepEvent, ur_event_handle_t *OutEvent) { + + uint32_t NumEventsInWaitList = DepEvent ? 1 : 0; + const ur_event_handle_t *EventsWaitList = DepEvent ? &DepEvent : nullptr; + + auto ContextInfo = getContextInfo(Context); + auto DeviceInfo = ContextInfo->getDeviceInfo(Device); + + if (DeviceInfo->Type == DeviceType::CPU) { + context.logger.error("Unsupport device type"); + return UR_RESULT_ERROR_INVALID_ARGUMENT; + } else if (DeviceInfo->Type == DeviceType::GPU_PVC) { + ur_event_handle_t InternalEvent{}; + ur_event_handle_t *Event = OutEvent ? 
OutEvent : &InternalEvent; + + uptr ShadowBegin = MemToShadow_PVC(DeviceInfo->ShadowOffset, Ptr); + uptr ShadowEnd = + MemToShadow_PVC(DeviceInfo->ShadowOffset, Ptr + Size - 1); + + { + static const size_t PageSize = [Context, Device]() { + size_t Size; + [[maybe_unused]] auto Result = + context.urDdiTable.VirtualMem.pfnGranularityGetInfo( + Context, Device, + UR_VIRTUAL_MEM_GRANULARITY_INFO_RECOMMENDED, + sizeof(Size), &Size, nullptr); + assert(Result == UR_RESULT_SUCCESS); + context.logger.info("PVC PageSize: {}", Size); + return Size; + }(); + + ur_physical_mem_properties_t Desc{ + UR_STRUCTURE_TYPE_PHYSICAL_MEM_PROPERTIES, nullptr, 0}; + static ur_physical_mem_handle_t PhysicalMem{}; + + // Make sure [Ptr, Ptr + Size] is mapped to physical memory + for (auto MappedPtr = RoundDownTo(ShadowBegin, PageSize); + MappedPtr <= ShadowEnd; MappedPtr += PageSize) { + if (!PhysicalMem) { + auto URes = context.urDdiTable.PhysicalMem.pfnCreate( + Context, Device, PageSize, &Desc, &PhysicalMem); + if (URes != UR_RESULT_SUCCESS) { + context.logger.error("urPhysicalMemCreate(): {}", URes); + return URes; + } + } + + context.logger.debug("urVirtualMemMap: {} ~ {}", + (void *)MappedPtr, + (void *)(MappedPtr + PageSize - 1)); + + // FIXME: No flag to check the failed reason is VA is already mapped + auto URes = context.urDdiTable.VirtualMem.pfnMap( + Context, (void *)MappedPtr, PageSize, PhysicalMem, 0, + UR_VIRTUAL_MEM_ACCESS_FLAG_READ_WRITE); + if (URes != UR_RESULT_SUCCESS) { + context.logger.debug("urVirtualMemMap(): {}", URes); + } + + // Initialize to zero + if (URes == UR_RESULT_SUCCESS) { + // Reset PhysicalMem to null since it's been mapped + PhysicalMem = nullptr; + + const char Pattern[] = {0}; + + auto URes = context.urDdiTable.Enqueue.pfnUSMFill( + Queue, (void *)MappedPtr, 1, Pattern, PageSize, + NumEventsInWaitList, EventsWaitList, Event); + if (URes != UR_RESULT_SUCCESS) { + context.logger.error("urEnqueueUSMFill(): {}", URes); + return URes; + } + + 
NumEventsInWaitList = 1; + EventsWaitList = Event; + } + } + } + + const char Pattern[] = {(char)Value}; + auto URes = context.urDdiTable.Enqueue.pfnUSMFill( + Queue, (void *)ShadowBegin, 1, Pattern, + (ShadowEnd - ShadowBegin + 1), NumEventsInWaitList, EventsWaitList, + Event); + if (URes != UR_RESULT_SUCCESS) { + context.logger.error("urEnqueueUSMFill(): {}", URes); + return URes; + } + } else { + context.logger.error("Unsupport device type"); + return UR_RESULT_ERROR_INVALID_ARGUMENT; + } + return UR_RESULT_SUCCESS; +} + +/// Each 8 bytes of application memory are mapped into one byte of shadow memory +/// The meaning of that byte: +/// - Negative: All bytes are not accessible (poisoned) +/// - 0: All bytes are accessible +/// - 1 <= k <= 7: Only the first k bytes is accessible +/// +/// ref: https://github.com/google/sanitizers/wiki/AddressSanitizerAlgorithm#mapping +ur_result_t SanitizerInterceptor::enqueueAllocInfo( + ur_context_handle_t Context, ur_device_handle_t Device, + ur_queue_handle_t Queue, std::shared_ptr &AllocInfo, + ur_event_handle_t &LastEvent) { + // Init zero + UR_CALL(enqueueMemSetShadow(Context, Device, Queue, AllocInfo->AllocBegin, + AllocInfo->AllocSize, 0, LastEvent, + &LastEvent)); + + uptr TailBegin = RoundUpTo(AllocInfo->UserEnd, ASAN_SHADOW_GRANULARITY); + uptr TailEnd = AllocInfo->AllocBegin + AllocInfo->AllocSize; + + // User tail + if (TailBegin != AllocInfo->UserEnd) { + auto Value = AllocInfo->UserEnd - + RoundDownTo(AllocInfo->UserEnd, ASAN_SHADOW_GRANULARITY); + UR_CALL(enqueueMemSetShadow(Context, Device, Queue, AllocInfo->UserEnd, + 1, static_cast(Value), LastEvent, + &LastEvent)); + } + + int ShadowByte; + switch (AllocInfo->Type) { + case USMMemoryType::HOST: + ShadowByte = kUsmHostRedzoneMagic; + break; + case USMMemoryType::DEVICE: + ShadowByte = kUsmDeviceRedzoneMagic; + break; + case USMMemoryType::SHARE: + ShadowByte = kUsmSharedRedzoneMagic; + break; + case USMMemoryType::MEM_BUFFER: + ShadowByte = 
kMemBufferRedzoneMagic; + break; + default: + ShadowByte = 0xff; + assert(false && "Unknow AllocInfo Type"); + } + + // Left red zone + UR_CALL(enqueueMemSetShadow(Context, Device, Queue, AllocInfo->AllocBegin, + AllocInfo->UserBegin - AllocInfo->AllocBegin, + ShadowByte, LastEvent, &LastEvent)); + + // Right red zone + UR_CALL(enqueueMemSetShadow(Context, Device, Queue, TailBegin, + TailEnd - TailBegin, ShadowByte, LastEvent, + &LastEvent)); + + return UR_RESULT_SUCCESS; +} + +ur_result_t SanitizerInterceptor::updateShadowMemory(ur_queue_handle_t Queue) { + auto Context = getContext(Queue); + auto Device = getDevice(Queue); + assert(Device != nullptr); + + auto ContextInfo = getContextInfo(Context); + + auto HostInfo = ContextInfo->getDeviceInfo(nullptr); + auto DeviceInfo = ContextInfo->getDeviceInfo(Device); + auto QueueInfo = ContextInfo->getQueueInfo(Queue); + + std::shared_lock HostGuard(HostInfo->Mutex, + std::defer_lock); + std::unique_lock DeviceGuard(DeviceInfo->Mutex, + std::defer_lock); + std::scoped_lock, + std::unique_lock, ur_mutex> + Guard(HostGuard, DeviceGuard, QueueInfo->Mutex); + + ur_event_handle_t LastEvent = QueueInfo->LastEvent; + + for (auto &AllocInfo : DeviceInfo->AllocInfos) { + UR_CALL(enqueueAllocInfo(Context, Device, Queue, AllocInfo, LastEvent)); + } + DeviceInfo->AllocInfos.clear(); + + QueueInfo->LastEvent = LastEvent; + + return UR_RESULT_SUCCESS; +} + +ur_result_t SanitizerInterceptor::insertContext(ur_context_handle_t Context) { + auto ContextInfo = std::make_shared(); + + std::scoped_lock Guard(m_ContextMapMutex); + assert(m_ContextMap.find(Context) == m_ContextMap.end()); + m_ContextMap.emplace(Context, std::move(ContextInfo)); + + return UR_RESULT_SUCCESS; +} + +ur_result_t SanitizerInterceptor::eraseContext(ur_context_handle_t Context) { + std::scoped_lock Guard(m_ContextMapMutex); + assert(m_ContextMap.find(Context) != m_ContextMap.end()); + m_ContextMap.erase(Context); + return UR_RESULT_SUCCESS; +} + +ur_result_t 
SanitizerInterceptor::insertDevice(ur_context_handle_t Context, + ur_device_handle_t Device) { + auto DeviceInfo = std::make_shared(); + + // Query device type + ur_device_type_t DeviceType; + UR_CALL(context.urDdiTable.Device.pfnGetInfo( + Device, UR_DEVICE_INFO_TYPE, sizeof(DeviceType), &DeviceType, nullptr)); + switch (DeviceType) { + case UR_DEVICE_TYPE_CPU: + DeviceInfo->Type = DeviceType::CPU; + break; + case UR_DEVICE_TYPE_GPU: + DeviceInfo->Type = DeviceType::GPU_PVC; + break; + default: + DeviceInfo->Type = DeviceType::UNKNOWN; + } + + // Query alignment + UR_CALL(context.urDdiTable.Device.pfnGetInfo( + Device, UR_DEVICE_INFO_MEM_BASE_ADDR_ALIGN, + sizeof(DeviceInfo->Alignment), &DeviceInfo->Alignment, nullptr)); + + // Allocate shadow memory + UR_CALL(allocShadowMemory(Context, DeviceInfo)); + + auto ContextInfo = getContextInfo(Context); + std::scoped_lock Guard(ContextInfo->Mutex); + ContextInfo->DeviceMap.emplace(Device, std::move(DeviceInfo)); + + return UR_RESULT_SUCCESS; +} + +ur_result_t SanitizerInterceptor::insertQueue(ur_context_handle_t Context, + ur_queue_handle_t Queue) { + auto QueueInfo = std::make_shared(); + QueueInfo->LastEvent = nullptr; + + auto ContextInfo = getContextInfo(Context); + std::scoped_lock Guard(ContextInfo->Mutex); + ContextInfo->QueueMap.emplace(Queue, std::move(QueueInfo)); + + return UR_RESULT_SUCCESS; +} + +ur_result_t SanitizerInterceptor::eraseQueue(ur_context_handle_t Context, + ur_queue_handle_t Queue) { + auto ContextInfo = getContextInfo(Context); + std::scoped_lock Guard(ContextInfo->Mutex); + assert(ContextInfo->QueueMap.find(Queue) != ContextInfo->QueueMap.end()); + ContextInfo->QueueMap.erase(Queue); + return UR_RESULT_SUCCESS; +} + +void SanitizerInterceptor::prepareLaunch(ur_queue_handle_t Queue, + ur_kernel_handle_t Kernel) { + auto Context = getContext(Queue); + auto Device = getDevice(Queue); + auto Program = getProgram(Kernel); + + auto ContextInfo = getContextInfo(Context); + auto DeviceInfo = 
ContextInfo->getDeviceInfo(Device); + auto QueueInfo = ContextInfo->getQueueInfo(Queue); + + std::scoped_lock Guard(QueueInfo->Mutex); + ur_event_handle_t LastEvent = QueueInfo->LastEvent; + + { + // Set global variable to program + auto EnqueueWriteGlobal = [&](const char *Name, const void *Value) { + ur_event_handle_t NewEvent{}; + uint32_t NumEvents = LastEvent ? 1 : 0; + const ur_event_handle_t *EventsList = + LastEvent ? &LastEvent : nullptr; + auto Result = + context.urDdiTable.Enqueue.pfnDeviceGlobalVariableWrite( + Queue, Program, Name, false, sizeof(uptr), 0, Value, + NumEvents, EventsList, &NewEvent); + if (Result != UR_RESULT_SUCCESS) { + context.logger.warning("Device Global[{}] Write Failed: {}", + Name, Result); + return false; + } + LastEvent = NewEvent; + return true; + }; + + // Device shadow memory offset + EnqueueWriteGlobal(kSPIR_AsanShadowMemoryGlobalStart, + &DeviceInfo->ShadowOffset); + EnqueueWriteGlobal(kSPIR_AsanShadowMemoryGlobalEnd, + &DeviceInfo->ShadowOffsetEnd); + } + + QueueInfo->LastEvent = LastEvent; +} + +} // namespace ur_sanitizer_layer diff --git a/source/loader/layers/sanitizer/asan_interceptor.hpp b/source/loader/layers/sanitizer/asan_interceptor.hpp new file mode 100644 index 0000000000..edad3f926e --- /dev/null +++ b/source/loader/layers/sanitizer/asan_interceptor.hpp @@ -0,0 +1,141 @@ +/* + * + * Copyright (C) 2023 Intel Corporation + * + * Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. 
+ * See LICENSE.TXT + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + * @file ur_sanitizer_layer.cpp + * + */ + +#pragma once + +#include "common.hpp" + +#include +#include +#include +#include + +namespace ur_sanitizer_layer { + +enum USMMemoryType { DEVICE, SHARE, HOST, MEM_BUFFER }; + +struct USMAllocInfo { + uptr AllocBegin; + uptr UserBegin; + uptr UserEnd; + size_t AllocSize; + USMMemoryType Type; +}; + +enum class DeviceType { UNKNOWN, CPU, GPU_PVC, GPU_DG2 }; + +struct DeviceInfo { + DeviceType Type; + size_t Alignment; + uptr ShadowOffset; + uptr ShadowOffsetEnd; + + // Lock InitPool & AllocInfos + ur_shared_mutex Mutex; + std::vector> AllocInfos; +}; + +struct QueueInfo { + ur_mutex Mutex; + ur_event_handle_t LastEvent; +}; + +struct ContextInfo { + + std::shared_ptr getDeviceInfo(ur_device_handle_t Device) { + std::shared_lock Guard(Mutex); + assert(DeviceMap.find(Device) != DeviceMap.end()); + return DeviceMap[Device]; + } + + std::shared_ptr getQueueInfo(ur_queue_handle_t Queue) { + std::shared_lock Guard(Mutex); + assert(QueueMap.find(Queue) != QueueMap.end()); + return QueueMap[Queue]; + } + + std::shared_ptr getUSMAllocInfo(uptr Address) { + std::shared_lock Guard(Mutex); + assert(AllocatedUSMMap.find(Address) != AllocatedUSMMap.end()); + return AllocatedUSMMap[Address]; + } + + ur_shared_mutex Mutex; + std::unordered_map> + DeviceMap; + std::unordered_map> QueueMap; + + /// key: USMAllocInfo.AllocBegin + /// value: USMAllocInfo + /// Use AllocBegin as key can help to detect underflow pointer + std::map> AllocatedUSMMap; +}; + +class SanitizerInterceptor { + public: + ur_result_t allocateMemory(ur_context_handle_t Context, + ur_device_handle_t Device, + const ur_usm_desc_t *Properties, + ur_usm_pool_handle_t Pool, size_t Size, + void **ResultPtr, USMMemoryType Type); + ur_result_t releaseMemory(ur_context_handle_t Context, void *Ptr); + + bool preLaunchKernel(ur_kernel_handle_t Kernel, ur_queue_handle_t Queue, + ur_event_handle_t 
&Event); + void postLaunchKernel(ur_kernel_handle_t Kernel, ur_queue_handle_t Queue, + ur_event_handle_t &Event); + + ur_result_t insertContext(ur_context_handle_t Context); + ur_result_t eraseContext(ur_context_handle_t Context); + + ur_result_t insertDevice(ur_context_handle_t Context, + ur_device_handle_t Device); + + ur_result_t insertQueue(ur_context_handle_t Context, + ur_queue_handle_t Queue); + ur_result_t eraseQueue(ur_context_handle_t Context, + ur_queue_handle_t Queue); + + private: + ur_result_t updateShadowMemory(ur_queue_handle_t Queue); + ur_result_t enqueueAllocInfo(ur_context_handle_t Context, + ur_device_handle_t Device, + ur_queue_handle_t Queue, + std::shared_ptr &AlloccInfo, + ur_event_handle_t &LastEvent); + + /// Initialize Global Variables & Kernel Name at first Launch + void prepareLaunch(ur_queue_handle_t Queue, ur_kernel_handle_t Kernel); + + std::string getKernelName(ur_kernel_handle_t Kernel); + ur_result_t allocShadowMemory(ur_context_handle_t Context, + std::shared_ptr &DeviceInfo); + ur_result_t enqueueMemSetShadow(ur_context_handle_t Context, + ur_device_handle_t Device, + ur_queue_handle_t Queue, uptr Addr, + uptr Size, u8 Value, + ur_event_handle_t DepEvent, + ur_event_handle_t *OutEvent); + + std::shared_ptr getContextInfo(ur_context_handle_t Context) { + std::shared_lock Guard(m_ContextMapMutex); + assert(m_ContextMap.find(Context) != m_ContextMap.end()); + return m_ContextMap[Context]; + } + + private: + std::unordered_map> + m_ContextMap; + ur_shared_mutex m_ContextMapMutex; +}; + +} // namespace ur_sanitizer_layer diff --git a/source/loader/layers/sanitizer/common.hpp b/source/loader/layers/sanitizer/common.hpp new file mode 100644 index 0000000000..8b80814b9c --- /dev/null +++ b/source/loader/layers/sanitizer/common.hpp @@ -0,0 +1,90 @@ +/* + * + * Copyright (C) 2023 Intel Corporation + * + * Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. 
+ * See LICENSE.TXT + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + * @file common.hpp + * + */ + +#pragma once + +#include "ur/ur.hpp" +#include "ur_ddi.h" + +#include +#include + +namespace ur_sanitizer_layer { + +// ================================================================ +// Copy from LLVM compiler-rt/lib/asan + +using uptr = uintptr_t; +using u8 = unsigned char; +using u32 = unsigned int; + +constexpr unsigned ASAN_SHADOW_SCALE = 3; +constexpr unsigned ASAN_SHADOW_GRANULARITY = 1ULL << ASAN_SHADOW_SCALE; + +inline constexpr bool IsPowerOfTwo(uptr x) { + return (x & (x - 1)) == 0 && x != 0; +} + +inline constexpr uptr RoundUpTo(uptr Size, uptr boundary) { + assert(IsPowerOfTwo(boundary)); + return (Size + boundary - 1) & ~(boundary - 1); +} + +inline constexpr uptr RoundDownTo(uptr x, uptr boundary) { + assert(IsPowerOfTwo(boundary)); + return x & ~(boundary - 1); +} + +inline constexpr bool IsAligned(uptr a, uptr alignment) { + return (a & (alignment - 1)) == 0; +} + +// Valid redzone sizes are 16, 32, 64, ... 2048, so we encode them in 3 bits. +// We use adaptive redzones: for larger allocation larger redzones are used. +inline constexpr uptr RZLog2Size(uptr rz_log) { + assert(rz_log < 8); + return 16 << rz_log; +} + +inline constexpr uptr ComputeRZLog(uptr user_requested_size) { + uptr rz_log = user_requested_size <= 64 - 16 ? 0 + : user_requested_size <= 128 - 32 ? 1 + : user_requested_size <= 512 - 64 ? 2 + : user_requested_size <= 4096 - 128 ? 3 + : user_requested_size <= (1 << 14) - 256 ? 4 + : user_requested_size <= (1 << 15) - 512 ? 5 + : user_requested_size <= (1 << 16) - 1024 ? 6 + : 7; + return rz_log; +} + +// ================================================================ + +// Trace an internal UR call; returns in case of an error. 
+#define UR_CALL(Call) \ + { \ + if (PrintTrace) \ + context.logger.debug("UR ---> {}", #Call); \ + ur_result_t Result = (Call); \ + if (PrintTrace) \ + context.logger.debug("UR <--- {}({})", #Call, Result); \ + if (Result != UR_RESULT_SUCCESS) \ + return Result; \ + } + +#ifndef NDEBUG +#define UR_ASSERT_EQ(Call, Result) assert(Call == Result) +#else +#define UR_ASSERT_EQ(Call, Result) (void)Call +#endif + +} // namespace ur_sanitizer_layer diff --git a/source/loader/layers/sanitizer/device_sanitizer_report.hpp b/source/loader/layers/sanitizer/device_sanitizer_report.hpp new file mode 100644 index 0000000000..11ae721434 --- /dev/null +++ b/source/loader/layers/sanitizer/device_sanitizer_report.hpp @@ -0,0 +1,95 @@ +/* + * + * Copyright (C) 2023 Intel Corporation + * + * Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. + * See LICENSE.TXT + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + * @file device_sanitizer_report.hpp + * + */ + +#pragma once + +#include + +namespace ur_sanitizer_layer { + +enum class DeviceSanitizerErrorType : int32_t { + UNKNOWN, + OUT_OF_BOUNDS, + MISALIGNED, + USE_AFTER_FREE, + OUT_OF_SHADOW_BOUNDS, +}; + +enum class DeviceSanitizerMemoryType : int32_t { + UNKNOWN, + USM_DEVICE, + USM_HOST, + USM_SHARED, + LOCAL, + PRIVATE, + MEM_BUFFER, +}; + +struct DeviceSanitizerReport { + int Flag = 0; + + char File[256 + 1] = ""; + char Func[256 + 1] = ""; + + int32_t Line = 0; + + uint64_t GID0 = 0; + uint64_t GID1 = 0; + uint64_t GID2 = 0; + + uint64_t LID0 = 0; + uint64_t LID1 = 0; + uint64_t LID2 = 0; + + bool IsWrite = false; + uint32_t AccessSize = 0; + DeviceSanitizerMemoryType MemoryType; + DeviceSanitizerErrorType ErrorType; + + bool IsRecover = false; +}; + +inline const char *DeviceSanitizerFormat(DeviceSanitizerMemoryType MemoryType) { + switch (MemoryType) { + case DeviceSanitizerMemoryType::USM_DEVICE: + return "USM Device Memory"; + case DeviceSanitizerMemoryType::USM_HOST: + 
return "USM Host Memory"; + case DeviceSanitizerMemoryType::USM_SHARED: + return "USM Shared Memory"; + case DeviceSanitizerMemoryType::LOCAL: + return "Local Memory"; + case DeviceSanitizerMemoryType::PRIVATE: + return "Private Memory"; + case DeviceSanitizerMemoryType::MEM_BUFFER: + return "Memory Buffer"; + default: + return "Unknown Memory"; + } +} + +inline const char *DeviceSanitizerFormat(DeviceSanitizerErrorType ErrorType) { + switch (ErrorType) { + case DeviceSanitizerErrorType::OUT_OF_BOUNDS: + return "out-of-bounds-access"; + case DeviceSanitizerErrorType::MISALIGNED: + return "misaligned-access"; + case DeviceSanitizerErrorType::USE_AFTER_FREE: + return "use-after-free"; + case DeviceSanitizerErrorType::OUT_OF_SHADOW_BOUNDS: + return "out-of-shadow-bounds-access"; + default: + return "unknown-error"; + } +} + +} // namespace ur_sanitizer_layer diff --git a/source/loader/layers/sanitizer/ur_sanddi.cpp b/source/loader/layers/sanitizer/ur_sanddi.cpp new file mode 100644 index 0000000000..64f54752ca --- /dev/null +++ b/source/loader/layers/sanitizer/ur_sanddi.cpp @@ -0,0 +1,485 @@ +/* + * + * Copyright (C) 2023 Intel Corporation + * + * Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. 
+ * See LICENSE.TXT + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + * @file ur_sanddi.cpp + * + */ + +#include "asan_interceptor.hpp" +#include "ur_sanitizer_layer.hpp" + +namespace ur_sanitizer_layer { + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urUSMHostAlloc +__urdlllocal ur_result_t UR_APICALL urUSMHostAlloc( + ur_context_handle_t hContext, ///< [in] handle of the context object + const ur_usm_desc_t + *pUSMDesc, ///< [in][optional] USM memory allocation descriptor + ur_usm_pool_handle_t + pool, ///< [in][optional] Pointer to a pool created using urUSMPoolCreate + size_t + size, ///< [in] size in bytes of the USM memory object to be allocated + void **ppMem ///< [out] pointer to USM host memory object +) { + auto pfnHostAlloc = context.urDdiTable.USM.pfnHostAlloc; + + if (nullptr == pfnHostAlloc) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + + return context.interceptor->allocateMemory( + hContext, nullptr, pUSMDesc, pool, size, ppMem, USMMemoryType::HOST); +} + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urUSMDeviceAlloc +__urdlllocal ur_result_t UR_APICALL urUSMDeviceAlloc( + ur_context_handle_t hContext, ///< [in] handle of the context object + ur_device_handle_t hDevice, ///< [in] handle of the device object + const ur_usm_desc_t + *pUSMDesc, ///< [in][optional] USM memory allocation descriptor + ur_usm_pool_handle_t + pool, ///< [in][optional] Pointer to a pool created using urUSMPoolCreate + size_t + size, ///< [in] size in bytes of the USM memory object to be allocated + void **ppMem ///< [out] pointer to USM device memory object +) { + auto pfnDeviceAlloc = context.urDdiTable.USM.pfnDeviceAlloc; + + if (nullptr == pfnDeviceAlloc) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + + return context.interceptor->allocateMemory( + hContext, hDevice, pUSMDesc, pool, size, ppMem, 
USMMemoryType::DEVICE); +} + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urUSMSharedAlloc +__urdlllocal ur_result_t UR_APICALL urUSMSharedAlloc( + ur_context_handle_t hContext, ///< [in] handle of the context object + ur_device_handle_t hDevice, ///< [in] handle of the device object + const ur_usm_desc_t * + pUSMDesc, ///< [in][optional] Pointer to USM memory allocation descriptor. + ur_usm_pool_handle_t + pool, ///< [in][optional] Pointer to a pool created using urUSMPoolCreate + size_t + size, ///< [in] size in bytes of the USM memory object to be allocated + void **ppMem ///< [out] pointer to USM shared memory object +) { + auto pfnSharedAlloc = context.urDdiTable.USM.pfnSharedAlloc; + + if (nullptr == pfnSharedAlloc) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + + return context.interceptor->allocateMemory( + hContext, hDevice, pUSMDesc, pool, size, ppMem, USMMemoryType::SHARE); +} + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urUSMFree +__urdlllocal ur_result_t UR_APICALL urUSMFree( + ur_context_handle_t hContext, ///< [in] handle of the context object + void *pMem ///< [in] pointer to USM memory object +) { + auto pfnFree = context.urDdiTable.USM.pfnFree; + + if (nullptr == pfnFree) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + + return context.interceptor->releaseMemory(hContext, pMem); +} + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urQueueCreate +__urdlllocal ur_result_t UR_APICALL urQueueCreate( + ur_context_handle_t hContext, ///< [in] handle of the context object + ur_device_handle_t hDevice, ///< [in] handle of the device object + const ur_queue_properties_t + *pProperties, ///< [in][optional] pointer to queue creation properties. 
+ ur_queue_handle_t + *phQueue ///< [out] pointer to handle of queue object created +) { + auto pfnCreate = context.urDdiTable.Queue.pfnCreate; + + if (nullptr == pfnCreate) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + + ur_result_t result = pfnCreate(hContext, hDevice, pProperties, phQueue); + if (result == UR_RESULT_SUCCESS) { + result = context.interceptor->insertQueue(hContext, *phQueue); + } + + return result; +} + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urQueueRelease +__urdlllocal ur_result_t UR_APICALL urQueueRelease( + ur_queue_handle_t hQueue ///< [in] handle of the queue object to release +) { + auto pfnRelease = context.urDdiTable.Queue.pfnRelease; + + if (nullptr == pfnRelease) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + + ur_context_handle_t hContext; + UR_CALL(context.urDdiTable.Queue.pfnGetInfo(hQueue, UR_QUEUE_INFO_CONTEXT, + sizeof(ur_context_handle_t), + &hContext, nullptr)); + UR_CALL(context.interceptor->eraseQueue(hContext, hQueue)); + + ur_result_t result = pfnRelease(hQueue); + + return result; +} + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urEnqueueKernelLaunch +__urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunch( + ur_queue_handle_t hQueue, ///< [in] handle of the queue object + ur_kernel_handle_t hKernel, ///< [in] handle of the kernel object + uint32_t + workDim, ///< [in] number of dimensions, from 1 to 3, to specify the global and + ///< work-group work-items + const size_t * + pGlobalWorkOffset, ///< [in] pointer to an array of workDim unsigned values that specify the + ///< offset used to calculate the global ID of a work-item + const size_t * + pGlobalWorkSize, ///< [in] pointer to an array of workDim unsigned values that specify the + ///< number of global work-items in workDim that will execute the kernel + ///< function + const size_t * + pLocalWorkSize, 
///< [in][optional] pointer to an array of workDim unsigned values that + ///< specify the number of local work-items forming a work-group that will + ///< execute the kernel function. + ///< If nullptr, the runtime implementation will choose the work-group + ///< size. + uint32_t numEventsInWaitList, ///< [in] size of the event wait list + const ur_event_handle_t * + phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the kernel execution. + ///< If nullptr, the numEventsInWaitList must be 0, indicating that no wait + ///< event. + ur_event_handle_t * + phEvent ///< [out][optional] return an event object that identifies this particular + ///< kernel execution instance. +) { + auto pfnKernelLaunch = context.urDdiTable.Enqueue.pfnKernelLaunch; + + if (nullptr == pfnKernelLaunch) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + + ur_event_handle_t hPreEvent{}; + std::vector events(numEventsInWaitList + 1); + for (unsigned i = 0; i < numEventsInWaitList; ++i) { + events.push_back(phEventWaitList[i]); + } + + // launchKernel must append to num_events_in_wait_list, not prepend + context.interceptor->preLaunchKernel(hKernel, hQueue, hPreEvent); + if (hPreEvent) { + events.push_back(hPreEvent); + } + + ur_event_handle_t hEvent{}; + ur_result_t result = pfnKernelLaunch( + hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, + pLocalWorkSize, numEventsInWaitList, phEventWaitList, &hEvent); + + if (result == UR_RESULT_SUCCESS) { + context.interceptor->postLaunchKernel(hKernel, hQueue, hEvent); + } + + if (phEvent) { + *phEvent = hEvent; + } + + return result; +} + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urContextCreate +__urdlllocal ur_result_t UR_APICALL urContextCreate( + uint32_t numDevices, ///< [in] the number of devices given in phDevices + const ur_device_handle_t + *phDevices, ///< [in][range(0, 
numDevices)] array of handle of devices. + const ur_context_properties_t * + pProperties, ///< [in][optional] pointer to context creation properties. + ur_context_handle_t + *phContext ///< [out] pointer to handle of context object created +) { + auto pfnCreate = context.urDdiTable.Context.pfnCreate; + + if (nullptr == pfnCreate) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + + ur_result_t result = + pfnCreate(numDevices, phDevices, pProperties, phContext); + + if (result == UR_RESULT_SUCCESS) { + auto Context = *phContext; + result = context.interceptor->insertContext(Context); + if (result != UR_RESULT_SUCCESS) { + return result; + } + for (uint32_t i = 0; i < numDevices; ++i) { + result = context.interceptor->insertDevice(Context, phDevices[i]); + if (result != UR_RESULT_SUCCESS) { + return result; + } + } + } + + return result; +} + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urContextCreateWithNativeHandle +__urdlllocal ur_result_t UR_APICALL urContextCreateWithNativeHandle( + ur_native_handle_t + hNativeContext, ///< [in][nocheck] the native handle of the context. + uint32_t numDevices, ///< [in] number of devices associated with the context + const ur_device_handle_t * + phDevices, ///< [in][range(0, numDevices)] list of devices associated with the context + const ur_context_native_properties_t * + pProperties, ///< [in][optional] pointer to native context properties struct + ur_context_handle_t * + phContext ///< [out] pointer to the handle of the context object created. 
+) { + auto pfnCreateWithNativeHandle = + context.urDdiTable.Context.pfnCreateWithNativeHandle; + + if (nullptr == pfnCreateWithNativeHandle) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + + ur_result_t result = pfnCreateWithNativeHandle( + hNativeContext, numDevices, phDevices, pProperties, phContext); + + if (result == UR_RESULT_SUCCESS) { + auto Context = *phContext; + result = context.interceptor->insertContext(Context); + if (result != UR_RESULT_SUCCESS) { + return result; + } + for (uint32_t i = 0; i < numDevices; ++i) { + result = context.interceptor->insertDevice(Context, phDevices[i]); + if (result != UR_RESULT_SUCCESS) { + return result; + } + } + } + + return result; +} + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urContextRelease +__urdlllocal ur_result_t UR_APICALL urContextRelease( + ur_context_handle_t hContext ///< [in] handle of the context to release. +) { + auto pfnRelease = context.urDdiTable.Context.pfnRelease; + + if (nullptr == pfnRelease) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + + UR_CALL(context.interceptor->eraseContext(hContext)); + ur_result_t result = pfnRelease(hContext); + + return result; +} + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Exported function for filling application's Context table +/// with current process' addresses +/// +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER +/// - ::UR_RESULT_ERROR_UNSUPPORTED_VERSION +__urdlllocal ur_result_t UR_APICALL urGetContextProcAddrTable( + ur_api_version_t version, ///< [in] API version requested + ur_context_dditable_t + *pDdiTable ///< [in,out] pointer to table of DDI function pointers +) { + if (nullptr == pDdiTable) { + return UR_RESULT_ERROR_INVALID_NULL_POINTER; + } + + if (UR_MAJOR_VERSION(ur_sanitizer_layer::context.version) != + UR_MAJOR_VERSION(version) || + 
UR_MINOR_VERSION(ur_sanitizer_layer::context.version) > + UR_MINOR_VERSION(version)) { + return UR_RESULT_ERROR_UNSUPPORTED_VERSION; + } + + ur_result_t result = UR_RESULT_SUCCESS; + + pDdiTable->pfnCreate = ur_sanitizer_layer::urContextCreate; + pDdiTable->pfnRelease = ur_sanitizer_layer::urContextRelease; + + pDdiTable->pfnCreateWithNativeHandle = + ur_sanitizer_layer::urContextCreateWithNativeHandle; + + return result; +} +/////////////////////////////////////////////////////////////////////////////// +/// @brief Exported function for filling application's Enqueue table +/// with current process' addresses +/// +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER +/// - ::UR_RESULT_ERROR_UNSUPPORTED_VERSION +__urdlllocal ur_result_t UR_APICALL urGetEnqueueProcAddrTable( + ur_api_version_t version, ///< [in] API version requested + ur_enqueue_dditable_t + *pDdiTable ///< [in,out] pointer to table of DDI function pointers +) { + if (nullptr == pDdiTable) { + return UR_RESULT_ERROR_INVALID_NULL_POINTER; + } + + if (UR_MAJOR_VERSION(ur_sanitizer_layer::context.version) != + UR_MAJOR_VERSION(version) || + UR_MINOR_VERSION(ur_sanitizer_layer::context.version) > + UR_MINOR_VERSION(version)) { + return UR_RESULT_ERROR_UNSUPPORTED_VERSION; + } + + ur_result_t result = UR_RESULT_SUCCESS; + + pDdiTable->pfnKernelLaunch = ur_sanitizer_layer::urEnqueueKernelLaunch; + + return result; +} +/////////////////////////////////////////////////////////////////////////////// +/// @brief Exported function for filling application's Queue table +/// with current process' addresses +/// +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER +/// - ::UR_RESULT_ERROR_UNSUPPORTED_VERSION +__urdlllocal ur_result_t UR_APICALL urGetQueueProcAddrTable( + ur_api_version_t version, ///< [in] API version requested + ur_queue_dditable_t + *pDdiTable ///< [in,out] pointer to table of DDI function pointers +) { + if (nullptr == 
pDdiTable) { + return UR_RESULT_ERROR_INVALID_NULL_POINTER; + } + + if (UR_MAJOR_VERSION(ur_sanitizer_layer::context.version) != + UR_MAJOR_VERSION(version) || + UR_MINOR_VERSION(ur_sanitizer_layer::context.version) > + UR_MINOR_VERSION(version)) { + return UR_RESULT_ERROR_UNSUPPORTED_VERSION; + } + + ur_result_t result = UR_RESULT_SUCCESS; + + pDdiTable->pfnCreate = ur_sanitizer_layer::urQueueCreate; + pDdiTable->pfnRelease = ur_sanitizer_layer::urQueueRelease; + + return result; +} +/////////////////////////////////////////////////////////////////////////////// +/// @brief Exported function for filling application's USM table +/// with current process' addresses +/// +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER +/// - ::UR_RESULT_ERROR_UNSUPPORTED_VERSION +__urdlllocal ur_result_t UR_APICALL urGetUSMProcAddrTable( + ur_api_version_t version, ///< [in] API version requested + ur_usm_dditable_t + *pDdiTable ///< [in,out] pointer to table of DDI function pointers +) { + if (nullptr == pDdiTable) { + return UR_RESULT_ERROR_INVALID_NULL_POINTER; + } + + if (UR_MAJOR_VERSION(ur_sanitizer_layer::context.version) != + UR_MAJOR_VERSION(version) || + UR_MINOR_VERSION(ur_sanitizer_layer::context.version) > + UR_MINOR_VERSION(version)) { + return UR_RESULT_ERROR_UNSUPPORTED_VERSION; + } + + ur_result_t result = UR_RESULT_SUCCESS; + + pDdiTable->pfnDeviceAlloc = ur_sanitizer_layer::urUSMDeviceAlloc; + + return result; +} + +ur_result_t context_t::init(ur_dditable_t *dditable, + const std::set &enabledLayerNames, + [[maybe_unused]] codeloc_data codelocData) { + ur_result_t result = UR_RESULT_SUCCESS; + + if (enabledLayerNames.count("UR_LAYER_ASAN")) { + context.enabledType = SanitizerType::AddressSanitizer; + } else if (enabledLayerNames.count("UR_LAYER_MSAN")) { + context.enabledType = SanitizerType::MemorySanitizer; + } else if (enabledLayerNames.count("UR_LAYER_TSAN")) { + context.enabledType = SanitizerType::ThreadSanitizer; + } + 
+ // Only support AddressSanitizer now + if (context.enabledType != SanitizerType::AddressSanitizer) { + return result; + } + + if (context.enabledType == SanitizerType::AddressSanitizer) { + if (!(dditable->VirtualMem.pfnReserve && dditable->VirtualMem.pfnMap && + dditable->VirtualMem.pfnGranularityGetInfo)) { + die("Some VirtualMem APIs are needed to enable UR_LAYER_ASAN"); + } + + if (!dditable->PhysicalMem.pfnCreate) { + die("Some PhysicalMem APIs are needed to enable UR_LAYER_ASAN"); + } + } + + urDdiTable = *dditable; + + if (UR_RESULT_SUCCESS == result) { + result = ur_sanitizer_layer::urGetContextProcAddrTable( + UR_API_VERSION_CURRENT, &dditable->Context); + } + + if (UR_RESULT_SUCCESS == result) { + result = ur_sanitizer_layer::urGetEnqueueProcAddrTable( + UR_API_VERSION_CURRENT, &dditable->Enqueue); + } + + if (UR_RESULT_SUCCESS == result) { + result = ur_sanitizer_layer::urGetQueueProcAddrTable( + UR_API_VERSION_CURRENT, &dditable->Queue); + } + + if (UR_RESULT_SUCCESS == result) { + result = ur_sanitizer_layer::urGetUSMProcAddrTable( + UR_API_VERSION_CURRENT, &dditable->USM); + } + + return result; +} +} // namespace ur_sanitizer_layer diff --git a/source/loader/layers/sanitizer/ur_sanitizer_layer.cpp b/source/loader/layers/sanitizer/ur_sanitizer_layer.cpp new file mode 100644 index 0000000000..0df123b6c2 --- /dev/null +++ b/source/loader/layers/sanitizer/ur_sanitizer_layer.cpp @@ -0,0 +1,30 @@ +/* + * + * Copyright (C) 2023 Intel Corporation + * + * Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. 
+ * See LICENSE.TXT
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ * @file ur_sanitizer_layer.cpp
+ *
+ */
+
+#include "ur_sanitizer_layer.hpp"
+#include "asan_interceptor.hpp"
+
+namespace ur_sanitizer_layer {
+context_t context;
+
+///////////////////////////////////////////////////////////////////////////////
+context_t::context_t()
+    : interceptor(std::make_unique<SanitizerInterceptor>()),
+      logger(logger::create_logger("sanitizer")) {}
+
+bool context_t::isAvailable() const { return true; }
+
+ur_result_t context_t::tearDown() { return UR_RESULT_SUCCESS; }
+
+///////////////////////////////////////////////////////////////////////////////
+context_t::~context_t() {}
+} // namespace ur_sanitizer_layer
diff --git a/source/loader/layers/sanitizer/ur_sanitizer_layer.hpp b/source/loader/layers/sanitizer/ur_sanitizer_layer.hpp
new file mode 100644
index 0000000000..018d9f4a80
--- /dev/null
+++ b/source/loader/layers/sanitizer/ur_sanitizer_layer.hpp
@@ -0,0 +1,55 @@
+/*
+ *
+ * Copyright (C) 2023 Intel Corporation
+ *
+ * Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. 
+ * See LICENSE.TXT
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ * @file ur_sanitizer_layer.hpp
+ *
+ */
+
+#pragma once
+
+#include "logger/ur_logger.hpp"
+#include "ur_proxy_layer.hpp"
+
+#define SANITIZER_COMP_NAME "sanitizer layer"
+
+namespace ur_sanitizer_layer {
+
+class SanitizerInterceptor;
+
+enum class SanitizerType {
+    None,
+    AddressSanitizer,
+    MemorySanitizer,
+    ThreadSanitizer,
+};
+
+///////////////////////////////////////////////////////////////////////////////
+class __urdlllocal context_t : public proxy_layer_context_t {
+  public:
+    ur_dditable_t urDdiTable = {};
+    std::unique_ptr<SanitizerInterceptor> interceptor;
+    logger::Logger logger;
+    SanitizerType enabledType = SanitizerType::None;
+
+    context_t();
+    ~context_t();
+
+    bool isAvailable() const override;
+
+    std::vector<std::string> getNames() const override {
+        return {"UR_LAYER_ASAN", "UR_LAYER_MSAN", "UR_LAYER_TSAN"};
+    }
+    ur_result_t init(ur_dditable_t *dditable,
+                     const std::set<std::string> &enabledLayerNames,
+                     codeloc_data codelocData) override;
+
+    ur_result_t tearDown() override;
+};
+
+extern context_t context;
+} // namespace ur_sanitizer_layer
diff --git a/source/loader/ur_lib.hpp b/source/loader/ur_lib.hpp
index 9d1e02a67e..41ab7cb52e 100644
--- a/source/loader/ur_lib.hpp
+++ b/source/loader/ur_lib.hpp
@@ -23,6 +23,9 @@
 #if UR_ENABLE_TRACING
 #include "tracing/ur_tracing_layer.hpp"
 #endif
+#if UR_ENABLE_SANITIZER
+#include "sanitizer/ur_sanitizer_layer.hpp"
+#endif
 #include 
 #include 
@@ -69,7 +72,10 @@ class __urdlllocal context_t {
     const std::vector layers = {
         &ur_validation_layer::context,
 #if UR_ENABLE_TRACING
-    &ur_tracing_layer::context
+    &ur_tracing_layer::context,
+#endif
+#if UR_ENABLE_SANITIZER
+    &ur_sanitizer_layer::context
 #endif
     };
     std::string availableLayers;