diff --git a/.github/scripts/get_system_info.sh b/.github/scripts/get_system_info.sh index 6ca38a33ef..8301c99099 100755 --- a/.github/scripts/get_system_info.sh +++ b/.github/scripts/get_system_info.sh @@ -53,7 +53,7 @@ function system_info { echo "**********/proc/meminfo**********" cat /proc/meminfo echo "**********build/bin/urinfo**********" - $(dirname "$(readlink -f "$0")")/../../build/bin/urinfo || true + $(dirname "$(readlink -f "$0")")/../../build/bin/urinfo --no-linear-ids --verbose || true echo "******OpenCL*******" # The driver version of OpenCL Graphics is the compute-runtime version clinfo || echo "OpenCL not installed" diff --git a/CMakeLists.txt b/CMakeLists.txt index f25c8c14ac..7dfc5c017b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -3,7 +3,7 @@ # See LICENSE.TXT # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -cmake_minimum_required(VERSION 3.14.0 FATAL_ERROR) +cmake_minimum_required(VERSION 3.20.0 FATAL_ERROR) project(unified-runtime VERSION 0.9.0) include(GNUInstallDirs) diff --git a/README.md b/README.md index db5292a9c3..d7caf11c2b 100644 --- a/README.md +++ b/README.md @@ -13,7 +13,6 @@ ## Table of contents - [Unified Runtime](#unified-runtime) - - [Adapters](#adapters) - [Table of contents](#table-of-contents) - [Contents of the repo](#contents-of-the-repo) - [Integration](#integration) @@ -29,7 +28,7 @@ - [Adapter naming convention](#adapter-naming-convention) - [Source code generation](#source-code-generation) - [Documentation](#documentation) -6. [Release Process](#release-process) + - [Release Process](#release-process) ## Contents of the repo @@ -88,7 +87,7 @@ for more detailed instructions on the correct setup. 
Required packages: - C++ compiler with C++17 support -- [CMake](https://cmake.org/) >= 3.14.0 +- [CMake](https://cmake.org/) >= 3.20.0 - Python v3.6.6 or later ### Windows @@ -141,6 +140,7 @@ List of options provided by CMake: | UR_HIP_PLATFORM | Build HIP adapter for AMD or NVIDIA platform | AMD/NVIDIA | AMD | | UR_ENABLE_COMGR | Enable comgr lib usage | AMD/NVIDIA | AMD | | UR_DPCXX | Path of the DPC++ compiler executable to build CTS device binaries | File path | `""` | +| UR_DEVICE_CODE_EXTRACTOR | Path of the `clang-offload-extract` executable from the DPC++ package, required for CTS device binaries | File path | `"${dirname(UR_DPCXX)}/clang-offload-extract"` | | UR_DPCXX_BUILD_FLAGS | Build flags to pass to DPC++ when compiling device programs | Space-separated options list | `""` | | UR_SYCL_LIBRARY_DIR | Path of the SYCL runtime library directory to build CTS device binaries | Directory path | `""` | | UR_HIP_ROCM_DIR | Path of the default ROCm HIP installation | Directory path | `/opt/rocm` | diff --git a/include/ur_api.h b/include/ur_api.h index 93dd26cd50..3c6dcd75b4 100644 --- a/include/ur_api.h +++ b/include/ur_api.h @@ -222,6 +222,7 @@ typedef enum ur_function_t { UR_FUNCTION_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_EXP = 220, ///< Enumerator for ::urCommandBufferUpdateKernelLaunchExp UR_FUNCTION_COMMAND_BUFFER_GET_INFO_EXP = 221, ///< Enumerator for ::urCommandBufferGetInfoExp UR_FUNCTION_COMMAND_BUFFER_COMMAND_GET_INFO_EXP = 222, ///< Enumerator for ::urCommandBufferCommandGetInfoExp + UR_FUNCTION_ENQUEUE_TIMESTAMP_RECORDING_EXP = 223, ///< Enumerator for ::urEnqueueTimestampRecordingExp /// @cond UR_FUNCTION_FORCE_UINT32 = 0x7fffffff /// @endcond @@ -1641,6 +1642,7 @@ typedef enum ur_device_info_t { ///< backed 3D sampled image data. UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_3D_EXP = 0x2017, ///< [::ur_bool_t] returns true if the device is capable of fetching ///< non-USM backed 3D sampled image data. 
+ UR_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP = 0x2018, ///< [::ur_bool_t] returns true if the device supports timestamp recording /// @cond UR_DEVICE_INFO_FORCE_UINT32 = 0x7fffffff /// @endcond @@ -1666,7 +1668,7 @@ typedef enum ur_device_info_t { /// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE /// + `NULL == hDevice` /// - ::UR_RESULT_ERROR_INVALID_ENUMERATION -/// + `::UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_3D_EXP < propName` +/// + `::UR_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP < propName` /// - ::UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION /// + If `propName` is not supported by the adapter. /// - ::UR_RESULT_ERROR_INVALID_SIZE @@ -5618,6 +5620,7 @@ typedef enum ur_command_t { UR_COMMAND_COMMAND_BUFFER_ENQUEUE_EXP = 0x1000, ///< Event created by ::urCommandBufferEnqueueExp UR_COMMAND_INTEROP_SEMAPHORE_WAIT_EXP = 0x2000, ///< Event created by ::urBindlessImagesWaitExternalSemaphoreExp UR_COMMAND_INTEROP_SEMAPHORE_SIGNAL_EXP = 0x2001, ///< Event created by ::urBindlessImagesSignalExternalSemaphoreExp + UR_COMMAND_TIMESTAMP_RECORDING_EXP = 0x2002, ///< Event created by ::urEnqueueTimestampRecordingExp /// @cond UR_COMMAND_FORCE_UINT32 = 0x7fffffff /// @endcond @@ -8890,6 +8893,46 @@ urKernelSuggestMaxCooperativeGroupCountExp( uint32_t *pGroupCountRet ///< [out] pointer to maximum number of groups ); +#if !defined(__GNUC__) +#pragma endregion +#endif +// Intel 'oneAPI' Unified Runtime Experimental APIs for enqueuing timestamp recordings +#if !defined(__GNUC__) +#pragma region enqueue timestamp recording(experimental) +#endif +/////////////////////////////////////////////////////////////////////////////// +/// @brief Enqueue a command for recording the device timestamp +/// +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_UNINITIALIZED +/// - ::UR_RESULT_ERROR_DEVICE_LOST +/// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC +/// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE +/// + `NULL == hQueue` +/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER +/// + `NULL == 
phEvent` +/// - ::UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST +UR_APIEXPORT ur_result_t UR_APICALL +urEnqueueTimestampRecordingExp( + ur_queue_handle_t hQueue, ///< [in] handle of the queue object + bool blocking, ///< [in] indicates whether the call to this function should block until + ///< the device timestamp recording command has executed on the + ///< device. + uint32_t numEventsInWaitList, ///< [in] size of the event wait list + const ur_event_handle_t *phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the kernel execution. + ///< If nullptr, the numEventsInWaitList must be 0, indicating no wait + ///< events. + ur_event_handle_t *phEvent ///< [in,out] return an event object that identifies this particular kernel + ///< execution instance. Profiling information can be queried + ///< from this event as if `hQueue` had profiling enabled. Querying + ///< `UR_PROFILING_INFO_COMMAND_QUEUED` or `UR_PROFILING_INFO_COMMAND_SUBMIT` + ///< reports the timestamp at the time of the call to this function. + ///< Querying `UR_PROFILING_INFO_COMMAND_START` or `UR_PROFILING_INFO_COMMAND_END` + ///< reports the timestamp recorded when the command is executed on the device. 
+); + #if !defined(__GNUC__) #pragma endregion #endif @@ -10600,6 +10643,18 @@ typedef struct ur_enqueue_cooperative_kernel_launch_exp_params_t { ur_event_handle_t **pphEvent; } ur_enqueue_cooperative_kernel_launch_exp_params_t; +/////////////////////////////////////////////////////////////////////////////// +/// @brief Function parameters for urEnqueueTimestampRecordingExp +/// @details Each entry is a pointer to the parameter passed to the function; +/// allowing the callback the ability to modify the parameter's value +typedef struct ur_enqueue_timestamp_recording_exp_params_t { + ur_queue_handle_t *phQueue; + bool *pblocking; + uint32_t *pnumEventsInWaitList; + const ur_event_handle_t **pphEventWaitList; + ur_event_handle_t **pphEvent; +} ur_enqueue_timestamp_recording_exp_params_t; + /////////////////////////////////////////////////////////////////////////////// /// @brief Function parameters for urBindlessImagesUnsampledImageHandleDestroyExp /// @details Each entry is a pointer to the parameter passed to the function; diff --git a/include/ur_ddi.h b/include/ur_ddi.h index 5a069f0881..dcd8915fa2 100644 --- a/include/ur_ddi.h +++ b/include/ur_ddi.h @@ -1448,10 +1448,20 @@ typedef ur_result_t(UR_APICALL *ur_pfnEnqueueCooperativeKernelLaunchExp_t)( const ur_event_handle_t *, ur_event_handle_t *); +/////////////////////////////////////////////////////////////////////////////// +/// @brief Function-pointer for urEnqueueTimestampRecordingExp +typedef ur_result_t(UR_APICALL *ur_pfnEnqueueTimestampRecordingExp_t)( + ur_queue_handle_t, + bool, + uint32_t, + const ur_event_handle_t *, + ur_event_handle_t *); + /////////////////////////////////////////////////////////////////////////////// /// @brief Table of EnqueueExp functions pointers typedef struct ur_enqueue_exp_dditable_t { ur_pfnEnqueueCooperativeKernelLaunchExp_t pfnCooperativeKernelLaunchExp; + ur_pfnEnqueueTimestampRecordingExp_t pfnTimestampRecordingExp; } ur_enqueue_exp_dditable_t; 
/////////////////////////////////////////////////////////////////////////////// diff --git a/include/ur_print.h b/include/ur_print.h index 3126c1714b..3377980ce7 100644 --- a/include/ur_print.h +++ b/include/ur_print.h @@ -1954,6 +1954,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urPrintEnqueueWriteHostPipeParams(const stru /// - `buff_size < out_size` UR_APIEXPORT ur_result_t UR_APICALL urPrintEnqueueCooperativeKernelLaunchExpParams(const struct ur_enqueue_cooperative_kernel_launch_exp_params_t *params, char *buffer, const size_t buff_size, size_t *out_size); +/////////////////////////////////////////////////////////////////////////////// +/// @brief Print ur_enqueue_timestamp_recording_exp_params_t struct +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_INVALID_SIZE +/// - `buff_size < out_size` +UR_APIEXPORT ur_result_t UR_APICALL urPrintEnqueueTimestampRecordingExpParams(const struct ur_enqueue_timestamp_recording_exp_params_t *params, char *buffer, const size_t buff_size, size_t *out_size); + /////////////////////////////////////////////////////////////////////////////// /// @brief Print ur_bindless_images_unsampled_image_handle_destroy_exp_params_t struct /// @returns diff --git a/include/ur_print.hpp b/include/ur_print.hpp index 3d48ae9a35..5472e4b35f 100644 --- a/include/ur_print.hpp +++ b/include/ur_print.hpp @@ -916,6 +916,9 @@ inline std::ostream &operator<<(std::ostream &os, enum ur_function_t value) { case UR_FUNCTION_COMMAND_BUFFER_COMMAND_GET_INFO_EXP: os << "UR_FUNCTION_COMMAND_BUFFER_COMMAND_GET_INFO_EXP"; break; + case UR_FUNCTION_ENQUEUE_TIMESTAMP_RECORDING_EXP: + os << "UR_FUNCTION_ENQUEUE_TIMESTAMP_RECORDING_EXP"; + break; default: os << "unknown enumerator"; break; @@ -2571,6 +2574,9 @@ inline std::ostream &operator<<(std::ostream &os, enum ur_device_info_t value) { case UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_3D_EXP: os << "UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_3D_EXP"; break; + case 
UR_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP: + os << "UR_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP"; + break; default: os << "unknown enumerator"; break; @@ -4280,6 +4286,18 @@ inline ur_result_t printTagged(std::ostream &os, const void *ptr, ur_device_info os << ")"; } break; + case UR_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP: { + const ur_bool_t *tptr = (const ur_bool_t *)ptr; + if (sizeof(ur_bool_t) > size) { + os << "invalid size (is: " << size << ", expected: >=" << sizeof(ur_bool_t) << ")"; + return UR_RESULT_ERROR_INVALID_SIZE; + } + os << (const void *)(tptr) << " ("; + + os << *tptr; + + os << ")"; + } break; default: os << "unknown enumerator"; return UR_RESULT_ERROR_INVALID_ENUMERATION; @@ -8788,6 +8806,9 @@ inline std::ostream &operator<<(std::ostream &os, enum ur_command_t value) { case UR_COMMAND_INTEROP_SEMAPHORE_SIGNAL_EXP: os << "UR_COMMAND_INTEROP_SEMAPHORE_SIGNAL_EXP"; break; + case UR_COMMAND_TIMESTAMP_RECORDING_EXP: + os << "UR_COMMAND_TIMESTAMP_RECORDING_EXP"; + break; default: os << "unknown enumerator"; break; @@ -14104,6 +14125,48 @@ inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct return os; } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Print operator for the ur_enqueue_timestamp_recording_exp_params_t type +/// @returns +/// std::ostream & +inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct ur_enqueue_timestamp_recording_exp_params_t *params) { + + os << ".hQueue = "; + + ur::details::printPtr(os, + *(params->phQueue)); + + os << ", "; + os << ".blocking = "; + + os << *(params->pblocking); + + os << ", "; + os << ".numEventsInWaitList = "; + + os << *(params->pnumEventsInWaitList); + + os << ", "; + os << ".phEventWaitList = {"; + for (size_t i = 0; *(params->pphEventWaitList) != NULL && i < *params->pnumEventsInWaitList; ++i) { + if (i != 0) { + os << ", "; + } + + ur::details::printPtr(os, + 
(*(params->pphEventWaitList))[i]); + } + os << "}"; + + os << ", "; + os << ".phEvent = "; + + ur::details::printPtr(os, + *(params->pphEvent)); + + return os; +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Print operator for the ur_bindless_images_unsampled_image_handle_destroy_exp_params_t type /// @returns @@ -17126,6 +17189,9 @@ inline ur_result_t UR_APICALL printFunctionParams(std::ostream &os, ur_function_ case UR_FUNCTION_ENQUEUE_COOPERATIVE_KERNEL_LAUNCH_EXP: { os << (const struct ur_enqueue_cooperative_kernel_launch_exp_params_t *)params; } break; + case UR_FUNCTION_ENQUEUE_TIMESTAMP_RECORDING_EXP: { + os << (const struct ur_enqueue_timestamp_recording_exp_params_t *)params; + } break; case UR_FUNCTION_BINDLESS_IMAGES_UNSAMPLED_IMAGE_HANDLE_DESTROY_EXP: { os << (const struct ur_bindless_images_unsampled_image_handle_destroy_exp_params_t *)params; } break; diff --git a/scripts/core/EXP-ENQUEUE-TIMESTAMP-RECORDING.rst b/scripts/core/EXP-ENQUEUE-TIMESTAMP-RECORDING.rst new file mode 100644 index 0000000000..3cf46095b2 --- /dev/null +++ b/scripts/core/EXP-ENQUEUE-TIMESTAMP-RECORDING.rst @@ -0,0 +1,70 @@ +<% + OneApi=tags['$OneApi'] + x=tags['$x'] + X=x.upper() +%> + +.. _experimental-enqueue-timestamp-recording: + +================================================================================ +Enqueue Timestamp Recording +================================================================================ + +.. warning:: + + Experimental features: + + * May be replaced, updated, or removed at any time. + * Do not require maintaining API/ABI stability of their own additions over + time. + * Do not require conformance testing of their own additions. 
+ + +Motivation +-------------------------------------------------------------------------------- +Currently, the only way to get timestamp information is through enabling +profiling on a queue and retrieving the information from events coming from +commands submitted to it. However, not all systems give full control of the +queue construction to the programmer wanting the profiling information. To amend +this, this extension adds the ability to enqueue a timestamp recording on any +queue, with or without profiling enabled. This event can in turn be queried for +the usual profiling information. + + +API +-------------------------------------------------------------------------------- + +Enums +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +* ${x}_device_info_t + * ${X}_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP + +* ${x}_command_t + * ${X}_COMMAND_TIMESTAMP_RECORDING_EXP + +Functions +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +* ${x}EnqueueTimestampRecordingExp + +Changelog +-------------------------------------------------------------------------------- + ++-----------+------------------------+ +| Revision | Changes | ++===========+========================+ +| 1.0 | Initial Draft | ++-----------+------------------------+ + + +Support +-------------------------------------------------------------------------------- + +Adapters which support this experimental feature *must* return true for the new +`${X}_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP` device info query. 
+ + +Contributors +-------------------------------------------------------------------------------- + +* Steffen Larsen `steffen.larsen@intel.com <steffen.larsen@intel.com>`_ diff --git a/scripts/core/exp-enqueue-timestamp-recording.yml b/scripts/core/exp-enqueue-timestamp-recording.yml new file mode 100644 index 0000000000..18316f734e --- /dev/null +++ b/scripts/core/exp-enqueue-timestamp-recording.yml @@ -0,0 +1,66 @@ +# +# Copyright (C) 2024 Intel Corporation +# +# Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. +# See LICENSE.TXT +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# See YaML.md for syntax definition +# +--- #-------------------------------------------------------------------------- +type: header +desc: "Intel $OneApi Unified Runtime Experimental APIs for enqueuing timestamp recordings" +ordinal: "99" +--- #-------------------------------------------------------------------------- +type: enum +extend: true
+typed_etors: true +desc: "Extension enums to $x_device_info_t to support timestamp recordings." +name: $x_device_info_t +etors: + - name: TIMESTAMP_RECORDING_SUPPORT_EXP + value: "0x2018" + desc: "[$x_bool_t] returns true if the device supports timestamp recording" +--- #-------------------------------------------------------------------------- +type: enum +extend: true +desc: "Command Type experimental enumerations." 
+name: $x_command_t +etors: + - name: TIMESTAMP_RECORDING_EXP + value: "0x2002" + desc: Event created by $xEnqueueTimestampRecordingExp +--- #-------------------------------------------------------------------------- +type: function +desc: "Enqueue a command for recording the device timestamp" +class: $xEnqueue +name: TimestampRecordingExp +params: + - type: $x_queue_handle_t + name: hQueue + desc: "[in] handle of the queue object" + - type: bool + name: blocking + desc: | + [in] indicates whether the call to this function should block until + the device timestamp recording command has executed on the + device. + - type: uint32_t + name: numEventsInWaitList + desc: "[in] size of the event wait list" + - type: "const $x_event_handle_t*" + name: phEventWaitList + desc: | + [in][optional][range(0, numEventsInWaitList)] pointer to a list of events that must be complete before the kernel execution. + If nullptr, the numEventsInWaitList must be 0, indicating no wait events. + - type: $x_event_handle_t* + name: phEvent + desc: | + [in,out] return an event object that identifies this particular kernel execution instance. Profiling information can be queried + from this event as if `hQueue` had profiling enabled. Querying `UR_PROFILING_INFO_COMMAND_QUEUED` or `UR_PROFILING_INFO_COMMAND_SUBMIT` + reports the timestamp at the time of the call to this function. Querying `UR_PROFILING_INFO_COMMAND_START` or `UR_PROFILING_INFO_COMMAND_END` + reports the timestamp recorded when the command is executed on the device. 
+returns: + - $X_RESULT_ERROR_INVALID_NULL_HANDLE + - $X_RESULT_ERROR_INVALID_NULL_POINTER + - $X_RESULT_ERROR_INVALID_EVENT_WAIT_LIST diff --git a/scripts/core/registry.yml b/scripts/core/registry.yml index 5ae70f7bd1..6a551d5821 100644 --- a/scripts/core/registry.yml +++ b/scripts/core/registry.yml @@ -580,6 +580,9 @@ etors: - name: COMMAND_BUFFER_COMMAND_GET_INFO_EXP desc: Enumerator for $xCommandBufferCommandGetInfoExp value: '222' +- name: ENQUEUE_TIMESTAMP_RECORDING_EXP + desc: Enumerator for $xEnqueueTimestampRecordingExp + value: '223' --- type: enum desc: Defines structure types diff --git a/source/adapters/cuda/command_buffer.hpp b/source/adapters/cuda/command_buffer.hpp index 84a9e0405b..d83269f2ae 100644 --- a/source/adapters/cuda/command_buffer.hpp +++ b/source/adapters/cuda/command_buffer.hpp @@ -10,155 +10,13 @@ #include #include +#include #include "context.hpp" #include "logger/ur_logger.hpp" #include #include -static inline const char *getUrResultString(ur_result_t Result) { - switch (Result) { - case UR_RESULT_SUCCESS: - return "UR_RESULT_SUCCESS"; - case UR_RESULT_ERROR_INVALID_OPERATION: - return "UR_RESULT_ERROR_INVALID_OPERATION"; - case UR_RESULT_ERROR_INVALID_QUEUE_PROPERTIES: - return "UR_RESULT_ERROR_INVALID_QUEUE_PROPERTIES"; - case UR_RESULT_ERROR_INVALID_QUEUE: - return "UR_RESULT_ERROR_INVALID_QUEUE"; - case UR_RESULT_ERROR_INVALID_VALUE: - return "UR_RESULT_ERROR_INVALID_VALUE"; - case UR_RESULT_ERROR_INVALID_CONTEXT: - return "UR_RESULT_ERROR_INVALID_CONTEXT"; - case UR_RESULT_ERROR_INVALID_PLATFORM: - return "UR_RESULT_ERROR_INVALID_PLATFORM"; - case UR_RESULT_ERROR_INVALID_BINARY: - return "UR_RESULT_ERROR_INVALID_BINARY"; - case UR_RESULT_ERROR_INVALID_PROGRAM: - return "UR_RESULT_ERROR_INVALID_PROGRAM"; - case UR_RESULT_ERROR_INVALID_SAMPLER: - return "UR_RESULT_ERROR_INVALID_SAMPLER"; - case UR_RESULT_ERROR_INVALID_BUFFER_SIZE: - return "UR_RESULT_ERROR_INVALID_BUFFER_SIZE"; - case UR_RESULT_ERROR_INVALID_MEM_OBJECT: - return 
"UR_RESULT_ERROR_INVALID_MEM_OBJECT"; - case UR_RESULT_ERROR_INVALID_EVENT: - return "UR_RESULT_ERROR_INVALID_EVENT"; - case UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST: - return "UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST"; - case UR_RESULT_ERROR_MISALIGNED_SUB_BUFFER_OFFSET: - return "UR_RESULT_ERROR_MISALIGNED_SUB_BUFFER_OFFSET"; - case UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE: - return "UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE"; - case UR_RESULT_ERROR_COMPILER_NOT_AVAILABLE: - return "UR_RESULT_ERROR_COMPILER_NOT_AVAILABLE"; - case UR_RESULT_ERROR_PROFILING_INFO_NOT_AVAILABLE: - return "UR_RESULT_ERROR_PROFILING_INFO_NOT_AVAILABLE"; - case UR_RESULT_ERROR_DEVICE_NOT_FOUND: - return "UR_RESULT_ERROR_DEVICE_NOT_FOUND"; - case UR_RESULT_ERROR_INVALID_DEVICE: - return "UR_RESULT_ERROR_INVALID_DEVICE"; - case UR_RESULT_ERROR_DEVICE_LOST: - return "UR_RESULT_ERROR_DEVICE_LOST"; - case UR_RESULT_ERROR_DEVICE_REQUIRES_RESET: - return "UR_RESULT_ERROR_DEVICE_REQUIRES_RESET"; - case UR_RESULT_ERROR_DEVICE_IN_LOW_POWER_STATE: - return "UR_RESULT_ERROR_DEVICE_IN_LOW_POWER_STATE"; - case UR_RESULT_ERROR_DEVICE_PARTITION_FAILED: - return "UR_RESULT_ERROR_DEVICE_PARTITION_FAILED"; - case UR_RESULT_ERROR_INVALID_DEVICE_PARTITION_COUNT: - return "UR_RESULT_ERROR_INVALID_DEVICE_PARTITION_COUNT"; - case UR_RESULT_ERROR_INVALID_WORK_ITEM_SIZE: - return "UR_RESULT_ERROR_INVALID_WORK_ITEM_SIZE"; - case UR_RESULT_ERROR_INVALID_WORK_DIMENSION: - return "UR_RESULT_ERROR_INVALID_WORK_DIMENSION"; - case UR_RESULT_ERROR_INVALID_KERNEL_ARGS: - return "UR_RESULT_ERROR_INVALID_KERNEL_ARGS"; - case UR_RESULT_ERROR_INVALID_KERNEL: - return "UR_RESULT_ERROR_INVALID_KERNEL"; - case UR_RESULT_ERROR_INVALID_KERNEL_NAME: - return "UR_RESULT_ERROR_INVALID_KERNEL_NAME"; - case UR_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_INDEX: - return "UR_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_INDEX"; - case UR_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_SIZE: - return "UR_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_SIZE"; - case 
UR_RESULT_ERROR_INVALID_KERNEL_ATTRIBUTE_VALUE: - return "UR_RESULT_ERROR_INVALID_KERNEL_ATTRIBUTE_VALUE"; - case UR_RESULT_ERROR_INVALID_IMAGE_SIZE: - return "UR_RESULT_ERROR_INVALID_IMAGE_SIZE"; - case UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR: - return "UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR"; - case UR_RESULT_ERROR_IMAGE_FORMAT_NOT_SUPPORTED: - return "UR_RESULT_ERROR_IMAGE_FORMAT_NOT_SUPPORTED"; - case UR_RESULT_ERROR_MEM_OBJECT_ALLOCATION_FAILURE: - return "UR_RESULT_ERROR_MEM_OBJECT_ALLOCATION_FAILURE"; - case UR_RESULT_ERROR_INVALID_PROGRAM_EXECUTABLE: - return "UR_RESULT_ERROR_INVALID_PROGRAM_EXECUTABLE"; - case UR_RESULT_ERROR_UNINITIALIZED: - return "UR_RESULT_ERROR_UNINITIALIZED"; - case UR_RESULT_ERROR_OUT_OF_HOST_MEMORY: - return "UR_RESULT_ERROR_OUT_OF_HOST_MEMORY"; - case UR_RESULT_ERROR_OUT_OF_DEVICE_MEMORY: - return "UR_RESULT_ERROR_OUT_OF_DEVICE_MEMORY"; - case UR_RESULT_ERROR_OUT_OF_RESOURCES: - return "UR_RESULT_ERROR_OUT_OF_RESOURCES"; - case UR_RESULT_ERROR_PROGRAM_BUILD_FAILURE: - return "UR_RESULT_ERROR_PROGRAM_BUILD_FAILURE"; - case UR_RESULT_ERROR_PROGRAM_LINK_FAILURE: - return "UR_RESULT_ERROR_PROGRAM_LINK_FAILURE"; - case UR_RESULT_ERROR_UNSUPPORTED_VERSION: - return "UR_RESULT_ERROR_UNSUPPORTED_VERSION"; - case UR_RESULT_ERROR_UNSUPPORTED_FEATURE: - return "UR_RESULT_ERROR_UNSUPPORTED_FEATURE"; - case UR_RESULT_ERROR_INVALID_ARGUMENT: - return "UR_RESULT_ERROR_INVALID_ARGUMENT"; - case UR_RESULT_ERROR_INVALID_NULL_HANDLE: - return "UR_RESULT_ERROR_INVALID_NULL_HANDLE"; - case UR_RESULT_ERROR_HANDLE_OBJECT_IN_USE: - return "UR_RESULT_ERROR_HANDLE_OBJECT_IN_USE"; - case UR_RESULT_ERROR_INVALID_NULL_POINTER: - return "UR_RESULT_ERROR_INVALID_NULL_POINTER"; - case UR_RESULT_ERROR_INVALID_SIZE: - return "UR_RESULT_ERROR_INVALID_SIZE"; - case UR_RESULT_ERROR_UNSUPPORTED_SIZE: - return "UR_RESULT_ERROR_UNSUPPORTED_SIZE"; - case UR_RESULT_ERROR_UNSUPPORTED_ALIGNMENT: - return "UR_RESULT_ERROR_UNSUPPORTED_ALIGNMENT"; - case 
UR_RESULT_ERROR_INVALID_SYNCHRONIZATION_OBJECT: - return "UR_RESULT_ERROR_INVALID_SYNCHRONIZATION_OBJECT"; - case UR_RESULT_ERROR_INVALID_ENUMERATION: - return "UR_RESULT_ERROR_INVALID_ENUMERATION"; - case UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION: - return "UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION"; - case UR_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT: - return "UR_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT"; - case UR_RESULT_ERROR_INVALID_NATIVE_BINARY: - return "UR_RESULT_ERROR_INVALID_NATIVE_BINARY"; - case UR_RESULT_ERROR_INVALID_GLOBAL_NAME: - return "UR_RESULT_ERROR_INVALID_GLOBAL_NAME"; - case UR_RESULT_ERROR_INVALID_FUNCTION_NAME: - return "UR_RESULT_ERROR_INVALID_FUNCTION_NAME"; - case UR_RESULT_ERROR_INVALID_GROUP_SIZE_DIMENSION: - return "UR_RESULT_ERROR_INVALID_GROUP_SIZE_DIMENSION"; - case UR_RESULT_ERROR_INVALID_GLOBAL_WIDTH_DIMENSION: - return "UR_RESULT_ERROR_INVALID_GLOBAL_WIDTH_DIMENSION"; - case UR_RESULT_ERROR_PROGRAM_UNLINKED: - return "UR_RESULT_ERROR_PROGRAM_UNLINKED"; - case UR_RESULT_ERROR_OVERLAPPING_REGIONS: - return "UR_RESULT_ERROR_OVERLAPPING_REGIONS"; - case UR_RESULT_ERROR_INVALID_HOST_PTR: - return "UR_RESULT_ERROR_INVALID_HOST_PTR"; - case UR_RESULT_ERROR_INVALID_USM_SIZE: - return "UR_RESULT_ERROR_INVALID_USM_SIZE"; - case UR_RESULT_ERROR_OBJECT_ALLOCATION_FAILURE: - return "UR_RESULT_ERROR_OBJECT_ALLOCATION_FAILURE"; - case UR_RESULT_ERROR_ADAPTER_SPECIFIC: - return "UR_RESULT_ERROR_ADAPTER_SPECIFIC"; - default: - return "UR_RESULT_ERROR_UNKNOWN"; - } -} - // Trace an internal UR call #define UR_TRACE(Call) \ { \ @@ -173,7 +31,7 @@ static inline const char *getUrResultString(ur_result_t Result) { logger::always("UR ---> {}", #Call); \ Result = (Call); \ if (PrintTrace) \ - logger::always("UR <--- {}({})", #Call, getUrResultString(Result)); \ + logger::always("UR <--- {}({})", #Call, Result); \ } // Handle to a kernel command. 
diff --git a/source/adapters/cuda/device.cpp b/source/adapters/cuda/device.cpp index cca00c0b85..949b58666e 100644 --- a/source/adapters/cuda/device.cpp +++ b/source/adapters/cuda/device.cpp @@ -950,6 +950,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, // CUDA does support fetching 3D non-USM sampled image data. return ReturnValue(true); } + case UR_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP: { + // CUDA supports recording timestamp events. + return ReturnValue(true); + } case UR_DEVICE_INFO_DEVICE_ID: { int Value = 0; UR_CHECK_ERROR(cuDeviceGetAttribute( diff --git a/source/adapters/cuda/enqueue.cpp b/source/adapters/cuda/enqueue.cpp index 6f99941095..812a41768e 100644 --- a/source/adapters/cuda/enqueue.cpp +++ b/source/adapters/cuda/enqueue.cpp @@ -1709,3 +1709,33 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueWriteHostPipe( return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueTimestampRecordingExp( + ur_queue_handle_t hQueue, bool blocking, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + + ur_result_t Result = UR_RESULT_SUCCESS; + std::unique_ptr RetImplEvent{nullptr}; + try { + ScopedContext Active(hQueue->getContext()); + CUstream CuStream = hQueue->getNextComputeStream(); + + UR_CHECK_ERROR(enqueueEventsWait(hQueue, CuStream, numEventsInWaitList, + phEventWaitList)); + + RetImplEvent = + std::unique_ptr(ur_event_handle_t_::makeNative( + UR_COMMAND_TIMESTAMP_RECORDING_EXP, hQueue, CuStream)); + UR_CHECK_ERROR(RetImplEvent->start()); + UR_CHECK_ERROR(RetImplEvent->record()); + + if (blocking) { + UR_CHECK_ERROR(cuStreamSynchronize(CuStream)); + } + + *phEvent = RetImplEvent.release(); + } catch (ur_result_t Err) { + Result = Err; + } + return Result; +} diff --git a/source/adapters/cuda/event.cpp b/source/adapters/cuda/event.cpp index ac66bf479e..1e8f2dd384 100644 --- a/source/adapters/cuda/event.cpp +++ 
b/source/adapters/cuda/event.cpp @@ -54,7 +54,7 @@ ur_result_t ur_event_handle_t_::start() { ur_result_t Result = UR_RESULT_SUCCESS; try { - if (Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE) { + if (Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE || isTimestampEvent()) { // NOTE: This relies on the default stream to be unused. UR_CHECK_ERROR(cuEventRecord(EvQueued, 0)); UR_CHECK_ERROR(cuEventRecord(EvStart, Stream)); @@ -149,7 +149,7 @@ ur_result_t ur_event_handle_t_::release() { UR_CHECK_ERROR(cuEventDestroy(EvEnd)); - if (Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE) { + if (Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE || isTimestampEvent()) { UR_CHECK_ERROR(cuEventDestroy(EvQueued)); UR_CHECK_ERROR(cuEventDestroy(EvStart)); } @@ -190,7 +190,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventGetProfilingInfo( UrReturnHelper ReturnValue(propValueSize, pPropValue, pPropValueSizeRet); ur_queue_handle_t Queue = hEvent->getQueue(); - if (Queue == nullptr || !(Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE)) { + if (Queue == nullptr || (!(Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE) && + !hEvent->isTimestampEvent())) { return UR_RESULT_ERROR_PROFILING_INFO_NOT_AVAILABLE; } diff --git a/source/adapters/cuda/event.hpp b/source/adapters/cuda/event.hpp index 390fd7833a..5ed68f0f25 100644 --- a/source/adapters/cuda/event.hpp +++ b/source/adapters/cuda/event.hpp @@ -57,6 +57,10 @@ struct ur_event_handle_t_ { return UR_EVENT_STATUS_COMPLETE; } + bool isTimestampEvent() const noexcept { + return getCommandType() == UR_COMMAND_TIMESTAMP_RECORDING_EXP; + } + ur_context_handle_t getContext() const noexcept { return Context; }; uint32_t incrementReferenceCount() { return ++RefCount; } @@ -83,13 +87,14 @@ struct ur_event_handle_t_ { static ur_event_handle_t makeNative(ur_command_t Type, ur_queue_handle_t Queue, CUstream Stream, uint32_t StreamToken = std::numeric_limits::max()) { - const bool ProfilingEnabled = - Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE; + const 
bool RequiresTimings = + Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE || + Type == UR_COMMAND_TIMESTAMP_RECORDING_EXP; native_type EvEnd = nullptr, EvQueued = nullptr, EvStart = nullptr; UR_CHECK_ERROR(cuEventCreate( - &EvEnd, ProfilingEnabled ? CU_EVENT_DEFAULT : CU_EVENT_DISABLE_TIMING)); + &EvEnd, RequiresTimings ? CU_EVENT_DEFAULT : CU_EVENT_DISABLE_TIMING)); - if (ProfilingEnabled) { + if (RequiresTimings) { UR_CHECK_ERROR(cuEventCreate(&EvQueued, CU_EVENT_DEFAULT)); UR_CHECK_ERROR(cuEventCreate(&EvStart, CU_EVENT_DEFAULT)); } diff --git a/source/adapters/cuda/ur_interface_loader.cpp b/source/adapters/cuda/ur_interface_loader.cpp index 2ffc0755ee..80fd211863 100644 --- a/source/adapters/cuda/ur_interface_loader.cpp +++ b/source/adapters/cuda/ur_interface_loader.cpp @@ -407,6 +407,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable( pDdiTable->pfnCooperativeKernelLaunchExp = urEnqueueCooperativeKernelLaunchExp; + pDdiTable->pfnTimestampRecordingExp = urEnqueueTimestampRecordingExp; return UR_RESULT_SUCCESS; } diff --git a/source/adapters/cuda/usm.cpp b/source/adapters/cuda/usm.cpp index 59d3ba70fd..6faa1e5320 100644 --- a/source/adapters/cuda/usm.cpp +++ b/source/adapters/cuda/usm.cpp @@ -33,7 +33,7 @@ urUSMHostAlloc(ur_context_handle_t hContext, const ur_usm_desc_t *pUSMDesc, UR_RESULT_ERROR_INVALID_VALUE); if (!hPool) { - return USMHostAllocImpl(ppMem, hContext, nullptr, size, alignment); + return USMHostAllocImpl(ppMem, hContext, /* flags */ 0, size, alignment); } auto UMFPool = hPool->HostMemPool.get(); @@ -57,7 +57,7 @@ urUSMDeviceAlloc(ur_context_handle_t hContext, ur_device_handle_t hDevice, UR_RESULT_ERROR_INVALID_VALUE); if (!hPool) { - return USMDeviceAllocImpl(ppMem, hContext, hDevice, nullptr, size, + return USMDeviceAllocImpl(ppMem, hContext, hDevice, /* flags */ 0, size, alignment); } @@ -82,8 +82,8 @@ urUSMSharedAlloc(ur_context_handle_t hContext, ur_device_handle_t hDevice, UR_RESULT_ERROR_INVALID_VALUE); if (!hPool) { - 
return USMSharedAllocImpl(ppMem, hContext, hDevice, nullptr, nullptr, size, - alignment); + return USMSharedAllocImpl(ppMem, hContext, hDevice, /*host flags*/ 0, + /*device flags*/ 0, size, alignment); } auto UMFPool = hPool->SharedMemPool.get(); @@ -132,7 +132,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMFree(ur_context_handle_t hContext, } ur_result_t USMDeviceAllocImpl(void **ResultPtr, ur_context_handle_t Context, - ur_device_handle_t, ur_usm_device_mem_flags_t *, + ur_device_handle_t, ur_usm_device_mem_flags_t, size_t Size, uint32_t Alignment) { try { ScopedContext Active(Context); @@ -151,8 +151,8 @@ ur_result_t USMDeviceAllocImpl(void **ResultPtr, ur_context_handle_t Context, } ur_result_t USMSharedAllocImpl(void **ResultPtr, ur_context_handle_t Context, - ur_device_handle_t, ur_usm_host_mem_flags_t *, - ur_usm_device_mem_flags_t *, size_t Size, + ur_device_handle_t, ur_usm_host_mem_flags_t, + ur_usm_device_mem_flags_t, size_t Size, uint32_t Alignment) { try { ScopedContext Active(Context); @@ -172,7 +172,7 @@ ur_result_t USMSharedAllocImpl(void **ResultPtr, ur_context_handle_t Context, } ur_result_t USMHostAllocImpl(void **ResultPtr, ur_context_handle_t Context, - ur_usm_host_mem_flags_t *, size_t Size, + ur_usm_host_mem_flags_t, size_t Size, uint32_t Alignment) { try { ScopedContext Active(Context); @@ -358,19 +358,19 @@ umf_result_t USMMemoryProvider::get_min_page_size(void *Ptr, size_t *PageSize) { ur_result_t USMSharedMemoryProvider::allocateImpl(void **ResultPtr, size_t Size, uint32_t Alignment) { - return USMSharedAllocImpl(ResultPtr, Context, Device, nullptr, nullptr, Size, - Alignment); + return USMSharedAllocImpl(ResultPtr, Context, Device, /*host flags*/ 0, + /*device flags*/ 0, Size, Alignment); } ur_result_t USMDeviceMemoryProvider::allocateImpl(void **ResultPtr, size_t Size, uint32_t Alignment) { - return USMDeviceAllocImpl(ResultPtr, Context, Device, nullptr, Size, + return USMDeviceAllocImpl(ResultPtr, Context, Device, /* flags */ 0, Size, 
Alignment); } ur_result_t USMHostMemoryProvider::allocateImpl(void **ResultPtr, size_t Size, uint32_t Alignment) { - return USMHostAllocImpl(ResultPtr, Context, nullptr, Size, Alignment); + return USMHostAllocImpl(ResultPtr, Context, /* flags */ 0, Size, Alignment); } ur_usm_pool_handle_t_::ur_usm_pool_handle_t_(ur_context_handle_t Context, diff --git a/source/adapters/cuda/usm.hpp b/source/adapters/cuda/usm.hpp index 2ec3df150f..e5d1f7fbaa 100644 --- a/source/adapters/cuda/usm.hpp +++ b/source/adapters/cuda/usm.hpp @@ -118,15 +118,15 @@ class USMHostMemoryProvider final : public USMMemoryProvider { ur_result_t USMDeviceAllocImpl(void **ResultPtr, ur_context_handle_t Context, ur_device_handle_t Device, - ur_usm_device_mem_flags_t *Flags, size_t Size, + ur_usm_device_mem_flags_t Flags, size_t Size, uint32_t Alignment); ur_result_t USMSharedAllocImpl(void **ResultPtr, ur_context_handle_t Context, ur_device_handle_t Device, - ur_usm_host_mem_flags_t *, - ur_usm_device_mem_flags_t *, size_t Size, + ur_usm_host_mem_flags_t, + ur_usm_device_mem_flags_t, size_t Size, uint32_t Alignment); ur_result_t USMHostAllocImpl(void **ResultPtr, ur_context_handle_t Context, - ur_usm_host_mem_flags_t *Flags, size_t Size, + ur_usm_host_mem_flags_t Flags, size_t Size, uint32_t Alignment); diff --git a/source/adapters/hip/command_buffer.hpp b/source/adapters/hip/command_buffer.hpp index 50fddc5448..751fde3720 100644 --- a/source/adapters/hip/command_buffer.hpp +++ b/source/adapters/hip/command_buffer.hpp @@ -10,154 +10,12 @@ #include #include +#include #include "context.hpp" #include #include -static inline const char *getUrResultString(ur_result_t Result) { - switch (Result) { - case UR_RESULT_SUCCESS: - return "UR_RESULT_SUCCESS"; - case UR_RESULT_ERROR_INVALID_OPERATION: - return "UR_RESULT_ERROR_INVALID_OPERATION"; - case UR_RESULT_ERROR_INVALID_QUEUE_PROPERTIES: - return "UR_RESULT_ERROR_INVALID_QUEUE_PROPERTIES"; - case UR_RESULT_ERROR_INVALID_QUEUE: - return 
"UR_RESULT_ERROR_INVALID_QUEUE"; - case UR_RESULT_ERROR_INVALID_VALUE: - return "UR_RESULT_ERROR_INVALID_VALUE"; - case UR_RESULT_ERROR_INVALID_CONTEXT: - return "UR_RESULT_ERROR_INVALID_CONTEXT"; - case UR_RESULT_ERROR_INVALID_PLATFORM: - return "UR_RESULT_ERROR_INVALID_PLATFORM"; - case UR_RESULT_ERROR_INVALID_BINARY: - return "UR_RESULT_ERROR_INVALID_BINARY"; - case UR_RESULT_ERROR_INVALID_PROGRAM: - return "UR_RESULT_ERROR_INVALID_PROGRAM"; - case UR_RESULT_ERROR_INVALID_SAMPLER: - return "UR_RESULT_ERROR_INVALID_SAMPLER"; - case UR_RESULT_ERROR_INVALID_BUFFER_SIZE: - return "UR_RESULT_ERROR_INVALID_BUFFER_SIZE"; - case UR_RESULT_ERROR_INVALID_MEM_OBJECT: - return "UR_RESULT_ERROR_INVALID_MEM_OBJECT"; - case UR_RESULT_ERROR_INVALID_EVENT: - return "UR_RESULT_ERROR_INVALID_EVENT"; - case UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST: - return "UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST"; - case UR_RESULT_ERROR_MISALIGNED_SUB_BUFFER_OFFSET: - return "UR_RESULT_ERROR_MISALIGNED_SUB_BUFFER_OFFSET"; - case UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE: - return "UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE"; - case UR_RESULT_ERROR_COMPILER_NOT_AVAILABLE: - return "UR_RESULT_ERROR_COMPILER_NOT_AVAILABLE"; - case UR_RESULT_ERROR_PROFILING_INFO_NOT_AVAILABLE: - return "UR_RESULT_ERROR_PROFILING_INFO_NOT_AVAILABLE"; - case UR_RESULT_ERROR_DEVICE_NOT_FOUND: - return "UR_RESULT_ERROR_DEVICE_NOT_FOUND"; - case UR_RESULT_ERROR_INVALID_DEVICE: - return "UR_RESULT_ERROR_INVALID_DEVICE"; - case UR_RESULT_ERROR_DEVICE_LOST: - return "UR_RESULT_ERROR_DEVICE_LOST"; - case UR_RESULT_ERROR_DEVICE_REQUIRES_RESET: - return "UR_RESULT_ERROR_DEVICE_REQUIRES_RESET"; - case UR_RESULT_ERROR_DEVICE_IN_LOW_POWER_STATE: - return "UR_RESULT_ERROR_DEVICE_IN_LOW_POWER_STATE"; - case UR_RESULT_ERROR_DEVICE_PARTITION_FAILED: - return "UR_RESULT_ERROR_DEVICE_PARTITION_FAILED"; - case UR_RESULT_ERROR_INVALID_DEVICE_PARTITION_COUNT: - return "UR_RESULT_ERROR_INVALID_DEVICE_PARTITION_COUNT"; - case 
UR_RESULT_ERROR_INVALID_WORK_ITEM_SIZE: - return "UR_RESULT_ERROR_INVALID_WORK_ITEM_SIZE"; - case UR_RESULT_ERROR_INVALID_WORK_DIMENSION: - return "UR_RESULT_ERROR_INVALID_WORK_DIMENSION"; - case UR_RESULT_ERROR_INVALID_KERNEL_ARGS: - return "UR_RESULT_ERROR_INVALID_KERNEL_ARGS"; - case UR_RESULT_ERROR_INVALID_KERNEL: - return "UR_RESULT_ERROR_INVALID_KERNEL"; - case UR_RESULT_ERROR_INVALID_KERNEL_NAME: - return "UR_RESULT_ERROR_INVALID_KERNEL_NAME"; - case UR_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_INDEX: - return "UR_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_INDEX"; - case UR_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_SIZE: - return "UR_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_SIZE"; - case UR_RESULT_ERROR_INVALID_KERNEL_ATTRIBUTE_VALUE: - return "UR_RESULT_ERROR_INVALID_KERNEL_ATTRIBUTE_VALUE"; - case UR_RESULT_ERROR_INVALID_IMAGE_SIZE: - return "UR_RESULT_ERROR_INVALID_IMAGE_SIZE"; - case UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR: - return "UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR"; - case UR_RESULT_ERROR_IMAGE_FORMAT_NOT_SUPPORTED: - return "UR_RESULT_ERROR_IMAGE_FORMAT_NOT_SUPPORTED"; - case UR_RESULT_ERROR_MEM_OBJECT_ALLOCATION_FAILURE: - return "UR_RESULT_ERROR_MEM_OBJECT_ALLOCATION_FAILURE"; - case UR_RESULT_ERROR_INVALID_PROGRAM_EXECUTABLE: - return "UR_RESULT_ERROR_INVALID_PROGRAM_EXECUTABLE"; - case UR_RESULT_ERROR_UNINITIALIZED: - return "UR_RESULT_ERROR_UNINITIALIZED"; - case UR_RESULT_ERROR_OUT_OF_HOST_MEMORY: - return "UR_RESULT_ERROR_OUT_OF_HOST_MEMORY"; - case UR_RESULT_ERROR_OUT_OF_DEVICE_MEMORY: - return "UR_RESULT_ERROR_OUT_OF_DEVICE_MEMORY"; - case UR_RESULT_ERROR_OUT_OF_RESOURCES: - return "UR_RESULT_ERROR_OUT_OF_RESOURCES"; - case UR_RESULT_ERROR_PROGRAM_BUILD_FAILURE: - return "UR_RESULT_ERROR_PROGRAM_BUILD_FAILURE"; - case UR_RESULT_ERROR_PROGRAM_LINK_FAILURE: - return "UR_RESULT_ERROR_PROGRAM_LINK_FAILURE"; - case UR_RESULT_ERROR_UNSUPPORTED_VERSION: - return "UR_RESULT_ERROR_UNSUPPORTED_VERSION"; - case UR_RESULT_ERROR_UNSUPPORTED_FEATURE: - 
return "UR_RESULT_ERROR_UNSUPPORTED_FEATURE"; - case UR_RESULT_ERROR_INVALID_ARGUMENT: - return "UR_RESULT_ERROR_INVALID_ARGUMENT"; - case UR_RESULT_ERROR_INVALID_NULL_HANDLE: - return "UR_RESULT_ERROR_INVALID_NULL_HANDLE"; - case UR_RESULT_ERROR_HANDLE_OBJECT_IN_USE: - return "UR_RESULT_ERROR_HANDLE_OBJECT_IN_USE"; - case UR_RESULT_ERROR_INVALID_NULL_POINTER: - return "UR_RESULT_ERROR_INVALID_NULL_POINTER"; - case UR_RESULT_ERROR_INVALID_SIZE: - return "UR_RESULT_ERROR_INVALID_SIZE"; - case UR_RESULT_ERROR_UNSUPPORTED_SIZE: - return "UR_RESULT_ERROR_UNSUPPORTED_SIZE"; - case UR_RESULT_ERROR_UNSUPPORTED_ALIGNMENT: - return "UR_RESULT_ERROR_UNSUPPORTED_ALIGNMENT"; - case UR_RESULT_ERROR_INVALID_SYNCHRONIZATION_OBJECT: - return "UR_RESULT_ERROR_INVALID_SYNCHRONIZATION_OBJECT"; - case UR_RESULT_ERROR_INVALID_ENUMERATION: - return "UR_RESULT_ERROR_INVALID_ENUMERATION"; - case UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION: - return "UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION"; - case UR_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT: - return "UR_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT"; - case UR_RESULT_ERROR_INVALID_NATIVE_BINARY: - return "UR_RESULT_ERROR_INVALID_NATIVE_BINARY"; - case UR_RESULT_ERROR_INVALID_GLOBAL_NAME: - return "UR_RESULT_ERROR_INVALID_GLOBAL_NAME"; - case UR_RESULT_ERROR_INVALID_FUNCTION_NAME: - return "UR_RESULT_ERROR_INVALID_FUNCTION_NAME"; - case UR_RESULT_ERROR_INVALID_GROUP_SIZE_DIMENSION: - return "UR_RESULT_ERROR_INVALID_GROUP_SIZE_DIMENSION"; - case UR_RESULT_ERROR_INVALID_GLOBAL_WIDTH_DIMENSION: - return "UR_RESULT_ERROR_INVALID_GLOBAL_WIDTH_DIMENSION"; - case UR_RESULT_ERROR_PROGRAM_UNLINKED: - return "UR_RESULT_ERROR_PROGRAM_UNLINKED"; - case UR_RESULT_ERROR_OVERLAPPING_REGIONS: - return "UR_RESULT_ERROR_OVERLAPPING_REGIONS"; - case UR_RESULT_ERROR_INVALID_HOST_PTR: - return "UR_RESULT_ERROR_INVALID_HOST_PTR"; - case UR_RESULT_ERROR_INVALID_USM_SIZE: - return "UR_RESULT_ERROR_INVALID_USM_SIZE"; - case UR_RESULT_ERROR_OBJECT_ALLOCATION_FAILURE: - return 
"UR_RESULT_ERROR_OBJECT_ALLOCATION_FAILURE"; - case UR_RESULT_ERROR_ADAPTER_SPECIFIC: - return "UR_RESULT_ERROR_ADAPTER_SPECIFIC"; - default: - return "UR_RESULT_ERROR_UNKNOWN"; - } -} - // Trace an internal UR call #define UR_TRACE(Call) \ { \ @@ -169,10 +27,10 @@ static inline const char *getUrResultString(ur_result_t Result) { #define UR_CALL(Call, Result) \ { \ if (PrintTrace) \ - fprintf(stderr, "UR ---> %s\n", #Call); \ + std::cerr << "UR ---> " << #Call << "\n"; \ Result = (Call); \ if (PrintTrace) \ - fprintf(stderr, "UR <--- %s(%s)\n", #Call, getUrResultString(Result)); \ + std::cerr << "UR <--- " << #Call << "(" << Result << ")\n"; \ } // Handle to a kernel command. diff --git a/source/adapters/hip/device.cpp b/source/adapters/hip/device.cpp index ae4dbe159e..b30ae7e7f3 100644 --- a/source/adapters/hip/device.cpp +++ b/source/adapters/hip/device.cpp @@ -841,6 +841,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, case UR_DEVICE_INFO_COMPOSITE_DEVICE: // These two are exclusive of L0. return ReturnValue(0); + case UR_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP: + return ReturnValue(true); // TODO: Investigate if this information is available on HIP. case UR_DEVICE_INFO_GPU_EU_COUNT: diff --git a/source/adapters/hip/enqueue.cpp b/source/adapters/hip/enqueue.cpp index 79522d4c93..e6e3dd73fa 100644 --- a/source/adapters/hip/enqueue.cpp +++ b/source/adapters/hip/enqueue.cpp @@ -1967,3 +1967,36 @@ void setCopyRectParams(ur_rect_region_t Region, const void *SrcPtr, : (DstType == hipMemoryTypeDevice ? 
hipMemcpyHostToDevice : hipMemcpyHostToHost)); } + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueTimestampRecordingExp( + ur_queue_handle_t hQueue, bool blocking, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + + ur_result_t Result = UR_RESULT_SUCCESS; + std::unique_ptr RetImplEvent{nullptr}; + try { + ScopedContext Active(hQueue->getDevice()); + + uint32_t StreamToken; + ur_stream_quard Guard; + hipStream_t HIPStream = hQueue->getNextComputeStream( + numEventsInWaitList, phEventWaitList, Guard, &StreamToken); + UR_CHECK_ERROR(enqueueEventsWait(hQueue, HIPStream, numEventsInWaitList, + phEventWaitList)); + + RetImplEvent = + std::unique_ptr(ur_event_handle_t_::makeNative( + UR_COMMAND_TIMESTAMP_RECORDING_EXP, hQueue, HIPStream)); + UR_CHECK_ERROR(RetImplEvent->start()); + UR_CHECK_ERROR(RetImplEvent->record()); + + if (blocking) { + UR_CHECK_ERROR(hipStreamSynchronize(HIPStream)); + } + + *phEvent = RetImplEvent.release(); + } catch (ur_result_t Err) { + Result = Err; + } + return Result; +} diff --git a/source/adapters/hip/event.cpp b/source/adapters/hip/event.cpp index 7478c0e778..5327c43a3b 100644 --- a/source/adapters/hip/event.cpp +++ b/source/adapters/hip/event.cpp @@ -22,7 +22,8 @@ ur_event_handle_t_::ur_event_handle_t_(ur_command_t Type, StreamToken{StreamToken}, EventId{0}, EvEnd{nullptr}, EvStart{nullptr}, EvQueued{nullptr}, Queue{Queue}, Stream{Stream}, Context{Context} { - bool ProfilingEnabled = Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE; + bool ProfilingEnabled = + Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE || isTimestampEvent(); UR_CHECK_ERROR(hipEventCreateWithFlags( &EvEnd, ProfilingEnabled ? 
hipEventDefault : hipEventDisableTiming)); @@ -58,7 +59,7 @@ ur_result_t ur_event_handle_t_::start() { ur_result_t Result = UR_RESULT_SUCCESS; try { - if (Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE) { + if (Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE || isTimestampEvent()) { // NOTE: This relies on the default stream to be unused. UR_CHECK_ERROR(hipEventRecord(EvQueued, 0)); UR_CHECK_ERROR(hipEventRecord(EvStart, Queue->get())); @@ -177,7 +178,7 @@ ur_result_t ur_event_handle_t_::release() { assert(Queue != nullptr); UR_CHECK_ERROR(hipEventDestroy(EvEnd)); - if (Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE) { + if (Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE || isTimestampEvent()) { UR_CHECK_ERROR(hipEventDestroy(EvQueued)); UR_CHECK_ERROR(hipEventDestroy(EvStart)); } @@ -244,7 +245,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventGetProfilingInfo( UR_ASSERT(!(pPropValue && propValueSize == 0), UR_RESULT_ERROR_INVALID_VALUE); ur_queue_handle_t Queue = hEvent->getQueue(); - if (Queue == nullptr || !(Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE)) { + if (Queue == nullptr || (!(Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE) && + !hEvent->isTimestampEvent())) { return UR_RESULT_ERROR_PROFILING_INFO_NOT_AVAILABLE; } diff --git a/source/adapters/hip/event.hpp b/source/adapters/hip/event.hpp index 50de73b14f..64e8b2d9c8 100644 --- a/source/adapters/hip/event.hpp +++ b/source/adapters/hip/event.hpp @@ -55,6 +55,10 @@ struct ur_event_handle_t_ { return UR_EVENT_STATUS_COMPLETE; } + bool isTimestampEvent() const noexcept { + return getCommandType() == UR_COMMAND_TIMESTAMP_RECORDING_EXP; + } + ur_context_handle_t getContext() const noexcept { return Context; }; uint32_t incrementReferenceCount() { return ++RefCount; } diff --git a/source/adapters/hip/ur_interface_loader.cpp b/source/adapters/hip/ur_interface_loader.cpp index 1cf9431fcb..71979b75b1 100644 --- a/source/adapters/hip/ur_interface_loader.cpp +++ b/source/adapters/hip/ur_interface_loader.cpp @@ 
-377,6 +377,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable( pDdiTable->pfnCooperativeKernelLaunchExp = urEnqueueCooperativeKernelLaunchExp; + pDdiTable->pfnTimestampRecordingExp = urEnqueueTimestampRecordingExp; return UR_RESULT_SUCCESS; } diff --git a/source/adapters/hip/usm.cpp b/source/adapters/hip/usm.cpp index 4068c1d865..e871f394f2 100644 --- a/source/adapters/hip/usm.cpp +++ b/source/adapters/hip/usm.cpp @@ -27,7 +27,7 @@ urUSMHostAlloc(ur_context_handle_t hContext, const ur_usm_desc_t *pUSMDesc, UR_RESULT_ERROR_INVALID_VALUE); if (!hPool) { - return USMHostAllocImpl(ppMem, hContext, nullptr, size, alignment); + return USMHostAllocImpl(ppMem, hContext, /* flags */ 0, size, alignment); } return umfPoolMallocHelper(hPool, ppMem, size, alignment); @@ -43,7 +43,7 @@ urUSMDeviceAlloc(ur_context_handle_t hContext, ur_device_handle_t hDevice, UR_RESULT_ERROR_INVALID_VALUE); if (!hPool) { - return USMDeviceAllocImpl(ppMem, hContext, hDevice, nullptr, size, + return USMDeviceAllocImpl(ppMem, hContext, hDevice, /* flags */ 0, size, alignment); } @@ -60,8 +60,8 @@ urUSMSharedAlloc(ur_context_handle_t hContext, ur_device_handle_t hDevice, UR_RESULT_ERROR_INVALID_VALUE); if (!hPool) { - return USMSharedAllocImpl(ppMem, hContext, hDevice, nullptr, nullptr, size, - alignment); + return USMSharedAllocImpl(ppMem, hContext, hDevice, /*host flags*/ 0, + /*device flags*/ 0, size, alignment); } return umfPoolMallocHelper(hPool, ppMem, size, alignment); @@ -105,7 +105,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMFree(ur_context_handle_t hContext, ur_result_t USMDeviceAllocImpl(void **ResultPtr, ur_context_handle_t, ur_device_handle_t Device, - ur_usm_device_mem_flags_t *, size_t Size, + ur_usm_device_mem_flags_t, size_t Size, [[maybe_unused]] uint32_t Alignment) { try { ScopedContext Active(Device); @@ -120,8 +120,8 @@ ur_result_t USMDeviceAllocImpl(void **ResultPtr, ur_context_handle_t, ur_result_t USMSharedAllocImpl(void **ResultPtr, ur_context_handle_t, 
ur_device_handle_t Device, - ur_usm_host_mem_flags_t *, - ur_usm_device_mem_flags_t *, size_t Size, + ur_usm_host_mem_flags_t, + ur_usm_device_mem_flags_t, size_t Size, [[maybe_unused]] uint32_t Alignment) { try { ScopedContext Active(Device); @@ -136,7 +136,7 @@ ur_result_t USMSharedAllocImpl(void **ResultPtr, ur_context_handle_t, ur_result_t USMHostAllocImpl(void **ResultPtr, [[maybe_unused]] ur_context_handle_t Context, - ur_usm_host_mem_flags_t *, size_t Size, + ur_usm_host_mem_flags_t, size_t Size, [[maybe_unused]] uint32_t Alignment) { try { UR_CHECK_ERROR(hipHostMalloc(ResultPtr, Size)); @@ -309,19 +309,19 @@ umf_result_t USMMemoryProvider::get_min_page_size(void *Ptr, size_t *PageSize) { ur_result_t USMSharedMemoryProvider::allocateImpl(void **ResultPtr, size_t Size, uint32_t Alignment) { - return USMSharedAllocImpl(ResultPtr, Context, Device, nullptr, nullptr, Size, - Alignment); + return USMSharedAllocImpl(ResultPtr, Context, Device, /*host flags*/ 0, + /*device flags*/ 0, Size, Alignment); } ur_result_t USMDeviceMemoryProvider::allocateImpl(void **ResultPtr, size_t Size, uint32_t Alignment) { - return USMDeviceAllocImpl(ResultPtr, Context, Device, nullptr, Size, + return USMDeviceAllocImpl(ResultPtr, Context, Device, /* flags */ 0, Size, Alignment); } ur_result_t USMHostMemoryProvider::allocateImpl(void **ResultPtr, size_t Size, uint32_t Alignment) { - return USMHostAllocImpl(ResultPtr, Context, nullptr, Size, Alignment); + return USMHostAllocImpl(ResultPtr, Context, /* flags */ 0, Size, Alignment); } ur_usm_pool_handle_t_::ur_usm_pool_handle_t_(ur_context_handle_t Context, diff --git a/source/adapters/hip/usm.hpp b/source/adapters/hip/usm.hpp index d02145584f..a1c3964263 100644 --- a/source/adapters/hip/usm.hpp +++ b/source/adapters/hip/usm.hpp @@ -118,17 +118,17 @@ class USMHostMemoryProvider final : public USMMemoryProvider { ur_result_t USMDeviceAllocImpl(void **ResultPtr, ur_context_handle_t Context, ur_device_handle_t Device, - 
ur_usm_device_mem_flags_t *Flags, size_t Size, + ur_usm_device_mem_flags_t Flags, size_t Size, uint32_t Alignment); ur_result_t USMSharedAllocImpl(void **ResultPtr, ur_context_handle_t Context, ur_device_handle_t Device, - ur_usm_host_mem_flags_t *, - ur_usm_device_mem_flags_t *, size_t Size, + ur_usm_host_mem_flags_t, + ur_usm_device_mem_flags_t, size_t Size, uint32_t Alignment); ur_result_t USMHostAllocImpl(void **ResultPtr, ur_context_handle_t Context, - ur_usm_host_mem_flags_t *Flags, size_t Size, + ur_usm_host_mem_flags_t Flags, size_t Size, uint32_t Alignment); bool checkUSMAlignment(uint32_t &alignment, const ur_usm_desc_t *pUSMDesc); diff --git a/source/adapters/level_zero/common.cpp b/source/adapters/level_zero/common.cpp index 7ae7272355..926d5f4ba4 100644 --- a/source/adapters/level_zero/common.cpp +++ b/source/adapters/level_zero/common.cpp @@ -88,7 +88,11 @@ bool setEnvVar(const char *name, const char *value) { ZeUSMImportExtension ZeUSMImport; // This will count the calls to Level-Zero +// TODO: remove the ifdef once +// https://github.com/oneapi-src/unified-runtime/issues/1454 is implemented +#ifndef UR_L0_CALL_COUNT_IN_TESTS std::map *ZeCallCount = nullptr; +#endif inline void zeParseError(ze_result_t ZeError, const char *&ErrorString) { switch (ZeError) { diff --git a/source/adapters/level_zero/context.cpp b/source/adapters/level_zero/context.cpp index 19696142f5..85fe582ec3 100644 --- a/source/adapters/level_zero/context.cpp +++ b/source/adapters/level_zero/context.cpp @@ -679,6 +679,11 @@ ur_result_t ur_context_handle_t_::getAvailableCommandList( if (Queue->hasOpenCommandList(UseCopyEngine)) { if (AllowBatching) { bool batchingAllowed = true; + if (ForcedCmdQueue && + CommandBatch.OpenCommandList->second.ZeQueue != *ForcedCmdQueue) { + // Current open batch doesn't match the forced command queue + batchingAllowed = false; + } if (!UrL0OutOfOrderIntegratedSignalEvent && Queue->Device->isIntegrated()) { batchingAllowed = eventCanBeBatched(Queue, 
UseCopyEngine, diff --git a/source/adapters/level_zero/device.cpp b/source/adapters/level_zero/device.cpp index 7f9e9b499b..9ed752d46c 100644 --- a/source/adapters/level_zero/device.cpp +++ b/source/adapters/level_zero/device.cpp @@ -188,8 +188,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( } case UR_DEVICE_INFO_ATOMIC_64: return ReturnValue( - static_cast(Device->ZeDeviceModuleProperties->flags & - ZE_DEVICE_MODULE_FLAG_INT64_ATOMICS)); + static_cast(Device->ZeDeviceModuleProperties->flags & + ZE_DEVICE_MODULE_FLAG_INT64_ATOMICS)); case UR_DEVICE_INFO_EXTENSIONS: { // Convention adopted from OpenCL: // "Returns a space separated list of extension names (the extension @@ -258,9 +258,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( case UR_DEVICE_INFO_BUILD_ON_SUBDEVICE: return ReturnValue(uint32_t{0}); case UR_DEVICE_INFO_COMPILER_AVAILABLE: - return ReturnValue(static_cast(true)); + return ReturnValue(static_cast(true)); case UR_DEVICE_INFO_LINKER_AVAILABLE: - return ReturnValue(static_cast(true)); + return ReturnValue(static_cast(true)); case UR_DEVICE_INFO_MAX_COMPUTE_UNITS: { uint32_t MaxComputeUnits = Device->ZeDeviceProperties->numEUsPerSubslice * @@ -410,7 +410,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( case UR_EXT_DEVICE_INFO_OPENCL_C_VERSION: return ReturnValue(""); case UR_DEVICE_INFO_PREFERRED_INTEROP_USER_SYNC: - return ReturnValue(static_cast(true)); + return ReturnValue(static_cast(true)); case UR_DEVICE_INFO_PRINTF_BUFFER_SIZE: return ReturnValue( size_t{Device->ZeDeviceModuleProperties->printfBufferSize}); @@ -427,7 +427,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( return ReturnValue(ur_device_exec_capability_flag_t{ UR_DEVICE_EXEC_CAPABILITY_FLAG_NATIVE_KERNEL}); case UR_DEVICE_INFO_ENDIAN_LITTLE: - return ReturnValue(static_cast(true)); + return ReturnValue(static_cast(true)); case UR_DEVICE_INFO_ERROR_CORRECTION_SUPPORT: return ReturnValue(static_cast(Device->ZeDeviceProperties->flags & 
ZE_DEVICE_PROPERTY_FLAG_ECC)); @@ -604,7 +604,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( } case UR_DEVICE_INFO_SUB_GROUP_INDEPENDENT_FORWARD_PROGRESS: { // TODO: Not supported yet. Needs to be updated after support is added. - return ReturnValue(static_cast(false)); + return ReturnValue(static_cast(false)); } case UR_DEVICE_INFO_SUB_GROUP_SIZES_INTEL: { // ze_device_compute_properties.subGroupSizes is in uint32_t whereas the @@ -790,7 +790,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( return UR_RESULT_ERROR_INVALID_VALUE; case UR_DEVICE_INFO_BFLOAT16: { // bfloat16 math functions are not yet supported on Intel GPUs. - return ReturnValue(bool{false}); + return ReturnValue(ur_bool_t{false}); } case UR_DEVICE_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES: { // There are no explicit restrictions in L0 programming guide, so assume all @@ -839,9 +839,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( return ReturnValue(capabilities); } case UR_DEVICE_INFO_MEM_CHANNEL_SUPPORT: - return ReturnValue(uint32_t{false}); + return ReturnValue(ur_bool_t{false}); case UR_DEVICE_INFO_IMAGE_SRGB: - return ReturnValue(uint32_t{false}); + return ReturnValue(ur_bool_t{false}); case UR_DEVICE_INFO_QUEUE_ON_DEVICE_PROPERTIES: case UR_DEVICE_INFO_QUEUE_ON_HOST_PROPERTIES: { @@ -853,6 +853,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( 0)); //__read_write attribute currently undefinde in opencl } case UR_DEVICE_INFO_VIRTUAL_MEMORY_SUPPORT: { + return ReturnValue(static_cast(true)); + } + case UR_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP: { return ReturnValue(static_cast(true)); } @@ -893,9 +896,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( // can know if we are in (a) or (b) by checking if a tile is root device // or not. ur_device_handle_t URDev = Device->Platform->getDeviceFromNativeHandle(d); - if (URDev->isSubDevice()) + if (URDev->isSubDevice()) { // We are in COMPOSITE mode, return an empty list. 
- return ReturnValue(0); + if (pSize) { + *pSize = 0; + } + return UR_RESULT_SUCCESS; + } Res.push_back(URDev); } @@ -1078,7 +1085,7 @@ bool ur_device_handle_t_::useDriverInOrderLists() { static const bool UseDriverInOrderLists = [] { const char *UrRet = std::getenv("UR_L0_USE_DRIVER_INORDER_LISTS"); if (!UrRet) - return true; + return false; return std::atoi(UrRet) != 0; }(); diff --git a/source/adapters/level_zero/event.cpp b/source/adapters/level_zero/event.cpp index e472c2490c..615e887172 100644 --- a/source/adapters/level_zero/event.cpp +++ b/source/adapters/level_zero/event.cpp @@ -368,8 +368,16 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWaitWithBarrier( } // Execute each command list so the barriers can be encountered. - for (ur_command_list_ptr_t &CmdList : CmdLists) + for (ur_command_list_ptr_t &CmdList : CmdLists) { + bool IsCopy = + CmdList->second.isCopy(reinterpret_cast(Queue)); + const auto &CommandBatch = + (IsCopy) ? Queue->CopyCommandBatch : Queue->ComputeCommandBatch; + // Only batch if the matching CmdList is already open. + OkToBatch = CommandBatch.OpenCommandList == CmdList; + UR_CALL(Queue->executeCommandList(CmdList, false, OkToBatch)); + } UR_CALL(Queue->ActiveBarriers.clear()); auto UREvent = reinterpret_cast(*Event); @@ -472,8 +480,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventGetProfilingInfo( ) { std::shared_lock EventLock(Event->Mutex); - if (Event->UrQueue && - (Event->UrQueue->Properties & UR_QUEUE_FLAG_PROFILING_ENABLE) == 0) { + // The event must either have profiling enabled or be recording timestamps. 
+ bool isTimestampedEvent = Event->isTimestamped(); + if (!Event->isProfilingEnabled() && !isTimestampedEvent) { return UR_RESULT_ERROR_PROFILING_INFO_NOT_AVAILABLE; } @@ -486,6 +495,61 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventGetProfilingInfo( UrReturnHelper ReturnValue(PropValueSize, PropValue, PropValueSizeRet); + // For timestamped events we have the timestamps ready directly on the event + // handle, so we short-circuit the return. + if (isTimestampedEvent) { + uint64_t ContextStartTime = Event->RecordEventStartTimestamp; + switch (PropName) { + case UR_PROFILING_INFO_COMMAND_QUEUED: + case UR_PROFILING_INFO_COMMAND_SUBMIT: + return ReturnValue(ContextStartTime); + case UR_PROFILING_INFO_COMMAND_END: + case UR_PROFILING_INFO_COMMAND_START: { + // If RecordEventEndTimestamp on the event is non-zero it means it has + // collected the result of the queue already. In that case it has been + // adjusted and is ready for immediate return. + if (Event->RecordEventEndTimestamp) + return ReturnValue(Event->RecordEventEndTimestamp); + + // Otherwise we need to collect it from the queue. + auto Entry = Event->UrQueue->EndTimeRecordings.find(Event); + + // Unexpected state if there is no end-time record. + if (Entry == Event->UrQueue->EndTimeRecordings.end()) + return UR_RESULT_ERROR_UNKNOWN; + auto &EndTimeRecording = Entry->second; + + // End time needs to be adjusted for resolution and valid bits. + uint64_t ContextEndTime = + (EndTimeRecording.RecordEventEndTimestamp & TimestampMaxValue) * + ZeTimerResolution; + + // If the result is 0, we have not yet gotten results back and so we just + // return it. + if (ContextEndTime == 0) + return ReturnValue(ContextEndTime); + + // Handle a possible wrap-around (the underlying HW counter is < 64-bit). + // Note, it will not report correct time if there were multiple wrap + // arounds, and the longer term plan is to enlarge the capacity of the + // HW timestamps. 
+ if (ContextEndTime < ContextStartTime) + ContextEndTime += TimestampMaxValue * ZeTimerResolution; + + // Now that we have the result, there is no need to keep it in the queue + // anymore, so we cache it on the event and evict the record from the + // queue. + Event->RecordEventEndTimestamp = ContextEndTime; + Event->UrQueue->EndTimeRecordings.erase(Entry); + + return ReturnValue(ContextEndTime); + } + default: + logger::error("urEventGetProfilingInfo: not supported ParamName"); + return UR_RESULT_ERROR_INVALID_VALUE; + } + } + ze_kernel_timestamp_result_t tsResult; // A Command-buffer consists of three command-lists for which only a single @@ -594,6 +658,63 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventGetProfilingInfo( return UR_RESULT_SUCCESS; } +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueTimestampRecordingExp( + ur_queue_handle_t Queue, ///< [in] handle of the queue object + bool Blocking, ///< [in] blocking or non-blocking enqueue + uint32_t NumEventsInWaitList, ///< [in] size of the event wait list + const ur_event_handle_t + *EventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] + ///< pointer to a list of events that must be complete + ///< before this command can be executed. If nullptr, + ///< the numEventsInWaitList must be 0, indicating + ///< that this command does not wait on any event to + ///< complete. + ur_event_handle_t + *OutEvent ///< [in,out] return an event object that identifies + ///< this particular command instance. +) { + // Lock automatically releases when this goes out of scope. 
+ std::scoped_lock lock(Queue->Mutex); + + ur_device_handle_t Device = Queue->Device; + + bool UseCopyEngine = false; + _ur_ze_event_list_t TmpWaitList; + UR_CALL(TmpWaitList.createAndRetainUrZeEventList( + NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine)); + + // Get a new command list to be used on this call + ur_command_list_ptr_t CommandList{}; + UR_CALL(Queue->Context->getAvailableCommandList( + Queue, CommandList, UseCopyEngine, NumEventsInWaitList, EventWaitList, + /* AllowBatching */ false)); + + UR_CALL(createEventAndAssociateQueue( + Queue, OutEvent, UR_COMMAND_TIMESTAMP_RECORDING_EXP, CommandList, + /* IsInternal */ false, /* HostVisible */ true)); + ze_event_handle_t ZeEvent = (*OutEvent)->ZeEvent; + (*OutEvent)->WaitList = TmpWaitList; + + uint64_t DeviceStartTimestamp = 0; + UR_CALL(urDeviceGetGlobalTimestamps(Device, &DeviceStartTimestamp, nullptr)); + (*OutEvent)->RecordEventStartTimestamp = DeviceStartTimestamp; + + // Create a new entry in the queue's recordings. + Queue->EndTimeRecordings[*OutEvent] = + ur_queue_handle_t_::end_time_recording{}; + + ZE2UR_CALL(zeCommandListAppendWriteGlobalTimestamp, + (CommandList->first, + &Queue->EndTimeRecordings[*OutEvent].RecordEventEndTimestamp, + ZeEvent, (*OutEvent)->WaitList.Length, + (*OutEvent)->WaitList.ZeEventList)); + + UR_CALL( + Queue->executeCommandList(CommandList, Blocking, /* OkToBatch */ false)); + + return UR_RESULT_SUCCESS; +} + ur_result_t ur_event_handle_t_::getOrCreateHostVisibleEvent( ze_event_handle_t &ZeHostVisibleEvent) { @@ -903,6 +1024,23 @@ ur_result_t urEventReleaseInternal(ur_event_handle_t Event) { Event->Context->addEventToContextCache(Event); } + // If the event was a timestamp recording, we try to evict its entry in the + // queue. 
+ if (Event->isTimestamped()) { + auto Entry = Queue->EndTimeRecordings.find(Event); + if (Entry != Queue->EndTimeRecordings.end()) { + auto &EndTimeRecording = Entry->second; + if (EndTimeRecording.RecordEventEndTimestamp == 0) { + // If the end time recording has not finished, we tell the queue that + // the event is no longer alive to avoid invalid write-backs. + EndTimeRecording.EventHasDied = true; + } else { + // Otherwise we evict the entry. + Event->UrQueue->EndTimeRecordings.erase(Entry); + } + } + } + // We intentionally incremented the reference counter when an event is // created so that we can avoid ur_queue_handle_t is released before the // associated ur_event_handle_t is released. Here we have to decrement it so @@ -1289,16 +1427,26 @@ ur_result_t _ur_ze_event_list_t::createAndRetainUrZeEventList( } auto Queue = EventList[I]->UrQueue; - if (Queue) { - // The caller of createAndRetainUrZeEventList must already hold - // a lock of the CurQueue. Additionally lock the Queue if it - // is different from CurQueue. - // TODO: rework this to avoid deadlock when another thread is - // locking the same queues but in a different order. - auto Lock = ((Queue == CurQueue) - ? std::unique_lock() - : std::unique_lock(Queue->Mutex)); + auto CurQueueDevice = CurQueue->Device; + std::optional> QueueLock = + std::nullopt; + // The caller of createAndRetainUrZeEventList must already hold + // a lock of the CurQueue. However, if the CurQueue is different + // then the Event's Queue, we need to drop that lock and + // acquire the Event's Queue lock. This is done to avoid a lock + // ordering issue. + // For the rest of this scope, CurQueue cannot be accessed. + // TODO: This solution is very error-prone. This requires a refactor + // to either have fine-granularity locks inside of the queues or + // to move any operations on queues other than CurQueue out + // of this scope. 
+ if (Queue && Queue != CurQueue) { + CurQueue->Mutex.unlock(); + QueueLock = std::unique_lock(Queue->Mutex); + } + + if (Queue) { // If the event that is going to be waited is in an open batch // different from where this next command is going to be added, // then we have to force execute of that open command-list @@ -1341,7 +1489,7 @@ ur_result_t _ur_ze_event_list_t::createAndRetainUrZeEventList( } ur_command_list_ptr_t CommandList; - if (Queue && Queue->Device != CurQueue->Device) { + if (Queue && Queue->Device != CurQueueDevice) { // Get a command list prior to acquiring an event lock. // This prevents a potential deadlock with recursive // event locks. @@ -1351,7 +1499,7 @@ ur_result_t _ur_ze_event_list_t::createAndRetainUrZeEventList( std::shared_lock Lock(EventList[I]->Mutex); - if (Queue && Queue->Device != CurQueue->Device && + if (Queue && Queue->Device != CurQueueDevice && !EventList[I]->IsMultiDevice) { ze_event_handle_t MultiDeviceZeEvent = nullptr; ur_event_handle_t MultiDeviceEvent; @@ -1386,6 +1534,10 @@ ur_result_t _ur_ze_event_list_t::createAndRetainUrZeEventList( this->UrEventList[TmpListLength]->RefCount.increment(); } + if (QueueLock.has_value()) { + QueueLock.reset(); + CurQueue->Mutex.lock(); + } TmpListLength += 1; } } @@ -1464,3 +1616,12 @@ bool ur_event_handle_t_::isProfilingEnabled() const { return !UrQueue || // tentatively assume user events are profiling enabled (UrQueue->Properties & UR_QUEUE_FLAG_PROFILING_ENABLE) != 0; } + +// Tells if this event was created as a timestamp event, allowing profiling +// info even if profiling is not enabled. +bool ur_event_handle_t_::isTimestamped() const { + // If we are recording, the start time of the event will be non-zero. The + // end time might still be missing, depending on whether the corresponding + // enqueue is still running. 
+ return RecordEventStartTimestamp != 0; +} diff --git a/source/adapters/level_zero/event.hpp b/source/adapters/level_zero/event.hpp index 08b4be7969..2d1f536e4e 100644 --- a/source/adapters/level_zero/event.hpp +++ b/source/adapters/level_zero/event.hpp @@ -207,6 +207,14 @@ struct ur_event_handle_t_ : _ur_object { // Indicates within creation of proxy event. bool IsCreatingHostProxyEvent = {false}; + // Indicates the recorded start and end timestamps for the event. These are + // only set for events returned by timestamp recording enqueue functions. + // A non-zero value for RecordEventStartTimestamp indicates the event was the + // result of a timestamp recording. If RecordEventEndTimestamp is non-zero, it + // means the event has fetched the end-timestamp from the queue. + uint64_t RecordEventStartTimestamp = 0; + uint64_t RecordEventEndTimestamp = 0; + // Besides each PI object keeping a total reference count in // _ur_object::RefCount we keep special track of the event *external* // references. This way we are able to tell when the event is not referenced @@ -231,6 +239,10 @@ struct ur_event_handle_t_ : _ur_object { // Tells if this event is with profiling capabilities. bool isProfilingEnabled() const; + // Tells if this event was created as a timestamp event, allowing profiling + // info even if profiling is not enabled. + bool isTimestamped() const; + // Get the host-visible event or create one and enqueue its signal. 
ur_result_t getOrCreateHostVisibleEvent(ze_event_handle_t &HostVisibleEvent); diff --git a/source/adapters/level_zero/queue.cpp b/source/adapters/level_zero/queue.cpp index ab2277d8b7..ad48962375 100644 --- a/source/adapters/level_zero/queue.cpp +++ b/source/adapters/level_zero/queue.cpp @@ -1514,8 +1514,7 @@ ur_queue_handle_t_::resetDiscardedEvent(ur_command_list_ptr_t CommandList) { } ur_result_t ur_queue_handle_t_::addEventToQueueCache(ur_event_handle_t Event) { - if (!Event->IsMultiDevice && Event->UrQueue) { - auto Device = Event->UrQueue->Device; + if (!Event->IsMultiDevice) { auto EventCachesMap = Event->isHostVisible() ? &EventCachesDeviceMap[0] : &EventCachesDeviceMap[1]; if (EventCachesMap->find(Device) == EventCachesMap->end()) { @@ -1542,6 +1541,34 @@ ur_result_t ur_queue_handle_t_::active_barriers::clear() { return UR_RESULT_SUCCESS; } +void ur_queue_handle_t_::clearEndTimeRecordings() { + uint64_t ZeTimerResolution = Device->ZeDeviceProperties->timerResolution; + const uint64_t TimestampMaxValue = + ((1ULL << Device->ZeDeviceProperties->kernelTimestampValidBits) - 1ULL); + + for (auto Entry : EndTimeRecordings) { + auto &Event = Entry.first; + auto &EndTimeRecording = Entry.second; + if (!Entry.second.EventHasDied) { + // Write the result back to the event if it is not dead. + uint64_t ContextEndTime = + (EndTimeRecording.RecordEventEndTimestamp & TimestampMaxValue) * + ZeTimerResolution; + + // Handle a possible wrap-around (the underlying HW counter is < 64-bit). + // Note, it will not report correct time if there were multiple wrap + // arounds, and the longer term plan is to enlarge the capacity of the + // HW timestamps. + if (ContextEndTime < Event->RecordEventStartTimestamp) + ContextEndTime += TimestampMaxValue * ZeTimerResolution; + + // Store it in the event. 
+ Event->RecordEventEndTimestamp = ContextEndTime; + } + } + EndTimeRecordings.clear(); +} + ur_result_t urQueueReleaseInternal(ur_queue_handle_t Queue) { ur_queue_handle_t UrQueue = reinterpret_cast(Queue); @@ -1567,6 +1594,8 @@ ur_result_t urQueueReleaseInternal(ur_queue_handle_t Queue) { } } + Queue->clearEndTimeRecordings(); + logger::debug("urQueueRelease(compute) NumTimesClosedFull {}, " "NumTimesClosedEarly {}", UrQueue->ComputeCommandBatch.NumTimesClosedFull, @@ -1720,6 +1749,11 @@ ur_result_t ur_queue_handle_t_::synchronize() { LastCommandEvent = nullptr; } + // Since all timestamp recordings should have finished with the + // synchronizations, we can clear the map and write the results to the owning + // events. + clearEndTimeRecordings(); + // With the entire queue synchronized, the active barriers must be done so we // can remove them. if (auto Res = ActiveBarriers.clear()) diff --git a/source/adapters/level_zero/queue.hpp b/source/adapters/level_zero/queue.hpp index ed832148ac..799e90e9d9 100644 --- a/source/adapters/level_zero/queue.hpp +++ b/source/adapters/level_zero/queue.hpp @@ -486,6 +486,22 @@ struct ur_queue_handle_t_ : _ur_object { std::vector> EventCachesDeviceMap{2}; + // End-times enqueued are stored on the queue rather than on the event to + // avoid the event objects having been destroyed prior to the write to the + // end-time member. + struct end_time_recording { + // RecordEventEndTimestamp is not adjusted for valid bits nor resolution, as + // it is written asynchronously. + uint64_t RecordEventEndTimestamp = 0; + // The event may die before the recording has been written back. In this + // case the event will mark this for deletion when the queue sees fit. + bool EventHasDied = false; + }; + std::map EndTimeRecordings; + + // Clear the end time recording timestamps entries. + void clearEndTimeRecordings(); + // adjust the queue's batch size, knowing that the current command list // is being closed with a full batch. 
// For copy commands, IsCopy is set to 'true'. diff --git a/source/adapters/level_zero/ur_interface_loader.cpp b/source/adapters/level_zero/ur_interface_loader.cpp index 051db73145..d6d6060ea6 100644 --- a/source/adapters/level_zero/ur_interface_loader.cpp +++ b/source/adapters/level_zero/ur_interface_loader.cpp @@ -456,6 +456,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable( pDdiTable->pfnCooperativeKernelLaunchExp = urEnqueueCooperativeKernelLaunchExp; + pDdiTable->pfnTimestampRecordingExp = urEnqueueTimestampRecordingExp; return UR_RESULT_SUCCESS; } diff --git a/source/adapters/level_zero/usm.cpp b/source/adapters/level_zero/usm.cpp index c4cbfc9d26..19d31bcb9b 100644 --- a/source/adapters/level_zero/usm.cpp +++ b/source/adapters/level_zero/usm.cpp @@ -171,7 +171,7 @@ static ur_result_t USMAllocationMakeResident( static ur_result_t USMDeviceAllocImpl(void **ResultPtr, ur_context_handle_t Context, ur_device_handle_t Device, - ur_usm_device_mem_flags_t *Flags, + ur_usm_device_mem_flags_t Flags, size_t Size, uint32_t Alignment) { std::ignore = Flags; // TODO: translate PI properties to Level Zero flags @@ -213,12 +213,10 @@ static ur_result_t USMDeviceAllocImpl(void **ResultPtr, return UR_RESULT_SUCCESS; } -static ur_result_t USMSharedAllocImpl(void **ResultPtr, - ur_context_handle_t Context, - ur_device_handle_t Device, - ur_usm_host_mem_flags_t *, - ur_usm_device_mem_flags_t *, size_t Size, - uint32_t Alignment) { +static ur_result_t +USMSharedAllocImpl(void **ResultPtr, ur_context_handle_t Context, + ur_device_handle_t Device, ur_usm_host_mem_flags_t, + ur_usm_device_mem_flags_t, size_t Size, uint32_t Alignment) { // TODO: translate PI properties to Level Zero flags ZeStruct ZeHostDesc; @@ -263,7 +261,7 @@ static ur_result_t USMSharedAllocImpl(void **ResultPtr, static ur_result_t USMHostAllocImpl(void **ResultPtr, ur_context_handle_t Context, - ur_usm_host_mem_flags_t *Flags, size_t Size, + ur_usm_host_mem_flags_t Flags, size_t Size, 
uint32_t Alignment) { std::ignore = Flags; // TODO: translate PI properties to Level Zero flags @@ -767,8 +765,8 @@ umf_result_t L0MemoryProvider::get_min_page_size(void *Ptr, size_t *PageSize) { ur_result_t L0SharedMemoryProvider::allocateImpl(void **ResultPtr, size_t Size, uint32_t Alignment) { - return USMSharedAllocImpl(ResultPtr, Context, Device, nullptr, nullptr, Size, - Alignment); + return USMSharedAllocImpl(ResultPtr, Context, Device, /*host flags*/ 0, + /*device flags*/ 0, Size, Alignment); } ur_result_t L0SharedReadOnlyMemoryProvider::allocateImpl(void **ResultPtr, @@ -776,20 +774,19 @@ ur_result_t L0SharedReadOnlyMemoryProvider::allocateImpl(void **ResultPtr, uint32_t Alignment) { ur_usm_device_desc_t UsmDeviceDesc{}; UsmDeviceDesc.flags = UR_USM_DEVICE_MEM_FLAG_DEVICE_READ_ONLY; - ur_usm_host_desc_t UsmHostDesc{}; - return USMSharedAllocImpl(ResultPtr, Context, Device, &UsmDeviceDesc.flags, - &UsmHostDesc.flags, Size, Alignment); + return USMSharedAllocImpl(ResultPtr, Context, Device, UsmDeviceDesc.flags, + /*host flags*/ 0, Size, Alignment); } ur_result_t L0DeviceMemoryProvider::allocateImpl(void **ResultPtr, size_t Size, uint32_t Alignment) { - return USMDeviceAllocImpl(ResultPtr, Context, Device, nullptr, Size, + return USMDeviceAllocImpl(ResultPtr, Context, Device, /* flags */ 0, Size, Alignment); } ur_result_t L0HostMemoryProvider::allocateImpl(void **ResultPtr, size_t Size, uint32_t Alignment) { - return USMHostAllocImpl(ResultPtr, Context, nullptr, Size, Alignment); + return USMHostAllocImpl(ResultPtr, Context, /* flags */ 0, Size, Alignment); } ur_usm_pool_handle_t_::ur_usm_pool_handle_t_(ur_context_handle_t Context, diff --git a/source/adapters/native_cpu/device.cpp b/source/adapters/native_cpu/device.cpp index 1babdb0f10..e7a3e8adf1 100644 --- a/source/adapters/native_cpu/device.cpp +++ b/source/adapters/native_cpu/device.cpp @@ -321,6 +321,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, case 
UR_DEVICE_INFO_COMMAND_BUFFER_UPDATE_SUPPORT_EXP: return ReturnValue(false); + case UR_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP: + return ReturnValue(false); default: DIE_NO_IMPLEMENTATION; } diff --git a/source/adapters/native_cpu/event.cpp b/source/adapters/native_cpu/event.cpp index 112bb553c0..9049e3c1b6 100644 --- a/source/adapters/native_cpu/event.cpp +++ b/source/adapters/native_cpu/event.cpp @@ -87,3 +87,15 @@ urEventSetCallback(ur_event_handle_t hEvent, ur_execution_info_t execStatus, DIE_NO_IMPLEMENTATION; } + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueTimestampRecordingExp( + ur_queue_handle_t hQueue, bool blocking, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + std::ignore = hQueue; + std::ignore = blocking; + std::ignore = numEventsInWaitList; + std::ignore = phEventWaitList; + std::ignore = phEvent; + + DIE_NO_IMPLEMENTATION; +} diff --git a/source/adapters/native_cpu/ur_interface_loader.cpp b/source/adapters/native_cpu/ur_interface_loader.cpp index 1f54a98c80..065012613e 100644 --- a/source/adapters/native_cpu/ur_interface_loader.cpp +++ b/source/adapters/native_cpu/ur_interface_loader.cpp @@ -392,6 +392,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable( } pDdiTable->pfnCooperativeKernelLaunchExp = nullptr; + pDdiTable->pfnTimestampRecordingExp = urEnqueueTimestampRecordingExp; return UR_RESULT_SUCCESS; } diff --git a/source/adapters/null/ur_nullddi.cpp b/source/adapters/null/ur_nullddi.cpp index 37d0ceb15a..a004386baa 100644 --- a/source/adapters/null/ur_nullddi.cpp +++ b/source/adapters/null/ur_nullddi.cpp @@ -5513,6 +5513,47 @@ __urdlllocal ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( return exceptionToResult(std::current_exception()); } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urEnqueueTimestampRecordingExp +__urdlllocal ur_result_t UR_APICALL 
urEnqueueTimestampRecordingExp( + ur_queue_handle_t hQueue, ///< [in] handle of the queue object + bool + blocking, ///< [in] indicates whether the call to this function should block until + ///< until the device timestamp recording command has executed on the + ///< device. + uint32_t numEventsInWaitList, ///< [in] size of the event wait list + const ur_event_handle_t * + phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the kernel execution. + ///< If nullptr, the numEventsInWaitList must be 0, indicating no wait + ///< events. + ur_event_handle_t * + phEvent ///< [in,out] return an event object that identifies this particular kernel + ///< execution instance. Profiling information can be queried + ///< from this event as if `hQueue` had profiling enabled. Querying + ///< `UR_PROFILING_INFO_COMMAND_QUEUED` or `UR_PROFILING_INFO_COMMAND_SUBMIT` + ///< reports the timestamp at the time of the call to this function. + ///< Querying `UR_PROFILING_INFO_COMMAND_START` or `UR_PROFILING_INFO_COMMAND_END` + ///< reports the timestamp recorded when the command is executed on the device. + ) try { + ur_result_t result = UR_RESULT_SUCCESS; + + // if the driver has created a custom function, then call it instead of using the generic path + auto pfnTimestampRecordingExp = + d_context.urDdiTable.EnqueueExp.pfnTimestampRecordingExp; + if (nullptr != pfnTimestampRecordingExp) { + result = pfnTimestampRecordingExp(hQueue, blocking, numEventsInWaitList, + phEventWaitList, phEvent); + } else { + // generic implementation + *phEvent = reinterpret_cast(d_context.get()); + } + + return result; +} catch (...) 
{ + return exceptionToResult(std::current_exception()); +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Intercept function for urProgramBuildExp __urdlllocal ur_result_t UR_APICALL urProgramBuildExp( @@ -6069,6 +6110,9 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable( pDdiTable->pfnCooperativeKernelLaunchExp = driver::urEnqueueCooperativeKernelLaunchExp; + pDdiTable->pfnTimestampRecordingExp = + driver::urEnqueueTimestampRecordingExp; + return result; } catch (...) { return exceptionToResult(std::current_exception()); diff --git a/source/adapters/opencl/device.cpp b/source/adapters/opencl/device.cpp index d89a9492a5..ac79b71876 100644 --- a/source/adapters/opencl/device.cpp +++ b/source/adapters/opencl/device.cpp @@ -507,7 +507,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, cl_adapter::cast(hDevice), {"cl_khr_fp16"}, Supported)); if (!Supported) { - return UR_RESULT_ERROR_INVALID_ENUMERATION; + return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; } } @@ -799,6 +799,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, case UR_DEVICE_INFO_VIRTUAL_MEMORY_SUPPORT: { return ReturnValue(false); } + case UR_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP: { + return ReturnValue(false); + } case UR_DEVICE_INFO_HOST_PIPE_READ_WRITE_SUPPORTED: { bool Supported = false; CL_RETURN_ON_FAILURE(cl_adapter::checkDeviceExtensions( diff --git a/source/adapters/opencl/event.cpp b/source/adapters/opencl/event.cpp index f5af30734c..5141ce8ff0 100644 --- a/source/adapters/opencl/event.cpp +++ b/source/adapters/opencl/event.cpp @@ -257,3 +257,9 @@ urEventSetCallback(ur_event_handle_t hEvent, ur_execution_info_t execStatus, CallbackType, ClCallback, Callback)); return UR_RESULT_SUCCESS; } + +UR_APIEXPORT ur_result_t UR_APICALL +urEnqueueTimestampRecordingExp(ur_queue_handle_t, bool, uint32_t, + const ur_event_handle_t *, ur_event_handle_t *) { + return 
UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} diff --git a/source/adapters/opencl/ur_interface_loader.cpp b/source/adapters/opencl/ur_interface_loader.cpp index 8c2c73d7c8..effb2128c3 100644 --- a/source/adapters/opencl/ur_interface_loader.cpp +++ b/source/adapters/opencl/ur_interface_loader.cpp @@ -398,6 +398,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable( pDdiTable->pfnCooperativeKernelLaunchExp = urEnqueueCooperativeKernelLaunchExp; + pDdiTable->pfnTimestampRecordingExp = urEnqueueTimestampRecordingExp; return UR_RESULT_SUCCESS; } diff --git a/source/common/ur_util.hpp b/source/common/ur_util.hpp index 9cecdbec1e..f588a21a2e 100644 --- a/source/common/ur_util.hpp +++ b/source/common/ur_util.hpp @@ -294,7 +294,7 @@ template struct stype_map {}; // stype_map_impl {}; #include "stype_map_helpers.def" -template constexpr int as_stype() { return stype_map::value; }; +template constexpr int as_stype() { return stype_map::value; } /// Walk a generic UR linked list looking for a node of the given type. If it's /// found, its address is returned, othewise `nullptr`. e.g. 
to find out whether diff --git a/source/loader/layers/sanitizer/asan_interceptor.cpp b/source/loader/layers/sanitizer/asan_interceptor.cpp index c55d752410..b05d7c6348 100644 --- a/source/loader/layers/sanitizer/asan_interceptor.cpp +++ b/source/loader/layers/sanitizer/asan_interceptor.cpp @@ -22,8 +22,6 @@ namespace ur_sanitizer_layer { namespace { -constexpr auto kSPIR_DeviceSanitizerReportMem = "__DeviceSanitizerReportMem"; - uptr MemToShadow_CPU(uptr USM_SHADOW_BASE, uptr UPtr) { return USM_SHADOW_BASE + (UPtr >> 3); } @@ -348,11 +346,14 @@ ur_result_t SanitizerInterceptor::releaseMemory(ur_context_handle_t Context, ur_result_t SanitizerInterceptor::preLaunchKernel(ur_kernel_handle_t Kernel, ur_queue_handle_t Queue, - LaunchInfo &LaunchInfo) { + USMLaunchInfo &LaunchInfo) { auto Context = GetContext(Queue); auto Device = GetDevice(Queue); auto ContextInfo = getContextInfo(Context); auto DeviceInfo = getDeviceInfo(Device); + auto KernelInfo = getKernelInfo(Kernel); + + UR_CALL(LaunchInfo.updateKernelInfo(*KernelInfo.get())); ManagedQueue InternalQueue(Context, Device); if (!InternalQueue) { @@ -370,23 +371,12 @@ ur_result_t SanitizerInterceptor::preLaunchKernel(ur_kernel_handle_t Kernel, ur_result_t SanitizerInterceptor::postLaunchKernel(ur_kernel_handle_t Kernel, ur_queue_handle_t Queue, - ur_event_handle_t &Event, - LaunchInfo &LaunchInfo) { - auto Program = GetProgram(Kernel); - ur_event_handle_t ReadEvent{}; - - // If kernel has defined SPIR_DeviceSanitizerReportMem, then we try to read it - // to host, but it's okay that it isn't defined + USMLaunchInfo &LaunchInfo) { // FIXME: We must use block operation here, until we support urEventSetCallback - auto Result = context.urDdiTable.Enqueue.pfnDeviceGlobalVariableRead( - Queue, Program, kSPIR_DeviceSanitizerReportMem, true, - sizeof(LaunchInfo.SPIR_DeviceSanitizerReportMem), 0, - &LaunchInfo.SPIR_DeviceSanitizerReportMem, 1, &Event, &ReadEvent); + auto Result = context.urDdiTable.Queue.pfnFinish(Queue); if 
(Result == UR_RESULT_SUCCESS) { - Event = ReadEvent; - - const auto &AH = LaunchInfo.SPIR_DeviceSanitizerReportMem; + const auto &AH = LaunchInfo.Data->SanitizerReport; if (!AH.Flag) { return UR_RESULT_SUCCESS; } @@ -627,13 +617,44 @@ ur_result_t SanitizerInterceptor::eraseDevice(ur_device_handle_t Device) { return UR_RESULT_SUCCESS; } +ur_result_t SanitizerInterceptor::insertKernel(ur_kernel_handle_t Kernel) { + std::scoped_lock Guard(m_KernelMapMutex); + if (m_KernelMap.find(Kernel) != m_KernelMap.end()) { + return UR_RESULT_SUCCESS; + } + m_KernelMap.emplace(Kernel, std::make_shared(Kernel)); + return UR_RESULT_SUCCESS; +} + +ur_result_t SanitizerInterceptor::eraseKernel(ur_kernel_handle_t Kernel) { + std::scoped_lock Guard(m_KernelMapMutex); + assert(m_KernelMap.find(Kernel) != m_KernelMap.end()); + m_KernelMap.erase(Kernel); + return UR_RESULT_SUCCESS; +} + ur_result_t SanitizerInterceptor::prepareLaunch( ur_context_handle_t Context, std::shared_ptr &DeviceInfo, ur_queue_handle_t Queue, ur_kernel_handle_t Kernel, - LaunchInfo &LaunchInfo) { + USMLaunchInfo &LaunchInfo) { auto Program = GetProgram(Kernel); do { + // Set launch info argument + auto ArgNums = GetKernelNumArgs(Kernel); + if (ArgNums) { + context.logger.debug( + "launch_info {} (numLocalArgs={}, localArgs={})", + (void *)LaunchInfo.Data, LaunchInfo.Data->NumLocalArgs, + (void *)LaunchInfo.Data->LocalArgs); + ur_result_t URes = context.urDdiTable.Kernel.pfnSetArgPointer( + Kernel, ArgNums - 1, nullptr, &LaunchInfo.Data); + if (URes != UR_RESULT_SUCCESS) { + context.logger.error("Failed to set launch info: {}", URes); + return URes; + } + } + // Write global variable to program auto EnqueueWriteGlobal = [Queue, Program](const char *Name, const void *Value, @@ -723,15 +744,17 @@ ur_result_t SanitizerInterceptor::prepareLaunch( "LocalShadowMemorySize={})", NumWG, LocalMemorySize, LocalShadowMemorySize); - UR_CALL(EnqueueAllocateDevice(LocalShadowMemorySize, - LaunchInfo.LocalShadowOffset)); + 
UR_CALL(EnqueueAllocateDevice( + LocalShadowMemorySize, LaunchInfo.Data->LocalShadowOffset)); - LaunchInfo.LocalShadowOffsetEnd = - LaunchInfo.LocalShadowOffset + LocalShadowMemorySize - 1; + LaunchInfo.Data->LocalShadowOffsetEnd = + LaunchInfo.Data->LocalShadowOffset + LocalShadowMemorySize - + 1; - context.logger.info("ShadowMemory(Local, {} - {})", - (void *)LaunchInfo.LocalShadowOffset, - (void *)LaunchInfo.LocalShadowOffsetEnd); + context.logger.info( + "ShadowMemory(Local, {} - {})", + (void *)LaunchInfo.Data->LocalShadowOffset, + (void *)LaunchInfo.Data->LocalShadowOffsetEnd); } } } while (false); @@ -749,15 +772,61 @@ SanitizerInterceptor::findAllocInfoByAddress(uptr Address) { return --It; } -LaunchInfo::~LaunchInfo() { +ur_result_t USMLaunchInfo::initialize() { + UR_CALL(context.urDdiTable.Context.pfnRetain(Context)); + UR_CALL(context.urDdiTable.Device.pfnRetain(Device)); + UR_CALL(context.urDdiTable.USM.pfnSharedAlloc( + Context, Device, nullptr, nullptr, sizeof(LaunchInfo), (void **)&Data)); + *Data = LaunchInfo{}; + return UR_RESULT_SUCCESS; +} + +ur_result_t USMLaunchInfo::updateKernelInfo(const KernelInfo &KI) { + auto NumArgs = KI.LocalArgs.size(); + if (NumArgs) { + Data->NumLocalArgs = NumArgs; + UR_CALL(context.urDdiTable.USM.pfnSharedAlloc( + Context, Device, nullptr, nullptr, sizeof(LocalArgsInfo) * NumArgs, + (void **)&Data->LocalArgs)); + uint32_t i = 0; + for (auto [ArgIndex, ArgInfo] : KI.LocalArgs) { + Data->LocalArgs[i++] = ArgInfo; + context.logger.debug( + "local_args (argIndex={}, size={}, sizeWithRZ={})", ArgIndex, + ArgInfo.Size, ArgInfo.SizeWithRedZone); + } + } + return UR_RESULT_SUCCESS; +} + +USMLaunchInfo::~USMLaunchInfo() { [[maybe_unused]] ur_result_t Result; - if (LocalShadowOffset) { - Result = - context.urDdiTable.USM.pfnFree(Context, (void *)LocalShadowOffset); + if (Data) { + auto Type = GetDeviceType(Device); + if (Type == DeviceType::GPU_PVC) { + if (Data->PrivateShadowOffset) { + Result = 
context.urDdiTable.USM.pfnFree( + Context, (void *)Data->PrivateShadowOffset); + assert(Result == UR_RESULT_SUCCESS); + } + if (Data->LocalShadowOffset) { + Result = context.urDdiTable.USM.pfnFree( + Context, (void *)Data->LocalShadowOffset); + assert(Result == UR_RESULT_SUCCESS); + } + } + if (Data->LocalArgs) { + Result = context.urDdiTable.USM.pfnFree(Context, + (void *)Data->LocalArgs); + assert(Result == UR_RESULT_SUCCESS); + } + Result = context.urDdiTable.USM.pfnFree(Context, (void *)Data); assert(Result == UR_RESULT_SUCCESS); } Result = context.urDdiTable.Context.pfnRelease(Context); assert(Result == UR_RESULT_SUCCESS); + Result = context.urDdiTable.Device.pfnRelease(Device); + assert(Result == UR_RESULT_SUCCESS); } } // namespace ur_sanitizer_layer diff --git a/source/loader/layers/sanitizer/asan_interceptor.hpp b/source/loader/layers/sanitizer/asan_interceptor.hpp index a691bee7b7..1a699df1f6 100644 --- a/source/loader/layers/sanitizer/asan_interceptor.hpp +++ b/source/loader/layers/sanitizer/asan_interceptor.hpp @@ -79,6 +79,26 @@ struct QueueInfo { } }; +struct KernelInfo { + ur_kernel_handle_t Handle; + + ur_shared_mutex Mutex; + // Need preserve the order of local arguments + std::map LocalArgs; + + explicit KernelInfo(ur_kernel_handle_t Kernel) : Handle(Kernel) { + [[maybe_unused]] auto Result = + context.urDdiTable.Kernel.pfnRetain(Kernel); + assert(Result == UR_RESULT_SUCCESS); + } + + ~KernelInfo() { + [[maybe_unused]] auto Result = + context.urDdiTable.Kernel.pfnRelease(Handle); + assert(Result == UR_RESULT_SUCCESS); + } +}; + struct ContextInfo { ur_context_handle_t Handle; @@ -107,31 +127,30 @@ struct ContextInfo { } }; -struct LaunchInfo { - uptr LocalShadowOffset = 0; - uptr LocalShadowOffsetEnd = 0; - DeviceSanitizerReport SPIR_DeviceSanitizerReportMem; +struct USMLaunchInfo { + LaunchInfo *Data; ur_context_handle_t Context = nullptr; + ur_device_handle_t Device = nullptr; const size_t *GlobalWorkSize = nullptr; const size_t 
*GlobalWorkOffset = nullptr; std::vector LocalWorkSize; uint32_t WorkDim = 0; - LaunchInfo(ur_context_handle_t Context, const size_t *GlobalWorkSize, - const size_t *LocalWorkSize, const size_t *GlobalWorkOffset, - uint32_t WorkDim) - : Context(Context), GlobalWorkSize(GlobalWorkSize), + USMLaunchInfo(ur_context_handle_t Context, ur_device_handle_t Device, + const size_t *GlobalWorkSize, const size_t *LocalWorkSize, + const size_t *GlobalWorkOffset, uint32_t WorkDim) + : Context(Context), Device(Device), GlobalWorkSize(GlobalWorkSize), GlobalWorkOffset(GlobalWorkOffset), WorkDim(WorkDim) { - [[maybe_unused]] auto Result = - context.urDdiTable.Context.pfnRetain(Context); - assert(Result == UR_RESULT_SUCCESS); if (LocalWorkSize) { this->LocalWorkSize = std::vector(LocalWorkSize, LocalWorkSize + WorkDim); } } - ~LaunchInfo(); + ~USMLaunchInfo(); + + ur_result_t initialize(); + ur_result_t updateKernelInfo(const KernelInfo &KI); }; struct DeviceGlobalInfo { @@ -158,12 +177,11 @@ class SanitizerInterceptor { ur_result_t preLaunchKernel(ur_kernel_handle_t Kernel, ur_queue_handle_t Queue, - LaunchInfo &LaunchInfo); + USMLaunchInfo &LaunchInfo); ur_result_t postLaunchKernel(ur_kernel_handle_t Kernel, ur_queue_handle_t Queue, - ur_event_handle_t &Event, - LaunchInfo &LaunchInfo); + USMLaunchInfo &LaunchInfo); ur_result_t insertContext(ur_context_handle_t Context, std::shared_ptr &CI); @@ -173,6 +191,9 @@ class SanitizerInterceptor { std::shared_ptr &CI); ur_result_t eraseDevice(ur_device_handle_t Device); + ur_result_t insertKernel(ur_kernel_handle_t Kernel); + ur_result_t eraseKernel(ur_kernel_handle_t Kernel); + std::optional findAllocInfoByAddress(uptr Address); std::shared_ptr getContextInfo(ur_context_handle_t Context) { @@ -181,6 +202,18 @@ class SanitizerInterceptor { return m_ContextMap[Context]; } + std::shared_ptr getDeviceInfo(ur_device_handle_t Device) { + std::shared_lock Guard(m_DeviceMapMutex); + assert(m_DeviceMap.find(Device) != m_DeviceMap.end()); + return 
m_DeviceMap[Device]; + } + + std::shared_ptr getKernelInfo(ur_kernel_handle_t Kernel) { + std::shared_lock Guard(m_KernelMapMutex); + assert(m_KernelMap.find(Kernel) != m_KernelMap.end()); + return m_KernelMap[Kernel]; + } + private: ur_result_t updateShadowMemory(std::shared_ptr &ContextInfo, std::shared_ptr &DeviceInfo, @@ -195,26 +228,23 @@ class SanitizerInterceptor { std::shared_ptr &DeviceInfo, ur_queue_handle_t Queue, ur_kernel_handle_t Kernel, - LaunchInfo &LaunchInfo); + USMLaunchInfo &LaunchInfo); ur_result_t allocShadowMemory(ur_context_handle_t Context, std::shared_ptr &DeviceInfo); - std::shared_ptr getDeviceInfo(ur_device_handle_t Device) { - std::shared_lock Guard(m_DeviceMapMutex); - assert(m_DeviceMap.find(Device) != m_DeviceMap.end()); - return m_DeviceMap[Device]; - } - private: std::unordered_map> m_ContextMap; ur_shared_mutex m_ContextMapMutex; - std::unordered_map> m_DeviceMap; ur_shared_mutex m_DeviceMapMutex; + std::unordered_map> + m_KernelMap; + ur_shared_mutex m_KernelMapMutex; + /// Assumption: all USM chunks are allocated in one VA AllocationMap m_AllocationMap; ur_shared_mutex m_AllocationMapMutex; diff --git a/source/loader/layers/sanitizer/asan_libdevice.hpp b/source/loader/layers/sanitizer/asan_libdevice.hpp index 46ddee4423..1c8ef24a9d 100644 --- a/source/loader/layers/sanitizer/asan_libdevice.hpp +++ b/source/loader/layers/sanitizer/asan_libdevice.hpp @@ -62,6 +62,23 @@ struct DeviceSanitizerReport { bool IsRecover = false; }; +struct LocalArgsInfo { + uint64_t Size = 0; + uint64_t SizeWithRedZone = 0; +}; + +struct LaunchInfo { + uintptr_t PrivateShadowOffset = + 0; // don't move this field, we use it in AddressSanitizerPass + + uintptr_t LocalShadowOffset = 0; + uintptr_t LocalShadowOffsetEnd = 0; + DeviceSanitizerReport SanitizerReport; + + uint32_t NumLocalArgs = 0; + LocalArgsInfo *LocalArgs = nullptr; // ordered by ArgIndex +}; + constexpr unsigned ASAN_SHADOW_SCALE = 3; constexpr unsigned ASAN_SHADOW_GRANULARITY = 1ULL << 
ASAN_SHADOW_SCALE; diff --git a/source/loader/layers/sanitizer/common.hpp b/source/loader/layers/sanitizer/common.hpp index d5612100aa..1d43c512da 100644 --- a/source/loader/layers/sanitizer/common.hpp +++ b/source/loader/layers/sanitizer/common.hpp @@ -65,6 +65,41 @@ inline constexpr uptr ComputeRZLog(uptr user_requested_size) { return rz_log; } +/// Returns the next integer (mod 2**64) that is greater than or equal to +/// \p Value and is a multiple of \p Align. \p Align must be non-zero. +/// +/// Examples: +/// \code +/// alignTo(5, 8) = 8 +/// alignTo(17, 8) = 24 +/// alignTo(~0LL, 8) = 0 +/// alignTo(321, 255) = 510 +/// \endcode +inline uint64_t AlignTo(uint64_t Value, uint64_t Align) { + assert(Align != 0u && "Align can't be 0."); + return (Value + Align - 1) / Align * Align; +} + +inline uint64_t GetSizeAndRedzoneSizeForLocal(uint64_t Size, + uint64_t Granularity, + uint64_t Alignment) { + uint64_t Res = 0; + if (Size <= 4) { + Res = 16; + } else if (Size <= 16) { + Res = 32; + } else if (Size <= 128) { + Res = Size + 32; + } else if (Size <= 512) { + Res = Size + 64; + } else if (Size <= 4096) { + Res = Size + 128; + } else { + Res = Size + 256; + } + return AlignTo(std::max(Res, 2 * Granularity), Alignment); +} + // ================================================================ // Trace an internal UR call; returns in case of an error. 
diff --git a/source/loader/layers/sanitizer/ur_sanddi.cpp b/source/loader/layers/sanitizer/ur_sanddi.cpp index 53ce5d1c1e..58f54c9338 100644 --- a/source/loader/layers/sanitizer/ur_sanddi.cpp +++ b/source/loader/layers/sanitizer/ur_sanddi.cpp @@ -272,8 +272,10 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunch( context.logger.debug("==== urEnqueueKernelLaunch"); - LaunchInfo LaunchInfo(GetContext(hQueue), pGlobalWorkSize, pLocalWorkSize, - pGlobalWorkOffset, workDim); + USMLaunchInfo LaunchInfo(GetContext(hQueue), GetDevice(hQueue), + pGlobalWorkSize, pLocalWorkSize, pGlobalWorkOffset, + workDim); + UR_CALL(LaunchInfo.initialize()); UR_CALL(context.interceptor->preLaunchKernel(hKernel, hQueue, LaunchInfo)); @@ -283,8 +285,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunch( pLocalWorkSize, numEventsInWaitList, phEventWaitList, &hEvent); if (result == UR_RESULT_SUCCESS) { - UR_CALL(context.interceptor->postLaunchKernel(hKernel, hQueue, hEvent, - LaunchInfo)); + UR_CALL( + context.interceptor->postLaunchKernel(hKernel, hQueue, LaunchInfo)); } if (phEvent) { @@ -374,6 +376,90 @@ __urdlllocal ur_result_t UR_APICALL urContextRelease( return result; } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urKernelCreate +__urdlllocal ur_result_t UR_APICALL urKernelCreate( + ur_program_handle_t hProgram, ///< [in] handle of the program instance + const char *pKernelName, ///< [in] pointer to null-terminated string. + ur_kernel_handle_t + *phKernel ///< [out] pointer to handle of kernel object created. 
+) { + auto pfnCreate = context.urDdiTable.Kernel.pfnCreate; + + if (nullptr == pfnCreate) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + + context.logger.debug("==== urKernelCreate"); + + UR_CALL(pfnCreate(hProgram, pKernelName, phKernel)); + UR_CALL(context.interceptor->insertKernel(*phKernel)); + + return UR_RESULT_SUCCESS; +} + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urKernelRelease +__urdlllocal ur_result_t urKernelRelease( + ur_kernel_handle_t hKernel ///< [in] handle for the Kernel to release +) { + auto pfnRelease = context.urDdiTable.Kernel.pfnRelease; + + if (nullptr == pfnRelease) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + + context.logger.debug("==== urKernelRelease"); + UR_CALL(pfnRelease(hKernel)); + + if (auto KernelInfo = context.interceptor->getKernelInfo(hKernel)) { + uint32_t RefCount; + UR_CALL(context.urDdiTable.Kernel.pfnGetInfo( + hKernel, UR_KERNEL_INFO_REFERENCE_COUNT, sizeof(RefCount), + &RefCount, nullptr)); + if (RefCount == 1) { + UR_CALL(context.interceptor->eraseKernel(hKernel)); + } + } + + return UR_RESULT_SUCCESS; +} + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urKernelSetArgLocal +__urdlllocal ur_result_t UR_APICALL urKernelSetArgLocal( + ur_kernel_handle_t hKernel, ///< [in] handle of the kernel object + uint32_t argIndex, ///< [in] argument index in range [0, num args - 1] + size_t + argSize, ///< [in] size of the local buffer to be allocated by the runtime + const ur_kernel_arg_local_properties_t + *pProperties ///< [in][optional] pointer to local buffer properties. 
+) { + auto pfnSetArgLocal = context.urDdiTable.Kernel.pfnSetArgLocal; + + if (nullptr == pfnSetArgLocal) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + + context.logger.debug("==== urKernelSetArgLocal (argIndex={}, argSize={})", + argIndex, argSize); + + { + auto KI = context.interceptor->getKernelInfo(hKernel); + std::scoped_lock Guard(KI->Mutex); + // TODO: get local variable alignment + auto argSizeWithRZ = GetSizeAndRedzoneSizeForLocal( + argSize, ASAN_SHADOW_GRANULARITY, ASAN_SHADOW_GRANULARITY); + KI->LocalArgs[argIndex] = LocalArgsInfo{argSize, argSizeWithRZ}; + argSize = argSizeWithRZ; + } + + ur_result_t result = + pfnSetArgLocal(hKernel, argIndex, argSize, pProperties); + + return result; +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Exported function for filling application's Context table /// with current process' addresses @@ -470,6 +556,38 @@ __urdlllocal ur_result_t UR_APICALL urGetProgramExpProcAddrTable( return result; } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Exported function for filling application's Kernel table +/// with current process' addresses +/// +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER +/// - ::UR_RESULT_ERROR_UNSUPPORTED_VERSION +__urdlllocal ur_result_t UR_APICALL urGetKernelProcAddrTable( + ur_api_version_t version, ///< [in] API version requested + ur_kernel_dditable_t + *pDdiTable ///< [in,out] pointer to table of DDI function pointers +) { + if (nullptr == pDdiTable) { + return UR_RESULT_ERROR_INVALID_NULL_POINTER; + } + + if (UR_MAJOR_VERSION(ur_sanitizer_layer::context.version) != + UR_MAJOR_VERSION(version) || + UR_MINOR_VERSION(ur_sanitizer_layer::context.version) > + UR_MINOR_VERSION(version)) { + return UR_RESULT_ERROR_UNSUPPORTED_VERSION; + } + + ur_result_t result = UR_RESULT_SUCCESS; + + pDdiTable->pfnCreate = ur_sanitizer_layer::urKernelCreate; + 
pDdiTable->pfnRelease = ur_sanitizer_layer::urKernelRelease; + pDdiTable->pfnSetArgLocal = ur_sanitizer_layer::urKernelSetArgLocal; + + return result; +} /////////////////////////////////////////////////////////////////////////////// /// @brief Exported function for filling application's Enqueue table /// with current process' addresses @@ -570,6 +688,11 @@ ur_result_t context_t::init(ur_dditable_t *dditable, UR_API_VERSION_CURRENT, &dditable->Context); } + if (UR_RESULT_SUCCESS == result) { + result = ur_sanitizer_layer::urGetKernelProcAddrTable( + UR_API_VERSION_CURRENT, &dditable->Kernel); + } + if (UR_RESULT_SUCCESS == result) { result = ur_sanitizer_layer::urGetProgramProcAddrTable( UR_API_VERSION_CURRENT, &dditable->Program); diff --git a/source/loader/layers/tracing/ur_trcddi.cpp b/source/loader/layers/tracing/ur_trcddi.cpp index 771b46c0c0..6b7be288a3 100644 --- a/source/loader/layers/tracing/ur_trcddi.cpp +++ b/source/loader/layers/tracing/ur_trcddi.cpp @@ -6121,6 +6121,52 @@ __urdlllocal ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( return result; } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urEnqueueTimestampRecordingExp +__urdlllocal ur_result_t UR_APICALL urEnqueueTimestampRecordingExp( + ur_queue_handle_t hQueue, ///< [in] handle of the queue object + bool + blocking, ///< [in] indicates whether the call to this function should block until + ///< until the device timestamp recording command has executed on the + ///< device. + uint32_t numEventsInWaitList, ///< [in] size of the event wait list + const ur_event_handle_t * + phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the kernel execution. + ///< If nullptr, the numEventsInWaitList must be 0, indicating no wait + ///< events. 
+ ur_event_handle_t * + phEvent ///< [in,out] return an event object that identifies this particular kernel + ///< execution instance. Profiling information can be queried + ///< from this event as if `hQueue` had profiling enabled. Querying + ///< `UR_PROFILING_INFO_COMMAND_QUEUED` or `UR_PROFILING_INFO_COMMAND_SUBMIT` + ///< reports the timestamp at the time of the call to this function. + ///< Querying `UR_PROFILING_INFO_COMMAND_START` or `UR_PROFILING_INFO_COMMAND_END` + ///< reports the timestamp recorded when the command is executed on the device. +) { + auto pfnTimestampRecordingExp = + context.urDdiTable.EnqueueExp.pfnTimestampRecordingExp; + + if (nullptr == pfnTimestampRecordingExp) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + + ur_enqueue_timestamp_recording_exp_params_t params = { + &hQueue, &blocking, &numEventsInWaitList, &phEventWaitList, &phEvent}; + uint64_t instance = + context.notify_begin(UR_FUNCTION_ENQUEUE_TIMESTAMP_RECORDING_EXP, + "urEnqueueTimestampRecordingExp", ¶ms); + + ur_result_t result = pfnTimestampRecordingExp( + hQueue, blocking, numEventsInWaitList, phEventWaitList, phEvent); + + context.notify_end(UR_FUNCTION_ENQUEUE_TIMESTAMP_RECORDING_EXP, + "urEnqueueTimestampRecordingExp", ¶ms, &result, + instance); + + return result; +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Intercept function for urProgramBuildExp __urdlllocal ur_result_t UR_APICALL urProgramBuildExp( @@ -6828,6 +6874,10 @@ __urdlllocal ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable( pDdiTable->pfnCooperativeKernelLaunchExp = ur_tracing_layer::urEnqueueCooperativeKernelLaunchExp; + dditable.pfnTimestampRecordingExp = pDdiTable->pfnTimestampRecordingExp; + pDdiTable->pfnTimestampRecordingExp = + ur_tracing_layer::urEnqueueTimestampRecordingExp; + return result; } /////////////////////////////////////////////////////////////////////////////// diff --git a/source/loader/layers/validation/ur_valddi.cpp 
b/source/loader/layers/validation/ur_valddi.cpp index 7939ca21b9..cc11494804 100644 --- a/source/loader/layers/validation/ur_valddi.cpp +++ b/source/loader/layers/validation/ur_valddi.cpp @@ -496,7 +496,7 @@ __urdlllocal ur_result_t UR_APICALL urDeviceGetInfo( return UR_RESULT_ERROR_INVALID_NULL_POINTER; } - if (UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_3D_EXP < propName) { + if (UR_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP < propName) { return UR_RESULT_ERROR_INVALID_ENUMERATION; } @@ -8936,6 +8936,65 @@ __urdlllocal ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( return result; } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urEnqueueTimestampRecordingExp +__urdlllocal ur_result_t UR_APICALL urEnqueueTimestampRecordingExp( + ur_queue_handle_t hQueue, ///< [in] handle of the queue object + bool + blocking, ///< [in] indicates whether the call to this function should block until + ///< until the device timestamp recording command has executed on the + ///< device. + uint32_t numEventsInWaitList, ///< [in] size of the event wait list + const ur_event_handle_t * + phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the kernel execution. + ///< If nullptr, the numEventsInWaitList must be 0, indicating no wait + ///< events. + ur_event_handle_t * + phEvent ///< [in,out] return an event object that identifies this particular kernel + ///< execution instance. Profiling information can be queried + ///< from this event as if `hQueue` had profiling enabled. Querying + ///< `UR_PROFILING_INFO_COMMAND_QUEUED` or `UR_PROFILING_INFO_COMMAND_SUBMIT` + ///< reports the timestamp at the time of the call to this function. + ///< Querying `UR_PROFILING_INFO_COMMAND_START` or `UR_PROFILING_INFO_COMMAND_END` + ///< reports the timestamp recorded when the command is executed on the device. 
+) { + auto pfnTimestampRecordingExp = + context.urDdiTable.EnqueueExp.pfnTimestampRecordingExp; + + if (nullptr == pfnTimestampRecordingExp) { + return UR_RESULT_ERROR_UNINITIALIZED; + } + + if (context.enableParameterValidation) { + if (NULL == hQueue) { + return UR_RESULT_ERROR_INVALID_NULL_HANDLE; + } + + if (NULL == phEvent) { + return UR_RESULT_ERROR_INVALID_NULL_POINTER; + } + + if (phEventWaitList != NULL && numEventsInWaitList > 0) { + for (uint32_t i = 0; i < numEventsInWaitList; ++i) { + if (phEventWaitList[i] == NULL) { + return UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST; + } + } + } + } + + if (context.enableLifetimeValidation && + !refCountContext.isReferenceValid(hQueue)) { + refCountContext.logInvalidReference(hQueue); + } + + ur_result_t result = pfnTimestampRecordingExp( + hQueue, blocking, numEventsInWaitList, phEventWaitList, phEvent); + + return result; +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Intercept function for urProgramBuildExp __urdlllocal ur_result_t UR_APICALL urProgramBuildExp( @@ -9745,6 +9804,10 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable( pDdiTable->pfnCooperativeKernelLaunchExp = ur_validation_layer::urEnqueueCooperativeKernelLaunchExp; + dditable.pfnTimestampRecordingExp = pDdiTable->pfnTimestampRecordingExp; + pDdiTable->pfnTimestampRecordingExp = + ur_validation_layer::urEnqueueTimestampRecordingExp; + return result; } diff --git a/source/loader/ur_adapter_registry.hpp b/source/loader/ur_adapter_registry.hpp index 3cfac34647..060a5ae8a9 100644 --- a/source/loader/ur_adapter_registry.hpp +++ b/source/loader/ur_adapter_registry.hpp @@ -114,11 +114,13 @@ class AdapterRegistry { // to load the adapter. 
std::vector> adaptersLoadPaths; - static constexpr std::array knownAdapterNames{ + static constexpr std::array knownAdapterNames{ MAKE_LIBRARY_NAME("ur_adapter_level_zero", "0"), - MAKE_LIBRARY_NAME("ur_adapter_hip", "0"), MAKE_LIBRARY_NAME("ur_adapter_opencl", "0"), - MAKE_LIBRARY_NAME("ur_adapter_cuda", "0")}; + MAKE_LIBRARY_NAME("ur_adapter_cuda", "0"), + MAKE_LIBRARY_NAME("ur_adapter_hip", "0"), + MAKE_LIBRARY_NAME("ur_adapter_native_cpu", "0"), + }; std::optional> getEnvAdapterSearchPaths() { std::optional> pathStringsOpt; diff --git a/source/loader/ur_ldrddi.cpp b/source/loader/ur_ldrddi.cpp index ffb7eec027..72e8014fe3 100644 --- a/source/loader/ur_ldrddi.cpp +++ b/source/loader/ur_ldrddi.cpp @@ -7653,6 +7653,69 @@ __urdlllocal ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( return result; } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urEnqueueTimestampRecordingExp +__urdlllocal ur_result_t UR_APICALL urEnqueueTimestampRecordingExp( + ur_queue_handle_t hQueue, ///< [in] handle of the queue object + bool + blocking, ///< [in] indicates whether the call to this function should block until + ///< until the device timestamp recording command has executed on the + ///< device. + uint32_t numEventsInWaitList, ///< [in] size of the event wait list + const ur_event_handle_t * + phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the kernel execution. + ///< If nullptr, the numEventsInWaitList must be 0, indicating no wait + ///< events. + ur_event_handle_t * + phEvent ///< [in,out] return an event object that identifies this particular kernel + ///< execution instance. Profiling information can be queried + ///< from this event as if `hQueue` had profiling enabled. 
Querying + ///< `UR_PROFILING_INFO_COMMAND_QUEUED` or `UR_PROFILING_INFO_COMMAND_SUBMIT` + ///< reports the timestamp at the time of the call to this function. + ///< Querying `UR_PROFILING_INFO_COMMAND_START` or `UR_PROFILING_INFO_COMMAND_END` + ///< reports the timestamp recorded when the command is executed on the device. +) { + ur_result_t result = UR_RESULT_SUCCESS; + + // extract platform's function pointer table + auto dditable = reinterpret_cast(hQueue)->dditable; + auto pfnTimestampRecordingExp = + dditable->ur.EnqueueExp.pfnTimestampRecordingExp; + if (nullptr == pfnTimestampRecordingExp) { + return UR_RESULT_ERROR_UNINITIALIZED; + } + + // convert loader handle to platform handle + hQueue = reinterpret_cast(hQueue)->handle; + + // convert loader handles to platform handles + auto phEventWaitListLocal = + std::vector(numEventsInWaitList); + for (size_t i = 0; i < numEventsInWaitList; ++i) { + phEventWaitListLocal[i] = + reinterpret_cast(phEventWaitList[i])->handle; + } + + // forward to device-platform + result = pfnTimestampRecordingExp(hQueue, blocking, numEventsInWaitList, + phEventWaitListLocal.data(), phEvent); + + if (UR_RESULT_SUCCESS != result) { + return result; + } + + try { + // convert platform handle to loader handle + *phEvent = reinterpret_cast( + ur_event_factory.getInstance(*phEvent, dditable)); + } catch (std::bad_alloc &) { + result = UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + } + + return result; +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Intercept function for urProgramBuildExp __urdlllocal ur_result_t UR_APICALL urProgramBuildExp( @@ -8373,6 +8436,8 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable( // return pointers to loader's DDIs pDdiTable->pfnCooperativeKernelLaunchExp = ur_loader::urEnqueueCooperativeKernelLaunchExp; + pDdiTable->pfnTimestampRecordingExp = + ur_loader::urEnqueueTimestampRecordingExp; } else { // return pointers directly to platform's DDIs 
*pDdiTable = diff --git a/source/loader/ur_libapi.cpp b/source/loader/ur_libapi.cpp index dba668e61b..baa8173976 100644 --- a/source/loader/ur_libapi.cpp +++ b/source/loader/ur_libapi.cpp @@ -842,7 +842,7 @@ ur_result_t UR_APICALL urDeviceGetSelected( /// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE /// + `NULL == hDevice` /// - ::UR_RESULT_ERROR_INVALID_ENUMERATION -/// + `::UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_3D_EXP < propName` +/// + `::UR_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP < propName` /// - ::UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION /// + If `propName` is not supported by the adapter. /// - ::UR_RESULT_ERROR_INVALID_SIZE @@ -8315,6 +8315,52 @@ ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( return exceptionToResult(std::current_exception()); } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Enqueue a command for recording the device timestamp +/// +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_UNINITIALIZED +/// - ::UR_RESULT_ERROR_DEVICE_LOST +/// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC +/// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE +/// + `NULL == hQueue` +/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER +/// + `NULL == phEvent` +/// - ::UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST +ur_result_t UR_APICALL urEnqueueTimestampRecordingExp( + ur_queue_handle_t hQueue, ///< [in] handle of the queue object + bool + blocking, ///< [in] indicates whether the call to this function should block until + ///< until the device timestamp recording command has executed on the + ///< device. + uint32_t numEventsInWaitList, ///< [in] size of the event wait list + const ur_event_handle_t * + phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the kernel execution. + ///< If nullptr, the numEventsInWaitList must be 0, indicating no wait + ///< events. 
+ ur_event_handle_t * + phEvent ///< [in,out] return an event object that identifies this particular kernel + ///< execution instance. Profiling information can be queried + ///< from this event as if `hQueue` had profiling enabled. Querying + ///< `UR_PROFILING_INFO_COMMAND_QUEUED` or `UR_PROFILING_INFO_COMMAND_SUBMIT` + ///< reports the timestamp at the time of the call to this function. + ///< Querying `UR_PROFILING_INFO_COMMAND_START` or `UR_PROFILING_INFO_COMMAND_END` + ///< reports the timestamp recorded when the command is executed on the device. + ) try { + auto pfnTimestampRecordingExp = + ur_lib::context->urDdiTable.EnqueueExp.pfnTimestampRecordingExp; + if (nullptr == pfnTimestampRecordingExp) { + return UR_RESULT_ERROR_UNINITIALIZED; + } + + return pfnTimestampRecordingExp(hQueue, blocking, numEventsInWaitList, + phEventWaitList, phEvent); +} catch (...) { + return exceptionToResult(std::current_exception()); +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Produces an executable program from one program, negates need for the /// linking step. 
diff --git a/source/loader/ur_print.cpp b/source/loader/ur_print.cpp index 5721ebf3ef..3b144c87ad 100644 --- a/source/loader/ur_print.cpp +++ b/source/loader/ur_print.cpp @@ -1653,6 +1653,14 @@ ur_result_t urPrintEnqueueCooperativeKernelLaunchExpParams( return str_copy(&ss, buffer, buff_size, out_size); } +ur_result_t urPrintEnqueueTimestampRecordingExpParams( + const struct ur_enqueue_timestamp_recording_exp_params_t *params, + char *buffer, const size_t buff_size, size_t *out_size) { + std::stringstream ss; + ss << params; + return str_copy(&ss, buffer, buff_size, out_size); +} + ur_result_t urPrintEventGetInfoParams(const struct ur_event_get_info_params_t *params, char *buffer, const size_t buff_size, diff --git a/source/ur_api.cpp b/source/ur_api.cpp index 7f4746fcb7..d21d81411f 100644 --- a/source/ur_api.cpp +++ b/source/ur_api.cpp @@ -736,7 +736,7 @@ ur_result_t UR_APICALL urDeviceGetSelected( /// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE /// + `NULL == hDevice` /// - ::UR_RESULT_ERROR_INVALID_ENUMERATION -/// + `::UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_3D_EXP < propName` +/// + `::UR_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP < propName` /// - ::UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION /// + If `propName` is not supported by the adapter. 
/// - ::UR_RESULT_ERROR_INVALID_SIZE @@ -7034,6 +7034,44 @@ ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( return result; } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Enqueue a command for recording the device timestamp +/// +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_UNINITIALIZED +/// - ::UR_RESULT_ERROR_DEVICE_LOST +/// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC +/// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE +/// + `NULL == hQueue` +/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER +/// + `NULL == phEvent` +/// - ::UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST +ur_result_t UR_APICALL urEnqueueTimestampRecordingExp( + ur_queue_handle_t hQueue, ///< [in] handle of the queue object + bool + blocking, ///< [in] indicates whether the call to this function should block until + ///< until the device timestamp recording command has executed on the + ///< device. + uint32_t numEventsInWaitList, ///< [in] size of the event wait list + const ur_event_handle_t * + phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the kernel execution. + ///< If nullptr, the numEventsInWaitList must be 0, indicating no wait + ///< events. + ur_event_handle_t * + phEvent ///< [in,out] return an event object that identifies this particular kernel + ///< execution instance. Profiling information can be queried + ///< from this event as if `hQueue` had profiling enabled. Querying + ///< `UR_PROFILING_INFO_COMMAND_QUEUED` or `UR_PROFILING_INFO_COMMAND_SUBMIT` + ///< reports the timestamp at the time of the call to this function. + ///< Querying `UR_PROFILING_INFO_COMMAND_START` or `UR_PROFILING_INFO_COMMAND_END` + ///< reports the timestamp recorded when the command is executed on the device. 
+) { + ur_result_t result = UR_RESULT_SUCCESS; + return result; +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Produces an executable program from one program, negates need for the /// linking step. diff --git a/test/adapters/CMakeLists.txt b/test/adapters/CMakeLists.txt index 5eff6e357a..4ce7d2505d 100644 --- a/test/adapters/CMakeLists.txt +++ b/test/adapters/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (C) 2023 Intel Corporation +# Copyright (C) 2023-2024 Intel Corporation # Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. # See LICENSE.TXT # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception diff --git a/test/adapters/level_zero/CMakeLists.txt b/test/adapters/level_zero/CMakeLists.txt index 45d68594ed..678d26bfe8 100644 --- a/test/adapters/level_zero/CMakeLists.txt +++ b/test/adapters/level_zero/CMakeLists.txt @@ -25,3 +25,22 @@ else() add_dependencies(test-adapter-level_zero generate_device_binaries kernel_names_header) endif() + +if(LINUX) + # Make L0 use CallMap from a seprate shared lib so that we can access the map + # from the tests. This only seems to work on linux + add_library(zeCallMap SHARED zeCallMap.cpp) + target_compile_definitions(ur_adapter_level_zero PRIVATE UR_L0_CALL_COUNT_IN_TESTS) + target_link_libraries(ur_adapter_level_zero PRIVATE zeCallMap) + + add_adapter_test(level_zero + FIXTURE DEVICES + SOURCES + event_cache_tests.cpp + ENVIRONMENT + "UR_ADAPTERS_FORCE_LOAD=\"$\"" + "UR_L0_LEAKS_DEBUG=1" + ) + + target_link_libraries(test-adapter-level_zero PRIVATE zeCallMap) +endif() diff --git a/test/adapters/level_zero/event_cache_tests.cpp b/test/adapters/level_zero/event_cache_tests.cpp new file mode 100644 index 0000000000..53bc39ad96 --- /dev/null +++ b/test/adapters/level_zero/event_cache_tests.cpp @@ -0,0 +1,163 @@ +// Copyright (C) 2024 Intel Corporation +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "ur_print.hpp" +#include "uur/fixtures.h" +#include "uur/raii.h" + +#include +#include + +template auto combineFlags(std::tuple tuple) { + return std::apply([](auto... args) { return (... |= args); }, tuple); +} + +extern std::map *ZeCallCount; + +using FlagsTupleType = std::tuple; + +struct urEventCacheTest : uur::urContextTestWithParam { + void SetUp() override { + UUR_RETURN_ON_FATAL_FAILURE(urContextTestWithParam::SetUp()); + + flags = combineFlags(getParam()); + + ur_queue_properties_t props; + props.flags = flags; + ASSERT_SUCCESS(urQueueCreate(context, device, &props, &queue)); + ASSERT_NE(queue, nullptr); + + ASSERT_SUCCESS(urMemBufferCreate(context, UR_MEM_FLAG_WRITE_ONLY, size, + nullptr, &buffer)); + + (*ZeCallCount)["zeEventCreate"] = 0; + (*ZeCallCount)["zeEventDestroy"] = 0; + } + + void TearDown() override { + if (buffer) { + EXPECT_SUCCESS(urMemRelease(buffer)); + } + if (queue) { + UUR_ASSERT_SUCCESS_OR_UNSUPPORTED(urQueueRelease(queue)); + } + UUR_RETURN_ON_FATAL_FAILURE(urContextTestWithParam::TearDown()); + } + + auto enqueueWork(ur_event_handle_t *hEvent, int data) { + input.assign(count, data); + UUR_ASSERT_SUCCESS_OR_UNSUPPORTED(urEnqueueMemBufferWrite( + queue, buffer, false, 0, size, input.data(), 0, nullptr, hEvent)); + } + + void verifyData() { + std::vector output(count, 1); + UUR_ASSERT_SUCCESS_OR_UNSUPPORTED(urEnqueueMemBufferRead( + queue, buffer, true, 0, size, output.data(), 0, nullptr, nullptr)); + + if (!(flags & UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE)) { + ASSERT_EQ(input, output); + } + } + + const size_t count = 1024; + const size_t size = sizeof(uint32_t) * count; + ur_mem_handle_t buffer = nullptr; + ur_queue_handle_t queue = nullptr; + std::vector input; + ur_queue_flags_t flags; +}; + +TEST_P(urEventCacheTest, eventsReuseNoVisibleEvent) { + static constexpr int numIters = 16; + static constexpr int numEnqueues = 128; + + for 
(int i = 0; i < numIters; i++) { + for (int j = 0; j < numEnqueues; j++) { + enqueueWork(nullptr, i * numEnqueues + j); + } + UUR_ASSERT_SUCCESS_OR_UNSUPPORTED(urQueueFinish(queue)); + verifyData(); + } + + // TODO: why events are not reused for UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE? + if ((flags & UR_QUEUE_FLAG_DISCARD_EVENTS) && + !(flags & UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE)) { + ASSERT_EQ((*ZeCallCount)["zeEventCreate"], 2); + } else { + ASSERT_GE((*ZeCallCount)["zeEventCreate"], numIters * numEnqueues); + } +} + +TEST_P(urEventCacheTest, eventsReuseWithVisibleEvent) { + static constexpr int numIters = 16; + static constexpr int numEnqueues = 128; + + for (int i = 0; i < numIters; i++) { + std::vector events(numEnqueues); + for (int j = 0; j < numEnqueues; j++) { + enqueueWork(events[j].ptr(), i * numEnqueues + j); + } + UUR_ASSERT_SUCCESS_OR_UNSUPPORTED(urQueueFinish(queue)); + verifyData(); + } + + ASSERT_LT((*ZeCallCount)["zeEventCreate"], numIters * numEnqueues); +} + +TEST_P(urEventCacheTest, eventsReuseWithVisibleEventAndWait) { + static constexpr int numIters = 16; + static constexpr int numEnqueues = 128; + static constexpr int waitEveryN = 16; + + for (int i = 0; i < numIters; i++) { + std::vector events; + for (int j = 0; j < numEnqueues; j++) { + events.emplace_back(); + enqueueWork(events.back().ptr(), i * numEnqueues + j); + + if (j > 0 && j % waitEveryN == 0) { + ASSERT_SUCCESS(urEventWait(waitEveryN, + (ur_event_handle_t *)events.data())); + verifyData(); + events.clear(); + } + } + UUR_ASSERT_SUCCESS_OR_UNSUPPORTED(urQueueFinish(queue)); + } + + ASSERT_GE((*ZeCallCount)["zeEventCreate"], waitEveryN); + // TODO: why there are more events than this? 
+ // ASSERT_LE((*ZeCallCount)["zeEventCreate"], waitEveryN * 2 + 2); +} + +template +inline std::string +printFlags(const testing::TestParamInfo &info) { + const auto device_handle = std::get<0>(info.param); + const auto platform_device_name = + uur::GetPlatformAndDeviceName(device_handle); + auto flags = combineFlags(std::get<1>(info.param)); + + std::stringstream ss; + ur::details::printFlag(ss, flags); + + auto str = ss.str(); + std::replace(str.begin(), str.end(), ' ', '_'); + std::replace(str.begin(), str.end(), '|', '_'); + return platform_device_name + "__" + str; +} + +UUR_TEST_SUITE_P( + urEventCacheTest, + ::testing::Combine( + testing::Values(0, UR_QUEUE_FLAG_DISCARD_EVENTS), + testing::Values(0, UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE), + // TODO: why the test fails with UR_QUEUE_FLAG_SUBMISSION_BATCHED? + testing::Values( + UR_QUEUE_FLAG_SUBMISSION_IMMEDIATE /*, UR_QUEUE_FLAG_SUBMISSION_BATCHED */), + testing::Values(0, UR_QUEUE_FLAG_PROFILING_ENABLE)), + printFlags); diff --git a/test/adapters/level_zero/zeCallMap.cpp b/test/adapters/level_zero/zeCallMap.cpp new file mode 100644 index 0000000000..3c6487f36d --- /dev/null +++ b/test/adapters/level_zero/zeCallMap.cpp @@ -0,0 +1,12 @@ +// Copyright (C) 2024 Intel Corporation +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. +// See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include +#include + +// Map used by L0 adapter to count the number of calls to each L0 function +// Lifetime is managed by the adapter, this variable is defined here +// only so that we can read it from the tests. 
+std::map *ZeCallCount = nullptr; diff --git a/test/conformance/CMakeLists.txt b/test/conformance/CMakeLists.txt index a11cc82932..31048cce28 100644 --- a/test/conformance/CMakeLists.txt +++ b/test/conformance/CMakeLists.txt @@ -33,7 +33,7 @@ function(add_test_adapter name adapter) endif() set(TEST_ENV UR_ADAPTERS_FORCE_LOAD="$") - if(NOT UR_CONFORMANCE_TEST_DIR) + if(UR_CONFORMANCE_ENABLE_MATCH_FILES) list(APPEND TEST_ENV GTEST_COLOR=no) endif() set_tests_properties(${TEST_NAME} PROPERTIES @@ -135,6 +135,7 @@ if(UR_DPCXX) add_subdirectory(kernel) add_subdirectory(program) add_subdirectory(enqueue) + add_subdirectory(integration) add_subdirectory(exp_command_buffer) add_subdirectory(exp_usm_p2p) else() diff --git a/test/conformance/device/device_adapter_level_zero.match b/test/conformance/device/device_adapter_level_zero.match index b1bff7376c..96a29ca0f5 100644 --- a/test/conformance/device/device_adapter_level_zero.match +++ b/test/conformance/device/device_adapter_level_zero.match @@ -1,18 +1,9 @@ -{{OPT}}urDeviceGetGlobalTimestampTest.SuccessSynchronizedTime urDeviceGetInfoTest.Success/UR_DEVICE_INFO_GLOBAL_MEM_FREE urDeviceGetInfoTest.Success/UR_DEVICE_INFO_ERROR_CORRECTION_SUPPORT urDeviceGetInfoTest.Success/UR_DEVICE_INFO_HOST_UNIFIED_MEMORY -urDeviceGetInfoTest.Success/UR_DEVICE_INFO_ENDIAN_LITTLE urDeviceGetInfoTest.Success/UR_DEVICE_INFO_AVAILABLE -urDeviceGetInfoTest.Success/UR_DEVICE_INFO_COMPILER_AVAILABLE -urDeviceGetInfoTest.Success/UR_DEVICE_INFO_LINKER_AVAILABLE -urDeviceGetInfoTest.Success/UR_DEVICE_INFO_PREFERRED_INTEROP_USER_SYNC -urDeviceGetInfoTest.Success/UR_DEVICE_INFO_SUB_GROUP_INDEPENDENT_FORWARD_PROGRESS urDeviceGetInfoTest.Success/UR_DEVICE_INFO_MAX_MEMORY_BANDWIDTH -urDeviceGetInfoTest.Success/UR_DEVICE_INFO_IMAGE_SRGB urDeviceGetInfoTest.Success/UR_DEVICE_INFO_BUILD_ON_SUBDEVICE -urDeviceGetInfoTest.Success/UR_DEVICE_INFO_ATOMIC_64 urDeviceGetInfoTest.Success/UR_DEVICE_INFO_ASYNC_BARRIER 
-urDeviceGetInfoTest.Success/UR_DEVICE_INFO_MEM_CHANNEL_SUPPORT urDeviceGetInfoTest.Success/UR_DEVICE_INFO_HOST_PIPE_READ_WRITE_SUPPORTED urDeviceGetInfoTest.Success/UR_DEVICE_INFO_MAX_REGISTERS_PER_WORK_GROUP diff --git a/test/conformance/device/device_adapter_opencl.match b/test/conformance/device/device_adapter_opencl.match index 716ebd54fe..e69de29bb2 100644 --- a/test/conformance/device/device_adapter_opencl.match +++ b/test/conformance/device/device_adapter_opencl.match @@ -1 +0,0 @@ -urDeviceGetInfoTest.Success/UR_DEVICE_INFO_HALF_FP_CONFIG diff --git a/test/conformance/device_code/CMakeLists.txt b/test/conformance/device_code/CMakeLists.txt index 26358d49f6..5b6b99004f 100644 --- a/test/conformance/device_code/CMakeLists.txt +++ b/test/conformance/device_code/CMakeLists.txt @@ -10,6 +10,10 @@ else() set(AMD_ARCH "${UR_CONFORMANCE_AMD_ARCH}") endif() +cmake_path(GET UR_DPCXX EXTENSION EXE) +cmake_path(REPLACE_FILENAME UR_DPCXX "clang-offload-extract${EXE}" OUTPUT_VARIABLE DEFAULT_EXTRACTOR_NAME) +set(UR_DEVICE_CODE_EXTRACTOR "${DEFAULT_EXTRACTOR_NAME}" CACHE PATH "Path to clang-offload-extract") + if("${AMD_ARCH}" STREQUAL "" AND "${TARGET_TRIPLES}" MATCHES "amd") find_package(RocmAgentEnumerator) if(NOT ROCM_AGENT_ENUMERATOR_FOUND) @@ -59,6 +63,8 @@ macro(add_device_binary SOURCE_FILE) foreach(TRIPLE ${TARGET_TRIPLES}) set(EXE_PATH "${DEVICE_BINARY_DIR}/${KERNEL_NAME}_${TRIPLE}") + set(BIN_PATH "${DEVICE_BINARY_DIR}/${TRIPLE}.bin.0") + if(${TRIPLE} MATCHES "amd") set(AMD_TARGET_BACKEND -Xsycl-target-backend=${TRIPLE}) set(AMD_OFFLOAD_ARCH --offload-arch=${AMD_ARCH}) @@ -81,17 +87,17 @@ macro(add_device_binary SOURCE_FILE) continue() endif() - add_custom_command(OUTPUT ${EXE_PATH} + add_custom_command(OUTPUT "${BIN_PATH}" COMMAND ${UR_DPCXX} -fsycl -fsycl-targets=${TRIPLE} -fsycl-device-code-split=off ${AMD_TARGET_BACKEND} ${AMD_OFFLOAD_ARCH} ${AMD_NOGPULIB} ${DPCXX_BUILD_FLAGS_LIST} ${SOURCE_FILE} -o ${EXE_PATH} - COMMAND ${CMAKE_COMMAND} -E env 
${EXTRA_ENV} SYCL_DUMP_IMAGES=true - ${EXE_PATH} || exit 0 + COMMAND ${CMAKE_COMMAND} -E env ${EXTRA_ENV} ${UR_DEVICE_CODE_EXTRACTOR} --stem="${TRIPLE}.bin" ${EXE_PATH} + WORKING_DIRECTORY "${DEVICE_BINARY_DIR}" DEPENDS ${SOURCE_FILE} ) - add_custom_target(generate_${KERNEL_NAME}_${TRIPLE} DEPENDS ${EXE_PATH}) + add_custom_target(generate_${KERNEL_NAME}_${TRIPLE} DEPENDS ${BIN_PATH}) add_dependencies(generate_device_binaries generate_${KERNEL_NAME}_${TRIPLE}) endforeach() list(APPEND DEVICE_CODE_SOURCES ${SOURCE_FILE}) @@ -106,6 +112,9 @@ add_device_binary(${CMAKE_CURRENT_SOURCE_DIR}/fill_usm.cpp) add_device_binary(${CMAKE_CURRENT_SOURCE_DIR}/foo.cpp) add_device_binary(${CMAKE_CURRENT_SOURCE_DIR}/image_copy.cpp) add_device_binary(${CMAKE_CURRENT_SOURCE_DIR}/mean.cpp) +add_device_binary(${CMAKE_CURRENT_SOURCE_DIR}/cpy_and_mult.cpp) +add_device_binary(${CMAKE_CURRENT_SOURCE_DIR}/cpy_and_mult_usm.cpp) +add_device_binary(${CMAKE_CURRENT_SOURCE_DIR}/multiply.cpp) add_device_binary(${CMAKE_CURRENT_SOURCE_DIR}/spec_constant.cpp) add_device_binary(${CMAKE_CURRENT_SOURCE_DIR}/spec_constant_multiple.cpp) add_device_binary(${CMAKE_CURRENT_SOURCE_DIR}/usm_ll.cpp) diff --git a/test/conformance/device_code/cpy_and_mult.cpp b/test/conformance/device_code/cpy_and_mult.cpp new file mode 100644 index 0000000000..a2bdaccf55 --- /dev/null +++ b/test/conformance/device_code/cpy_and_mult.cpp @@ -0,0 +1,29 @@ +// Copyright (C) 2024 Intel Corporation +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include + +int main() { + size_t array_size = 16; + cl::sycl::queue sycl_queue; + std::vector src(array_size, 1); + std::vector dst(array_size, 1); + auto src_buff = + cl::sycl::buffer(src.data(), cl::sycl::range<1>(array_size)); + auto dst_buff = + cl::sycl::buffer(dst.data(), cl::sycl::range<1>(array_size)); + + sycl_queue.submit([&](cl::sycl::handler &cgh) { + auto src_acc = src_buff.get_access(cgh); + auto dst_acc = dst_buff.get_access(cgh); + cgh.parallel_for( + cl::sycl::range<1>{array_size}, + [src_acc, dst_acc](cl::sycl::item<1> itemId) { + auto id = itemId.get_id(0); + dst_acc[id] = src_acc[id] * 2; + }); + }); + return 0; +} diff --git a/test/conformance/device_code/cpy_and_mult_usm.cpp b/test/conformance/device_code/cpy_and_mult_usm.cpp new file mode 100644 index 0000000000..e253dfe14b --- /dev/null +++ b/test/conformance/device_code/cpy_and_mult_usm.cpp @@ -0,0 +1,22 @@ +// Copyright (C) 2024 Intel Corporation +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. +// See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include + +int main() { + size_t array_size = 16; + cl::sycl::queue sycl_queue; + uint32_t *src = cl::sycl::malloc_device(array_size, sycl_queue); + uint32_t *dst = cl::sycl::malloc_device(array_size, sycl_queue); + sycl_queue.submit([&](cl::sycl::handler &cgh) { + cgh.parallel_for( + cl::sycl::range<1>{array_size}, + [src, dst](cl::sycl::item<1> itemId) { + auto id = itemId.get_id(0); + dst[id] = src[id] * 2; + }); + }); + return 0; +} diff --git a/test/conformance/device_code/multiply.cpp b/test/conformance/device_code/multiply.cpp new file mode 100644 index 0000000000..070cef18fd --- /dev/null +++ b/test/conformance/device_code/multiply.cpp @@ -0,0 +1,20 @@ +// Copyright (C) 2024 Intel Corporation +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include + +int main() { + + const size_t inputSize = 1; + sycl::queue sycl_queue; + uint32_t *inputArray = sycl::malloc_shared(inputSize, sycl_queue); + + sycl_queue.submit([&](sycl::handler &cgh) { + cgh.parallel_for( + sycl::range<1>(inputSize), + [=](sycl::id<1> itemID) { inputArray[itemID] *= 2; }); + }); + return 0; +} diff --git a/test/conformance/enqueue/CMakeLists.txt b/test/conformance/enqueue/CMakeLists.txt index 532cab1b85..7cc68203a0 100644 --- a/test/conformance/enqueue/CMakeLists.txt +++ b/test/conformance/enqueue/CMakeLists.txt @@ -29,4 +29,5 @@ add_conformance_test_with_kernels_environment(enqueue urEnqueueUSMPrefetch.cpp urEnqueueReadHostPipe.cpp urEnqueueWriteHostPipe.cpp + urEnqueueTimestampRecording.cpp ) diff --git a/test/conformance/enqueue/urEnqueueTimestampRecording.cpp b/test/conformance/enqueue/urEnqueueTimestampRecording.cpp new file mode 100644 index 0000000000..5fc8ee5547 --- /dev/null +++ b/test/conformance/enqueue/urEnqueueTimestampRecording.cpp @@ -0,0 +1,94 @@ +// Copyright (C) 2024 Intel Corporation +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. +// See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include + +struct urEnqueueTimestampRecordingExpTest : uur::urQueueTest { + void SetUp() override { + UUR_RETURN_ON_FATAL_FAILURE(urQueueTest::SetUp()); + bool timestamp_recording_support = false; + ASSERT_SUCCESS(uur::GetTimestampRecordingSupport( + device, timestamp_recording_support)); + if (!timestamp_recording_support) { + GTEST_SKIP() << "Timestamp recording is not supported"; + } + } + + void TearDown() override { urQueueTest::TearDown(); } +}; +UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(urEnqueueTimestampRecordingExpTest); + +void common_check(ur_event_handle_t event) { + // All successful runs should return a non-zero profiling results. 
+ uint64_t queuedTime = 0, submitTime = 0, startTime = 0, endTime = 0; + ASSERT_SUCCESS( + urEventGetProfilingInfo(event, UR_PROFILING_INFO_COMMAND_QUEUED, + sizeof(uint64_t), &queuedTime, nullptr)); + ASSERT_SUCCESS( + urEventGetProfilingInfo(event, UR_PROFILING_INFO_COMMAND_SUBMIT, + sizeof(uint64_t), &submitTime, nullptr)); + ASSERT_SUCCESS( + urEventGetProfilingInfo(event, UR_PROFILING_INFO_COMMAND_START, + sizeof(uint64_t), &startTime, nullptr)); + ASSERT_SUCCESS(urEventGetProfilingInfo(event, UR_PROFILING_INFO_COMMAND_END, + sizeof(uint64_t), &endTime, + nullptr)); + ASSERT_TRUE(queuedTime > 0); + ASSERT_TRUE(submitTime > 0); + ASSERT_TRUE(startTime > 0); + ASSERT_TRUE(endTime > 0); + ASSERT_TRUE(queuedTime == submitTime); + ASSERT_TRUE(startTime == endTime); + ASSERT_TRUE(endTime >= submitTime); +} + +TEST_P(urEnqueueTimestampRecordingExpTest, Success) { + ur_event_handle_t event = nullptr; + ASSERT_SUCCESS( + urEnqueueTimestampRecordingExp(queue, false, 0, nullptr, &event)); + ASSERT_SUCCESS(urQueueFinish(queue)); + common_check(event); + ASSERT_SUCCESS(urEventRelease(event)); +} + +TEST_P(urEnqueueTimestampRecordingExpTest, SuccessBlocking) { + ur_event_handle_t event = nullptr; + ASSERT_SUCCESS( + urEnqueueTimestampRecordingExp(queue, true, 0, nullptr, &event)); + common_check(event); + ASSERT_SUCCESS(urEventRelease(event)); +} + +TEST_P(urEnqueueTimestampRecordingExpTest, InvalidNullHandleQueue) { + ur_event_handle_t event = nullptr; + ASSERT_EQ_RESULT( + urEnqueueTimestampRecordingExp(nullptr, false, 0, nullptr, &event), + UR_RESULT_ERROR_INVALID_NULL_HANDLE); +} + +TEST_P(urEnqueueTimestampRecordingExpTest, InvalidNullPointerEvent) { + ASSERT_EQ_RESULT( + urEnqueueTimestampRecordingExp(queue, false, 0, nullptr, nullptr), + UR_RESULT_ERROR_INVALID_NULL_POINTER); +} + +TEST_P(urEnqueueTimestampRecordingExpTest, InvalidNullPtrEventWaitList) { + ur_event_handle_t event = nullptr; + ASSERT_EQ_RESULT( + urEnqueueTimestampRecordingExp(queue, true, 1, nullptr, 
&event), + UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST); + + ur_event_handle_t validEvent; + ASSERT_SUCCESS(urEnqueueEventsWait(queue, 0, nullptr, &validEvent)); + ASSERT_EQ_RESULT( + urEnqueueTimestampRecordingExp(queue, true, 0, &validEvent, &event), + UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST); + ASSERT_SUCCESS(urEventRelease(validEvent)); + + ur_event_handle_t invalidEvent = nullptr; + ASSERT_EQ_RESULT( + urEnqueueTimestampRecordingExp(queue, true, 0, &invalidEvent, &event), + UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST); +} diff --git a/test/conformance/integration/CMakeLists.txt b/test/conformance/integration/CMakeLists.txt new file mode 100644 index 0000000000..1689cf04f4 --- /dev/null +++ b/test/conformance/integration/CMakeLists.txt @@ -0,0 +1,10 @@ +# Copyright (C) 2024 Intel Corporation +# Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. +# See LICENSE.TXT +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +add_conformance_test_with_kernels_environment(integration + QueueEmptyStatus.cpp + QueueUSM.cpp + QueueBuffer.cpp + ) diff --git a/test/conformance/integration/QueueBuffer.cpp b/test/conformance/integration/QueueBuffer.cpp new file mode 100644 index 0000000000..d801ebf684 --- /dev/null +++ b/test/conformance/integration/QueueBuffer.cpp @@ -0,0 +1,108 @@ +// Copyright (C) 2024 Intel Corporation +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "fixtures.h" +#include +#include + +struct QueueBufferTestWithParam : uur::IntegrationQueueTestWithParam { + void SetUp() override { + program_name = "cpy_and_mult"; + UUR_RETURN_ON_FATAL_FAILURE( + uur::IntegrationQueueTestWithParam::SetUp()); + } + + void TearDown() override { uur::IntegrationQueueTestWithParam::TearDown(); } + + void verifyResults(ur_mem_handle_t Buffer, uint32_t ExpectedValue) { + uint32_t HostMem[ArraySize] = {}; + ASSERT_SUCCESS(urEnqueueMemBufferRead(Queue, Buffer, true, 0, + sizeof(uint32_t) * ArraySize, + HostMem, 0, nullptr, nullptr)); + + for (uint32_t i : HostMem) { + ASSERT_EQ(i, ExpectedValue); + } + } + + ur_mem_handle_t Buffer1 = nullptr; + ur_mem_handle_t Buffer2 = nullptr; +}; + +UUR_TEST_SUITE_P(QueueBufferTestWithParam, + testing::Values(0, /* In-Order */ + UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE), + uur::IntegrationQueueTestWithParam::paramPrinter); + +/* Submits multiple kernels that interact with each other by accessing and + * writing to the same buffers. + * Checks that when using an IN_ORDER queue, no synchronization is needed + * between calls to urEnqueueKernelLaunch. + * Checks that when using an OUT_OF_ORDER queue, synchronizing using only + * event barriers is enough. 
*/ +TEST_P(QueueBufferTestWithParam, QueueBufferTest) { + + std::vector EventsFill; + ur_event_handle_t Event; + + size_t Buffer1Index; + size_t Buffer2Index; + ASSERT_NO_FATAL_FAILURE( + AddBuffer1DArg(ArraySize * sizeof(uint32_t), &Buffer1, &Buffer1Index)); + ASSERT_NO_FATAL_FAILURE( + AddBuffer1DArg(ArraySize * sizeof(uint32_t), &Buffer2, &Buffer2Index)); + + ASSERT_SUCCESS(urEnqueueMemBufferFill( + Queue, Buffer1, &InitialValue, sizeof(uint32_t), 0, + ArraySize * sizeof(uint32_t), 0, nullptr, &Event)); + EventsFill.push_back(Event); + + ASSERT_SUCCESS(urEnqueueMemBufferFill( + Queue, Buffer2, &InitialValue, sizeof(uint32_t), 0, + ArraySize * sizeof(uint32_t), 0, nullptr, &Event)); + EventsFill.push_back(Event); + + ASSERT_NO_FATAL_FAILURE(submitBarrierIfNeeded(EventsFill)); + + constexpr size_t GlobalOffset = 0; + constexpr size_t NDimensions = 1; + constexpr uint32_t NumIterations = 5; + + uint32_t CurValueMem1 = InitialValue; + uint32_t CurValueMem2 = InitialValue; + for (uint32_t i = 0; i < NumIterations; ++i) { + + /* Copy from DeviceMem1 to DeviceMem2 and multiply by 2 */ + ASSERT_SUCCESS( + urKernelSetArgMemObj(kernel, Buffer2Index, nullptr, Buffer2)); + ASSERT_SUCCESS( + urKernelSetArgMemObj(kernel, Buffer1Index, nullptr, Buffer1)); + + ASSERT_SUCCESS(urEnqueueKernelLaunch(Queue, kernel, NDimensions, + &GlobalOffset, &ArraySize, nullptr, + 0, nullptr, &Event)); + ASSERT_NO_FATAL_FAILURE(submitBarrierIfNeeded(Event)); + + CurValueMem2 = CurValueMem1 * 2; + + /* Copy from DeviceMem1 to DeviceMem2 and multiply by 2 */ + ASSERT_SUCCESS( + urKernelSetArgMemObj(kernel, Buffer1Index, nullptr, Buffer2)); + ASSERT_SUCCESS( + urKernelSetArgMemObj(kernel, Buffer2Index, nullptr, Buffer1)); + + ASSERT_SUCCESS(urEnqueueKernelLaunch(Queue, kernel, NDimensions, + &GlobalOffset, &ArraySize, nullptr, + 0, nullptr, &Event)); + ASSERT_NO_FATAL_FAILURE(submitBarrierIfNeeded(Event)); + + CurValueMem1 = CurValueMem2 * 2; + } + + ASSERT_SUCCESS(urQueueFinish(Queue)); + + 
ASSERT_NO_FATAL_FAILURE(verifyResults(Buffer1, CurValueMem1)); + ASSERT_NO_FATAL_FAILURE(verifyResults(Buffer2, CurValueMem2)); +} diff --git a/test/conformance/integration/QueueEmptyStatus.cpp b/test/conformance/integration/QueueEmptyStatus.cpp new file mode 100644 index 0000000000..b8f1517b70 --- /dev/null +++ b/test/conformance/integration/QueueEmptyStatus.cpp @@ -0,0 +1,107 @@ +// Copyright (C) 2024 Intel Corporation +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. +// See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "fixtures.h" +#include +#include + +struct QueueEmptyStatusTestWithParam : uur::IntegrationQueueTestWithParam { + + void SetUp() override { + + program_name = "multiply"; + UUR_RETURN_ON_FATAL_FAILURE( + uur::IntegrationQueueTestWithParam::SetUp()); + + ur_device_usm_access_capability_flags_t shared_usm_flags = 0; + ASSERT_SUCCESS( + uur::GetDeviceUSMSingleSharedSupport(device, shared_usm_flags)); + if (!(shared_usm_flags & UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS)) { + GTEST_SKIP() << "Shared USM is not supported."; + } + + ASSERT_SUCCESS(urUSMSharedAlloc(context, device, nullptr, nullptr, + ArraySize * sizeof(uint32_t), + &SharedMem)); + } + + void TearDown() override { + ASSERT_SUCCESS(urUSMFree(context, SharedMem)); + uur::IntegrationQueueTestWithParam::TearDown(); + } + + void submitWorkToQueue() { + ur_event_handle_t Event; + ASSERT_SUCCESS( + urEnqueueUSMFill(Queue, SharedMem, sizeof(uint32_t), &InitialValue, + ArraySize * sizeof(uint32_t), 0, nullptr, &Event)); + ASSERT_NO_FATAL_FAILURE(submitBarrierIfNeeded(Event)); + + ASSERT_SUCCESS(urKernelSetArgPointer(kernel, 0, nullptr, &SharedMem)); + + constexpr size_t global_offset = 0; + constexpr size_t n_dimensions = 1; + constexpr uint32_t num_iterations = 5; + for (uint32_t i = 0; i < num_iterations; ++i) { + ASSERT_SUCCESS(urEnqueueKernelLaunch(Queue, kernel, n_dimensions, + &global_offset, &ArraySize, + 
nullptr, 0, nullptr, &Event)); + ASSERT_NO_FATAL_FAILURE(submitBarrierIfNeeded(Event)); + } + + ASSERT_SUCCESS(urQueueFlush(Queue)); + } + + void waitUntilQueueEmpty() const { + + using namespace std::chrono_literals; + + constexpr auto step = 500ms; + constexpr auto maxWait = 5000ms; + + /* Wait a bit until work finishes running. We don't synchronize with + * urQueueFinish() because we want to check if the status is set without + * calling it explicitly. */ + for (auto currentWait = 0ms; currentWait < maxWait; + currentWait += step) { + std::this_thread::sleep_for(step); + + ur_bool_t is_queue_empty; + ASSERT_SUCCESS(urQueueGetInfo(Queue, UR_QUEUE_INFO_EMPTY, + sizeof(ur_bool_t), &is_queue_empty, + nullptr)); + if (is_queue_empty) { + return; + } + } + + /* If we are here, the test failed. Let's call queue finish to avoid + * issues when freeing memory */ + ASSERT_SUCCESS(urQueueFinish(Queue)); + GTEST_FAIL(); + } + + void *SharedMem = nullptr; +}; + +UUR_TEST_SUITE_P(QueueEmptyStatusTestWithParam, + testing::Values(0, /* In-Order */ + UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE), + uur::IntegrationQueueTestWithParam::paramPrinter); + +/* Submits kernels that have a dependency on each other and checks that the + * queue submits all the work in the correct order to the device. 
+ * Explicit synchronization (except for barriers) is avoided in these tests to + * check that the properties of In-Order and OutOfOrder queues are working as + * expected */ +TEST_P(QueueEmptyStatusTestWithParam, QueueEmptyStatusTest) { + ASSERT_NO_FATAL_FAILURE(submitWorkToQueue()); + ASSERT_NO_FATAL_FAILURE(waitUntilQueueEmpty()); + + constexpr size_t expected_value = 3200; + for (uint32_t i = 0; i < ArraySize; ++i) { + ASSERT_EQ(reinterpret_cast(SharedMem)[i], expected_value); + } +} diff --git a/test/conformance/integration/QueueUSM.cpp b/test/conformance/integration/QueueUSM.cpp new file mode 100644 index 0000000000..cc8201453a --- /dev/null +++ b/test/conformance/integration/QueueUSM.cpp @@ -0,0 +1,117 @@ +// Copyright (C) 2024 Intel Corporation +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. +// See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "fixtures.h" +#include +#include + +struct QueueUSMTestWithParam : uur::IntegrationQueueTestWithParam { + void SetUp() override { + program_name = "cpy_and_mult_usm"; + UUR_RETURN_ON_FATAL_FAILURE( + uur::IntegrationQueueTestWithParam::SetUp()); + + ur_device_usm_access_capability_flags_t shared_usm_flags = 0; + ASSERT_SUCCESS( + uur::GetDeviceUSMSingleSharedSupport(device, shared_usm_flags)); + if (!(shared_usm_flags & UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS)) { + GTEST_SKIP() << "Shared USM is not supported."; + } + + ASSERT_SUCCESS(urUSMDeviceAlloc(context, device, nullptr, nullptr, + ArraySize * sizeof(uint32_t), + &DeviceMem1)); + + ASSERT_SUCCESS(urUSMDeviceAlloc(context, device, nullptr, nullptr, + ArraySize * sizeof(uint32_t), + &DeviceMem2)); + } + + void TearDown() override { + ASSERT_SUCCESS(urUSMFree(context, DeviceMem1)); + ASSERT_SUCCESS(urUSMFree(context, DeviceMem2)); + uur::IntegrationQueueTestWithParam::TearDown(); + } + + void verifyResults(void *DeviceMem, uint32_t ExpectedValue) { + uint32_t HostMem[ArraySize] 
= {}; + ASSERT_SUCCESS(urEnqueueUSMMemcpy(Queue, true, HostMem, DeviceMem, + sizeof(uint32_t) * ArraySize, 0, + nullptr, nullptr)); + + for (uint32_t i : HostMem) { + ASSERT_EQ(i, ExpectedValue); + } + } + + void *DeviceMem1 = nullptr; + void *DeviceMem2 = nullptr; +}; + +UUR_TEST_SUITE_P(QueueUSMTestWithParam, + testing::Values(0, /* In-Order */ + UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE), + uur::IntegrationQueueTestWithParam::paramPrinter); + +/* Submits multiple kernels that interact with each other by accessing and + * writing to the same USM memory locations. + * Checks that when using an IN_ORDER queue, no synchronization is needed + * between calls to urEnqueueKernelLaunch. + * Checks that when using an OUT_OF_ORDER queue, synchronizing using only + * event barriers is enough. */ +TEST_P(QueueUSMTestWithParam, QueueUSMTest) { + + std::vector EventsFill; + ur_event_handle_t Event; + ASSERT_SUCCESS(urEnqueueUSMFill(Queue, DeviceMem1, sizeof(uint32_t), + &InitialValue, ArraySize * sizeof(uint32_t), + 0, nullptr, &Event)); + EventsFill.push_back(Event); + + ASSERT_SUCCESS(urEnqueueUSMFill(Queue, DeviceMem2, sizeof(uint32_t), + &InitialValue, ArraySize * sizeof(uint32_t), + 0, nullptr, &Event)); + EventsFill.push_back(Event); + + ASSERT_NO_FATAL_FAILURE(submitBarrierIfNeeded(EventsFill)); + + constexpr size_t GlobalOffset = 0; + constexpr size_t NDimensions = 1; + constexpr uint32_t NumIterations = 5; + + uint32_t CurValueMem1 = InitialValue; + uint32_t CurValueMem2 = InitialValue; + + std::vector EventsKernel; + + for (uint32_t i = 0; i < NumIterations; ++i) { + /* Copy from DeviceMem2 to DeviceMem1 and multiply by 2 */ + ASSERT_SUCCESS(urKernelSetArgPointer(kernel, 0, nullptr, &DeviceMem1)); + ASSERT_SUCCESS(urKernelSetArgPointer(kernel, 1, nullptr, &DeviceMem2)); + + ASSERT_SUCCESS(urEnqueueKernelLaunch(Queue, kernel, NDimensions, + &GlobalOffset, &ArraySize, nullptr, + 0, nullptr, &Event)); + ASSERT_NO_FATAL_FAILURE(submitBarrierIfNeeded(Event)); + + 
CurValueMem2 = CurValueMem1 * 2; + + /* Copy from DeviceMem1 to DeviceMem2 and multiply by 2 */ + ASSERT_SUCCESS(urKernelSetArgPointer(kernel, 0, nullptr, &DeviceMem2)); + ASSERT_SUCCESS(urKernelSetArgPointer(kernel, 1, nullptr, &DeviceMem1)); + + ASSERT_SUCCESS(urEnqueueKernelLaunch(Queue, kernel, NDimensions, + &GlobalOffset, &ArraySize, nullptr, + 0, nullptr, &Event)); + ASSERT_NO_FATAL_FAILURE(submitBarrierIfNeeded(Event)); + + CurValueMem1 = CurValueMem2 * 2; + } + + ASSERT_SUCCESS(urQueueFinish(Queue)); + + ASSERT_NO_FATAL_FAILURE(verifyResults(DeviceMem1, CurValueMem1)); + ASSERT_NO_FATAL_FAILURE(verifyResults(DeviceMem2, CurValueMem2)); +} diff --git a/test/conformance/integration/fixtures.h b/test/conformance/integration/fixtures.h new file mode 100644 index 0000000000..aca70a5245 --- /dev/null +++ b/test/conformance/integration/fixtures.h @@ -0,0 +1,76 @@ +// Copyright (C) 2024 Intel Corporation +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#ifndef UR_CONFORMANCE_INTEGRATION_FIXTURES_H +#define UR_CONFORMANCE_INTEGRATION_FIXTURES_H + +#include + +namespace uur { + +struct IntegrationQueueTestWithParam + : uur::urKernelExecutionTestWithParam { + void SetUp() override { + UUR_RETURN_ON_FATAL_FAILURE( + uur::urKernelExecutionTestWithParam::SetUp()); + + QueueFlags = getParam(); + ur_queue_properties_t queue_properties = { + UR_STRUCTURE_TYPE_QUEUE_PROPERTIES, nullptr, QueueFlags}; + ASSERT_SUCCESS( + urQueueCreate(context, device, &queue_properties, &Queue)); + } + + void TearDown() override { + for (ur_event_handle_t Event : AllEvents) { + ASSERT_SUCCESS(urEventRelease(Event)); + } + + UUR_RETURN_ON_FATAL_FAILURE( + uur::urKernelExecutionTestWithParam::TearDown()); + } + + void submitBarrierIfNeeded(std::vector &(Events)) { + if (QueueFlags == UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE) { + ASSERT_SUCCESS(urEnqueueEventsWaitWithBarrier( + Queue, Events.size(), Events.data(), nullptr)); + AllEvents.insert(AllEvents.end(), Events.begin(), Events.end()); + } + } + + void submitBarrierIfNeeded(ur_event_handle_t Event) { + if (QueueFlags == UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE) { + ASSERT_SUCCESS( + urEnqueueEventsWaitWithBarrier(Queue, 1, &Event, nullptr)); + AllEvents.push_back(Event); + } + } + + std::vector AllEvents; + ur_queue_flags_t QueueFlags{}; + ur_queue_handle_t Queue{}; + static constexpr size_t ArraySize = 100; + static constexpr uint32_t InitialValue = 100; + + static std::string + paramPrinter(const ::testing::TestParamInfo< + std::tuple> &info) { + auto device = std::get<0>(info.param); + auto param = std::get<1>(info.param); + + std::stringstream ss; + if (param == 0) { + ss << "IN_ORDER_QUEUE"; + } + if (param == UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE) { + ss << "OUT_OF_ORDER_QUEUE"; + } + + return uur::GetPlatformAndDeviceName(device) + "__" + ss.str(); + } +}; +} // namespace uur + +#endif 
//UR_CONFORMANCE_INTEGRATION_FIXTURES_H diff --git a/test/conformance/integration/integration_adapter_cuda.match b/test/conformance/integration/integration_adapter_cuda.match new file mode 100644 index 0000000000..e69de29bb2 diff --git a/test/conformance/integration/integration_adapter_hip.match b/test/conformance/integration/integration_adapter_hip.match new file mode 100644 index 0000000000..7016ca68d6 --- /dev/null +++ b/test/conformance/integration/integration_adapter_hip.match @@ -0,0 +1,3 @@ +{{OPT}}QueueEmptyStatusTestWithParam.QueueEmptyStatusTest/AMD_HIP_BACKEND___{{.*}}___IN_ORDER_QUEUE +{{OPT}}QueueEmptyStatusTestWithParam.QueueEmptyStatusTest/AMD_HIP_BACKEND___{{.*}}___OUT_OF_ORDER_QUEUE +{{OPT}}{{Segmentation fault|Aborted}} diff --git a/test/conformance/integration/integration_adapter_level_zero.match b/test/conformance/integration/integration_adapter_level_zero.match new file mode 100644 index 0000000000..905fdea60f --- /dev/null +++ b/test/conformance/integration/integration_adapter_level_zero.match @@ -0,0 +1,6 @@ +{{OPT}}QueueEmptyStatusTestWithParam.QueueEmptyStatusTest/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___IN_ORDER_QUEUE +{{OPT}}QueueEmptyStatusTestWithParam.QueueEmptyStatusTest/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___OUT_OF_ORDER_QUEUE +{{OPT}}QueueUSMTestWithParam.QueueUSMTest/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___IN_ORDER_QUEUE +{{OPT}}QueueUSMTestWithParam.QueueUSMTest/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___OUT_OF_ORDER_QUEUE +{{OPT}}QueueBufferTestWithParam.QueueBufferTest/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___IN_ORDER_QUEUE +{{OPT}}QueueBufferTestWithParam.QueueBufferTest/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___OUT_OF_ORDER_QUEUE diff --git a/test/conformance/integration/integration_adapter_native_cpu.match b/test/conformance/integration/integration_adapter_native_cpu.match new file mode 100644 index 
0000000000..07afde2cef --- /dev/null +++ b/test/conformance/integration/integration_adapter_native_cpu.match @@ -0,0 +1,7 @@ +QueueEmptyStatusTestWithParam.QueueEmptyStatusTest/SYCL_NATIVE_CPU___SYCL_Native_CPU___IN_ORDER_QUEUE +QueueEmptyStatusTestWithParam.QueueEmptyStatusTest/SYCL_NATIVE_CPU___SYCL_Native_CPU___OUT_OF_ORDER_QUEUE +QueueUSMTestWithParam.QueueUSMTest/SYCL_NATIVE_CPU___SYCL_Native_CPU___IN_ORDER_QUEUE +QueueUSMTestWithParam.QueueUSMTest/SYCL_NATIVE_CPU___SYCL_Native_CPU___OUT_OF_ORDER_QUEUE +QueueBufferTestWithParam.QueueBufferTest/SYCL_NATIVE_CPU___SYCL_Native_CPU___IN_ORDER_QUEUE +QueueBufferTestWithParam.QueueBufferTest/SYCL_NATIVE_CPU___SYCL_Native_CPU___OUT_OF_ORDER_QUEUE +{{OPT}}{{Segmentation fault|Aborted}} diff --git a/test/conformance/integration/integration_adapter_opencl.match b/test/conformance/integration/integration_adapter_opencl.match new file mode 100644 index 0000000000..57a5299327 --- /dev/null +++ b/test/conformance/integration/integration_adapter_opencl.match @@ -0,0 +1,6 @@ +QueueEmptyStatusTestWithParam.QueueEmptyStatusTest/Intel_R__OpenCL___{{.*}}___IN_ORDER_QUEUE +QueueEmptyStatusTestWithParam.QueueEmptyStatusTest/Intel_R__OpenCL___{{.*}}___OUT_OF_ORDER_QUEUE +QueueUSMTestWithParam.QueueUSMTest/Intel_R__OpenCL___{{.*}}___IN_ORDER_QUEUE +QueueUSMTestWithParam.QueueUSMTest/Intel_R__OpenCL___{{.*}}___OUT_OF_ORDER_QUEUE +QueueBufferTestWithParam.QueueBufferTest/Intel_R__OpenCL___{{.*}}___IN_ORDER_QUEUE +QueueBufferTestWithParam.QueueBufferTest/Intel_R__OpenCL___{{.*}}___OUT_OF_ORDER_QUEUE diff --git a/test/conformance/program/program_adapter_native_cpu.match b/test/conformance/program/program_adapter_native_cpu.match index c509c67d3e..9a5b0a9830 100644 --- a/test/conformance/program/program_adapter_native_cpu.match +++ b/test/conformance/program/program_adapter_native_cpu.match @@ -138,3 +138,4 @@ {{OPT}}urProgramSetSpecializationConstantsTest.InvalidSizeCount/SYCL_NATIVE_CPU___SYCL_Native_CPU_ 
{{OPT}}urProgramSetMultipleSpecializationConstantsTest.MultipleCalls/SYCL_NATIVE_CPU___SYCL_Native_CPU_ {{OPT}}urProgramSetMultipleSpecializationConstantsTest.SingleCall/SYCL_NATIVE_CPU___SYCL_Native_CPU_ +{{OPT}}{{Segmentation fault|Aborted}} diff --git a/test/conformance/program/urProgramBuild.cpp b/test/conformance/program/urProgramBuild.cpp index f99b94321e..a7e7e4a275 100644 --- a/test/conformance/program/urProgramBuild.cpp +++ b/test/conformance/program/urProgramBuild.cpp @@ -30,8 +30,7 @@ TEST_P(urProgramBuildTest, InvalidNullHandleProgram) { TEST_P(urProgramBuildTest, BuildFailure) { ur_program_handle_t program = nullptr; std::shared_ptr> il_binary; - uur::KernelsEnvironment::instance->LoadSource("build_failure", 0, - il_binary); + uur::KernelsEnvironment::instance->LoadSource("build_failure", il_binary); if (!il_binary) { // The build failure we are testing for happens at SYCL compile time on // AMD and Nvidia, so no binary exists to check for a build failure diff --git a/test/conformance/program/urProgramCreateWithIL.cpp b/test/conformance/program/urProgramCreateWithIL.cpp index 800a43cd5d..3d81d14104 100644 --- a/test/conformance/program/urProgramCreateWithIL.cpp +++ b/test/conformance/program/urProgramCreateWithIL.cpp @@ -17,7 +17,7 @@ struct urProgramCreateWithILTest : uur::urContextTest { if (backend == UR_PLATFORM_BACKEND_HIP) { GTEST_SKIP(); } - uur::KernelsEnvironment::instance->LoadSource("foo", 0, il_binary); + uur::KernelsEnvironment::instance->LoadSource("foo", il_binary); } void TearDown() override { diff --git a/test/conformance/queue/queue_adapter_native_cpu.match b/test/conformance/queue/queue_adapter_native_cpu.match index c2887b1063..a4c2c502a4 100644 --- a/test/conformance/queue/queue_adapter_native_cpu.match +++ b/test/conformance/queue/queue_adapter_native_cpu.match @@ -1,3 +1,5 @@ +urQueueCreateTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU_ +urQueueCreateTest.CheckContext/SYCL_NATIVE_CPU___SYCL_Native_CPU_ 
urQueueCreateWithParamTest.SuccessWithProperties/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE urQueueCreateWithParamTest.SuccessWithProperties/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_QUEUE_FLAG_PROFILING_ENABLE urQueueCreateWithParamTest.SuccessWithProperties/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_QUEUE_FLAG_ON_DEVICE @@ -9,6 +11,17 @@ urQueueCreateWithParamTest.SuccessWithProperties/SYCL_NATIVE_CPU___SYCL_Native_C urQueueCreateWithParamTest.SuccessWithProperties/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_QUEUE_FLAG_SUBMISSION_IMMEDIATE urQueueCreateWithParamTest.SuccessWithProperties/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_QUEUE_FLAG_USE_DEFAULT_STREAM urQueueCreateWithParamTest.SuccessWithProperties/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_QUEUE_FLAG_SYNC_WITH_DEFAULT_STREAM +urQueueCreateWithParamTest.MatchingDeviceHandles/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE +urQueueCreateWithParamTest.MatchingDeviceHandles/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_QUEUE_FLAG_PROFILING_ENABLE +urQueueCreateWithParamTest.MatchingDeviceHandles/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_QUEUE_FLAG_ON_DEVICE +urQueueCreateWithParamTest.MatchingDeviceHandles/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_QUEUE_FLAG_ON_DEVICE_DEFAULT +urQueueCreateWithParamTest.MatchingDeviceHandles/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_QUEUE_FLAG_DISCARD_EVENTS +urQueueCreateWithParamTest.MatchingDeviceHandles/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_QUEUE_FLAG_PRIORITY_LOW +urQueueCreateWithParamTest.MatchingDeviceHandles/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_QUEUE_FLAG_PRIORITY_HIGH +urQueueCreateWithParamTest.MatchingDeviceHandles/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_QUEUE_FLAG_SUBMISSION_BATCHED +urQueueCreateWithParamTest.MatchingDeviceHandles/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_QUEUE_FLAG_SUBMISSION_IMMEDIATE +urQueueCreateWithParamTest.MatchingDeviceHandles/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_QUEUE_FLAG_USE_DEFAULT_STREAM 
+urQueueCreateWithParamTest.MatchingDeviceHandles/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_QUEUE_FLAG_SYNC_WITH_DEFAULT_STREAM urQueueFinishTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU_ urQueueFlushTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU_ urQueueGetInfoTestWithInfoParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_QUEUE_INFO_CONTEXT diff --git a/test/conformance/queue/urQueueCreate.cpp b/test/conformance/queue/urQueueCreate.cpp index 03cda76d50..168285d3f8 100644 --- a/test/conformance/queue/urQueueCreate.cpp +++ b/test/conformance/queue/urQueueCreate.cpp @@ -2,16 +2,24 @@ // Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. // See LICENSE.TXT // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +#include "uur/raii.h" #include using urQueueCreateTest = uur::urContextTest; UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(urQueueCreateTest); TEST_P(urQueueCreateTest, Success) { - ur_queue_handle_t queue = nullptr; - ASSERT_SUCCESS(urQueueCreate(context, device, nullptr, &queue)); + uur::raii::Queue queue = nullptr; + ASSERT_SUCCESS(urQueueCreate(context, device, nullptr, queue.ptr())); ASSERT_NE(nullptr, queue); - ASSERT_SUCCESS(urQueueRelease(queue)); + + ur_queue_info_t queue_flags; + ASSERT_SUCCESS(urQueueGetInfo(queue, UR_QUEUE_INFO_FLAGS, + sizeof(ur_queue_info_t), &queue_flags, + nullptr)); + + /* Check that the queue was created without any flag */ + ASSERT_EQ(queue_flags, 0); } using urQueueCreateWithParamTest = uur::urContextTestWithParam; @@ -54,9 +62,87 @@ TEST_P(urQueueCreateWithParamTest, SuccessWithProperties) { nullptr)); ASSERT_TRUE(queueFlags & queryFlag); + // Check that no other bit is set (i.e. is power of 2) + ASSERT_TRUE(queueFlags != 0 && (queueFlags & (queueFlags - 1)) == 0); + ASSERT_SUCCESS(urQueueRelease(queue)); } +/* Creates two queues with the same platform and device, and checks that the + * queried device and platform of both queues match. 
*/ +TEST_P(urQueueCreateWithParamTest, MatchingDeviceHandles) { + ur_queue_flags_t supportedFlags{}; + ASSERT_SUCCESS(uur::GetDeviceQueueOnHostProperties(device, supportedFlags)); + + ur_queue_flags_t queryFlag = getParam(); + if (!(supportedFlags & queryFlag)) { + GTEST_SKIP() << queryFlag << " : is not supported by the device."; + } + + ur_queue_properties_t props = { + /*.stype =*/UR_STRUCTURE_TYPE_QUEUE_PROPERTIES, + /*.pNext =*/nullptr, + /*.flags =*/queryFlag, + }; + + uur::raii::Queue queue1 = nullptr; + ASSERT_SUCCESS(urQueueCreate(context, device, &props, queue1.ptr())); + ASSERT_NE(queue1, nullptr); + + uur::raii::Queue queue2 = nullptr; + ASSERT_SUCCESS(urQueueCreate(context, device, &props, queue2.ptr())); + ASSERT_NE(queue2, nullptr); + + ur_device_handle_t deviceQueue1; + ASSERT_SUCCESS(urQueueGetInfo(queue1, UR_QUEUE_INFO_DEVICE, + sizeof(ur_device_handle_t), &deviceQueue1, + nullptr)); + + ur_device_handle_t deviceQueue2; + ASSERT_SUCCESS(urQueueGetInfo(queue1, UR_QUEUE_INFO_DEVICE, + sizeof(ur_device_handle_t), &deviceQueue2, + nullptr)); + + ASSERT_EQ(deviceQueue1, deviceQueue2); +} + +/* Create a queue and check that it returns the right context*/ +TEST_P(urQueueCreateTest, CheckContext) { + + uur::raii::Queue queue = nullptr; + ASSERT_SUCCESS(urQueueCreate(context, device, nullptr, queue.ptr())); + ASSERT_NE(queue.ptr(), nullptr); + + ur_context_handle_t returned_context = nullptr; + ASSERT_SUCCESS(urQueueGetInfo(queue, UR_QUEUE_INFO_CONTEXT, + sizeof(ur_context_handle_t), + &returned_context, nullptr)); + + ASSERT_EQ(this->context, returned_context); +} + +using urQueueCreateTestMultipleDevices = uur::urAllDevicesTest; + +/* Create a queue using a context from a different device */ +TEST_F(urQueueCreateTestMultipleDevices, ContextFromWrongDevice) { + + if (devices.size() < 2) { + GTEST_SKIP() << "Test requires at least 2 devices in the system"; + } + ur_device_handle_t device1 = devices[0]; + uur::raii::Context context1 = nullptr; + 
urContextCreate(1, &device1, nullptr, context1.ptr()); + + ur_device_handle_t device2 = devices[1]; + uur::raii::Context context2 = nullptr; + urContextCreate(1, &device2, nullptr, context2.ptr()); + + ur_queue_handle_t queue = nullptr; + ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_DEVICE, + urQueueCreate(context2, device1, nullptr, &queue)); + ASSERT_NE(queue, nullptr); +} + TEST_P(urQueueCreateTest, InvalidNullHandleContext) { ur_queue_handle_t queue = nullptr; ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_NULL_HANDLE, diff --git a/test/conformance/source/environment.cpp b/test/conformance/source/environment.cpp index a58b3ecdbd..a5f83c0d80 100644 --- a/test/conformance/source/environment.cpp +++ b/test/conformance/source/environment.cpp @@ -367,7 +367,7 @@ KernelsEnvironment::parseKernelOptions(int argc, char **argv, return options; } -std::string KernelsEnvironment::getSupportedILPostfix(uint32_t device_index) { +std::string KernelsEnvironment::getTargetName() { std::stringstream IL; if (instance->GetDevices().size() == 0) { @@ -382,66 +382,44 @@ std::string KernelsEnvironment::getSupportedILPostfix(uint32_t device_index) { error = "failed to get backend from platform."; return {}; } - if (backend == UR_PLATFORM_BACKEND_HIP) { - return ".bin"; - } - auto device = instance->GetDevices()[device_index]; - std::string IL_version; - if (uur::GetDeviceILVersion(device, IL_version)) { - error = "failed to get device IL version"; + std::string target = ""; + switch (backend) { + case UR_PLATFORM_BACKEND_OPENCL: + case UR_PLATFORM_BACKEND_LEVEL_ZERO: + return "spir64"; + case UR_PLATFORM_BACKEND_CUDA: + return "nvptx64-nvidia-cuda"; + case UR_PLATFORM_BACKEND_HIP: + return "amdgcn-amd-amdhsa"; + case UR_PLATFORM_BACKEND_NATIVE_CPU: + error = "native_cpu doesn't support kernel tests yet"; return {}; - } - - // TODO: This potentially needs updating as more adapters are tested. 
- if (IL_version.find("SPIR-V") != std::string::npos) { - IL << ".spv"; - } else if (IL_version.find("nvptx") != std::string::npos) { - IL << ".bin"; - } else { - error = "Undefined IL version: " + IL_version; + default: + error = "unknown target."; return {}; } - - return IL.str(); } std::string -KernelsEnvironment::getKernelSourcePath(const std::string &kernel_name, - uint32_t device_index) { +KernelsEnvironment::getKernelSourcePath(const std::string &kernel_name) { std::stringstream path; path << kernel_options.kernel_directory << "/" << kernel_name; - std::string il_postfix = getSupportedILPostfix(device_index); - - if (il_postfix.empty()) { - return {}; - } - - std::string binary_name; - for (const auto &entry : filesystem::directory_iterator(path.str())) { - auto file_name = entry.path().filename().string(); - if (file_name.find(il_postfix) != std::string::npos) { - binary_name = file_name; - break; - } - } - if (binary_name.empty()) { - error = - "failed retrieving kernel source path for kernel: " + kernel_name; + std::string target_name = getTargetName(); + if (target_name.empty()) { return {}; } - path << "/" << binary_name; + path << "/" << target_name << ".bin.0"; return path.str(); } void KernelsEnvironment::LoadSource( - const std::string &kernel_name, uint32_t device_index, + const std::string &kernel_name, std::shared_ptr> &binary_out) { - std::string source_path = - instance->getKernelSourcePath(kernel_name, device_index); + std::string source_path = instance->getKernelSourcePath(kernel_name); if (source_path.empty()) { FAIL() << error; diff --git a/test/conformance/testing/include/uur/environment.h b/test/conformance/testing/include/uur/environment.h index bcb7e46661..acd255a3c2 100644 --- a/test/conformance/testing/include/uur/environment.h +++ b/test/conformance/testing/include/uur/environment.h @@ -72,7 +72,7 @@ struct KernelsEnvironment : DevicesEnvironment { virtual void SetUp() override; virtual void TearDown() override; - void 
LoadSource(const std::string &kernel_name, uint32_t device_index, + void LoadSource(const std::string &kernel_name, std::shared_ptr> &binary_out); ur_result_t CreateProgram(ur_platform_handle_t hPlatform, @@ -89,9 +89,8 @@ struct KernelsEnvironment : DevicesEnvironment { private: KernelOptions parseKernelOptions(int argc, char **argv, const std::string &kernels_default_dir); - std::string getKernelSourcePath(const std::string &kernel_name, - uint32_t device_index); - std::string getSupportedILPostfix(uint32_t device_index); + std::string getKernelSourcePath(const std::string &kernel_name); + std::string getTargetName(); KernelOptions kernel_options; // mapping between kernels (full_path + kernel_name) and their saved source. diff --git a/test/conformance/testing/include/uur/fixtures.h b/test/conformance/testing/include/uur/fixtures.h index 55f3f7c4f4..cf64aa13d3 100644 --- a/test/conformance/testing/include/uur/fixtures.h +++ b/test/conformance/testing/include/uur/fixtures.h @@ -20,6 +20,14 @@ } \ (void)0 +#define UUR_ASSERT_SUCCESS_OR_UNSUPPORTED(ret) \ + auto status = ret; \ + if (status == UR_RESULT_ERROR_UNSUPPORTED_FEATURE) { \ + GTEST_SKIP(); \ + } else { \ + ASSERT_EQ(status, UR_RESULT_SUCCESS); \ + } + namespace uur { struct urPlatformTest : ::testing::Test { @@ -321,7 +329,8 @@ template struct urMemImageTestWithParam : urContextTestWithParam { struct urQueueTest : urContextTest { void SetUp() override { UUR_RETURN_ON_FATAL_FAILURE(urContextTest::SetUp()); - ASSERT_SUCCESS(urQueueCreate(context, device, 0, &queue)); + ASSERT_SUCCESS( + urQueueCreate(context, device, &queue_properties, &queue)); ASSERT_NE(queue, nullptr); } @@ -332,6 +341,8 @@ struct urQueueTest : urContextTest { UUR_RETURN_ON_FATAL_FAILURE(urContextTest::TearDown()); } + ur_queue_properties_t queue_properties = { + UR_STRUCTURE_TYPE_QUEUE_PROPERTIES, nullptr, 0}; ur_queue_handle_t queue = nullptr; }; @@ -339,7 +350,7 @@ struct urHostPipeTest : urQueueTest { void SetUp() override { 
UUR_RETURN_ON_FATAL_FAILURE(urQueueTest::SetUp()); UUR_RETURN_ON_FATAL_FAILURE( - uur::KernelsEnvironment::instance->LoadSource("foo", 0, il_binary)); + uur::KernelsEnvironment::instance->LoadSource("foo", il_binary)); ASSERT_SUCCESS(uur::KernelsEnvironment::instance->CreateProgram( platform, context, device, *il_binary, nullptr, &program)); @@ -382,7 +393,9 @@ struct urHostPipeTest : urQueueTest { template struct urQueueTestWithParam : urContextTestWithParam { void SetUp() override { UUR_RETURN_ON_FATAL_FAILURE(urContextTestWithParam::SetUp()); - ASSERT_SUCCESS(urQueueCreate(this->context, this->device, 0, &queue)); + ASSERT_SUCCESS(urQueueCreate(this->context, this->device, + &queue_properties, &queue)); + ASSERT_NE(queue, nullptr); } void TearDown() override { @@ -391,7 +404,8 @@ template struct urQueueTestWithParam : urContextTestWithParam { } UUR_RETURN_ON_FATAL_FAILURE(urContextTestWithParam::TearDown()); } - + ur_queue_properties_t queue_properties = { + UR_STRUCTURE_TYPE_QUEUE_PROPERTIES, nullptr, 0}; ur_queue_handle_t queue; }; @@ -1121,7 +1135,7 @@ struct urProgramTest : urQueueTest { GTEST_SKIP(); } UUR_RETURN_ON_FATAL_FAILURE( - uur::KernelsEnvironment::instance->LoadSource(program_name, 0, + uur::KernelsEnvironment::instance->LoadSource(program_name, il_binary)); const ur_program_properties_t properties = { @@ -1145,9 +1159,9 @@ struct urProgramTest : urQueueTest { std::vector metadatas{}; }; -template struct urProgramTestWithParam : urContextTestWithParam { +template struct urProgramTestWithParam : urQueueTestWithParam { void SetUp() override { - UUR_RETURN_ON_FATAL_FAILURE(urContextTestWithParam::SetUp()); + UUR_RETURN_ON_FATAL_FAILURE(urQueueTestWithParam::SetUp()); ur_platform_backend_t backend; ASSERT_SUCCESS(urPlatformGetInfo(this->platform, @@ -1160,7 +1174,7 @@ template struct urProgramTestWithParam : urContextTestWithParam { } UUR_RETURN_ON_FATAL_FAILURE( - uur::KernelsEnvironment::instance->LoadSource(program_name, 0, + 
uur::KernelsEnvironment::instance->LoadSource(program_name, il_binary)); ASSERT_SUCCESS(uur::KernelsEnvironment::instance->CreateProgram( this->platform, this->context, this->device, *il_binary, nullptr, @@ -1171,7 +1185,7 @@ template struct urProgramTestWithParam : urContextTestWithParam { if (program) { EXPECT_SUCCESS(urProgramRelease(program)); } - UUR_RETURN_ON_FATAL_FAILURE(urContextTestWithParam::TearDown()); + UUR_RETURN_ON_FATAL_FAILURE(urQueueTestWithParam::TearDown()); } std::shared_ptr> il_binary; @@ -1246,20 +1260,16 @@ template struct urKernelTestWithParam : urBaseKernelTestWithParam { } }; -struct urBaseKernelExecutionTest : urBaseKernelTest { - void SetUp() override { - UUR_RETURN_ON_FATAL_FAILURE(urBaseKernelTest::SetUp()); - } +struct KernelLaunchHelper { - void TearDown() override { - for (auto &buffer : buffer_args) { - ASSERT_SUCCESS(urMemRelease(buffer)); - } - UUR_RETURN_ON_FATAL_FAILURE(urBaseKernelTest::TearDown()); - } + KernelLaunchHelper(ur_platform_handle_t &platform, + ur_context_handle_t &context, ur_kernel_handle_t &kernel, + ur_queue_handle_t &queue) + : platform{platform}, context{context}, kernel{kernel}, queue{queue} {} // Adds a kernel arg representing a sycl buffer constructed with a 1D range. - void AddBuffer1DArg(size_t size, ur_mem_handle_t *out_buffer) { + void AddBuffer1DArg(size_t size, ur_mem_handle_t *out_buffer, + size_t *buffer_index) { ur_mem_handle_t mem_handle = nullptr; ASSERT_SUCCESS(urMemBufferCreate(context, UR_MEM_FLAG_READ_WRITE, size, nullptr, &mem_handle)); @@ -1270,6 +1280,9 @@ struct urBaseKernelExecutionTest : urBaseKernelTest { ASSERT_SUCCESS(urQueueFinish(queue)); ASSERT_SUCCESS(urKernelSetArgMemObj(kernel, current_arg_index, nullptr, mem_handle)); + if (buffer_index) { + *buffer_index = current_arg_index; + } // SYCL device kernels have different interfaces depending on the // backend being used. 
Typically a kernel which takes a buffer argument @@ -1301,8 +1314,6 @@ struct urBaseKernelExecutionTest : urBaseKernelTest { &accessor)); current_arg_index += 2; } - - buffer_args.push_back(mem_handle); *out_buffer = mem_handle; } @@ -1340,7 +1351,11 @@ struct urBaseKernelExecutionTest : urBaseKernelTest { ValidateBuffer(buffer, size, validator); } - std::vector buffer_args; + ur_platform_handle_t &platform; + ur_context_handle_t &context; + ur_kernel_handle_t &kernel; + ur_queue_handle_t &queue; + uint32_t current_arg_index = 0; }; @@ -1349,10 +1364,6 @@ struct urBaseKernelExecutionTestWithParam : urBaseKernelTestWithParam { void SetUp() override { UUR_RETURN_ON_FATAL_FAILURE(urBaseKernelTestWithParam::SetUp()); UUR_RETURN_ON_FATAL_FAILURE(urBaseKernelTestWithParam::Build()); - context = urBaseKernelTestWithParam::context; - kernel = urBaseKernelTestWithParam::kernel; - ASSERT_SUCCESS(urQueueCreate( - context, urBaseKernelTestWithParam::device, 0, &queue)); } void TearDown() override { @@ -1360,91 +1371,85 @@ struct urBaseKernelExecutionTestWithParam : urBaseKernelTestWithParam { ASSERT_SUCCESS(urMemRelease(buffer)); } UUR_RETURN_ON_FATAL_FAILURE(urBaseKernelTestWithParam::TearDown()); - if (queue) { - EXPECT_SUCCESS(urQueueRelease(queue)); - } } - // Adds a kernel arg representing a sycl buffer constructed with a 1D range. 
- void AddBuffer1DArg(size_t size, ur_mem_handle_t *out_buffer) { - ur_mem_handle_t mem_handle = nullptr; - ASSERT_SUCCESS(urMemBufferCreate(context, UR_MEM_FLAG_READ_WRITE, size, - nullptr, &mem_handle)); - char zero = 0; - ASSERT_SUCCESS(urEnqueueMemBufferFill(queue, mem_handle, &zero, - sizeof(zero), 0, size, 0, nullptr, - nullptr)); - ASSERT_SUCCESS(urQueueFinish(queue)); - ASSERT_SUCCESS(urKernelSetArgMemObj(kernel, current_arg_index, nullptr, - mem_handle)); + void AddBuffer1DArg(size_t size, ur_mem_handle_t *out_buffer, + size_t *buffer_index = nullptr) { + helper.AddBuffer1DArg(size, out_buffer, buffer_index); + buffer_args.push_back(*out_buffer); + } - // SYCL device kernels have different interfaces depending on the - // backend being used. Typically a kernel which takes a buffer argument - // will take a pointer to the start of the buffer and a sycl::id param - // which is a struct that encodes the accessor to the buffer. However - // the AMD backend handles this differently and uses three separate - // arguments for each of the three dimensions of the accessor. + template void AddPodArg(K data) { helper.AddPodArg(data); } - ur_platform_backend_t backend; - ASSERT_SUCCESS(urPlatformGetInfo(urBaseKernelTestWithParam::platform, - UR_PLATFORM_INFO_BACKEND, - sizeof(backend), &backend, nullptr)); - if (backend == UR_PLATFORM_BACKEND_HIP) { - // this emulates the three offset params for buffer accessor on AMD. - size_t val = 0; - ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_arg_index + 1, - sizeof(size_t), nullptr, &val)); - ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_arg_index + 2, - sizeof(size_t), nullptr, &val)); - ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_arg_index + 3, - sizeof(size_t), nullptr, &val)); - current_arg_index += 4; - } else { - // This emulates the offset struct sycl adds for a 1D buffer accessor. 
- struct { - size_t offsets[1] = {0}; - } accessor; - ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_arg_index + 1, - sizeof(accessor), nullptr, - &accessor)); - current_arg_index += 2; + void Launch1DRange(size_t global_size, size_t local_size = 1) { + helper.Launch1DRange(global_size, local_size); + } + + template + void ValidateBuffer(ur_mem_handle_t buffer, size_t size, + std::function validator) { + helper.ValidateBuffer(buffer, size, validator); + } + + template + void ValidateBuffer(ur_mem_handle_t buffer, size_t size, K value) { + helper.ValidateBuffer(buffer, size, value); + } + + private: + KernelLaunchHelper helper = KernelLaunchHelper{ + this->platform, this->context, this->kernel, this->queue}; + std::vector buffer_args; +}; + +struct urBaseKernelExecutionTest : urBaseKernelTest { + void SetUp() override { + UUR_RETURN_ON_FATAL_FAILURE(urBaseKernelTest::SetUp()); + } + + void TearDown() override { + for (auto &buffer : buffer_args) { + ASSERT_SUCCESS(urMemRelease(buffer)); } + UUR_RETURN_ON_FATAL_FAILURE(urBaseKernelTest::TearDown()); + } - buffer_args.push_back(mem_handle); - *out_buffer = mem_handle; + void AddBuffer1DArg(size_t size, ur_mem_handle_t *out_buffer, + size_t *buffer_index = nullptr) { + helper.AddBuffer1DArg(size, out_buffer, buffer_index); + buffer_args.push_back(*out_buffer); } - template void AddPodArg(U data) { - ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_arg_index, - sizeof(data), nullptr, &data)); - current_arg_index++; + template void AddPodArg(T data) { helper.AddPodArg(data); } + + void Launch1DRange(size_t global_size, size_t local_size = 1) { + helper.Launch1DRange(global_size, local_size); } - // Validate the contents of `buffer` according to the given validator. 
- template + template void ValidateBuffer(ur_mem_handle_t buffer, size_t size, - std::function validator) { - std::vector read_buffer(size / sizeof(U)); - ASSERT_SUCCESS(urEnqueueMemBufferRead(queue, buffer, true, 0, size, - read_buffer.data(), 0, nullptr, - nullptr)); - ASSERT_TRUE( - std::all_of(read_buffer.begin(), read_buffer.end(), validator)); + std::function validator) { + helper.ValidateBuffer(buffer, size, validator); } - // Helper that uses the generic validate function to check for a given value. - template - void ValidateBuffer(ur_mem_handle_t buffer, size_t size, U value) { - auto validator = [&value](U result) -> bool { return result == value; }; - - ValidateBuffer(buffer, size, validator); + template + void ValidateBuffer(ur_mem_handle_t buffer, size_t size, T value) { + helper.ValidateBuffer(buffer, size, value); } + private: + KernelLaunchHelper helper = + KernelLaunchHelper{platform, context, kernel, queue}; std::vector buffer_args; - uint32_t current_arg_index = 0; - ur_context_handle_t context; - ur_kernel_handle_t kernel; - ur_queue_handle_t queue; +}; + +template +struct urKernelExecutionTestWithParam : urBaseKernelExecutionTestWithParam { + void SetUp() { + UUR_RETURN_ON_FATAL_FAILURE( + urBaseKernelExecutionTestWithParam::SetUp()); + this->Build(); + } }; struct urKernelExecutionTest : urBaseKernelExecutionTest { diff --git a/test/conformance/testing/include/uur/utils.h b/test/conformance/testing/include/uur/utils.h index d2341cc31b..34e5f7768b 100644 --- a/test/conformance/testing/include/uur/utils.h +++ b/test/conformance/testing/include/uur/utils.h @@ -395,6 +395,8 @@ ur_result_t GetDeviceMaxComputeQueueIndices(ur_device_handle_t device, uint32_t &max_indices); ur_result_t GetDeviceHostPipeRWSupported(ur_device_handle_t device, bool &support); +ur_result_t GetTimestampRecordingSupport(ur_device_handle_t device, + bool &support); ur_device_partition_property_t makePartitionByCountsDesc(uint32_t count); diff --git 
a/test/conformance/testing/source/utils.cpp b/test/conformance/testing/source/utils.cpp index 6cb8d3a90c..ab78f64bbc 100644 --- a/test/conformance/testing/source/utils.cpp +++ b/test/conformance/testing/source/utils.cpp @@ -635,6 +635,12 @@ ur_result_t GetDeviceHostPipeRWSupported(ur_device_handle_t device, device, UR_DEVICE_INFO_HOST_PIPE_READ_WRITE_SUPPORTED, support); } +ur_result_t GetTimestampRecordingSupport(ur_device_handle_t device, + bool &support) { + return GetDeviceInfo( + device, UR_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP, support); +} + ur_device_partition_property_t makePartitionByCountsDesc(uint32_t count) { ur_device_partition_property_t desc; desc.type = UR_DEVICE_PARTITION_BY_COUNTS; diff --git a/test/conformance/virtual_memory/virtual_memory_adapter_level_zero.match b/test/conformance/virtual_memory/virtual_memory_adapter_level_zero.match index dbb7cdebd5..9cda954748 100644 --- a/test/conformance/virtual_memory/virtual_memory_adapter_level_zero.match +++ b/test/conformance/virtual_memory/virtual_memory_adapter_level_zero.match @@ -1,88 +1,10 @@ -urPhysicalMemCreateTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1 -urPhysicalMemCreateTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2 -urPhysicalMemCreateTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___3 -urPhysicalMemCreateTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___7 -urPhysicalMemCreateTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___12 +{{OPT}}urPhysicalMemCreateTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___3 +{{OPT}}urPhysicalMemCreateTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___7 +{{OPT}}urPhysicalMemCreateTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___12 urPhysicalMemCreateTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___44 
-urPhysicalMemCreateTest.InvalidNullHandleContext/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1 -urPhysicalMemCreateTest.InvalidNullHandleContext/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2 -urPhysicalMemCreateTest.InvalidNullHandleContext/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___3 -urPhysicalMemCreateTest.InvalidNullHandleContext/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___7 -urPhysicalMemCreateTest.InvalidNullHandleContext/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___12 -urPhysicalMemCreateTest.InvalidNullHandleContext/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___44 -urPhysicalMemCreateTest.InvalidNullHandleDevice/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1 -urPhysicalMemCreateTest.InvalidNullHandleDevice/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2 -urPhysicalMemCreateTest.InvalidNullHandleDevice/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___3 -urPhysicalMemCreateTest.InvalidNullHandleDevice/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___7 -urPhysicalMemCreateTest.InvalidNullHandleDevice/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___12 -urPhysicalMemCreateTest.InvalidNullHandleDevice/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___44 -urPhysicalMemCreateTest.InvalidNullPointerPhysicalMem/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1 -urPhysicalMemCreateTest.InvalidNullPointerPhysicalMem/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2 -urPhysicalMemCreateTest.InvalidNullPointerPhysicalMem/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___3 -urPhysicalMemCreateTest.InvalidNullPointerPhysicalMem/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___7 -urPhysicalMemCreateTest.InvalidNullPointerPhysicalMem/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___12 
-urPhysicalMemCreateTest.InvalidNullPointerPhysicalMem/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___44 urPhysicalMemCreateTest.InvalidSize/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1 urPhysicalMemCreateTest.InvalidSize/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2 urPhysicalMemCreateTest.InvalidSize/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___3 urPhysicalMemCreateTest.InvalidSize/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___7 urPhysicalMemCreateTest.InvalidSize/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___12 urPhysicalMemCreateTest.InvalidSize/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___44 -urPhysicalMemReleaseTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urPhysicalMemReleaseTest.InvalidNullHandlePhysicalMem/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urPhysicalMemRetainTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urPhysicalMemRetainTest.InvalidNullHandlePhysicalMem/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urVirtualMemFreeTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urVirtualMemFreeTest.InvalidNullHandleContext/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urVirtualMemFreeTest.InvalidNullPointerStart/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urVirtualMemGetInfoTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_VIRTUAL_MEM_INFO_ACCESS_MODE -urVirtualMemGetInfoTest.InvalidNullHandleContext/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urVirtualMemGetInfoTest.InvalidNullPointerStart/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urVirtualMemGetInfoTest.InvalidEnumerationInfo/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ 
-urVirtualMemGranularityGetInfoTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_VIRTUAL_MEM_GRANULARITY_INFO_MINIMUM -urVirtualMemGranularityGetInfoTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_VIRTUAL_MEM_GRANULARITY_INFO_RECOMMENDED -urVirtualMemGranularityGetInfoNegativeTest.InvalidNullHandleContext/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urVirtualMemGranularityGetInfoNegativeTest.InvalidEnumeration/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urVirtualMemGranularityGetInfoNegativeTest.InvalidNullPointerPropSizeRet/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urVirtualMemGranularityGetInfoNegativeTest.InvalidNullPointerPropValue/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urVirtualMemGranularityGetInfoNegativeTest.InvalidPropSizeZero/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urVirtualMemGranularityGetInfoNegativeTest.InvalidSizePropSizeSmall/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urVirtualMemMapTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urVirtualMemMapTest.InvalidNullHandleContext/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urVirtualMemMapTest.InvalidNullHandlePhysicalMem/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urVirtualMemMapTest.InvalidNullPointerStart/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urVirtualMemMapTest.InvalidEnumerationFlags/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urVirtualMemReserveTestWithParam.SuccessNoStartPointer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2 -urVirtualMemReserveTestWithParam.SuccessNoStartPointer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___4 -urVirtualMemReserveTestWithParam.SuccessNoStartPointer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___8 
-urVirtualMemReserveTestWithParam.SuccessNoStartPointer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___16 -urVirtualMemReserveTestWithParam.SuccessNoStartPointer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___32 -urVirtualMemReserveTestWithParam.SuccessNoStartPointer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___64 -urVirtualMemReserveTestWithParam.SuccessNoStartPointer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___128 -urVirtualMemReserveTestWithParam.SuccessNoStartPointer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___256 -urVirtualMemReserveTestWithParam.SuccessNoStartPointer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___512 -urVirtualMemReserveTestWithParam.SuccessNoStartPointer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1024 -urVirtualMemReserveTestWithParam.SuccessNoStartPointer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2048 -urVirtualMemReserveTestWithParam.SuccessNoStartPointer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___5000 -urVirtualMemReserveTestWithParam.SuccessNoStartPointer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___100000 -urVirtualMemReserveTestWithParam.SuccessWithStartPointer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2 -urVirtualMemReserveTestWithParam.SuccessWithStartPointer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___4 -urVirtualMemReserveTestWithParam.SuccessWithStartPointer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___8 -urVirtualMemReserveTestWithParam.SuccessWithStartPointer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___16 -urVirtualMemReserveTestWithParam.SuccessWithStartPointer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___32 -urVirtualMemReserveTestWithParam.SuccessWithStartPointer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___64 
-urVirtualMemReserveTestWithParam.SuccessWithStartPointer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___128 -urVirtualMemReserveTestWithParam.SuccessWithStartPointer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___256 -urVirtualMemReserveTestWithParam.SuccessWithStartPointer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___512 -urVirtualMemReserveTestWithParam.SuccessWithStartPointer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1024 -urVirtualMemReserveTestWithParam.SuccessWithStartPointer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2048 -urVirtualMemReserveTestWithParam.SuccessWithStartPointer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___5000 -urVirtualMemReserveTestWithParam.SuccessWithStartPointer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___100000 -urVirtualMemReserveTest.InvalidNullHandleContext/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urVirtualMemReserveTest.InvalidNullPointer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urVirtualMemSetAccessTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urVirtualMemSetAccessTest.InvalidNullHandleContext/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urVirtualMemSetAccessTest.InvalidNullPointerStart/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urVirtualMemUnmapTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urVirtualMemUnmapTest.InvalidNullHandleContext/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urVirtualMemUnmapTest.InvalidNullPointerStart/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ diff --git a/tools/urinfo/urinfo.hpp b/tools/urinfo/urinfo.hpp index 752a3a839f..caf61e86f6 100644 --- a/tools/urinfo/urinfo.hpp +++ b/tools/urinfo/urinfo.hpp @@ -401,5 +401,8 @@ inline void printDeviceInfos(ur_device_handle_t hDevice, std::cout << prefix; printDeviceInfo( hDevice, 
UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_3D_EXP); + std::cout << prefix; + printDeviceInfo(hDevice, + UR_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP); } } // namespace urinfo