From f778e6f209ff6d6ce519b3c03ed6486e2d2fe2d5 Mon Sep 17 00:00:00 2001 From: pbalcer Date: Fri, 2 Jun 2023 15:14:26 +0200 Subject: [PATCH 01/16] [ur][L0] add external level zero adapter Allows fetching and building level zero adapter without the need to use the entire SYCL codebase. Convenient for testing and prototyping of other language runtimes. --- .github/workflows/cmake.yml | 2 +- .gitignore | 3 + CMakeLists.txt | 1 + README.md | 6 ++ cmake/helpers.cmake | 28 ++--- source/adapters/CMakeLists.txt | 4 + source/adapters/level_zero/CMakeLists.txt | 119 ++++++++++++++++++++++ 7 files changed, 150 insertions(+), 13 deletions(-) create mode 100644 source/adapters/level_zero/CMakeLists.txt diff --git a/.github/workflows/cmake.yml b/.github/workflows/cmake.yml index 75f1de272d..618c099a21 100644 --- a/.github/workflows/cmake.yml +++ b/.github/workflows/cmake.yml @@ -55,10 +55,10 @@ jobs: -DCMAKE_C_COMPILER=${{matrix.compiler.c}} -DCMAKE_CXX_COMPILER=${{matrix.compiler.cxx}} -DUR_ENABLE_TRACING=ON - -DUR_DEVELOPER_MODE=ON -DCMAKE_BUILD_TYPE=${{matrix.build_type}} -DUR_BUILD_TESTS=ON -DUR_FORMAT_CPP_STYLE=ON + -DUR_BUILD_ADAPTER_L0=ON ${{matrix.libbacktrace}} - name: Generate source from spec, check for uncommitted diff diff --git a/.gitignore b/.gitignore index a1a488bc14..263b72d4d4 100644 --- a/.gitignore +++ b/.gitignore @@ -83,3 +83,6 @@ out/ # IDE Files /.vscode /.devcontainer + +# External content +/external diff --git a/CMakeLists.txt b/CMakeLists.txt index efb9565171..352965ca90 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -28,6 +28,7 @@ option(UMA_BUILD_SHARED_LIBRARY "Build UMA as shared library" OFF) option(UR_ENABLE_TRACING "enable api tracing through xpti" OFF) option(VAL_USE_LIBBACKTRACE_BACKTRACE "enable libbacktrace validation backtrace for linux" OFF) option(UR_BUILD_TOOLS "build ur tools" ON) +option(UR_BUILD_ADAPTER_L0 "build level 0 adapter from SYCL" OFF) set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) 
set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) diff --git a/README.md b/README.md index 6b34d4d299..c178788ceb 100644 --- a/README.md +++ b/README.md @@ -2,6 +2,11 @@ [![GHA build status](https://github.com/oneapi-src/unified-runtime/actions/workflows/cmake.yml/badge.svg?branch=main)](https://github.com/oneapi-src/unified-runtime/actions) +## Adapters +Adapter implementations for Unified Runtime currently reside in the [SYCL repository](https://github.com/intel/llvm/tree/sycl/sycl/plugins/unified_runtime/ur). This branch contains scripts to automatically +fetch and build them directly in the UR tree. The adapters are disabled by default, +see cmake options for details. + ## Contents This repo contains the following: @@ -99,6 +104,7 @@ List of options provided by CMake: | UR_USE_MSAN | Enable MemorySanitizer (clang only) | ON/OFF | OFF | | UR_ENABLE_TRACING | Enable XPTI-based tracing layer | ON/OFF | OFF | | UR_BUILD_TOOLS | Build tools | ON/OFF | ON | +| UR_BUILD_ADAPTER_L0 | Fetch and use level-zero adapter from SYCL | ON/OFF | OFF | **General**: diff --git a/cmake/helpers.cmake b/cmake/helpers.cmake index 17ac97cd5f..a99286434a 100644 --- a/cmake/helpers.cmake +++ b/cmake/helpers.cmake @@ -57,22 +57,26 @@ endmacro() include(FetchContent) -# A wrapper around FetchContent_Declare that supports git sparse checkout. -# This is useful for including subprojects from large repositories. 
-function(FetchContentSparse_Declare name GIT_REPOSITORY GIT_TAG GIT_DIR) - set(content-build-dir ${CMAKE_BINARY_DIR}/content-${name}) - message(STATUS "Fetching sparse content ${GIT_DIR} from ${GIT_REPOSITORY} ${GIT_TAG}") - IF(NOT EXISTS ${content-build-dir}) - file(MAKE_DIRECTORY ${content-build-dir}) +function(FetchSource GIT_REPOSITORY GIT_TAG GIT_DIR DEST) +message(STATUS "Fetching sparse source ${GIT_DIR} from ${GIT_REPOSITORY} ${GIT_TAG}") + IF(NOT EXISTS ${DEST}) + file(MAKE_DIRECTORY ${DEST}) execute_process(COMMAND git init -b main - WORKING_DIRECTORY ${content-build-dir}) + WORKING_DIRECTORY ${DEST}) execute_process(COMMAND git remote add origin ${GIT_REPOSITORY} - WORKING_DIRECTORY ${content-build-dir}) + WORKING_DIRECTORY ${DEST}) execute_process(COMMAND git config core.sparsecheckout true - WORKING_DIRECTORY ${content-build-dir}) - file(APPEND ${content-build-dir}/.git/info/sparse-checkout ${GIT_DIR}/) + WORKING_DIRECTORY ${DEST}) + file(APPEND ${DEST}/.git/info/sparse-checkout ${GIT_DIR}/) endif() execute_process(COMMAND git pull --depth=1 origin ${GIT_TAG} - WORKING_DIRECTORY ${content-build-dir}) + WORKING_DIRECTORY ${DEST}) +endfunction() + +# A wrapper around FetchContent_Declare that supports git sparse checkout. +# This is useful for including subprojects from large repositories. 
+function(FetchContentSparse_Declare name GIT_REPOSITORY GIT_TAG GIT_DIR) + set(content-build-dir ${CMAKE_BINARY_DIR}/content-${name}) + FetchSource(${GIT_REPOSITORY} ${GIT_TAG} ${GIT_DIR} ${content-build-dir}) FetchContent_Declare(${name} SOURCE_DIR ${content-build-dir}/${GIT_DIR}) endfunction() diff --git a/source/adapters/CMakeLists.txt b/source/adapters/CMakeLists.txt index 7d7c0429e2..9964333e4a 100644 --- a/source/adapters/CMakeLists.txt +++ b/source/adapters/CMakeLists.txt @@ -2,3 +2,7 @@ # SPDX-License-Identifier: MIT add_subdirectory(null) + +if(UR_BUILD_ADAPTER_L0) +add_subdirectory(level_zero) +endif() diff --git a/source/adapters/level_zero/CMakeLists.txt b/source/adapters/level_zero/CMakeLists.txt new file mode 100644 index 0000000000..4c672b7c69 --- /dev/null +++ b/source/adapters/level_zero/CMakeLists.txt @@ -0,0 +1,119 @@ +# Copyright (C) 2022 Intel Corporation +# Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. +# See LICENSE.TXT +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +set(SYCL_L0_DIR "${CMAKE_CURRENT_SOURCE_DIR}/external") +set(L0_DIR "${SYCL_L0_DIR}/sycl/plugins/unified_runtime/ur/adapters/level_zero") + +# fetch xpti proxy library for the tracing layer +FetchSource(https://github.com/intel/llvm.git sycl-nightly/20230601 "sycl/plugins/unified_runtime/ur" ${SYCL_L0_DIR}) + +set(TARGET_NAME ur_adapter_level_zero) + +if (NOT DEFINED LEVEL_ZERO_LIBRARY OR NOT DEFINED LEVEL_ZERO_INCLUDE_DIR) + message(STATUS "Download Level Zero loader and headers from github.com") + + set(LEVEL_ZERO_LOADER_REPO "https://github.com/oneapi-src/level-zero.git") + set(LEVEL_ZERO_LOADER_TAG v1.8.8) + + # Disable due to a bug https://github.com/oneapi-src/level-zero/issues/104 + set(CMAKE_INCLUDE_CURRENT_DIR OFF) + # Prevent L0 loader from exporting extra symbols + set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS OFF) + + message(STATUS "Will fetch Level Zero Loader from ${LEVEL_ZERO_LOADER_REPO}") + include(FetchContent) + 
FetchContent_Declare(level-zero-loader + GIT_REPOSITORY ${LEVEL_ZERO_LOADER_REPO} + GIT_TAG ${LEVEL_ZERO_LOADER_TAG} + ) + if (WIN32) + set(USE_Z7 ON) + endif() + FetchContent_MakeAvailable(level-zero-loader) + FetchContent_GetProperties(level-zero-loader) + + set(LEVEL_ZERO_LIBRARY ze_loader) + set(LEVEL_ZERO_INCLUDE_DIR + ${level-zero-loader_SOURCE_DIR}/include CACHE PATH "Path to Level Zero Headers") +endif() + +add_library (LevelZeroLoader INTERFACE) +# The MSVC linker does not like / at the start of a path, so to work around this +# we split it into a link library and a library path, where the path is allowed +# to have leading /. +get_filename_component(LEVEL_ZERO_LIBRARY_SRC "${LEVEL_ZERO_LIBRARY}" DIRECTORY) +get_filename_component(LEVEL_ZERO_LIB_NAME "${LEVEL_ZERO_LIBRARY}" NAME) +target_link_directories(LevelZeroLoader + INTERFACE "${LEVEL_ZERO_LIBRARY_SRC}" +) +target_link_libraries(LevelZeroLoader + INTERFACE "${LEVEL_ZERO_LIB_NAME}" +) +if (WIN32) + # TODO: fix building level-zero loader on Windows + # target_compile_options(${LEVEL_ZERO_LIB_NAME} /UUNICODE) +else() + target_compile_options(${LEVEL_ZERO_LIB_NAME} PUBLIC + -Wno-unused-but-set-variable + -Wno-pedantic + -Wno-unused-parameter + -Wno-error + ) +endif() + +add_library (LevelZeroLoader-Headers INTERFACE) +target_include_directories(LevelZeroLoader-Headers + INTERFACE "${LEVEL_ZERO_INCLUDE_DIR}" +) + +add_library(${TARGET_NAME} + SHARED + ${L0_DIR}/ur_loader_interface.cpp + ${L0_DIR}/ur_level_zero_common.hpp + ${L0_DIR}/ur_level_zero_context.hpp + ${L0_DIR}/ur_level_zero_device.hpp + ${L0_DIR}/ur_level_zero_event.hpp + ${L0_DIR}/ur_level_zero_mem.hpp + ${L0_DIR}/ur_level_zero_kernel.hpp + ${L0_DIR}/ur_level_zero_platform.hpp + ${L0_DIR}/ur_level_zero_program.hpp + ${L0_DIR}/ur_level_zero_queue.hpp + ${L0_DIR}/ur_level_zero_sampler.hpp + ${L0_DIR}/ur_level_zero.cpp + ${L0_DIR}/ur_level_zero_common.cpp + ${L0_DIR}/ur_level_zero_context.cpp + ${L0_DIR}/ur_level_zero_device.cpp + 
${L0_DIR}/ur_level_zero_event.cpp + ${L0_DIR}/ur_level_zero_mem.cpp + ${L0_DIR}/ur_level_zero_kernel.cpp + ${L0_DIR}/ur_level_zero_platform.cpp + ${L0_DIR}/ur_level_zero_program.cpp + ${L0_DIR}/ur_level_zero_queue.cpp + ${L0_DIR}/ur_level_zero_sampler.cpp + ${L0_DIR}/../../ur.cpp + ${L0_DIR}/../../usm_allocator_config.hpp +) + +set_target_properties(${TARGET_NAME} PROPERTIES + VERSION "${PROJECT_VERSION_MAJOR}.${PROJECT_VERSION_MINOR}.${PROJECT_VERSION_PATCH}" + SOVERSION "${PROJECT_VERSION_MAJOR}" +) + +target_link_libraries(${TARGET_NAME} PRIVATE + ${PROJECT_NAME}::headers + ${PROJECT_NAME}::common + LevelZeroLoader-Headers +) + +target_include_directories(${TARGET_NAME} PRIVATE + ${L0_DIR}/../../../ + LevelZeroLoader-Headers +) + +if(UNIX) + set(GCC_COVERAGE_COMPILE_FLAGS "-fvisibility=hidden -fvisibility-inlines-hidden -fno-strict-aliasing") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${GCC_COVERAGE_COMPILE_FLAGS}") +endif() + From 80e168ee95c5dc38c7d4b64354a7a7f8b3159d88 Mon Sep 17 00:00:00 2001 From: pbalcer Date: Mon, 5 Jun 2023 12:10:44 +0200 Subject: [PATCH 02/16] [level-zero] fix build script and failing tests --- source/adapters/level_zero/CMakeLists.txt | 25 +++++++++++++++++------ test/loader/platforms/CMakeLists.txt | 2 +- test/loader/platforms/platforms.cpp | 18 ++++++++-------- 3 files changed, 29 insertions(+), 16 deletions(-) diff --git a/source/adapters/level_zero/CMakeLists.txt b/source/adapters/level_zero/CMakeLists.txt index 4c672b7c69..cde6148dc9 100644 --- a/source/adapters/level_zero/CMakeLists.txt +++ b/source/adapters/level_zero/CMakeLists.txt @@ -28,12 +28,21 @@ if (NOT DEFINED LEVEL_ZERO_LIBRARY OR NOT DEFINED LEVEL_ZERO_INCLUDE_DIR) GIT_REPOSITORY ${LEVEL_ZERO_LOADER_REPO} GIT_TAG ${LEVEL_ZERO_LOADER_TAG} ) - if (WIN32) - set(USE_Z7 ON) + set(CMAKE_CXX_FLAGS_BAK "${CMAKE_CXX_FLAGS}") + if(MSVC) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /WX-") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /WX-") + # FIXME: Unified runtime build fails with 
/DUNICODE + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /UUNICODE") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /UUNICODE") + # USE_Z7 forces use of /Z7 instead of /Zi which is broken with sccache + set(USE_Z7 ON) endif() FetchContent_MakeAvailable(level-zero-loader) FetchContent_GetProperties(level-zero-loader) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS_BAK}") + set(LEVEL_ZERO_LIBRARY ze_loader) set(LEVEL_ZERO_INCLUDE_DIR ${level-zero-loader_SOURCE_DIR}/include CACHE PATH "Path to Level Zero Headers") @@ -51,10 +60,8 @@ target_link_directories(LevelZeroLoader target_link_libraries(LevelZeroLoader INTERFACE "${LEVEL_ZERO_LIB_NAME}" ) -if (WIN32) - # TODO: fix building level-zero loader on Windows - # target_compile_options(${LEVEL_ZERO_LIB_NAME} /UUNICODE) -else() + +if (NOT MSVC) target_compile_options(${LEVEL_ZERO_LIB_NAME} PUBLIC -Wno-unused-but-set-variable -Wno-pedantic @@ -75,6 +82,7 @@ add_library(${TARGET_NAME} ${L0_DIR}/ur_level_zero_context.hpp ${L0_DIR}/ur_level_zero_device.hpp ${L0_DIR}/ur_level_zero_event.hpp + ${L0_DIR}/ur_level_zero_usm.hpp ${L0_DIR}/ur_level_zero_mem.hpp ${L0_DIR}/ur_level_zero_kernel.hpp ${L0_DIR}/ur_level_zero_platform.hpp @@ -86,6 +94,7 @@ add_library(${TARGET_NAME} ${L0_DIR}/ur_level_zero_context.cpp ${L0_DIR}/ur_level_zero_device.cpp ${L0_DIR}/ur_level_zero_event.cpp + ${L0_DIR}/ur_level_zero_usm.cpp ${L0_DIR}/ur_level_zero_mem.cpp ${L0_DIR}/ur_level_zero_kernel.cpp ${L0_DIR}/ur_level_zero_platform.cpp @@ -93,6 +102,9 @@ add_library(${TARGET_NAME} ${L0_DIR}/ur_level_zero_queue.cpp ${L0_DIR}/ur_level_zero_sampler.cpp ${L0_DIR}/../../ur.cpp + ${L0_DIR}/../../usm_allocator.cpp + ${L0_DIR}/../../usm_allocator.hpp + ${L0_DIR}/../../usm_allocator_config.cpp ${L0_DIR}/../../usm_allocator_config.hpp ) @@ -104,6 +116,7 @@ set_target_properties(${TARGET_NAME} PROPERTIES target_link_libraries(${TARGET_NAME} PRIVATE ${PROJECT_NAME}::headers ${PROJECT_NAME}::common + LevelZeroLoader LevelZeroLoader-Headers ) diff --git 
a/test/loader/platforms/CMakeLists.txt b/test/loader/platforms/CMakeLists.txt index de1fa76cfd..8db4fbb81c 100644 --- a/test/loader/platforms/CMakeLists.txt +++ b/test/loader/platforms/CMakeLists.txt @@ -31,5 +31,5 @@ function(add_loader_platform_test name ENV) ) endfunction() -add_loader_platform_test(no_platforms "") +add_loader_platform_test(no_platforms "UR_ADAPTERS_FORCE_LOAD=\"\"") add_loader_platform_test(null_platform "UR_ADAPTERS_FORCE_LOAD=\"$\"") diff --git a/test/loader/platforms/platforms.cpp b/test/loader/platforms/platforms.cpp index f230025b26..2eaea35be9 100644 --- a/test/loader/platforms/platforms.cpp +++ b/test/loader/platforms/platforms.cpp @@ -17,32 +17,32 @@ using namespace logger; ////////////////////////////////////////////////////////////////////////// int main(int argc, char *argv[]) { - logger::init("TEST"); + auto out = create_logger("TEST"); ur_result_t status; // Initialize the platform status = urInit(0); if (status != UR_RESULT_SUCCESS) { - error("urInit failed with return code: {}", status); + out.error("urInit failed with return code: {}", status); return 1; } - info("urInit succeeded."); + out.info("urInit succeeded."); uint32_t platformCount = 0; std::vector platforms; status = urPlatformGet(1, nullptr, &platformCount); if (status != UR_RESULT_SUCCESS) { - error("urPlatformGet failed with return code: {}", status); + out.error("urPlatformGet failed with return code: {}", status); goto out; } - info("urPlatformGet found {} platforms", platformCount); + out.info("urPlatformGet found {} platforms", platformCount); platforms.resize(platformCount); status = urPlatformGet(platformCount, platforms.data(), nullptr); if (status != UR_RESULT_SUCCESS) { - error("urPlatformGet failed with return code: {}", status); + out.error("urPlatformGet failed with return code: {}", status); goto out; } @@ -51,7 +51,7 @@ int main(int argc, char *argv[]) { status = urPlatformGetInfo(p, UR_PLATFORM_INFO_NAME, 0, nullptr, &name_len); if (status != 
UR_RESULT_SUCCESS) { - error("urPlatformGetInfo failed with return code: {}", status); + out.error("urPlatformGetInfo failed with return code: {}", status); goto out; } @@ -61,11 +61,11 @@ int main(int argc, char *argv[]) { status = urPlatformGetInfo(p, UR_PLATFORM_INFO_NAME, name_len, name, nullptr); if (status != UR_RESULT_SUCCESS) { - error("urPlatformGetInfo failed with return code: {}", status); + out.error("urPlatformGetInfo failed with return code: {}", status); free(name); goto out; } - info("Found {} ", name); + out.info("Found {} ", name); free(name); } From 09994ffab02df637d01718eedfa1ea15f17c1a90 Mon Sep 17 00:00:00 2001 From: pbalcer Date: Thu, 29 Jun 2023 13:34:45 +0200 Subject: [PATCH 03/16] update level-zero --- cmake/helpers.cmake | 6 ++++-- source/adapters/level_zero/CMakeLists.txt | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/cmake/helpers.cmake b/cmake/helpers.cmake index d0963d36c8..a4a3a70e78 100644 --- a/cmake/helpers.cmake +++ b/cmake/helpers.cmake @@ -60,7 +60,7 @@ endmacro() include(FetchContent) function(FetchSource GIT_REPOSITORY GIT_TAG GIT_DIR DEST) -message(STATUS "Fetching sparse source ${GIT_DIR} from ${GIT_REPOSITORY} ${GIT_TAG}") + message(STATUS "Fetching sparse source ${GIT_DIR} from ${GIT_REPOSITORY} ${GIT_TAG}") IF(NOT EXISTS ${DEST}) file(MAKE_DIRECTORY ${DEST}) execute_process(COMMAND git init -b main @@ -71,7 +71,9 @@ message(STATUS "Fetching sparse source ${GIT_DIR} from ${GIT_REPOSITORY} ${GIT_T WORKING_DIRECTORY ${DEST}) file(APPEND ${DEST}/.git/info/sparse-checkout ${GIT_DIR}/) endif() - execute_process(COMMAND git pull --depth=1 origin ${GIT_TAG} + execute_process(COMMAND git fetch --depth=1 origin refs/tags/${GIT_TAG}:refs/tags/${GIT_TAG} + WORKING_DIRECTORY ${DEST}) + execute_process(COMMAND git checkout --quiet ${GIT_TAG} WORKING_DIRECTORY ${DEST}) endfunction() diff --git a/source/adapters/level_zero/CMakeLists.txt b/source/adapters/level_zero/CMakeLists.txt index cde6148dc9..6bfd35084d 
100644 --- a/source/adapters/level_zero/CMakeLists.txt +++ b/source/adapters/level_zero/CMakeLists.txt @@ -7,7 +7,7 @@ set(SYCL_L0_DIR "${CMAKE_CURRENT_SOURCE_DIR}/external") set(L0_DIR "${SYCL_L0_DIR}/sycl/plugins/unified_runtime/ur/adapters/level_zero") # fetch xpti proxy library for the tracing layer -FetchSource(https://github.com/intel/llvm.git sycl-nightly/20230601 "sycl/plugins/unified_runtime/ur" ${SYCL_L0_DIR}) +FetchSource(https://github.com/intel/llvm.git sycl-nightly/20230628 "sycl/plugins/unified_runtime/ur" ${SYCL_L0_DIR}) set(TARGET_NAME ur_adapter_level_zero) From ed6e5b4a61103be96f4db3e8d3fb0fa912c5a183 Mon Sep 17 00:00:00 2001 From: pbalcer Date: Thu, 29 Jun 2023 14:31:51 +0200 Subject: [PATCH 04/16] add cuda adapter --- CMakeLists.txt | 1 + README.md | 1 + ...move-sycl-namespaces-from-ur-adapter.patch | 856 ++++++++++++++++++ source/adapters/CMakeLists.txt | 13 + source/adapters/cuda/CMakeLists.txt | 78 ++ source/adapters/level_zero/CMakeLists.txt | 6 +- 6 files changed, 950 insertions(+), 5 deletions(-) create mode 100644 source/adapters/0001-SYCL-CUDA-remove-sycl-namespaces-from-ur-adapter.patch create mode 100644 source/adapters/cuda/CMakeLists.txt diff --git a/CMakeLists.txt b/CMakeLists.txt index 8b37322a26..bedb617d8c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -31,6 +31,7 @@ option(UR_ENABLE_TRACING "enable api tracing through xpti" OFF) option(VAL_USE_LIBBACKTRACE_BACKTRACE "enable libbacktrace validation backtrace for linux" OFF) option(UR_BUILD_TOOLS "build ur tools" ON) option(UR_BUILD_ADAPTER_L0 "build level 0 adapter from SYCL" OFF) +option(UR_BUILD_ADAPTER_CUDA "build cuda adapter from SYCL" OFF) set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) diff --git a/README.md b/README.md index d292b59012..e9ccf4a3a3 100644 --- a/README.md +++ b/README.md @@ -111,6 +111,7 @@ List of options provided by CMake: | UR_ENABLE_TRACING | Enable XPTI-based tracing layer | 
ON/OFF | OFF | | UR_BUILD_TOOLS | Build tools | ON/OFF | ON | | UR_BUILD_ADAPTER_L0 | Fetch and use level-zero adapter from SYCL | ON/OFF | OFF | +| UR_BUILD_ADAPTER_CUDA | Fetch and use cuda adapter from SYCL | ON/OFF | OFF | **General**: diff --git a/source/adapters/0001-SYCL-CUDA-remove-sycl-namespaces-from-ur-adapter.patch b/source/adapters/0001-SYCL-CUDA-remove-sycl-namespaces-from-ur-adapter.patch new file mode 100644 index 0000000000..e7214e94b2 --- /dev/null +++ b/source/adapters/0001-SYCL-CUDA-remove-sycl-namespaces-from-ur-adapter.patch @@ -0,0 +1,856 @@ +From e30434a4a9e893f24e0bb18051576f297d1f4f08 Mon Sep 17 00:00:00 2001 +From: pbalcer +Date: Thu, 29 Jun 2023 14:26:26 +0200 +Subject: [PATCH] [SYCL][CUDA] remove sycl namespaces from ur adapter + +--- + .../ur/adapters/cuda/common.cpp | 6 +- + .../ur/adapters/cuda/common.hpp | 5 - + .../ur/adapters/cuda/context.cpp | 2 +- + .../ur/adapters/cuda/device.cpp | 170 +++++++++--------- + .../ur/adapters/cuda/enqueue.cpp | 2 +- + .../ur/adapters/cuda/event.cpp | 12 +- + .../ur/adapters/cuda/kernel.cpp | 26 +-- + .../ur/adapters/cuda/memory.cpp | 4 +- + .../ur/adapters/cuda/queue.cpp | 2 +- + .../ur/adapters/cuda/sampler.cpp | 2 +- + 10 files changed, 113 insertions(+), 118 deletions(-) + +diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/common.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/common.cpp +index 86975e509..83264160e 100644 +--- a/sycl/plugins/unified_runtime/ur/adapters/cuda/common.cpp ++++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/common.cpp +@@ -72,17 +72,17 @@ std::string getCudaVersionString() { + return stream.str(); + } + +-void sycl::detail::ur::die(const char *Message) { ++void detail::ur::die(const char *Message) { + std::cerr << "ur_die: " << Message << std::endl; + std::terminate(); + } + +-void sycl::detail::ur::assertion(bool Condition, const char *Message) { ++void detail::ur::assertion(bool Condition, const char *Message) { + if (!Condition) + die(Message); + } + 
+-void sycl::detail::ur::cuPrint(const char *Message) { ++void detail::ur::cuPrint(const char *Message) { + std::cerr << "ur_print: " << Message << std::endl; + } + +diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/common.hpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/common.hpp +index 5cfa60901..82b38c10d 100644 +--- a/sycl/plugins/unified_runtime/ur/adapters/cuda/common.hpp ++++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/common.hpp +@@ -8,7 +8,6 @@ + #pragma once + + #include +-#include + #include + + ur_result_t mapErrorUR(CUresult Result); +@@ -37,8 +36,6 @@ extern thread_local char ErrorMessage[MaxMessageSize]; + ur_result_t ErrorCode); + + /// ------ Error handling, matching OpenCL plugin semantics. +-namespace sycl { +-__SYCL_INLINE_VER_NAMESPACE(_V1) { + namespace detail { + namespace ur { + +@@ -55,5 +52,3 @@ void assertion(bool Condition, const char *Message = nullptr); + + } // namespace ur + } // namespace detail +-} // __SYCL_INLINE_VER_NAMESPACE(_V1) +-} // namespace sycl +diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/context.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/context.cpp +index 74a32bdac..2b621383d 100644 +--- a/sycl/plugins/unified_runtime/ur/adapters/cuda/context.cpp ++++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/context.cpp +@@ -66,7 +66,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urContextGetInfo( + } + case UR_CONTEXT_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES: { + int Major = 0; +- sycl::detail::ur::assertion( ++ detail::ur::assertion( + cuDeviceGetAttribute(&Major, + CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, + hContext->getDevice()->get()) == CUDA_SUCCESS); +diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp +index 24f9d52a0..c6b6bc07e 100644 +--- a/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp ++++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp +@@ -15,7 +15,7 @@ + + int 
getAttribute(ur_device_handle_t device, CUdevice_attribute attribute) { + int value; +- sycl::detail::ur::assertion( ++ detail::ur::assertion( + cuDeviceGetAttribute(&value, attribute, device->get()) == CUDA_SUCCESS); + return value; + } +@@ -53,11 +53,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, + } + case UR_DEVICE_INFO_MAX_COMPUTE_UNITS: { + int ComputeUnits = 0; +- sycl::detail::ur::assertion( ++ detail::ur::assertion( + cuDeviceGetAttribute(&ComputeUnits, + CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, + hDevice->get()) == CUDA_SUCCESS); +- sycl::detail::ur::assertion(ComputeUnits >= 0); ++ detail::ur::assertion(ComputeUnits >= 0); + return ReturnValue(static_cast(ComputeUnits)); + } + case UR_DEVICE_INFO_MAX_WORK_ITEM_DIMENSIONS: { +@@ -69,20 +69,20 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, + } ReturnSizes; + + int MaxX = 0, MaxY = 0, MaxZ = 0; +- sycl::detail::ur::assertion( ++ detail::ur::assertion( + cuDeviceGetAttribute(&MaxX, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, + hDevice->get()) == CUDA_SUCCESS); +- sycl::detail::ur::assertion(MaxX >= 0); ++ detail::ur::assertion(MaxX >= 0); + +- sycl::detail::ur::assertion( ++ detail::ur::assertion( + cuDeviceGetAttribute(&MaxY, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, + hDevice->get()) == CUDA_SUCCESS); +- sycl::detail::ur::assertion(MaxY >= 0); ++ detail::ur::assertion(MaxY >= 0); + +- sycl::detail::ur::assertion( ++ detail::ur::assertion( + cuDeviceGetAttribute(&MaxZ, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, + hDevice->get()) == CUDA_SUCCESS); +- sycl::detail::ur::assertion(MaxZ >= 0); ++ detail::ur::assertion(MaxZ >= 0); + + ReturnSizes.Sizes[0] = size_t(MaxX); + ReturnSizes.Sizes[1] = size_t(MaxY); +@@ -95,20 +95,20 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, + size_t Sizes[MaxWorkItemDimensions]; + } ReturnSizes; + int MaxX = 0, MaxY = 0, MaxZ = 0; +- sycl::detail::ur::assertion( ++ detail::ur::assertion( 
+ cuDeviceGetAttribute(&MaxX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, + hDevice->get()) == CUDA_SUCCESS); +- sycl::detail::ur::assertion(MaxX >= 0); ++ detail::ur::assertion(MaxX >= 0); + +- sycl::detail::ur::assertion( ++ detail::ur::assertion( + cuDeviceGetAttribute(&MaxY, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, + hDevice->get()) == CUDA_SUCCESS); +- sycl::detail::ur::assertion(MaxY >= 0); ++ detail::ur::assertion(MaxY >= 0); + +- sycl::detail::ur::assertion( ++ detail::ur::assertion( + cuDeviceGetAttribute(&MaxZ, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, + hDevice->get()) == CUDA_SUCCESS); +- sycl::detail::ur::assertion(MaxZ >= 0); ++ detail::ur::assertion(MaxZ >= 0); + + ReturnSizes.Sizes[0] = size_t(MaxX); + ReturnSizes.Sizes[1] = size_t(MaxY); +@@ -118,12 +118,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, + + case UR_DEVICE_INFO_MAX_WORK_GROUP_SIZE: { + int MaxWorkGroupSize = 0; +- sycl::detail::ur::assertion( ++ detail::ur::assertion( + cuDeviceGetAttribute(&MaxWorkGroupSize, + CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, + hDevice->get()) == CUDA_SUCCESS); + +- sycl::detail::ur::assertion(MaxWorkGroupSize >= 0); ++ detail::ur::assertion(MaxWorkGroupSize >= 0); + + return ReturnValue(size_t(MaxWorkGroupSize)); + } +@@ -172,12 +172,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, + case UR_DEVICE_INFO_MAX_NUM_SUB_GROUPS: { + // Number of sub-groups = max block size / warp size + possible remainder + int MaxThreads = 0; +- sycl::detail::ur::assertion( ++ detail::ur::assertion( + cuDeviceGetAttribute(&MaxThreads, + CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, + hDevice->get()) == CUDA_SUCCESS); + int WarpSize = 0; +- sycl::detail::ur::assertion( ++ detail::ur::assertion( + cuDeviceGetAttribute(&WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, + hDevice->get()) == CUDA_SUCCESS); + int MaxWarps = (MaxThreads + WarpSize - 1) / WarpSize; +@@ -187,7 +187,7 @@ UR_APIEXPORT ur_result_t UR_APICALL 
urDeviceGetInfo(ur_device_handle_t hDevice, + // Volta provides independent thread scheduling + // TODO: Revisit for previous generation GPUs + int Major = 0; +- sycl::detail::ur::assertion( ++ detail::ur::assertion( + cuDeviceGetAttribute(&Major, + CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, + hDevice->get()) == CUDA_SUCCESS); +@@ -197,7 +197,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, + + case UR_DEVICE_INFO_ATOMIC_64: { + int Major = 0; +- sycl::detail::ur::assertion( ++ detail::ur::assertion( + cuDeviceGetAttribute(&Major, + CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, + hDevice->get()) == CUDA_SUCCESS); +@@ -214,7 +214,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, + } + case UR_DEVICE_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES: { + int Major = 0; +- sycl::detail::ur::assertion( ++ detail::ur::assertion( + cuDeviceGetAttribute(&Major, + CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, + hDevice->get()) == CUDA_SUCCESS); +@@ -255,7 +255,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, + } + case UR_DEVICE_INFO_BFLOAT16: { + int Major = 0; +- sycl::detail::ur::assertion( ++ detail::ur::assertion( + cuDeviceGetAttribute(&Major, + CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, + hDevice->get()) == CUDA_SUCCESS); +@@ -266,7 +266,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, + case UR_DEVICE_INFO_SUB_GROUP_SIZES_INTEL: { + // NVIDIA devices only support one sub-group size (the warp size) + int WarpSize = 0; +- sycl::detail::ur::assertion( ++ detail::ur::assertion( + cuDeviceGetAttribute(&WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, + hDevice->get()) == CUDA_SUCCESS); + size_t Sizes[1] = {static_cast(WarpSize)}; +@@ -274,10 +274,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, + } + case UR_DEVICE_INFO_MAX_CLOCK_FREQUENCY: { + int ClockFreq = 0; +- sycl::detail::ur::assertion( ++ 
detail::ur::assertion( + cuDeviceGetAttribute(&ClockFreq, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, + hDevice->get()) == CUDA_SUCCESS); +- sycl::detail::ur::assertion(ClockFreq >= 0); ++ detail::ur::assertion(ClockFreq >= 0); + return ReturnValue(static_cast(ClockFreq) / 1000u); + } + case UR_DEVICE_INFO_ADDRESS_BITS: { +@@ -292,7 +292,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, + // CL_DEVICE_TYPE_CUSTOM. + + size_t Global = 0; +- sycl::detail::ur::assertion(cuDeviceTotalMem(&Global, hDevice->get()) == ++ detail::ur::assertion(cuDeviceTotalMem(&Global, hDevice->get()) == + CUDA_SUCCESS); + + auto QuarterGlobal = static_cast(Global / 4u); +@@ -308,7 +308,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, + if (std::getenv("SYCL_PI_CUDA_ENABLE_IMAGE_SUPPORT") != nullptr) { + Enabled = true; + } else { +- sycl::detail::ur::cuPrint( ++ detail::ur::cuPrint( + "Images are not fully supported by the CUDA BE, their support is " + "disabled by default. Their partial support can be activated by " + "setting SYCL_PI_CUDA_ENABLE_IMAGE_SUPPORT environment variable at " +@@ -332,17 +332,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, + case UR_DEVICE_INFO_IMAGE2D_MAX_HEIGHT: { + // Take the smaller of maximum surface and maximum texture height. 
+ int TexHeight = 0; +- sycl::detail::ur::assertion( ++ detail::ur::assertion( + cuDeviceGetAttribute(&TexHeight, + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT, + hDevice->get()) == CUDA_SUCCESS); +- sycl::detail::ur::assertion(TexHeight >= 0); ++ detail::ur::assertion(TexHeight >= 0); + int SurfHeight = 0; +- sycl::detail::ur::assertion( ++ detail::ur::assertion( + cuDeviceGetAttribute(&SurfHeight, + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT, + hDevice->get()) == CUDA_SUCCESS); +- sycl::detail::ur::assertion(SurfHeight >= 0); ++ detail::ur::assertion(SurfHeight >= 0); + + int Min = std::min(TexHeight, SurfHeight); + +@@ -351,17 +351,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, + case UR_DEVICE_INFO_IMAGE2D_MAX_WIDTH: { + // Take the smaller of maximum surface and maximum texture width. + int TexWidth = 0; +- sycl::detail::ur::assertion( ++ detail::ur::assertion( + cuDeviceGetAttribute(&TexWidth, + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH, + hDevice->get()) == CUDA_SUCCESS); +- sycl::detail::ur::assertion(TexWidth >= 0); ++ detail::ur::assertion(TexWidth >= 0); + int SurfWidth = 0; +- sycl::detail::ur::assertion( ++ detail::ur::assertion( + cuDeviceGetAttribute(&SurfWidth, + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH, + hDevice->get()) == CUDA_SUCCESS); +- sycl::detail::ur::assertion(SurfWidth >= 0); ++ detail::ur::assertion(SurfWidth >= 0); + + int Min = std::min(TexWidth, SurfWidth); + +@@ -370,17 +370,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, + case UR_DEVICE_INFO_IMAGE3D_MAX_HEIGHT: { + // Take the smaller of maximum surface and maximum texture height. 
+ int TexHeight = 0; +- sycl::detail::ur::assertion( ++ detail::ur::assertion( + cuDeviceGetAttribute(&TexHeight, + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT, + hDevice->get()) == CUDA_SUCCESS); +- sycl::detail::ur::assertion(TexHeight >= 0); ++ detail::ur::assertion(TexHeight >= 0); + int SurfHeight = 0; +- sycl::detail::ur::assertion( ++ detail::ur::assertion( + cuDeviceGetAttribute(&SurfHeight, + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT, + hDevice->get()) == CUDA_SUCCESS); +- sycl::detail::ur::assertion(SurfHeight >= 0); ++ detail::ur::assertion(SurfHeight >= 0); + + int Min = std::min(TexHeight, SurfHeight); + +@@ -389,17 +389,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, + case UR_DEVICE_INFO_IMAGE3D_MAX_WIDTH: { + // Take the smaller of maximum surface and maximum texture width. + int TexWidth = 0; +- sycl::detail::ur::assertion( ++ detail::ur::assertion( + cuDeviceGetAttribute(&TexWidth, + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH, + hDevice->get()) == CUDA_SUCCESS); +- sycl::detail::ur::assertion(TexWidth >= 0); ++ detail::ur::assertion(TexWidth >= 0); + int SurfWidth = 0; +- sycl::detail::ur::assertion( ++ detail::ur::assertion( + cuDeviceGetAttribute(&SurfWidth, + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH, + hDevice->get()) == CUDA_SUCCESS); +- sycl::detail::ur::assertion(SurfWidth >= 0); ++ detail::ur::assertion(SurfWidth >= 0); + + int Min = std::min(TexWidth, SurfWidth); + +@@ -408,17 +408,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, + case UR_DEVICE_INFO_IMAGE3D_MAX_DEPTH: { + // Take the smaller of maximum surface and maximum texture depth. 
+ int TexDepth = 0; +- sycl::detail::ur::assertion( ++ detail::ur::assertion( + cuDeviceGetAttribute(&TexDepth, + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH, + hDevice->get()) == CUDA_SUCCESS); +- sycl::detail::ur::assertion(TexDepth >= 0); ++ detail::ur::assertion(TexDepth >= 0); + int SurfDepth = 0; +- sycl::detail::ur::assertion( ++ detail::ur::assertion( + cuDeviceGetAttribute(&SurfDepth, + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH, + hDevice->get()) == CUDA_SUCCESS); +- sycl::detail::ur::assertion(SurfDepth >= 0); ++ detail::ur::assertion(SurfDepth >= 0); + + int Min = std::min(TexDepth, SurfDepth); + +@@ -427,17 +427,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, + case UR_DEVICE_INFO_IMAGE_MAX_BUFFER_SIZE: { + // Take the smaller of maximum surface and maximum texture width. + int TexWidth = 0; +- sycl::detail::ur::assertion( ++ detail::ur::assertion( + cuDeviceGetAttribute(&TexWidth, + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH, + hDevice->get()) == CUDA_SUCCESS); +- sycl::detail::ur::assertion(TexWidth >= 0); ++ detail::ur::assertion(TexWidth >= 0); + int SurfWidth = 0; +- sycl::detail::ur::assertion( ++ detail::ur::assertion( + cuDeviceGetAttribute(&SurfWidth, + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH, + hDevice->get()) == CUDA_SUCCESS); +- sycl::detail::ur::assertion(SurfWidth >= 0); ++ detail::ur::assertion(SurfWidth >= 0); + + int Min = std::min(TexWidth, SurfWidth); + +@@ -459,7 +459,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, + } + case UR_DEVICE_INFO_MEM_BASE_ADDR_ALIGN: { + int MemBaseAddrAlign = 0; +- sycl::detail::ur::assertion( ++ detail::ur::assertion( + cuDeviceGetAttribute(&MemBaseAddrAlign, + CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, + hDevice->get()) == CUDA_SUCCESS); +@@ -504,27 +504,27 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, + } + case UR_DEVICE_INFO_GLOBAL_MEM_CACHE_SIZE: { + int CacheSize = 0; +- 
sycl::detail::ur::assertion( ++ detail::ur::assertion( + cuDeviceGetAttribute(&CacheSize, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, + hDevice->get()) == CUDA_SUCCESS); +- sycl::detail::ur::assertion(CacheSize >= 0); ++ detail::ur::assertion(CacheSize >= 0); + // The L2 cache is global to the GPU. + return ReturnValue(static_cast(CacheSize)); + } + case UR_DEVICE_INFO_GLOBAL_MEM_SIZE: { + size_t Bytes = 0; + // Runtime API has easy access to this value, driver API info is scarse. +- sycl::detail::ur::assertion(cuDeviceTotalMem(&Bytes, hDevice->get()) == ++ detail::ur::assertion(cuDeviceTotalMem(&Bytes, hDevice->get()) == + CUDA_SUCCESS); + return ReturnValue(uint64_t{Bytes}); + } + case UR_DEVICE_INFO_MAX_CONSTANT_BUFFER_SIZE: { + int ConstantMemory = 0; +- sycl::detail::ur::assertion( ++ detail::ur::assertion( + cuDeviceGetAttribute(&ConstantMemory, + CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, + hDevice->get()) == CUDA_SUCCESS); +- sycl::detail::ur::assertion(ConstantMemory >= 0); ++ detail::ur::assertion(ConstantMemory >= 0); + + return ReturnValue(static_cast(ConstantMemory)); + } +@@ -542,30 +542,30 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, + // CUDA has its own definition of "local memory", which maps to OpenCL's + // "private memory". 
+ int LocalMemSize = 0; +- sycl::detail::ur::assertion( ++ detail::ur::assertion( + cuDeviceGetAttribute(&LocalMemSize, + CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, + hDevice->get()) == CUDA_SUCCESS); +- sycl::detail::ur::assertion(LocalMemSize >= 0); ++ detail::ur::assertion(LocalMemSize >= 0); + return ReturnValue(static_cast(LocalMemSize)); + } + case UR_DEVICE_INFO_ERROR_CORRECTION_SUPPORT: { + int ECCEnabled = 0; +- sycl::detail::ur::assertion( ++ detail::ur::assertion( + cuDeviceGetAttribute(&ECCEnabled, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, + hDevice->get()) == CUDA_SUCCESS); + +- sycl::detail::ur::assertion((ECCEnabled == 0) | (ECCEnabled == 1)); ++ detail::ur::assertion((ECCEnabled == 0) | (ECCEnabled == 1)); + auto Result = static_cast(ECCEnabled); + return ReturnValue(Result); + } + case UR_DEVICE_INFO_HOST_UNIFIED_MEMORY: { + int IsIntegrated = 0; +- sycl::detail::ur::assertion( ++ detail::ur::assertion( + cuDeviceGetAttribute(&IsIntegrated, CU_DEVICE_ATTRIBUTE_INTEGRATED, + hDevice->get()) == CUDA_SUCCESS); + +- sycl::detail::ur::assertion((IsIntegrated == 0) | (IsIntegrated == 1)); ++ detail::ur::assertion((IsIntegrated == 0) | (IsIntegrated == 1)); + auto result = static_cast(IsIntegrated); + return ReturnValue(result); + } +@@ -620,7 +620,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, + case UR_DEVICE_INFO_NAME: { + static constexpr size_t MaxDeviceNameLength = 256u; + char Name[MaxDeviceNameLength]; +- sycl::detail::ur::assertion( ++ detail::ur::assertion( + cuDeviceGetName(Name, MaxDeviceNameLength, hDevice->get()) == + CUDA_SUCCESS); + return ReturnValue(Name, strlen(Name) + 1); +@@ -641,13 +641,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, + case UR_DEVICE_INFO_VERSION: { + std::stringstream SS; + int Major; +- sycl::detail::ur::assertion( ++ detail::ur::assertion( + cuDeviceGetAttribute(&Major, + CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, + hDevice->get()) == 
CUDA_SUCCESS); + SS << Major; + int Minor; +- sycl::detail::ur::assertion( ++ detail::ur::assertion( + cuDeviceGetAttribute(&Minor, + CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, + hDevice->get()) == CUDA_SUCCESS); +@@ -666,11 +666,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, + int Major = 0; + int Minor = 0; + +- sycl::detail::ur::assertion( ++ detail::ur::assertion( + cuDeviceGetAttribute(&Major, + CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, + hDevice->get()) == CUDA_SUCCESS); +- sycl::detail::ur::assertion( ++ detail::ur::assertion( + cuDeviceGetAttribute(&Minor, + CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, + hDevice->get()) == CUDA_SUCCESS); +@@ -847,27 +847,27 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, + case UR_DEVICE_INFO_GLOBAL_MEM_FREE: { + size_t FreeMemory = 0; + size_t TotalMemory = 0; +- sycl::detail::ur::assertion(cuMemGetInfo(&FreeMemory, &TotalMemory) == ++ detail::ur::assertion(cuMemGetInfo(&FreeMemory, &TotalMemory) == + CUDA_SUCCESS, + "failed cuMemGetInfo() API."); + return ReturnValue(FreeMemory); + } + case UR_DEVICE_INFO_MEMORY_CLOCK_RATE: { + int Value = 0; +- sycl::detail::ur::assertion( ++ detail::ur::assertion( + cuDeviceGetAttribute(&Value, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, + hDevice->get()) == CUDA_SUCCESS); +- sycl::detail::ur::assertion(Value >= 0); ++ detail::ur::assertion(Value >= 0); + // Convert kilohertz to megahertz when returning. 
+ return ReturnValue(Value / 1000); + } + case UR_DEVICE_INFO_MEMORY_BUS_WIDTH: { + int Value = 0; +- sycl::detail::ur::assertion( ++ detail::ur::assertion( + cuDeviceGetAttribute(&Value, + CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, + hDevice->get()) == CUDA_SUCCESS); +- sycl::detail::ur::assertion(Value >= 0); ++ detail::ur::assertion(Value >= 0); + return ReturnValue(Value); + } + case UR_DEVICE_INFO_MAX_COMPUTE_QUEUE_INDICES: { +@@ -875,10 +875,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, + } + case UR_DEVICE_INFO_DEVICE_ID: { + int Value = 0; +- sycl::detail::ur::assertion( ++ detail::ur::assertion( + cuDeviceGetAttribute(&Value, CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID, + hDevice->get()) == CUDA_SUCCESS); +- sycl::detail::ur::assertion(Value >= 0); ++ detail::ur::assertion(Value >= 0); + return ReturnValue(Value); + } + case UR_DEVICE_INFO_UUID: { +@@ -888,10 +888,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, + int Minor = DriverVersion % 1000 / 10; + CUuuid UUID; + if ((Major > 11) || (Major == 11 && Minor >= 4)) { +- sycl::detail::ur::assertion(cuDeviceGetUuid_v2(&UUID, hDevice->get()) == ++ detail::ur::assertion(cuDeviceGetUuid_v2(&UUID, hDevice->get()) == + CUDA_SUCCESS); + } else { +- sycl::detail::ur::assertion(cuDeviceGetUuid(&UUID, hDevice->get()) == ++ detail::ur::assertion(cuDeviceGetUuid(&UUID, hDevice->get()) == + CUDA_SUCCESS); + } + std::array Name; +@@ -900,13 +900,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, + } + case UR_DEVICE_INFO_MAX_MEMORY_BANDWIDTH: { + int Major = 0; +- sycl::detail::ur::assertion( ++ detail::ur::assertion( + cuDeviceGetAttribute(&Major, + CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, + hDevice->get()) == CUDA_SUCCESS); + + int Minor = 0; +- sycl::detail::ur::assertion( ++ detail::ur::assertion( + cuDeviceGetAttribute(&Minor, + CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, + hDevice->get()) == 
CUDA_SUCCESS); +@@ -922,7 +922,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, + } else if (IsOrinAGX) { + MemoryClockKHz = 3200000; + } else { +- sycl::detail::ur::assertion( ++ detail::ur::assertion( + cuDeviceGetAttribute(&MemoryClockKHz, + CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, + hDevice->get()) == CUDA_SUCCESS); +@@ -932,7 +932,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, + if (IsOrinAGX) { + MemoryBusWidth = 256; + } else { +- sycl::detail::ur::assertion( ++ detail::ur::assertion( + cuDeviceGetAttribute(&MemoryBusWidth, + CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, + hDevice->get()) == CUDA_SUCCESS); +@@ -977,7 +977,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, + &MaxRegisters, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, + hDevice->get())); + +- sycl::detail::ur::assertion(MaxRegisters >= 0); ++ detail::ur::assertion(MaxRegisters >= 0); + + return ReturnValue(static_cast(MaxRegisters)); + } +@@ -988,11 +988,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, + case UR_DEVICE_INFO_PCI_ADDRESS: { + constexpr size_t AddressBufferSize = 13; + char AddressBuffer[AddressBufferSize]; +- sycl::detail::ur::assertion( ++ detail::ur::assertion( + cuDeviceGetPCIBusId(AddressBuffer, AddressBufferSize, hDevice->get()) == + CUDA_SUCCESS); + // CUDA API (8.x - 12.1) guarantees 12 bytes + \0 are written +- sycl::detail::ur::assertion(strnlen(AddressBuffer, AddressBufferSize) == ++ detail::ur::assertion(strnlen(AddressBuffer, AddressBufferSize) == + 12); + return ReturnValue(AddressBuffer, + strnlen(AddressBuffer, AddressBufferSize - 1) + 1); +diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp +index 52c4c3895..55c56aee2 100644 +--- a/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp ++++ 
b/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp +@@ -806,7 +806,7 @@ static size_t imageElementByteSize(CUDA_ARRAY_DESCRIPTOR ArrayDesc) { + case CU_AD_FORMAT_FLOAT: + return 4; + default: +- sycl::detail::ur::die("Invalid image format."); ++ detail::ur::die("Invalid image format."); + return 0; + } + } +diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/event.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/event.cpp +index 8916197b7..9d86189b9 100644 +--- a/sycl/plugins/unified_runtime/ur/adapters/cuda/event.cpp ++++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/event.cpp +@@ -119,7 +119,7 @@ ur_result_t ur_event_handle_t_::record() { + try { + EventID = Queue->getNextEventID(); + if (EventID == 0) { +- sycl::detail::ur::die( ++ detail::ur::die( + "Unrecoverable program state reached in event identifier overflow"); + } + Result = UR_CHECK_ERROR(cuEventRecord(EvEnd, Stream)); +@@ -182,7 +182,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventGetInfo(ur_event_handle_t hEvent, + case UR_EVENT_INFO_CONTEXT: + return ReturnValue(hEvent->getContext()); + default: +- sycl::detail::ur::die("Event info request not implemented"); ++ detail::ur::die("Event info request not implemented"); + } + + return UR_RESULT_ERROR_INVALID_ENUMERATION; +@@ -213,7 +213,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventGetProfilingInfo( + default: + break; + } +- sycl::detail::ur::die("Event Profiling info request not implemented"); ++ detail::ur::die("Event Profiling info request not implemented"); + return {}; + } + +@@ -221,7 +221,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventSetCallback(ur_event_handle_t, + ur_execution_info_t, + ur_event_callback_t, + void *) { +- sycl::detail::ur::die("Event Callback not implemented in CUDA adapter"); ++ detail::ur::die("Event Callback not implemented in CUDA adapter"); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + +@@ -254,7 +254,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventRetain(ur_event_handle_t hEvent) { + + const 
auto RefCount = hEvent->incrementReferenceCount(); + +- sycl::detail::ur::assertion( ++ detail::ur::assertion( + RefCount != 0, "Reference count overflow detected in urEventRetain."); + + return UR_RESULT_SUCCESS; +@@ -265,7 +265,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventRelease(ur_event_handle_t hEvent) { + + // double delete or someone is messing with the ref count. + // either way, cannot safely proceed. +- sycl::detail::ur::assertion( ++ detail::ur::assertion( + hEvent->getReferenceCount() != 0, + "Reference count overflow detected in urEventRelease."); + +diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.cpp +index 358f59c49..cae080401 100644 +--- a/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.cpp ++++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.cpp +@@ -73,24 +73,24 @@ urKernelGetGroupInfo(ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, + size_t GlobalWorkSize[3] = {0, 0, 0}; + + int MaxBlockDimX{0}, MaxBlockDimY{0}, MaxBlockDimZ{0}; +- sycl::detail::ur::assertion( ++ detail::ur::assertion( + cuDeviceGetAttribute(&MaxBlockDimX, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, + hDevice->get()) == CUDA_SUCCESS); +- sycl::detail::ur::assertion( ++ detail::ur::assertion( + cuDeviceGetAttribute(&MaxBlockDimY, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, + hDevice->get()) == CUDA_SUCCESS); +- sycl::detail::ur::assertion( ++ detail::ur::assertion( + cuDeviceGetAttribute(&MaxBlockDimZ, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, + hDevice->get()) == CUDA_SUCCESS); + + int MaxGridDimX{0}, MaxGridDimY{0}, MaxGridDimZ{0}; +- sycl::detail::ur::assertion( ++ detail::ur::assertion( + cuDeviceGetAttribute(&MaxGridDimX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, + hDevice->get()) == CUDA_SUCCESS); +- sycl::detail::ur::assertion( ++ detail::ur::assertion( + cuDeviceGetAttribute(&MaxGridDimY, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, + hDevice->get()) == CUDA_SUCCESS); +- sycl::detail::ur::assertion( ++ 
detail::ur::assertion( + cuDeviceGetAttribute(&MaxGridDimZ, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, + hDevice->get()) == CUDA_SUCCESS); + +@@ -101,7 +101,7 @@ urKernelGetGroupInfo(ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, + } + case UR_KERNEL_GROUP_INFO_WORK_GROUP_SIZE: { + int MaxThreads = 0; +- sycl::detail::ur::assertion( ++ detail::ur::assertion( + cuFuncGetAttribute(&MaxThreads, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, + hKernel->get()) == CUDA_SUCCESS); + return ReturnValue(size_t(MaxThreads)); +@@ -122,7 +122,7 @@ urKernelGetGroupInfo(ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, + case UR_KERNEL_GROUP_INFO_LOCAL_MEM_SIZE: { + // OpenCL LOCAL == CUDA SHARED + int Bytes = 0; +- sycl::detail::ur::assertion( ++ detail::ur::assertion( + cuFuncGetAttribute(&Bytes, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, + hKernel->get()) == CUDA_SUCCESS); + return ReturnValue(uint64_t(Bytes)); +@@ -130,7 +130,7 @@ urKernelGetGroupInfo(ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, + case UR_KERNEL_GROUP_INFO_PREFERRED_WORK_GROUP_SIZE_MULTIPLE: { + // Work groups should be multiples of the warp size + int WarpSize = 0; +- sycl::detail::ur::assertion( ++ detail::ur::assertion( + cuDeviceGetAttribute(&WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, + hDevice->get()) == CUDA_SUCCESS); + return ReturnValue(static_cast(WarpSize)); +@@ -138,7 +138,7 @@ urKernelGetGroupInfo(ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, + case UR_KERNEL_GROUP_INFO_PRIVATE_MEM_SIZE: { + // OpenCL PRIVATE == CUDA LOCAL + int Bytes = 0; +- sycl::detail::ur::assertion( ++ detail::ur::assertion( + cuFuncGetAttribute(&Bytes, CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES, + hKernel->get()) == CUDA_SUCCESS); + return ReturnValue(uint64_t(Bytes)); +@@ -231,7 +231,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelGetInfo(ur_kernel_handle_t hKernel, + return ReturnValue(""); + case UR_KERNEL_INFO_NUM_REGS: { + int NumRegs = 0; +- sycl::detail::ur::assertion( ++ detail::ur::assertion( + 
cuFuncGetAttribute(&NumRegs, CU_FUNC_ATTRIBUTE_NUM_REGS, + hKernel->get()) == CUDA_SUCCESS); + return ReturnValue(static_cast(NumRegs)); +@@ -254,7 +254,7 @@ urKernelGetSubGroupInfo(ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, + case UR_KERNEL_SUB_GROUP_INFO_MAX_SUB_GROUP_SIZE: { + // Sub-group size is equivalent to warp size + int WarpSize = 0; +- sycl::detail::ur::assertion( ++ detail::ur::assertion( + cuDeviceGetAttribute(&WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, + hDevice->get()) == CUDA_SUCCESS); + return ReturnValue(static_cast(WarpSize)); +@@ -262,7 +262,7 @@ urKernelGetSubGroupInfo(ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, + case UR_KERNEL_SUB_GROUP_INFO_MAX_NUM_SUB_GROUPS: { + // Number of sub-groups = max block size / warp size + possible remainder + int MaxThreads = 0; +- sycl::detail::ur::assertion( ++ detail::ur::assertion( + cuFuncGetAttribute(&MaxThreads, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, + hKernel->get()) == CUDA_SUCCESS); + int WarpSize = 0; +diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/memory.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/memory.cpp +index b19acea31..ecf840330 100644 +--- a/sycl/plugins/unified_runtime/ur/adapters/cuda/memory.cpp ++++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/memory.cpp +@@ -162,7 +162,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemRelease(ur_mem_handle_t hMem) { + // error for which it is unclear if the function that reported it succeeded + // or not. Either way, the state of the program is compromised and likely + // unrecoverable. 
+- sycl::detail::ur::die( ++ detail::ur::die( + "Unrecoverable program state reached in urMemRelease"); + } + +@@ -331,7 +331,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemImageCreate( + PixelTypeSizeBytes = 4; + break; + default: +- sycl::detail::ur::die( ++ detail::ur::die( + "urMemImageCreate given unsupported image_channel_data_type"); + } + +diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.cpp +index 05443eeed..32391fec5 100644 +--- a/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.cpp ++++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.cpp +@@ -265,7 +265,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueCreateWithNativeHandle( + else if (CuFlags == CU_STREAM_NON_BLOCKING) + Flags = UR_QUEUE_FLAG_SYNC_WITH_DEFAULT_STREAM; + else +- sycl::detail::ur::die("Unknown cuda stream"); ++ detail::ur::die("Unknown cuda stream"); + + std::vector ComputeCuStreams(1, CuStream); + std::vector TransferCuStreams(0); +diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/sampler.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/sampler.cpp +index 36ec89fb9..836e47f98 100644 +--- a/sycl/plugins/unified_runtime/ur/adapters/cuda/sampler.cpp ++++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/sampler.cpp +@@ -73,7 +73,7 @@ urSamplerRelease(ur_sampler_handle_t hSampler) { + + // double delete or someone is messing with the ref count. + // either way, cannot safely proceed. 
+- sycl::detail::ur::assertion( ++ detail::ur::assertion( + hSampler->getReferenceCount() != 0, + "Reference count overflow detected in urSamplerRelease."); + +-- +2.41.0 + diff --git a/source/adapters/CMakeLists.txt b/source/adapters/CMakeLists.txt index 1d77f91a55..fef3d99888 100644 --- a/source/adapters/CMakeLists.txt +++ b/source/adapters/CMakeLists.txt @@ -5,6 +5,19 @@ add_subdirectory(null) + +if(UR_BUILD_ADAPTER_L0 OR UR_BUILD_ADAPTER_CUDA) + # fetch adapter sources from SYCL + set(SYCL_ADAPTER_DIR "${CMAKE_CURRENT_SOURCE_DIR}/external") + FetchSource(https://github.com/intel/llvm.git sycl-nightly/20230628 "sycl/plugins/unified_runtime/ur" ${SYCL_ADAPTER_DIR}) + execute_process(COMMAND git apply ../0001-SYCL-CUDA-remove-sycl-namespaces-from-ur-adapter.patch + WORKING_DIRECTORY ${SYCL_ADAPTER_DIR}) +endif() + if(UR_BUILD_ADAPTER_L0) add_subdirectory(level_zero) endif() + +if(UR_BUILD_ADAPTER_CUDA) +add_subdirectory(cuda) +endif() diff --git a/source/adapters/cuda/CMakeLists.txt b/source/adapters/cuda/CMakeLists.txt new file mode 100644 index 0000000000..d0bc3fd6b2 --- /dev/null +++ b/source/adapters/cuda/CMakeLists.txt @@ -0,0 +1,78 @@ +# Copyright (C) 2022 Intel Corporation +# Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. 
+# See LICENSE.TXT +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +set(CUDA_DIR "${SYCL_ADAPTER_DIR}/sycl/plugins/unified_runtime/ur/adapters/cuda") + +set(TARGET_NAME ur_adapter_cuda) + +add_library(${TARGET_NAME} + SHARED + ${CUDA_DIR}/ur_interface_loader.cpp + ${CUDA_DIR}/common.hpp + ${CUDA_DIR}/common.cpp + ${CUDA_DIR}/context.hpp + ${CUDA_DIR}/context.cpp + ${CUDA_DIR}/device.hpp + ${CUDA_DIR}/device.cpp + ${CUDA_DIR}/enqueue.cpp + ${CUDA_DIR}/event.hpp + ${CUDA_DIR}/event.cpp + ${CUDA_DIR}/kernel.hpp + ${CUDA_DIR}/kernel.cpp + ${CUDA_DIR}/memory.hpp + ${CUDA_DIR}/memory.cpp + ${CUDA_DIR}/platform.hpp + ${CUDA_DIR}/platform.cpp + ${CUDA_DIR}/program.hpp + ${CUDA_DIR}/program.cpp + ${CUDA_DIR}/queue.hpp + ${CUDA_DIR}/queue.cpp + ${CUDA_DIR}/sampler.hpp + ${CUDA_DIR}/sampler.cpp + ${CUDA_DIR}/tracing.cpp + ${CUDA_DIR}/usm.cpp + ${CUDA_DIR}/../../ur.cpp + ${CUDA_DIR}/../../ur.hpp + ${CUDA_DIR}/../../usm_allocator.cpp + ${CUDA_DIR}/../../usm_allocator.hpp + ${CUDA_DIR}/../../usm_allocator_config.cpp + ${CUDA_DIR}/../../usm_allocator_config.hpp +) + +set_target_properties(${TARGET_NAME} PROPERTIES + VERSION "${PROJECT_VERSION_MAJOR}.${PROJECT_VERSION_MINOR}.${PROJECT_VERSION_PATCH}" + SOVERSION "${PROJECT_VERSION_MAJOR}" +) + +find_package(Threads REQUIRED) +find_package(CUDA 10.1 REQUIRED) + +# Make imported library global to use it within the project. 
+add_library(cudadrv SHARED IMPORTED GLOBAL) + +if (WIN32) + set_target_properties( + cudadrv PROPERTIES + IMPORTED_IMPLIB ${CUDA_CUDA_LIBRARY} + INTERFACE_INCLUDE_DIRECTORIES ${CUDA_INCLUDE_DIRS} + ) +else() + set_target_properties( + cudadrv PROPERTIES + IMPORTED_LOCATION ${CUDA_CUDA_LIBRARY} + INTERFACE_INCLUDE_DIRECTORIES ${CUDA_INCLUDE_DIRS} + ) +endif() + +target_link_libraries(${TARGET_NAME} PRIVATE + ${PROJECT_NAME}::headers + ${PROJECT_NAME}::common + Threads::Threads + cudadrv +) + +target_include_directories(${TARGET_NAME} PRIVATE + ${CUDA_DIR}/../../../ +) diff --git a/source/adapters/level_zero/CMakeLists.txt b/source/adapters/level_zero/CMakeLists.txt index 6bfd35084d..e1d483cdd4 100644 --- a/source/adapters/level_zero/CMakeLists.txt +++ b/source/adapters/level_zero/CMakeLists.txt @@ -3,11 +3,7 @@ # See LICENSE.TXT # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -set(SYCL_L0_DIR "${CMAKE_CURRENT_SOURCE_DIR}/external") -set(L0_DIR "${SYCL_L0_DIR}/sycl/plugins/unified_runtime/ur/adapters/level_zero") - -# fetch xpti proxy library for the tracing layer -FetchSource(https://github.com/intel/llvm.git sycl-nightly/20230628 "sycl/plugins/unified_runtime/ur" ${SYCL_L0_DIR}) +set(L0_DIR "${SYCL_ADAPTER_DIR}/sycl/plugins/unified_runtime/ur/adapters/level_zero") set(TARGET_NAME ur_adapter_level_zero) From 44e5ed2a526ef113bba4b391b4be36669a312264 Mon Sep 17 00:00:00 2001 From: pbalcer Date: Wed, 5 Jul 2023 13:38:25 +0200 Subject: [PATCH 05/16] suppress git warnings about applying cuda patch twice --- source/adapters/CMakeLists.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/source/adapters/CMakeLists.txt b/source/adapters/CMakeLists.txt index fef3d99888..d9d4d29156 100644 --- a/source/adapters/CMakeLists.txt +++ b/source/adapters/CMakeLists.txt @@ -10,7 +10,8 @@ if(UR_BUILD_ADAPTER_L0 OR UR_BUILD_ADAPTER_CUDA) # fetch adapter sources from SYCL set(SYCL_ADAPTER_DIR "${CMAKE_CURRENT_SOURCE_DIR}/external") 
FetchSource(https://github.com/intel/llvm.git sycl-nightly/20230628 "sycl/plugins/unified_runtime/ur" ${SYCL_ADAPTER_DIR}) - execute_process(COMMAND git apply ../0001-SYCL-CUDA-remove-sycl-namespaces-from-ur-adapter.patch + + execute_process(COMMAND git apply --quiet ../0001-SYCL-CUDA-remove-sycl-namespaces-from-ur-adapter.patch WORKING_DIRECTORY ${SYCL_ADAPTER_DIR}) endif() From 2b50550aee0e2a8a5319a6159bbc2705328e372b Mon Sep 17 00:00:00 2001 From: Petr Vesely Date: Wed, 12 Jul 2023 11:57:37 +0100 Subject: [PATCH 06/16] [UR] update adapter to use newer sycl plugin version --- cmake/helpers.cmake | 4 +- ...move-sycl-namespaces-from-ur-adapter.patch | 390 +++++++++++++----- source/adapters/CMakeLists.txt | 9 +- 3 files changed, 301 insertions(+), 102 deletions(-) diff --git a/cmake/helpers.cmake b/cmake/helpers.cmake index a4a3a70e78..0892fecdb4 100644 --- a/cmake/helpers.cmake +++ b/cmake/helpers.cmake @@ -63,7 +63,9 @@ function(FetchSource GIT_REPOSITORY GIT_TAG GIT_DIR DEST) message(STATUS "Fetching sparse source ${GIT_DIR} from ${GIT_REPOSITORY} ${GIT_TAG}") IF(NOT EXISTS ${DEST}) file(MAKE_DIRECTORY ${DEST}) - execute_process(COMMAND git init -b main + execute_process(COMMAND git init + WORKING_DIRECTORY ${DEST}) + execute_process(COMMAND git checkout -b main WORKING_DIRECTORY ${DEST}) execute_process(COMMAND git remote add origin ${GIT_REPOSITORY} WORKING_DIRECTORY ${DEST}) diff --git a/source/adapters/0001-SYCL-CUDA-remove-sycl-namespaces-from-ur-adapter.patch b/source/adapters/0001-SYCL-CUDA-remove-sycl-namespaces-from-ur-adapter.patch index e7214e94b2..8153e1cb85 100644 --- a/source/adapters/0001-SYCL-CUDA-remove-sycl-namespaces-from-ur-adapter.patch +++ b/source/adapters/0001-SYCL-CUDA-remove-sycl-namespaces-from-ur-adapter.patch @@ -1,23 +1,174 @@ -From e30434a4a9e893f24e0bb18051576f297d1f4f08 Mon Sep 17 00:00:00 2001 +From fd78871a6bd2ff41ff37b8bd786c17f59911c677 Mon Sep 17 00:00:00 2001 From: pbalcer -Date: Thu, 29 Jun 2023 14:26:26 +0200 -Subject: 
[PATCH] [SYCL][CUDA] remove sycl namespaces from ur adapter +Date: Thu, 29 Jun 2023 15:11:43 +0200 +Subject: [PATCH] [SYCL][CUDA] remove sycl dependencies from cuda ur adapter +This was preventing out-of-tree build of the adapter for standalone +use with unified runtime. + +Signed-off-by: Piotr Balcer --- + .../ur/adapters/cuda/command_buffer.cpp | 52 ++--- .../ur/adapters/cuda/common.cpp | 6 +- .../ur/adapters/cuda/common.hpp | 5 - .../ur/adapters/cuda/context.cpp | 2 +- - .../ur/adapters/cuda/device.cpp | 170 +++++++++--------- + .../ur/adapters/cuda/device.cpp | 209 +++++++++--------- .../ur/adapters/cuda/enqueue.cpp | 2 +- - .../ur/adapters/cuda/event.cpp | 12 +- - .../ur/adapters/cuda/kernel.cpp | 26 +-- - .../ur/adapters/cuda/memory.cpp | 4 +- + .../ur/adapters/cuda/event.cpp | 17 +- + .../ur/adapters/cuda/kernel.cpp | 42 ++-- + .../ur/adapters/cuda/memory.cpp | 5 +- .../ur/adapters/cuda/queue.cpp | 2 +- .../ur/adapters/cuda/sampler.cpp | 2 +- - 10 files changed, 113 insertions(+), 118 deletions(-) + 11 files changed, 167 insertions(+), 177 deletions(-) +diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/command_buffer.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/command_buffer.cpp +index c83e9e732303..57956cb64a67 100644 +--- a/sycl/plugins/unified_runtime/ur/adapters/cuda/command_buffer.cpp ++++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/command_buffer.cpp +@@ -19,8 +19,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferCreateExp( + (void)hDevice; + (void)pCommandBufferDesc; + (void)phCommandBuffer; +- sycl::detail::ur::die("Experimental Command-buffer feature is not " +- "implemented for CUDA adapter."); ++ detail::ur::die("Experimental Command-buffer feature is not " ++ "implemented for CUDA adapter."); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + +@@ -28,8 +28,8 @@ UR_APIEXPORT ur_result_t UR_APICALL + urCommandBufferRetainExp(ur_exp_command_buffer_handle_t hCommandBuffer) { + (void)hCommandBuffer; + +- 
sycl::detail::ur::die("Experimental Command-buffer feature is not " +- "implemented for CUDA adapter."); ++ detail::ur::die("Experimental Command-buffer feature is not " ++ "implemented for CUDA adapter."); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + +@@ -37,8 +37,8 @@ UR_APIEXPORT ur_result_t UR_APICALL + urCommandBufferReleaseExp(ur_exp_command_buffer_handle_t hCommandBuffer) { + (void)hCommandBuffer; + +- sycl::detail::ur::die("Experimental Command-buffer feature is not " +- "implemented for CUDA adapter."); ++ detail::ur::die("Experimental Command-buffer feature is not " ++ "implemented for CUDA adapter."); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + +@@ -46,8 +46,8 @@ UR_APIEXPORT ur_result_t UR_APICALL + urCommandBufferFinalizeExp(ur_exp_command_buffer_handle_t hCommandBuffer) { + (void)hCommandBuffer; + +- sycl::detail::ur::die("Experimental Command-buffer feature is not " +- "implemented for CUDA adapter."); ++ detail::ur::die("Experimental Command-buffer feature is not " ++ "implemented for CUDA adapter."); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + +@@ -68,8 +68,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( + (void)pSyncPointWaitList; + (void)pSyncPoint; + +- sycl::detail::ur::die("Experimental Command-buffer feature is not " +- "implemented for CUDA adapter."); ++ detail::ur::die("Experimental Command-buffer feature is not " ++ "implemented for CUDA adapter."); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + +@@ -86,8 +86,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemcpyUSMExp( + (void)pSyncPointWaitList; + (void)pSyncPoint; + +- sycl::detail::ur::die("Experimental Command-buffer feature is not " +- "implemented for CUDA adapter."); ++ detail::ur::die("Experimental Command-buffer feature is not " ++ "implemented for CUDA adapter."); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + +@@ -107,8 +107,8 @@ UR_APIEXPORT ur_result_t UR_APICALL 
urCommandBufferAppendMembufferCopyExp( + (void)pSyncPointWaitList; + (void)pSyncPoint; + +- sycl::detail::ur::die("Experimental Command-buffer feature is not " +- "implemented for CUDA adapter."); ++ detail::ur::die("Experimental Command-buffer feature is not " ++ "implemented for CUDA adapter."); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + +@@ -134,8 +134,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMembufferCopyRectExp( + (void)pSyncPointWaitList; + (void)pSyncPoint; + +- sycl::detail::ur::die("Experimental Command-buffer feature is not " +- "implemented for CUDA adapter."); ++ detail::ur::die("Experimental Command-buffer feature is not " ++ "implemented for CUDA adapter."); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + +@@ -155,8 +155,8 @@ ur_result_t UR_APICALL urCommandBufferAppendMembufferWriteExp( + (void)pSyncPointWaitList; + (void)pSyncPoint; + +- sycl::detail::ur::die("Experimental Command-buffer feature is not " +- "implemented for CUDA adapter."); ++ detail::ur::die("Experimental Command-buffer feature is not " ++ "implemented for CUDA adapter."); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + +@@ -175,8 +175,8 @@ ur_result_t UR_APICALL urCommandBufferAppendMembufferReadExp( + (void)pSyncPointWaitList; + (void)pSyncPoint; + +- sycl::detail::ur::die("Experimental Command-buffer feature is not " +- "implemented for CUDA adapter."); ++ detail::ur::die("Experimental Command-buffer feature is not " ++ "implemented for CUDA adapter."); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + +@@ -203,8 +203,8 @@ ur_result_t UR_APICALL urCommandBufferAppendMembufferWriteRectExp( + (void)pSyncPointWaitList; + (void)pSyncPoint; + +- sycl::detail::ur::die("Experimental Command-buffer feature is not " +- "implemented for CUDA adapter."); ++ detail::ur::die("Experimental Command-buffer feature is not " ++ "implemented for CUDA adapter."); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + +@@ -232,8 +232,8 @@ ur_result_t UR_APICALL 
urCommandBufferAppendMembufferReadRectExp( + (void)pSyncPointWaitList; + (void)pSyncPoint; + +- sycl::detail::ur::die("Experimental Command-buffer feature is not " +- "implemented for CUDA adapter."); ++ detail::ur::die("Experimental Command-buffer feature is not " ++ "implemented for CUDA adapter."); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + +@@ -247,7 +247,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp( + (void)phEventWaitList; + (void)phEvent; + +- sycl::detail::ur::die("Experimental Command-buffer feature is not " +- "implemented for CUDA adapter."); ++ detail::ur::die("Experimental Command-buffer feature is not " ++ "implemented for CUDA adapter."); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/common.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/common.cpp -index 86975e509..83264160e 100644 +index 86975e509725..83264160e700 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/common.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/common.cpp @@ -72,17 +72,17 @@ std::string getCudaVersionString() { @@ -42,7 +193,7 @@ index 86975e509..83264160e 100644 } diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/common.hpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/common.hpp -index 5cfa60901..82b38c10d 100644 +index 5cfa609018b2..82b38c10d449 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/common.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/common.hpp @@ -8,7 +8,6 @@ @@ -69,7 +220,7 @@ index 5cfa60901..82b38c10d 100644 -} // __SYCL_INLINE_VER_NAMESPACE(_V1) -} // namespace sycl diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/context.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/context.cpp -index 74a32bdac..2b621383d 100644 +index 74a32bdac274..2b621383da09 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/context.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/context.cpp @@ -66,7 +66,7 @@ 
UR_APIEXPORT ur_result_t UR_APICALL urContextGetInfo( @@ -82,7 +233,7 @@ index 74a32bdac..2b621383d 100644 CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, hContext->getDevice()->get()) == CUDA_SUCCESS); diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp -index 24f9d52a0..c6b6bc07e 100644 +index 52d4e3badc8f..a81599d629a7 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp @@ -15,7 +15,7 @@ @@ -177,7 +328,7 @@ index 24f9d52a0..c6b6bc07e 100644 return ReturnValue(size_t(MaxWorkGroupSize)); } -@@ -172,12 +172,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, +@@ -172,14 +172,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, case UR_DEVICE_INFO_MAX_NUM_SUB_GROUPS: { // Number of sub-groups = max block size / warp size + possible remainder int MaxThreads = 0; @@ -188,10 +339,14 @@ index 24f9d52a0..c6b6bc07e 100644 hDevice->get()) == CUDA_SUCCESS); int WarpSize = 0; - sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, - hDevice->get()) == CUDA_SUCCESS); +- cuDeviceGetAttribute(&WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, +- hDevice->get()) == CUDA_SUCCESS); ++ detail::ur::assertion(cuDeviceGetAttribute(&WarpSize, ++ CU_DEVICE_ATTRIBUTE_WARP_SIZE, ++ hDevice->get()) == CUDA_SUCCESS); int MaxWarps = (MaxThreads + WarpSize - 1) / WarpSize; + return ReturnValue(MaxWarps); + } @@ -187,7 +187,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, // Volta provides independent thread scheduling // TODO: Revisit for previous generation GPUs @@ -228,37 +383,43 @@ index 24f9d52a0..c6b6bc07e 100644 cuDeviceGetAttribute(&Major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, hDevice->get()) == CUDA_SUCCESS); -@@ -266,7 +266,7 @@ UR_APIEXPORT ur_result_t UR_APICALL 
urDeviceGetInfo(ur_device_handle_t hDevice, +@@ -266,18 +266,18 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, case UR_DEVICE_INFO_SUB_GROUP_SIZES_INTEL: { // NVIDIA devices only support one sub-group size (the warp size) int WarpSize = 0; - sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, - hDevice->get()) == CUDA_SUCCESS); +- cuDeviceGetAttribute(&WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, +- hDevice->get()) == CUDA_SUCCESS); ++ detail::ur::assertion(cuDeviceGetAttribute(&WarpSize, ++ CU_DEVICE_ATTRIBUTE_WARP_SIZE, ++ hDevice->get()) == CUDA_SUCCESS); size_t Sizes[1] = {static_cast(WarpSize)}; -@@ -274,10 +274,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, + return ReturnValue(Sizes, 1); } case UR_DEVICE_INFO_MAX_CLOCK_FREQUENCY: { int ClockFreq = 0; - sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&ClockFreq, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, - hDevice->get()) == CUDA_SUCCESS); +- cuDeviceGetAttribute(&ClockFreq, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, +- hDevice->get()) == CUDA_SUCCESS); - sycl::detail::ur::assertion(ClockFreq >= 0); ++ detail::ur::assertion(cuDeviceGetAttribute(&ClockFreq, ++ CU_DEVICE_ATTRIBUTE_CLOCK_RATE, ++ hDevice->get()) == CUDA_SUCCESS); + detail::ur::assertion(ClockFreq >= 0); return ReturnValue(static_cast(ClockFreq) / 1000u); } case UR_DEVICE_INFO_ADDRESS_BITS: { -@@ -292,7 +292,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, +@@ -292,8 +292,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, // CL_DEVICE_TYPE_CUSTOM. 
size_t Global = 0; - sycl::detail::ur::assertion(cuDeviceTotalMem(&Global, hDevice->get()) == +- CUDA_SUCCESS); + detail::ur::assertion(cuDeviceTotalMem(&Global, hDevice->get()) == - CUDA_SUCCESS); ++ CUDA_SUCCESS); auto QuarterGlobal = static_cast(Global / 4u); + @@ -308,7 +308,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, if (std::getenv("SYCL_PI_CUDA_ENABLE_IMAGE_SUPPORT") != nullptr) { Enabled = true; @@ -426,8 +587,9 @@ index 24f9d52a0..c6b6bc07e 100644 size_t Bytes = 0; // Runtime API has easy access to this value, driver API info is scarse. - sycl::detail::ur::assertion(cuDeviceTotalMem(&Bytes, hDevice->get()) == +- CUDA_SUCCESS); + detail::ur::assertion(cuDeviceTotalMem(&Bytes, hDevice->get()) == - CUDA_SUCCESS); ++ CUDA_SUCCESS); return ReturnValue(uint64_t{Bytes}); } case UR_DEVICE_INFO_MAX_CONSTANT_BUFFER_SIZE: { @@ -458,9 +620,11 @@ index 24f9d52a0..c6b6bc07e 100644 case UR_DEVICE_INFO_ERROR_CORRECTION_SUPPORT: { int ECCEnabled = 0; - sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&ECCEnabled, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, - hDevice->get()) == CUDA_SUCCESS); +- cuDeviceGetAttribute(&ECCEnabled, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, +- hDevice->get()) == CUDA_SUCCESS); ++ detail::ur::assertion(cuDeviceGetAttribute(&ECCEnabled, ++ CU_DEVICE_ATTRIBUTE_ECC_ENABLED, ++ hDevice->get()) == CUDA_SUCCESS); - sycl::detail::ur::assertion((ECCEnabled == 0) | (ECCEnabled == 1)); + detail::ur::assertion((ECCEnabled == 0) | (ECCEnabled == 1)); @@ -470,25 +634,30 @@ index 24f9d52a0..c6b6bc07e 100644 case UR_DEVICE_INFO_HOST_UNIFIED_MEMORY: { int IsIntegrated = 0; - sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&IsIntegrated, CU_DEVICE_ATTRIBUTE_INTEGRATED, - hDevice->get()) == CUDA_SUCCESS); +- cuDeviceGetAttribute(&IsIntegrated, CU_DEVICE_ATTRIBUTE_INTEGRATED, +- hDevice->get()) == CUDA_SUCCESS); ++ detail::ur::assertion(cuDeviceGetAttribute(&IsIntegrated, ++ 
CU_DEVICE_ATTRIBUTE_INTEGRATED, ++ hDevice->get()) == CUDA_SUCCESS); - sycl::detail::ur::assertion((IsIntegrated == 0) | (IsIntegrated == 1)); + detail::ur::assertion((IsIntegrated == 0) | (IsIntegrated == 1)); auto result = static_cast(IsIntegrated); return ReturnValue(result); } -@@ -620,7 +620,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, +@@ -620,9 +620,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, case UR_DEVICE_INFO_NAME: { static constexpr size_t MaxDeviceNameLength = 256u; char Name[MaxDeviceNameLength]; - sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetName(Name, MaxDeviceNameLength, hDevice->get()) == - CUDA_SUCCESS); +- cuDeviceGetName(Name, MaxDeviceNameLength, hDevice->get()) == +- CUDA_SUCCESS); ++ detail::ur::assertion(cuDeviceGetName(Name, MaxDeviceNameLength, ++ hDevice->get()) == CUDA_SUCCESS); return ReturnValue(Name, strlen(Name) + 1); -@@ -641,13 +641,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, + } + case UR_DEVICE_INFO_VENDOR: { +@@ -641,13 +640,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, case UR_DEVICE_INFO_VERSION: { std::stringstream SS; int Major; @@ -504,7 +673,7 @@ index 24f9d52a0..c6b6bc07e 100644 cuDeviceGetAttribute(&Minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, hDevice->get()) == CUDA_SUCCESS); -@@ -666,11 +666,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, +@@ -666,11 +665,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, int Major = 0; int Minor = 0; @@ -518,14 +687,16 @@ index 24f9d52a0..c6b6bc07e 100644 cuDeviceGetAttribute(&Minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, hDevice->get()) == CUDA_SUCCESS); -@@ -847,27 +847,27 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, +@@ -847,27 +846,27 @@ UR_APIEXPORT ur_result_t UR_APICALL 
urDeviceGetInfo(ur_device_handle_t hDevice, case UR_DEVICE_INFO_GLOBAL_MEM_FREE: { size_t FreeMemory = 0; size_t TotalMemory = 0; - sycl::detail::ur::assertion(cuMemGetInfo(&FreeMemory, &TotalMemory) == +- CUDA_SUCCESS, +- "failed cuMemGetInfo() API."); + detail::ur::assertion(cuMemGetInfo(&FreeMemory, &TotalMemory) == - CUDA_SUCCESS, - "failed cuMemGetInfo() API."); ++ CUDA_SUCCESS, ++ "failed cuMemGetInfo() API."); return ReturnValue(FreeMemory); } case UR_DEVICE_INFO_MEMORY_CLOCK_RATE: { @@ -551,7 +722,7 @@ index 24f9d52a0..c6b6bc07e 100644 return ReturnValue(Value); } case UR_DEVICE_INFO_MAX_COMPUTE_QUEUE_INDICES: { -@@ -875,10 +875,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, +@@ -875,20 +874,20 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, } case UR_DEVICE_INFO_DEVICE_ID: { int Value = 0; @@ -564,20 +735,21 @@ index 24f9d52a0..c6b6bc07e 100644 return ReturnValue(Value); } case UR_DEVICE_INFO_UUID: { -@@ -888,10 +888,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, - int Minor = DriverVersion % 1000 / 10; CUuuid UUID; - if ((Major > 11) || (Major == 11 && Minor >= 4)) { -- sycl::detail::ur::assertion(cuDeviceGetUuid_v2(&UUID, hDevice->get()) == -+ detail::ur::assertion(cuDeviceGetUuid_v2(&UUID, hDevice->get()) == - CUDA_SUCCESS); - } else { -- sycl::detail::ur::assertion(cuDeviceGetUuid(&UUID, hDevice->get()) == -+ detail::ur::assertion(cuDeviceGetUuid(&UUID, hDevice->get()) == - CUDA_SUCCESS); - } + #if (CUDA_VERSION >= 11040) +- sycl::detail::ur::assertion(cuDeviceGetUuid_v2(&UUID, hDevice->get()) == +- CUDA_SUCCESS); ++ detail::ur::assertion(cuDeviceGetUuid_v2(&UUID, hDevice->get()) == ++ CUDA_SUCCESS); + #else +- sycl::detail::ur::assertion(cuDeviceGetUuid(&UUID, hDevice->get()) == +- CUDA_SUCCESS); ++ detail::ur::assertion(cuDeviceGetUuid(&UUID, hDevice->get()) == ++ CUDA_SUCCESS); + #endif std::array Name; -@@ -900,13 +900,13 @@ UR_APIEXPORT 
ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, + std::copy(UUID.bytes, UUID.bytes + 16, Name.begin()); +@@ -896,13 +895,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, } case UR_DEVICE_INFO_MAX_MEMORY_BANDWIDTH: { int Major = 0; @@ -593,7 +765,7 @@ index 24f9d52a0..c6b6bc07e 100644 cuDeviceGetAttribute(&Minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, hDevice->get()) == CUDA_SUCCESS); -@@ -922,7 +922,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, +@@ -918,7 +917,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, } else if (IsOrinAGX) { MemoryClockKHz = 3200000; } else { @@ -602,7 +774,7 @@ index 24f9d52a0..c6b6bc07e 100644 cuDeviceGetAttribute(&MemoryClockKHz, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, hDevice->get()) == CUDA_SUCCESS); -@@ -932,7 +932,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, +@@ -928,7 +927,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, if (IsOrinAGX) { MemoryBusWidth = 256; } else { @@ -611,7 +783,7 @@ index 24f9d52a0..c6b6bc07e 100644 cuDeviceGetAttribute(&MemoryBusWidth, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, hDevice->get()) == CUDA_SUCCESS); -@@ -977,7 +977,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, +@@ -973,7 +972,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, &MaxRegisters, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, hDevice->get())); @@ -620,25 +792,27 @@ index 24f9d52a0..c6b6bc07e 100644 return ReturnValue(static_cast(MaxRegisters)); } -@@ -988,11 +988,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, +@@ -984,12 +983,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, case UR_DEVICE_INFO_PCI_ADDRESS: { constexpr size_t AddressBufferSize = 13; char 
AddressBuffer[AddressBufferSize]; - sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetPCIBusId(AddressBuffer, AddressBufferSize, hDevice->get()) == - CUDA_SUCCESS); +- cuDeviceGetPCIBusId(AddressBuffer, AddressBufferSize, hDevice->get()) == +- CUDA_SUCCESS); ++ detail::ur::assertion(cuDeviceGetPCIBusId(AddressBuffer, AddressBufferSize, ++ hDevice->get()) == CUDA_SUCCESS); // CUDA API (8.x - 12.1) guarantees 12 bytes + \0 are written - sycl::detail::ur::assertion(strnlen(AddressBuffer, AddressBufferSize) == -+ detail::ur::assertion(strnlen(AddressBuffer, AddressBufferSize) == - 12); +- 12); ++ detail::ur::assertion(strnlen(AddressBuffer, AddressBufferSize) == 12); return ReturnValue(AddressBuffer, strnlen(AddressBuffer, AddressBufferSize - 1) + 1); + } diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp -index 52c4c3895..55c56aee2 100644 +index 1cfc5cc40a4a..792f69092682 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp -@@ -806,7 +806,7 @@ static size_t imageElementByteSize(CUDA_ARRAY_DESCRIPTOR ArrayDesc) { +@@ -794,7 +794,7 @@ static size_t imageElementByteSize(CUDA_ARRAY_DESCRIPTOR ArrayDesc) { case CU_AD_FORMAT_FLOAT: return 4; default: @@ -648,7 +822,7 @@ index 52c4c3895..55c56aee2 100644 } } diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/event.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/event.cpp -index 8916197b7..9d86189b9 100644 +index 8916197b73f1..066c0498f1d0 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/event.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/event.cpp @@ -119,7 +119,7 @@ ur_result_t ur_event_handle_t_::record() { @@ -687,26 +861,31 @@ index 8916197b7..9d86189b9 100644 return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } -@@ -254,7 +254,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventRetain(ur_event_handle_t hEvent) { +@@ 
-254,8 +254,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventRetain(ur_event_handle_t hEvent) { const auto RefCount = hEvent->incrementReferenceCount(); - sycl::detail::ur::assertion( -+ detail::ur::assertion( - RefCount != 0, "Reference count overflow detected in urEventRetain."); +- RefCount != 0, "Reference count overflow detected in urEventRetain."); ++ detail::ur::assertion(RefCount != 0, ++ "Reference count overflow detected in urEventRetain."); return UR_RESULT_SUCCESS; -@@ -265,7 +265,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventRelease(ur_event_handle_t hEvent) { + } +@@ -265,9 +265,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventRelease(ur_event_handle_t hEvent) { // double delete or someone is messing with the ref count. // either way, cannot safely proceed. - sycl::detail::ur::assertion( -+ detail::ur::assertion( - hEvent->getReferenceCount() != 0, - "Reference count overflow detected in urEventRelease."); +- hEvent->getReferenceCount() != 0, +- "Reference count overflow detected in urEventRelease."); ++ detail::ur::assertion(hEvent->getReferenceCount() != 0, ++ "Reference count overflow detected in urEventRelease."); + // decrement ref count. If it is 0, delete the event. 
+ if (hEvent->decrementReferenceCount() == 0) { diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.cpp -index 358f59c49..cae080401 100644 +index 358f59c499e1..7d46ce039bab 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.cpp @@ -73,24 +73,24 @@ urKernelGetGroupInfo(ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, @@ -758,43 +937,55 @@ index 358f59c49..cae080401 100644 cuFuncGetAttribute(&Bytes, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, hKernel->get()) == CUDA_SUCCESS); return ReturnValue(uint64_t(Bytes)); -@@ -130,7 +130,7 @@ urKernelGetGroupInfo(ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, +@@ -130,17 +130,17 @@ urKernelGetGroupInfo(ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, case UR_KERNEL_GROUP_INFO_PREFERRED_WORK_GROUP_SIZE_MULTIPLE: { // Work groups should be multiples of the warp size int WarpSize = 0; - sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, - hDevice->get()) == CUDA_SUCCESS); +- cuDeviceGetAttribute(&WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, +- hDevice->get()) == CUDA_SUCCESS); ++ detail::ur::assertion(cuDeviceGetAttribute(&WarpSize, ++ CU_DEVICE_ATTRIBUTE_WARP_SIZE, ++ hDevice->get()) == CUDA_SUCCESS); return ReturnValue(static_cast(WarpSize)); -@@ -138,7 +138,7 @@ urKernelGetGroupInfo(ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, + } case UR_KERNEL_GROUP_INFO_PRIVATE_MEM_SIZE: { // OpenCL PRIVATE == CUDA LOCAL int Bytes = 0; - sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuFuncGetAttribute(&Bytes, CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES, - hKernel->get()) == CUDA_SUCCESS); +- cuFuncGetAttribute(&Bytes, CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES, +- hKernel->get()) == CUDA_SUCCESS); ++ detail::ur::assertion(cuFuncGetAttribute(&Bytes, ++ CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES, ++ hKernel->get()) == 
CUDA_SUCCESS); return ReturnValue(uint64_t(Bytes)); -@@ -231,7 +231,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelGetInfo(ur_kernel_handle_t hKernel, + } + default: +@@ -231,9 +231,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelGetInfo(ur_kernel_handle_t hKernel, return ReturnValue(""); case UR_KERNEL_INFO_NUM_REGS: { int NumRegs = 0; - sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuFuncGetAttribute(&NumRegs, CU_FUNC_ATTRIBUTE_NUM_REGS, - hKernel->get()) == CUDA_SUCCESS); +- cuFuncGetAttribute(&NumRegs, CU_FUNC_ATTRIBUTE_NUM_REGS, +- hKernel->get()) == CUDA_SUCCESS); ++ detail::ur::assertion(cuFuncGetAttribute(&NumRegs, ++ CU_FUNC_ATTRIBUTE_NUM_REGS, ++ hKernel->get()) == CUDA_SUCCESS); return ReturnValue(static_cast(NumRegs)); -@@ -254,7 +254,7 @@ urKernelGetSubGroupInfo(ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, + } + default: +@@ -254,15 +254,15 @@ urKernelGetSubGroupInfo(ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, case UR_KERNEL_SUB_GROUP_INFO_MAX_SUB_GROUP_SIZE: { // Sub-group size is equivalent to warp size int WarpSize = 0; - sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, - hDevice->get()) == CUDA_SUCCESS); +- cuDeviceGetAttribute(&WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, +- hDevice->get()) == CUDA_SUCCESS); ++ detail::ur::assertion(cuDeviceGetAttribute(&WarpSize, ++ CU_DEVICE_ATTRIBUTE_WARP_SIZE, ++ hDevice->get()) == CUDA_SUCCESS); return ReturnValue(static_cast(WarpSize)); -@@ -262,7 +262,7 @@ urKernelGetSubGroupInfo(ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, + } case UR_KERNEL_SUB_GROUP_INFO_MAX_NUM_SUB_GROUPS: { // Number of sub-groups = max block size / warp size + possible remainder int MaxThreads = 0; @@ -804,19 +995,20 @@ index 358f59c49..cae080401 100644 hKernel->get()) == CUDA_SUCCESS); int WarpSize = 0; diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/memory.cpp 
b/sycl/plugins/unified_runtime/ur/adapters/cuda/memory.cpp -index b19acea31..ecf840330 100644 +index b19acea3159f..f0c276579476 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/memory.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/memory.cpp -@@ -162,7 +162,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemRelease(ur_mem_handle_t hMem) { +@@ -162,8 +162,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemRelease(ur_mem_handle_t hMem) { // error for which it is unclear if the function that reported it succeeded // or not. Either way, the state of the program is compromised and likely // unrecoverable. - sycl::detail::ur::die( -+ detail::ur::die( - "Unrecoverable program state reached in urMemRelease"); +- "Unrecoverable program state reached in urMemRelease"); ++ detail::ur::die("Unrecoverable program state reached in urMemRelease"); } -@@ -331,7 +331,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemImageCreate( + return UR_RESULT_SUCCESS; +@@ -331,7 +330,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemImageCreate( PixelTypeSizeBytes = 4; break; default: @@ -826,7 +1018,7 @@ index b19acea31..ecf840330 100644 } diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.cpp -index 05443eeed..32391fec5 100644 +index 05443eeed89d..32391fec5c13 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.cpp @@ -265,7 +265,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueCreateWithNativeHandle( @@ -839,7 +1031,7 @@ index 05443eeed..32391fec5 100644 std::vector ComputeCuStreams(1, CuStream); std::vector TransferCuStreams(0); diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/sampler.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/sampler.cpp -index 36ec89fb9..836e47f98 100644 +index 36ec89fb9da3..836e47f988e5 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/sampler.cpp +++ 
b/sycl/plugins/unified_runtime/ur/adapters/cuda/sampler.cpp @@ -73,7 +73,7 @@ urSamplerRelease(ur_sampler_handle_t hSampler) { diff --git a/source/adapters/CMakeLists.txt b/source/adapters/CMakeLists.txt index d9d4d29156..9a817f4abe 100644 --- a/source/adapters/CMakeLists.txt +++ b/source/adapters/CMakeLists.txt @@ -9,9 +9,14 @@ add_subdirectory(null) if(UR_BUILD_ADAPTER_L0 OR UR_BUILD_ADAPTER_CUDA) # fetch adapter sources from SYCL set(SYCL_ADAPTER_DIR "${CMAKE_CURRENT_SOURCE_DIR}/external") - FetchSource(https://github.com/intel/llvm.git sycl-nightly/20230628 "sycl/plugins/unified_runtime/ur" ${SYCL_ADAPTER_DIR}) + FetchSource(https://github.com/intel/llvm.git sycl-nightly/20230706 "sycl/plugins/unified_runtime/ur" ${SYCL_ADAPTER_DIR}) - execute_process(COMMAND git apply --quiet ../0001-SYCL-CUDA-remove-sycl-namespaces-from-ur-adapter.patch + get_program_version_major_minor(git GIT_VERSION) + set(GIT_QUIET_OPTION "") + if(GIT_VERSION VERSION_GREATER_EQUAL "3.35.0") + set(GIT_QUIET_OPTION "--quiet") + endif() + execute_process(COMMAND git apply ${GIT_QUIET_OPTION} ../0001-SYCL-CUDA-remove-sycl-namespaces-from-ur-adapter.patch WORKING_DIRECTORY ${SYCL_ADAPTER_DIR}) endif() From 7c2db62b18ea170fcf13fbca1cdc1ded92a81c11 Mon Sep 17 00:00:00 2001 From: Petr Vesely Date: Wed, 12 Jul 2023 12:09:19 +0100 Subject: [PATCH 07/16] [UR] Fix L0 plugin file names --- source/adapters/level_zero/CMakeLists.txt | 46 +++++++++++------------ 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/source/adapters/level_zero/CMakeLists.txt b/source/adapters/level_zero/CMakeLists.txt index e1d483cdd4..e9b4aaa547 100644 --- a/source/adapters/level_zero/CMakeLists.txt +++ b/source/adapters/level_zero/CMakeLists.txt @@ -73,30 +73,30 @@ target_include_directories(LevelZeroLoader-Headers add_library(${TARGET_NAME} SHARED - ${L0_DIR}/ur_loader_interface.cpp - ${L0_DIR}/ur_level_zero_common.hpp - ${L0_DIR}/ur_level_zero_context.hpp - ${L0_DIR}/ur_level_zero_device.hpp - 
${L0_DIR}/ur_level_zero_event.hpp - ${L0_DIR}/ur_level_zero_usm.hpp - ${L0_DIR}/ur_level_zero_mem.hpp - ${L0_DIR}/ur_level_zero_kernel.hpp - ${L0_DIR}/ur_level_zero_platform.hpp - ${L0_DIR}/ur_level_zero_program.hpp - ${L0_DIR}/ur_level_zero_queue.hpp - ${L0_DIR}/ur_level_zero_sampler.hpp + ${L0_DIR}/ur_interface_loader.cpp + ${L0_DIR}/common.hpp + ${L0_DIR}/context.hpp + ${L0_DIR}/device.hpp + ${L0_DIR}/event.hpp + ${L0_DIR}/usm.hpp + ${L0_DIR}/memory.hpp + ${L0_DIR}/kernel.hpp + ${L0_DIR}/platform.hpp + ${L0_DIR}/program.hpp + ${L0_DIR}/queue.hpp + ${L0_DIR}/sampler.hpp ${L0_DIR}/ur_level_zero.cpp - ${L0_DIR}/ur_level_zero_common.cpp - ${L0_DIR}/ur_level_zero_context.cpp - ${L0_DIR}/ur_level_zero_device.cpp - ${L0_DIR}/ur_level_zero_event.cpp - ${L0_DIR}/ur_level_zero_usm.cpp - ${L0_DIR}/ur_level_zero_mem.cpp - ${L0_DIR}/ur_level_zero_kernel.cpp - ${L0_DIR}/ur_level_zero_platform.cpp - ${L0_DIR}/ur_level_zero_program.cpp - ${L0_DIR}/ur_level_zero_queue.cpp - ${L0_DIR}/ur_level_zero_sampler.cpp + ${L0_DIR}/common.cpp + ${L0_DIR}/context.cpp + ${L0_DIR}/device.cpp + ${L0_DIR}/event.cpp + ${L0_DIR}/usm.cpp + ${L0_DIR}/memory.cpp + ${L0_DIR}/kernel.cpp + ${L0_DIR}/platform.cpp + ${L0_DIR}/program.cpp + ${L0_DIR}/queue.cpp + ${L0_DIR}/sampler.cpp ${L0_DIR}/../../ur.cpp ${L0_DIR}/../../usm_allocator.cpp ${L0_DIR}/../../usm_allocator.hpp From 43f421691c8d7b2e5cb3466498e09428122ab885 Mon Sep 17 00:00:00 2001 From: Petr Vesely Date: Wed, 12 Jul 2023 13:25:16 +0100 Subject: [PATCH 08/16] [UR] Bump L0 Loader version --- source/adapters/level_zero/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/adapters/level_zero/CMakeLists.txt b/source/adapters/level_zero/CMakeLists.txt index e9b4aaa547..52d22c3ea6 100644 --- a/source/adapters/level_zero/CMakeLists.txt +++ b/source/adapters/level_zero/CMakeLists.txt @@ -11,7 +11,7 @@ if (NOT DEFINED LEVEL_ZERO_LIBRARY OR NOT DEFINED LEVEL_ZERO_INCLUDE_DIR) message(STATUS "Download Level Zero loader 
and headers from github.com") set(LEVEL_ZERO_LOADER_REPO "https://github.com/oneapi-src/level-zero.git") - set(LEVEL_ZERO_LOADER_TAG v1.8.8) + set(LEVEL_ZERO_LOADER_TAG v1.11.0) # Disable due to a bug https://github.com/oneapi-src/level-zero/issues/104 set(CMAKE_INCLUDE_CURRENT_DIR OFF) From 40c50865883f70d191b1cc891373fdc0d1a7be02 Mon Sep 17 00:00:00 2001 From: Omar Ahmed Date: Fri, 14 Jul 2023 15:14:55 +0100 Subject: [PATCH 09/16] [UR][HIP] Add ur hip target build to ur repo --- CMakeLists.txt | 1 + README.md | 2 + source/adapters/CMakeLists.txt | 12 ++- source/adapters/hip/CMakeLists.txt | 144 +++++++++++++++++++++++++++++ 4 files changed, 155 insertions(+), 4 deletions(-) create mode 100644 source/adapters/hip/CMakeLists.txt diff --git a/CMakeLists.txt b/CMakeLists.txt index bedb617d8c..42f42642a8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -32,6 +32,7 @@ option(VAL_USE_LIBBACKTRACE_BACKTRACE "enable libbacktrace validation backtrace option(UR_BUILD_TOOLS "build ur tools" ON) option(UR_BUILD_ADAPTER_L0 "build level 0 adapter from SYCL" OFF) option(UR_BUILD_ADAPTER_CUDA "build cuda adapter from SYCL" OFF) +option(UR_BUILD_ADAPTER_HIP "build hip adapter from SYCL" OFF) set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) diff --git a/README.md b/README.md index e9ccf4a3a3..a9a1e0cabb 100644 --- a/README.md +++ b/README.md @@ -112,6 +112,8 @@ List of options provided by CMake: | UR_BUILD_TOOLS | Build tools | ON/OFF | ON | | UR_BUILD_ADAPTER_L0 | Fetch and use level-zero adapter from SYCL | ON/OFF | OFF | | UR_BUILD_ADAPTER_CUDA | Fetch and use cuda adapter from SYCL | ON/OFF | OFF | +| UR_BUILD_ADAPTER_HIP | Fetch and use hip adapter from SYCL | ON/OFF | OFF | +| UR_HIP_PLATFORM | Build hip adapter for AMD or NVIDIA platform | AMD/NVIDIA | AMD | **General**: diff --git a/source/adapters/CMakeLists.txt b/source/adapters/CMakeLists.txt index 9a817f4abe..57df48f95f 100644 --- 
a/source/adapters/CMakeLists.txt +++ b/source/adapters/CMakeLists.txt @@ -6,10 +6,10 @@ add_subdirectory(null) -if(UR_BUILD_ADAPTER_L0 OR UR_BUILD_ADAPTER_CUDA) +if(UR_BUILD_ADAPTER_L0 OR UR_BUILD_ADAPTER_CUDA OR UR_BUILD_ADAPTER_HIP) # fetch adapter sources from SYCL set(SYCL_ADAPTER_DIR "${CMAKE_CURRENT_SOURCE_DIR}/external") - FetchSource(https://github.com/intel/llvm.git sycl-nightly/20230706 "sycl/plugins/unified_runtime/ur" ${SYCL_ADAPTER_DIR}) + FetchSource(https://github.com/intel/llvm.git sycl-nightly/20230713 "sycl/plugins/unified_runtime/ur" ${SYCL_ADAPTER_DIR}) get_program_version_major_minor(git GIT_VERSION) set(GIT_QUIET_OPTION "") @@ -21,9 +21,13 @@ if(UR_BUILD_ADAPTER_L0 OR UR_BUILD_ADAPTER_CUDA) endif() if(UR_BUILD_ADAPTER_L0) -add_subdirectory(level_zero) + add_subdirectory(level_zero) endif() if(UR_BUILD_ADAPTER_CUDA) -add_subdirectory(cuda) + add_subdirectory(cuda) +endif() + +if(UR_BUILD_ADAPTER_HIP) + add_subdirectory(hip) endif() diff --git a/source/adapters/hip/CMakeLists.txt b/source/adapters/hip/CMakeLists.txt new file mode 100644 index 0000000000..82ab485119 --- /dev/null +++ b/source/adapters/hip/CMakeLists.txt @@ -0,0 +1,144 @@ +# Copyright (C) 2022 Intel Corporation +# Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. 
+# See LICENSE.TXT +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +set(HIP_DIR "${SYCL_ADAPTER_DIR}/sycl/plugins/unified_runtime/ur/adapters/hip") + +set(TARGET_NAME ur_adapter_hip) + +# Set default UR HIP platform to AMD +set(UR_HIP_PLATFORM "AMD" CACHE STRING "UR HIP platform, AMD or NVIDIA") + +# Set default ROCm installation directory +set(UR_HIP_ROCM_DIR "/opt/rocm" CACHE STRING "ROCm installation dir") + +set(UR_HIP_INCLUDE_DIR "${UR_HIP_ROCM_DIR}/include") + +set(UR_HIP_HSA_INCLUDE_DIR "${UR_HIP_ROCM_DIR}/hsa/include") + +# Set HIP lib dir +set(UR_HIP_LIB_DIR "${UR_HIP_ROCM_DIR}/hip/lib") + +# Check if HIP library path exists (AMD platform only) +if("${UR_HIP_PLATFORM}" STREQUAL "AMD") + if(NOT EXISTS "${UR_HIP_LIB_DIR}") + message(FATAL_ERROR "Couldn't find the HIP library directory at '${UR_HIP_LIB_DIR}'," + " please check ROCm installation.") + endif() + # Check if HIP include path exists + if(NOT EXISTS "${UR_HIP_INCLUDE_DIR}") + message(FATAL_ERROR "Couldn't find the HIP include directory at '${UR_HIP_INCLUDE_DIR}'," + " please check ROCm installation.") + endif() + + # Check if HSA include path exists + if(NOT EXISTS "${UR_HIP_HSA_INCLUDE_DIR}") + message(FATAL_ERROR "Couldn't find the HSA include directory at '${UR_HIP_HSA_INCLUDE_DIR}'," + " please check ROCm installation.") + endif() +endif() + +# Set includes used in added library (rocmdrv) +set(HIP_HEADERS "${UR_HIP_INCLUDE_DIR};${UR_HIP_HSA_INCLUDE_DIR}") + +add_library(${TARGET_NAME} + SHARED + ${HIP_DIR}/ur_interface_loader.cpp + ${HIP_DIR}/common.hpp + ${HIP_DIR}/common.cpp + ${HIP_DIR}/context.hpp + ${HIP_DIR}/context.cpp + ${HIP_DIR}/device.hpp + ${HIP_DIR}/device.cpp + ${HIP_DIR}/enqueue.cpp + ${HIP_DIR}/event.hpp + ${HIP_DIR}/event.cpp + ${HIP_DIR}/kernel.hpp + ${HIP_DIR}/kernel.cpp + ${HIP_DIR}/memory.hpp + ${HIP_DIR}/memory.cpp + ${HIP_DIR}/platform.hpp + ${HIP_DIR}/platform.cpp + ${HIP_DIR}/program.hpp + ${HIP_DIR}/program.cpp + ${HIP_DIR}/queue.hpp + ${HIP_DIR}/queue.cpp 
+ ${HIP_DIR}/sampler.hpp + ${HIP_DIR}/sampler.cpp + ${HIP_DIR}/usm.cpp + ${HIP_DIR}/../../ur.cpp + ${HIP_DIR}/../../ur.hpp + ${HIP_DIR}/../../usm_allocator.cpp + ${HIP_DIR}/../../usm_allocator.hpp + ${HIP_DIR}/../../usm_allocator_config.cpp + ${HIP_DIR}/../../usm_allocator_config.hpp +) + +set_target_properties(${TARGET_NAME} PROPERTIES + VERSION "${PROJECT_VERSION_MAJOR}.${PROJECT_VERSION_MINOR}.${PROJECT_VERSION_PATCH}" + SOVERSION "${PROJECT_VERSION_MAJOR}" +) + +if("${UR_HIP_PLATFORM}" STREQUAL "AMD") + # Import HIP runtime library + add_library(rocmdrv SHARED IMPORTED GLOBAL) + + set_target_properties( + rocmdrv PROPERTIES + IMPORTED_LOCATION "${UR_HIP_LIB_DIR}/libamdhip64.so" + INTERFACE_INCLUDE_DIRECTORIES "${HIP_HEADERS}" + INTERFACE_SYSTEM_INCLUDE_DIRECTORIES "${HIP_HEADERS}" + ) + + target_link_libraries(${TARGET_NAME} PRIVATE + ${PROJECT_NAME}::headers + ${PROJECT_NAME}::common + rocmdrv + ) + + # Set HIP define to select AMD platform + target_compile_definitions(${TARGET_NAME} PRIVATE __HIP_PLATFORM_AMD__) +elseif("${UR_HIP_PLATFORM}" STREQUAL "NVIDIA") + # Import CUDA libraries + find_package(CUDA REQUIRED) + find_package(Threads REQUIRED) + + list(APPEND HIP_HEADERS ${CUDA_INCLUDE_DIRS}) + + # cudadrv may be defined by the CUDA plugin + if(NOT TARGET cudadrv) + add_library(cudadrv SHARED IMPORTED GLOBAL) + set_target_properties( + cudadrv PROPERTIES + IMPORTED_LOCATION ${CUDA_CUDA_LIBRARY} + INTERFACE_INCLUDE_DIRECTORIES "${HIP_HEADERS}" + INTERFACE_SYSTEM_INCLUDE_DIRECTORIES "${HIP_HEADERS}" + ) + endif() + + add_library(cudart SHARED IMPORTED GLOBAL) + set_target_properties( + cudart PROPERTIES + IMPORTED_LOCATION ${CUDA_CUDART_LIBRARY} + INTERFACE_INCLUDE_DIRECTORIES "${HIP_HEADERS}" + INTERFACE_SYSTEM_INCLUDE_DIRECTORIES "${HIP_HEADERS}" + ) + + target_link_libraries(${TARGET_NAME} PRIVATE + ${PROJECT_NAME}::headers + ${PROJECT_NAME}::common + Threads::Threads + cudadrv + cudart + ) + + # Set HIP define to select NVIDIA platform + 
target_compile_definitions(${TARGET_NAME} PRIVATE __HIP_PLATFORM_NVIDIA__) +else() + message(FATAL_ERROR "Unspecified UR HIP platform please set UR_HIP_PLATFORM to 'AMD' or 'NVIDIA'") +endif() + +target_include_directories(${TARGET_NAME} PRIVATE + ${HIP_DIR}/../../../ +) From 896a732d05944a6b8919a6743d5c04b90a9c95e1 Mon Sep 17 00:00:00 2001 From: pbalcer Date: Tue, 18 Jul 2023 11:01:03 +0200 Subject: [PATCH 10/16] update sycl adapters tag and add adapter CI --- .github/workflows/cmake.yml | 50 +- ...move-sycl-namespaces-from-ur-adapter.patch | 1048 ----------------- source/adapters/CMakeLists.txt | 10 +- source/adapters/hip/CMakeLists.txt | 6 + 4 files changed, 56 insertions(+), 1058 deletions(-) delete mode 100644 source/adapters/0001-SYCL-CUDA-remove-sycl-namespaces-from-ur-adapter.patch diff --git a/.github/workflows/cmake.yml b/.github/workflows/cmake.yml index ab3f7ec061..e9f4ba2594 100644 --- a/.github/workflows/cmake.yml +++ b/.github/workflows/cmake.yml @@ -66,7 +66,6 @@ jobs: -DCMAKE_BUILD_TYPE=${{matrix.build_type}} -DUR_BUILD_TESTS=ON -DUR_FORMAT_CPP_STYLE=ON - -DUR_BUILD_ADAPTER_L0=ON ${{matrix.libbacktrace}} - name: Generate source from spec, check for uncommitted diff @@ -80,6 +79,55 @@ jobs: working-directory: ${{github.workspace}}/build run: ctest -C ${{matrix.build_type}} --output-on-failure -L "python|uma|loader|validation|tracing|unit|urtrace" + adapter-build: + name: Build - Adapters on Ubuntu + strategy: + matrix: + os: ['ubuntu-22.04'] + adapter: [CUDA, HIP, L0] + build_type: [Debug, Release] + compiler: [{c: gcc, cxx: g++}, {c: clang, cxx: clang++}] + + runs-on: ${{matrix.os}} + + steps: + - uses: actions/checkout@v3 + + - name: Install apt packages + run: | + sudo apt-get update + sudo apt-get install -y doxygen ${{matrix.compiler.c}} + + - name: Install pip packages + run: pip install -r third_party/requirements.txt + + - name: Install CUDA specific dependencies + if: matrix.adapter == 'CUDA' + run: | + sudo apt-get install 
nvidia-cuda-toolkit nvidia-cuda-toolkit-gcc + + - name: Install HIP specific dependencies + if: matrix.adapter == 'HIP' + run: | + wget https://repo.radeon.com/amdgpu-install/5.6/ubuntu/jammy/amdgpu-install_5.6.50600-1_all.deb + sudo apt install ./amdgpu-install_5.6.50600-1_all.deb + sudo amdgpu-install --usecase=rocm + + - name: Configure CMake + run: > + cmake + -B${{github.workspace}}/build + -DCMAKE_C_COMPILER=${{matrix.compiler.c}} + -DCMAKE_CXX_COMPILER=${{matrix.compiler.cxx}} + -DUR_ENABLE_TRACING=ON + -DCMAKE_BUILD_TYPE=${{matrix.build_type}} + -DUR_BUILD_TESTS=ON + -DUR_FORMAT_CPP_STYLE=ON + -DUR_BUILD_ADAPTER_${{matrix.adapter}}=ON + + - name: Build + run: cmake --build ${{github.workspace}}/build -j $(nproc) + windows-build: name: Build - Windows strategy: diff --git a/source/adapters/0001-SYCL-CUDA-remove-sycl-namespaces-from-ur-adapter.patch b/source/adapters/0001-SYCL-CUDA-remove-sycl-namespaces-from-ur-adapter.patch deleted file mode 100644 index 8153e1cb85..0000000000 --- a/source/adapters/0001-SYCL-CUDA-remove-sycl-namespaces-from-ur-adapter.patch +++ /dev/null @@ -1,1048 +0,0 @@ -From fd78871a6bd2ff41ff37b8bd786c17f59911c677 Mon Sep 17 00:00:00 2001 -From: pbalcer -Date: Thu, 29 Jun 2023 15:11:43 +0200 -Subject: [PATCH] [SYCL][CUDA] remove sycl dependencies from cuda ur adapter - -This was preventing out-of-tree build of the adapter for standalone -use with unified runtime. 
- -Signed-off-by: Piotr Balcer ---- - .../ur/adapters/cuda/command_buffer.cpp | 52 ++--- - .../ur/adapters/cuda/common.cpp | 6 +- - .../ur/adapters/cuda/common.hpp | 5 - - .../ur/adapters/cuda/context.cpp | 2 +- - .../ur/adapters/cuda/device.cpp | 209 +++++++++--------- - .../ur/adapters/cuda/enqueue.cpp | 2 +- - .../ur/adapters/cuda/event.cpp | 17 +- - .../ur/adapters/cuda/kernel.cpp | 42 ++-- - .../ur/adapters/cuda/memory.cpp | 5 +- - .../ur/adapters/cuda/queue.cpp | 2 +- - .../ur/adapters/cuda/sampler.cpp | 2 +- - 11 files changed, 167 insertions(+), 177 deletions(-) - -diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/command_buffer.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/command_buffer.cpp -index c83e9e732303..57956cb64a67 100644 ---- a/sycl/plugins/unified_runtime/ur/adapters/cuda/command_buffer.cpp -+++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/command_buffer.cpp -@@ -19,8 +19,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferCreateExp( - (void)hDevice; - (void)pCommandBufferDesc; - (void)phCommandBuffer; -- sycl::detail::ur::die("Experimental Command-buffer feature is not " -- "implemented for CUDA adapter."); -+ detail::ur::die("Experimental Command-buffer feature is not " -+ "implemented for CUDA adapter."); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; - } - -@@ -28,8 +28,8 @@ UR_APIEXPORT ur_result_t UR_APICALL - urCommandBufferRetainExp(ur_exp_command_buffer_handle_t hCommandBuffer) { - (void)hCommandBuffer; - -- sycl::detail::ur::die("Experimental Command-buffer feature is not " -- "implemented for CUDA adapter."); -+ detail::ur::die("Experimental Command-buffer feature is not " -+ "implemented for CUDA adapter."); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; - } - -@@ -37,8 +37,8 @@ UR_APIEXPORT ur_result_t UR_APICALL - urCommandBufferReleaseExp(ur_exp_command_buffer_handle_t hCommandBuffer) { - (void)hCommandBuffer; - -- sycl::detail::ur::die("Experimental Command-buffer feature is not " -- "implemented for CUDA 
adapter."); -+ detail::ur::die("Experimental Command-buffer feature is not " -+ "implemented for CUDA adapter."); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; - } - -@@ -46,8 +46,8 @@ UR_APIEXPORT ur_result_t UR_APICALL - urCommandBufferFinalizeExp(ur_exp_command_buffer_handle_t hCommandBuffer) { - (void)hCommandBuffer; - -- sycl::detail::ur::die("Experimental Command-buffer feature is not " -- "implemented for CUDA adapter."); -+ detail::ur::die("Experimental Command-buffer feature is not " -+ "implemented for CUDA adapter."); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; - } - -@@ -68,8 +68,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( - (void)pSyncPointWaitList; - (void)pSyncPoint; - -- sycl::detail::ur::die("Experimental Command-buffer feature is not " -- "implemented for CUDA adapter."); -+ detail::ur::die("Experimental Command-buffer feature is not " -+ "implemented for CUDA adapter."); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; - } - -@@ -86,8 +86,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemcpyUSMExp( - (void)pSyncPointWaitList; - (void)pSyncPoint; - -- sycl::detail::ur::die("Experimental Command-buffer feature is not " -- "implemented for CUDA adapter."); -+ detail::ur::die("Experimental Command-buffer feature is not " -+ "implemented for CUDA adapter."); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; - } - -@@ -107,8 +107,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMembufferCopyExp( - (void)pSyncPointWaitList; - (void)pSyncPoint; - -- sycl::detail::ur::die("Experimental Command-buffer feature is not " -- "implemented for CUDA adapter."); -+ detail::ur::die("Experimental Command-buffer feature is not " -+ "implemented for CUDA adapter."); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; - } - -@@ -134,8 +134,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMembufferCopyRectExp( - (void)pSyncPointWaitList; - (void)pSyncPoint; - -- sycl::detail::ur::die("Experimental 
Command-buffer feature is not " -- "implemented for CUDA adapter."); -+ detail::ur::die("Experimental Command-buffer feature is not " -+ "implemented for CUDA adapter."); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; - } - -@@ -155,8 +155,8 @@ ur_result_t UR_APICALL urCommandBufferAppendMembufferWriteExp( - (void)pSyncPointWaitList; - (void)pSyncPoint; - -- sycl::detail::ur::die("Experimental Command-buffer feature is not " -- "implemented for CUDA adapter."); -+ detail::ur::die("Experimental Command-buffer feature is not " -+ "implemented for CUDA adapter."); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; - } - -@@ -175,8 +175,8 @@ ur_result_t UR_APICALL urCommandBufferAppendMembufferReadExp( - (void)pSyncPointWaitList; - (void)pSyncPoint; - -- sycl::detail::ur::die("Experimental Command-buffer feature is not " -- "implemented for CUDA adapter."); -+ detail::ur::die("Experimental Command-buffer feature is not " -+ "implemented for CUDA adapter."); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; - } - -@@ -203,8 +203,8 @@ ur_result_t UR_APICALL urCommandBufferAppendMembufferWriteRectExp( - (void)pSyncPointWaitList; - (void)pSyncPoint; - -- sycl::detail::ur::die("Experimental Command-buffer feature is not " -- "implemented for CUDA adapter."); -+ detail::ur::die("Experimental Command-buffer feature is not " -+ "implemented for CUDA adapter."); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; - } - -@@ -232,8 +232,8 @@ ur_result_t UR_APICALL urCommandBufferAppendMembufferReadRectExp( - (void)pSyncPointWaitList; - (void)pSyncPoint; - -- sycl::detail::ur::die("Experimental Command-buffer feature is not " -- "implemented for CUDA adapter."); -+ detail::ur::die("Experimental Command-buffer feature is not " -+ "implemented for CUDA adapter."); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; - } - -@@ -247,7 +247,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp( - (void)phEventWaitList; - (void)phEvent; - -- sycl::detail::ur::die("Experimental Command-buffer 
feature is not " -- "implemented for CUDA adapter."); -+ detail::ur::die("Experimental Command-buffer feature is not " -+ "implemented for CUDA adapter."); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; - } -diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/common.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/common.cpp -index 86975e509725..83264160e700 100644 ---- a/sycl/plugins/unified_runtime/ur/adapters/cuda/common.cpp -+++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/common.cpp -@@ -72,17 +72,17 @@ std::string getCudaVersionString() { - return stream.str(); - } - --void sycl::detail::ur::die(const char *Message) { -+void detail::ur::die(const char *Message) { - std::cerr << "ur_die: " << Message << std::endl; - std::terminate(); - } - --void sycl::detail::ur::assertion(bool Condition, const char *Message) { -+void detail::ur::assertion(bool Condition, const char *Message) { - if (!Condition) - die(Message); - } - --void sycl::detail::ur::cuPrint(const char *Message) { -+void detail::ur::cuPrint(const char *Message) { - std::cerr << "ur_print: " << Message << std::endl; - } - -diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/common.hpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/common.hpp -index 5cfa609018b2..82b38c10d449 100644 ---- a/sycl/plugins/unified_runtime/ur/adapters/cuda/common.hpp -+++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/common.hpp -@@ -8,7 +8,6 @@ - #pragma once - - #include --#include - #include - - ur_result_t mapErrorUR(CUresult Result); -@@ -37,8 +36,6 @@ extern thread_local char ErrorMessage[MaxMessageSize]; - ur_result_t ErrorCode); - - /// ------ Error handling, matching OpenCL plugin semantics. 
--namespace sycl { --__SYCL_INLINE_VER_NAMESPACE(_V1) { - namespace detail { - namespace ur { - -@@ -55,5 +52,3 @@ void assertion(bool Condition, const char *Message = nullptr); - - } // namespace ur - } // namespace detail --} // __SYCL_INLINE_VER_NAMESPACE(_V1) --} // namespace sycl -diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/context.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/context.cpp -index 74a32bdac274..2b621383da09 100644 ---- a/sycl/plugins/unified_runtime/ur/adapters/cuda/context.cpp -+++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/context.cpp -@@ -66,7 +66,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urContextGetInfo( - } - case UR_CONTEXT_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES: { - int Major = 0; -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&Major, - CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, - hContext->getDevice()->get()) == CUDA_SUCCESS); -diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp -index 52d4e3badc8f..a81599d629a7 100644 ---- a/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp -+++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp -@@ -15,7 +15,7 @@ - - int getAttribute(ur_device_handle_t device, CUdevice_attribute attribute) { - int value; -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&value, attribute, device->get()) == CUDA_SUCCESS); - return value; - } -@@ -53,11 +53,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, - } - case UR_DEVICE_INFO_MAX_COMPUTE_UNITS: { - int ComputeUnits = 0; -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&ComputeUnits, - CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, - hDevice->get()) == CUDA_SUCCESS); -- sycl::detail::ur::assertion(ComputeUnits >= 0); -+ detail::ur::assertion(ComputeUnits >= 0); - return ReturnValue(static_cast(ComputeUnits)); - } - case 
UR_DEVICE_INFO_MAX_WORK_ITEM_DIMENSIONS: { -@@ -69,20 +69,20 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, - } ReturnSizes; - - int MaxX = 0, MaxY = 0, MaxZ = 0; -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&MaxX, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, - hDevice->get()) == CUDA_SUCCESS); -- sycl::detail::ur::assertion(MaxX >= 0); -+ detail::ur::assertion(MaxX >= 0); - -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&MaxY, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, - hDevice->get()) == CUDA_SUCCESS); -- sycl::detail::ur::assertion(MaxY >= 0); -+ detail::ur::assertion(MaxY >= 0); - -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&MaxZ, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, - hDevice->get()) == CUDA_SUCCESS); -- sycl::detail::ur::assertion(MaxZ >= 0); -+ detail::ur::assertion(MaxZ >= 0); - - ReturnSizes.Sizes[0] = size_t(MaxX); - ReturnSizes.Sizes[1] = size_t(MaxY); -@@ -95,20 +95,20 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, - size_t Sizes[MaxWorkItemDimensions]; - } ReturnSizes; - int MaxX = 0, MaxY = 0, MaxZ = 0; -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&MaxX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, - hDevice->get()) == CUDA_SUCCESS); -- sycl::detail::ur::assertion(MaxX >= 0); -+ detail::ur::assertion(MaxX >= 0); - -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&MaxY, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, - hDevice->get()) == CUDA_SUCCESS); -- sycl::detail::ur::assertion(MaxY >= 0); -+ detail::ur::assertion(MaxY >= 0); - -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&MaxZ, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, - hDevice->get()) == CUDA_SUCCESS); -- sycl::detail::ur::assertion(MaxZ >= 0); -+ detail::ur::assertion(MaxZ >= 0); - - ReturnSizes.Sizes[0] = size_t(MaxX); - ReturnSizes.Sizes[1] = 
size_t(MaxY); -@@ -118,12 +118,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, - - case UR_DEVICE_INFO_MAX_WORK_GROUP_SIZE: { - int MaxWorkGroupSize = 0; -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&MaxWorkGroupSize, - CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, - hDevice->get()) == CUDA_SUCCESS); - -- sycl::detail::ur::assertion(MaxWorkGroupSize >= 0); -+ detail::ur::assertion(MaxWorkGroupSize >= 0); - - return ReturnValue(size_t(MaxWorkGroupSize)); - } -@@ -172,14 +172,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, - case UR_DEVICE_INFO_MAX_NUM_SUB_GROUPS: { - // Number of sub-groups = max block size / warp size + possible remainder - int MaxThreads = 0; -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&MaxThreads, - CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, - hDevice->get()) == CUDA_SUCCESS); - int WarpSize = 0; -- sycl::detail::ur::assertion( -- cuDeviceGetAttribute(&WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, -- hDevice->get()) == CUDA_SUCCESS); -+ detail::ur::assertion(cuDeviceGetAttribute(&WarpSize, -+ CU_DEVICE_ATTRIBUTE_WARP_SIZE, -+ hDevice->get()) == CUDA_SUCCESS); - int MaxWarps = (MaxThreads + WarpSize - 1) / WarpSize; - return ReturnValue(MaxWarps); - } -@@ -187,7 +187,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, - // Volta provides independent thread scheduling - // TODO: Revisit for previous generation GPUs - int Major = 0; -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&Major, - CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, - hDevice->get()) == CUDA_SUCCESS); -@@ -197,7 +197,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, - - case UR_DEVICE_INFO_ATOMIC_64: { - int Major = 0; -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&Major, - 
CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, - hDevice->get()) == CUDA_SUCCESS); -@@ -214,7 +214,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, - } - case UR_DEVICE_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES: { - int Major = 0; -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&Major, - CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, - hDevice->get()) == CUDA_SUCCESS); -@@ -255,7 +255,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, - } - case UR_DEVICE_INFO_BFLOAT16: { - int Major = 0; -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&Major, - CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, - hDevice->get()) == CUDA_SUCCESS); -@@ -266,18 +266,18 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, - case UR_DEVICE_INFO_SUB_GROUP_SIZES_INTEL: { - // NVIDIA devices only support one sub-group size (the warp size) - int WarpSize = 0; -- sycl::detail::ur::assertion( -- cuDeviceGetAttribute(&WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, -- hDevice->get()) == CUDA_SUCCESS); -+ detail::ur::assertion(cuDeviceGetAttribute(&WarpSize, -+ CU_DEVICE_ATTRIBUTE_WARP_SIZE, -+ hDevice->get()) == CUDA_SUCCESS); - size_t Sizes[1] = {static_cast(WarpSize)}; - return ReturnValue(Sizes, 1); - } - case UR_DEVICE_INFO_MAX_CLOCK_FREQUENCY: { - int ClockFreq = 0; -- sycl::detail::ur::assertion( -- cuDeviceGetAttribute(&ClockFreq, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, -- hDevice->get()) == CUDA_SUCCESS); -- sycl::detail::ur::assertion(ClockFreq >= 0); -+ detail::ur::assertion(cuDeviceGetAttribute(&ClockFreq, -+ CU_DEVICE_ATTRIBUTE_CLOCK_RATE, -+ hDevice->get()) == CUDA_SUCCESS); -+ detail::ur::assertion(ClockFreq >= 0); - return ReturnValue(static_cast(ClockFreq) / 1000u); - } - case UR_DEVICE_INFO_ADDRESS_BITS: { -@@ -292,8 +292,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, - // CL_DEVICE_TYPE_CUSTOM. 
- - size_t Global = 0; -- sycl::detail::ur::assertion(cuDeviceTotalMem(&Global, hDevice->get()) == -- CUDA_SUCCESS); -+ detail::ur::assertion(cuDeviceTotalMem(&Global, hDevice->get()) == -+ CUDA_SUCCESS); - - auto QuarterGlobal = static_cast(Global / 4u); - -@@ -308,7 +308,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, - if (std::getenv("SYCL_PI_CUDA_ENABLE_IMAGE_SUPPORT") != nullptr) { - Enabled = true; - } else { -- sycl::detail::ur::cuPrint( -+ detail::ur::cuPrint( - "Images are not fully supported by the CUDA BE, their support is " - "disabled by default. Their partial support can be activated by " - "setting SYCL_PI_CUDA_ENABLE_IMAGE_SUPPORT environment variable at " -@@ -332,17 +332,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, - case UR_DEVICE_INFO_IMAGE2D_MAX_HEIGHT: { - // Take the smaller of maximum surface and maximum texture height. - int TexHeight = 0; -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&TexHeight, - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT, - hDevice->get()) == CUDA_SUCCESS); -- sycl::detail::ur::assertion(TexHeight >= 0); -+ detail::ur::assertion(TexHeight >= 0); - int SurfHeight = 0; -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&SurfHeight, - CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT, - hDevice->get()) == CUDA_SUCCESS); -- sycl::detail::ur::assertion(SurfHeight >= 0); -+ detail::ur::assertion(SurfHeight >= 0); - - int Min = std::min(TexHeight, SurfHeight); - -@@ -351,17 +351,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, - case UR_DEVICE_INFO_IMAGE2D_MAX_WIDTH: { - // Take the smaller of maximum surface and maximum texture width. 
- int TexWidth = 0; -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&TexWidth, - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH, - hDevice->get()) == CUDA_SUCCESS); -- sycl::detail::ur::assertion(TexWidth >= 0); -+ detail::ur::assertion(TexWidth >= 0); - int SurfWidth = 0; -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&SurfWidth, - CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH, - hDevice->get()) == CUDA_SUCCESS); -- sycl::detail::ur::assertion(SurfWidth >= 0); -+ detail::ur::assertion(SurfWidth >= 0); - - int Min = std::min(TexWidth, SurfWidth); - -@@ -370,17 +370,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, - case UR_DEVICE_INFO_IMAGE3D_MAX_HEIGHT: { - // Take the smaller of maximum surface and maximum texture height. - int TexHeight = 0; -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&TexHeight, - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT, - hDevice->get()) == CUDA_SUCCESS); -- sycl::detail::ur::assertion(TexHeight >= 0); -+ detail::ur::assertion(TexHeight >= 0); - int SurfHeight = 0; -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&SurfHeight, - CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT, - hDevice->get()) == CUDA_SUCCESS); -- sycl::detail::ur::assertion(SurfHeight >= 0); -+ detail::ur::assertion(SurfHeight >= 0); - - int Min = std::min(TexHeight, SurfHeight); - -@@ -389,17 +389,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, - case UR_DEVICE_INFO_IMAGE3D_MAX_WIDTH: { - // Take the smaller of maximum surface and maximum texture width. 
- int TexWidth = 0; -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&TexWidth, - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH, - hDevice->get()) == CUDA_SUCCESS); -- sycl::detail::ur::assertion(TexWidth >= 0); -+ detail::ur::assertion(TexWidth >= 0); - int SurfWidth = 0; -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&SurfWidth, - CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH, - hDevice->get()) == CUDA_SUCCESS); -- sycl::detail::ur::assertion(SurfWidth >= 0); -+ detail::ur::assertion(SurfWidth >= 0); - - int Min = std::min(TexWidth, SurfWidth); - -@@ -408,17 +408,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, - case UR_DEVICE_INFO_IMAGE3D_MAX_DEPTH: { - // Take the smaller of maximum surface and maximum texture depth. - int TexDepth = 0; -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&TexDepth, - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH, - hDevice->get()) == CUDA_SUCCESS); -- sycl::detail::ur::assertion(TexDepth >= 0); -+ detail::ur::assertion(TexDepth >= 0); - int SurfDepth = 0; -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&SurfDepth, - CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH, - hDevice->get()) == CUDA_SUCCESS); -- sycl::detail::ur::assertion(SurfDepth >= 0); -+ detail::ur::assertion(SurfDepth >= 0); - - int Min = std::min(TexDepth, SurfDepth); - -@@ -427,17 +427,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, - case UR_DEVICE_INFO_IMAGE_MAX_BUFFER_SIZE: { - // Take the smaller of maximum surface and maximum texture width. 
- int TexWidth = 0; -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&TexWidth, - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH, - hDevice->get()) == CUDA_SUCCESS); -- sycl::detail::ur::assertion(TexWidth >= 0); -+ detail::ur::assertion(TexWidth >= 0); - int SurfWidth = 0; -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&SurfWidth, - CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH, - hDevice->get()) == CUDA_SUCCESS); -- sycl::detail::ur::assertion(SurfWidth >= 0); -+ detail::ur::assertion(SurfWidth >= 0); - - int Min = std::min(TexWidth, SurfWidth); - -@@ -459,7 +459,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, - } - case UR_DEVICE_INFO_MEM_BASE_ADDR_ALIGN: { - int MemBaseAddrAlign = 0; -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&MemBaseAddrAlign, - CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, - hDevice->get()) == CUDA_SUCCESS); -@@ -504,27 +504,27 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, - } - case UR_DEVICE_INFO_GLOBAL_MEM_CACHE_SIZE: { - int CacheSize = 0; -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&CacheSize, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, - hDevice->get()) == CUDA_SUCCESS); -- sycl::detail::ur::assertion(CacheSize >= 0); -+ detail::ur::assertion(CacheSize >= 0); - // The L2 cache is global to the GPU. - return ReturnValue(static_cast(CacheSize)); - } - case UR_DEVICE_INFO_GLOBAL_MEM_SIZE: { - size_t Bytes = 0; - // Runtime API has easy access to this value, driver API info is scarse. 
-- sycl::detail::ur::assertion(cuDeviceTotalMem(&Bytes, hDevice->get()) == -- CUDA_SUCCESS); -+ detail::ur::assertion(cuDeviceTotalMem(&Bytes, hDevice->get()) == -+ CUDA_SUCCESS); - return ReturnValue(uint64_t{Bytes}); - } - case UR_DEVICE_INFO_MAX_CONSTANT_BUFFER_SIZE: { - int ConstantMemory = 0; -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&ConstantMemory, - CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, - hDevice->get()) == CUDA_SUCCESS); -- sycl::detail::ur::assertion(ConstantMemory >= 0); -+ detail::ur::assertion(ConstantMemory >= 0); - - return ReturnValue(static_cast(ConstantMemory)); - } -@@ -542,30 +542,30 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, - // CUDA has its own definition of "local memory", which maps to OpenCL's - // "private memory". - int LocalMemSize = 0; -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&LocalMemSize, - CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, - hDevice->get()) == CUDA_SUCCESS); -- sycl::detail::ur::assertion(LocalMemSize >= 0); -+ detail::ur::assertion(LocalMemSize >= 0); - return ReturnValue(static_cast(LocalMemSize)); - } - case UR_DEVICE_INFO_ERROR_CORRECTION_SUPPORT: { - int ECCEnabled = 0; -- sycl::detail::ur::assertion( -- cuDeviceGetAttribute(&ECCEnabled, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, -- hDevice->get()) == CUDA_SUCCESS); -+ detail::ur::assertion(cuDeviceGetAttribute(&ECCEnabled, -+ CU_DEVICE_ATTRIBUTE_ECC_ENABLED, -+ hDevice->get()) == CUDA_SUCCESS); - -- sycl::detail::ur::assertion((ECCEnabled == 0) | (ECCEnabled == 1)); -+ detail::ur::assertion((ECCEnabled == 0) | (ECCEnabled == 1)); - auto Result = static_cast(ECCEnabled); - return ReturnValue(Result); - } - case UR_DEVICE_INFO_HOST_UNIFIED_MEMORY: { - int IsIntegrated = 0; -- sycl::detail::ur::assertion( -- cuDeviceGetAttribute(&IsIntegrated, CU_DEVICE_ATTRIBUTE_INTEGRATED, -- hDevice->get()) == CUDA_SUCCESS); -+ 
detail::ur::assertion(cuDeviceGetAttribute(&IsIntegrated, -+ CU_DEVICE_ATTRIBUTE_INTEGRATED, -+ hDevice->get()) == CUDA_SUCCESS); - -- sycl::detail::ur::assertion((IsIntegrated == 0) | (IsIntegrated == 1)); -+ detail::ur::assertion((IsIntegrated == 0) | (IsIntegrated == 1)); - auto result = static_cast(IsIntegrated); - return ReturnValue(result); - } -@@ -620,9 +620,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, - case UR_DEVICE_INFO_NAME: { - static constexpr size_t MaxDeviceNameLength = 256u; - char Name[MaxDeviceNameLength]; -- sycl::detail::ur::assertion( -- cuDeviceGetName(Name, MaxDeviceNameLength, hDevice->get()) == -- CUDA_SUCCESS); -+ detail::ur::assertion(cuDeviceGetName(Name, MaxDeviceNameLength, -+ hDevice->get()) == CUDA_SUCCESS); - return ReturnValue(Name, strlen(Name) + 1); - } - case UR_DEVICE_INFO_VENDOR: { -@@ -641,13 +640,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, - case UR_DEVICE_INFO_VERSION: { - std::stringstream SS; - int Major; -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&Major, - CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, - hDevice->get()) == CUDA_SUCCESS); - SS << Major; - int Minor; -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&Minor, - CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, - hDevice->get()) == CUDA_SUCCESS); -@@ -666,11 +665,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, - int Major = 0; - int Minor = 0; - -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&Major, - CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, - hDevice->get()) == CUDA_SUCCESS); -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&Minor, - CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, - hDevice->get()) == CUDA_SUCCESS); -@@ -847,27 +846,27 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t 
hDevice, - case UR_DEVICE_INFO_GLOBAL_MEM_FREE: { - size_t FreeMemory = 0; - size_t TotalMemory = 0; -- sycl::detail::ur::assertion(cuMemGetInfo(&FreeMemory, &TotalMemory) == -- CUDA_SUCCESS, -- "failed cuMemGetInfo() API."); -+ detail::ur::assertion(cuMemGetInfo(&FreeMemory, &TotalMemory) == -+ CUDA_SUCCESS, -+ "failed cuMemGetInfo() API."); - return ReturnValue(FreeMemory); - } - case UR_DEVICE_INFO_MEMORY_CLOCK_RATE: { - int Value = 0; -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&Value, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, - hDevice->get()) == CUDA_SUCCESS); -- sycl::detail::ur::assertion(Value >= 0); -+ detail::ur::assertion(Value >= 0); - // Convert kilohertz to megahertz when returning. - return ReturnValue(Value / 1000); - } - case UR_DEVICE_INFO_MEMORY_BUS_WIDTH: { - int Value = 0; -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&Value, - CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, - hDevice->get()) == CUDA_SUCCESS); -- sycl::detail::ur::assertion(Value >= 0); -+ detail::ur::assertion(Value >= 0); - return ReturnValue(Value); - } - case UR_DEVICE_INFO_MAX_COMPUTE_QUEUE_INDICES: { -@@ -875,20 +874,20 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, - } - case UR_DEVICE_INFO_DEVICE_ID: { - int Value = 0; -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&Value, CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID, - hDevice->get()) == CUDA_SUCCESS); -- sycl::detail::ur::assertion(Value >= 0); -+ detail::ur::assertion(Value >= 0); - return ReturnValue(Value); - } - case UR_DEVICE_INFO_UUID: { - CUuuid UUID; - #if (CUDA_VERSION >= 11040) -- sycl::detail::ur::assertion(cuDeviceGetUuid_v2(&UUID, hDevice->get()) == -- CUDA_SUCCESS); -+ detail::ur::assertion(cuDeviceGetUuid_v2(&UUID, hDevice->get()) == -+ CUDA_SUCCESS); - #else -- sycl::detail::ur::assertion(cuDeviceGetUuid(&UUID, hDevice->get()) == -- CUDA_SUCCESS); -+ 
detail::ur::assertion(cuDeviceGetUuid(&UUID, hDevice->get()) == -+ CUDA_SUCCESS); - #endif - std::array Name; - std::copy(UUID.bytes, UUID.bytes + 16, Name.begin()); -@@ -896,13 +895,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, - } - case UR_DEVICE_INFO_MAX_MEMORY_BANDWIDTH: { - int Major = 0; -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&Major, - CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, - hDevice->get()) == CUDA_SUCCESS); - - int Minor = 0; -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&Minor, - CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, - hDevice->get()) == CUDA_SUCCESS); -@@ -918,7 +917,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, - } else if (IsOrinAGX) { - MemoryClockKHz = 3200000; - } else { -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&MemoryClockKHz, - CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, - hDevice->get()) == CUDA_SUCCESS); -@@ -928,7 +927,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, - if (IsOrinAGX) { - MemoryBusWidth = 256; - } else { -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&MemoryBusWidth, - CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, - hDevice->get()) == CUDA_SUCCESS); -@@ -973,7 +972,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, - &MaxRegisters, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, - hDevice->get())); - -- sycl::detail::ur::assertion(MaxRegisters >= 0); -+ detail::ur::assertion(MaxRegisters >= 0); - - return ReturnValue(static_cast(MaxRegisters)); - } -@@ -984,12 +983,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, - case UR_DEVICE_INFO_PCI_ADDRESS: { - constexpr size_t AddressBufferSize = 13; - char AddressBuffer[AddressBufferSize]; -- sycl::detail::ur::assertion( -- 
cuDeviceGetPCIBusId(AddressBuffer, AddressBufferSize, hDevice->get()) == -- CUDA_SUCCESS); -+ detail::ur::assertion(cuDeviceGetPCIBusId(AddressBuffer, AddressBufferSize, -+ hDevice->get()) == CUDA_SUCCESS); - // CUDA API (8.x - 12.1) guarantees 12 bytes + \0 are written -- sycl::detail::ur::assertion(strnlen(AddressBuffer, AddressBufferSize) == -- 12); -+ detail::ur::assertion(strnlen(AddressBuffer, AddressBufferSize) == 12); - return ReturnValue(AddressBuffer, - strnlen(AddressBuffer, AddressBufferSize - 1) + 1); - } -diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp -index 1cfc5cc40a4a..792f69092682 100644 ---- a/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp -+++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp -@@ -794,7 +794,7 @@ static size_t imageElementByteSize(CUDA_ARRAY_DESCRIPTOR ArrayDesc) { - case CU_AD_FORMAT_FLOAT: - return 4; - default: -- sycl::detail::ur::die("Invalid image format."); -+ detail::ur::die("Invalid image format."); - return 0; - } - } -diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/event.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/event.cpp -index 8916197b73f1..066c0498f1d0 100644 ---- a/sycl/plugins/unified_runtime/ur/adapters/cuda/event.cpp -+++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/event.cpp -@@ -119,7 +119,7 @@ ur_result_t ur_event_handle_t_::record() { - try { - EventID = Queue->getNextEventID(); - if (EventID == 0) { -- sycl::detail::ur::die( -+ detail::ur::die( - "Unrecoverable program state reached in event identifier overflow"); - } - Result = UR_CHECK_ERROR(cuEventRecord(EvEnd, Stream)); -@@ -182,7 +182,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventGetInfo(ur_event_handle_t hEvent, - case UR_EVENT_INFO_CONTEXT: - return ReturnValue(hEvent->getContext()); - default: -- sycl::detail::ur::die("Event info request not implemented"); -+ detail::ur::die("Event info request not implemented"); - } - 
- return UR_RESULT_ERROR_INVALID_ENUMERATION; -@@ -213,7 +213,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventGetProfilingInfo( - default: - break; - } -- sycl::detail::ur::die("Event Profiling info request not implemented"); -+ detail::ur::die("Event Profiling info request not implemented"); - return {}; - } - -@@ -221,7 +221,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventSetCallback(ur_event_handle_t, - ur_execution_info_t, - ur_event_callback_t, - void *) { -- sycl::detail::ur::die("Event Callback not implemented in CUDA adapter"); -+ detail::ur::die("Event Callback not implemented in CUDA adapter"); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; - } - -@@ -254,8 +254,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventRetain(ur_event_handle_t hEvent) { - - const auto RefCount = hEvent->incrementReferenceCount(); - -- sycl::detail::ur::assertion( -- RefCount != 0, "Reference count overflow detected in urEventRetain."); -+ detail::ur::assertion(RefCount != 0, -+ "Reference count overflow detected in urEventRetain."); - - return UR_RESULT_SUCCESS; - } -@@ -265,9 +265,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventRelease(ur_event_handle_t hEvent) { - - // double delete or someone is messing with the ref count. - // either way, cannot safely proceed. -- sycl::detail::ur::assertion( -- hEvent->getReferenceCount() != 0, -- "Reference count overflow detected in urEventRelease."); -+ detail::ur::assertion(hEvent->getReferenceCount() != 0, -+ "Reference count overflow detected in urEventRelease."); - - // decrement ref count. If it is 0, delete the event. 
- if (hEvent->decrementReferenceCount() == 0) { -diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.cpp -index 358f59c499e1..7d46ce039bab 100644 ---- a/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.cpp -+++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.cpp -@@ -73,24 +73,24 @@ urKernelGetGroupInfo(ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, - size_t GlobalWorkSize[3] = {0, 0, 0}; - - int MaxBlockDimX{0}, MaxBlockDimY{0}, MaxBlockDimZ{0}; -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&MaxBlockDimX, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, - hDevice->get()) == CUDA_SUCCESS); -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&MaxBlockDimY, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, - hDevice->get()) == CUDA_SUCCESS); -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&MaxBlockDimZ, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, - hDevice->get()) == CUDA_SUCCESS); - - int MaxGridDimX{0}, MaxGridDimY{0}, MaxGridDimZ{0}; -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&MaxGridDimX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, - hDevice->get()) == CUDA_SUCCESS); -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&MaxGridDimY, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, - hDevice->get()) == CUDA_SUCCESS); -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&MaxGridDimZ, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, - hDevice->get()) == CUDA_SUCCESS); - -@@ -101,7 +101,7 @@ urKernelGetGroupInfo(ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, - } - case UR_KERNEL_GROUP_INFO_WORK_GROUP_SIZE: { - int MaxThreads = 0; -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuFuncGetAttribute(&MaxThreads, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, - hKernel->get()) == CUDA_SUCCESS); - return ReturnValue(size_t(MaxThreads)); -@@ 
-122,7 +122,7 @@ urKernelGetGroupInfo(ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, - case UR_KERNEL_GROUP_INFO_LOCAL_MEM_SIZE: { - // OpenCL LOCAL == CUDA SHARED - int Bytes = 0; -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuFuncGetAttribute(&Bytes, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, - hKernel->get()) == CUDA_SUCCESS); - return ReturnValue(uint64_t(Bytes)); -@@ -130,17 +130,17 @@ urKernelGetGroupInfo(ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, - case UR_KERNEL_GROUP_INFO_PREFERRED_WORK_GROUP_SIZE_MULTIPLE: { - // Work groups should be multiples of the warp size - int WarpSize = 0; -- sycl::detail::ur::assertion( -- cuDeviceGetAttribute(&WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, -- hDevice->get()) == CUDA_SUCCESS); -+ detail::ur::assertion(cuDeviceGetAttribute(&WarpSize, -+ CU_DEVICE_ATTRIBUTE_WARP_SIZE, -+ hDevice->get()) == CUDA_SUCCESS); - return ReturnValue(static_cast(WarpSize)); - } - case UR_KERNEL_GROUP_INFO_PRIVATE_MEM_SIZE: { - // OpenCL PRIVATE == CUDA LOCAL - int Bytes = 0; -- sycl::detail::ur::assertion( -- cuFuncGetAttribute(&Bytes, CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES, -- hKernel->get()) == CUDA_SUCCESS); -+ detail::ur::assertion(cuFuncGetAttribute(&Bytes, -+ CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES, -+ hKernel->get()) == CUDA_SUCCESS); - return ReturnValue(uint64_t(Bytes)); - } - default: -@@ -231,9 +231,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelGetInfo(ur_kernel_handle_t hKernel, - return ReturnValue(""); - case UR_KERNEL_INFO_NUM_REGS: { - int NumRegs = 0; -- sycl::detail::ur::assertion( -- cuFuncGetAttribute(&NumRegs, CU_FUNC_ATTRIBUTE_NUM_REGS, -- hKernel->get()) == CUDA_SUCCESS); -+ detail::ur::assertion(cuFuncGetAttribute(&NumRegs, -+ CU_FUNC_ATTRIBUTE_NUM_REGS, -+ hKernel->get()) == CUDA_SUCCESS); - return ReturnValue(static_cast(NumRegs)); - } - default: -@@ -254,15 +254,15 @@ urKernelGetSubGroupInfo(ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, - case 
UR_KERNEL_SUB_GROUP_INFO_MAX_SUB_GROUP_SIZE: { - // Sub-group size is equivalent to warp size - int WarpSize = 0; -- sycl::detail::ur::assertion( -- cuDeviceGetAttribute(&WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, -- hDevice->get()) == CUDA_SUCCESS); -+ detail::ur::assertion(cuDeviceGetAttribute(&WarpSize, -+ CU_DEVICE_ATTRIBUTE_WARP_SIZE, -+ hDevice->get()) == CUDA_SUCCESS); - return ReturnValue(static_cast(WarpSize)); - } - case UR_KERNEL_SUB_GROUP_INFO_MAX_NUM_SUB_GROUPS: { - // Number of sub-groups = max block size / warp size + possible remainder - int MaxThreads = 0; -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuFuncGetAttribute(&MaxThreads, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, - hKernel->get()) == CUDA_SUCCESS); - int WarpSize = 0; -diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/memory.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/memory.cpp -index b19acea3159f..f0c276579476 100644 ---- a/sycl/plugins/unified_runtime/ur/adapters/cuda/memory.cpp -+++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/memory.cpp -@@ -162,8 +162,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemRelease(ur_mem_handle_t hMem) { - // error for which it is unclear if the function that reported it succeeded - // or not. Either way, the state of the program is compromised and likely - // unrecoverable. 
-- sycl::detail::ur::die( -- "Unrecoverable program state reached in urMemRelease"); -+ detail::ur::die("Unrecoverable program state reached in urMemRelease"); - } - - return UR_RESULT_SUCCESS; -@@ -331,7 +330,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemImageCreate( - PixelTypeSizeBytes = 4; - break; - default: -- sycl::detail::ur::die( -+ detail::ur::die( - "urMemImageCreate given unsupported image_channel_data_type"); - } - -diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.cpp -index 05443eeed89d..32391fec5c13 100644 ---- a/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.cpp -+++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.cpp -@@ -265,7 +265,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueCreateWithNativeHandle( - else if (CuFlags == CU_STREAM_NON_BLOCKING) - Flags = UR_QUEUE_FLAG_SYNC_WITH_DEFAULT_STREAM; - else -- sycl::detail::ur::die("Unknown cuda stream"); -+ detail::ur::die("Unknown cuda stream"); - - std::vector ComputeCuStreams(1, CuStream); - std::vector TransferCuStreams(0); -diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/sampler.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/sampler.cpp -index 36ec89fb9da3..836e47f988e5 100644 ---- a/sycl/plugins/unified_runtime/ur/adapters/cuda/sampler.cpp -+++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/sampler.cpp -@@ -73,7 +73,7 @@ urSamplerRelease(ur_sampler_handle_t hSampler) { - - // double delete or someone is messing with the ref count. - // either way, cannot safely proceed. 
-- sycl::detail::ur::assertion( -+ detail::ur::assertion( - hSampler->getReferenceCount() != 0, - "Reference count overflow detected in urSamplerRelease."); - --- -2.41.0 - diff --git a/source/adapters/CMakeLists.txt b/source/adapters/CMakeLists.txt index 57df48f95f..aa09004667 100644 --- a/source/adapters/CMakeLists.txt +++ b/source/adapters/CMakeLists.txt @@ -9,15 +9,7 @@ add_subdirectory(null) if(UR_BUILD_ADAPTER_L0 OR UR_BUILD_ADAPTER_CUDA OR UR_BUILD_ADAPTER_HIP) # fetch adapter sources from SYCL set(SYCL_ADAPTER_DIR "${CMAKE_CURRENT_SOURCE_DIR}/external") - FetchSource(https://github.com/intel/llvm.git sycl-nightly/20230713 "sycl/plugins/unified_runtime/ur" ${SYCL_ADAPTER_DIR}) - - get_program_version_major_minor(git GIT_VERSION) - set(GIT_QUIET_OPTION "") - if(GIT_VERSION VERSION_GREATER_EQUAL "3.35.0") - set(GIT_QUIET_OPTION "--quiet") - endif() - execute_process(COMMAND git apply ${GIT_QUIET_OPTION} ../0001-SYCL-CUDA-remove-sycl-namespaces-from-ur-adapter.patch - WORKING_DIRECTORY ${SYCL_ADAPTER_DIR}) + FetchSource(https://github.com/intel/llvm.git sycl-nightly/20230717 "sycl/plugins/unified_runtime/ur" ${SYCL_ADAPTER_DIR}) endif() if(UR_BUILD_ADAPTER_L0) diff --git a/source/adapters/hip/CMakeLists.txt b/source/adapters/hip/CMakeLists.txt index 82ab485119..7db8bbf3fc 100644 --- a/source/adapters/hip/CMakeLists.txt +++ b/source/adapters/hip/CMakeLists.txt @@ -75,6 +75,12 @@ add_library(${TARGET_NAME} ${HIP_DIR}/../../usm_allocator_config.hpp ) +if (NOT MSVC) + target_compile_options(${TARGET_NAME} PRIVATE + -Wno-deprecated-declarations + ) +endif() + set_target_properties(${TARGET_NAME} PROPERTIES VERSION "${PROJECT_VERSION_MAJOR}.${PROJECT_VERSION_MINOR}.${PROJECT_VERSION_PATCH}" SOVERSION "${PROJECT_VERSION_MAJOR}" From e97c22d1ed6fc1905a1f44dfce415729634cf16a Mon Sep 17 00:00:00 2001 From: pbalcer Date: Tue, 18 Jul 2023 14:14:31 +0200 Subject: [PATCH 11/16] [adapters] only export get proc table symbols in adapters This adds a version script to restrict 
visibility of symbols in adapters. This is so that adapters don't use loader ur functions when populating proc tables... --- source/adapters/CMakeLists.txt | 24 ++++++++++++++++++++++- source/adapters/adapter.def.in | 20 +++++++++++++++++++ source/adapters/adapter.map.in | 23 ++++++++++++++++++++++ source/adapters/cuda/CMakeLists.txt | 2 +- source/adapters/hip/CMakeLists.txt | 4 ++-- source/adapters/level_zero/CMakeLists.txt | 8 +------- source/adapters/null/CMakeLists.txt | 7 +------ 7 files changed, 71 insertions(+), 17 deletions(-) create mode 100644 source/adapters/adapter.def.in create mode 100644 source/adapters/adapter.map.in diff --git a/source/adapters/CMakeLists.txt b/source/adapters/CMakeLists.txt index aa09004667..8a885923c0 100644 --- a/source/adapters/CMakeLists.txt +++ b/source/adapters/CMakeLists.txt @@ -3,8 +3,30 @@ # See LICENSE.TXT # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -add_subdirectory(null) +function(add_ur_adapter name) + add_library(${name} ${ARGN}) + if(MSVC) + set(TARGET_LIBNAME ${name}) + string(TOUPPER ${TARGET_LIBNAME} TARGET_LIBNAME) + + set(ADAPTER_VERSION_SCRIPT ${name}.def) + + configure_file(${CMAKE_CURRENT_SOURCE_DIR}/../adapter.def.in ${ADAPTER_VERSION_SCRIPT} @ONLY) + set_target_properties(${name} PROPERTIES + LINK_FLAGS "/DEF:${ADAPTER_VERSION_SCRIPT}" + ) + else() + set(TARGET_LIBNAME lib${name}_${PROJECT_VERSION_MAJOR}.0) + string(TOUPPER ${TARGET_LIBNAME} TARGET_LIBNAME) + set(ADAPTER_VERSION_SCRIPT ${name}.map) + + configure_file(${CMAKE_CURRENT_SOURCE_DIR}/../adapter.map.in ${ADAPTER_VERSION_SCRIPT} @ONLY) + target_link_options(${name} PRIVATE "-Wl,--version-script=${ADAPTER_VERSION_SCRIPT}") + endif() +endfunction() + +add_subdirectory(null) if(UR_BUILD_ADAPTER_L0 OR UR_BUILD_ADAPTER_CUDA OR UR_BUILD_ADAPTER_HIP) # fetch adapter sources from SYCL diff --git a/source/adapters/adapter.def.in b/source/adapters/adapter.def.in new file mode 100644 index 0000000000..bfe14a6a03 --- /dev/null +++ 
b/source/adapters/adapter.def.in @@ -0,0 +1,20 @@ +LIBRARY @TARGET_LIBNAME@ +EXPORTS + urGetBindlessImagesExpProcAddrTable + urGetCommandBufferExpProcAddrTable + urGetContextProcAddrTable + urGetDeviceProcAddrTable + urGetEnqueueProcAddrTable + urGetEventProcAddrTable + urGetGlobalProcAddrTable + urGetKernelProcAddrTable + urGetMemProcAddrTable + urGetPhysicalMemProcAddrTable + urGetPlatformProcAddrTable + urGetProgramProcAddrTable + urGetQueueProcAddrTable + urGetSamplerProcAddrTable + urGetUSMExpProcAddrTable + urGetUsmP2PExpProcAddrTable + urGetUSMProcAddrTable + urGetVirtualMemProcAddrTable diff --git a/source/adapters/adapter.map.in b/source/adapters/adapter.map.in new file mode 100644 index 0000000000..cbb5c6c4cb --- /dev/null +++ b/source/adapters/adapter.map.in @@ -0,0 +1,23 @@ +@TARGET_LIBNAME@ { + global: + urGetBindlessImagesExpProcAddrTable; + urGetCommandBufferExpProcAddrTable; + urGetContextProcAddrTable; + urGetDeviceProcAddrTable; + urGetEnqueueProcAddrTable; + urGetEventProcAddrTable; + urGetGlobalProcAddrTable; + urGetKernelProcAddrTable; + urGetMemProcAddrTable; + urGetPhysicalMemProcAddrTable; + urGetPlatformProcAddrTable; + urGetProgramProcAddrTable; + urGetQueueProcAddrTable; + urGetSamplerProcAddrTable; + urGetUSMExpProcAddrTable; + urGetUsmP2PExpProcAddrTable; + urGetUSMProcAddrTable; + urGetVirtualMemProcAddrTable; + local: + *; +}; diff --git a/source/adapters/cuda/CMakeLists.txt b/source/adapters/cuda/CMakeLists.txt index d0bc3fd6b2..29bdc87152 100644 --- a/source/adapters/cuda/CMakeLists.txt +++ b/source/adapters/cuda/CMakeLists.txt @@ -7,7 +7,7 @@ set(CUDA_DIR "${SYCL_ADAPTER_DIR}/sycl/plugins/unified_runtime/ur/adapters/cuda" set(TARGET_NAME ur_adapter_cuda) -add_library(${TARGET_NAME} +add_ur_adapter(${TARGET_NAME} SHARED ${CUDA_DIR}/ur_interface_loader.cpp ${CUDA_DIR}/common.hpp diff --git a/source/adapters/hip/CMakeLists.txt b/source/adapters/hip/CMakeLists.txt index 7db8bbf3fc..a014725268 100644 --- 
a/source/adapters/hip/CMakeLists.txt +++ b/source/adapters/hip/CMakeLists.txt @@ -42,7 +42,7 @@ endif() # Set includes used in added library (rocmdrv) set(HIP_HEADERS "${UR_HIP_INCLUDE_DIR};${UR_HIP_HSA_INCLUDE_DIR}") -add_library(${TARGET_NAME} +add_ur_adapter(${TARGET_NAME} SHARED ${HIP_DIR}/ur_interface_loader.cpp ${HIP_DIR}/common.hpp @@ -75,7 +75,7 @@ add_library(${TARGET_NAME} ${HIP_DIR}/../../usm_allocator_config.hpp ) -if (NOT MSVC) +if(NOT MSVC) target_compile_options(${TARGET_NAME} PRIVATE -Wno-deprecated-declarations ) diff --git a/source/adapters/level_zero/CMakeLists.txt b/source/adapters/level_zero/CMakeLists.txt index 52d22c3ea6..fa089c4b9a 100644 --- a/source/adapters/level_zero/CMakeLists.txt +++ b/source/adapters/level_zero/CMakeLists.txt @@ -71,7 +71,7 @@ target_include_directories(LevelZeroLoader-Headers INTERFACE "${LEVEL_ZERO_INCLUDE_DIR}" ) -add_library(${TARGET_NAME} +add_ur_adapter(${TARGET_NAME} SHARED ${L0_DIR}/ur_interface_loader.cpp ${L0_DIR}/common.hpp @@ -120,9 +120,3 @@ target_include_directories(${TARGET_NAME} PRIVATE ${L0_DIR}/../../../ LevelZeroLoader-Headers ) - -if(UNIX) - set(GCC_COVERAGE_COMPILE_FLAGS "-fvisibility=hidden -fvisibility-inlines-hidden -fno-strict-aliasing") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${GCC_COVERAGE_COMPILE_FLAGS}") -endif() - diff --git a/source/adapters/null/CMakeLists.txt b/source/adapters/null/CMakeLists.txt index 56b8815929..0d4aa13e01 100644 --- a/source/adapters/null/CMakeLists.txt +++ b/source/adapters/null/CMakeLists.txt @@ -5,7 +5,7 @@ set(TARGET_NAME ur_adapter_null) -add_library(${TARGET_NAME} +add_ur_adapter(${TARGET_NAME} SHARED ${CMAKE_CURRENT_SOURCE_DIR}/ur_null.hpp ${CMAKE_CURRENT_SOURCE_DIR}/ur_null.cpp @@ -21,8 +21,3 @@ target_link_libraries(${TARGET_NAME} PRIVATE ${PROJECT_NAME}::headers ${PROJECT_NAME}::common ) - -if(UNIX) - set(GCC_COVERAGE_COMPILE_FLAGS "-fvisibility=hidden -fvisibility-inlines-hidden -fno-strict-aliasing") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} 
${GCC_COVERAGE_COMPILE_FLAGS}") -endif() From 189588362c2420dcc828804e53b9789b32ec2741 Mon Sep 17 00:00:00 2001 From: "Kenneth Benzie (Benie)" Date: Fri, 28 Jul 2023 16:31:54 +0100 Subject: [PATCH 12/16] [ur] Fix adapters linker script path on Linux Use an absolute path instead of just the filename for the `ur_adapter_.map` files to resolve `/usr/bin/ld: cannot open linker script file` errors. --- source/adapters/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/adapters/CMakeLists.txt b/source/adapters/CMakeLists.txt index 8a885923c0..c003671e0c 100644 --- a/source/adapters/CMakeLists.txt +++ b/source/adapters/CMakeLists.txt @@ -19,7 +19,7 @@ function(add_ur_adapter name) set(TARGET_LIBNAME lib${name}_${PROJECT_VERSION_MAJOR}.0) string(TOUPPER ${TARGET_LIBNAME} TARGET_LIBNAME) - set(ADAPTER_VERSION_SCRIPT ${name}.map) + set(ADAPTER_VERSION_SCRIPT ${CMAKE_CURRENT_BINARY_DIR}/${name}.map) configure_file(${CMAKE_CURRENT_SOURCE_DIR}/../adapter.map.in ${ADAPTER_VERSION_SCRIPT} @ONLY) target_link_options(${name} PRIVATE "-Wl,--version-script=${ADAPTER_VERSION_SCRIPT}") From fcded3d087c1218b1172dad74f3cb976005b2f4d Mon Sep 17 00:00:00 2001 From: Petr Vesely Date: Fri, 14 Jul 2023 13:06:41 +0100 Subject: [PATCH 13/16] [UR] Implement CUDA native handle tests --- test/conformance/CMakeLists.txt | 1 + test/conformance/adapters/CMakeLists.txt | 3 ++ test/conformance/adapters/cuda/CMakeLists.txt | 15 ++++++++ .../conformance/adapters/cuda/cuda_fixtures.h | 38 +++++++++++++++++++ .../cuda/cuda_urContextGetNativeHandle.cpp | 13 +++++++ .../cuda_urDeviceCreateWithNativeHandle.cpp | 18 +++++++++ .../cuda/cuda_urDeviceGetNativeHandle.cpp | 15 ++++++++ .../cuda_urEventCreateWithNativeHandle.cpp | 19 ++++++++++ .../cuda/cuda_urEventGetNativeHandle.cpp | 25 ++++++++++++ 9 files changed, 147 insertions(+) create mode 100644 test/conformance/adapters/CMakeLists.txt create mode 100644 test/conformance/adapters/cuda/CMakeLists.txt create mode 100644 
test/conformance/adapters/cuda/cuda_fixtures.h create mode 100644 test/conformance/adapters/cuda/cuda_urContextGetNativeHandle.cpp create mode 100644 test/conformance/adapters/cuda/cuda_urDeviceCreateWithNativeHandle.cpp create mode 100644 test/conformance/adapters/cuda/cuda_urDeviceGetNativeHandle.cpp create mode 100644 test/conformance/adapters/cuda/cuda_urEventCreateWithNativeHandle.cpp create mode 100644 test/conformance/adapters/cuda/cuda_urEventGetNativeHandle.cpp diff --git a/test/conformance/CMakeLists.txt b/test/conformance/CMakeLists.txt index 5c14d3e34a..fe6eb43f9b 100644 --- a/test/conformance/CMakeLists.txt +++ b/test/conformance/CMakeLists.txt @@ -45,6 +45,7 @@ function(add_conformance_test_with_platform_environment name) endfunction() add_subdirectory(testing) +add_subdirectory(adapters) add_subdirectory(platform) add_subdirectory(device) diff --git a/test/conformance/adapters/CMakeLists.txt b/test/conformance/adapters/CMakeLists.txt new file mode 100644 index 0000000000..c49bdfd412 --- /dev/null +++ b/test/conformance/adapters/CMakeLists.txt @@ -0,0 +1,3 @@ +if(UR_BUILD_ADAPTER_CUDA) + add_subdirectory(cuda) +endif() diff --git a/test/conformance/adapters/cuda/CMakeLists.txt b/test/conformance/adapters/cuda/CMakeLists.txt new file mode 100644 index 0000000000..bed37d8070 --- /dev/null +++ b/test/conformance/adapters/cuda/CMakeLists.txt @@ -0,0 +1,15 @@ + +add_conformance_test_with_devices_environment(adapter-cuda + cuda_fixtures.h + cuda_urContextGetNativeHandle.cpp + cuda_urDeviceGetNativeHandle.cpp + cuda_urDeviceCreateWithNativeHandle.cpp + cuda_urEventGetNativeHandle.cpp + cuda_urEventCreateWithNativeHandle.cpp +) +target_link_libraries(test-adapter-cuda PRIVATE cudadrv) + +set_tests_properties(adapter-cuda PROPERTIES + LABELS "conformance:cuda" + ENVIRONMENT "UR_ADAPTERS_FORCE_LOAD=\"$\"" + ) diff --git a/test/conformance/adapters/cuda/cuda_fixtures.h b/test/conformance/adapters/cuda/cuda_fixtures.h new file mode 100644 index 
0000000000..2624abc434 --- /dev/null +++ b/test/conformance/adapters/cuda/cuda_fixtures.h @@ -0,0 +1,38 @@ +#ifndef UR_TEST_CONFORMANCE_ADAPTERS_CUDA_FIXTURES_H_INCLUDED +#define UR_TEST_CONFORMANCE_ADAPTERS_CUDA_FIXTURES_H_INCLUDED +#include +#include + +namespace uur { +struct ResultCuda { + + constexpr ResultCuda(CUresult result) noexcept : value(result) {} + + inline bool operator==(const ResultCuda &rhs) const noexcept { + return rhs.value == value; + } + + CUresult value; +}; +} // namespace uur + +#ifndef ASSERT_EQ_RESULT_CUDA +#define ASSERT_EQ_RESULT_CUDA(EXPECTED, ACTUAL) \ + ASSERT_EQ(uur::ResultCuda(EXPECTED), uur::ResultCuda(ACTUAL)) +#endif // ASSERT_EQ_RESULT_CUDA + +#ifndef ASSERT_SUCCESS_CUDA +#define ASSERT_SUCCESS_CUDA(ACTUAL) ASSERT_EQ_RESULT_CUDA(CUDA_SUCCESS, ACTUAL) +#endif // ASSERT_SUCCESS_CUDA + +#ifndef EXPECT_EQ_RESULT_CUDA +#define EXPECT_EQ_RESULT_CUDA(EXPECTED, ACTUAL) \ + EXPECT_EQ_RESULT_CUDA(uur::ResultCuda(EXPECTED), uur::ResultCuda(ACTUAL)) +#endif // EXPECT_EQ_RESULT_CUDA + +#ifndef EXPECT_SUCCESS_CUDA +#define EXPECT_SUCCESS_CUDA(ACTUAL) \ + EXPECT_EQ_RESULT_CUDA(UR_RESULT_SUCCESS, ACTUAL) +#endif // EXPECT_EQ_RESULT_CUDA + +#endif // UR_TEST_CONFORMANCE_ADAPTERS_CUDA_FIXTURES_H_INCLUDED diff --git a/test/conformance/adapters/cuda/cuda_urContextGetNativeHandle.cpp b/test/conformance/adapters/cuda/cuda_urContextGetNativeHandle.cpp new file mode 100644 index 0000000000..b9d199516d --- /dev/null +++ b/test/conformance/adapters/cuda/cuda_urContextGetNativeHandle.cpp @@ -0,0 +1,13 @@ +#include "cuda_fixtures.h" + +using urCudaContextGetNativeHandle = uur::urContextTest; +UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(urCudaContextGetNativeHandle); + +TEST_P(urCudaContextGetNativeHandle, Success) { + ur_native_handle_t native_context = nullptr; + ASSERT_SUCCESS(urContextGetNativeHandle(context, &native_context)); + CUcontext cuda_context = reinterpret_cast(native_context); + + unsigned int cudaVersion; + 
ASSERT_SUCCESS_CUDA(cuCtxGetApiVersion(cuda_context, &cudaVersion)); +} diff --git a/test/conformance/adapters/cuda/cuda_urDeviceCreateWithNativeHandle.cpp b/test/conformance/adapters/cuda/cuda_urDeviceCreateWithNativeHandle.cpp new file mode 100644 index 0000000000..89fddfaf83 --- /dev/null +++ b/test/conformance/adapters/cuda/cuda_urDeviceCreateWithNativeHandle.cpp @@ -0,0 +1,18 @@ +#include "cuda_fixtures.h" + +using urCudaDeviceCreateWithNativeHandle = uur::urPlatformTest; + +TEST_F(urCudaDeviceCreateWithNativeHandle, Success) { + // get a device from cuda + int nCudaDevices; + ASSERT_SUCCESS_CUDA(cuDeviceGetCount(&nCudaDevices)); + ASSERT_GT(nCudaDevices, 0); + CUdevice cudaDevice; + ASSERT_SUCCESS_CUDA(cuDeviceGet(&cudaDevice, 0)); + + ur_native_handle_t nativeCuda = + reinterpret_cast(cudaDevice); + ur_device_handle_t urDevice; + ASSERT_SUCCESS(urDeviceCreateWithNativeHandle(nativeCuda, platform, nullptr, + &urDevice)); +} diff --git a/test/conformance/adapters/cuda/cuda_urDeviceGetNativeHandle.cpp b/test/conformance/adapters/cuda/cuda_urDeviceGetNativeHandle.cpp new file mode 100644 index 0000000000..cfedae68ad --- /dev/null +++ b/test/conformance/adapters/cuda/cuda_urDeviceGetNativeHandle.cpp @@ -0,0 +1,15 @@ +#include "cuda_fixtures.h" + +using urCudaGetDeviceNativeHandle = uur::urDeviceTest; +UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(urCudaGetDeviceNativeHandle); + +TEST_P(urCudaGetDeviceNativeHandle, Success) { + ur_native_handle_t native_handle; + ASSERT_SUCCESS(urDeviceGetNativeHandle(device, &native_handle)); + + CUdevice cuda_device = *reinterpret_cast(&native_handle); + + char cuda_device_name[256]; + ASSERT_SUCCESS_CUDA(cuDeviceGetName(cuda_device_name, + sizeof(cuda_device_name), cuda_device)); +} diff --git a/test/conformance/adapters/cuda/cuda_urEventCreateWithNativeHandle.cpp b/test/conformance/adapters/cuda/cuda_urEventCreateWithNativeHandle.cpp new file mode 100644 index 0000000000..d85b83902f --- /dev/null +++ 
b/test/conformance/adapters/cuda/cuda_urEventCreateWithNativeHandle.cpp @@ -0,0 +1,19 @@ +#include "cuda_fixtures.h" + +using urCudaEventCreateWithNativeHandleTest = uur::urQueueTest; +UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(urCudaEventCreateWithNativeHandleTest); + +TEST_P(urCudaEventCreateWithNativeHandleTest, Success) { + + CUevent cuda_event; + ASSERT_SUCCESS_CUDA(cuEventCreate(&cuda_event, CU_EVENT_DEFAULT)); + + ur_native_handle_t native_event = + reinterpret_cast(cuda_event); + + ur_event_handle_t event = nullptr; + ASSERT_SUCCESS( + urEventCreateWithNativeHandle(native_event, context, nullptr, &event)); + + ASSERT_SUCCESS(urEventRelease(event)); +} diff --git a/test/conformance/adapters/cuda/cuda_urEventGetNativeHandle.cpp b/test/conformance/adapters/cuda/cuda_urEventGetNativeHandle.cpp new file mode 100644 index 0000000000..484b2e88c6 --- /dev/null +++ b/test/conformance/adapters/cuda/cuda_urEventGetNativeHandle.cpp @@ -0,0 +1,25 @@ +#include "cuda_fixtures.h" + +using urCudaEventGetNativeHandleTest = uur::urQueueTest; +UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(urCudaEventGetNativeHandleTest); + +TEST_P(urCudaEventGetNativeHandleTest, Success) { + constexpr size_t buffer_size = 1024; + ur_mem_handle_t mem = nullptr; + ASSERT_SUCCESS(urMemBufferCreate(context, UR_MEM_FLAG_READ_WRITE, + buffer_size, nullptr, &mem)); + + ur_event_handle_t event = nullptr; + uint8_t pattern = 6; + ASSERT_SUCCESS(urEnqueueMemBufferFill(queue, mem, &pattern, sizeof(pattern), + 0, buffer_size, 0, nullptr, &event)); + + ur_native_handle_t native_event = nullptr; + ASSERT_SUCCESS(urEventGetNativeHandle(event, &native_event)); + CUevent cuda_event = reinterpret_cast(native_event); + + ASSERT_SUCCESS_CUDA(cuEventSynchronize(cuda_event)); + + ASSERT_SUCCESS(urEventRelease(event)); + ASSERT_SUCCESS(urMemRelease(mem)); +} From ca91dc97611bc8abe542e681539be36eb07f854c Mon Sep 17 00:00:00 2001 From: Petr Vesely Date: Tue, 1 Aug 2023 12:52:48 +0100 Subject: [PATCH 14/16] [UR] Add license to missing 
files --- test/conformance/adapters/CMakeLists.txt | 5 +++++ test/conformance/adapters/cuda/CMakeLists.txt | 4 ++++ test/conformance/adapters/cuda/cuda_fixtures.h | 5 +++++ .../adapters/cuda/cuda_urContextGetNativeHandle.cpp | 5 +++++ .../adapters/cuda/cuda_urDeviceCreateWithNativeHandle.cpp | 5 +++++ .../adapters/cuda/cuda_urDeviceGetNativeHandle.cpp | 5 +++++ .../adapters/cuda/cuda_urEventCreateWithNativeHandle.cpp | 5 +++++ .../adapters/cuda/cuda_urEventGetNativeHandle.cpp | 5 +++++ 8 files changed, 39 insertions(+) diff --git a/test/conformance/adapters/CMakeLists.txt b/test/conformance/adapters/CMakeLists.txt index c49bdfd412..7b9324d5c5 100644 --- a/test/conformance/adapters/CMakeLists.txt +++ b/test/conformance/adapters/CMakeLists.txt @@ -1,3 +1,8 @@ +# Copyright (C) 2023 Intel Corporation +# Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. +# See LICENSE.TXT +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + if(UR_BUILD_ADAPTER_CUDA) add_subdirectory(cuda) endif() diff --git a/test/conformance/adapters/cuda/CMakeLists.txt b/test/conformance/adapters/cuda/CMakeLists.txt index bed37d8070..241eb87a8c 100644 --- a/test/conformance/adapters/cuda/CMakeLists.txt +++ b/test/conformance/adapters/cuda/CMakeLists.txt @@ -1,3 +1,7 @@ +# Copyright (C) 2023 Intel Corporation +# Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. +# See LICENSE.TXT +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception add_conformance_test_with_devices_environment(adapter-cuda cuda_fixtures.h diff --git a/test/conformance/adapters/cuda/cuda_fixtures.h b/test/conformance/adapters/cuda/cuda_fixtures.h index 2624abc434..e367a4aa2c 100644 --- a/test/conformance/adapters/cuda/cuda_fixtures.h +++ b/test/conformance/adapters/cuda/cuda_fixtures.h @@ -1,3 +1,8 @@ +// Copyright (C) 2022-2023 Intel Corporation +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + #ifndef UR_TEST_CONFORMANCE_ADAPTERS_CUDA_FIXTURES_H_INCLUDED #define UR_TEST_CONFORMANCE_ADAPTERS_CUDA_FIXTURES_H_INCLUDED #include diff --git a/test/conformance/adapters/cuda/cuda_urContextGetNativeHandle.cpp b/test/conformance/adapters/cuda/cuda_urContextGetNativeHandle.cpp index b9d199516d..0a2c855360 100644 --- a/test/conformance/adapters/cuda/cuda_urContextGetNativeHandle.cpp +++ b/test/conformance/adapters/cuda/cuda_urContextGetNativeHandle.cpp @@ -1,3 +1,8 @@ +// Copyright (C) 2022-2023 Intel Corporation +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. +// See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + #include "cuda_fixtures.h" using urCudaContextGetNativeHandle = uur::urContextTest; diff --git a/test/conformance/adapters/cuda/cuda_urDeviceCreateWithNativeHandle.cpp b/test/conformance/adapters/cuda/cuda_urDeviceCreateWithNativeHandle.cpp index 89fddfaf83..b116c9a5c9 100644 --- a/test/conformance/adapters/cuda/cuda_urDeviceCreateWithNativeHandle.cpp +++ b/test/conformance/adapters/cuda/cuda_urDeviceCreateWithNativeHandle.cpp @@ -1,3 +1,8 @@ +// Copyright (C) 2022-2023 Intel Corporation +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + #include "cuda_fixtures.h" using urCudaDeviceCreateWithNativeHandle = uur::urPlatformTest; diff --git a/test/conformance/adapters/cuda/cuda_urDeviceGetNativeHandle.cpp b/test/conformance/adapters/cuda/cuda_urDeviceGetNativeHandle.cpp index cfedae68ad..3d2cfd33b7 100644 --- a/test/conformance/adapters/cuda/cuda_urDeviceGetNativeHandle.cpp +++ b/test/conformance/adapters/cuda/cuda_urDeviceGetNativeHandle.cpp @@ -1,3 +1,8 @@ +// Copyright (C) 2022-2023 Intel Corporation +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. +// See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + #include "cuda_fixtures.h" using urCudaGetDeviceNativeHandle = uur::urDeviceTest; diff --git a/test/conformance/adapters/cuda/cuda_urEventCreateWithNativeHandle.cpp b/test/conformance/adapters/cuda/cuda_urEventCreateWithNativeHandle.cpp index d85b83902f..94ae9ad80b 100644 --- a/test/conformance/adapters/cuda/cuda_urEventCreateWithNativeHandle.cpp +++ b/test/conformance/adapters/cuda/cuda_urEventCreateWithNativeHandle.cpp @@ -1,3 +1,8 @@ +// Copyright (C) 2022-2023 Intel Corporation +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. +// See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + #include "cuda_fixtures.h" using urCudaEventCreateWithNativeHandleTest = uur::urQueueTest; diff --git a/test/conformance/adapters/cuda/cuda_urEventGetNativeHandle.cpp b/test/conformance/adapters/cuda/cuda_urEventGetNativeHandle.cpp index 484b2e88c6..a6185868fb 100644 --- a/test/conformance/adapters/cuda/cuda_urEventGetNativeHandle.cpp +++ b/test/conformance/adapters/cuda/cuda_urEventGetNativeHandle.cpp @@ -1,3 +1,8 @@ +// Copyright (C) 2022-2023 Intel Corporation +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + #include "cuda_fixtures.h" using urCudaEventGetNativeHandleTest = uur::urQueueTest; From 759e7535732df8f19a3134fc77178a90adf11fdf Mon Sep 17 00:00:00 2001 From: pbalcer Date: Tue, 1 Aug 2023 12:11:49 +0200 Subject: [PATCH 15/16] [ADAPTERS] update to latest from sycl --- CMakeLists.txt | 2 +- source/adapters/CMakeLists.txt | 6 ++++-- source/adapters/cuda/CMakeLists.txt | 10 ++++++---- source/adapters/hip/CMakeLists.txt | 10 ++++++---- source/adapters/level_zero/CMakeLists.txt | 10 ++++++---- .../src/memory_provider_get_last_failed.cpp | 2 +- .../src/memory_provider_internal.h | 2 +- 7 files changed, 25 insertions(+), 17 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 1107f0ac3e..9971d3f40d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -91,7 +91,7 @@ if(UR_ENABLE_TRACING) set(XPTI_DIR ${xpti_SOURCE_DIR}) set(XPTI_ENABLE_TESTS OFF CACHE INTERNAL "Turn off xptifw tests") - FetchContentSparse_Declare(xptifw https://github.com/intel/llvm.git "sycl-nightly/20230304" "xptifw") + FetchContentSparse_Declare(xptifw https://github.com/intel/llvm.git "sycl-nightly/20230703" "xptifw") FetchContent_MakeAvailable(xptifw) diff --git a/source/adapters/CMakeLists.txt b/source/adapters/CMakeLists.txt index c003671e0c..8de581ac68 100644 --- a/source/adapters/CMakeLists.txt +++ b/source/adapters/CMakeLists.txt @@ -4,7 +4,7 @@ # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception function(add_ur_adapter name) - add_library(${name} ${ARGN}) + add_ur_library(${name} ${ARGN}) if(MSVC) set(TARGET_LIBNAME ${name}) string(TOUPPER ${TARGET_LIBNAME} TARGET_LIBNAME) @@ -15,6 +15,8 @@ function(add_ur_adapter name) set_target_properties(${name} PROPERTIES LINK_FLAGS "/DEF:${ADAPTER_VERSION_SCRIPT}" ) + elseif(APPLE) + target_compile_options(${name} PRIVATE "-fvisibility=hidden") else() set(TARGET_LIBNAME lib${name}_${PROJECT_VERSION_MAJOR}.0) string(TOUPPER ${TARGET_LIBNAME} TARGET_LIBNAME) 
@@ -31,7 +33,7 @@ add_subdirectory(null) if(UR_BUILD_ADAPTER_L0 OR UR_BUILD_ADAPTER_CUDA OR UR_BUILD_ADAPTER_HIP) # fetch adapter sources from SYCL set(SYCL_ADAPTER_DIR "${CMAKE_CURRENT_SOURCE_DIR}/external") - FetchSource(https://github.com/intel/llvm.git sycl-nightly/20230717 "sycl/plugins/unified_runtime/ur" ${SYCL_ADAPTER_DIR}) + FetchSource(https://github.com/intel/llvm.git nightly-2023-08-01 "sycl/plugins/unified_runtime/ur" ${SYCL_ADAPTER_DIR}) endif() if(UR_BUILD_ADAPTER_L0) diff --git a/source/adapters/cuda/CMakeLists.txt b/source/adapters/cuda/CMakeLists.txt index 29bdc87152..f85d759c09 100644 --- a/source/adapters/cuda/CMakeLists.txt +++ b/source/adapters/cuda/CMakeLists.txt @@ -10,6 +10,10 @@ set(TARGET_NAME ur_adapter_cuda) add_ur_adapter(${TARGET_NAME} SHARED ${CUDA_DIR}/ur_interface_loader.cpp + ${CUDA_DIR}/adapter.hpp + ${CUDA_DIR}/adapter.cpp + ${CUDA_DIR}/command_buffer.hpp + ${CUDA_DIR}/command_buffer.cpp ${CUDA_DIR}/common.hpp ${CUDA_DIR}/common.cpp ${CUDA_DIR}/context.hpp @@ -33,12 +37,9 @@ add_ur_adapter(${TARGET_NAME} ${CUDA_DIR}/sampler.cpp ${CUDA_DIR}/tracing.cpp ${CUDA_DIR}/usm.cpp + ${CUDA_DIR}/usm_p2p.cpp ${CUDA_DIR}/../../ur.cpp ${CUDA_DIR}/../../ur.hpp - ${CUDA_DIR}/../../usm_allocator.cpp - ${CUDA_DIR}/../../usm_allocator.hpp - ${CUDA_DIR}/../../usm_allocator_config.cpp - ${CUDA_DIR}/../../usm_allocator_config.hpp ) set_target_properties(${TARGET_NAME} PROPERTIES @@ -69,6 +70,7 @@ endif() target_link_libraries(${TARGET_NAME} PRIVATE ${PROJECT_NAME}::headers ${PROJECT_NAME}::common + ${PROJECT_NAME}::unified_malloc_framework Threads::Threads cudadrv ) diff --git a/source/adapters/hip/CMakeLists.txt b/source/adapters/hip/CMakeLists.txt index a014725268..2f205d84e6 100644 --- a/source/adapters/hip/CMakeLists.txt +++ b/source/adapters/hip/CMakeLists.txt @@ -45,6 +45,10 @@ set(HIP_HEADERS "${UR_HIP_INCLUDE_DIR};${UR_HIP_HSA_INCLUDE_DIR}") add_ur_adapter(${TARGET_NAME} SHARED ${HIP_DIR}/ur_interface_loader.cpp + ${HIP_DIR}/adapter.hpp + 
${HIP_DIR}/adapter.cpp + ${HIP_DIR}/command_buffer.hpp + ${HIP_DIR}/command_buffer.cpp ${HIP_DIR}/common.hpp ${HIP_DIR}/common.cpp ${HIP_DIR}/context.hpp @@ -67,12 +71,9 @@ add_ur_adapter(${TARGET_NAME} ${HIP_DIR}/sampler.hpp ${HIP_DIR}/sampler.cpp ${HIP_DIR}/usm.cpp + ${HIP_DIR}/usm_p2p.cpp ${HIP_DIR}/../../ur.cpp ${HIP_DIR}/../../ur.hpp - ${HIP_DIR}/../../usm_allocator.cpp - ${HIP_DIR}/../../usm_allocator.hpp - ${HIP_DIR}/../../usm_allocator_config.cpp - ${HIP_DIR}/../../usm_allocator_config.hpp ) if(NOT MSVC) @@ -100,6 +101,7 @@ if("${UR_HIP_PLATFORM}" STREQUAL "AMD") target_link_libraries(${TARGET_NAME} PRIVATE ${PROJECT_NAME}::headers ${PROJECT_NAME}::common + ${PROJECT_NAME}::unified_malloc_framework rocmdrv ) diff --git a/source/adapters/level_zero/CMakeLists.txt b/source/adapters/level_zero/CMakeLists.txt index fa089c4b9a..c361c230d3 100644 --- a/source/adapters/level_zero/CMakeLists.txt +++ b/source/adapters/level_zero/CMakeLists.txt @@ -74,6 +74,10 @@ target_include_directories(LevelZeroLoader-Headers add_ur_adapter(${TARGET_NAME} SHARED ${L0_DIR}/ur_interface_loader.cpp + ${L0_DIR}/adapter.hpp + ${L0_DIR}/adapter.cpp + ${L0_DIR}/command_buffer.hpp + ${L0_DIR}/command_buffer.cpp ${L0_DIR}/common.hpp ${L0_DIR}/context.hpp ${L0_DIR}/device.hpp @@ -91,6 +95,7 @@ add_ur_adapter(${TARGET_NAME} ${L0_DIR}/device.cpp ${L0_DIR}/event.cpp ${L0_DIR}/usm.cpp + ${L0_DIR}/usm_p2p.cpp ${L0_DIR}/memory.cpp ${L0_DIR}/kernel.cpp ${L0_DIR}/platform.cpp @@ -98,10 +103,6 @@ add_ur_adapter(${TARGET_NAME} ${L0_DIR}/queue.cpp ${L0_DIR}/sampler.cpp ${L0_DIR}/../../ur.cpp - ${L0_DIR}/../../usm_allocator.cpp - ${L0_DIR}/../../usm_allocator.hpp - ${L0_DIR}/../../usm_allocator_config.cpp - ${L0_DIR}/../../usm_allocator_config.hpp ) set_target_properties(${TARGET_NAME} PROPERTIES @@ -112,6 +113,7 @@ set_target_properties(${TARGET_NAME} PROPERTIES target_link_libraries(${TARGET_NAME} PRIVATE ${PROJECT_NAME}::headers ${PROJECT_NAME}::common + ${PROJECT_NAME}::unified_malloc_framework 
LevelZeroLoader LevelZeroLoader-Headers ) diff --git a/source/common/unified_malloc_framework/src/memory_provider_get_last_failed.cpp b/source/common/unified_malloc_framework/src/memory_provider_get_last_failed.cpp index c439213a26..f9af93206a 100644 --- a/source/common/unified_malloc_framework/src/memory_provider_get_last_failed.cpp +++ b/source/common/unified_malloc_framework/src/memory_provider_get_last_failed.cpp @@ -14,7 +14,7 @@ extern "C" { static thread_local umf_memory_provider_handle_t lastFailedProvider = nullptr; -umf_memory_provider_handle_t *umfGetLastFailedMemoryProviderPtr() { +umf_memory_provider_handle_t *umfGetLastFailedMemoryProviderPtr(void) { return &lastFailedProvider; } } diff --git a/source/common/unified_malloc_framework/src/memory_provider_internal.h b/source/common/unified_malloc_framework/src/memory_provider_internal.h index 07befd4b4e..2bad161706 100644 --- a/source/common/unified_malloc_framework/src/memory_provider_internal.h +++ b/source/common/unified_malloc_framework/src/memory_provider_internal.h @@ -18,7 +18,7 @@ extern "C" { #endif void *umfMemoryProviderGetPriv(umf_memory_provider_handle_t hProvider); -umf_memory_provider_handle_t *umfGetLastFailedMemoryProviderPtr(); +umf_memory_provider_handle_t *umfGetLastFailedMemoryProviderPtr(void); #ifdef __cplusplus } From 1984c1ddff5dd6b7d8a01348aa98eca500986b59 Mon Sep 17 00:00:00 2001 From: Patryk Kaminski Date: Fri, 30 Jun 2023 16:21:32 +0200 Subject: [PATCH 16/16] Test parts of API with libFuzzer. 
Co-authored-by: omar.ahmed@codeplay.com --- .github/workflows/cmake.yml | 7 + source/adapters/null/ur_null.cpp | 63 +++++ test/CMakeLists.txt | 3 + test/conformance/CMakeLists.txt | 2 +- test/fuzz/CMakeLists.txt | 35 +++ test/fuzz/urFuzz.cpp | 397 +++++++++++++++++++++++++++++++ test/fuzz/utils.hpp | 199 ++++++++++++++++ 7 files changed, 705 insertions(+), 1 deletion(-) create mode 100644 test/fuzz/CMakeLists.txt create mode 100644 test/fuzz/urFuzz.cpp create mode 100644 test/fuzz/utils.hpp diff --git a/.github/workflows/cmake.yml b/.github/workflows/cmake.yml index 306552a110..a703fd8d84 100644 --- a/.github/workflows/cmake.yml +++ b/.github/workflows/cmake.yml @@ -17,6 +17,7 @@ jobs: build_type: Release compiler: {c: clang, cxx: clang++} libbacktrace: '-DVAL_USE_LIBBACKTRACE_BACKTRACE=OFF' + fuzztest: ON - os: 'ubuntu-22.04' build_type: Release compiler: {c: gcc, cxx: g++} @@ -25,6 +26,7 @@ jobs: build_type: Release compiler: {c: clang, cxx: clang++} libbacktrace: '-DVAL_USE_LIBBACKTRACE_BACKTRACE=ON' + fuzztest: ON - os: 'ubuntu-20.04' build_type: Release compiler: {c: gcc-7, cxx: g++-7} @@ -88,6 +90,11 @@ jobs: working-directory: ${{github.workspace}}/build run: ctest -C ${{matrix.build_type}} --output-on-failure -L "python|umf|loader|validation|tracing|unit|urtrace" + - name: Fuzz test + working-directory: ${{github.workspace}}/build + if: matrix.fuzztest == 'ON' + run: ctest -C ${{matrix.build_type}} --output-on-failure -L "fuzz" + adapter-build: name: Build - Adapters on Ubuntu strategy: diff --git a/source/adapters/null/ur_null.cpp b/source/adapters/null/ur_null.cpp index 18c8d89ef5..5a62761b67 100644 --- a/source/adapters/null/ur_null.cpp +++ b/source/adapters/null/ur_null.cpp @@ -163,5 +163,68 @@ context_t::context_t() { } return UR_RESULT_SUCCESS; }; + + ////////////////////////////////////////////////////////////////////////// + urDdiTable.USM.pfnHostAlloc = + [](ur_context_handle_t hContext, const ur_usm_desc_t *pUSMDesc, + ur_usm_pool_handle_t pool, 
size_t size, void **ppMem) { + if (size == 0) { + *ppMem = nullptr; + return UR_RESULT_ERROR_UNSUPPORTED_SIZE; + } + *ppMem = malloc(size); + if (ppMem == nullptr) { + return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + } + return UR_RESULT_SUCCESS; + }; + + ////////////////////////////////////////////////////////////////////////// + urDdiTable.USM.pfnDeviceAlloc = + [](ur_context_handle_t hContext, ur_device_handle_t hDevice, + const ur_usm_desc_t *pUSMDesc, ur_usm_pool_handle_t pool, + size_t size, void **ppMem) { + if (size == 0) { + *ppMem = nullptr; + return UR_RESULT_ERROR_UNSUPPORTED_SIZE; + } + *ppMem = malloc(size); + if (ppMem == nullptr) { + return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + } + return UR_RESULT_SUCCESS; + }; + + ////////////////////////////////////////////////////////////////////////// + urDdiTable.USM.pfnFree = [](ur_context_handle_t hContext, void *pMem) { + free(pMem); + return UR_RESULT_SUCCESS; + }; + + ////////////////////////////////////////////////////////////////////////// + urDdiTable.USM.pfnGetMemAllocInfo = + [](ur_context_handle_t hContext, const void *pMem, + ur_usm_alloc_info_t propName, size_t propSize, void *pPropValue, + size_t *pPropSizeRet) { + switch (propName) { + case UR_USM_ALLOC_INFO_TYPE: + *reinterpret_cast(pPropValue) = + pMem ? UR_USM_TYPE_DEVICE : UR_USM_TYPE_UNKNOWN; + if (pPropSizeRet != nullptr) { + *pPropSizeRet = sizeof(ur_usm_type_t); + } + break; + case UR_USM_ALLOC_INFO_SIZE: + *reinterpret_cast(pPropValue) = pMem ? 
SIZE_MAX : 0; + if (pPropSizeRet != nullptr) { + *pPropSizeRet = sizeof(size_t); + } + break; + default: + pPropValue = nullptr; + break; + } + return UR_RESULT_SUCCESS; + }; } } // namespace driver diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 79ca48236c..72564667ed 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -24,3 +24,6 @@ add_subdirectory(unit) if(UR_BUILD_TOOLS) add_subdirectory(tools) endif() +if(CMAKE_CXX_COMPILER_ID STREQUAL "Clang") + add_subdirectory(fuzz) +endif() diff --git a/test/conformance/CMakeLists.txt b/test/conformance/CMakeLists.txt index c078168cf9..eec9f37215 100644 --- a/test/conformance/CMakeLists.txt +++ b/test/conformance/CMakeLists.txt @@ -62,7 +62,7 @@ if(DEFINED UR_DPCXX) add_custom_target(generate_device_binaries) set(UR_CONFORMANCE_DEVICE_BINARIES_DIR - "${CMAKE_CURRENT_BINARY_DIR}/device_binaries/") + "${CMAKE_CURRENT_BINARY_DIR}/device_binaries" CACHE INTERNAL UR_CONFORMANCE_DEVICE_BINARIES_DIR) file(MAKE_DIRECTORY ${UR_CONFORMANCE_DEVICE_BINARIES_DIR}) if(DEFINED UR_CONFORMANCE_TARGET_TRIPLES) diff --git a/test/fuzz/CMakeLists.txt b/test/fuzz/CMakeLists.txt new file mode 100644 index 0000000000..8444e67c64 --- /dev/null +++ b/test/fuzz/CMakeLists.txt @@ -0,0 +1,35 @@ +# Copyright (C) 2023 Intel Corporation +# Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. 
+# See LICENSE.TXT +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +function(add_fuzz_test name) + set(TEST_TARGET_NAME fuzztest-${name}) + add_ur_executable(${TEST_TARGET_NAME} + ${ARGN}) + target_link_libraries(${TEST_TARGET_NAME} + PRIVATE + ${PROJECT_NAME}::loader + ${PROJECT_NAME}::headers + ${PROJECT_NAME}::common + -fsanitize=fuzzer -fprofile-instr-generate -fcoverage-mapping) + add_test(NAME ${TEST_TARGET_NAME} + COMMAND ${TEST_TARGET_NAME} -max_total_time=600 -seed=1 -shrink=1 -verbosity=1 + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) + set_tests_properties(${TEST_TARGET_NAME} PROPERTIES + LABELS "fuzz" + ENVIRONMENT + "XPTI_TRACE_ENABLE=1" + "XPTI_FRAMEWORK_DISPATCHER=$" + "XPTI_SUBSCRIBERS=$" + "UR_ENABLE_LAYERS=UR_LAYER_TRACING" + "UR_ADAPTERS_FORCE_LOAD=\"$\"") + target_compile_options(${TEST_TARGET_NAME} PRIVATE -g -fsanitize=fuzzer -fprofile-instr-generate -fcoverage-mapping) + target_compile_definitions(${TEST_TARGET_NAME} PRIVATE -DKERNEL_IL_PATH="${UR_CONFORMANCE_DEVICE_BINARIES_DIR}/bar/sycl_spir641.spv") + target_include_directories(${TEST_TARGET_NAME} PRIVATE ${UR_CONFORMANCE_DEVICE_BINARIES_DIR}) + + add_dependencies(${TEST_TARGET_NAME} generate_device_binaries) +endfunction() + +add_fuzz_test(base + urFuzz.cpp) diff --git a/test/fuzz/urFuzz.cpp b/test/fuzz/urFuzz.cpp new file mode 100644 index 0000000000..f88b496051 --- /dev/null +++ b/test/fuzz/urFuzz.cpp @@ -0,0 +1,397 @@ +// Copyright (C) 2023 Intel Corporation +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "kernel_entry_points.h" +#include "ur_api.h" +#include "utils.hpp" + +namespace fuzz { + +int ur_platform_get(TestState &state) { + ur_result_t res = urPlatformGet( + state.adapters.data(), state.adapters.size(), state.num_entries, + state.platforms.data(), &state.num_platforms); + if (res != UR_RESULT_SUCCESS) { + return -1; + } + if (state.platforms.size() != state.num_platforms) { + state.platforms.resize(state.num_platforms); + } + + return 0; +} + +int ur_device_get(TestState &state) { + if (state.platforms.empty() || + state.platform_num >= state.platforms.size() || + state.platforms[0] == nullptr) { + return -1; + } + + ur_result_t res = urDeviceGet(state.platforms[state.platform_num], + state.device_type, state.num_entries, + state.devices.data(), &state.num_devices); + if (res != UR_RESULT_SUCCESS) { + return -1; + } + if (state.devices.size() != state.num_devices) { + state.devices.resize(state.num_devices); + } + + return 0; +} + +int ur_device_release(TestState &state) { + if (state.devices.empty()) { + return -1; + } + + ur_result_t res = urDeviceRelease(state.devices.back()); + if (res == UR_RESULT_SUCCESS) { + state.devices.pop_back(); + } + + return 0; +} + +int ur_context_create(TestState &state) { + if (!check_device_exists(&state)) { + return -1; + } + + ur_context_handle_t context; + ur_result_t res = urContextCreate(state.devices.size(), + state.devices.data(), nullptr, &context); + if (res == UR_RESULT_SUCCESS) { + state.contexts.push_back(context); + } + + return 0; +} + +int ur_context_release(TestState &state) { + if (!check_context_exists(&state) || + state.contexts[state.context_num] == state.contexts.back()) { + return -1; + } + + ur_result_t res = urContextRelease(state.contexts.back()); + if (res == UR_RESULT_SUCCESS) { + state.contexts.pop_back(); + } + + return 0; +} + +int pool_create( + TestState &state, + std::map> &pool_allocs) { + if 
(!check_context_exists(&state)) { + return -1; + } + + ur_usm_pool_handle_t pool; + ur_usm_pool_desc_t pool_desc{UR_STRUCTURE_TYPE_USM_POOL_DESC, nullptr, + UR_USM_POOL_FLAG_ZERO_INITIALIZE_BLOCK}; + ur_result_t res = + urUSMPoolCreate(state.contexts[state.context_num], &pool_desc, &pool); + if (res == UR_RESULT_SUCCESS) { + pool_allocs[pool] = {}; + } + + return 0; +} + +int ur_usm_pool_create_host(TestState &state) { + return pool_create(state, state.pool_host_allocs); +} + +int ur_usm_pool_create_device(TestState &state) { + return pool_create(state, state.pool_device_allocs); +} + +int pool_release( + TestState &state, + std::map> &pool_allocs) { + if (!check_context_exists(&state)) { + return -1; + } + + uint8_t pool_num; + if (get_next_input_data(&state.input, &pool_num) != 0) { + return -1; + } + if (!check_pool_exists(&pool_allocs, pool_num)) { + return -1; + } + + auto &[pool, allocs] = *get_map_item(&pool_allocs, pool_num); + if (allocs.empty()) { + return -1; + } + + ur_result_t res = urUSMPoolRelease(pool); + if (res == UR_RESULT_SUCCESS) { + pool_allocs.erase(pool); + } + + return 0; +} + +int ur_usm_pool_release_host(TestState &state) { + return pool_release(state, state.pool_host_allocs); +} + +int ur_usm_pool_release_device(TestState &state) { + return pool_release(state, state.pool_device_allocs); +} + +int alloc_setup(TestState &state, uint16_t &alloc_size) { + if (!check_context_exists(&state)) { + return -1; + } + + if (get_next_input_data(&state.input, &alloc_size) != 0) { + return -1; + } + + return 0; +} + +int host_alloc(TestState &state, ur_usm_pool_handle_t pool, + std::vector &allocs) { + void *ptr; + uint16_t alloc_size; + ur_result_t res = UR_RESULT_SUCCESS; + + int ret = alloc_setup(state, alloc_size); + if (ret != 0) { + return -1; + } + + auto &context = state.contexts[state.context_num]; + res = urUSMHostAlloc(context, nullptr, pool, alloc_size, &ptr); + if (res == UR_RESULT_SUCCESS) { + allocs.push_back(ptr); + } + + return 0; +} + 
+int get_alloc_pool( + TestState &state, + std::map> &pool_map, + ur_usm_pool_handle_t pool, std::vector &allocs) { + uint8_t pool_num; + + if (get_next_input_data(&state.input, &pool_num) != 0) { + return -1; + } + if (!check_pool_exists(&pool_map, pool_num)) { + return -1; + } + + auto &[pool_tmp, allocs_tmp] = *get_map_item(&pool_map, pool_num); + pool = pool_tmp; + allocs = allocs_tmp; + + return 0; +} + +int ur_usm_host_alloc_pool(TestState &state) { + ur_usm_pool_handle_t pool = nullptr; + std::vector allocs; + + get_alloc_pool(state, state.pool_host_allocs, pool, allocs); + return host_alloc(state, pool, allocs); +} + +int ur_usm_host_alloc_no_pool(TestState &state) { + return host_alloc(state, nullptr, state.no_pool_host_allocs); +} + +int device_alloc(TestState &state, ur_usm_pool_handle_t pool, + std::vector &allocs) { + void *ptr; + uint16_t alloc_size; + ur_result_t res = UR_RESULT_SUCCESS; + + int ret = alloc_setup(state, alloc_size); + if (ret != 0) { + return -1; + } + + if (!check_device_exists(&state)) { + return -1; + } + + auto &context = state.contexts[state.context_num]; + auto &device = state.devices[state.device_num]; + res = urUSMDeviceAlloc(context, device, nullptr, pool, alloc_size, &ptr); + if (res == UR_RESULT_SUCCESS) { + allocs.push_back(ptr); + } + + return 0; +} + +int ur_usm_device_alloc_pool(TestState &state) { + ur_usm_pool_handle_t pool = nullptr; + std::vector allocs; + + get_alloc_pool(state, state.pool_device_allocs, pool, allocs); + return device_alloc(state, pool, allocs); +} + +int ur_usm_device_alloc_no_pool(TestState &state) { + return device_alloc(state, nullptr, state.no_pool_device_allocs); +} + +int free_pool(TestState &state, + std::map> &pool_map) { + if (pool_map.empty()) { + return -1; + } + + ur_usm_pool_handle_t pool = nullptr; + std::vector allocs; + + int ret = get_alloc_pool(state, pool_map, pool, allocs); + if (ret != 0 || allocs.empty()) { + return -1; + } + + urUSMFree(state.contexts[state.context_num], 
allocs.back()); + allocs.pop_back(); + + return 0; +} + +int ur_usm_free_host_pool(TestState &state) { + return free_pool(state, state.pool_host_allocs); +} + +int ur_usm_free_device_pool(TestState &state) { + return free_pool(state, state.pool_device_allocs); +} + +int free_no_pool(TestState &state, std::vector allocs) { + if (allocs.empty()) { + return -1; + } + + urUSMFree(state.contexts[state.context_num], allocs.back()); + allocs.pop_back(); + + return 0; +} + +int ur_usm_free_host_no_pool(TestState &state) { + return free_no_pool(state, state.no_pool_host_allocs); +} + +int ur_usm_free_device_no_pool(TestState &state) { + return free_no_pool(state, state.no_pool_device_allocs); +} + +// TODO: Extract API calls to separate functions +int ur_program_create_with_il(TestState &state) { + if (!check_context_exists(&state) || !check_device_exists(&state)) { + return -1; + } + + std::vector il_bin; + ur_program_handle_t program = nullptr; + ur_kernel_handle_t kernel = nullptr; + ur_queue_handle_t queue = nullptr; + ur_event_handle_t event = nullptr; + auto &context = state.contexts[state.context_num]; + auto &device = state.devices[state.device_num]; + std::string kernel_name = + uur::device_binaries::program_kernel_map["bar"][0]; + + load_kernel_source(il_bin); + urProgramCreateWithIL(context, il_bin.data(), il_bin.size(), nullptr, + &program); + urProgramBuild(context, program, nullptr); + urKernelCreate(program, kernel_name.data(), &kernel); + urQueueCreate(context, device, nullptr, &queue); + + const uint32_t nDim = 3; + const size_t gWorkOffset[] = {0, 0, 0}; + const size_t gWorkSize[] = {128, 128, 128}; + + urEnqueueKernelLaunch(queue, kernel, nDim, gWorkOffset, gWorkSize, nullptr, + 0, nullptr, &event); + + urEventWait(1, &event); + urEventRelease(event); + urQueueFinish(queue); + urQueueRelease(queue); + urKernelRelease(kernel); + urProgramRelease(program); + + return 0; +} + +extern "C" int LLVMFuzzerTestOneInput(uint8_t *data, size_t size) { + int 
next_api_call; + TestState test_state; + int ret = -1; + + int (*api_wrappers[])(TestState &) = { + ur_platform_get, + ur_device_get, + ur_device_release, + ur_context_create, + ur_context_release, + ur_usm_pool_create_host, + ur_usm_pool_create_device, + ur_usm_pool_release_host, + ur_usm_pool_release_device, + ur_usm_host_alloc_pool, + ur_usm_host_alloc_no_pool, + ur_usm_device_alloc_pool, + ur_usm_device_alloc_no_pool, + ur_usm_free_host_pool, + ur_usm_free_host_no_pool, + ur_usm_free_device_pool, + ur_usm_free_device_no_pool, + ur_program_create_with_il, + }; + + test_state.input = {data, size}; + + ret = init_random_data(&test_state); + if (ret == -1) { + return ret; + } + + urLoaderConfigCreate(&test_state.config); + urLoaderConfigEnableLayer(test_state.config, "UR_LAYER_FULL_VALIDATION"); + ur_result_t res = urInit(0, test_state.config); + if (res != UR_RESULT_SUCCESS) { + return -1; + } + + test_state.adapters.resize(test_state.num_entries); + res = urAdapterGet(test_state.num_entries, test_state.adapters.data(), + &test_state.num_adapters); + if (res != UR_RESULT_SUCCESS || test_state.num_adapters == 0) { + return -1; + } + + while ((next_api_call = get_next_api_call(&test_state.input)) != -1) { + ret = api_wrappers[next_api_call](test_state); + if (ret) { + cleanup(&test_state); + return -1; + } + } + + cleanup(&test_state); + return 0; +} +} // namespace fuzz diff --git a/test/fuzz/utils.hpp b/test/fuzz/utils.hpp new file mode 100644 index 0000000000..330473a4b4 --- /dev/null +++ b/test/fuzz/utils.hpp @@ -0,0 +1,199 @@ +// Copyright (C) 2023 Intel Corporation +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include +#include +#include +#include +#include + +namespace fuzz { + +enum FuzzerAPICall { + UR_PLATFORM_GET, + UR_DEVICE_GET, + UR_DEVICE_RELEASE, + UR_CONTEXT_CREATE, + UR_CONTEXT_RELEASE, + UR_USM_POOL_CREATE_HOST, + UR_USM_POOL_CREATE_DEVICE, + UR_USM_POOL_RELEASE_HOST, + UR_USM_POOL_RELEASE_DEVICE, + UR_USM_HOST_ALLOC_POOL, + UR_USM_HOST_ALLOC_NO_POOL, + UR_USM_DEVICE_ALLOC_POOL, + UR_USM_DEVICE_ALLOC_NO_POOL, + UR_USM_FREE_HOST_POOL, + UR_USM_FREE_HOST_NO_POOL, + UR_USM_FREE_DEVICE_POOL, + UR_USM_FREE_DEVICE_NO_POOL, + UR_PROGRAM_CREATE_WITH_IL, + UR_MAX_FUZZER_API_CALL, +}; + +typedef struct FuzzerInput { + uint8_t *data_ptr; + size_t data_size; +} FuzzerInput; + +typedef struct TestState { + static constexpr uint32_t num_entries = 1; + + FuzzerInput input; + ur_loader_config_handle_t config; + + std::vector adapters; + std::vector platforms; + std::vector devices; + std::vector contexts; + std::map> pool_host_allocs; + std::map> pool_device_allocs; + std::vector no_pool_host_allocs; + std::vector no_pool_device_allocs; + ur_device_type_t device_type = UR_DEVICE_TYPE_ALL; + + uint32_t num_adapters; + uint32_t num_platforms; + uint32_t num_devices; + + uint8_t platform_num; + uint8_t device_num; + uint8_t context_num; + uint8_t device_type_fuzz; +} TestState; + +////////////////////////////////////////////////////////////////////////////// +template int get_next_input_data(FuzzerInput *input, T *out_data) { + size_t out_data_size = sizeof(out_data); + if (input->data_size == 0 || input->data_size < out_data_size) { + return -1; + } + *out_data = *input->data_ptr; + input->data_ptr += out_data_size; + input->data_size -= out_data_size; + + return 0; +} + +int init_random_data(TestState *state) { + if (state->input.data_size < 5) { + return -1; + } + get_next_input_data(&state->input, &state->platform_num); + get_next_input_data(&state->input, &state->device_num); + 
get_next_input_data(&state->input, &state->context_num); + get_next_input_data(&state->input, &state->device_type_fuzz); + if (state->device_type_fuzz < 1 || state->device_type_fuzz > 7) { + return -1; + } + state->device_type = static_cast(state->device_type_fuzz); + + return 0; +} + +int get_next_api_call(FuzzerInput *input) { + uint8_t next_api_call; + if (get_next_input_data(input, &next_api_call) != 0) { + return -1; + } + return next_api_call % UR_MAX_FUZZER_API_CALL; +} + +bool check_device_exists(const TestState *state) { + if (state->devices.empty() || state->device_num >= state->devices.size() || + state->devices[0] == nullptr) { + return false; + } + + return true; +} + +bool check_context_exists(const TestState *state) { + if (state->contexts.empty() || + state->context_num >= state->contexts.size() || + state->contexts[0] == nullptr) { + return false; + } + + return true; +} + +bool check_pool_exists( + const std::map> *map, + const uint8_t pool_num) { + if (pool_num >= map->size()) { + return false; + } + + return true; +} + +auto get_map_item(std::map> *map, + const uint8_t item_index) { + auto map_it = map->begin(); + std::advance(map_it, item_index); + + return map_it; +} + +int load_kernel_source(std::vector &binary_out) { + std::string source_path = KERNEL_IL_PATH; + + std::ifstream source_file; + source_file.open(source_path, + std::ios::binary | std::ios::in | std::ios::ate); + if (!source_file.is_open()) { + std::cerr << "Failed to open a kernel source file: " << source_path + << std::endl; + return -1; + } + + size_t source_size = static_cast(source_file.tellg()); + source_file.seekg(0, std::ios::beg); + + std::vector device_binary(source_size); + source_file.read(device_binary.data(), source_size); + if (!source_file) { + source_file.close(); + std::cerr << "failed reading kernel source data from file: " + << source_path << std::endl; + return -1; + } + source_file.close(); + + binary_out = std::vector(std::move(device_binary)); + + return 
0; +} + +void cleanup(TestState *state) { + urLoaderConfigRelease(state->config); + + for (auto &[pool, allocs] : state->pool_host_allocs) { + for (auto &alloc : allocs) { + urUSMFree(state->contexts[state->context_num], alloc); + } + urUSMPoolRelease(pool); + } + for (auto &[pool, allocs] : state->pool_device_allocs) { + for (auto &alloc : allocs) { + urUSMFree(state->contexts[state->context_num], alloc); + } + urUSMPoolRelease(pool); + } + for (auto &alloc : state->no_pool_host_allocs) { + urUSMFree(state->contexts[state->context_num], alloc); + } + for (auto &alloc : state->no_pool_device_allocs) { + urUSMFree(state->contexts[state->context_num], alloc); + } + for (auto &context : state->contexts) { + urContextRelease(context); + } + for (auto &device : state->devices) { + urDeviceRelease(device); + } +} +} // namespace fuzz