From 8a8c64975eac7ac845967206b8b7cfc68cf6e00a Mon Sep 17 00:00:00 2001
From: Omar Ahmed <omar.ahmed@codeplay.com>
Date: Fri, 14 Jul 2023 14:45:53 +0100
Subject: [PATCH] [UR][HIP] Add ur hip target build to ur repo

---
 CMakeLists.txt                                |    1 +
 README.md                                     |    2 +
 ...move-sycl-namespaces-from-ur-adapter.patch | 1048 -----------------
 source/adapters/CMakeLists.txt                |   10 +-
 source/adapters/hip/CMakeLists.txt            |  144 +++
 5 files changed, 153 insertions(+), 1052 deletions(-)
 delete mode 100644 source/adapters/0001-SYCL-CUDA-remove-sycl-namespaces-from-ur-adapter.patch
 create mode 100644 source/adapters/hip/CMakeLists.txt

diff --git a/CMakeLists.txt b/CMakeLists.txt
index bedb617d8c..42f42642a8 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -32,6 +32,7 @@ option(VAL_USE_LIBBACKTRACE_BACKTRACE "enable libbacktrace validation backtrace
 option(UR_BUILD_TOOLS "build ur tools" ON)
 option(UR_BUILD_ADAPTER_L0 "build level 0 adapter from SYCL" OFF)
 option(UR_BUILD_ADAPTER_CUDA "build cuda adapter from SYCL" OFF)
+option(UR_BUILD_ADAPTER_HIP "build hip adapter from SYCL" OFF)
 
 set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
 set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
diff --git a/README.md b/README.md
index e9ccf4a3a3..a9a1e0cabb 100644
--- a/README.md
+++ b/README.md
@@ -112,6 +112,8 @@ List of options provided by CMake:
 | UR_BUILD_TOOLS | Build tools | ON/OFF | ON |
 | UR_BUILD_ADAPTER_L0 | Fetch and use level-zero adapter from SYCL | ON/OFF | OFF |
 | UR_BUILD_ADAPTER_CUDA | Fetch and use cuda adapter from SYCL | ON/OFF | OFF |
+| UR_BUILD_ADAPTER_HIP | Fetch and use hip adapter from SYCL | ON/OFF | OFF |
+| UR_HIP_PLATFORM | Build hip adapter for AMD or NVIDIA platform | AMD/NVIDIA | AMD |
 
 **General**:
 
diff --git a/source/adapters/0001-SYCL-CUDA-remove-sycl-namespaces-from-ur-adapter.patch b/source/adapters/0001-SYCL-CUDA-remove-sycl-namespaces-from-ur-adapter.patch
deleted file mode 100644
index 8153e1cb85..0000000000
--- a/source/adapters/0001-SYCL-CUDA-remove-sycl-namespaces-from-ur-adapter.patch
+++ /dev/null
@@ -1,1048 +0,0 @@
-From fd78871a6bd2ff41ff37b8bd786c17f59911c677 Mon Sep 17 00:00:00 2001
-From: pbalcer <piotr.balcer@intel.com>
-Date: Thu, 29 Jun 2023 15:11:43 +0200
-Subject: [PATCH] [SYCL][CUDA] remove sycl dependencies from cuda ur adapter
-
-This was preventing out-of-tree build of the adapter for standalone
-use with unified runtime.
-
-Signed-off-by: Piotr Balcer <piotr.balcer@intel.com>
----
- .../ur/adapters/cuda/command_buffer.cpp       |  52 ++---
- .../ur/adapters/cuda/common.cpp               |   6 +-
- .../ur/adapters/cuda/common.hpp               |   5 -
- .../ur/adapters/cuda/context.cpp              |   2 +-
- .../ur/adapters/cuda/device.cpp               | 209 +++++++++---------
- .../ur/adapters/cuda/enqueue.cpp              |   2 +-
- .../ur/adapters/cuda/event.cpp                |  17 +-
- .../ur/adapters/cuda/kernel.cpp               |  42 ++--
- .../ur/adapters/cuda/memory.cpp               |   5 +-
- .../ur/adapters/cuda/queue.cpp                |   2 +-
- .../ur/adapters/cuda/sampler.cpp              |   2 +-
- 11 files changed, 167 insertions(+), 177 deletions(-)
-
-diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/command_buffer.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/command_buffer.cpp
-index c83e9e732303..57956cb64a67 100644
---- a/sycl/plugins/unified_runtime/ur/adapters/cuda/command_buffer.cpp
-+++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/command_buffer.cpp
-@@ -19,8 +19,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferCreateExp(
-   (void)hDevice;
-   (void)pCommandBufferDesc;
-   (void)phCommandBuffer;
--  sycl::detail::ur::die("Experimental Command-buffer feature is not "
--                        "implemented for CUDA adapter.");
-+  detail::ur::die("Experimental Command-buffer feature is not "
-+                  "implemented for CUDA adapter.");
-   return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
- }
- 
-@@ -28,8 +28,8 @@ UR_APIEXPORT ur_result_t UR_APICALL
- urCommandBufferRetainExp(ur_exp_command_buffer_handle_t hCommandBuffer) {
-   (void)hCommandBuffer;
- 
--  sycl::detail::ur::die("Experimental Command-buffer feature is not "
--                        "implemented for CUDA adapter.");
-+  detail::ur::die("Experimental Command-buffer feature is not "
-+                  "implemented for CUDA adapter.");
-   return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
- }
- 
-@@ -37,8 +37,8 @@ UR_APIEXPORT ur_result_t UR_APICALL
- urCommandBufferReleaseExp(ur_exp_command_buffer_handle_t hCommandBuffer) {
-   (void)hCommandBuffer;
- 
--  sycl::detail::ur::die("Experimental Command-buffer feature is not "
--                        "implemented for CUDA adapter.");
-+  detail::ur::die("Experimental Command-buffer feature is not "
-+                  "implemented for CUDA adapter.");
-   return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
- }
- 
-@@ -46,8 +46,8 @@ UR_APIEXPORT ur_result_t UR_APICALL
- urCommandBufferFinalizeExp(ur_exp_command_buffer_handle_t hCommandBuffer) {
-   (void)hCommandBuffer;
- 
--  sycl::detail::ur::die("Experimental Command-buffer feature is not "
--                        "implemented for CUDA adapter.");
-+  detail::ur::die("Experimental Command-buffer feature is not "
-+                  "implemented for CUDA adapter.");
-   return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
- }
- 
-@@ -68,8 +68,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp(
-   (void)pSyncPointWaitList;
-   (void)pSyncPoint;
- 
--  sycl::detail::ur::die("Experimental Command-buffer feature is not "
--                        "implemented for CUDA adapter.");
-+  detail::ur::die("Experimental Command-buffer feature is not "
-+                  "implemented for CUDA adapter.");
-   return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
- }
- 
-@@ -86,8 +86,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemcpyUSMExp(
-   (void)pSyncPointWaitList;
-   (void)pSyncPoint;
- 
--  sycl::detail::ur::die("Experimental Command-buffer feature is not "
--                        "implemented for CUDA adapter.");
-+  detail::ur::die("Experimental Command-buffer feature is not "
-+                  "implemented for CUDA adapter.");
-   return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
- }
- 
-@@ -107,8 +107,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMembufferCopyExp(
-   (void)pSyncPointWaitList;
-   (void)pSyncPoint;
- 
--  sycl::detail::ur::die("Experimental Command-buffer feature is not "
--                        "implemented for CUDA adapter.");
-+  detail::ur::die("Experimental Command-buffer feature is not "
-+                  "implemented for CUDA adapter.");
-   return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
- }
- 
-@@ -134,8 +134,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMembufferCopyRectExp(
-   (void)pSyncPointWaitList;
-   (void)pSyncPoint;
- 
--  sycl::detail::ur::die("Experimental Command-buffer feature is not "
--                        "implemented for CUDA adapter.");
-+  detail::ur::die("Experimental Command-buffer feature is not "
-+                  "implemented for CUDA adapter.");
-   return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
- }
- 
-@@ -155,8 +155,8 @@ ur_result_t UR_APICALL urCommandBufferAppendMembufferWriteExp(
-   (void)pSyncPointWaitList;
-   (void)pSyncPoint;
- 
--  sycl::detail::ur::die("Experimental Command-buffer feature is not "
--                        "implemented for CUDA adapter.");
-+  detail::ur::die("Experimental Command-buffer feature is not "
-+                  "implemented for CUDA adapter.");
-   return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
- }
- 
-@@ -175,8 +175,8 @@ ur_result_t UR_APICALL urCommandBufferAppendMembufferReadExp(
-   (void)pSyncPointWaitList;
-   (void)pSyncPoint;
- 
--  sycl::detail::ur::die("Experimental Command-buffer feature is not "
--                        "implemented for CUDA adapter.");
-+  detail::ur::die("Experimental Command-buffer feature is not "
-+                  "implemented for CUDA adapter.");
-   return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
- }
- 
-@@ -203,8 +203,8 @@ ur_result_t UR_APICALL urCommandBufferAppendMembufferWriteRectExp(
-   (void)pSyncPointWaitList;
-   (void)pSyncPoint;
- 
--  sycl::detail::ur::die("Experimental Command-buffer feature is not "
--                        "implemented for CUDA adapter.");
-+  detail::ur::die("Experimental Command-buffer feature is not "
-+                  "implemented for CUDA adapter.");
-   return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
- }
- 
-@@ -232,8 +232,8 @@ ur_result_t UR_APICALL urCommandBufferAppendMembufferReadRectExp(
-   (void)pSyncPointWaitList;
-   (void)pSyncPoint;
- 
--  sycl::detail::ur::die("Experimental Command-buffer feature is not "
--                        "implemented for CUDA adapter.");
-+  detail::ur::die("Experimental Command-buffer feature is not "
-+                  "implemented for CUDA adapter.");
-   return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
- }
- 
-@@ -247,7 +247,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp(
-   (void)phEventWaitList;
-   (void)phEvent;
- 
--  sycl::detail::ur::die("Experimental Command-buffer feature is not "
--                        "implemented for CUDA adapter.");
-+  detail::ur::die("Experimental Command-buffer feature is not "
-+                  "implemented for CUDA adapter.");
-   return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
- }
-diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/common.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/common.cpp
-index 86975e509725..83264160e700 100644
---- a/sycl/plugins/unified_runtime/ur/adapters/cuda/common.cpp
-+++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/common.cpp
-@@ -72,17 +72,17 @@ std::string getCudaVersionString() {
-   return stream.str();
- }
- 
--void sycl::detail::ur::die(const char *Message) {
-+void detail::ur::die(const char *Message) {
-   std::cerr << "ur_die: " << Message << std::endl;
-   std::terminate();
- }
- 
--void sycl::detail::ur::assertion(bool Condition, const char *Message) {
-+void detail::ur::assertion(bool Condition, const char *Message) {
-   if (!Condition)
-     die(Message);
- }
- 
--void sycl::detail::ur::cuPrint(const char *Message) {
-+void detail::ur::cuPrint(const char *Message) {
-   std::cerr << "ur_print: " << Message << std::endl;
- }
- 
-diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/common.hpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/common.hpp
-index 5cfa609018b2..82b38c10d449 100644
---- a/sycl/plugins/unified_runtime/ur/adapters/cuda/common.hpp
-+++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/common.hpp
-@@ -8,7 +8,6 @@
- #pragma once
- 
- #include <cuda.h>
--#include <sycl/detail/defines.hpp>
- #include <ur/ur.hpp>
- 
- ur_result_t mapErrorUR(CUresult Result);
-@@ -37,8 +36,6 @@ extern thread_local char ErrorMessage[MaxMessageSize];
-                                       ur_result_t ErrorCode);
- 
- /// ------ Error handling, matching OpenCL plugin semantics.
--namespace sycl {
--__SYCL_INLINE_VER_NAMESPACE(_V1) {
- namespace detail {
- namespace ur {
- 
-@@ -55,5 +52,3 @@ void assertion(bool Condition, const char *Message = nullptr);
- 
- } // namespace ur
- } // namespace detail
--} // __SYCL_INLINE_VER_NAMESPACE(_V1)
--} // namespace sycl
-diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/context.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/context.cpp
-index 74a32bdac274..2b621383da09 100644
---- a/sycl/plugins/unified_runtime/ur/adapters/cuda/context.cpp
-+++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/context.cpp
-@@ -66,7 +66,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urContextGetInfo(
-   }
-   case UR_CONTEXT_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES: {
-     int Major = 0;
--    sycl::detail::ur::assertion(
-+    detail::ur::assertion(
-         cuDeviceGetAttribute(&Major,
-                              CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
-                              hContext->getDevice()->get()) == CUDA_SUCCESS);
-diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp
-index 52d4e3badc8f..a81599d629a7 100644
---- a/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp
-+++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp
-@@ -15,7 +15,7 @@
- 
- int getAttribute(ur_device_handle_t device, CUdevice_attribute attribute) {
-   int value;
--  sycl::detail::ur::assertion(
-+  detail::ur::assertion(
-       cuDeviceGetAttribute(&value, attribute, device->get()) == CUDA_SUCCESS);
-   return value;
- }
-@@ -53,11 +53,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
-   }
-   case UR_DEVICE_INFO_MAX_COMPUTE_UNITS: {
-     int ComputeUnits = 0;
--    sycl::detail::ur::assertion(
-+    detail::ur::assertion(
-         cuDeviceGetAttribute(&ComputeUnits,
-                              CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
-                              hDevice->get()) == CUDA_SUCCESS);
--    sycl::detail::ur::assertion(ComputeUnits >= 0);
-+    detail::ur::assertion(ComputeUnits >= 0);
-     return ReturnValue(static_cast<uint32_t>(ComputeUnits));
-   }
-   case UR_DEVICE_INFO_MAX_WORK_ITEM_DIMENSIONS: {
-@@ -69,20 +69,20 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
-     } ReturnSizes;
- 
-     int MaxX = 0, MaxY = 0, MaxZ = 0;
--    sycl::detail::ur::assertion(
-+    detail::ur::assertion(
-         cuDeviceGetAttribute(&MaxX, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X,
-                              hDevice->get()) == CUDA_SUCCESS);
--    sycl::detail::ur::assertion(MaxX >= 0);
-+    detail::ur::assertion(MaxX >= 0);
- 
--    sycl::detail::ur::assertion(
-+    detail::ur::assertion(
-         cuDeviceGetAttribute(&MaxY, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y,
-                              hDevice->get()) == CUDA_SUCCESS);
--    sycl::detail::ur::assertion(MaxY >= 0);
-+    detail::ur::assertion(MaxY >= 0);
- 
--    sycl::detail::ur::assertion(
-+    detail::ur::assertion(
-         cuDeviceGetAttribute(&MaxZ, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z,
-                              hDevice->get()) == CUDA_SUCCESS);
--    sycl::detail::ur::assertion(MaxZ >= 0);
-+    detail::ur::assertion(MaxZ >= 0);
- 
-     ReturnSizes.Sizes[0] = size_t(MaxX);
-     ReturnSizes.Sizes[1] = size_t(MaxY);
-@@ -95,20 +95,20 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
-       size_t Sizes[MaxWorkItemDimensions];
-     } ReturnSizes;
-     int MaxX = 0, MaxY = 0, MaxZ = 0;
--    sycl::detail::ur::assertion(
-+    detail::ur::assertion(
-         cuDeviceGetAttribute(&MaxX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X,
-                              hDevice->get()) == CUDA_SUCCESS);
--    sycl::detail::ur::assertion(MaxX >= 0);
-+    detail::ur::assertion(MaxX >= 0);
- 
--    sycl::detail::ur::assertion(
-+    detail::ur::assertion(
-         cuDeviceGetAttribute(&MaxY, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y,
-                              hDevice->get()) == CUDA_SUCCESS);
--    sycl::detail::ur::assertion(MaxY >= 0);
-+    detail::ur::assertion(MaxY >= 0);
- 
--    sycl::detail::ur::assertion(
-+    detail::ur::assertion(
-         cuDeviceGetAttribute(&MaxZ, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z,
-                              hDevice->get()) == CUDA_SUCCESS);
--    sycl::detail::ur::assertion(MaxZ >= 0);
-+    detail::ur::assertion(MaxZ >= 0);
- 
-     ReturnSizes.Sizes[0] = size_t(MaxX);
-     ReturnSizes.Sizes[1] = size_t(MaxY);
-@@ -118,12 +118,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
- 
-   case UR_DEVICE_INFO_MAX_WORK_GROUP_SIZE: {
-     int MaxWorkGroupSize = 0;
--    sycl::detail::ur::assertion(
-+    detail::ur::assertion(
-         cuDeviceGetAttribute(&MaxWorkGroupSize,
-                              CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
-                              hDevice->get()) == CUDA_SUCCESS);
- 
--    sycl::detail::ur::assertion(MaxWorkGroupSize >= 0);
-+    detail::ur::assertion(MaxWorkGroupSize >= 0);
- 
-     return ReturnValue(size_t(MaxWorkGroupSize));
-   }
-@@ -172,14 +172,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
-   case UR_DEVICE_INFO_MAX_NUM_SUB_GROUPS: {
-     // Number of sub-groups = max block size / warp size + possible remainder
-     int MaxThreads = 0;
--    sycl::detail::ur::assertion(
-+    detail::ur::assertion(
-         cuDeviceGetAttribute(&MaxThreads,
-                              CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
-                              hDevice->get()) == CUDA_SUCCESS);
-     int WarpSize = 0;
--    sycl::detail::ur::assertion(
--        cuDeviceGetAttribute(&WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE,
--                             hDevice->get()) == CUDA_SUCCESS);
-+    detail::ur::assertion(cuDeviceGetAttribute(&WarpSize,
-+                                               CU_DEVICE_ATTRIBUTE_WARP_SIZE,
-+                                               hDevice->get()) == CUDA_SUCCESS);
-     int MaxWarps = (MaxThreads + WarpSize - 1) / WarpSize;
-     return ReturnValue(MaxWarps);
-   }
-@@ -187,7 +187,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
-     // Volta provides independent thread scheduling
-     // TODO: Revisit for previous generation GPUs
-     int Major = 0;
--    sycl::detail::ur::assertion(
-+    detail::ur::assertion(
-         cuDeviceGetAttribute(&Major,
-                              CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
-                              hDevice->get()) == CUDA_SUCCESS);
-@@ -197,7 +197,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
- 
-   case UR_DEVICE_INFO_ATOMIC_64: {
-     int Major = 0;
--    sycl::detail::ur::assertion(
-+    detail::ur::assertion(
-         cuDeviceGetAttribute(&Major,
-                              CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
-                              hDevice->get()) == CUDA_SUCCESS);
-@@ -214,7 +214,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
-   }
-   case UR_DEVICE_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES: {
-     int Major = 0;
--    sycl::detail::ur::assertion(
-+    detail::ur::assertion(
-         cuDeviceGetAttribute(&Major,
-                              CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
-                              hDevice->get()) == CUDA_SUCCESS);
-@@ -255,7 +255,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
-   }
-   case UR_DEVICE_INFO_BFLOAT16: {
-     int Major = 0;
--    sycl::detail::ur::assertion(
-+    detail::ur::assertion(
-         cuDeviceGetAttribute(&Major,
-                              CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
-                              hDevice->get()) == CUDA_SUCCESS);
-@@ -266,18 +266,18 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
-   case UR_DEVICE_INFO_SUB_GROUP_SIZES_INTEL: {
-     // NVIDIA devices only support one sub-group size (the warp size)
-     int WarpSize = 0;
--    sycl::detail::ur::assertion(
--        cuDeviceGetAttribute(&WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE,
--                             hDevice->get()) == CUDA_SUCCESS);
-+    detail::ur::assertion(cuDeviceGetAttribute(&WarpSize,
-+                                               CU_DEVICE_ATTRIBUTE_WARP_SIZE,
-+                                               hDevice->get()) == CUDA_SUCCESS);
-     size_t Sizes[1] = {static_cast<size_t>(WarpSize)};
-     return ReturnValue(Sizes, 1);
-   }
-   case UR_DEVICE_INFO_MAX_CLOCK_FREQUENCY: {
-     int ClockFreq = 0;
--    sycl::detail::ur::assertion(
--        cuDeviceGetAttribute(&ClockFreq, CU_DEVICE_ATTRIBUTE_CLOCK_RATE,
--                             hDevice->get()) == CUDA_SUCCESS);
--    sycl::detail::ur::assertion(ClockFreq >= 0);
-+    detail::ur::assertion(cuDeviceGetAttribute(&ClockFreq,
-+                                               CU_DEVICE_ATTRIBUTE_CLOCK_RATE,
-+                                               hDevice->get()) == CUDA_SUCCESS);
-+    detail::ur::assertion(ClockFreq >= 0);
-     return ReturnValue(static_cast<uint32_t>(ClockFreq) / 1000u);
-   }
-   case UR_DEVICE_INFO_ADDRESS_BITS: {
-@@ -292,8 +292,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
-     // CL_DEVICE_TYPE_CUSTOM.
- 
-     size_t Global = 0;
--    sycl::detail::ur::assertion(cuDeviceTotalMem(&Global, hDevice->get()) ==
--                                CUDA_SUCCESS);
-+    detail::ur::assertion(cuDeviceTotalMem(&Global, hDevice->get()) ==
-+                          CUDA_SUCCESS);
- 
-     auto QuarterGlobal = static_cast<uint32_t>(Global / 4u);
- 
-@@ -308,7 +308,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
-     if (std::getenv("SYCL_PI_CUDA_ENABLE_IMAGE_SUPPORT") != nullptr) {
-       Enabled = true;
-     } else {
--      sycl::detail::ur::cuPrint(
-+      detail::ur::cuPrint(
-           "Images are not fully supported by the CUDA BE, their support is "
-           "disabled by default. Their partial support can be activated by "
-           "setting SYCL_PI_CUDA_ENABLE_IMAGE_SUPPORT environment variable at "
-@@ -332,17 +332,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
-   case UR_DEVICE_INFO_IMAGE2D_MAX_HEIGHT: {
-     // Take the smaller of maximum surface and maximum texture height.
-     int TexHeight = 0;
--    sycl::detail::ur::assertion(
-+    detail::ur::assertion(
-         cuDeviceGetAttribute(&TexHeight,
-                              CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT,
-                              hDevice->get()) == CUDA_SUCCESS);
--    sycl::detail::ur::assertion(TexHeight >= 0);
-+    detail::ur::assertion(TexHeight >= 0);
-     int SurfHeight = 0;
--    sycl::detail::ur::assertion(
-+    detail::ur::assertion(
-         cuDeviceGetAttribute(&SurfHeight,
-                              CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT,
-                              hDevice->get()) == CUDA_SUCCESS);
--    sycl::detail::ur::assertion(SurfHeight >= 0);
-+    detail::ur::assertion(SurfHeight >= 0);
- 
-     int Min = std::min(TexHeight, SurfHeight);
- 
-@@ -351,17 +351,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
-   case UR_DEVICE_INFO_IMAGE2D_MAX_WIDTH: {
-     // Take the smaller of maximum surface and maximum texture width.
-     int TexWidth = 0;
--    sycl::detail::ur::assertion(
-+    detail::ur::assertion(
-         cuDeviceGetAttribute(&TexWidth,
-                              CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH,
-                              hDevice->get()) == CUDA_SUCCESS);
--    sycl::detail::ur::assertion(TexWidth >= 0);
-+    detail::ur::assertion(TexWidth >= 0);
-     int SurfWidth = 0;
--    sycl::detail::ur::assertion(
-+    detail::ur::assertion(
-         cuDeviceGetAttribute(&SurfWidth,
-                              CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH,
-                              hDevice->get()) == CUDA_SUCCESS);
--    sycl::detail::ur::assertion(SurfWidth >= 0);
-+    detail::ur::assertion(SurfWidth >= 0);
- 
-     int Min = std::min(TexWidth, SurfWidth);
- 
-@@ -370,17 +370,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
-   case UR_DEVICE_INFO_IMAGE3D_MAX_HEIGHT: {
-     // Take the smaller of maximum surface and maximum texture height.
-     int TexHeight = 0;
--    sycl::detail::ur::assertion(
-+    detail::ur::assertion(
-         cuDeviceGetAttribute(&TexHeight,
-                              CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT,
-                              hDevice->get()) == CUDA_SUCCESS);
--    sycl::detail::ur::assertion(TexHeight >= 0);
-+    detail::ur::assertion(TexHeight >= 0);
-     int SurfHeight = 0;
--    sycl::detail::ur::assertion(
-+    detail::ur::assertion(
-         cuDeviceGetAttribute(&SurfHeight,
-                              CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT,
-                              hDevice->get()) == CUDA_SUCCESS);
--    sycl::detail::ur::assertion(SurfHeight >= 0);
-+    detail::ur::assertion(SurfHeight >= 0);
- 
-     int Min = std::min(TexHeight, SurfHeight);
- 
-@@ -389,17 +389,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
-   case UR_DEVICE_INFO_IMAGE3D_MAX_WIDTH: {
-     // Take the smaller of maximum surface and maximum texture width.
-     int TexWidth = 0;
--    sycl::detail::ur::assertion(
-+    detail::ur::assertion(
-         cuDeviceGetAttribute(&TexWidth,
-                              CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH,
-                              hDevice->get()) == CUDA_SUCCESS);
--    sycl::detail::ur::assertion(TexWidth >= 0);
-+    detail::ur::assertion(TexWidth >= 0);
-     int SurfWidth = 0;
--    sycl::detail::ur::assertion(
-+    detail::ur::assertion(
-         cuDeviceGetAttribute(&SurfWidth,
-                              CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH,
-                              hDevice->get()) == CUDA_SUCCESS);
--    sycl::detail::ur::assertion(SurfWidth >= 0);
-+    detail::ur::assertion(SurfWidth >= 0);
- 
-     int Min = std::min(TexWidth, SurfWidth);
- 
-@@ -408,17 +408,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
-   case UR_DEVICE_INFO_IMAGE3D_MAX_DEPTH: {
-     // Take the smaller of maximum surface and maximum texture depth.
-     int TexDepth = 0;
--    sycl::detail::ur::assertion(
-+    detail::ur::assertion(
-         cuDeviceGetAttribute(&TexDepth,
-                              CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH,
-                              hDevice->get()) == CUDA_SUCCESS);
--    sycl::detail::ur::assertion(TexDepth >= 0);
-+    detail::ur::assertion(TexDepth >= 0);
-     int SurfDepth = 0;
--    sycl::detail::ur::assertion(
-+    detail::ur::assertion(
-         cuDeviceGetAttribute(&SurfDepth,
-                              CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH,
-                              hDevice->get()) == CUDA_SUCCESS);
--    sycl::detail::ur::assertion(SurfDepth >= 0);
-+    detail::ur::assertion(SurfDepth >= 0);
- 
-     int Min = std::min(TexDepth, SurfDepth);
- 
-@@ -427,17 +427,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
-   case UR_DEVICE_INFO_IMAGE_MAX_BUFFER_SIZE: {
-     // Take the smaller of maximum surface and maximum texture width.
-     int TexWidth = 0;
--    sycl::detail::ur::assertion(
-+    detail::ur::assertion(
-         cuDeviceGetAttribute(&TexWidth,
-                              CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH,
-                              hDevice->get()) == CUDA_SUCCESS);
--    sycl::detail::ur::assertion(TexWidth >= 0);
-+    detail::ur::assertion(TexWidth >= 0);
-     int SurfWidth = 0;
--    sycl::detail::ur::assertion(
-+    detail::ur::assertion(
-         cuDeviceGetAttribute(&SurfWidth,
-                              CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH,
-                              hDevice->get()) == CUDA_SUCCESS);
--    sycl::detail::ur::assertion(SurfWidth >= 0);
-+    detail::ur::assertion(SurfWidth >= 0);
- 
-     int Min = std::min(TexWidth, SurfWidth);
- 
-@@ -459,7 +459,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
-   }
-   case UR_DEVICE_INFO_MEM_BASE_ADDR_ALIGN: {
-     int MemBaseAddrAlign = 0;
--    sycl::detail::ur::assertion(
-+    detail::ur::assertion(
-         cuDeviceGetAttribute(&MemBaseAddrAlign,
-                              CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT,
-                              hDevice->get()) == CUDA_SUCCESS);
-@@ -504,27 +504,27 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
-   }
-   case UR_DEVICE_INFO_GLOBAL_MEM_CACHE_SIZE: {
-     int CacheSize = 0;
--    sycl::detail::ur::assertion(
-+    detail::ur::assertion(
-         cuDeviceGetAttribute(&CacheSize, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE,
-                              hDevice->get()) == CUDA_SUCCESS);
--    sycl::detail::ur::assertion(CacheSize >= 0);
-+    detail::ur::assertion(CacheSize >= 0);
-     // The L2 cache is global to the GPU.
-     return ReturnValue(static_cast<uint64_t>(CacheSize));
-   }
-   case UR_DEVICE_INFO_GLOBAL_MEM_SIZE: {
-     size_t Bytes = 0;
-     // Runtime API has easy access to this value, driver API info is scarse.
--    sycl::detail::ur::assertion(cuDeviceTotalMem(&Bytes, hDevice->get()) ==
--                                CUDA_SUCCESS);
-+    detail::ur::assertion(cuDeviceTotalMem(&Bytes, hDevice->get()) ==
-+                          CUDA_SUCCESS);
-     return ReturnValue(uint64_t{Bytes});
-   }
-   case UR_DEVICE_INFO_MAX_CONSTANT_BUFFER_SIZE: {
-     int ConstantMemory = 0;
--    sycl::detail::ur::assertion(
-+    detail::ur::assertion(
-         cuDeviceGetAttribute(&ConstantMemory,
-                              CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY,
-                              hDevice->get()) == CUDA_SUCCESS);
--    sycl::detail::ur::assertion(ConstantMemory >= 0);
-+    detail::ur::assertion(ConstantMemory >= 0);
- 
-     return ReturnValue(static_cast<uint64_t>(ConstantMemory));
-   }
-@@ -542,30 +542,30 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
-     // CUDA has its own definition of "local memory", which maps to OpenCL's
-     // "private memory".
-     int LocalMemSize = 0;
--    sycl::detail::ur::assertion(
-+    detail::ur::assertion(
-         cuDeviceGetAttribute(&LocalMemSize,
-                              CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK,
-                              hDevice->get()) == CUDA_SUCCESS);
--    sycl::detail::ur::assertion(LocalMemSize >= 0);
-+    detail::ur::assertion(LocalMemSize >= 0);
-     return ReturnValue(static_cast<uint64_t>(LocalMemSize));
-   }
-   case UR_DEVICE_INFO_ERROR_CORRECTION_SUPPORT: {
-     int ECCEnabled = 0;
--    sycl::detail::ur::assertion(
--        cuDeviceGetAttribute(&ECCEnabled, CU_DEVICE_ATTRIBUTE_ECC_ENABLED,
--                             hDevice->get()) == CUDA_SUCCESS);
-+    detail::ur::assertion(cuDeviceGetAttribute(&ECCEnabled,
-+                                               CU_DEVICE_ATTRIBUTE_ECC_ENABLED,
-+                                               hDevice->get()) == CUDA_SUCCESS);
- 
--    sycl::detail::ur::assertion((ECCEnabled == 0) | (ECCEnabled == 1));
-+    detail::ur::assertion((ECCEnabled == 0) | (ECCEnabled == 1));
-     auto Result = static_cast<bool>(ECCEnabled);
-     return ReturnValue(Result);
-   }
-   case UR_DEVICE_INFO_HOST_UNIFIED_MEMORY: {
-     int IsIntegrated = 0;
--    sycl::detail::ur::assertion(
--        cuDeviceGetAttribute(&IsIntegrated, CU_DEVICE_ATTRIBUTE_INTEGRATED,
--                             hDevice->get()) == CUDA_SUCCESS);
-+    detail::ur::assertion(cuDeviceGetAttribute(&IsIntegrated,
-+                                               CU_DEVICE_ATTRIBUTE_INTEGRATED,
-+                                               hDevice->get()) == CUDA_SUCCESS);
- 
--    sycl::detail::ur::assertion((IsIntegrated == 0) | (IsIntegrated == 1));
-+    detail::ur::assertion((IsIntegrated == 0) | (IsIntegrated == 1));
-     auto result = static_cast<bool>(IsIntegrated);
-     return ReturnValue(result);
-   }
-@@ -620,9 +620,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
-   case UR_DEVICE_INFO_NAME: {
-     static constexpr size_t MaxDeviceNameLength = 256u;
-     char Name[MaxDeviceNameLength];
--    sycl::detail::ur::assertion(
--        cuDeviceGetName(Name, MaxDeviceNameLength, hDevice->get()) ==
--        CUDA_SUCCESS);
-+    detail::ur::assertion(cuDeviceGetName(Name, MaxDeviceNameLength,
-+                                          hDevice->get()) == CUDA_SUCCESS);
-     return ReturnValue(Name, strlen(Name) + 1);
-   }
-   case UR_DEVICE_INFO_VENDOR: {
-@@ -641,13 +640,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
-   case UR_DEVICE_INFO_VERSION: {
-     std::stringstream SS;
-     int Major;
--    sycl::detail::ur::assertion(
-+    detail::ur::assertion(
-         cuDeviceGetAttribute(&Major,
-                              CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
-                              hDevice->get()) == CUDA_SUCCESS);
-     SS << Major;
-     int Minor;
--    sycl::detail::ur::assertion(
-+    detail::ur::assertion(
-         cuDeviceGetAttribute(&Minor,
-                              CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR,
-                              hDevice->get()) == CUDA_SUCCESS);
-@@ -666,11 +665,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
-     int Major = 0;
-     int Minor = 0;
- 
--    sycl::detail::ur::assertion(
-+    detail::ur::assertion(
-         cuDeviceGetAttribute(&Major,
-                              CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
-                              hDevice->get()) == CUDA_SUCCESS);
--    sycl::detail::ur::assertion(
-+    detail::ur::assertion(
-         cuDeviceGetAttribute(&Minor,
-                              CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR,
-                              hDevice->get()) == CUDA_SUCCESS);
-@@ -847,27 +846,27 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
-   case UR_DEVICE_INFO_GLOBAL_MEM_FREE: {
-     size_t FreeMemory = 0;
-     size_t TotalMemory = 0;
--    sycl::detail::ur::assertion(cuMemGetInfo(&FreeMemory, &TotalMemory) ==
--                                    CUDA_SUCCESS,
--                                "failed cuMemGetInfo() API.");
-+    detail::ur::assertion(cuMemGetInfo(&FreeMemory, &TotalMemory) ==
-+                              CUDA_SUCCESS,
-+                          "failed cuMemGetInfo() API.");
-     return ReturnValue(FreeMemory);
-   }
-   case UR_DEVICE_INFO_MEMORY_CLOCK_RATE: {
-     int Value = 0;
--    sycl::detail::ur::assertion(
-+    detail::ur::assertion(
-         cuDeviceGetAttribute(&Value, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE,
-                              hDevice->get()) == CUDA_SUCCESS);
--    sycl::detail::ur::assertion(Value >= 0);
-+    detail::ur::assertion(Value >= 0);
-     // Convert kilohertz to megahertz when returning.
-     return ReturnValue(Value / 1000);
-   }
-   case UR_DEVICE_INFO_MEMORY_BUS_WIDTH: {
-     int Value = 0;
--    sycl::detail::ur::assertion(
-+    detail::ur::assertion(
-         cuDeviceGetAttribute(&Value,
-                              CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH,
-                              hDevice->get()) == CUDA_SUCCESS);
--    sycl::detail::ur::assertion(Value >= 0);
-+    detail::ur::assertion(Value >= 0);
-     return ReturnValue(Value);
-   }
-   case UR_DEVICE_INFO_MAX_COMPUTE_QUEUE_INDICES: {
-@@ -875,20 +874,20 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
-   }
-   case UR_DEVICE_INFO_DEVICE_ID: {
-     int Value = 0;
--    sycl::detail::ur::assertion(
-+    detail::ur::assertion(
-         cuDeviceGetAttribute(&Value, CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID,
-                              hDevice->get()) == CUDA_SUCCESS);
--    sycl::detail::ur::assertion(Value >= 0);
-+    detail::ur::assertion(Value >= 0);
-     return ReturnValue(Value);
-   }
-   case UR_DEVICE_INFO_UUID: {
-     CUuuid UUID;
- #if (CUDA_VERSION >= 11040)
--    sycl::detail::ur::assertion(cuDeviceGetUuid_v2(&UUID, hDevice->get()) ==
--                                CUDA_SUCCESS);
-+    detail::ur::assertion(cuDeviceGetUuid_v2(&UUID, hDevice->get()) ==
-+                          CUDA_SUCCESS);
- #else
--    sycl::detail::ur::assertion(cuDeviceGetUuid(&UUID, hDevice->get()) ==
--                                CUDA_SUCCESS);
-+    detail::ur::assertion(cuDeviceGetUuid(&UUID, hDevice->get()) ==
-+                          CUDA_SUCCESS);
- #endif
-     std::array<unsigned char, 16> Name;
-     std::copy(UUID.bytes, UUID.bytes + 16, Name.begin());
-@@ -896,13 +895,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
-   }
-   case UR_DEVICE_INFO_MAX_MEMORY_BANDWIDTH: {
-     int Major = 0;
--    sycl::detail::ur::assertion(
-+    detail::ur::assertion(
-         cuDeviceGetAttribute(&Major,
-                              CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
-                              hDevice->get()) == CUDA_SUCCESS);
- 
-     int Minor = 0;
--    sycl::detail::ur::assertion(
-+    detail::ur::assertion(
-         cuDeviceGetAttribute(&Minor,
-                              CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR,
-                              hDevice->get()) == CUDA_SUCCESS);
-@@ -918,7 +917,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
-     } else if (IsOrinAGX) {
-       MemoryClockKHz = 3200000;
-     } else {
--      sycl::detail::ur::assertion(
-+      detail::ur::assertion(
-           cuDeviceGetAttribute(&MemoryClockKHz,
-                                CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE,
-                                hDevice->get()) == CUDA_SUCCESS);
-@@ -928,7 +927,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
-     if (IsOrinAGX) {
-       MemoryBusWidth = 256;
-     } else {
--      sycl::detail::ur::assertion(
-+      detail::ur::assertion(
-           cuDeviceGetAttribute(&MemoryBusWidth,
-                                CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH,
-                                hDevice->get()) == CUDA_SUCCESS);
-@@ -973,7 +972,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
-         &MaxRegisters, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK,
-         hDevice->get()));
- 
--    sycl::detail::ur::assertion(MaxRegisters >= 0);
-+    detail::ur::assertion(MaxRegisters >= 0);
- 
-     return ReturnValue(static_cast<uint32_t>(MaxRegisters));
-   }
-@@ -984,12 +983,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
-   case UR_DEVICE_INFO_PCI_ADDRESS: {
-     constexpr size_t AddressBufferSize = 13;
-     char AddressBuffer[AddressBufferSize];
--    sycl::detail::ur::assertion(
--        cuDeviceGetPCIBusId(AddressBuffer, AddressBufferSize, hDevice->get()) ==
--        CUDA_SUCCESS);
-+    detail::ur::assertion(cuDeviceGetPCIBusId(AddressBuffer, AddressBufferSize,
-+                                              hDevice->get()) == CUDA_SUCCESS);
-     // CUDA API (8.x - 12.1) guarantees 12 bytes + \0 are written
--    sycl::detail::ur::assertion(strnlen(AddressBuffer, AddressBufferSize) ==
--                                12);
-+    detail::ur::assertion(strnlen(AddressBuffer, AddressBufferSize) == 12);
-     return ReturnValue(AddressBuffer,
-                        strnlen(AddressBuffer, AddressBufferSize - 1) + 1);
-   }
-diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp
-index 1cfc5cc40a4a..792f69092682 100644
---- a/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp
-+++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp
-@@ -794,7 +794,7 @@ static size_t imageElementByteSize(CUDA_ARRAY_DESCRIPTOR ArrayDesc) {
-   case CU_AD_FORMAT_FLOAT:
-     return 4;
-   default:
--    sycl::detail::ur::die("Invalid image format.");
-+    detail::ur::die("Invalid image format.");
-     return 0;
-   }
- }
-diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/event.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/event.cpp
-index 8916197b73f1..066c0498f1d0 100644
---- a/sycl/plugins/unified_runtime/ur/adapters/cuda/event.cpp
-+++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/event.cpp
-@@ -119,7 +119,7 @@ ur_result_t ur_event_handle_t_::record() {
-   try {
-     EventID = Queue->getNextEventID();
-     if (EventID == 0) {
--      sycl::detail::ur::die(
-+      detail::ur::die(
-           "Unrecoverable program state reached in event identifier overflow");
-     }
-     Result = UR_CHECK_ERROR(cuEventRecord(EvEnd, Stream));
-@@ -182,7 +182,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventGetInfo(ur_event_handle_t hEvent,
-   case UR_EVENT_INFO_CONTEXT:
-     return ReturnValue(hEvent->getContext());
-   default:
--    sycl::detail::ur::die("Event info request not implemented");
-+    detail::ur::die("Event info request not implemented");
-   }
- 
-   return UR_RESULT_ERROR_INVALID_ENUMERATION;
-@@ -213,7 +213,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventGetProfilingInfo(
-   default:
-     break;
-   }
--  sycl::detail::ur::die("Event Profiling info request not implemented");
-+  detail::ur::die("Event Profiling info request not implemented");
-   return {};
- }
- 
-@@ -221,7 +221,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventSetCallback(ur_event_handle_t,
-                                                        ur_execution_info_t,
-                                                        ur_event_callback_t,
-                                                        void *) {
--  sycl::detail::ur::die("Event Callback not implemented in CUDA adapter");
-+  detail::ur::die("Event Callback not implemented in CUDA adapter");
-   return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
- }
- 
-@@ -254,8 +254,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventRetain(ur_event_handle_t hEvent) {
- 
-   const auto RefCount = hEvent->incrementReferenceCount();
- 
--  sycl::detail::ur::assertion(
--      RefCount != 0, "Reference count overflow detected in urEventRetain.");
-+  detail::ur::assertion(RefCount != 0,
-+                        "Reference count overflow detected in urEventRetain.");
- 
-   return UR_RESULT_SUCCESS;
- }
-@@ -265,9 +265,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventRelease(ur_event_handle_t hEvent) {
- 
-   // double delete or someone is messing with the ref count.
-   // either way, cannot safely proceed.
--  sycl::detail::ur::assertion(
--      hEvent->getReferenceCount() != 0,
--      "Reference count overflow detected in urEventRelease.");
-+  detail::ur::assertion(hEvent->getReferenceCount() != 0,
-+                        "Reference count overflow detected in urEventRelease.");
- 
-   // decrement ref count. If it is 0, delete the event.
-   if (hEvent->decrementReferenceCount() == 0) {
-diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.cpp
-index 358f59c499e1..7d46ce039bab 100644
---- a/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.cpp
-+++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.cpp
-@@ -73,24 +73,24 @@ urKernelGetGroupInfo(ur_kernel_handle_t hKernel, ur_device_handle_t hDevice,
-     size_t GlobalWorkSize[3] = {0, 0, 0};
- 
-     int MaxBlockDimX{0}, MaxBlockDimY{0}, MaxBlockDimZ{0};
--    sycl::detail::ur::assertion(
-+    detail::ur::assertion(
-         cuDeviceGetAttribute(&MaxBlockDimX, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X,
-                              hDevice->get()) == CUDA_SUCCESS);
--    sycl::detail::ur::assertion(
-+    detail::ur::assertion(
-         cuDeviceGetAttribute(&MaxBlockDimY, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y,
-                              hDevice->get()) == CUDA_SUCCESS);
--    sycl::detail::ur::assertion(
-+    detail::ur::assertion(
-         cuDeviceGetAttribute(&MaxBlockDimZ, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z,
-                              hDevice->get()) == CUDA_SUCCESS);
- 
-     int MaxGridDimX{0}, MaxGridDimY{0}, MaxGridDimZ{0};
--    sycl::detail::ur::assertion(
-+    detail::ur::assertion(
-         cuDeviceGetAttribute(&MaxGridDimX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X,
-                              hDevice->get()) == CUDA_SUCCESS);
--    sycl::detail::ur::assertion(
-+    detail::ur::assertion(
-         cuDeviceGetAttribute(&MaxGridDimY, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y,
-                              hDevice->get()) == CUDA_SUCCESS);
--    sycl::detail::ur::assertion(
-+    detail::ur::assertion(
-         cuDeviceGetAttribute(&MaxGridDimZ, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z,
-                              hDevice->get()) == CUDA_SUCCESS);
- 
-@@ -101,7 +101,7 @@ urKernelGetGroupInfo(ur_kernel_handle_t hKernel, ur_device_handle_t hDevice,
-   }
-   case UR_KERNEL_GROUP_INFO_WORK_GROUP_SIZE: {
-     int MaxThreads = 0;
--    sycl::detail::ur::assertion(
-+    detail::ur::assertion(
-         cuFuncGetAttribute(&MaxThreads, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
-                            hKernel->get()) == CUDA_SUCCESS);
-     return ReturnValue(size_t(MaxThreads));
-@@ -122,7 +122,7 @@ urKernelGetGroupInfo(ur_kernel_handle_t hKernel, ur_device_handle_t hDevice,
-   case UR_KERNEL_GROUP_INFO_LOCAL_MEM_SIZE: {
-     // OpenCL LOCAL == CUDA SHARED
-     int Bytes = 0;
--    sycl::detail::ur::assertion(
-+    detail::ur::assertion(
-         cuFuncGetAttribute(&Bytes, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES,
-                            hKernel->get()) == CUDA_SUCCESS);
-     return ReturnValue(uint64_t(Bytes));
-@@ -130,17 +130,17 @@ urKernelGetGroupInfo(ur_kernel_handle_t hKernel, ur_device_handle_t hDevice,
-   case UR_KERNEL_GROUP_INFO_PREFERRED_WORK_GROUP_SIZE_MULTIPLE: {
-     // Work groups should be multiples of the warp size
-     int WarpSize = 0;
--    sycl::detail::ur::assertion(
--        cuDeviceGetAttribute(&WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE,
--                             hDevice->get()) == CUDA_SUCCESS);
-+    detail::ur::assertion(cuDeviceGetAttribute(&WarpSize,
-+                                               CU_DEVICE_ATTRIBUTE_WARP_SIZE,
-+                                               hDevice->get()) == CUDA_SUCCESS);
-     return ReturnValue(static_cast<size_t>(WarpSize));
-   }
-   case UR_KERNEL_GROUP_INFO_PRIVATE_MEM_SIZE: {
-     // OpenCL PRIVATE == CUDA LOCAL
-     int Bytes = 0;
--    sycl::detail::ur::assertion(
--        cuFuncGetAttribute(&Bytes, CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES,
--                           hKernel->get()) == CUDA_SUCCESS);
-+    detail::ur::assertion(cuFuncGetAttribute(&Bytes,
-+                                             CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES,
-+                                             hKernel->get()) == CUDA_SUCCESS);
-     return ReturnValue(uint64_t(Bytes));
-   }
-   default:
-@@ -231,9 +231,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelGetInfo(ur_kernel_handle_t hKernel,
-     return ReturnValue("");
-   case UR_KERNEL_INFO_NUM_REGS: {
-     int NumRegs = 0;
--    sycl::detail::ur::assertion(
--        cuFuncGetAttribute(&NumRegs, CU_FUNC_ATTRIBUTE_NUM_REGS,
--                           hKernel->get()) == CUDA_SUCCESS);
-+    detail::ur::assertion(cuFuncGetAttribute(&NumRegs,
-+                                             CU_FUNC_ATTRIBUTE_NUM_REGS,
-+                                             hKernel->get()) == CUDA_SUCCESS);
-     return ReturnValue(static_cast<uint32_t>(NumRegs));
-   }
-   default:
-@@ -254,15 +254,15 @@ urKernelGetSubGroupInfo(ur_kernel_handle_t hKernel, ur_device_handle_t hDevice,
-   case UR_KERNEL_SUB_GROUP_INFO_MAX_SUB_GROUP_SIZE: {
-     // Sub-group size is equivalent to warp size
-     int WarpSize = 0;
--    sycl::detail::ur::assertion(
--        cuDeviceGetAttribute(&WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE,
--                             hDevice->get()) == CUDA_SUCCESS);
-+    detail::ur::assertion(cuDeviceGetAttribute(&WarpSize,
-+                                               CU_DEVICE_ATTRIBUTE_WARP_SIZE,
-+                                               hDevice->get()) == CUDA_SUCCESS);
-     return ReturnValue(static_cast<uint32_t>(WarpSize));
-   }
-   case UR_KERNEL_SUB_GROUP_INFO_MAX_NUM_SUB_GROUPS: {
-     // Number of sub-groups = max block size / warp size + possible remainder
-     int MaxThreads = 0;
--    sycl::detail::ur::assertion(
-+    detail::ur::assertion(
-         cuFuncGetAttribute(&MaxThreads, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
-                            hKernel->get()) == CUDA_SUCCESS);
-     int WarpSize = 0;
-diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/memory.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/memory.cpp
-index b19acea3159f..f0c276579476 100644
---- a/sycl/plugins/unified_runtime/ur/adapters/cuda/memory.cpp
-+++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/memory.cpp
-@@ -162,8 +162,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemRelease(ur_mem_handle_t hMem) {
-     // error for which it is unclear if the function that reported it succeeded
-     // or not. Either way, the state of the program is compromised and likely
-     // unrecoverable.
--    sycl::detail::ur::die(
--        "Unrecoverable program state reached in urMemRelease");
-+    detail::ur::die("Unrecoverable program state reached in urMemRelease");
-   }
- 
-   return UR_RESULT_SUCCESS;
-@@ -331,7 +330,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemImageCreate(
-     PixelTypeSizeBytes = 4;
-     break;
-   default:
--    sycl::detail::ur::die(
-+    detail::ur::die(
-         "urMemImageCreate given unsupported image_channel_data_type");
-   }
- 
-diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.cpp
-index 05443eeed89d..32391fec5c13 100644
---- a/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.cpp
-+++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.cpp
-@@ -265,7 +265,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueCreateWithNativeHandle(
-   else if (CuFlags == CU_STREAM_NON_BLOCKING)
-     Flags = UR_QUEUE_FLAG_SYNC_WITH_DEFAULT_STREAM;
-   else
--    sycl::detail::ur::die("Unknown cuda stream");
-+    detail::ur::die("Unknown cuda stream");
- 
-   std::vector<CUstream> ComputeCuStreams(1, CuStream);
-   std::vector<CUstream> TransferCuStreams(0);
-diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/sampler.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/sampler.cpp
-index 36ec89fb9da3..836e47f988e5 100644
---- a/sycl/plugins/unified_runtime/ur/adapters/cuda/sampler.cpp
-+++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/sampler.cpp
-@@ -73,7 +73,7 @@ urSamplerRelease(ur_sampler_handle_t hSampler) {
- 
-   // double delete or someone is messing with the ref count.
-   // either way, cannot safely proceed.
--  sycl::detail::ur::assertion(
-+  detail::ur::assertion(
-       hSampler->getReferenceCount() != 0,
-       "Reference count overflow detected in urSamplerRelease.");
- 
--- 
-2.41.0
-
diff --git a/source/adapters/CMakeLists.txt b/source/adapters/CMakeLists.txt
index 9a817f4abe..543f526032 100644
--- a/source/adapters/CMakeLists.txt
+++ b/source/adapters/CMakeLists.txt
@@ -6,18 +6,16 @@
 add_subdirectory(null)
 
 
-if(UR_BUILD_ADAPTER_L0 OR UR_BUILD_ADAPTER_CUDA)
+if(UR_BUILD_ADAPTER_L0 OR UR_BUILD_ADAPTER_CUDA OR UR_BUILD_ADAPTER_HIP)
     # fetch adapter sources from SYCL
     set(SYCL_ADAPTER_DIR "${CMAKE_CURRENT_SOURCE_DIR}/external")
-    FetchSource(https://github.com/intel/llvm.git sycl-nightly/20230706 "sycl/plugins/unified_runtime/ur" ${SYCL_ADAPTER_DIR})
+    FetchSource(https://github.com/intel/llvm.git sycl-nightly/20230713 "sycl/plugins/unified_runtime/ur" ${SYCL_ADAPTER_DIR})
 
     get_program_version_major_minor(git GIT_VERSION)
     set(GIT_QUIET_OPTION "")
     if(GIT_VERSION VERSION_GREATER_EQUAL "3.35.0")
         set(GIT_QUIET_OPTION "--quiet")
     endif()
-    execute_process(COMMAND git apply ${GIT_QUIET_OPTION} ../0001-SYCL-CUDA-remove-sycl-namespaces-from-ur-adapter.patch
-        WORKING_DIRECTORY ${SYCL_ADAPTER_DIR})
 endif()
 
 if(UR_BUILD_ADAPTER_L0)
@@ -27,3 +25,7 @@ endif()
 if(UR_BUILD_ADAPTER_CUDA)
 add_subdirectory(cuda)
 endif()
+
+if(UR_BUILD_ADAPTER_HIP)
+add_subdirectory(hip)
+endif()
diff --git a/source/adapters/hip/CMakeLists.txt b/source/adapters/hip/CMakeLists.txt
new file mode 100644
index 0000000000..7b7f10d227
--- /dev/null
+++ b/source/adapters/hip/CMakeLists.txt
@@ -0,0 +1,144 @@
+# Copyright (C) 2022 Intel Corporation
+# Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions.
+# See LICENSE.TXT
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+set(HIP_DIR "${SYCL_ADAPTER_DIR}/sycl/plugins/unified_runtime/ur/adapters/hip")
+
+set(TARGET_NAME ur_adapter_hip)
+
+# Set default UR HIP platform to AMD
+set(UR_HIP_PLATFORM "AMD" CACHE STRING "UR HIP platform, AMD or NVIDIA")
+
+# Set default ROCm installation directory
+set(UR_HIP_ROCM_DIR "/opt/rocm" CACHE STRING "ROCm installation dir")
+
+set(UR_HIP_INCLUDE_DIR "${UR_HIP_ROCM_DIR}/include")
+
+set(UR_HIP_HSA_INCLUDE_DIR "${UR_HIP_ROCM_DIR}/hsa/include")
+
+# Set HIP lib dir
+set(UR_HIP_LIB_DIR "${UR_HIP_ROCM_DIR}/hip/lib")
+
+# Check if HIP library path exists (AMD platform only)
+if("${UR_HIP_PLATFORM}" STREQUAL "AMD")
+  if(NOT EXISTS "${UR_HIP_LIB_DIR}")
+    message(FATAL_ERROR "Couldn't find the HIP library directory at '${UR_HIP_LIB_DIR}',"
+                        " please check ROCm installation.")
+  endif()
+  # Check if HIP include path exists
+  if(NOT EXISTS "${UR_HIP_INCLUDE_DIR}")
+  message(FATAL_ERROR "Couldn't find the HIP include directory at '${UR_HIP_INCLUDE_DIR}',"
+                      " please check ROCm installation.")
+  endif()
+
+  # Check if HSA include path exists
+  if(NOT EXISTS "${UR_HIP_HSA_INCLUDE_DIR}")
+  message(FATAL_ERROR "Couldn't find the HSA include directory at '${UR_HIP_HSA_INCLUDE_DIR}',"
+                      " please check ROCm installation.")
+  endif()
+endif()
+
+# Set includes used in added library (rocmdrv)
+set(HIP_HEADERS "${UR_HIP_INCLUDE_DIR};${UR_HIP_HSA_INCLUDE_DIR}")
+
+add_library(${TARGET_NAME}
+    SHARED
+    ${HIP_DIR}/ur_interface_loader.cpp
+    ${HIP_DIR}/common.hpp
+    ${HIP_DIR}/common.cpp
+    ${HIP_DIR}/context.hpp
+    ${HIP_DIR}/context.cpp
+    ${HIP_DIR}/device.hpp
+    ${HIP_DIR}/device.cpp
+    ${HIP_DIR}/enqueue.cpp
+    ${HIP_DIR}/event.hpp
+    ${HIP_DIR}/event.cpp
+    ${HIP_DIR}/kernel.hpp
+    ${HIP_DIR}/kernel.cpp
+    ${HIP_DIR}/memory.hpp
+    ${HIP_DIR}/memory.cpp
+    ${HIP_DIR}/platform.hpp
+    ${HIP_DIR}/platform.cpp
+    ${HIP_DIR}/program.hpp
+    ${HIP_DIR}/program.cpp
+    ${HIP_DIR}/queue.hpp
+    ${HIP_DIR}/queue.cpp
+    ${HIP_DIR}/sampler.hpp
+    ${HIP_DIR}/sampler.cpp
+    ${HIP_DIR}/usm.cpp
+    ${HIP_DIR}/../../ur.cpp
+    ${HIP_DIR}/../../ur.hpp
+    ${HIP_DIR}/../../usm_allocator.cpp
+    ${HIP_DIR}/../../usm_allocator.hpp
+    ${HIP_DIR}/../../usm_allocator_config.cpp
+    ${HIP_DIR}/../../usm_allocator_config.hpp
+)
+
+set_target_properties(${TARGET_NAME} PROPERTIES
+    VERSION "${PROJECT_VERSION_MAJOR}.${PROJECT_VERSION_MINOR}.${PROJECT_VERSION_PATCH}"
+    SOVERSION "${PROJECT_VERSION_MAJOR}"
+)
+
+if("${UR_HIP_PLATFORM}" STREQUAL "AMD")
+  # Import HIP runtime library
+  add_library(rocmdrv SHARED IMPORTED GLOBAL)
+
+  set_target_properties(
+    rocmdrv PROPERTIES
+      IMPORTED_LOCATION                    "${UR_HIP_LIB_DIR}/libamdhip64.so"
+      INTERFACE_INCLUDE_DIRECTORIES        "${HIP_HEADERS}"
+      INTERFACE_SYSTEM_INCLUDE_DIRECTORIES "${HIP_HEADERS}"
+  )
+
+  target_link_libraries(${TARGET_NAME} PRIVATE
+    ${PROJECT_NAME}::headers
+    ${PROJECT_NAME}::common
+    rocmdrv
+  )
+
+  # Set HIP define to select AMD platform
+  target_compile_definitions(${TARGET_NAME} PRIVATE __HIP_PLATFORM_AMD__)
+elseif("${UR_HIP_PLATFORM}" STREQUAL "NVIDIA")
+  # Import CUDA libraries
+  find_package(CUDA REQUIRED)
+  find_package(Threads REQUIRED)
+
+  list(APPEND HIP_HEADERS ${CUDA_INCLUDE_DIRS})
+
+  # cudadrv may be defined by the CUDA plugin
+  if(NOT TARGET cudadrv)
+    add_library(cudadrv SHARED IMPORTED GLOBAL)
+    set_target_properties(
+      cudadrv PROPERTIES
+        IMPORTED_LOCATION                    ${CUDA_CUDA_LIBRARY}
+        INTERFACE_INCLUDE_DIRECTORIES        "${HIP_HEADERS}"
+        INTERFACE_SYSTEM_INCLUDE_DIRECTORIES "${HIP_HEADERS}"
+    )
+  endif()
+
+  add_library(cudart SHARED IMPORTED GLOBAL)
+  set_target_properties(
+    cudart PROPERTIES
+      IMPORTED_LOCATION                    ${CUDA_CUDART_LIBRARY}
+      INTERFACE_INCLUDE_DIRECTORIES        "${HIP_HEADERS}"
+      INTERFACE_SYSTEM_INCLUDE_DIRECTORIES "${HIP_HEADERS}"
+  )
+
+  target_link_libraries(${TARGET_NAME} PRIVATE
+    ${PROJECT_NAME}::headers
+    ${PROJECT_NAME}::common
+    Threads::Threads
+    cudadrv
+    cudart
+  )
+
+  # Set HIP define to select NVIDIA platform
+  target_compile_definitions(${TARGET_NAME} PRIVATE __HIP_PLATFORM_NVIDIA__)
+else()
+  message(FATAL_ERROR "Unspecified UR HIP platform please set UR_HIP_PLATFORM to 'AMD' or 'NVIDIA'")
+endif()
+
+target_include_directories(${TARGET_NAME} PRIVATE
+    ${HIP_DIR}/../../../
+)