From 8a8c64975eac7ac845967206b8b7cfc68cf6e00a Mon Sep 17 00:00:00 2001 From: Omar Ahmed Date: Fri, 14 Jul 2023 14:45:53 +0100 Subject: [PATCH] [UR][HIP] Add ur hip target build to ur repo --- CMakeLists.txt | 1 + README.md | 2 + ...move-sycl-namespaces-from-ur-adapter.patch | 1048 ----------------- source/adapters/CMakeLists.txt | 10 +- source/adapters/hip/CMakeLists.txt | 144 +++ 5 files changed, 153 insertions(+), 1052 deletions(-) delete mode 100644 source/adapters/0001-SYCL-CUDA-remove-sycl-namespaces-from-ur-adapter.patch create mode 100644 source/adapters/hip/CMakeLists.txt diff --git a/CMakeLists.txt b/CMakeLists.txt index bedb617d8c..42f42642a8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -32,6 +32,7 @@ option(VAL_USE_LIBBACKTRACE_BACKTRACE "enable libbacktrace validation backtrace option(UR_BUILD_TOOLS "build ur tools" ON) option(UR_BUILD_ADAPTER_L0 "build level 0 adapter from SYCL" OFF) option(UR_BUILD_ADAPTER_CUDA "build cuda adapter from SYCL" OFF) +option(UR_BUILD_ADAPTER_HIP "build hip adapter from SYCL" OFF) set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) diff --git a/README.md b/README.md index e9ccf4a3a3..a9a1e0cabb 100644 --- a/README.md +++ b/README.md @@ -112,6 +112,8 @@ List of options provided by CMake: | UR_BUILD_TOOLS | Build tools | ON/OFF | ON | | UR_BUILD_ADAPTER_L0 | Fetch and use level-zero adapter from SYCL | ON/OFF | OFF | | UR_BUILD_ADAPTER_CUDA | Fetch and use cuda adapter from SYCL | ON/OFF | OFF | +| UR_BUILD_ADAPTER_HIP | Fetch and use hip adapter from SYCL | ON/OFF | OFF | +| UR_HIP_PLATFORM | Build hip adapter for AMD or NVIDIA platform | AMD/NVIDIA | AMD | **General**: diff --git a/source/adapters/0001-SYCL-CUDA-remove-sycl-namespaces-from-ur-adapter.patch b/source/adapters/0001-SYCL-CUDA-remove-sycl-namespaces-from-ur-adapter.patch deleted file mode 100644 index 8153e1cb85..0000000000 --- a/source/adapters/0001-SYCL-CUDA-remove-sycl-namespaces-from-ur-adapter.patch +++ /dev/null @@ -1,1048 +0,0 @@ -From fd78871a6bd2ff41ff37b8bd786c17f59911c677 Mon Sep 17 00:00:00 2001 -From: pbalcer -Date: Thu, 29 Jun 2023 15:11:43 +0200 -Subject: [PATCH] [SYCL][CUDA] remove sycl dependencies from cuda ur adapter - -This was preventing out-of-tree build of the adapter for standalone -use with unified runtime. - -Signed-off-by: Piotr Balcer ---- - .../ur/adapters/cuda/command_buffer.cpp | 52 ++--- - .../ur/adapters/cuda/common.cpp | 6 +- - .../ur/adapters/cuda/common.hpp | 5 - - .../ur/adapters/cuda/context.cpp | 2 +- - .../ur/adapters/cuda/device.cpp | 209 +++++++++--------- - .../ur/adapters/cuda/enqueue.cpp | 2 +- - .../ur/adapters/cuda/event.cpp | 17 +- - .../ur/adapters/cuda/kernel.cpp | 42 ++-- - .../ur/adapters/cuda/memory.cpp | 5 +- - .../ur/adapters/cuda/queue.cpp | 2 +- - .../ur/adapters/cuda/sampler.cpp | 2 +- - 11 files changed, 167 insertions(+), 177 deletions(-) - -diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/command_buffer.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/command_buffer.cpp -index c83e9e732303..57956cb64a67 100644 ---- a/sycl/plugins/unified_runtime/ur/adapters/cuda/command_buffer.cpp -+++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/command_buffer.cpp -@@ -19,8 +19,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferCreateExp( - (void)hDevice; - (void)pCommandBufferDesc; - (void)phCommandBuffer; -- sycl::detail::ur::die("Experimental Command-buffer feature is not " -- "implemented for CUDA adapter."); -+ detail::ur::die("Experimental Command-buffer feature is not " -+ "implemented for CUDA adapter."); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; - } - -@@ -28,8 +28,8 @@ UR_APIEXPORT ur_result_t UR_APICALL - urCommandBufferRetainExp(ur_exp_command_buffer_handle_t hCommandBuffer) { - (void)hCommandBuffer; - -- sycl::detail::ur::die("Experimental Command-buffer feature is not " -- "implemented for CUDA adapter."); -+ detail::ur::die("Experimental Command-buffer feature is not " -+ "implemented for CUDA adapter."); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; - } - -@@ -37,8 +37,8 @@ UR_APIEXPORT ur_result_t UR_APICALL - urCommandBufferReleaseExp(ur_exp_command_buffer_handle_t hCommandBuffer) { - (void)hCommandBuffer; - -- sycl::detail::ur::die("Experimental Command-buffer feature is not " -- "implemented for CUDA adapter."); -+ detail::ur::die("Experimental Command-buffer feature is not " -+ "implemented for CUDA adapter."); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; - } - -@@ -46,8 +46,8 @@ UR_APIEXPORT ur_result_t UR_APICALL - urCommandBufferFinalizeExp(ur_exp_command_buffer_handle_t hCommandBuffer) { - (void)hCommandBuffer; - -- sycl::detail::ur::die("Experimental Command-buffer feature is not " -- "implemented for CUDA adapter."); -+ detail::ur::die("Experimental Command-buffer feature is not " -+ "implemented for CUDA adapter."); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; - } - -@@ -68,8 +68,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( - (void)pSyncPointWaitList; - (void)pSyncPoint; - -- sycl::detail::ur::die("Experimental Command-buffer feature is not " -- "implemented for CUDA adapter."); -+ detail::ur::die("Experimental Command-buffer feature is not " -+ "implemented for CUDA adapter."); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; - } - -@@ -86,8 +86,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemcpyUSMExp( - (void)pSyncPointWaitList; - (void)pSyncPoint; - -- sycl::detail::ur::die("Experimental Command-buffer feature is not " -- "implemented for CUDA adapter."); -+ detail::ur::die("Experimental Command-buffer feature is not " -+ "implemented for CUDA adapter."); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; - } - -@@ -107,8 +107,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMembufferCopyExp( - (void)pSyncPointWaitList; - (void)pSyncPoint; - -- sycl::detail::ur::die("Experimental Command-buffer feature is not " -- "implemented for CUDA adapter."); -+ detail::ur::die("Experimental Command-buffer feature is not " -+ "implemented for CUDA adapter."); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; - } - -@@ -134,8 +134,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMembufferCopyRectExp( - (void)pSyncPointWaitList; - (void)pSyncPoint; - -- sycl::detail::ur::die("Experimental Command-buffer feature is not " -- "implemented for CUDA adapter."); -+ detail::ur::die("Experimental Command-buffer feature is not " -+ "implemented for CUDA adapter."); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; - } - -@@ -155,8 +155,8 @@ ur_result_t UR_APICALL urCommandBufferAppendMembufferWriteExp( - (void)pSyncPointWaitList; - (void)pSyncPoint; - -- sycl::detail::ur::die("Experimental Command-buffer feature is not " -- "implemented for CUDA adapter."); -+ detail::ur::die("Experimental Command-buffer feature is not " -+ "implemented for CUDA adapter."); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; - } - -@@ -175,8 +175,8 @@ ur_result_t UR_APICALL urCommandBufferAppendMembufferReadExp( - (void)pSyncPointWaitList; - (void)pSyncPoint; - -- sycl::detail::ur::die("Experimental Command-buffer feature is not " -- "implemented for CUDA adapter."); -+ detail::ur::die("Experimental Command-buffer feature is not " -+ "implemented for CUDA adapter."); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; - } - -@@ -203,8 +203,8 @@ ur_result_t UR_APICALL urCommandBufferAppendMembufferWriteRectExp( - (void)pSyncPointWaitList; - (void)pSyncPoint; - -- sycl::detail::ur::die("Experimental Command-buffer feature is not " -- "implemented for CUDA adapter."); -+ detail::ur::die("Experimental Command-buffer feature is not " -+ "implemented for CUDA adapter."); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; - } - -@@ -232,8 +232,8 @@ ur_result_t UR_APICALL urCommandBufferAppendMembufferReadRectExp( - (void)pSyncPointWaitList; - (void)pSyncPoint; - -- sycl::detail::ur::die("Experimental Command-buffer feature is not " -- "implemented for CUDA adapter."); -+ detail::ur::die("Experimental Command-buffer feature is not " -+ "implemented for CUDA adapter."); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; - } - -@@ -247,7 +247,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp( - (void)phEventWaitList; - (void)phEvent; - -- sycl::detail::ur::die("Experimental Command-buffer feature is not " -- "implemented for CUDA adapter."); -+ detail::ur::die("Experimental Command-buffer feature is not " -+ "implemented for CUDA adapter."); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; - } -diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/common.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/common.cpp -index 86975e509725..83264160e700 100644 ---- a/sycl/plugins/unified_runtime/ur/adapters/cuda/common.cpp -+++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/common.cpp -@@ -72,17 +72,17 @@ std::string getCudaVersionString() { - return stream.str(); - } - --void sycl::detail::ur::die(const char *Message) { -+void detail::ur::die(const char *Message) { - std::cerr << "ur_die: " << Message << std::endl; - std::terminate(); - } - --void sycl::detail::ur::assertion(bool Condition, const char *Message) { -+void detail::ur::assertion(bool Condition, const char *Message) { - if (!Condition) - die(Message); - } - --void sycl::detail::ur::cuPrint(const char *Message) { -+void detail::ur::cuPrint(const char *Message) { - std::cerr << "ur_print: " << Message << std::endl; - } - -diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/common.hpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/common.hpp -index 5cfa609018b2..82b38c10d449 100644 ---- a/sycl/plugins/unified_runtime/ur/adapters/cuda/common.hpp -+++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/common.hpp -@@ -8,7 +8,6 @@ - #pragma once - - #include --#include - #include - - ur_result_t mapErrorUR(CUresult Result); -@@ -37,8 +36,6 @@ extern thread_local char ErrorMessage[MaxMessageSize]; - ur_result_t ErrorCode); - - /// ------ Error handling, matching OpenCL plugin semantics. --namespace sycl { --__SYCL_INLINE_VER_NAMESPACE(_V1) { - namespace detail { - namespace ur { - -@@ -55,5 +52,3 @@ void assertion(bool Condition, const char *Message = nullptr); - - } // namespace ur - } // namespace detail --} // __SYCL_INLINE_VER_NAMESPACE(_V1) --} // namespace sycl -diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/context.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/context.cpp -index 74a32bdac274..2b621383da09 100644 ---- a/sycl/plugins/unified_runtime/ur/adapters/cuda/context.cpp -+++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/context.cpp -@@ -66,7 +66,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urContextGetInfo( - } - case UR_CONTEXT_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES: { - int Major = 0; -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&Major, - CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, - hContext->getDevice()->get()) == CUDA_SUCCESS); -diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp -index 52d4e3badc8f..a81599d629a7 100644 ---- a/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp -+++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp -@@ -15,7 +15,7 @@ - - int getAttribute(ur_device_handle_t device, CUdevice_attribute attribute) { - int value; -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&value, attribute, device->get()) == CUDA_SUCCESS); - return value; - } -@@ -53,11 +53,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, - } - case UR_DEVICE_INFO_MAX_COMPUTE_UNITS: { - int ComputeUnits = 0; -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&ComputeUnits, - CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, - hDevice->get()) == CUDA_SUCCESS); -- sycl::detail::ur::assertion(ComputeUnits >= 0); -+ detail::ur::assertion(ComputeUnits >= 0); - return ReturnValue(static_cast(ComputeUnits)); - } - case UR_DEVICE_INFO_MAX_WORK_ITEM_DIMENSIONS: { -@@ -69,20 +69,20 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, - } ReturnSizes; - - int MaxX = 0, MaxY = 0, MaxZ = 0; -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&MaxX, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, - hDevice->get()) == CUDA_SUCCESS); -- sycl::detail::ur::assertion(MaxX >= 0); -+ detail::ur::assertion(MaxX >= 0); - -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&MaxY, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, - hDevice->get()) == CUDA_SUCCESS); -- sycl::detail::ur::assertion(MaxY >= 0); -+ detail::ur::assertion(MaxY >= 0); - -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&MaxZ, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, - hDevice->get()) == CUDA_SUCCESS); -- sycl::detail::ur::assertion(MaxZ >= 0); -+ detail::ur::assertion(MaxZ >= 0); - - ReturnSizes.Sizes[0] = size_t(MaxX); - ReturnSizes.Sizes[1] = size_t(MaxY); -@@ -95,20 +95,20 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, - size_t Sizes[MaxWorkItemDimensions]; - } ReturnSizes; - int MaxX = 0, MaxY = 0, MaxZ = 0; -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&MaxX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, - hDevice->get()) == CUDA_SUCCESS); -- sycl::detail::ur::assertion(MaxX >= 0); -+ detail::ur::assertion(MaxX >= 0); - -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&MaxY, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, - hDevice->get()) == CUDA_SUCCESS); -- sycl::detail::ur::assertion(MaxY >= 0); -+ detail::ur::assertion(MaxY >= 0); - -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&MaxZ, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, - hDevice->get()) == CUDA_SUCCESS); -- sycl::detail::ur::assertion(MaxZ >= 0); -+ detail::ur::assertion(MaxZ >= 0); - - ReturnSizes.Sizes[0] = size_t(MaxX); - ReturnSizes.Sizes[1] = size_t(MaxY); -@@ -118,12 +118,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, - - case UR_DEVICE_INFO_MAX_WORK_GROUP_SIZE: { - int MaxWorkGroupSize = 0; -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&MaxWorkGroupSize, - CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, - hDevice->get()) == CUDA_SUCCESS); - -- sycl::detail::ur::assertion(MaxWorkGroupSize >= 0); -+ detail::ur::assertion(MaxWorkGroupSize >= 0); - - return ReturnValue(size_t(MaxWorkGroupSize)); - } -@@ -172,14 +172,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, - case UR_DEVICE_INFO_MAX_NUM_SUB_GROUPS: { - // Number of sub-groups = max block size / warp size + possible remainder - int MaxThreads = 0; -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&MaxThreads, - CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, - hDevice->get()) == CUDA_SUCCESS); - int WarpSize = 0; -- sycl::detail::ur::assertion( -- cuDeviceGetAttribute(&WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, -- hDevice->get()) == CUDA_SUCCESS); -+ detail::ur::assertion(cuDeviceGetAttribute(&WarpSize, -+ CU_DEVICE_ATTRIBUTE_WARP_SIZE, -+ hDevice->get()) == CUDA_SUCCESS); - int MaxWarps = (MaxThreads + WarpSize - 1) / WarpSize; - return ReturnValue(MaxWarps); - } -@@ -187,7 +187,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, - // Volta provides independent thread scheduling - // TODO: Revisit for previous generation GPUs - int Major = 0; -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&Major, - CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, - hDevice->get()) == CUDA_SUCCESS); -@@ -197,7 +197,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, - - case UR_DEVICE_INFO_ATOMIC_64: { - int Major = 0; -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&Major, - CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, - hDevice->get()) == CUDA_SUCCESS); -@@ -214,7 +214,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, - } - case UR_DEVICE_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES: { - int Major = 0; -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&Major, - CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, - hDevice->get()) == CUDA_SUCCESS); -@@ -255,7 +255,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, - } - case UR_DEVICE_INFO_BFLOAT16: { - int Major = 0; -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&Major, - CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, - hDevice->get()) == CUDA_SUCCESS); -@@ -266,18 +266,18 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, - case UR_DEVICE_INFO_SUB_GROUP_SIZES_INTEL: { - // NVIDIA devices only support one sub-group size (the warp size) - int WarpSize = 0; -- sycl::detail::ur::assertion( -- cuDeviceGetAttribute(&WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, -- hDevice->get()) == CUDA_SUCCESS); -+ detail::ur::assertion(cuDeviceGetAttribute(&WarpSize, -+ CU_DEVICE_ATTRIBUTE_WARP_SIZE, -+ hDevice->get()) == CUDA_SUCCESS); - size_t Sizes[1] = {static_cast(WarpSize)}; - return ReturnValue(Sizes, 1); - } - case UR_DEVICE_INFO_MAX_CLOCK_FREQUENCY: { - int ClockFreq = 0; -- sycl::detail::ur::assertion( -- cuDeviceGetAttribute(&ClockFreq, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, -- hDevice->get()) == CUDA_SUCCESS); -- sycl::detail::ur::assertion(ClockFreq >= 0); -+ detail::ur::assertion(cuDeviceGetAttribute(&ClockFreq, -+ CU_DEVICE_ATTRIBUTE_CLOCK_RATE, -+ hDevice->get()) == CUDA_SUCCESS); -+ detail::ur::assertion(ClockFreq >= 0); - return ReturnValue(static_cast(ClockFreq) / 1000u); - } - case UR_DEVICE_INFO_ADDRESS_BITS: { -@@ -292,8 +292,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, - // CL_DEVICE_TYPE_CUSTOM. - - size_t Global = 0; -- sycl::detail::ur::assertion(cuDeviceTotalMem(&Global, hDevice->get()) == -- CUDA_SUCCESS); -+ detail::ur::assertion(cuDeviceTotalMem(&Global, hDevice->get()) == -+ CUDA_SUCCESS); - - auto QuarterGlobal = static_cast(Global / 4u); - -@@ -308,7 +308,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, - if (std::getenv("SYCL_PI_CUDA_ENABLE_IMAGE_SUPPORT") != nullptr) { - Enabled = true; - } else { -- sycl::detail::ur::cuPrint( -+ detail::ur::cuPrint( - "Images are not fully supported by the CUDA BE, their support is " - "disabled by default. Their partial support can be activated by " - "setting SYCL_PI_CUDA_ENABLE_IMAGE_SUPPORT environment variable at " -@@ -332,17 +332,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, - case UR_DEVICE_INFO_IMAGE2D_MAX_HEIGHT: { - // Take the smaller of maximum surface and maximum texture height. - int TexHeight = 0; -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&TexHeight, - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT, - hDevice->get()) == CUDA_SUCCESS); -- sycl::detail::ur::assertion(TexHeight >= 0); -+ detail::ur::assertion(TexHeight >= 0); - int SurfHeight = 0; -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&SurfHeight, - CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT, - hDevice->get()) == CUDA_SUCCESS); -- sycl::detail::ur::assertion(SurfHeight >= 0); -+ detail::ur::assertion(SurfHeight >= 0); - - int Min = std::min(TexHeight, SurfHeight); - -@@ -351,17 +351,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, - case UR_DEVICE_INFO_IMAGE2D_MAX_WIDTH: { - // Take the smaller of maximum surface and maximum texture width. - int TexWidth = 0; -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&TexWidth, - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH, - hDevice->get()) == CUDA_SUCCESS); -- sycl::detail::ur::assertion(TexWidth >= 0); -+ detail::ur::assertion(TexWidth >= 0); - int SurfWidth = 0; -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&SurfWidth, - CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH, - hDevice->get()) == CUDA_SUCCESS); -- sycl::detail::ur::assertion(SurfWidth >= 0); -+ detail::ur::assertion(SurfWidth >= 0); - - int Min = std::min(TexWidth, SurfWidth); - -@@ -370,17 +370,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, - case UR_DEVICE_INFO_IMAGE3D_MAX_HEIGHT: { - // Take the smaller of maximum surface and maximum texture height. - int TexHeight = 0; -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&TexHeight, - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT, - hDevice->get()) == CUDA_SUCCESS); -- sycl::detail::ur::assertion(TexHeight >= 0); -+ detail::ur::assertion(TexHeight >= 0); - int SurfHeight = 0; -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&SurfHeight, - CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT, - hDevice->get()) == CUDA_SUCCESS); -- sycl::detail::ur::assertion(SurfHeight >= 0); -+ detail::ur::assertion(SurfHeight >= 0); - - int Min = std::min(TexHeight, SurfHeight); - -@@ -389,17 +389,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, - case UR_DEVICE_INFO_IMAGE3D_MAX_WIDTH: { - // Take the smaller of maximum surface and maximum texture width. - int TexWidth = 0; -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&TexWidth, - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH, - hDevice->get()) == CUDA_SUCCESS); -- sycl::detail::ur::assertion(TexWidth >= 0); -+ detail::ur::assertion(TexWidth >= 0); - int SurfWidth = 0; -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&SurfWidth, - CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH, - hDevice->get()) == CUDA_SUCCESS); -- sycl::detail::ur::assertion(SurfWidth >= 0); -+ detail::ur::assertion(SurfWidth >= 0); - - int Min = std::min(TexWidth, SurfWidth); - -@@ -408,17 +408,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, - case UR_DEVICE_INFO_IMAGE3D_MAX_DEPTH: { - // Take the smaller of maximum surface and maximum texture depth. - int TexDepth = 0; -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&TexDepth, - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH, - hDevice->get()) == CUDA_SUCCESS); -- sycl::detail::ur::assertion(TexDepth >= 0); -+ detail::ur::assertion(TexDepth >= 0); - int SurfDepth = 0; -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&SurfDepth, - CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH, - hDevice->get()) == CUDA_SUCCESS); -- sycl::detail::ur::assertion(SurfDepth >= 0); -+ detail::ur::assertion(SurfDepth >= 0); - - int Min = std::min(TexDepth, SurfDepth); - -@@ -427,17 +427,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, - case UR_DEVICE_INFO_IMAGE_MAX_BUFFER_SIZE: { - // Take the smaller of maximum surface and maximum texture width. - int TexWidth = 0; -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&TexWidth, - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH, - hDevice->get()) == CUDA_SUCCESS); -- sycl::detail::ur::assertion(TexWidth >= 0); -+ detail::ur::assertion(TexWidth >= 0); - int SurfWidth = 0; -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&SurfWidth, - CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH, - hDevice->get()) == CUDA_SUCCESS); -- sycl::detail::ur::assertion(SurfWidth >= 0); -+ detail::ur::assertion(SurfWidth >= 0); - - int Min = std::min(TexWidth, SurfWidth); - -@@ -459,7 +459,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, - } - case UR_DEVICE_INFO_MEM_BASE_ADDR_ALIGN: { - int MemBaseAddrAlign = 0; -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&MemBaseAddrAlign, - CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, - hDevice->get()) == CUDA_SUCCESS); -@@ -504,27 +504,27 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, - } - case UR_DEVICE_INFO_GLOBAL_MEM_CACHE_SIZE: { - int CacheSize = 0; -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&CacheSize, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, - hDevice->get()) == CUDA_SUCCESS); -- sycl::detail::ur::assertion(CacheSize >= 0); -+ detail::ur::assertion(CacheSize >= 0); - // The L2 cache is global to the GPU. - return ReturnValue(static_cast(CacheSize)); - } - case UR_DEVICE_INFO_GLOBAL_MEM_SIZE: { - size_t Bytes = 0; - // Runtime API has easy access to this value, driver API info is scarse. -- sycl::detail::ur::assertion(cuDeviceTotalMem(&Bytes, hDevice->get()) == -- CUDA_SUCCESS); -+ detail::ur::assertion(cuDeviceTotalMem(&Bytes, hDevice->get()) == -+ CUDA_SUCCESS); - return ReturnValue(uint64_t{Bytes}); - } - case UR_DEVICE_INFO_MAX_CONSTANT_BUFFER_SIZE: { - int ConstantMemory = 0; -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&ConstantMemory, - CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, - hDevice->get()) == CUDA_SUCCESS); -- sycl::detail::ur::assertion(ConstantMemory >= 0); -+ detail::ur::assertion(ConstantMemory >= 0); - - return ReturnValue(static_cast(ConstantMemory)); - } -@@ -542,30 +542,30 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, - // CUDA has its own definition of "local memory", which maps to OpenCL's - // "private memory". - int LocalMemSize = 0; -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&LocalMemSize, - CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, - hDevice->get()) == CUDA_SUCCESS); -- sycl::detail::ur::assertion(LocalMemSize >= 0); -+ detail::ur::assertion(LocalMemSize >= 0); - return ReturnValue(static_cast(LocalMemSize)); - } - case UR_DEVICE_INFO_ERROR_CORRECTION_SUPPORT: { - int ECCEnabled = 0; -- sycl::detail::ur::assertion( -- cuDeviceGetAttribute(&ECCEnabled, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, -- hDevice->get()) == CUDA_SUCCESS); -+ detail::ur::assertion(cuDeviceGetAttribute(&ECCEnabled, -+ CU_DEVICE_ATTRIBUTE_ECC_ENABLED, -+ hDevice->get()) == CUDA_SUCCESS); - -- sycl::detail::ur::assertion((ECCEnabled == 0) | (ECCEnabled == 1)); -+ detail::ur::assertion((ECCEnabled == 0) | (ECCEnabled == 1)); - auto Result = static_cast(ECCEnabled); - return ReturnValue(Result); - } - case UR_DEVICE_INFO_HOST_UNIFIED_MEMORY: { - int IsIntegrated = 0; -- sycl::detail::ur::assertion( -- cuDeviceGetAttribute(&IsIntegrated, CU_DEVICE_ATTRIBUTE_INTEGRATED, -- hDevice->get()) == CUDA_SUCCESS); -+ detail::ur::assertion(cuDeviceGetAttribute(&IsIntegrated, -+ CU_DEVICE_ATTRIBUTE_INTEGRATED, -+ hDevice->get()) == CUDA_SUCCESS); - -- sycl::detail::ur::assertion((IsIntegrated == 0) | (IsIntegrated == 1)); -+ detail::ur::assertion((IsIntegrated == 0) | (IsIntegrated == 1)); - auto result = static_cast(IsIntegrated); - return ReturnValue(result); - } -@@ -620,9 +620,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, - case UR_DEVICE_INFO_NAME: { - static constexpr size_t MaxDeviceNameLength = 256u; - char Name[MaxDeviceNameLength]; -- sycl::detail::ur::assertion( -- cuDeviceGetName(Name, MaxDeviceNameLength, hDevice->get()) == -- CUDA_SUCCESS); -+ detail::ur::assertion(cuDeviceGetName(Name, MaxDeviceNameLength, -+ hDevice->get()) == CUDA_SUCCESS); - return ReturnValue(Name, strlen(Name) + 1); - } - case UR_DEVICE_INFO_VENDOR: { -@@ -641,13 +640,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, - case UR_DEVICE_INFO_VERSION: { - std::stringstream SS; - int Major; -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&Major, - CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, - hDevice->get()) == CUDA_SUCCESS); - SS << Major; - int Minor; -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&Minor, - CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, - hDevice->get()) == CUDA_SUCCESS); -@@ -666,11 +665,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, - int Major = 0; - int Minor = 0; - -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&Major, - CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, - hDevice->get()) == CUDA_SUCCESS); -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&Minor, - CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, - hDevice->get()) == CUDA_SUCCESS); -@@ -847,27 +846,27 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, - case UR_DEVICE_INFO_GLOBAL_MEM_FREE: { - size_t FreeMemory = 0; - size_t TotalMemory = 0; -- sycl::detail::ur::assertion(cuMemGetInfo(&FreeMemory, &TotalMemory) == -- CUDA_SUCCESS, -- "failed cuMemGetInfo() API."); -+ detail::ur::assertion(cuMemGetInfo(&FreeMemory, &TotalMemory) == -+ CUDA_SUCCESS, -+ "failed cuMemGetInfo() API."); - return ReturnValue(FreeMemory); - } - case UR_DEVICE_INFO_MEMORY_CLOCK_RATE: { - int Value = 0; -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&Value, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, - hDevice->get()) == CUDA_SUCCESS); -- sycl::detail::ur::assertion(Value >= 0); -+ detail::ur::assertion(Value >= 0); - // Convert kilohertz to megahertz when returning. - return ReturnValue(Value / 1000); - } - case UR_DEVICE_INFO_MEMORY_BUS_WIDTH: { - int Value = 0; -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&Value, - CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, - hDevice->get()) == CUDA_SUCCESS); -- sycl::detail::ur::assertion(Value >= 0); -+ detail::ur::assertion(Value >= 0); - return ReturnValue(Value); - } - case UR_DEVICE_INFO_MAX_COMPUTE_QUEUE_INDICES: { -@@ -875,20 +874,20 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, - } - case UR_DEVICE_INFO_DEVICE_ID: { - int Value = 0; -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&Value, CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID, - hDevice->get()) == CUDA_SUCCESS); -- sycl::detail::ur::assertion(Value >= 0); -+ detail::ur::assertion(Value >= 0); - return ReturnValue(Value); - } - case UR_DEVICE_INFO_UUID: { - CUuuid UUID; - #if (CUDA_VERSION >= 11040) -- sycl::detail::ur::assertion(cuDeviceGetUuid_v2(&UUID, hDevice->get()) == -- CUDA_SUCCESS); -+ detail::ur::assertion(cuDeviceGetUuid_v2(&UUID, hDevice->get()) == -+ CUDA_SUCCESS); - #else -- sycl::detail::ur::assertion(cuDeviceGetUuid(&UUID, hDevice->get()) == -- CUDA_SUCCESS); -+ detail::ur::assertion(cuDeviceGetUuid(&UUID, hDevice->get()) == -+ CUDA_SUCCESS); - #endif - std::array Name; - std::copy(UUID.bytes, UUID.bytes + 16, Name.begin()); -@@ -896,13 +895,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, - } - case UR_DEVICE_INFO_MAX_MEMORY_BANDWIDTH: { - int Major = 0; -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&Major, - CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, - hDevice->get()) == CUDA_SUCCESS); - - int Minor = 0; -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&Minor, - CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, - hDevice->get()) == CUDA_SUCCESS); -@@ -918,7 +917,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, - } else if (IsOrinAGX) { - MemoryClockKHz = 3200000; - } else { -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&MemoryClockKHz, - CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, - hDevice->get()) == CUDA_SUCCESS); -@@ -928,7 +927,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, - if (IsOrinAGX) { - MemoryBusWidth = 256; - } else { -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&MemoryBusWidth, - CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, - hDevice->get()) == CUDA_SUCCESS); -@@ -973,7 +972,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, - &MaxRegisters, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, - hDevice->get())); - -- sycl::detail::ur::assertion(MaxRegisters >= 0); -+ detail::ur::assertion(MaxRegisters >= 0); - - return ReturnValue(static_cast(MaxRegisters)); - } -@@ -984,12 +983,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, - case UR_DEVICE_INFO_PCI_ADDRESS: { - constexpr size_t AddressBufferSize = 13; - char AddressBuffer[AddressBufferSize]; -- sycl::detail::ur::assertion( -- cuDeviceGetPCIBusId(AddressBuffer, AddressBufferSize, hDevice->get()) == -- CUDA_SUCCESS); -+ detail::ur::assertion(cuDeviceGetPCIBusId(AddressBuffer, AddressBufferSize, -+ hDevice->get()) == CUDA_SUCCESS); - // CUDA API (8.x - 12.1) guarantees 12 bytes + \0 are written -- sycl::detail::ur::assertion(strnlen(AddressBuffer, AddressBufferSize) == -- 12); -+ detail::ur::assertion(strnlen(AddressBuffer, AddressBufferSize) == 12); - return ReturnValue(AddressBuffer, - strnlen(AddressBuffer, AddressBufferSize - 1) + 1); - } -diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp -index 1cfc5cc40a4a..792f69092682 100644 ---- a/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp -+++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp -@@ -794,7 +794,7 @@ static size_t imageElementByteSize(CUDA_ARRAY_DESCRIPTOR ArrayDesc) { - case CU_AD_FORMAT_FLOAT: - return 4; - default: -- sycl::detail::ur::die("Invalid image format."); -+ detail::ur::die("Invalid image format."); - return 0; - } - } -diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/event.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/event.cpp -index 8916197b73f1..066c0498f1d0 100644 ---- a/sycl/plugins/unified_runtime/ur/adapters/cuda/event.cpp -+++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/event.cpp -@@ -119,7 +119,7 @@ ur_result_t ur_event_handle_t_::record() { - try { - EventID = Queue->getNextEventID(); - if (EventID == 0) { -- sycl::detail::ur::die( -+ detail::ur::die( - "Unrecoverable program state reached in event identifier overflow"); - } - Result = UR_CHECK_ERROR(cuEventRecord(EvEnd, Stream)); -@@ -182,7 +182,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventGetInfo(ur_event_handle_t hEvent, - case UR_EVENT_INFO_CONTEXT: - return ReturnValue(hEvent->getContext()); - default: -- sycl::detail::ur::die("Event info request not implemented"); -+ detail::ur::die("Event info request not implemented"); - } - - return UR_RESULT_ERROR_INVALID_ENUMERATION; -@@ -213,7 +213,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventGetProfilingInfo( - default: - break; - } -- sycl::detail::ur::die("Event Profiling info request not implemented"); -+ detail::ur::die("Event Profiling info request not implemented"); - return {}; - } - -@@ -221,7 +221,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventSetCallback(ur_event_handle_t, - ur_execution_info_t, - ur_event_callback_t, - void *) { -- sycl::detail::ur::die("Event Callback not implemented in CUDA adapter"); -+ detail::ur::die("Event Callback not implemented in CUDA adapter"); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; - } - -@@ -254,8 +254,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventRetain(ur_event_handle_t hEvent) { - - const auto RefCount = hEvent->incrementReferenceCount(); - -- sycl::detail::ur::assertion( -- RefCount != 0, "Reference count overflow detected in urEventRetain."); -+ detail::ur::assertion(RefCount != 0, -+ "Reference count overflow detected in urEventRetain."); - - return UR_RESULT_SUCCESS; - } -@@ -265,9 +265,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventRelease(ur_event_handle_t hEvent) { - - // double delete or someone is messing with the ref count. - // either way, cannot safely proceed. -- sycl::detail::ur::assertion( -- hEvent->getReferenceCount() != 0, -- "Reference count overflow detected in urEventRelease."); -+ detail::ur::assertion(hEvent->getReferenceCount() != 0, -+ "Reference count overflow detected in urEventRelease."); - - // decrement ref count. If it is 0, delete the event. - if (hEvent->decrementReferenceCount() == 0) { -diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.cpp -index 358f59c499e1..7d46ce039bab 100644 ---- a/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.cpp -+++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.cpp -@@ -73,24 +73,24 @@ urKernelGetGroupInfo(ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, - size_t GlobalWorkSize[3] = {0, 0, 0}; - - int MaxBlockDimX{0}, MaxBlockDimY{0}, MaxBlockDimZ{0}; -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&MaxBlockDimX, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, - hDevice->get()) == CUDA_SUCCESS); -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&MaxBlockDimY, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, - hDevice->get()) == CUDA_SUCCESS); -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&MaxBlockDimZ, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, - hDevice->get()) == CUDA_SUCCESS); - - int MaxGridDimX{0}, MaxGridDimY{0}, MaxGridDimZ{0}; -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&MaxGridDimX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, - hDevice->get()) == CUDA_SUCCESS); -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&MaxGridDimY, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, - hDevice->get()) == CUDA_SUCCESS); -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&MaxGridDimZ, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, - hDevice->get()) == CUDA_SUCCESS); - -@@ -101,7 +101,7 @@ urKernelGetGroupInfo(ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, - } - case UR_KERNEL_GROUP_INFO_WORK_GROUP_SIZE: { - int MaxThreads = 0; -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuFuncGetAttribute(&MaxThreads, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, - hKernel->get()) == CUDA_SUCCESS); - return ReturnValue(size_t(MaxThreads)); -@@ -122,7 +122,7 @@ urKernelGetGroupInfo(ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, - case UR_KERNEL_GROUP_INFO_LOCAL_MEM_SIZE: { - // OpenCL LOCAL == CUDA SHARED - int Bytes = 0; -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuFuncGetAttribute(&Bytes, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, - hKernel->get()) == CUDA_SUCCESS); - return ReturnValue(uint64_t(Bytes)); -@@ -130,17 +130,17 @@ urKernelGetGroupInfo(ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, - case UR_KERNEL_GROUP_INFO_PREFERRED_WORK_GROUP_SIZE_MULTIPLE: { - // Work groups should be multiples of the warp size - int WarpSize = 0; -- sycl::detail::ur::assertion( -- cuDeviceGetAttribute(&WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, -- hDevice->get()) == CUDA_SUCCESS); -+ detail::ur::assertion(cuDeviceGetAttribute(&WarpSize, -+ CU_DEVICE_ATTRIBUTE_WARP_SIZE, -+ hDevice->get()) == CUDA_SUCCESS); - return ReturnValue(static_cast(WarpSize)); - } - case UR_KERNEL_GROUP_INFO_PRIVATE_MEM_SIZE: { - // OpenCL PRIVATE == CUDA LOCAL - int Bytes = 0; -- sycl::detail::ur::assertion( -- cuFuncGetAttribute(&Bytes, CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES, -- hKernel->get()) == CUDA_SUCCESS); -+ detail::ur::assertion(cuFuncGetAttribute(&Bytes, -+ CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES, -+ hKernel->get()) == CUDA_SUCCESS); - return ReturnValue(uint64_t(Bytes)); - } - default: -@@ -231,9 +231,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelGetInfo(ur_kernel_handle_t hKernel, - return ReturnValue(""); - case UR_KERNEL_INFO_NUM_REGS: { - int NumRegs = 0; -- sycl::detail::ur::assertion( -- cuFuncGetAttribute(&NumRegs, CU_FUNC_ATTRIBUTE_NUM_REGS, -- hKernel->get()) == CUDA_SUCCESS); -+ detail::ur::assertion(cuFuncGetAttribute(&NumRegs, -+ CU_FUNC_ATTRIBUTE_NUM_REGS, -+ hKernel->get()) == CUDA_SUCCESS); - return ReturnValue(static_cast(NumRegs)); - } - default: -@@ -254,15 +254,15 @@ urKernelGetSubGroupInfo(ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, - case UR_KERNEL_SUB_GROUP_INFO_MAX_SUB_GROUP_SIZE: { - // Sub-group size is equivalent to warp size - int WarpSize = 0; -- sycl::detail::ur::assertion( -- cuDeviceGetAttribute(&WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, -- hDevice->get()) == CUDA_SUCCESS); -+ detail::ur::assertion(cuDeviceGetAttribute(&WarpSize, -+ CU_DEVICE_ATTRIBUTE_WARP_SIZE, -+ hDevice->get()) == CUDA_SUCCESS); - return ReturnValue(static_cast(WarpSize)); - } - case UR_KERNEL_SUB_GROUP_INFO_MAX_NUM_SUB_GROUPS: { - // Number of sub-groups = max block size / warp size + possible remainder - int MaxThreads = 0; -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuFuncGetAttribute(&MaxThreads, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, - hKernel->get()) == CUDA_SUCCESS); - int WarpSize = 0; -diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/memory.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/memory.cpp -index b19acea3159f..f0c276579476 100644 ---- a/sycl/plugins/unified_runtime/ur/adapters/cuda/memory.cpp -+++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/memory.cpp -@@ -162,8 +162,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemRelease(ur_mem_handle_t hMem) { - // error for which it is unclear if the function that reported it succeeded - // or not. Either way, the state of the program is compromised and likely - // unrecoverable. -- sycl::detail::ur::die( -- "Unrecoverable program state reached in urMemRelease"); -+ detail::ur::die("Unrecoverable program state reached in urMemRelease"); - } - - return UR_RESULT_SUCCESS; -@@ -331,7 +330,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemImageCreate( - PixelTypeSizeBytes = 4; - break; - default: -- sycl::detail::ur::die( -+ detail::ur::die( - "urMemImageCreate given unsupported image_channel_data_type"); - } - -diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.cpp -index 05443eeed89d..32391fec5c13 100644 ---- a/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.cpp -+++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.cpp -@@ -265,7 +265,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueCreateWithNativeHandle( - else if (CuFlags == CU_STREAM_NON_BLOCKING) - Flags = UR_QUEUE_FLAG_SYNC_WITH_DEFAULT_STREAM; - else -- sycl::detail::ur::die("Unknown cuda stream"); -+ detail::ur::die("Unknown cuda stream"); - - std::vector ComputeCuStreams(1, CuStream); - std::vector TransferCuStreams(0); -diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/sampler.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/sampler.cpp -index 36ec89fb9da3..836e47f988e5 100644 ---- a/sycl/plugins/unified_runtime/ur/adapters/cuda/sampler.cpp -+++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/sampler.cpp -@@ -73,7 +73,7 @@ urSamplerRelease(ur_sampler_handle_t hSampler) { - - // double delete or someone is messing with the ref count. - // either way, cannot safely proceed. -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - hSampler->getReferenceCount() != 0, - "Reference count overflow detected in urSamplerRelease."); - --- -2.41.0 - diff --git a/source/adapters/CMakeLists.txt b/source/adapters/CMakeLists.txt index 9a817f4abe..543f526032 100644 --- a/source/adapters/CMakeLists.txt +++ b/source/adapters/CMakeLists.txt @@ -6,18 +6,16 @@ add_subdirectory(null) -if(UR_BUILD_ADAPTER_L0 OR UR_BUILD_ADAPTER_CUDA) +if(UR_BUILD_ADAPTER_L0 OR UR_BUILD_ADAPTER_CUDA OR UR_BUILD_ADAPTER_HIP) # fetch adapter sources from SYCL set(SYCL_ADAPTER_DIR "${CMAKE_CURRENT_SOURCE_DIR}/external") - FetchSource(https://github.com/intel/llvm.git sycl-nightly/20230706 "sycl/plugins/unified_runtime/ur" ${SYCL_ADAPTER_DIR}) + FetchSource(https://github.com/intel/llvm.git sycl-nightly/20230713 "sycl/plugins/unified_runtime/ur" ${SYCL_ADAPTER_DIR}) get_program_version_major_minor(git GIT_VERSION) set(GIT_QUIET_OPTION "") if(GIT_VERSION VERSION_GREATER_EQUAL "3.35.0") set(GIT_QUIET_OPTION "--quiet") endif() - execute_process(COMMAND git apply ${GIT_QUIET_OPTION} ../0001-SYCL-CUDA-remove-sycl-namespaces-from-ur-adapter.patch - WORKING_DIRECTORY ${SYCL_ADAPTER_DIR}) endif() if(UR_BUILD_ADAPTER_L0) @@ -27,3 +25,7 @@ endif() if(UR_BUILD_ADAPTER_CUDA) add_subdirectory(cuda) endif() + +if(UR_BUILD_ADAPTER_HIP) +add_subdirectory(hip) +endif() diff --git a/source/adapters/hip/CMakeLists.txt b/source/adapters/hip/CMakeLists.txt new file mode 100644 index 0000000000..7b7f10d227 --- /dev/null +++ b/source/adapters/hip/CMakeLists.txt @@ -0,0 +1,144 @@ +# Copyright (C) 2022 Intel Corporation +# Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. +# See LICENSE.TXT +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +set(HIP_DIR "${SYCL_ADAPTER_DIR}/sycl/plugins/unified_runtime/ur/adapters/hip") + +set(TARGET_NAME ur_adapter_hip) + +# Set default UR HIP platform to AMD +set(UR_HIP_PLATFORM "AMD" CACHE STRING "UR HIP platform, AMD or NVIDIA") + +# Set default ROCm installation directory +set(UR_HIP_ROCM_DIR "/opt/rocm" CACHE STRING "ROCm installation dir") + +set(UR_HIP_INCLUDE_DIR "${UR_HIP_ROCM_DIR}/include") + +set(UR_HIP_HSA_INCLUDE_DIR "${UR_HIP_ROCM_DIR}/hsa/include") + +# Set HIP lib dir +set(UR_HIP_LIB_DIR "${UR_HIP_ROCM_DIR}/hip/lib") + +# Check if HIP library path exists (AMD platform only) +if("${UR_HIP_PLATFORM}" STREQUAL "AMD") + if(NOT EXISTS "${UR_HIP_LIB_DIR}") + message(FATAL_ERROR "Couldn't find the HIP library directory at '${UR_HIP_LIB_DIR}'," + " please check ROCm installation.") + endif() + # Check if HIP include path exists + if(NOT EXISTS "${UR_HIP_INCLUDE_DIR}") + message(FATAL_ERROR "Couldn't find the HIP include directory at '${UR_HIP_INCLUDE_DIR}'," + " please check ROCm installation.") + endif() + + # Check if HSA include path exists + if(NOT EXISTS "${UR_HIP_HSA_INCLUDE_DIR}") + message(FATAL_ERROR "Couldn't find the HSA include directory at '${UR_HIP_HSA_INCLUDE_DIR}'," + " please check ROCm installation.") + endif() +endif() + +# Set includes used in added library (rocmdrv) +set(HIP_HEADERS "${UR_HIP_INCLUDE_DIR};${UR_HIP_HSA_INCLUDE_DIR}") + +add_library(${TARGET_NAME} + SHARED + ${HIP_DIR}/ur_interface_loader.cpp + ${HIP_DIR}/common.hpp + ${HIP_DIR}/common.cpp + ${HIP_DIR}/context.hpp + ${HIP_DIR}/context.cpp + ${HIP_DIR}/device.hpp + ${HIP_DIR}/device.cpp + ${HIP_DIR}/enqueue.cpp + ${HIP_DIR}/event.hpp + ${HIP_DIR}/event.cpp + ${HIP_DIR}/kernel.hpp + ${HIP_DIR}/kernel.cpp + ${HIP_DIR}/memory.hpp + ${HIP_DIR}/memory.cpp + ${HIP_DIR}/platform.hpp + ${HIP_DIR}/platform.cpp + ${HIP_DIR}/program.hpp + ${HIP_DIR}/program.cpp + ${HIP_DIR}/queue.hpp + ${HIP_DIR}/queue.cpp + ${HIP_DIR}/sampler.hpp + ${HIP_DIR}/sampler.cpp + ${HIP_DIR}/usm.cpp + ${HIP_DIR}/../../ur.cpp + ${HIP_DIR}/../../ur.hpp + ${HIP_DIR}/../../usm_allocator.cpp + ${HIP_DIR}/../../usm_allocator.hpp + ${HIP_DIR}/../../usm_allocator_config.cpp + ${HIP_DIR}/../../usm_allocator_config.hpp +) + +set_target_properties(${TARGET_NAME} PROPERTIES + VERSION "${PROJECT_VERSION_MAJOR}.${PROJECT_VERSION_MINOR}.${PROJECT_VERSION_PATCH}" + SOVERSION "${PROJECT_VERSION_MAJOR}" +) + +if("${UR_HIP_PLATFORM}" STREQUAL "AMD") + # Import HIP runtime library + add_library(rocmdrv SHARED IMPORTED GLOBAL) + + set_target_properties( + rocmdrv PROPERTIES + IMPORTED_LOCATION "${UR_HIP_LIB_DIR}/libamdhip64.so" + INTERFACE_INCLUDE_DIRECTORIES "${HIP_HEADERS}" + INTERFACE_SYSTEM_INCLUDE_DIRECTORIES "${HIP_HEADERS}" + ) + + target_link_libraries(${TARGET_NAME} PRIVATE + ${PROJECT_NAME}::headers + ${PROJECT_NAME}::common + rocmdrv + ) + + # Set HIP define to select AMD platform + target_compile_definitions(${TARGET_NAME} PRIVATE __HIP_PLATFORM_AMD__) +elseif("${UR_HIP_PLATFORM}" STREQUAL "NVIDIA") + # Import CUDA libraries + find_package(CUDA REQUIRED) + find_package(Threads REQUIRED) + + list(APPEND HIP_HEADERS ${CUDA_INCLUDE_DIRS}) + + # cudadrv may be defined by the CUDA plugin + if(NOT TARGET cudadrv) + add_library(cudadrv SHARED IMPORTED GLOBAL) + set_target_properties( + cudadrv PROPERTIES + IMPORTED_LOCATION ${CUDA_CUDA_LIBRARY} + INTERFACE_INCLUDE_DIRECTORIES "${HIP_HEADERS}" + INTERFACE_SYSTEM_INCLUDE_DIRECTORIES "${HIP_HEADERS}" + ) + endif() + + add_library(cudart SHARED IMPORTED GLOBAL) + set_target_properties( + cudart PROPERTIES + IMPORTED_LOCATION ${CUDA_CUDART_LIBRARY} + INTERFACE_INCLUDE_DIRECTORIES "${HIP_HEADERS}" + INTERFACE_SYSTEM_INCLUDE_DIRECTORIES "${HIP_HEADERS}" + ) + + target_link_libraries(${TARGET_NAME} PRIVATE + ${PROJECT_NAME}::headers + ${PROJECT_NAME}::common + Threads::Threads + cudadrv + cudart + ) + + # Set HIP define to select NVIDIA platform + target_compile_definitions(${TARGET_NAME} PRIVATE __HIP_PLATFORM_NVIDIA__) +else() + message(FATAL_ERROR "Unspecified UR HIP platform please set UR_HIP_PLATFORM to 'AMD' or 'NVIDIA'") +endif() + +target_include_directories(${TARGET_NAME} PRIVATE + ${HIP_DIR}/../../../ +)