From f778e6f209ff6d6ce519b3c03ed6486e2d2fe2d5 Mon Sep 17 00:00:00 2001 From: pbalcer Date: Fri, 2 Jun 2023 15:14:26 +0200 Subject: [PATCH 01/16] [ur][L0] add external level zero adapter Allows fetching and building level zero adapter without the need to use the entire SYCL codebase. Convenient for testing and prototyping of other language runtimes. --- .github/workflows/cmake.yml | 2 +- .gitignore | 3 + CMakeLists.txt | 1 + README.md | 6 ++ cmake/helpers.cmake | 28 ++--- source/adapters/CMakeLists.txt | 4 + source/adapters/level_zero/CMakeLists.txt | 119 ++++++++++++++++++++++ 7 files changed, 150 insertions(+), 13 deletions(-) create mode 100644 source/adapters/level_zero/CMakeLists.txt diff --git a/.github/workflows/cmake.yml b/.github/workflows/cmake.yml index 75f1de272d..618c099a21 100644 --- a/.github/workflows/cmake.yml +++ b/.github/workflows/cmake.yml @@ -55,10 +55,10 @@ jobs: -DCMAKE_C_COMPILER=${{matrix.compiler.c}} -DCMAKE_CXX_COMPILER=${{matrix.compiler.cxx}} -DUR_ENABLE_TRACING=ON - -DUR_DEVELOPER_MODE=ON -DCMAKE_BUILD_TYPE=${{matrix.build_type}} -DUR_BUILD_TESTS=ON -DUR_FORMAT_CPP_STYLE=ON + -DUR_BUILD_ADAPTER_L0=ON ${{matrix.libbacktrace}} - name: Generate source from spec, check for uncommitted diff diff --git a/.gitignore b/.gitignore index a1a488bc14..263b72d4d4 100644 --- a/.gitignore +++ b/.gitignore @@ -83,3 +83,6 @@ out/ # IDE Files /.vscode /.devcontainer + +# External content +/external diff --git a/CMakeLists.txt b/CMakeLists.txt index efb9565171..352965ca90 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -28,6 +28,7 @@ option(UMA_BUILD_SHARED_LIBRARY "Build UMA as shared library" OFF) option(UR_ENABLE_TRACING "enable api tracing through xpti" OFF) option(VAL_USE_LIBBACKTRACE_BACKTRACE "enable libbacktrace validation backtrace for linux" OFF) option(UR_BUILD_TOOLS "build ur tools" ON) +option(UR_BUILD_ADAPTER_L0 "build level 0 adapter from SYCL" OFF) set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) 
set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) diff --git a/README.md b/README.md index 6b34d4d299..c178788ceb 100644 --- a/README.md +++ b/README.md @@ -2,6 +2,11 @@ [![GHA build status](https://github.com/oneapi-src/unified-runtime/actions/workflows/cmake.yml/badge.svg?branch=main)](https://github.com/oneapi-src/unified-runtime/actions) +## Adapters +Adapter implementations for Unified Runtime currently reside in the [SYCL repository](https://github.com/intel/llvm/tree/sycl/sycl/plugins/unified_runtime/ur). This branch contains scripts to automatically +fetch and build them directly in the UR tree. The adapters are disabled by default, +see cmake options for details. + ## Contents This repo contains the following: @@ -99,6 +104,7 @@ List of options provided by CMake: | UR_USE_MSAN | Enable MemorySanitizer (clang only) | ON/OFF | OFF | | UR_ENABLE_TRACING | Enable XPTI-based tracing layer | ON/OFF | OFF | | UR_BUILD_TOOLS | Build tools | ON/OFF | ON | +| UR_BUILD_ADAPTER_L0 | Fetch and use level-zero adapter from SYCL | ON/OFF | OFF | **General**: diff --git a/cmake/helpers.cmake b/cmake/helpers.cmake index 17ac97cd5f..a99286434a 100644 --- a/cmake/helpers.cmake +++ b/cmake/helpers.cmake @@ -57,22 +57,26 @@ endmacro() include(FetchContent) -# A wrapper around FetchContent_Declare that supports git sparse checkout. -# This is useful for including subprojects from large repositories. 
-function(FetchContentSparse_Declare name GIT_REPOSITORY GIT_TAG GIT_DIR) - set(content-build-dir ${CMAKE_BINARY_DIR}/content-${name}) - message(STATUS "Fetching sparse content ${GIT_DIR} from ${GIT_REPOSITORY} ${GIT_TAG}") - IF(NOT EXISTS ${content-build-dir}) - file(MAKE_DIRECTORY ${content-build-dir}) +function(FetchSource GIT_REPOSITORY GIT_TAG GIT_DIR DEST) +message(STATUS "Fetching sparse source ${GIT_DIR} from ${GIT_REPOSITORY} ${GIT_TAG}") + IF(NOT EXISTS ${DEST}) + file(MAKE_DIRECTORY ${DEST}) execute_process(COMMAND git init -b main - WORKING_DIRECTORY ${content-build-dir}) + WORKING_DIRECTORY ${DEST}) execute_process(COMMAND git remote add origin ${GIT_REPOSITORY} - WORKING_DIRECTORY ${content-build-dir}) + WORKING_DIRECTORY ${DEST}) execute_process(COMMAND git config core.sparsecheckout true - WORKING_DIRECTORY ${content-build-dir}) - file(APPEND ${content-build-dir}/.git/info/sparse-checkout ${GIT_DIR}/) + WORKING_DIRECTORY ${DEST}) + file(APPEND ${DEST}/.git/info/sparse-checkout ${GIT_DIR}/) endif() execute_process(COMMAND git pull --depth=1 origin ${GIT_TAG} - WORKING_DIRECTORY ${content-build-dir}) + WORKING_DIRECTORY ${DEST}) +endfunction() + +# A wrapper around FetchContent_Declare that supports git sparse checkout. +# This is useful for including subprojects from large repositories. 
+function(FetchContentSparse_Declare name GIT_REPOSITORY GIT_TAG GIT_DIR) + set(content-build-dir ${CMAKE_BINARY_DIR}/content-${name}) + FetchSource(${GIT_REPOSITORY} ${GIT_TAG} ${GIT_DIR} ${content-build-dir}) FetchContent_Declare(${name} SOURCE_DIR ${content-build-dir}/${GIT_DIR}) endfunction() diff --git a/source/adapters/CMakeLists.txt b/source/adapters/CMakeLists.txt index 7d7c0429e2..9964333e4a 100644 --- a/source/adapters/CMakeLists.txt +++ b/source/adapters/CMakeLists.txt @@ -2,3 +2,7 @@ # SPDX-License-Identifier: MIT add_subdirectory(null) + +if(UR_BUILD_ADAPTER_L0) +add_subdirectory(level_zero) +endif() diff --git a/source/adapters/level_zero/CMakeLists.txt b/source/adapters/level_zero/CMakeLists.txt new file mode 100644 index 0000000000..4c672b7c69 --- /dev/null +++ b/source/adapters/level_zero/CMakeLists.txt @@ -0,0 +1,119 @@ +# Copyright (C) 2022 Intel Corporation +# Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. +# See LICENSE.TXT +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +set(SYCL_L0_DIR "${CMAKE_CURRENT_SOURCE_DIR}/external") +set(L0_DIR "${SYCL_L0_DIR}/sycl/plugins/unified_runtime/ur/adapters/level_zero") + +# fetch xpti proxy library for the tracing layer +FetchSource(https://github.com/intel/llvm.git sycl-nightly/20230601 "sycl/plugins/unified_runtime/ur" ${SYCL_L0_DIR}) + +set(TARGET_NAME ur_adapter_level_zero) + +if (NOT DEFINED LEVEL_ZERO_LIBRARY OR NOT DEFINED LEVEL_ZERO_INCLUDE_DIR) + message(STATUS "Download Level Zero loader and headers from github.com") + + set(LEVEL_ZERO_LOADER_REPO "https://github.com/oneapi-src/level-zero.git") + set(LEVEL_ZERO_LOADER_TAG v1.8.8) + + # Disable due to a bug https://github.com/oneapi-src/level-zero/issues/104 + set(CMAKE_INCLUDE_CURRENT_DIR OFF) + # Prevent L0 loader from exporting extra symbols + set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS OFF) + + message(STATUS "Will fetch Level Zero Loader from ${LEVEL_ZERO_LOADER_REPO}") + include(FetchContent) + 
FetchContent_Declare(level-zero-loader + GIT_REPOSITORY ${LEVEL_ZERO_LOADER_REPO} + GIT_TAG ${LEVEL_ZERO_LOADER_TAG} + ) + if (WIN32) + set(USE_Z7 ON) + endif() + FetchContent_MakeAvailable(level-zero-loader) + FetchContent_GetProperties(level-zero-loader) + + set(LEVEL_ZERO_LIBRARY ze_loader) + set(LEVEL_ZERO_INCLUDE_DIR + ${level-zero-loader_SOURCE_DIR}/include CACHE PATH "Path to Level Zero Headers") +endif() + +add_library (LevelZeroLoader INTERFACE) +# The MSVC linker does not like / at the start of a path, so to work around this +# we split it into a link library and a library path, where the path is allowed +# to have leading /. +get_filename_component(LEVEL_ZERO_LIBRARY_SRC "${LEVEL_ZERO_LIBRARY}" DIRECTORY) +get_filename_component(LEVEL_ZERO_LIB_NAME "${LEVEL_ZERO_LIBRARY}" NAME) +target_link_directories(LevelZeroLoader + INTERFACE "${LEVEL_ZERO_LIBRARY_SRC}" +) +target_link_libraries(LevelZeroLoader + INTERFACE "${LEVEL_ZERO_LIB_NAME}" +) +if (WIN32) + # TODO: fix building level-zero loader on Windows + # target_compile_options(${LEVEL_ZERO_LIB_NAME} /UUNICODE) +else() + target_compile_options(${LEVEL_ZERO_LIB_NAME} PUBLIC + -Wno-unused-but-set-variable + -Wno-pedantic + -Wno-unused-parameter + -Wno-error + ) +endif() + +add_library (LevelZeroLoader-Headers INTERFACE) +target_include_directories(LevelZeroLoader-Headers + INTERFACE "${LEVEL_ZERO_INCLUDE_DIR}" +) + +add_library(${TARGET_NAME} + SHARED + ${L0_DIR}/ur_loader_interface.cpp + ${L0_DIR}/ur_level_zero_common.hpp + ${L0_DIR}/ur_level_zero_context.hpp + ${L0_DIR}/ur_level_zero_device.hpp + ${L0_DIR}/ur_level_zero_event.hpp + ${L0_DIR}/ur_level_zero_mem.hpp + ${L0_DIR}/ur_level_zero_kernel.hpp + ${L0_DIR}/ur_level_zero_platform.hpp + ${L0_DIR}/ur_level_zero_program.hpp + ${L0_DIR}/ur_level_zero_queue.hpp + ${L0_DIR}/ur_level_zero_sampler.hpp + ${L0_DIR}/ur_level_zero.cpp + ${L0_DIR}/ur_level_zero_common.cpp + ${L0_DIR}/ur_level_zero_context.cpp + ${L0_DIR}/ur_level_zero_device.cpp + 
${L0_DIR}/ur_level_zero_event.cpp + ${L0_DIR}/ur_level_zero_mem.cpp + ${L0_DIR}/ur_level_zero_kernel.cpp + ${L0_DIR}/ur_level_zero_platform.cpp + ${L0_DIR}/ur_level_zero_program.cpp + ${L0_DIR}/ur_level_zero_queue.cpp + ${L0_DIR}/ur_level_zero_sampler.cpp + ${L0_DIR}/../../ur.cpp + ${L0_DIR}/../../usm_allocator_config.hpp +) + +set_target_properties(${TARGET_NAME} PROPERTIES + VERSION "${PROJECT_VERSION_MAJOR}.${PROJECT_VERSION_MINOR}.${PROJECT_VERSION_PATCH}" + SOVERSION "${PROJECT_VERSION_MAJOR}" +) + +target_link_libraries(${TARGET_NAME} PRIVATE + ${PROJECT_NAME}::headers + ${PROJECT_NAME}::common + LevelZeroLoader-Headers +) + +target_include_directories(${TARGET_NAME} PRIVATE + ${L0_DIR}/../../../ + LevelZeroLoader-Headers +) + +if(UNIX) + set(GCC_COVERAGE_COMPILE_FLAGS "-fvisibility=hidden -fvisibility-inlines-hidden -fno-strict-aliasing") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${GCC_COVERAGE_COMPILE_FLAGS}") +endif() + From 80e168ee95c5dc38c7d4b64354a7a7f8b3159d88 Mon Sep 17 00:00:00 2001 From: pbalcer Date: Mon, 5 Jun 2023 12:10:44 +0200 Subject: [PATCH 02/16] [level-zero] fix build script and failing tests --- source/adapters/level_zero/CMakeLists.txt | 25 +++++++++++++++++------ test/loader/platforms/CMakeLists.txt | 2 +- test/loader/platforms/platforms.cpp | 18 ++++++++-------- 3 files changed, 29 insertions(+), 16 deletions(-) diff --git a/source/adapters/level_zero/CMakeLists.txt b/source/adapters/level_zero/CMakeLists.txt index 4c672b7c69..cde6148dc9 100644 --- a/source/adapters/level_zero/CMakeLists.txt +++ b/source/adapters/level_zero/CMakeLists.txt @@ -28,12 +28,21 @@ if (NOT DEFINED LEVEL_ZERO_LIBRARY OR NOT DEFINED LEVEL_ZERO_INCLUDE_DIR) GIT_REPOSITORY ${LEVEL_ZERO_LOADER_REPO} GIT_TAG ${LEVEL_ZERO_LOADER_TAG} ) - if (WIN32) - set(USE_Z7 ON) + set(CMAKE_CXX_FLAGS_BAK "${CMAKE_CXX_FLAGS}") + if(MSVC) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /WX-") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /WX-") + # FIXME: Unified runtime build fails with 
/DUNICODE + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /UUNICODE") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /UUNICODE") + # USE_Z7 forces use of /Z7 instead of /Zi which is broken with sccache + set(USE_Z7 ON) endif() FetchContent_MakeAvailable(level-zero-loader) FetchContent_GetProperties(level-zero-loader) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS_BAK}") + set(LEVEL_ZERO_LIBRARY ze_loader) set(LEVEL_ZERO_INCLUDE_DIR ${level-zero-loader_SOURCE_DIR}/include CACHE PATH "Path to Level Zero Headers") @@ -51,10 +60,8 @@ target_link_directories(LevelZeroLoader target_link_libraries(LevelZeroLoader INTERFACE "${LEVEL_ZERO_LIB_NAME}" ) -if (WIN32) - # TODO: fix building level-zero loader on Windows - # target_compile_options(${LEVEL_ZERO_LIB_NAME} /UUNICODE) -else() + +if (NOT MSVC) target_compile_options(${LEVEL_ZERO_LIB_NAME} PUBLIC -Wno-unused-but-set-variable -Wno-pedantic @@ -75,6 +82,7 @@ add_library(${TARGET_NAME} ${L0_DIR}/ur_level_zero_context.hpp ${L0_DIR}/ur_level_zero_device.hpp ${L0_DIR}/ur_level_zero_event.hpp + ${L0_DIR}/ur_level_zero_usm.hpp ${L0_DIR}/ur_level_zero_mem.hpp ${L0_DIR}/ur_level_zero_kernel.hpp ${L0_DIR}/ur_level_zero_platform.hpp @@ -86,6 +94,7 @@ add_library(${TARGET_NAME} ${L0_DIR}/ur_level_zero_context.cpp ${L0_DIR}/ur_level_zero_device.cpp ${L0_DIR}/ur_level_zero_event.cpp + ${L0_DIR}/ur_level_zero_usm.cpp ${L0_DIR}/ur_level_zero_mem.cpp ${L0_DIR}/ur_level_zero_kernel.cpp ${L0_DIR}/ur_level_zero_platform.cpp @@ -93,6 +102,9 @@ add_library(${TARGET_NAME} ${L0_DIR}/ur_level_zero_queue.cpp ${L0_DIR}/ur_level_zero_sampler.cpp ${L0_DIR}/../../ur.cpp + ${L0_DIR}/../../usm_allocator.cpp + ${L0_DIR}/../../usm_allocator.hpp + ${L0_DIR}/../../usm_allocator_config.cpp ${L0_DIR}/../../usm_allocator_config.hpp ) @@ -104,6 +116,7 @@ set_target_properties(${TARGET_NAME} PROPERTIES target_link_libraries(${TARGET_NAME} PRIVATE ${PROJECT_NAME}::headers ${PROJECT_NAME}::common + LevelZeroLoader LevelZeroLoader-Headers ) diff --git 
a/test/loader/platforms/CMakeLists.txt b/test/loader/platforms/CMakeLists.txt index de1fa76cfd..8db4fbb81c 100644 --- a/test/loader/platforms/CMakeLists.txt +++ b/test/loader/platforms/CMakeLists.txt @@ -31,5 +31,5 @@ function(add_loader_platform_test name ENV) ) endfunction() -add_loader_platform_test(no_platforms "") +add_loader_platform_test(no_platforms "UR_ADAPTERS_FORCE_LOAD=\"\"") add_loader_platform_test(null_platform "UR_ADAPTERS_FORCE_LOAD=\"$\"") diff --git a/test/loader/platforms/platforms.cpp b/test/loader/platforms/platforms.cpp index f230025b26..2eaea35be9 100644 --- a/test/loader/platforms/platforms.cpp +++ b/test/loader/platforms/platforms.cpp @@ -17,32 +17,32 @@ using namespace logger; ////////////////////////////////////////////////////////////////////////// int main(int argc, char *argv[]) { - logger::init("TEST"); + auto out = create_logger("TEST"); ur_result_t status; // Initialize the platform status = urInit(0); if (status != UR_RESULT_SUCCESS) { - error("urInit failed with return code: {}", status); + out.error("urInit failed with return code: {}", status); return 1; } - info("urInit succeeded."); + out.info("urInit succeeded."); uint32_t platformCount = 0; std::vector platforms; status = urPlatformGet(1, nullptr, &platformCount); if (status != UR_RESULT_SUCCESS) { - error("urPlatformGet failed with return code: {}", status); + out.error("urPlatformGet failed with return code: {}", status); goto out; } - info("urPlatformGet found {} platforms", platformCount); + out.info("urPlatformGet found {} platforms", platformCount); platforms.resize(platformCount); status = urPlatformGet(platformCount, platforms.data(), nullptr); if (status != UR_RESULT_SUCCESS) { - error("urPlatformGet failed with return code: {}", status); + out.error("urPlatformGet failed with return code: {}", status); goto out; } @@ -51,7 +51,7 @@ int main(int argc, char *argv[]) { status = urPlatformGetInfo(p, UR_PLATFORM_INFO_NAME, 0, nullptr, &name_len); if (status != 
UR_RESULT_SUCCESS) { - error("urPlatformGetInfo failed with return code: {}", status); + out.error("urPlatformGetInfo failed with return code: {}", status); goto out; } @@ -61,11 +61,11 @@ int main(int argc, char *argv[]) { status = urPlatformGetInfo(p, UR_PLATFORM_INFO_NAME, name_len, name, nullptr); if (status != UR_RESULT_SUCCESS) { - error("urPlatformGetInfo failed with return code: {}", status); + out.error("urPlatformGetInfo failed with return code: {}", status); free(name); goto out; } - info("Found {} ", name); + out.info("Found {} ", name); free(name); } From 09994ffab02df637d01718eedfa1ea15f17c1a90 Mon Sep 17 00:00:00 2001 From: pbalcer Date: Thu, 29 Jun 2023 13:34:45 +0200 Subject: [PATCH 03/16] update level-zero --- cmake/helpers.cmake | 6 ++++-- source/adapters/level_zero/CMakeLists.txt | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/cmake/helpers.cmake b/cmake/helpers.cmake index d0963d36c8..a4a3a70e78 100644 --- a/cmake/helpers.cmake +++ b/cmake/helpers.cmake @@ -60,7 +60,7 @@ endmacro() include(FetchContent) function(FetchSource GIT_REPOSITORY GIT_TAG GIT_DIR DEST) -message(STATUS "Fetching sparse source ${GIT_DIR} from ${GIT_REPOSITORY} ${GIT_TAG}") + message(STATUS "Fetching sparse source ${GIT_DIR} from ${GIT_REPOSITORY} ${GIT_TAG}") IF(NOT EXISTS ${DEST}) file(MAKE_DIRECTORY ${DEST}) execute_process(COMMAND git init -b main @@ -71,7 +71,9 @@ message(STATUS "Fetching sparse source ${GIT_DIR} from ${GIT_REPOSITORY} ${GIT_T WORKING_DIRECTORY ${DEST}) file(APPEND ${DEST}/.git/info/sparse-checkout ${GIT_DIR}/) endif() - execute_process(COMMAND git pull --depth=1 origin ${GIT_TAG} + execute_process(COMMAND git fetch --depth=1 origin refs/tags/${GIT_TAG}:refs/tags/${GIT_TAG} + WORKING_DIRECTORY ${DEST}) + execute_process(COMMAND git checkout --quiet ${GIT_TAG} WORKING_DIRECTORY ${DEST}) endfunction() diff --git a/source/adapters/level_zero/CMakeLists.txt b/source/adapters/level_zero/CMakeLists.txt index cde6148dc9..6bfd35084d 
100644 --- a/source/adapters/level_zero/CMakeLists.txt +++ b/source/adapters/level_zero/CMakeLists.txt @@ -7,7 +7,7 @@ set(SYCL_L0_DIR "${CMAKE_CURRENT_SOURCE_DIR}/external") set(L0_DIR "${SYCL_L0_DIR}/sycl/plugins/unified_runtime/ur/adapters/level_zero") # fetch xpti proxy library for the tracing layer -FetchSource(https://github.com/intel/llvm.git sycl-nightly/20230601 "sycl/plugins/unified_runtime/ur" ${SYCL_L0_DIR}) +FetchSource(https://github.com/intel/llvm.git sycl-nightly/20230628 "sycl/plugins/unified_runtime/ur" ${SYCL_L0_DIR}) set(TARGET_NAME ur_adapter_level_zero) From ed6e5b4a61103be96f4db3e8d3fb0fa912c5a183 Mon Sep 17 00:00:00 2001 From: pbalcer Date: Thu, 29 Jun 2023 14:31:51 +0200 Subject: [PATCH 04/16] add cuda adapter --- CMakeLists.txt | 1 + README.md | 1 + ...move-sycl-namespaces-from-ur-adapter.patch | 856 ++++++++++++++++++ source/adapters/CMakeLists.txt | 13 + source/adapters/cuda/CMakeLists.txt | 78 ++ source/adapters/level_zero/CMakeLists.txt | 6 +- 6 files changed, 950 insertions(+), 5 deletions(-) create mode 100644 source/adapters/0001-SYCL-CUDA-remove-sycl-namespaces-from-ur-adapter.patch create mode 100644 source/adapters/cuda/CMakeLists.txt diff --git a/CMakeLists.txt b/CMakeLists.txt index 8b37322a26..bedb617d8c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -31,6 +31,7 @@ option(UR_ENABLE_TRACING "enable api tracing through xpti" OFF) option(VAL_USE_LIBBACKTRACE_BACKTRACE "enable libbacktrace validation backtrace for linux" OFF) option(UR_BUILD_TOOLS "build ur tools" ON) option(UR_BUILD_ADAPTER_L0 "build level 0 adapter from SYCL" OFF) +option(UR_BUILD_ADAPTER_CUDA "build cuda adapter from SYCL" OFF) set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) diff --git a/README.md b/README.md index d292b59012..e9ccf4a3a3 100644 --- a/README.md +++ b/README.md @@ -111,6 +111,7 @@ List of options provided by CMake: | UR_ENABLE_TRACING | Enable XPTI-based tracing layer | 
ON/OFF | OFF | | UR_BUILD_TOOLS | Build tools | ON/OFF | ON | | UR_BUILD_ADAPTER_L0 | Fetch and use level-zero adapter from SYCL | ON/OFF | OFF | +| UR_BUILD_ADAPTER_CUDA | Fetch and use cuda adapter from SYCL | ON/OFF | OFF | **General**: diff --git a/source/adapters/0001-SYCL-CUDA-remove-sycl-namespaces-from-ur-adapter.patch b/source/adapters/0001-SYCL-CUDA-remove-sycl-namespaces-from-ur-adapter.patch new file mode 100644 index 0000000000..e7214e94b2 --- /dev/null +++ b/source/adapters/0001-SYCL-CUDA-remove-sycl-namespaces-from-ur-adapter.patch @@ -0,0 +1,856 @@ +From e30434a4a9e893f24e0bb18051576f297d1f4f08 Mon Sep 17 00:00:00 2001 +From: pbalcer +Date: Thu, 29 Jun 2023 14:26:26 +0200 +Subject: [PATCH] [SYCL][CUDA] remove sycl namespaces from ur adapter + +--- + .../ur/adapters/cuda/common.cpp | 6 +- + .../ur/adapters/cuda/common.hpp | 5 - + .../ur/adapters/cuda/context.cpp | 2 +- + .../ur/adapters/cuda/device.cpp | 170 +++++++++--------- + .../ur/adapters/cuda/enqueue.cpp | 2 +- + .../ur/adapters/cuda/event.cpp | 12 +- + .../ur/adapters/cuda/kernel.cpp | 26 +-- + .../ur/adapters/cuda/memory.cpp | 4 +- + .../ur/adapters/cuda/queue.cpp | 2 +- + .../ur/adapters/cuda/sampler.cpp | 2 +- + 10 files changed, 113 insertions(+), 118 deletions(-) + +diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/common.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/common.cpp +index 86975e509..83264160e 100644 +--- a/sycl/plugins/unified_runtime/ur/adapters/cuda/common.cpp ++++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/common.cpp +@@ -72,17 +72,17 @@ std::string getCudaVersionString() { + return stream.str(); + } + +-void sycl::detail::ur::die(const char *Message) { ++void detail::ur::die(const char *Message) { + std::cerr << "ur_die: " << Message << std::endl; + std::terminate(); + } + +-void sycl::detail::ur::assertion(bool Condition, const char *Message) { ++void detail::ur::assertion(bool Condition, const char *Message) { + if (!Condition) + die(Message); + } + 
+-void sycl::detail::ur::cuPrint(const char *Message) { ++void detail::ur::cuPrint(const char *Message) { + std::cerr << "ur_print: " << Message << std::endl; + } + +diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/common.hpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/common.hpp +index 5cfa60901..82b38c10d 100644 +--- a/sycl/plugins/unified_runtime/ur/adapters/cuda/common.hpp ++++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/common.hpp +@@ -8,7 +8,6 @@ + #pragma once + + #include +-#include + #include + + ur_result_t mapErrorUR(CUresult Result); +@@ -37,8 +36,6 @@ extern thread_local char ErrorMessage[MaxMessageSize]; + ur_result_t ErrorCode); + + /// ------ Error handling, matching OpenCL plugin semantics. +-namespace sycl { +-__SYCL_INLINE_VER_NAMESPACE(_V1) { + namespace detail { + namespace ur { + +@@ -55,5 +52,3 @@ void assertion(bool Condition, const char *Message = nullptr); + + } // namespace ur + } // namespace detail +-} // __SYCL_INLINE_VER_NAMESPACE(_V1) +-} // namespace sycl +diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/context.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/context.cpp +index 74a32bdac..2b621383d 100644 +--- a/sycl/plugins/unified_runtime/ur/adapters/cuda/context.cpp ++++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/context.cpp +@@ -66,7 +66,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urContextGetInfo( + } + case UR_CONTEXT_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES: { + int Major = 0; +- sycl::detail::ur::assertion( ++ detail::ur::assertion( + cuDeviceGetAttribute(&Major, + CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, + hContext->getDevice()->get()) == CUDA_SUCCESS); +diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp +index 24f9d52a0..c6b6bc07e 100644 +--- a/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp ++++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp +@@ -15,7 +15,7 @@ + + int 
getAttribute(ur_device_handle_t device, CUdevice_attribute attribute) { + int value; +- sycl::detail::ur::assertion( ++ detail::ur::assertion( + cuDeviceGetAttribute(&value, attribute, device->get()) == CUDA_SUCCESS); + return value; + } +@@ -53,11 +53,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, + } + case UR_DEVICE_INFO_MAX_COMPUTE_UNITS: { + int ComputeUnits = 0; +- sycl::detail::ur::assertion( ++ detail::ur::assertion( + cuDeviceGetAttribute(&ComputeUnits, + CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, + hDevice->get()) == CUDA_SUCCESS); +- sycl::detail::ur::assertion(ComputeUnits >= 0); ++ detail::ur::assertion(ComputeUnits >= 0); + return ReturnValue(static_cast(ComputeUnits)); + } + case UR_DEVICE_INFO_MAX_WORK_ITEM_DIMENSIONS: { +@@ -69,20 +69,20 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, + } ReturnSizes; + + int MaxX = 0, MaxY = 0, MaxZ = 0; +- sycl::detail::ur::assertion( ++ detail::ur::assertion( + cuDeviceGetAttribute(&MaxX, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, + hDevice->get()) == CUDA_SUCCESS); +- sycl::detail::ur::assertion(MaxX >= 0); ++ detail::ur::assertion(MaxX >= 0); + +- sycl::detail::ur::assertion( ++ detail::ur::assertion( + cuDeviceGetAttribute(&MaxY, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, + hDevice->get()) == CUDA_SUCCESS); +- sycl::detail::ur::assertion(MaxY >= 0); ++ detail::ur::assertion(MaxY >= 0); + +- sycl::detail::ur::assertion( ++ detail::ur::assertion( + cuDeviceGetAttribute(&MaxZ, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, + hDevice->get()) == CUDA_SUCCESS); +- sycl::detail::ur::assertion(MaxZ >= 0); ++ detail::ur::assertion(MaxZ >= 0); + + ReturnSizes.Sizes[0] = size_t(MaxX); + ReturnSizes.Sizes[1] = size_t(MaxY); +@@ -95,20 +95,20 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, + size_t Sizes[MaxWorkItemDimensions]; + } ReturnSizes; + int MaxX = 0, MaxY = 0, MaxZ = 0; +- sycl::detail::ur::assertion( ++ detail::ur::assertion( 
+ cuDeviceGetAttribute(&MaxX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, + hDevice->get()) == CUDA_SUCCESS); +- sycl::detail::ur::assertion(MaxX >= 0); ++ detail::ur::assertion(MaxX >= 0); + +- sycl::detail::ur::assertion( ++ detail::ur::assertion( + cuDeviceGetAttribute(&MaxY, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, + hDevice->get()) == CUDA_SUCCESS); +- sycl::detail::ur::assertion(MaxY >= 0); ++ detail::ur::assertion(MaxY >= 0); + +- sycl::detail::ur::assertion( ++ detail::ur::assertion( + cuDeviceGetAttribute(&MaxZ, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, + hDevice->get()) == CUDA_SUCCESS); +- sycl::detail::ur::assertion(MaxZ >= 0); ++ detail::ur::assertion(MaxZ >= 0); + + ReturnSizes.Sizes[0] = size_t(MaxX); + ReturnSizes.Sizes[1] = size_t(MaxY); +@@ -118,12 +118,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, + + case UR_DEVICE_INFO_MAX_WORK_GROUP_SIZE: { + int MaxWorkGroupSize = 0; +- sycl::detail::ur::assertion( ++ detail::ur::assertion( + cuDeviceGetAttribute(&MaxWorkGroupSize, + CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, + hDevice->get()) == CUDA_SUCCESS); + +- sycl::detail::ur::assertion(MaxWorkGroupSize >= 0); ++ detail::ur::assertion(MaxWorkGroupSize >= 0); + + return ReturnValue(size_t(MaxWorkGroupSize)); + } +@@ -172,12 +172,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, + case UR_DEVICE_INFO_MAX_NUM_SUB_GROUPS: { + // Number of sub-groups = max block size / warp size + possible remainder + int MaxThreads = 0; +- sycl::detail::ur::assertion( ++ detail::ur::assertion( + cuDeviceGetAttribute(&MaxThreads, + CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, + hDevice->get()) == CUDA_SUCCESS); + int WarpSize = 0; +- sycl::detail::ur::assertion( ++ detail::ur::assertion( + cuDeviceGetAttribute(&WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, + hDevice->get()) == CUDA_SUCCESS); + int MaxWarps = (MaxThreads + WarpSize - 1) / WarpSize; +@@ -187,7 +187,7 @@ UR_APIEXPORT ur_result_t UR_APICALL 
urDeviceGetInfo(ur_device_handle_t hDevice, + // Volta provides independent thread scheduling + // TODO: Revisit for previous generation GPUs + int Major = 0; +- sycl::detail::ur::assertion( ++ detail::ur::assertion( + cuDeviceGetAttribute(&Major, + CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, + hDevice->get()) == CUDA_SUCCESS); +@@ -197,7 +197,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, + + case UR_DEVICE_INFO_ATOMIC_64: { + int Major = 0; +- sycl::detail::ur::assertion( ++ detail::ur::assertion( + cuDeviceGetAttribute(&Major, + CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, + hDevice->get()) == CUDA_SUCCESS); +@@ -214,7 +214,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, + } + case UR_DEVICE_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES: { + int Major = 0; +- sycl::detail::ur::assertion( ++ detail::ur::assertion( + cuDeviceGetAttribute(&Major, + CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, + hDevice->get()) == CUDA_SUCCESS); +@@ -255,7 +255,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, + } + case UR_DEVICE_INFO_BFLOAT16: { + int Major = 0; +- sycl::detail::ur::assertion( ++ detail::ur::assertion( + cuDeviceGetAttribute(&Major, + CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, + hDevice->get()) == CUDA_SUCCESS); +@@ -266,7 +266,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, + case UR_DEVICE_INFO_SUB_GROUP_SIZES_INTEL: { + // NVIDIA devices only support one sub-group size (the warp size) + int WarpSize = 0; +- sycl::detail::ur::assertion( ++ detail::ur::assertion( + cuDeviceGetAttribute(&WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, + hDevice->get()) == CUDA_SUCCESS); + size_t Sizes[1] = {static_cast(WarpSize)}; +@@ -274,10 +274,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, + } + case UR_DEVICE_INFO_MAX_CLOCK_FREQUENCY: { + int ClockFreq = 0; +- sycl::detail::ur::assertion( ++ 
detail::ur::assertion( + cuDeviceGetAttribute(&ClockFreq, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, + hDevice->get()) == CUDA_SUCCESS); +- sycl::detail::ur::assertion(ClockFreq >= 0); ++ detail::ur::assertion(ClockFreq >= 0); + return ReturnValue(static_cast(ClockFreq) / 1000u); + } + case UR_DEVICE_INFO_ADDRESS_BITS: { +@@ -292,7 +292,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, + // CL_DEVICE_TYPE_CUSTOM. + + size_t Global = 0; +- sycl::detail::ur::assertion(cuDeviceTotalMem(&Global, hDevice->get()) == ++ detail::ur::assertion(cuDeviceTotalMem(&Global, hDevice->get()) == + CUDA_SUCCESS); + + auto QuarterGlobal = static_cast(Global / 4u); +@@ -308,7 +308,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, + if (std::getenv("SYCL_PI_CUDA_ENABLE_IMAGE_SUPPORT") != nullptr) { + Enabled = true; + } else { +- sycl::detail::ur::cuPrint( ++ detail::ur::cuPrint( + "Images are not fully supported by the CUDA BE, their support is " + "disabled by default. Their partial support can be activated by " + "setting SYCL_PI_CUDA_ENABLE_IMAGE_SUPPORT environment variable at " +@@ -332,17 +332,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, + case UR_DEVICE_INFO_IMAGE2D_MAX_HEIGHT: { + // Take the smaller of maximum surface and maximum texture height. 
+ int TexHeight = 0; +- sycl::detail::ur::assertion( ++ detail::ur::assertion( + cuDeviceGetAttribute(&TexHeight, + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT, + hDevice->get()) == CUDA_SUCCESS); +- sycl::detail::ur::assertion(TexHeight >= 0); ++ detail::ur::assertion(TexHeight >= 0); + int SurfHeight = 0; +- sycl::detail::ur::assertion( ++ detail::ur::assertion( + cuDeviceGetAttribute(&SurfHeight, + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT, + hDevice->get()) == CUDA_SUCCESS); +- sycl::detail::ur::assertion(SurfHeight >= 0); ++ detail::ur::assertion(SurfHeight >= 0); + + int Min = std::min(TexHeight, SurfHeight); + +@@ -351,17 +351,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, + case UR_DEVICE_INFO_IMAGE2D_MAX_WIDTH: { + // Take the smaller of maximum surface and maximum texture width. + int TexWidth = 0; +- sycl::detail::ur::assertion( ++ detail::ur::assertion( + cuDeviceGetAttribute(&TexWidth, + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH, + hDevice->get()) == CUDA_SUCCESS); +- sycl::detail::ur::assertion(TexWidth >= 0); ++ detail::ur::assertion(TexWidth >= 0); + int SurfWidth = 0; +- sycl::detail::ur::assertion( ++ detail::ur::assertion( + cuDeviceGetAttribute(&SurfWidth, + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH, + hDevice->get()) == CUDA_SUCCESS); +- sycl::detail::ur::assertion(SurfWidth >= 0); ++ detail::ur::assertion(SurfWidth >= 0); + + int Min = std::min(TexWidth, SurfWidth); + +@@ -370,17 +370,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, + case UR_DEVICE_INFO_IMAGE3D_MAX_HEIGHT: { + // Take the smaller of maximum surface and maximum texture height. 
+ int TexHeight = 0; +- sycl::detail::ur::assertion( ++ detail::ur::assertion( + cuDeviceGetAttribute(&TexHeight, + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT, + hDevice->get()) == CUDA_SUCCESS); +- sycl::detail::ur::assertion(TexHeight >= 0); ++ detail::ur::assertion(TexHeight >= 0); + int SurfHeight = 0; +- sycl::detail::ur::assertion( ++ detail::ur::assertion( + cuDeviceGetAttribute(&SurfHeight, + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT, + hDevice->get()) == CUDA_SUCCESS); +- sycl::detail::ur::assertion(SurfHeight >= 0); ++ detail::ur::assertion(SurfHeight >= 0); + + int Min = std::min(TexHeight, SurfHeight); + +@@ -389,17 +389,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, + case UR_DEVICE_INFO_IMAGE3D_MAX_WIDTH: { + // Take the smaller of maximum surface and maximum texture width. + int TexWidth = 0; +- sycl::detail::ur::assertion( ++ detail::ur::assertion( + cuDeviceGetAttribute(&TexWidth, + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH, + hDevice->get()) == CUDA_SUCCESS); +- sycl::detail::ur::assertion(TexWidth >= 0); ++ detail::ur::assertion(TexWidth >= 0); + int SurfWidth = 0; +- sycl::detail::ur::assertion( ++ detail::ur::assertion( + cuDeviceGetAttribute(&SurfWidth, + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH, + hDevice->get()) == CUDA_SUCCESS); +- sycl::detail::ur::assertion(SurfWidth >= 0); ++ detail::ur::assertion(SurfWidth >= 0); + + int Min = std::min(TexWidth, SurfWidth); + +@@ -408,17 +408,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, + case UR_DEVICE_INFO_IMAGE3D_MAX_DEPTH: { + // Take the smaller of maximum surface and maximum texture depth. 
+ int TexDepth = 0; +- sycl::detail::ur::assertion( ++ detail::ur::assertion( + cuDeviceGetAttribute(&TexDepth, + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH, + hDevice->get()) == CUDA_SUCCESS); +- sycl::detail::ur::assertion(TexDepth >= 0); ++ detail::ur::assertion(TexDepth >= 0); + int SurfDepth = 0; +- sycl::detail::ur::assertion( ++ detail::ur::assertion( + cuDeviceGetAttribute(&SurfDepth, + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH, + hDevice->get()) == CUDA_SUCCESS); +- sycl::detail::ur::assertion(SurfDepth >= 0); ++ detail::ur::assertion(SurfDepth >= 0); + + int Min = std::min(TexDepth, SurfDepth); + +@@ -427,17 +427,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, + case UR_DEVICE_INFO_IMAGE_MAX_BUFFER_SIZE: { + // Take the smaller of maximum surface and maximum texture width. + int TexWidth = 0; +- sycl::detail::ur::assertion( ++ detail::ur::assertion( + cuDeviceGetAttribute(&TexWidth, + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH, + hDevice->get()) == CUDA_SUCCESS); +- sycl::detail::ur::assertion(TexWidth >= 0); ++ detail::ur::assertion(TexWidth >= 0); + int SurfWidth = 0; +- sycl::detail::ur::assertion( ++ detail::ur::assertion( + cuDeviceGetAttribute(&SurfWidth, + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH, + hDevice->get()) == CUDA_SUCCESS); +- sycl::detail::ur::assertion(SurfWidth >= 0); ++ detail::ur::assertion(SurfWidth >= 0); + + int Min = std::min(TexWidth, SurfWidth); + +@@ -459,7 +459,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, + } + case UR_DEVICE_INFO_MEM_BASE_ADDR_ALIGN: { + int MemBaseAddrAlign = 0; +- sycl::detail::ur::assertion( ++ detail::ur::assertion( + cuDeviceGetAttribute(&MemBaseAddrAlign, + CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, + hDevice->get()) == CUDA_SUCCESS); +@@ -504,27 +504,27 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, + } + case UR_DEVICE_INFO_GLOBAL_MEM_CACHE_SIZE: { + int CacheSize = 0; +- 
sycl::detail::ur::assertion( ++ detail::ur::assertion( + cuDeviceGetAttribute(&CacheSize, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, + hDevice->get()) == CUDA_SUCCESS); +- sycl::detail::ur::assertion(CacheSize >= 0); ++ detail::ur::assertion(CacheSize >= 0); + // The L2 cache is global to the GPU. + return ReturnValue(static_cast(CacheSize)); + } + case UR_DEVICE_INFO_GLOBAL_MEM_SIZE: { + size_t Bytes = 0; + // Runtime API has easy access to this value, driver API info is scarse. +- sycl::detail::ur::assertion(cuDeviceTotalMem(&Bytes, hDevice->get()) == ++ detail::ur::assertion(cuDeviceTotalMem(&Bytes, hDevice->get()) == + CUDA_SUCCESS); + return ReturnValue(uint64_t{Bytes}); + } + case UR_DEVICE_INFO_MAX_CONSTANT_BUFFER_SIZE: { + int ConstantMemory = 0; +- sycl::detail::ur::assertion( ++ detail::ur::assertion( + cuDeviceGetAttribute(&ConstantMemory, + CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, + hDevice->get()) == CUDA_SUCCESS); +- sycl::detail::ur::assertion(ConstantMemory >= 0); ++ detail::ur::assertion(ConstantMemory >= 0); + + return ReturnValue(static_cast(ConstantMemory)); + } +@@ -542,30 +542,30 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, + // CUDA has its own definition of "local memory", which maps to OpenCL's + // "private memory". 
+ int LocalMemSize = 0; +- sycl::detail::ur::assertion( ++ detail::ur::assertion( + cuDeviceGetAttribute(&LocalMemSize, + CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, + hDevice->get()) == CUDA_SUCCESS); +- sycl::detail::ur::assertion(LocalMemSize >= 0); ++ detail::ur::assertion(LocalMemSize >= 0); + return ReturnValue(static_cast(LocalMemSize)); + } + case UR_DEVICE_INFO_ERROR_CORRECTION_SUPPORT: { + int ECCEnabled = 0; +- sycl::detail::ur::assertion( ++ detail::ur::assertion( + cuDeviceGetAttribute(&ECCEnabled, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, + hDevice->get()) == CUDA_SUCCESS); + +- sycl::detail::ur::assertion((ECCEnabled == 0) | (ECCEnabled == 1)); ++ detail::ur::assertion((ECCEnabled == 0) | (ECCEnabled == 1)); + auto Result = static_cast(ECCEnabled); + return ReturnValue(Result); + } + case UR_DEVICE_INFO_HOST_UNIFIED_MEMORY: { + int IsIntegrated = 0; +- sycl::detail::ur::assertion( ++ detail::ur::assertion( + cuDeviceGetAttribute(&IsIntegrated, CU_DEVICE_ATTRIBUTE_INTEGRATED, + hDevice->get()) == CUDA_SUCCESS); + +- sycl::detail::ur::assertion((IsIntegrated == 0) | (IsIntegrated == 1)); ++ detail::ur::assertion((IsIntegrated == 0) | (IsIntegrated == 1)); + auto result = static_cast(IsIntegrated); + return ReturnValue(result); + } +@@ -620,7 +620,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, + case UR_DEVICE_INFO_NAME: { + static constexpr size_t MaxDeviceNameLength = 256u; + char Name[MaxDeviceNameLength]; +- sycl::detail::ur::assertion( ++ detail::ur::assertion( + cuDeviceGetName(Name, MaxDeviceNameLength, hDevice->get()) == + CUDA_SUCCESS); + return ReturnValue(Name, strlen(Name) + 1); +@@ -641,13 +641,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, + case UR_DEVICE_INFO_VERSION: { + std::stringstream SS; + int Major; +- sycl::detail::ur::assertion( ++ detail::ur::assertion( + cuDeviceGetAttribute(&Major, + CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, + hDevice->get()) == 
CUDA_SUCCESS); + SS << Major; + int Minor; +- sycl::detail::ur::assertion( ++ detail::ur::assertion( + cuDeviceGetAttribute(&Minor, + CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, + hDevice->get()) == CUDA_SUCCESS); +@@ -666,11 +666,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, + int Major = 0; + int Minor = 0; + +- sycl::detail::ur::assertion( ++ detail::ur::assertion( + cuDeviceGetAttribute(&Major, + CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, + hDevice->get()) == CUDA_SUCCESS); +- sycl::detail::ur::assertion( ++ detail::ur::assertion( + cuDeviceGetAttribute(&Minor, + CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, + hDevice->get()) == CUDA_SUCCESS); +@@ -847,27 +847,27 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, + case UR_DEVICE_INFO_GLOBAL_MEM_FREE: { + size_t FreeMemory = 0; + size_t TotalMemory = 0; +- sycl::detail::ur::assertion(cuMemGetInfo(&FreeMemory, &TotalMemory) == ++ detail::ur::assertion(cuMemGetInfo(&FreeMemory, &TotalMemory) == + CUDA_SUCCESS, + "failed cuMemGetInfo() API."); + return ReturnValue(FreeMemory); + } + case UR_DEVICE_INFO_MEMORY_CLOCK_RATE: { + int Value = 0; +- sycl::detail::ur::assertion( ++ detail::ur::assertion( + cuDeviceGetAttribute(&Value, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, + hDevice->get()) == CUDA_SUCCESS); +- sycl::detail::ur::assertion(Value >= 0); ++ detail::ur::assertion(Value >= 0); + // Convert kilohertz to megahertz when returning. 
+ return ReturnValue(Value / 1000); + } + case UR_DEVICE_INFO_MEMORY_BUS_WIDTH: { + int Value = 0; +- sycl::detail::ur::assertion( ++ detail::ur::assertion( + cuDeviceGetAttribute(&Value, + CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, + hDevice->get()) == CUDA_SUCCESS); +- sycl::detail::ur::assertion(Value >= 0); ++ detail::ur::assertion(Value >= 0); + return ReturnValue(Value); + } + case UR_DEVICE_INFO_MAX_COMPUTE_QUEUE_INDICES: { +@@ -875,10 +875,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, + } + case UR_DEVICE_INFO_DEVICE_ID: { + int Value = 0; +- sycl::detail::ur::assertion( ++ detail::ur::assertion( + cuDeviceGetAttribute(&Value, CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID, + hDevice->get()) == CUDA_SUCCESS); +- sycl::detail::ur::assertion(Value >= 0); ++ detail::ur::assertion(Value >= 0); + return ReturnValue(Value); + } + case UR_DEVICE_INFO_UUID: { +@@ -888,10 +888,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, + int Minor = DriverVersion % 1000 / 10; + CUuuid UUID; + if ((Major > 11) || (Major == 11 && Minor >= 4)) { +- sycl::detail::ur::assertion(cuDeviceGetUuid_v2(&UUID, hDevice->get()) == ++ detail::ur::assertion(cuDeviceGetUuid_v2(&UUID, hDevice->get()) == + CUDA_SUCCESS); + } else { +- sycl::detail::ur::assertion(cuDeviceGetUuid(&UUID, hDevice->get()) == ++ detail::ur::assertion(cuDeviceGetUuid(&UUID, hDevice->get()) == + CUDA_SUCCESS); + } + std::array Name; +@@ -900,13 +900,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, + } + case UR_DEVICE_INFO_MAX_MEMORY_BANDWIDTH: { + int Major = 0; +- sycl::detail::ur::assertion( ++ detail::ur::assertion( + cuDeviceGetAttribute(&Major, + CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, + hDevice->get()) == CUDA_SUCCESS); + + int Minor = 0; +- sycl::detail::ur::assertion( ++ detail::ur::assertion( + cuDeviceGetAttribute(&Minor, + CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, + hDevice->get()) == 
CUDA_SUCCESS); +@@ -922,7 +922,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, + } else if (IsOrinAGX) { + MemoryClockKHz = 3200000; + } else { +- sycl::detail::ur::assertion( ++ detail::ur::assertion( + cuDeviceGetAttribute(&MemoryClockKHz, + CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, + hDevice->get()) == CUDA_SUCCESS); +@@ -932,7 +932,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, + if (IsOrinAGX) { + MemoryBusWidth = 256; + } else { +- sycl::detail::ur::assertion( ++ detail::ur::assertion( + cuDeviceGetAttribute(&MemoryBusWidth, + CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, + hDevice->get()) == CUDA_SUCCESS); +@@ -977,7 +977,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, + &MaxRegisters, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, + hDevice->get())); + +- sycl::detail::ur::assertion(MaxRegisters >= 0); ++ detail::ur::assertion(MaxRegisters >= 0); + + return ReturnValue(static_cast(MaxRegisters)); + } +@@ -988,11 +988,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, + case UR_DEVICE_INFO_PCI_ADDRESS: { + constexpr size_t AddressBufferSize = 13; + char AddressBuffer[AddressBufferSize]; +- sycl::detail::ur::assertion( ++ detail::ur::assertion( + cuDeviceGetPCIBusId(AddressBuffer, AddressBufferSize, hDevice->get()) == + CUDA_SUCCESS); + // CUDA API (8.x - 12.1) guarantees 12 bytes + \0 are written +- sycl::detail::ur::assertion(strnlen(AddressBuffer, AddressBufferSize) == ++ detail::ur::assertion(strnlen(AddressBuffer, AddressBufferSize) == + 12); + return ReturnValue(AddressBuffer, + strnlen(AddressBuffer, AddressBufferSize - 1) + 1); +diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp +index 52c4c3895..55c56aee2 100644 +--- a/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp ++++ 
b/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp +@@ -806,7 +806,7 @@ static size_t imageElementByteSize(CUDA_ARRAY_DESCRIPTOR ArrayDesc) { + case CU_AD_FORMAT_FLOAT: + return 4; + default: +- sycl::detail::ur::die("Invalid image format."); ++ detail::ur::die("Invalid image format."); + return 0; + } + } +diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/event.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/event.cpp +index 8916197b7..9d86189b9 100644 +--- a/sycl/plugins/unified_runtime/ur/adapters/cuda/event.cpp ++++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/event.cpp +@@ -119,7 +119,7 @@ ur_result_t ur_event_handle_t_::record() { + try { + EventID = Queue->getNextEventID(); + if (EventID == 0) { +- sycl::detail::ur::die( ++ detail::ur::die( + "Unrecoverable program state reached in event identifier overflow"); + } + Result = UR_CHECK_ERROR(cuEventRecord(EvEnd, Stream)); +@@ -182,7 +182,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventGetInfo(ur_event_handle_t hEvent, + case UR_EVENT_INFO_CONTEXT: + return ReturnValue(hEvent->getContext()); + default: +- sycl::detail::ur::die("Event info request not implemented"); ++ detail::ur::die("Event info request not implemented"); + } + + return UR_RESULT_ERROR_INVALID_ENUMERATION; +@@ -213,7 +213,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventGetProfilingInfo( + default: + break; + } +- sycl::detail::ur::die("Event Profiling info request not implemented"); ++ detail::ur::die("Event Profiling info request not implemented"); + return {}; + } + +@@ -221,7 +221,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventSetCallback(ur_event_handle_t, + ur_execution_info_t, + ur_event_callback_t, + void *) { +- sycl::detail::ur::die("Event Callback not implemented in CUDA adapter"); ++ detail::ur::die("Event Callback not implemented in CUDA adapter"); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + +@@ -254,7 +254,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventRetain(ur_event_handle_t hEvent) { + + const 
auto RefCount = hEvent->incrementReferenceCount(); + +- sycl::detail::ur::assertion( ++ detail::ur::assertion( + RefCount != 0, "Reference count overflow detected in urEventRetain."); + + return UR_RESULT_SUCCESS; +@@ -265,7 +265,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventRelease(ur_event_handle_t hEvent) { + + // double delete or someone is messing with the ref count. + // either way, cannot safely proceed. +- sycl::detail::ur::assertion( ++ detail::ur::assertion( + hEvent->getReferenceCount() != 0, + "Reference count overflow detected in urEventRelease."); + +diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.cpp +index 358f59c49..cae080401 100644 +--- a/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.cpp ++++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.cpp +@@ -73,24 +73,24 @@ urKernelGetGroupInfo(ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, + size_t GlobalWorkSize[3] = {0, 0, 0}; + + int MaxBlockDimX{0}, MaxBlockDimY{0}, MaxBlockDimZ{0}; +- sycl::detail::ur::assertion( ++ detail::ur::assertion( + cuDeviceGetAttribute(&MaxBlockDimX, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, + hDevice->get()) == CUDA_SUCCESS); +- sycl::detail::ur::assertion( ++ detail::ur::assertion( + cuDeviceGetAttribute(&MaxBlockDimY, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, + hDevice->get()) == CUDA_SUCCESS); +- sycl::detail::ur::assertion( ++ detail::ur::assertion( + cuDeviceGetAttribute(&MaxBlockDimZ, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, + hDevice->get()) == CUDA_SUCCESS); + + int MaxGridDimX{0}, MaxGridDimY{0}, MaxGridDimZ{0}; +- sycl::detail::ur::assertion( ++ detail::ur::assertion( + cuDeviceGetAttribute(&MaxGridDimX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, + hDevice->get()) == CUDA_SUCCESS); +- sycl::detail::ur::assertion( ++ detail::ur::assertion( + cuDeviceGetAttribute(&MaxGridDimY, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, + hDevice->get()) == CUDA_SUCCESS); +- sycl::detail::ur::assertion( ++ 
detail::ur::assertion( + cuDeviceGetAttribute(&MaxGridDimZ, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, + hDevice->get()) == CUDA_SUCCESS); + +@@ -101,7 +101,7 @@ urKernelGetGroupInfo(ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, + } + case UR_KERNEL_GROUP_INFO_WORK_GROUP_SIZE: { + int MaxThreads = 0; +- sycl::detail::ur::assertion( ++ detail::ur::assertion( + cuFuncGetAttribute(&MaxThreads, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, + hKernel->get()) == CUDA_SUCCESS); + return ReturnValue(size_t(MaxThreads)); +@@ -122,7 +122,7 @@ urKernelGetGroupInfo(ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, + case UR_KERNEL_GROUP_INFO_LOCAL_MEM_SIZE: { + // OpenCL LOCAL == CUDA SHARED + int Bytes = 0; +- sycl::detail::ur::assertion( ++ detail::ur::assertion( + cuFuncGetAttribute(&Bytes, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, + hKernel->get()) == CUDA_SUCCESS); + return ReturnValue(uint64_t(Bytes)); +@@ -130,7 +130,7 @@ urKernelGetGroupInfo(ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, + case UR_KERNEL_GROUP_INFO_PREFERRED_WORK_GROUP_SIZE_MULTIPLE: { + // Work groups should be multiples of the warp size + int WarpSize = 0; +- sycl::detail::ur::assertion( ++ detail::ur::assertion( + cuDeviceGetAttribute(&WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, + hDevice->get()) == CUDA_SUCCESS); + return ReturnValue(static_cast(WarpSize)); +@@ -138,7 +138,7 @@ urKernelGetGroupInfo(ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, + case UR_KERNEL_GROUP_INFO_PRIVATE_MEM_SIZE: { + // OpenCL PRIVATE == CUDA LOCAL + int Bytes = 0; +- sycl::detail::ur::assertion( ++ detail::ur::assertion( + cuFuncGetAttribute(&Bytes, CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES, + hKernel->get()) == CUDA_SUCCESS); + return ReturnValue(uint64_t(Bytes)); +@@ -231,7 +231,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelGetInfo(ur_kernel_handle_t hKernel, + return ReturnValue(""); + case UR_KERNEL_INFO_NUM_REGS: { + int NumRegs = 0; +- sycl::detail::ur::assertion( ++ detail::ur::assertion( + 
cuFuncGetAttribute(&NumRegs, CU_FUNC_ATTRIBUTE_NUM_REGS, + hKernel->get()) == CUDA_SUCCESS); + return ReturnValue(static_cast(NumRegs)); +@@ -254,7 +254,7 @@ urKernelGetSubGroupInfo(ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, + case UR_KERNEL_SUB_GROUP_INFO_MAX_SUB_GROUP_SIZE: { + // Sub-group size is equivalent to warp size + int WarpSize = 0; +- sycl::detail::ur::assertion( ++ detail::ur::assertion( + cuDeviceGetAttribute(&WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, + hDevice->get()) == CUDA_SUCCESS); + return ReturnValue(static_cast(WarpSize)); +@@ -262,7 +262,7 @@ urKernelGetSubGroupInfo(ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, + case UR_KERNEL_SUB_GROUP_INFO_MAX_NUM_SUB_GROUPS: { + // Number of sub-groups = max block size / warp size + possible remainder + int MaxThreads = 0; +- sycl::detail::ur::assertion( ++ detail::ur::assertion( + cuFuncGetAttribute(&MaxThreads, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, + hKernel->get()) == CUDA_SUCCESS); + int WarpSize = 0; +diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/memory.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/memory.cpp +index b19acea31..ecf840330 100644 +--- a/sycl/plugins/unified_runtime/ur/adapters/cuda/memory.cpp ++++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/memory.cpp +@@ -162,7 +162,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemRelease(ur_mem_handle_t hMem) { + // error for which it is unclear if the function that reported it succeeded + // or not. Either way, the state of the program is compromised and likely + // unrecoverable. 
+- sycl::detail::ur::die( ++ detail::ur::die( + "Unrecoverable program state reached in urMemRelease"); + } + +@@ -331,7 +331,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemImageCreate( + PixelTypeSizeBytes = 4; + break; + default: +- sycl::detail::ur::die( ++ detail::ur::die( + "urMemImageCreate given unsupported image_channel_data_type"); + } + +diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.cpp +index 05443eeed..32391fec5 100644 +--- a/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.cpp ++++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.cpp +@@ -265,7 +265,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueCreateWithNativeHandle( + else if (CuFlags == CU_STREAM_NON_BLOCKING) + Flags = UR_QUEUE_FLAG_SYNC_WITH_DEFAULT_STREAM; + else +- sycl::detail::ur::die("Unknown cuda stream"); ++ detail::ur::die("Unknown cuda stream"); + + std::vector ComputeCuStreams(1, CuStream); + std::vector TransferCuStreams(0); +diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/sampler.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/sampler.cpp +index 36ec89fb9..836e47f98 100644 +--- a/sycl/plugins/unified_runtime/ur/adapters/cuda/sampler.cpp ++++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/sampler.cpp +@@ -73,7 +73,7 @@ urSamplerRelease(ur_sampler_handle_t hSampler) { + + // double delete or someone is messing with the ref count. + // either way, cannot safely proceed. 
+- sycl::detail::ur::assertion( ++ detail::ur::assertion( + hSampler->getReferenceCount() != 0, + "Reference count overflow detected in urSamplerRelease."); + +-- +2.41.0 + diff --git a/source/adapters/CMakeLists.txt b/source/adapters/CMakeLists.txt index 1d77f91a55..fef3d99888 100644 --- a/source/adapters/CMakeLists.txt +++ b/source/adapters/CMakeLists.txt @@ -5,6 +5,19 @@ add_subdirectory(null) + +if(UR_BUILD_ADAPTER_L0 OR UR_BUILD_ADAPTER_CUDA) + # fetch adapter sources from SYCL + set(SYCL_ADAPTER_DIR "${CMAKE_CURRENT_SOURCE_DIR}/external") + FetchSource(https://github.com/intel/llvm.git sycl-nightly/20230628 "sycl/plugins/unified_runtime/ur" ${SYCL_ADAPTER_DIR}) + execute_process(COMMAND git apply ../0001-SYCL-CUDA-remove-sycl-namespaces-from-ur-adapter.patch + WORKING_DIRECTORY ${SYCL_ADAPTER_DIR}) +endif() + if(UR_BUILD_ADAPTER_L0) add_subdirectory(level_zero) endif() + +if(UR_BUILD_ADAPTER_CUDA) +add_subdirectory(cuda) +endif() diff --git a/source/adapters/cuda/CMakeLists.txt b/source/adapters/cuda/CMakeLists.txt new file mode 100644 index 0000000000..d0bc3fd6b2 --- /dev/null +++ b/source/adapters/cuda/CMakeLists.txt @@ -0,0 +1,78 @@ +# Copyright (C) 2022 Intel Corporation +# Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. 
+# See LICENSE.TXT +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +set(CUDA_DIR "${SYCL_ADAPTER_DIR}/sycl/plugins/unified_runtime/ur/adapters/cuda") + +set(TARGET_NAME ur_adapter_cuda) + +add_library(${TARGET_NAME} + SHARED + ${CUDA_DIR}/ur_interface_loader.cpp + ${CUDA_DIR}/common.hpp + ${CUDA_DIR}/common.cpp + ${CUDA_DIR}/context.hpp + ${CUDA_DIR}/context.cpp + ${CUDA_DIR}/device.hpp + ${CUDA_DIR}/device.cpp + ${CUDA_DIR}/enqueue.cpp + ${CUDA_DIR}/event.hpp + ${CUDA_DIR}/event.cpp + ${CUDA_DIR}/kernel.hpp + ${CUDA_DIR}/kernel.cpp + ${CUDA_DIR}/memory.hpp + ${CUDA_DIR}/memory.cpp + ${CUDA_DIR}/platform.hpp + ${CUDA_DIR}/platform.cpp + ${CUDA_DIR}/program.hpp + ${CUDA_DIR}/program.cpp + ${CUDA_DIR}/queue.hpp + ${CUDA_DIR}/queue.cpp + ${CUDA_DIR}/sampler.hpp + ${CUDA_DIR}/sampler.cpp + ${CUDA_DIR}/tracing.cpp + ${CUDA_DIR}/usm.cpp + ${CUDA_DIR}/../../ur.cpp + ${CUDA_DIR}/../../ur.hpp + ${CUDA_DIR}/../../usm_allocator.cpp + ${CUDA_DIR}/../../usm_allocator.hpp + ${CUDA_DIR}/../../usm_allocator_config.cpp + ${CUDA_DIR}/../../usm_allocator_config.hpp +) + +set_target_properties(${TARGET_NAME} PROPERTIES + VERSION "${PROJECT_VERSION_MAJOR}.${PROJECT_VERSION_MINOR}.${PROJECT_VERSION_PATCH}" + SOVERSION "${PROJECT_VERSION_MAJOR}" +) + +find_package(Threads REQUIRED) +find_package(CUDA 10.1 REQUIRED) + +# Make imported library global to use it within the project. 
+add_library(cudadrv SHARED IMPORTED GLOBAL) + +if (WIN32) + set_target_properties( + cudadrv PROPERTIES + IMPORTED_IMPLIB ${CUDA_CUDA_LIBRARY} + INTERFACE_INCLUDE_DIRECTORIES ${CUDA_INCLUDE_DIRS} + ) +else() + set_target_properties( + cudadrv PROPERTIES + IMPORTED_LOCATION ${CUDA_CUDA_LIBRARY} + INTERFACE_INCLUDE_DIRECTORIES ${CUDA_INCLUDE_DIRS} + ) +endif() + +target_link_libraries(${TARGET_NAME} PRIVATE + ${PROJECT_NAME}::headers + ${PROJECT_NAME}::common + Threads::Threads + cudadrv +) + +target_include_directories(${TARGET_NAME} PRIVATE + ${CUDA_DIR}/../../../ +) diff --git a/source/adapters/level_zero/CMakeLists.txt b/source/adapters/level_zero/CMakeLists.txt index 6bfd35084d..e1d483cdd4 100644 --- a/source/adapters/level_zero/CMakeLists.txt +++ b/source/adapters/level_zero/CMakeLists.txt @@ -3,11 +3,7 @@ # See LICENSE.TXT # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -set(SYCL_L0_DIR "${CMAKE_CURRENT_SOURCE_DIR}/external") -set(L0_DIR "${SYCL_L0_DIR}/sycl/plugins/unified_runtime/ur/adapters/level_zero") - -# fetch xpti proxy library for the tracing layer -FetchSource(https://github.com/intel/llvm.git sycl-nightly/20230628 "sycl/plugins/unified_runtime/ur" ${SYCL_L0_DIR}) +set(L0_DIR "${SYCL_ADAPTER_DIR}/sycl/plugins/unified_runtime/ur/adapters/level_zero") set(TARGET_NAME ur_adapter_level_zero) From 44e5ed2a526ef113bba4b391b4be36669a312264 Mon Sep 17 00:00:00 2001 From: pbalcer Date: Wed, 5 Jul 2023 13:38:25 +0200 Subject: [PATCH 05/16] suppress git warnings about applying cuda patch twice --- source/adapters/CMakeLists.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/source/adapters/CMakeLists.txt b/source/adapters/CMakeLists.txt index fef3d99888..d9d4d29156 100644 --- a/source/adapters/CMakeLists.txt +++ b/source/adapters/CMakeLists.txt @@ -10,7 +10,8 @@ if(UR_BUILD_ADAPTER_L0 OR UR_BUILD_ADAPTER_CUDA) # fetch adapter sources from SYCL set(SYCL_ADAPTER_DIR "${CMAKE_CURRENT_SOURCE_DIR}/external") 
FetchSource(https://github.com/intel/llvm.git sycl-nightly/20230628 "sycl/plugins/unified_runtime/ur" ${SYCL_ADAPTER_DIR}) - execute_process(COMMAND git apply ../0001-SYCL-CUDA-remove-sycl-namespaces-from-ur-adapter.patch + + execute_process(COMMAND git apply --quiet ../0001-SYCL-CUDA-remove-sycl-namespaces-from-ur-adapter.patch WORKING_DIRECTORY ${SYCL_ADAPTER_DIR}) endif() From 2b50550aee0e2a8a5319a6159bbc2705328e372b Mon Sep 17 00:00:00 2001 From: Petr Vesely Date: Wed, 12 Jul 2023 11:57:37 +0100 Subject: [PATCH 06/16] [UR] update adapter to use newer sycl plugin version --- cmake/helpers.cmake | 4 +- ...move-sycl-namespaces-from-ur-adapter.patch | 390 +++++++++++++----- source/adapters/CMakeLists.txt | 9 +- 3 files changed, 301 insertions(+), 102 deletions(-) diff --git a/cmake/helpers.cmake b/cmake/helpers.cmake index a4a3a70e78..0892fecdb4 100644 --- a/cmake/helpers.cmake +++ b/cmake/helpers.cmake @@ -63,7 +63,9 @@ function(FetchSource GIT_REPOSITORY GIT_TAG GIT_DIR DEST) message(STATUS "Fetching sparse source ${GIT_DIR} from ${GIT_REPOSITORY} ${GIT_TAG}") IF(NOT EXISTS ${DEST}) file(MAKE_DIRECTORY ${DEST}) - execute_process(COMMAND git init -b main + execute_process(COMMAND git init + WORKING_DIRECTORY ${DEST}) + execute_process(COMMAND git checkout -b main WORKING_DIRECTORY ${DEST}) execute_process(COMMAND git remote add origin ${GIT_REPOSITORY} WORKING_DIRECTORY ${DEST}) diff --git a/source/adapters/0001-SYCL-CUDA-remove-sycl-namespaces-from-ur-adapter.patch b/source/adapters/0001-SYCL-CUDA-remove-sycl-namespaces-from-ur-adapter.patch index e7214e94b2..8153e1cb85 100644 --- a/source/adapters/0001-SYCL-CUDA-remove-sycl-namespaces-from-ur-adapter.patch +++ b/source/adapters/0001-SYCL-CUDA-remove-sycl-namespaces-from-ur-adapter.patch @@ -1,23 +1,174 @@ -From e30434a4a9e893f24e0bb18051576f297d1f4f08 Mon Sep 17 00:00:00 2001 +From fd78871a6bd2ff41ff37b8bd786c17f59911c677 Mon Sep 17 00:00:00 2001 From: pbalcer -Date: Thu, 29 Jun 2023 14:26:26 +0200 -Subject: 
[PATCH] [SYCL][CUDA] remove sycl namespaces from ur adapter +Date: Thu, 29 Jun 2023 15:11:43 +0200 +Subject: [PATCH] [SYCL][CUDA] remove sycl dependencies from cuda ur adapter +This was preventing out-of-tree build of the adapter for standalone +use with unified runtime. + +Signed-off-by: Piotr Balcer --- + .../ur/adapters/cuda/command_buffer.cpp | 52 ++--- .../ur/adapters/cuda/common.cpp | 6 +- .../ur/adapters/cuda/common.hpp | 5 - .../ur/adapters/cuda/context.cpp | 2 +- - .../ur/adapters/cuda/device.cpp | 170 +++++++++--------- + .../ur/adapters/cuda/device.cpp | 209 +++++++++--------- .../ur/adapters/cuda/enqueue.cpp | 2 +- - .../ur/adapters/cuda/event.cpp | 12 +- - .../ur/adapters/cuda/kernel.cpp | 26 +-- - .../ur/adapters/cuda/memory.cpp | 4 +- + .../ur/adapters/cuda/event.cpp | 17 +- + .../ur/adapters/cuda/kernel.cpp | 42 ++-- + .../ur/adapters/cuda/memory.cpp | 5 +- .../ur/adapters/cuda/queue.cpp | 2 +- .../ur/adapters/cuda/sampler.cpp | 2 +- - 10 files changed, 113 insertions(+), 118 deletions(-) + 11 files changed, 167 insertions(+), 177 deletions(-) +diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/command_buffer.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/command_buffer.cpp +index c83e9e732303..57956cb64a67 100644 +--- a/sycl/plugins/unified_runtime/ur/adapters/cuda/command_buffer.cpp ++++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/command_buffer.cpp +@@ -19,8 +19,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferCreateExp( + (void)hDevice; + (void)pCommandBufferDesc; + (void)phCommandBuffer; +- sycl::detail::ur::die("Experimental Command-buffer feature is not " +- "implemented for CUDA adapter."); ++ detail::ur::die("Experimental Command-buffer feature is not " ++ "implemented for CUDA adapter."); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + +@@ -28,8 +28,8 @@ UR_APIEXPORT ur_result_t UR_APICALL + urCommandBufferRetainExp(ur_exp_command_buffer_handle_t hCommandBuffer) { + (void)hCommandBuffer; + +- 
sycl::detail::ur::die("Experimental Command-buffer feature is not " +- "implemented for CUDA adapter."); ++ detail::ur::die("Experimental Command-buffer feature is not " ++ "implemented for CUDA adapter."); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + +@@ -37,8 +37,8 @@ UR_APIEXPORT ur_result_t UR_APICALL + urCommandBufferReleaseExp(ur_exp_command_buffer_handle_t hCommandBuffer) { + (void)hCommandBuffer; + +- sycl::detail::ur::die("Experimental Command-buffer feature is not " +- "implemented for CUDA adapter."); ++ detail::ur::die("Experimental Command-buffer feature is not " ++ "implemented for CUDA adapter."); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + +@@ -46,8 +46,8 @@ UR_APIEXPORT ur_result_t UR_APICALL + urCommandBufferFinalizeExp(ur_exp_command_buffer_handle_t hCommandBuffer) { + (void)hCommandBuffer; + +- sycl::detail::ur::die("Experimental Command-buffer feature is not " +- "implemented for CUDA adapter."); ++ detail::ur::die("Experimental Command-buffer feature is not " ++ "implemented for CUDA adapter."); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + +@@ -68,8 +68,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( + (void)pSyncPointWaitList; + (void)pSyncPoint; + +- sycl::detail::ur::die("Experimental Command-buffer feature is not " +- "implemented for CUDA adapter."); ++ detail::ur::die("Experimental Command-buffer feature is not " ++ "implemented for CUDA adapter."); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + +@@ -86,8 +86,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemcpyUSMExp( + (void)pSyncPointWaitList; + (void)pSyncPoint; + +- sycl::detail::ur::die("Experimental Command-buffer feature is not " +- "implemented for CUDA adapter."); ++ detail::ur::die("Experimental Command-buffer feature is not " ++ "implemented for CUDA adapter."); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + +@@ -107,8 +107,8 @@ UR_APIEXPORT ur_result_t UR_APICALL 
urCommandBufferAppendMembufferCopyExp( + (void)pSyncPointWaitList; + (void)pSyncPoint; + +- sycl::detail::ur::die("Experimental Command-buffer feature is not " +- "implemented for CUDA adapter."); ++ detail::ur::die("Experimental Command-buffer feature is not " ++ "implemented for CUDA adapter."); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + +@@ -134,8 +134,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMembufferCopyRectExp( + (void)pSyncPointWaitList; + (void)pSyncPoint; + +- sycl::detail::ur::die("Experimental Command-buffer feature is not " +- "implemented for CUDA adapter."); ++ detail::ur::die("Experimental Command-buffer feature is not " ++ "implemented for CUDA adapter."); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + +@@ -155,8 +155,8 @@ ur_result_t UR_APICALL urCommandBufferAppendMembufferWriteExp( + (void)pSyncPointWaitList; + (void)pSyncPoint; + +- sycl::detail::ur::die("Experimental Command-buffer feature is not " +- "implemented for CUDA adapter."); ++ detail::ur::die("Experimental Command-buffer feature is not " ++ "implemented for CUDA adapter."); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + +@@ -175,8 +175,8 @@ ur_result_t UR_APICALL urCommandBufferAppendMembufferReadExp( + (void)pSyncPointWaitList; + (void)pSyncPoint; + +- sycl::detail::ur::die("Experimental Command-buffer feature is not " +- "implemented for CUDA adapter."); ++ detail::ur::die("Experimental Command-buffer feature is not " ++ "implemented for CUDA adapter."); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + +@@ -203,8 +203,8 @@ ur_result_t UR_APICALL urCommandBufferAppendMembufferWriteRectExp( + (void)pSyncPointWaitList; + (void)pSyncPoint; + +- sycl::detail::ur::die("Experimental Command-buffer feature is not " +- "implemented for CUDA adapter."); ++ detail::ur::die("Experimental Command-buffer feature is not " ++ "implemented for CUDA adapter."); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + +@@ -232,8 +232,8 @@ ur_result_t UR_APICALL 
urCommandBufferAppendMembufferReadRectExp( + (void)pSyncPointWaitList; + (void)pSyncPoint; + +- sycl::detail::ur::die("Experimental Command-buffer feature is not " +- "implemented for CUDA adapter."); ++ detail::ur::die("Experimental Command-buffer feature is not " ++ "implemented for CUDA adapter."); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + +@@ -247,7 +247,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp( + (void)phEventWaitList; + (void)phEvent; + +- sycl::detail::ur::die("Experimental Command-buffer feature is not " +- "implemented for CUDA adapter."); ++ detail::ur::die("Experimental Command-buffer feature is not " ++ "implemented for CUDA adapter."); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/common.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/common.cpp -index 86975e509..83264160e 100644 +index 86975e509725..83264160e700 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/common.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/common.cpp @@ -72,17 +72,17 @@ std::string getCudaVersionString() { @@ -42,7 +193,7 @@ index 86975e509..83264160e 100644 } diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/common.hpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/common.hpp -index 5cfa60901..82b38c10d 100644 +index 5cfa609018b2..82b38c10d449 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/common.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/common.hpp @@ -8,7 +8,6 @@ @@ -69,7 +220,7 @@ index 5cfa60901..82b38c10d 100644 -} // __SYCL_INLINE_VER_NAMESPACE(_V1) -} // namespace sycl diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/context.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/context.cpp -index 74a32bdac..2b621383d 100644 +index 74a32bdac274..2b621383da09 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/context.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/context.cpp @@ -66,7 +66,7 @@ 
UR_APIEXPORT ur_result_t UR_APICALL urContextGetInfo( @@ -82,7 +233,7 @@ index 74a32bdac..2b621383d 100644 CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, hContext->getDevice()->get()) == CUDA_SUCCESS); diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp -index 24f9d52a0..c6b6bc07e 100644 +index 52d4e3badc8f..a81599d629a7 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp @@ -15,7 +15,7 @@ @@ -177,7 +328,7 @@ index 24f9d52a0..c6b6bc07e 100644 return ReturnValue(size_t(MaxWorkGroupSize)); } -@@ -172,12 +172,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, +@@ -172,14 +172,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, case UR_DEVICE_INFO_MAX_NUM_SUB_GROUPS: { // Number of sub-groups = max block size / warp size + possible remainder int MaxThreads = 0; @@ -188,10 +339,14 @@ index 24f9d52a0..c6b6bc07e 100644 hDevice->get()) == CUDA_SUCCESS); int WarpSize = 0; - sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, - hDevice->get()) == CUDA_SUCCESS); +- cuDeviceGetAttribute(&WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, +- hDevice->get()) == CUDA_SUCCESS); ++ detail::ur::assertion(cuDeviceGetAttribute(&WarpSize, ++ CU_DEVICE_ATTRIBUTE_WARP_SIZE, ++ hDevice->get()) == CUDA_SUCCESS); int MaxWarps = (MaxThreads + WarpSize - 1) / WarpSize; + return ReturnValue(MaxWarps); + } @@ -187,7 +187,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, // Volta provides independent thread scheduling // TODO: Revisit for previous generation GPUs @@ -228,37 +383,43 @@ index 24f9d52a0..c6b6bc07e 100644 cuDeviceGetAttribute(&Major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, hDevice->get()) == CUDA_SUCCESS); -@@ -266,7 +266,7 @@ UR_APIEXPORT ur_result_t UR_APICALL 
urDeviceGetInfo(ur_device_handle_t hDevice, +@@ -266,18 +266,18 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, case UR_DEVICE_INFO_SUB_GROUP_SIZES_INTEL: { // NVIDIA devices only support one sub-group size (the warp size) int WarpSize = 0; - sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, - hDevice->get()) == CUDA_SUCCESS); +- cuDeviceGetAttribute(&WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, +- hDevice->get()) == CUDA_SUCCESS); ++ detail::ur::assertion(cuDeviceGetAttribute(&WarpSize, ++ CU_DEVICE_ATTRIBUTE_WARP_SIZE, ++ hDevice->get()) == CUDA_SUCCESS); size_t Sizes[1] = {static_cast(WarpSize)}; -@@ -274,10 +274,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, + return ReturnValue(Sizes, 1); } case UR_DEVICE_INFO_MAX_CLOCK_FREQUENCY: { int ClockFreq = 0; - sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&ClockFreq, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, - hDevice->get()) == CUDA_SUCCESS); +- cuDeviceGetAttribute(&ClockFreq, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, +- hDevice->get()) == CUDA_SUCCESS); - sycl::detail::ur::assertion(ClockFreq >= 0); ++ detail::ur::assertion(cuDeviceGetAttribute(&ClockFreq, ++ CU_DEVICE_ATTRIBUTE_CLOCK_RATE, ++ hDevice->get()) == CUDA_SUCCESS); + detail::ur::assertion(ClockFreq >= 0); return ReturnValue(static_cast(ClockFreq) / 1000u); } case UR_DEVICE_INFO_ADDRESS_BITS: { -@@ -292,7 +292,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, +@@ -292,8 +292,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, // CL_DEVICE_TYPE_CUSTOM. 
size_t Global = 0; - sycl::detail::ur::assertion(cuDeviceTotalMem(&Global, hDevice->get()) == +- CUDA_SUCCESS); + detail::ur::assertion(cuDeviceTotalMem(&Global, hDevice->get()) == - CUDA_SUCCESS); ++ CUDA_SUCCESS); auto QuarterGlobal = static_cast(Global / 4u); + @@ -308,7 +308,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, if (std::getenv("SYCL_PI_CUDA_ENABLE_IMAGE_SUPPORT") != nullptr) { Enabled = true; @@ -426,8 +587,9 @@ index 24f9d52a0..c6b6bc07e 100644 size_t Bytes = 0; // Runtime API has easy access to this value, driver API info is scarse. - sycl::detail::ur::assertion(cuDeviceTotalMem(&Bytes, hDevice->get()) == +- CUDA_SUCCESS); + detail::ur::assertion(cuDeviceTotalMem(&Bytes, hDevice->get()) == - CUDA_SUCCESS); ++ CUDA_SUCCESS); return ReturnValue(uint64_t{Bytes}); } case UR_DEVICE_INFO_MAX_CONSTANT_BUFFER_SIZE: { @@ -458,9 +620,11 @@ index 24f9d52a0..c6b6bc07e 100644 case UR_DEVICE_INFO_ERROR_CORRECTION_SUPPORT: { int ECCEnabled = 0; - sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&ECCEnabled, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, - hDevice->get()) == CUDA_SUCCESS); +- cuDeviceGetAttribute(&ECCEnabled, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, +- hDevice->get()) == CUDA_SUCCESS); ++ detail::ur::assertion(cuDeviceGetAttribute(&ECCEnabled, ++ CU_DEVICE_ATTRIBUTE_ECC_ENABLED, ++ hDevice->get()) == CUDA_SUCCESS); - sycl::detail::ur::assertion((ECCEnabled == 0) | (ECCEnabled == 1)); + detail::ur::assertion((ECCEnabled == 0) | (ECCEnabled == 1)); @@ -470,25 +634,30 @@ index 24f9d52a0..c6b6bc07e 100644 case UR_DEVICE_INFO_HOST_UNIFIED_MEMORY: { int IsIntegrated = 0; - sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&IsIntegrated, CU_DEVICE_ATTRIBUTE_INTEGRATED, - hDevice->get()) == CUDA_SUCCESS); +- cuDeviceGetAttribute(&IsIntegrated, CU_DEVICE_ATTRIBUTE_INTEGRATED, +- hDevice->get()) == CUDA_SUCCESS); ++ detail::ur::assertion(cuDeviceGetAttribute(&IsIntegrated, ++ 
CU_DEVICE_ATTRIBUTE_INTEGRATED, ++ hDevice->get()) == CUDA_SUCCESS); - sycl::detail::ur::assertion((IsIntegrated == 0) | (IsIntegrated == 1)); + detail::ur::assertion((IsIntegrated == 0) | (IsIntegrated == 1)); auto result = static_cast(IsIntegrated); return ReturnValue(result); } -@@ -620,7 +620,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, +@@ -620,9 +620,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, case UR_DEVICE_INFO_NAME: { static constexpr size_t MaxDeviceNameLength = 256u; char Name[MaxDeviceNameLength]; - sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetName(Name, MaxDeviceNameLength, hDevice->get()) == - CUDA_SUCCESS); +- cuDeviceGetName(Name, MaxDeviceNameLength, hDevice->get()) == +- CUDA_SUCCESS); ++ detail::ur::assertion(cuDeviceGetName(Name, MaxDeviceNameLength, ++ hDevice->get()) == CUDA_SUCCESS); return ReturnValue(Name, strlen(Name) + 1); -@@ -641,13 +641,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, + } + case UR_DEVICE_INFO_VENDOR: { +@@ -641,13 +640,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, case UR_DEVICE_INFO_VERSION: { std::stringstream SS; int Major; @@ -504,7 +673,7 @@ index 24f9d52a0..c6b6bc07e 100644 cuDeviceGetAttribute(&Minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, hDevice->get()) == CUDA_SUCCESS); -@@ -666,11 +666,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, +@@ -666,11 +665,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, int Major = 0; int Minor = 0; @@ -518,14 +687,16 @@ index 24f9d52a0..c6b6bc07e 100644 cuDeviceGetAttribute(&Minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, hDevice->get()) == CUDA_SUCCESS); -@@ -847,27 +847,27 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, +@@ -847,27 +846,27 @@ UR_APIEXPORT ur_result_t UR_APICALL 
urDeviceGetInfo(ur_device_handle_t hDevice, case UR_DEVICE_INFO_GLOBAL_MEM_FREE: { size_t FreeMemory = 0; size_t TotalMemory = 0; - sycl::detail::ur::assertion(cuMemGetInfo(&FreeMemory, &TotalMemory) == +- CUDA_SUCCESS, +- "failed cuMemGetInfo() API."); + detail::ur::assertion(cuMemGetInfo(&FreeMemory, &TotalMemory) == - CUDA_SUCCESS, - "failed cuMemGetInfo() API."); ++ CUDA_SUCCESS, ++ "failed cuMemGetInfo() API."); return ReturnValue(FreeMemory); } case UR_DEVICE_INFO_MEMORY_CLOCK_RATE: { @@ -551,7 +722,7 @@ index 24f9d52a0..c6b6bc07e 100644 return ReturnValue(Value); } case UR_DEVICE_INFO_MAX_COMPUTE_QUEUE_INDICES: { -@@ -875,10 +875,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, +@@ -875,20 +874,20 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, } case UR_DEVICE_INFO_DEVICE_ID: { int Value = 0; @@ -564,20 +735,21 @@ index 24f9d52a0..c6b6bc07e 100644 return ReturnValue(Value); } case UR_DEVICE_INFO_UUID: { -@@ -888,10 +888,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, - int Minor = DriverVersion % 1000 / 10; CUuuid UUID; - if ((Major > 11) || (Major == 11 && Minor >= 4)) { -- sycl::detail::ur::assertion(cuDeviceGetUuid_v2(&UUID, hDevice->get()) == -+ detail::ur::assertion(cuDeviceGetUuid_v2(&UUID, hDevice->get()) == - CUDA_SUCCESS); - } else { -- sycl::detail::ur::assertion(cuDeviceGetUuid(&UUID, hDevice->get()) == -+ detail::ur::assertion(cuDeviceGetUuid(&UUID, hDevice->get()) == - CUDA_SUCCESS); - } + #if (CUDA_VERSION >= 11040) +- sycl::detail::ur::assertion(cuDeviceGetUuid_v2(&UUID, hDevice->get()) == +- CUDA_SUCCESS); ++ detail::ur::assertion(cuDeviceGetUuid_v2(&UUID, hDevice->get()) == ++ CUDA_SUCCESS); + #else +- sycl::detail::ur::assertion(cuDeviceGetUuid(&UUID, hDevice->get()) == +- CUDA_SUCCESS); ++ detail::ur::assertion(cuDeviceGetUuid(&UUID, hDevice->get()) == ++ CUDA_SUCCESS); + #endif std::array Name; -@@ -900,13 +900,13 @@ UR_APIEXPORT 
ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, + std::copy(UUID.bytes, UUID.bytes + 16, Name.begin()); +@@ -896,13 +895,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, } case UR_DEVICE_INFO_MAX_MEMORY_BANDWIDTH: { int Major = 0; @@ -593,7 +765,7 @@ index 24f9d52a0..c6b6bc07e 100644 cuDeviceGetAttribute(&Minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, hDevice->get()) == CUDA_SUCCESS); -@@ -922,7 +922,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, +@@ -918,7 +917,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, } else if (IsOrinAGX) { MemoryClockKHz = 3200000; } else { @@ -602,7 +774,7 @@ index 24f9d52a0..c6b6bc07e 100644 cuDeviceGetAttribute(&MemoryClockKHz, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, hDevice->get()) == CUDA_SUCCESS); -@@ -932,7 +932,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, +@@ -928,7 +927,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, if (IsOrinAGX) { MemoryBusWidth = 256; } else { @@ -611,7 +783,7 @@ index 24f9d52a0..c6b6bc07e 100644 cuDeviceGetAttribute(&MemoryBusWidth, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, hDevice->get()) == CUDA_SUCCESS); -@@ -977,7 +977,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, +@@ -973,7 +972,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, &MaxRegisters, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, hDevice->get())); @@ -620,25 +792,27 @@ index 24f9d52a0..c6b6bc07e 100644 return ReturnValue(static_cast(MaxRegisters)); } -@@ -988,11 +988,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, +@@ -984,12 +983,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, case UR_DEVICE_INFO_PCI_ADDRESS: { constexpr size_t AddressBufferSize = 13; char 
AddressBuffer[AddressBufferSize]; - sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetPCIBusId(AddressBuffer, AddressBufferSize, hDevice->get()) == - CUDA_SUCCESS); +- cuDeviceGetPCIBusId(AddressBuffer, AddressBufferSize, hDevice->get()) == +- CUDA_SUCCESS); ++ detail::ur::assertion(cuDeviceGetPCIBusId(AddressBuffer, AddressBufferSize, ++ hDevice->get()) == CUDA_SUCCESS); // CUDA API (8.x - 12.1) guarantees 12 bytes + \0 are written - sycl::detail::ur::assertion(strnlen(AddressBuffer, AddressBufferSize) == -+ detail::ur::assertion(strnlen(AddressBuffer, AddressBufferSize) == - 12); +- 12); ++ detail::ur::assertion(strnlen(AddressBuffer, AddressBufferSize) == 12); return ReturnValue(AddressBuffer, strnlen(AddressBuffer, AddressBufferSize - 1) + 1); + } diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp -index 52c4c3895..55c56aee2 100644 +index 1cfc5cc40a4a..792f69092682 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp -@@ -806,7 +806,7 @@ static size_t imageElementByteSize(CUDA_ARRAY_DESCRIPTOR ArrayDesc) { +@@ -794,7 +794,7 @@ static size_t imageElementByteSize(CUDA_ARRAY_DESCRIPTOR ArrayDesc) { case CU_AD_FORMAT_FLOAT: return 4; default: @@ -648,7 +822,7 @@ index 52c4c3895..55c56aee2 100644 } } diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/event.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/event.cpp -index 8916197b7..9d86189b9 100644 +index 8916197b73f1..066c0498f1d0 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/event.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/event.cpp @@ -119,7 +119,7 @@ ur_result_t ur_event_handle_t_::record() { @@ -687,26 +861,31 @@ index 8916197b7..9d86189b9 100644 return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } -@@ -254,7 +254,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventRetain(ur_event_handle_t hEvent) { +@@ 
-254,8 +254,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventRetain(ur_event_handle_t hEvent) { const auto RefCount = hEvent->incrementReferenceCount(); - sycl::detail::ur::assertion( -+ detail::ur::assertion( - RefCount != 0, "Reference count overflow detected in urEventRetain."); +- RefCount != 0, "Reference count overflow detected in urEventRetain."); ++ detail::ur::assertion(RefCount != 0, ++ "Reference count overflow detected in urEventRetain."); return UR_RESULT_SUCCESS; -@@ -265,7 +265,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventRelease(ur_event_handle_t hEvent) { + } +@@ -265,9 +265,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventRelease(ur_event_handle_t hEvent) { // double delete or someone is messing with the ref count. // either way, cannot safely proceed. - sycl::detail::ur::assertion( -+ detail::ur::assertion( - hEvent->getReferenceCount() != 0, - "Reference count overflow detected in urEventRelease."); +- hEvent->getReferenceCount() != 0, +- "Reference count overflow detected in urEventRelease."); ++ detail::ur::assertion(hEvent->getReferenceCount() != 0, ++ "Reference count overflow detected in urEventRelease."); + // decrement ref count. If it is 0, delete the event. 
+ if (hEvent->decrementReferenceCount() == 0) { diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.cpp -index 358f59c49..cae080401 100644 +index 358f59c499e1..7d46ce039bab 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.cpp @@ -73,24 +73,24 @@ urKernelGetGroupInfo(ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, @@ -758,43 +937,55 @@ index 358f59c49..cae080401 100644 cuFuncGetAttribute(&Bytes, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, hKernel->get()) == CUDA_SUCCESS); return ReturnValue(uint64_t(Bytes)); -@@ -130,7 +130,7 @@ urKernelGetGroupInfo(ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, +@@ -130,17 +130,17 @@ urKernelGetGroupInfo(ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, case UR_KERNEL_GROUP_INFO_PREFERRED_WORK_GROUP_SIZE_MULTIPLE: { // Work groups should be multiples of the warp size int WarpSize = 0; - sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, - hDevice->get()) == CUDA_SUCCESS); +- cuDeviceGetAttribute(&WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, +- hDevice->get()) == CUDA_SUCCESS); ++ detail::ur::assertion(cuDeviceGetAttribute(&WarpSize, ++ CU_DEVICE_ATTRIBUTE_WARP_SIZE, ++ hDevice->get()) == CUDA_SUCCESS); return ReturnValue(static_cast(WarpSize)); -@@ -138,7 +138,7 @@ urKernelGetGroupInfo(ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, + } case UR_KERNEL_GROUP_INFO_PRIVATE_MEM_SIZE: { // OpenCL PRIVATE == CUDA LOCAL int Bytes = 0; - sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuFuncGetAttribute(&Bytes, CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES, - hKernel->get()) == CUDA_SUCCESS); +- cuFuncGetAttribute(&Bytes, CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES, +- hKernel->get()) == CUDA_SUCCESS); ++ detail::ur::assertion(cuFuncGetAttribute(&Bytes, ++ CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES, ++ hKernel->get()) == 
CUDA_SUCCESS); return ReturnValue(uint64_t(Bytes)); -@@ -231,7 +231,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelGetInfo(ur_kernel_handle_t hKernel, + } + default: +@@ -231,9 +231,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelGetInfo(ur_kernel_handle_t hKernel, return ReturnValue(""); case UR_KERNEL_INFO_NUM_REGS: { int NumRegs = 0; - sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuFuncGetAttribute(&NumRegs, CU_FUNC_ATTRIBUTE_NUM_REGS, - hKernel->get()) == CUDA_SUCCESS); +- cuFuncGetAttribute(&NumRegs, CU_FUNC_ATTRIBUTE_NUM_REGS, +- hKernel->get()) == CUDA_SUCCESS); ++ detail::ur::assertion(cuFuncGetAttribute(&NumRegs, ++ CU_FUNC_ATTRIBUTE_NUM_REGS, ++ hKernel->get()) == CUDA_SUCCESS); return ReturnValue(static_cast(NumRegs)); -@@ -254,7 +254,7 @@ urKernelGetSubGroupInfo(ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, + } + default: +@@ -254,15 +254,15 @@ urKernelGetSubGroupInfo(ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, case UR_KERNEL_SUB_GROUP_INFO_MAX_SUB_GROUP_SIZE: { // Sub-group size is equivalent to warp size int WarpSize = 0; - sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, - hDevice->get()) == CUDA_SUCCESS); +- cuDeviceGetAttribute(&WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, +- hDevice->get()) == CUDA_SUCCESS); ++ detail::ur::assertion(cuDeviceGetAttribute(&WarpSize, ++ CU_DEVICE_ATTRIBUTE_WARP_SIZE, ++ hDevice->get()) == CUDA_SUCCESS); return ReturnValue(static_cast(WarpSize)); -@@ -262,7 +262,7 @@ urKernelGetSubGroupInfo(ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, + } case UR_KERNEL_SUB_GROUP_INFO_MAX_NUM_SUB_GROUPS: { // Number of sub-groups = max block size / warp size + possible remainder int MaxThreads = 0; @@ -804,19 +995,20 @@ index 358f59c49..cae080401 100644 hKernel->get()) == CUDA_SUCCESS); int WarpSize = 0; diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/memory.cpp 
b/sycl/plugins/unified_runtime/ur/adapters/cuda/memory.cpp -index b19acea31..ecf840330 100644 +index b19acea3159f..f0c276579476 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/memory.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/memory.cpp -@@ -162,7 +162,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemRelease(ur_mem_handle_t hMem) { +@@ -162,8 +162,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemRelease(ur_mem_handle_t hMem) { // error for which it is unclear if the function that reported it succeeded // or not. Either way, the state of the program is compromised and likely // unrecoverable. - sycl::detail::ur::die( -+ detail::ur::die( - "Unrecoverable program state reached in urMemRelease"); +- "Unrecoverable program state reached in urMemRelease"); ++ detail::ur::die("Unrecoverable program state reached in urMemRelease"); } -@@ -331,7 +331,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemImageCreate( + return UR_RESULT_SUCCESS; +@@ -331,7 +330,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemImageCreate( PixelTypeSizeBytes = 4; break; default: @@ -826,7 +1018,7 @@ index b19acea31..ecf840330 100644 } diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.cpp -index 05443eeed..32391fec5 100644 +index 05443eeed89d..32391fec5c13 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.cpp @@ -265,7 +265,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueCreateWithNativeHandle( @@ -839,7 +1031,7 @@ index 05443eeed..32391fec5 100644 std::vector ComputeCuStreams(1, CuStream); std::vector TransferCuStreams(0); diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/sampler.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/sampler.cpp -index 36ec89fb9..836e47f98 100644 +index 36ec89fb9da3..836e47f988e5 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/sampler.cpp +++ 
b/sycl/plugins/unified_runtime/ur/adapters/cuda/sampler.cpp @@ -73,7 +73,7 @@ urSamplerRelease(ur_sampler_handle_t hSampler) { diff --git a/source/adapters/CMakeLists.txt b/source/adapters/CMakeLists.txt index d9d4d29156..9a817f4abe 100644 --- a/source/adapters/CMakeLists.txt +++ b/source/adapters/CMakeLists.txt @@ -9,9 +9,14 @@ add_subdirectory(null) if(UR_BUILD_ADAPTER_L0 OR UR_BUILD_ADAPTER_CUDA) # fetch adapter sources from SYCL set(SYCL_ADAPTER_DIR "${CMAKE_CURRENT_SOURCE_DIR}/external") - FetchSource(https://github.com/intel/llvm.git sycl-nightly/20230628 "sycl/plugins/unified_runtime/ur" ${SYCL_ADAPTER_DIR}) + FetchSource(https://github.com/intel/llvm.git sycl-nightly/20230706 "sycl/plugins/unified_runtime/ur" ${SYCL_ADAPTER_DIR}) - execute_process(COMMAND git apply --quiet ../0001-SYCL-CUDA-remove-sycl-namespaces-from-ur-adapter.patch + get_program_version_major_minor(git GIT_VERSION) + set(GIT_QUIET_OPTION "") + if(GIT_VERSION VERSION_GREATER_EQUAL "3.35.0") + set(GIT_QUIET_OPTION "--quiet") + endif() + execute_process(COMMAND git apply ${GIT_QUIET_OPTION} ../0001-SYCL-CUDA-remove-sycl-namespaces-from-ur-adapter.patch WORKING_DIRECTORY ${SYCL_ADAPTER_DIR}) endif() From 7c2db62b18ea170fcf13fbca1cdc1ded92a81c11 Mon Sep 17 00:00:00 2001 From: Petr Vesely Date: Wed, 12 Jul 2023 12:09:19 +0100 Subject: [PATCH 07/16] [UR] Fix L0 plugin file names --- source/adapters/level_zero/CMakeLists.txt | 46 +++++++++++------------ 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/source/adapters/level_zero/CMakeLists.txt b/source/adapters/level_zero/CMakeLists.txt index e1d483cdd4..e9b4aaa547 100644 --- a/source/adapters/level_zero/CMakeLists.txt +++ b/source/adapters/level_zero/CMakeLists.txt @@ -73,30 +73,30 @@ target_include_directories(LevelZeroLoader-Headers add_library(${TARGET_NAME} SHARED - ${L0_DIR}/ur_loader_interface.cpp - ${L0_DIR}/ur_level_zero_common.hpp - ${L0_DIR}/ur_level_zero_context.hpp - ${L0_DIR}/ur_level_zero_device.hpp - 
${L0_DIR}/ur_level_zero_event.hpp - ${L0_DIR}/ur_level_zero_usm.hpp - ${L0_DIR}/ur_level_zero_mem.hpp - ${L0_DIR}/ur_level_zero_kernel.hpp - ${L0_DIR}/ur_level_zero_platform.hpp - ${L0_DIR}/ur_level_zero_program.hpp - ${L0_DIR}/ur_level_zero_queue.hpp - ${L0_DIR}/ur_level_zero_sampler.hpp + ${L0_DIR}/ur_interface_loader.cpp + ${L0_DIR}/common.hpp + ${L0_DIR}/context.hpp + ${L0_DIR}/device.hpp + ${L0_DIR}/event.hpp + ${L0_DIR}/usm.hpp + ${L0_DIR}/memory.hpp + ${L0_DIR}/kernel.hpp + ${L0_DIR}/platform.hpp + ${L0_DIR}/program.hpp + ${L0_DIR}/queue.hpp + ${L0_DIR}/sampler.hpp ${L0_DIR}/ur_level_zero.cpp - ${L0_DIR}/ur_level_zero_common.cpp - ${L0_DIR}/ur_level_zero_context.cpp - ${L0_DIR}/ur_level_zero_device.cpp - ${L0_DIR}/ur_level_zero_event.cpp - ${L0_DIR}/ur_level_zero_usm.cpp - ${L0_DIR}/ur_level_zero_mem.cpp - ${L0_DIR}/ur_level_zero_kernel.cpp - ${L0_DIR}/ur_level_zero_platform.cpp - ${L0_DIR}/ur_level_zero_program.cpp - ${L0_DIR}/ur_level_zero_queue.cpp - ${L0_DIR}/ur_level_zero_sampler.cpp + ${L0_DIR}/common.cpp + ${L0_DIR}/context.cpp + ${L0_DIR}/device.cpp + ${L0_DIR}/event.cpp + ${L0_DIR}/usm.cpp + ${L0_DIR}/memory.cpp + ${L0_DIR}/kernel.cpp + ${L0_DIR}/platform.cpp + ${L0_DIR}/program.cpp + ${L0_DIR}/queue.cpp + ${L0_DIR}/sampler.cpp ${L0_DIR}/../../ur.cpp ${L0_DIR}/../../usm_allocator.cpp ${L0_DIR}/../../usm_allocator.hpp From 43f421691c8d7b2e5cb3466498e09428122ab885 Mon Sep 17 00:00:00 2001 From: Petr Vesely Date: Wed, 12 Jul 2023 13:25:16 +0100 Subject: [PATCH 08/16] [UR] Bump L0 Loader version --- source/adapters/level_zero/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/adapters/level_zero/CMakeLists.txt b/source/adapters/level_zero/CMakeLists.txt index e9b4aaa547..52d22c3ea6 100644 --- a/source/adapters/level_zero/CMakeLists.txt +++ b/source/adapters/level_zero/CMakeLists.txt @@ -11,7 +11,7 @@ if (NOT DEFINED LEVEL_ZERO_LIBRARY OR NOT DEFINED LEVEL_ZERO_INCLUDE_DIR) message(STATUS "Download Level Zero loader 
and headers from github.com") set(LEVEL_ZERO_LOADER_REPO "https://github.com/oneapi-src/level-zero.git") - set(LEVEL_ZERO_LOADER_TAG v1.8.8) + set(LEVEL_ZERO_LOADER_TAG v1.11.0) # Disable due to a bug https://github.com/oneapi-src/level-zero/issues/104 set(CMAKE_INCLUDE_CURRENT_DIR OFF) From 40c50865883f70d191b1cc891373fdc0d1a7be02 Mon Sep 17 00:00:00 2001 From: Omar Ahmed Date: Fri, 14 Jul 2023 15:14:55 +0100 Subject: [PATCH 09/16] [UR][HIP] Add ur hip target build to ur repo --- CMakeLists.txt | 1 + README.md | 2 + source/adapters/CMakeLists.txt | 12 ++- source/adapters/hip/CMakeLists.txt | 144 +++++++++++++++++++++++++++++ 4 files changed, 155 insertions(+), 4 deletions(-) create mode 100644 source/adapters/hip/CMakeLists.txt diff --git a/CMakeLists.txt b/CMakeLists.txt index bedb617d8c..42f42642a8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -32,6 +32,7 @@ option(VAL_USE_LIBBACKTRACE_BACKTRACE "enable libbacktrace validation backtrace option(UR_BUILD_TOOLS "build ur tools" ON) option(UR_BUILD_ADAPTER_L0 "build level 0 adapter from SYCL" OFF) option(UR_BUILD_ADAPTER_CUDA "build cuda adapter from SYCL" OFF) +option(UR_BUILD_ADAPTER_HIP "build hip adapter from SYCL" OFF) set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) diff --git a/README.md b/README.md index e9ccf4a3a3..a9a1e0cabb 100644 --- a/README.md +++ b/README.md @@ -112,6 +112,8 @@ List of options provided by CMake: | UR_BUILD_TOOLS | Build tools | ON/OFF | ON | | UR_BUILD_ADAPTER_L0 | Fetch and use level-zero adapter from SYCL | ON/OFF | OFF | | UR_BUILD_ADAPTER_CUDA | Fetch and use cuda adapter from SYCL | ON/OFF | OFF | +| UR_BUILD_ADAPTER_HIP | Fetch and use hip adapter from SYCL | ON/OFF | OFF | +| UR_HIP_PLATFORM | Build hip adapter for AMD or NVIDIA platform | AMD/NVIDIA | AMD | **General**: diff --git a/source/adapters/CMakeLists.txt b/source/adapters/CMakeLists.txt index 9a817f4abe..57df48f95f 100644 --- 
a/source/adapters/CMakeLists.txt +++ b/source/adapters/CMakeLists.txt @@ -6,10 +6,10 @@ add_subdirectory(null) -if(UR_BUILD_ADAPTER_L0 OR UR_BUILD_ADAPTER_CUDA) +if(UR_BUILD_ADAPTER_L0 OR UR_BUILD_ADAPTER_CUDA OR UR_BUILD_ADAPTER_HIP) # fetch adapter sources from SYCL set(SYCL_ADAPTER_DIR "${CMAKE_CURRENT_SOURCE_DIR}/external") - FetchSource(https://github.com/intel/llvm.git sycl-nightly/20230706 "sycl/plugins/unified_runtime/ur" ${SYCL_ADAPTER_DIR}) + FetchSource(https://github.com/intel/llvm.git sycl-nightly/20230713 "sycl/plugins/unified_runtime/ur" ${SYCL_ADAPTER_DIR}) get_program_version_major_minor(git GIT_VERSION) set(GIT_QUIET_OPTION "") @@ -21,9 +21,13 @@ if(UR_BUILD_ADAPTER_L0 OR UR_BUILD_ADAPTER_CUDA) endif() if(UR_BUILD_ADAPTER_L0) -add_subdirectory(level_zero) + add_subdirectory(level_zero) endif() if(UR_BUILD_ADAPTER_CUDA) -add_subdirectory(cuda) + add_subdirectory(cuda) +endif() + +if(UR_BUILD_ADAPTER_HIP) + add_subdirectory(hip) endif() diff --git a/source/adapters/hip/CMakeLists.txt b/source/adapters/hip/CMakeLists.txt new file mode 100644 index 0000000000..82ab485119 --- /dev/null +++ b/source/adapters/hip/CMakeLists.txt @@ -0,0 +1,144 @@ +# Copyright (C) 2022 Intel Corporation +# Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. 
+# See LICENSE.TXT +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +set(HIP_DIR "${SYCL_ADAPTER_DIR}/sycl/plugins/unified_runtime/ur/adapters/hip") + +set(TARGET_NAME ur_adapter_hip) + +# Set default UR HIP platform to AMD +set(UR_HIP_PLATFORM "AMD" CACHE STRING "UR HIP platform, AMD or NVIDIA") + +# Set default ROCm installation directory +set(UR_HIP_ROCM_DIR "/opt/rocm" CACHE STRING "ROCm installation dir") + +set(UR_HIP_INCLUDE_DIR "${UR_HIP_ROCM_DIR}/include") + +set(UR_HIP_HSA_INCLUDE_DIR "${UR_HIP_ROCM_DIR}/hsa/include") + +# Set HIP lib dir +set(UR_HIP_LIB_DIR "${UR_HIP_ROCM_DIR}/hip/lib") + +# Check if HIP library path exists (AMD platform only) +if("${UR_HIP_PLATFORM}" STREQUAL "AMD") + if(NOT EXISTS "${UR_HIP_LIB_DIR}") + message(FATAL_ERROR "Couldn't find the HIP library directory at '${UR_HIP_LIB_DIR}'," + " please check ROCm installation.") + endif() + # Check if HIP include path exists + if(NOT EXISTS "${UR_HIP_INCLUDE_DIR}") + message(FATAL_ERROR "Couldn't find the HIP include directory at '${UR_HIP_INCLUDE_DIR}'," + " please check ROCm installation.") + endif() + + # Check if HSA include path exists + if(NOT EXISTS "${UR_HIP_HSA_INCLUDE_DIR}") + message(FATAL_ERROR "Couldn't find the HSA include directory at '${UR_HIP_HSA_INCLUDE_DIR}'," + " please check ROCm installation.") + endif() +endif() + +# Set includes used in added library (rocmdrv) +set(HIP_HEADERS "${UR_HIP_INCLUDE_DIR};${UR_HIP_HSA_INCLUDE_DIR}") + +add_library(${TARGET_NAME} + SHARED + ${HIP_DIR}/ur_interface_loader.cpp + ${HIP_DIR}/common.hpp + ${HIP_DIR}/common.cpp + ${HIP_DIR}/context.hpp + ${HIP_DIR}/context.cpp + ${HIP_DIR}/device.hpp + ${HIP_DIR}/device.cpp + ${HIP_DIR}/enqueue.cpp + ${HIP_DIR}/event.hpp + ${HIP_DIR}/event.cpp + ${HIP_DIR}/kernel.hpp + ${HIP_DIR}/kernel.cpp + ${HIP_DIR}/memory.hpp + ${HIP_DIR}/memory.cpp + ${HIP_DIR}/platform.hpp + ${HIP_DIR}/platform.cpp + ${HIP_DIR}/program.hpp + ${HIP_DIR}/program.cpp + ${HIP_DIR}/queue.hpp + ${HIP_DIR}/queue.cpp 
+ ${HIP_DIR}/sampler.hpp + ${HIP_DIR}/sampler.cpp + ${HIP_DIR}/usm.cpp + ${HIP_DIR}/../../ur.cpp + ${HIP_DIR}/../../ur.hpp + ${HIP_DIR}/../../usm_allocator.cpp + ${HIP_DIR}/../../usm_allocator.hpp + ${HIP_DIR}/../../usm_allocator_config.cpp + ${HIP_DIR}/../../usm_allocator_config.hpp +) + +set_target_properties(${TARGET_NAME} PROPERTIES + VERSION "${PROJECT_VERSION_MAJOR}.${PROJECT_VERSION_MINOR}.${PROJECT_VERSION_PATCH}" + SOVERSION "${PROJECT_VERSION_MAJOR}" +) + +if("${UR_HIP_PLATFORM}" STREQUAL "AMD") + # Import HIP runtime library + add_library(rocmdrv SHARED IMPORTED GLOBAL) + + set_target_properties( + rocmdrv PROPERTIES + IMPORTED_LOCATION "${UR_HIP_LIB_DIR}/libamdhip64.so" + INTERFACE_INCLUDE_DIRECTORIES "${HIP_HEADERS}" + INTERFACE_SYSTEM_INCLUDE_DIRECTORIES "${HIP_HEADERS}" + ) + + target_link_libraries(${TARGET_NAME} PRIVATE + ${PROJECT_NAME}::headers + ${PROJECT_NAME}::common + rocmdrv + ) + + # Set HIP define to select AMD platform + target_compile_definitions(${TARGET_NAME} PRIVATE __HIP_PLATFORM_AMD__) +elseif("${UR_HIP_PLATFORM}" STREQUAL "NVIDIA") + # Import CUDA libraries + find_package(CUDA REQUIRED) + find_package(Threads REQUIRED) + + list(APPEND HIP_HEADERS ${CUDA_INCLUDE_DIRS}) + + # cudadrv may be defined by the CUDA plugin + if(NOT TARGET cudadrv) + add_library(cudadrv SHARED IMPORTED GLOBAL) + set_target_properties( + cudadrv PROPERTIES + IMPORTED_LOCATION ${CUDA_CUDA_LIBRARY} + INTERFACE_INCLUDE_DIRECTORIES "${HIP_HEADERS}" + INTERFACE_SYSTEM_INCLUDE_DIRECTORIES "${HIP_HEADERS}" + ) + endif() + + add_library(cudart SHARED IMPORTED GLOBAL) + set_target_properties( + cudart PROPERTIES + IMPORTED_LOCATION ${CUDA_CUDART_LIBRARY} + INTERFACE_INCLUDE_DIRECTORIES "${HIP_HEADERS}" + INTERFACE_SYSTEM_INCLUDE_DIRECTORIES "${HIP_HEADERS}" + ) + + target_link_libraries(${TARGET_NAME} PRIVATE + ${PROJECT_NAME}::headers + ${PROJECT_NAME}::common + Threads::Threads + cudadrv + cudart + ) + + # Set HIP define to select NVIDIA platform + 
target_compile_definitions(${TARGET_NAME} PRIVATE __HIP_PLATFORM_NVIDIA__) +else() + message(FATAL_ERROR "Unspecified UR HIP platform please set UR_HIP_PLATFORM to 'AMD' or 'NVIDIA'") +endif() + +target_include_directories(${TARGET_NAME} PRIVATE + ${HIP_DIR}/../../../ +) From 896a732d05944a6b8919a6743d5c04b90a9c95e1 Mon Sep 17 00:00:00 2001 From: pbalcer Date: Tue, 18 Jul 2023 11:01:03 +0200 Subject: [PATCH 10/16] update sycl adapters tag and add adapter CI --- .github/workflows/cmake.yml | 50 +- ...move-sycl-namespaces-from-ur-adapter.patch | 1048 ----------------- source/adapters/CMakeLists.txt | 10 +- source/adapters/hip/CMakeLists.txt | 6 + 4 files changed, 56 insertions(+), 1058 deletions(-) delete mode 100644 source/adapters/0001-SYCL-CUDA-remove-sycl-namespaces-from-ur-adapter.patch diff --git a/.github/workflows/cmake.yml b/.github/workflows/cmake.yml index ab3f7ec061..e9f4ba2594 100644 --- a/.github/workflows/cmake.yml +++ b/.github/workflows/cmake.yml @@ -66,7 +66,6 @@ jobs: -DCMAKE_BUILD_TYPE=${{matrix.build_type}} -DUR_BUILD_TESTS=ON -DUR_FORMAT_CPP_STYLE=ON - -DUR_BUILD_ADAPTER_L0=ON ${{matrix.libbacktrace}} - name: Generate source from spec, check for uncommitted diff @@ -80,6 +79,55 @@ jobs: working-directory: ${{github.workspace}}/build run: ctest -C ${{matrix.build_type}} --output-on-failure -L "python|uma|loader|validation|tracing|unit|urtrace" + adapter-build: + name: Build - Adapters on Ubuntu + strategy: + matrix: + os: ['ubuntu-22.04'] + adapter: [CUDA, HIP, L0] + build_type: [Debug, Release] + compiler: [{c: gcc, cxx: g++}, {c: clang, cxx: clang++}] + + runs-on: ${{matrix.os}} + + steps: + - uses: actions/checkout@v3 + + - name: Install apt packages + run: | + sudo apt-get update + sudo apt-get install -y doxygen ${{matrix.compiler.c}} + + - name: Install pip packages + run: pip install -r third_party/requirements.txt + + - name: Install CUDA specific dependencies + if: matrix.adapter == 'CUDA' + run: | + sudo apt-get install 
nvidia-cuda-toolkit nvidia-cuda-toolkit-gcc + + - name: Install HIP specific dependencies + if: matrix.adapter == 'HIP' + run: | + wget https://repo.radeon.com/amdgpu-install/5.6/ubuntu/jammy/amdgpu-install_5.6.50600-1_all.deb + sudo apt install ./amdgpu-install_5.6.50600-1_all.deb + sudo amdgpu-install --usecase=rocm + + - name: Configure CMake + run: > + cmake + -B${{github.workspace}}/build + -DCMAKE_C_COMPILER=${{matrix.compiler.c}} + -DCMAKE_CXX_COMPILER=${{matrix.compiler.cxx}} + -DUR_ENABLE_TRACING=ON + -DCMAKE_BUILD_TYPE=${{matrix.build_type}} + -DUR_BUILD_TESTS=ON + -DUR_FORMAT_CPP_STYLE=ON + -DUR_BUILD_ADAPTER_${{matrix.adapter}}=ON + + - name: Build + run: cmake --build ${{github.workspace}}/build -j $(nproc) + windows-build: name: Build - Windows strategy: diff --git a/source/adapters/0001-SYCL-CUDA-remove-sycl-namespaces-from-ur-adapter.patch b/source/adapters/0001-SYCL-CUDA-remove-sycl-namespaces-from-ur-adapter.patch deleted file mode 100644 index 8153e1cb85..0000000000 --- a/source/adapters/0001-SYCL-CUDA-remove-sycl-namespaces-from-ur-adapter.patch +++ /dev/null @@ -1,1048 +0,0 @@ -From fd78871a6bd2ff41ff37b8bd786c17f59911c677 Mon Sep 17 00:00:00 2001 -From: pbalcer -Date: Thu, 29 Jun 2023 15:11:43 +0200 -Subject: [PATCH] [SYCL][CUDA] remove sycl dependencies from cuda ur adapter - -This was preventing out-of-tree build of the adapter for standalone -use with unified runtime. 
- -Signed-off-by: Piotr Balcer ---- - .../ur/adapters/cuda/command_buffer.cpp | 52 ++--- - .../ur/adapters/cuda/common.cpp | 6 +- - .../ur/adapters/cuda/common.hpp | 5 - - .../ur/adapters/cuda/context.cpp | 2 +- - .../ur/adapters/cuda/device.cpp | 209 +++++++++--------- - .../ur/adapters/cuda/enqueue.cpp | 2 +- - .../ur/adapters/cuda/event.cpp | 17 +- - .../ur/adapters/cuda/kernel.cpp | 42 ++-- - .../ur/adapters/cuda/memory.cpp | 5 +- - .../ur/adapters/cuda/queue.cpp | 2 +- - .../ur/adapters/cuda/sampler.cpp | 2 +- - 11 files changed, 167 insertions(+), 177 deletions(-) - -diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/command_buffer.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/command_buffer.cpp -index c83e9e732303..57956cb64a67 100644 ---- a/sycl/plugins/unified_runtime/ur/adapters/cuda/command_buffer.cpp -+++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/command_buffer.cpp -@@ -19,8 +19,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferCreateExp( - (void)hDevice; - (void)pCommandBufferDesc; - (void)phCommandBuffer; -- sycl::detail::ur::die("Experimental Command-buffer feature is not " -- "implemented for CUDA adapter."); -+ detail::ur::die("Experimental Command-buffer feature is not " -+ "implemented for CUDA adapter."); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; - } - -@@ -28,8 +28,8 @@ UR_APIEXPORT ur_result_t UR_APICALL - urCommandBufferRetainExp(ur_exp_command_buffer_handle_t hCommandBuffer) { - (void)hCommandBuffer; - -- sycl::detail::ur::die("Experimental Command-buffer feature is not " -- "implemented for CUDA adapter."); -+ detail::ur::die("Experimental Command-buffer feature is not " -+ "implemented for CUDA adapter."); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; - } - -@@ -37,8 +37,8 @@ UR_APIEXPORT ur_result_t UR_APICALL - urCommandBufferReleaseExp(ur_exp_command_buffer_handle_t hCommandBuffer) { - (void)hCommandBuffer; - -- sycl::detail::ur::die("Experimental Command-buffer feature is not " -- "implemented for CUDA 
adapter."); -+ detail::ur::die("Experimental Command-buffer feature is not " -+ "implemented for CUDA adapter."); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; - } - -@@ -46,8 +46,8 @@ UR_APIEXPORT ur_result_t UR_APICALL - urCommandBufferFinalizeExp(ur_exp_command_buffer_handle_t hCommandBuffer) { - (void)hCommandBuffer; - -- sycl::detail::ur::die("Experimental Command-buffer feature is not " -- "implemented for CUDA adapter."); -+ detail::ur::die("Experimental Command-buffer feature is not " -+ "implemented for CUDA adapter."); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; - } - -@@ -68,8 +68,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( - (void)pSyncPointWaitList; - (void)pSyncPoint; - -- sycl::detail::ur::die("Experimental Command-buffer feature is not " -- "implemented for CUDA adapter."); -+ detail::ur::die("Experimental Command-buffer feature is not " -+ "implemented for CUDA adapter."); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; - } - -@@ -86,8 +86,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemcpyUSMExp( - (void)pSyncPointWaitList; - (void)pSyncPoint; - -- sycl::detail::ur::die("Experimental Command-buffer feature is not " -- "implemented for CUDA adapter."); -+ detail::ur::die("Experimental Command-buffer feature is not " -+ "implemented for CUDA adapter."); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; - } - -@@ -107,8 +107,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMembufferCopyExp( - (void)pSyncPointWaitList; - (void)pSyncPoint; - -- sycl::detail::ur::die("Experimental Command-buffer feature is not " -- "implemented for CUDA adapter."); -+ detail::ur::die("Experimental Command-buffer feature is not " -+ "implemented for CUDA adapter."); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; - } - -@@ -134,8 +134,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMembufferCopyRectExp( - (void)pSyncPointWaitList; - (void)pSyncPoint; - -- sycl::detail::ur::die("Experimental 
Command-buffer feature is not " -- "implemented for CUDA adapter."); -+ detail::ur::die("Experimental Command-buffer feature is not " -+ "implemented for CUDA adapter."); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; - } - -@@ -155,8 +155,8 @@ ur_result_t UR_APICALL urCommandBufferAppendMembufferWriteExp( - (void)pSyncPointWaitList; - (void)pSyncPoint; - -- sycl::detail::ur::die("Experimental Command-buffer feature is not " -- "implemented for CUDA adapter."); -+ detail::ur::die("Experimental Command-buffer feature is not " -+ "implemented for CUDA adapter."); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; - } - -@@ -175,8 +175,8 @@ ur_result_t UR_APICALL urCommandBufferAppendMembufferReadExp( - (void)pSyncPointWaitList; - (void)pSyncPoint; - -- sycl::detail::ur::die("Experimental Command-buffer feature is not " -- "implemented for CUDA adapter."); -+ detail::ur::die("Experimental Command-buffer feature is not " -+ "implemented for CUDA adapter."); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; - } - -@@ -203,8 +203,8 @@ ur_result_t UR_APICALL urCommandBufferAppendMembufferWriteRectExp( - (void)pSyncPointWaitList; - (void)pSyncPoint; - -- sycl::detail::ur::die("Experimental Command-buffer feature is not " -- "implemented for CUDA adapter."); -+ detail::ur::die("Experimental Command-buffer feature is not " -+ "implemented for CUDA adapter."); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; - } - -@@ -232,8 +232,8 @@ ur_result_t UR_APICALL urCommandBufferAppendMembufferReadRectExp( - (void)pSyncPointWaitList; - (void)pSyncPoint; - -- sycl::detail::ur::die("Experimental Command-buffer feature is not " -- "implemented for CUDA adapter."); -+ detail::ur::die("Experimental Command-buffer feature is not " -+ "implemented for CUDA adapter."); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; - } - -@@ -247,7 +247,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp( - (void)phEventWaitList; - (void)phEvent; - -- sycl::detail::ur::die("Experimental Command-buffer 
feature is not " -- "implemented for CUDA adapter."); -+ detail::ur::die("Experimental Command-buffer feature is not " -+ "implemented for CUDA adapter."); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; - } -diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/common.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/common.cpp -index 86975e509725..83264160e700 100644 ---- a/sycl/plugins/unified_runtime/ur/adapters/cuda/common.cpp -+++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/common.cpp -@@ -72,17 +72,17 @@ std::string getCudaVersionString() { - return stream.str(); - } - --void sycl::detail::ur::die(const char *Message) { -+void detail::ur::die(const char *Message) { - std::cerr << "ur_die: " << Message << std::endl; - std::terminate(); - } - --void sycl::detail::ur::assertion(bool Condition, const char *Message) { -+void detail::ur::assertion(bool Condition, const char *Message) { - if (!Condition) - die(Message); - } - --void sycl::detail::ur::cuPrint(const char *Message) { -+void detail::ur::cuPrint(const char *Message) { - std::cerr << "ur_print: " << Message << std::endl; - } - -diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/common.hpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/common.hpp -index 5cfa609018b2..82b38c10d449 100644 ---- a/sycl/plugins/unified_runtime/ur/adapters/cuda/common.hpp -+++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/common.hpp -@@ -8,7 +8,6 @@ - #pragma once - - #include --#include - #include - - ur_result_t mapErrorUR(CUresult Result); -@@ -37,8 +36,6 @@ extern thread_local char ErrorMessage[MaxMessageSize]; - ur_result_t ErrorCode); - - /// ------ Error handling, matching OpenCL plugin semantics. 
--namespace sycl { --__SYCL_INLINE_VER_NAMESPACE(_V1) { - namespace detail { - namespace ur { - -@@ -55,5 +52,3 @@ void assertion(bool Condition, const char *Message = nullptr); - - } // namespace ur - } // namespace detail --} // __SYCL_INLINE_VER_NAMESPACE(_V1) --} // namespace sycl -diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/context.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/context.cpp -index 74a32bdac274..2b621383da09 100644 ---- a/sycl/plugins/unified_runtime/ur/adapters/cuda/context.cpp -+++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/context.cpp -@@ -66,7 +66,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urContextGetInfo( - } - case UR_CONTEXT_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES: { - int Major = 0; -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&Major, - CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, - hContext->getDevice()->get()) == CUDA_SUCCESS); -diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp -index 52d4e3badc8f..a81599d629a7 100644 ---- a/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp -+++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp -@@ -15,7 +15,7 @@ - - int getAttribute(ur_device_handle_t device, CUdevice_attribute attribute) { - int value; -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&value, attribute, device->get()) == CUDA_SUCCESS); - return value; - } -@@ -53,11 +53,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, - } - case UR_DEVICE_INFO_MAX_COMPUTE_UNITS: { - int ComputeUnits = 0; -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&ComputeUnits, - CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, - hDevice->get()) == CUDA_SUCCESS); -- sycl::detail::ur::assertion(ComputeUnits >= 0); -+ detail::ur::assertion(ComputeUnits >= 0); - return ReturnValue(static_cast(ComputeUnits)); - } - case 
UR_DEVICE_INFO_MAX_WORK_ITEM_DIMENSIONS: { -@@ -69,20 +69,20 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, - } ReturnSizes; - - int MaxX = 0, MaxY = 0, MaxZ = 0; -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&MaxX, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, - hDevice->get()) == CUDA_SUCCESS); -- sycl::detail::ur::assertion(MaxX >= 0); -+ detail::ur::assertion(MaxX >= 0); - -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&MaxY, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, - hDevice->get()) == CUDA_SUCCESS); -- sycl::detail::ur::assertion(MaxY >= 0); -+ detail::ur::assertion(MaxY >= 0); - -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&MaxZ, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, - hDevice->get()) == CUDA_SUCCESS); -- sycl::detail::ur::assertion(MaxZ >= 0); -+ detail::ur::assertion(MaxZ >= 0); - - ReturnSizes.Sizes[0] = size_t(MaxX); - ReturnSizes.Sizes[1] = size_t(MaxY); -@@ -95,20 +95,20 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, - size_t Sizes[MaxWorkItemDimensions]; - } ReturnSizes; - int MaxX = 0, MaxY = 0, MaxZ = 0; -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&MaxX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, - hDevice->get()) == CUDA_SUCCESS); -- sycl::detail::ur::assertion(MaxX >= 0); -+ detail::ur::assertion(MaxX >= 0); - -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&MaxY, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, - hDevice->get()) == CUDA_SUCCESS); -- sycl::detail::ur::assertion(MaxY >= 0); -+ detail::ur::assertion(MaxY >= 0); - -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&MaxZ, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, - hDevice->get()) == CUDA_SUCCESS); -- sycl::detail::ur::assertion(MaxZ >= 0); -+ detail::ur::assertion(MaxZ >= 0); - - ReturnSizes.Sizes[0] = size_t(MaxX); - ReturnSizes.Sizes[1] = 
size_t(MaxY); -@@ -118,12 +118,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, - - case UR_DEVICE_INFO_MAX_WORK_GROUP_SIZE: { - int MaxWorkGroupSize = 0; -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&MaxWorkGroupSize, - CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, - hDevice->get()) == CUDA_SUCCESS); - -- sycl::detail::ur::assertion(MaxWorkGroupSize >= 0); -+ detail::ur::assertion(MaxWorkGroupSize >= 0); - - return ReturnValue(size_t(MaxWorkGroupSize)); - } -@@ -172,14 +172,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, - case UR_DEVICE_INFO_MAX_NUM_SUB_GROUPS: { - // Number of sub-groups = max block size / warp size + possible remainder - int MaxThreads = 0; -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&MaxThreads, - CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, - hDevice->get()) == CUDA_SUCCESS); - int WarpSize = 0; -- sycl::detail::ur::assertion( -- cuDeviceGetAttribute(&WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, -- hDevice->get()) == CUDA_SUCCESS); -+ detail::ur::assertion(cuDeviceGetAttribute(&WarpSize, -+ CU_DEVICE_ATTRIBUTE_WARP_SIZE, -+ hDevice->get()) == CUDA_SUCCESS); - int MaxWarps = (MaxThreads + WarpSize - 1) / WarpSize; - return ReturnValue(MaxWarps); - } -@@ -187,7 +187,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, - // Volta provides independent thread scheduling - // TODO: Revisit for previous generation GPUs - int Major = 0; -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&Major, - CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, - hDevice->get()) == CUDA_SUCCESS); -@@ -197,7 +197,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, - - case UR_DEVICE_INFO_ATOMIC_64: { - int Major = 0; -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&Major, - 
CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, - hDevice->get()) == CUDA_SUCCESS); -@@ -214,7 +214,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, - } - case UR_DEVICE_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES: { - int Major = 0; -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&Major, - CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, - hDevice->get()) == CUDA_SUCCESS); -@@ -255,7 +255,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, - } - case UR_DEVICE_INFO_BFLOAT16: { - int Major = 0; -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&Major, - CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, - hDevice->get()) == CUDA_SUCCESS); -@@ -266,18 +266,18 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, - case UR_DEVICE_INFO_SUB_GROUP_SIZES_INTEL: { - // NVIDIA devices only support one sub-group size (the warp size) - int WarpSize = 0; -- sycl::detail::ur::assertion( -- cuDeviceGetAttribute(&WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, -- hDevice->get()) == CUDA_SUCCESS); -+ detail::ur::assertion(cuDeviceGetAttribute(&WarpSize, -+ CU_DEVICE_ATTRIBUTE_WARP_SIZE, -+ hDevice->get()) == CUDA_SUCCESS); - size_t Sizes[1] = {static_cast(WarpSize)}; - return ReturnValue(Sizes, 1); - } - case UR_DEVICE_INFO_MAX_CLOCK_FREQUENCY: { - int ClockFreq = 0; -- sycl::detail::ur::assertion( -- cuDeviceGetAttribute(&ClockFreq, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, -- hDevice->get()) == CUDA_SUCCESS); -- sycl::detail::ur::assertion(ClockFreq >= 0); -+ detail::ur::assertion(cuDeviceGetAttribute(&ClockFreq, -+ CU_DEVICE_ATTRIBUTE_CLOCK_RATE, -+ hDevice->get()) == CUDA_SUCCESS); -+ detail::ur::assertion(ClockFreq >= 0); - return ReturnValue(static_cast(ClockFreq) / 1000u); - } - case UR_DEVICE_INFO_ADDRESS_BITS: { -@@ -292,8 +292,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, - // CL_DEVICE_TYPE_CUSTOM. 
- - size_t Global = 0; -- sycl::detail::ur::assertion(cuDeviceTotalMem(&Global, hDevice->get()) == -- CUDA_SUCCESS); -+ detail::ur::assertion(cuDeviceTotalMem(&Global, hDevice->get()) == -+ CUDA_SUCCESS); - - auto QuarterGlobal = static_cast(Global / 4u); - -@@ -308,7 +308,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, - if (std::getenv("SYCL_PI_CUDA_ENABLE_IMAGE_SUPPORT") != nullptr) { - Enabled = true; - } else { -- sycl::detail::ur::cuPrint( -+ detail::ur::cuPrint( - "Images are not fully supported by the CUDA BE, their support is " - "disabled by default. Their partial support can be activated by " - "setting SYCL_PI_CUDA_ENABLE_IMAGE_SUPPORT environment variable at " -@@ -332,17 +332,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, - case UR_DEVICE_INFO_IMAGE2D_MAX_HEIGHT: { - // Take the smaller of maximum surface and maximum texture height. - int TexHeight = 0; -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&TexHeight, - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT, - hDevice->get()) == CUDA_SUCCESS); -- sycl::detail::ur::assertion(TexHeight >= 0); -+ detail::ur::assertion(TexHeight >= 0); - int SurfHeight = 0; -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&SurfHeight, - CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT, - hDevice->get()) == CUDA_SUCCESS); -- sycl::detail::ur::assertion(SurfHeight >= 0); -+ detail::ur::assertion(SurfHeight >= 0); - - int Min = std::min(TexHeight, SurfHeight); - -@@ -351,17 +351,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, - case UR_DEVICE_INFO_IMAGE2D_MAX_WIDTH: { - // Take the smaller of maximum surface and maximum texture width. 
- int TexWidth = 0; -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&TexWidth, - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH, - hDevice->get()) == CUDA_SUCCESS); -- sycl::detail::ur::assertion(TexWidth >= 0); -+ detail::ur::assertion(TexWidth >= 0); - int SurfWidth = 0; -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&SurfWidth, - CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH, - hDevice->get()) == CUDA_SUCCESS); -- sycl::detail::ur::assertion(SurfWidth >= 0); -+ detail::ur::assertion(SurfWidth >= 0); - - int Min = std::min(TexWidth, SurfWidth); - -@@ -370,17 +370,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, - case UR_DEVICE_INFO_IMAGE3D_MAX_HEIGHT: { - // Take the smaller of maximum surface and maximum texture height. - int TexHeight = 0; -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&TexHeight, - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT, - hDevice->get()) == CUDA_SUCCESS); -- sycl::detail::ur::assertion(TexHeight >= 0); -+ detail::ur::assertion(TexHeight >= 0); - int SurfHeight = 0; -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&SurfHeight, - CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT, - hDevice->get()) == CUDA_SUCCESS); -- sycl::detail::ur::assertion(SurfHeight >= 0); -+ detail::ur::assertion(SurfHeight >= 0); - - int Min = std::min(TexHeight, SurfHeight); - -@@ -389,17 +389,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, - case UR_DEVICE_INFO_IMAGE3D_MAX_WIDTH: { - // Take the smaller of maximum surface and maximum texture width. 
- int TexWidth = 0; -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&TexWidth, - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH, - hDevice->get()) == CUDA_SUCCESS); -- sycl::detail::ur::assertion(TexWidth >= 0); -+ detail::ur::assertion(TexWidth >= 0); - int SurfWidth = 0; -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&SurfWidth, - CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH, - hDevice->get()) == CUDA_SUCCESS); -- sycl::detail::ur::assertion(SurfWidth >= 0); -+ detail::ur::assertion(SurfWidth >= 0); - - int Min = std::min(TexWidth, SurfWidth); - -@@ -408,17 +408,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, - case UR_DEVICE_INFO_IMAGE3D_MAX_DEPTH: { - // Take the smaller of maximum surface and maximum texture depth. - int TexDepth = 0; -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&TexDepth, - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH, - hDevice->get()) == CUDA_SUCCESS); -- sycl::detail::ur::assertion(TexDepth >= 0); -+ detail::ur::assertion(TexDepth >= 0); - int SurfDepth = 0; -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&SurfDepth, - CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH, - hDevice->get()) == CUDA_SUCCESS); -- sycl::detail::ur::assertion(SurfDepth >= 0); -+ detail::ur::assertion(SurfDepth >= 0); - - int Min = std::min(TexDepth, SurfDepth); - -@@ -427,17 +427,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, - case UR_DEVICE_INFO_IMAGE_MAX_BUFFER_SIZE: { - // Take the smaller of maximum surface and maximum texture width. 
- int TexWidth = 0; -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&TexWidth, - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH, - hDevice->get()) == CUDA_SUCCESS); -- sycl::detail::ur::assertion(TexWidth >= 0); -+ detail::ur::assertion(TexWidth >= 0); - int SurfWidth = 0; -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&SurfWidth, - CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH, - hDevice->get()) == CUDA_SUCCESS); -- sycl::detail::ur::assertion(SurfWidth >= 0); -+ detail::ur::assertion(SurfWidth >= 0); - - int Min = std::min(TexWidth, SurfWidth); - -@@ -459,7 +459,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, - } - case UR_DEVICE_INFO_MEM_BASE_ADDR_ALIGN: { - int MemBaseAddrAlign = 0; -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&MemBaseAddrAlign, - CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, - hDevice->get()) == CUDA_SUCCESS); -@@ -504,27 +504,27 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, - } - case UR_DEVICE_INFO_GLOBAL_MEM_CACHE_SIZE: { - int CacheSize = 0; -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&CacheSize, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, - hDevice->get()) == CUDA_SUCCESS); -- sycl::detail::ur::assertion(CacheSize >= 0); -+ detail::ur::assertion(CacheSize >= 0); - // The L2 cache is global to the GPU. - return ReturnValue(static_cast(CacheSize)); - } - case UR_DEVICE_INFO_GLOBAL_MEM_SIZE: { - size_t Bytes = 0; - // Runtime API has easy access to this value, driver API info is scarse. 
-- sycl::detail::ur::assertion(cuDeviceTotalMem(&Bytes, hDevice->get()) == -- CUDA_SUCCESS); -+ detail::ur::assertion(cuDeviceTotalMem(&Bytes, hDevice->get()) == -+ CUDA_SUCCESS); - return ReturnValue(uint64_t{Bytes}); - } - case UR_DEVICE_INFO_MAX_CONSTANT_BUFFER_SIZE: { - int ConstantMemory = 0; -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&ConstantMemory, - CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, - hDevice->get()) == CUDA_SUCCESS); -- sycl::detail::ur::assertion(ConstantMemory >= 0); -+ detail::ur::assertion(ConstantMemory >= 0); - - return ReturnValue(static_cast(ConstantMemory)); - } -@@ -542,30 +542,30 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, - // CUDA has its own definition of "local memory", which maps to OpenCL's - // "private memory". - int LocalMemSize = 0; -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&LocalMemSize, - CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, - hDevice->get()) == CUDA_SUCCESS); -- sycl::detail::ur::assertion(LocalMemSize >= 0); -+ detail::ur::assertion(LocalMemSize >= 0); - return ReturnValue(static_cast(LocalMemSize)); - } - case UR_DEVICE_INFO_ERROR_CORRECTION_SUPPORT: { - int ECCEnabled = 0; -- sycl::detail::ur::assertion( -- cuDeviceGetAttribute(&ECCEnabled, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, -- hDevice->get()) == CUDA_SUCCESS); -+ detail::ur::assertion(cuDeviceGetAttribute(&ECCEnabled, -+ CU_DEVICE_ATTRIBUTE_ECC_ENABLED, -+ hDevice->get()) == CUDA_SUCCESS); - -- sycl::detail::ur::assertion((ECCEnabled == 0) | (ECCEnabled == 1)); -+ detail::ur::assertion((ECCEnabled == 0) | (ECCEnabled == 1)); - auto Result = static_cast(ECCEnabled); - return ReturnValue(Result); - } - case UR_DEVICE_INFO_HOST_UNIFIED_MEMORY: { - int IsIntegrated = 0; -- sycl::detail::ur::assertion( -- cuDeviceGetAttribute(&IsIntegrated, CU_DEVICE_ATTRIBUTE_INTEGRATED, -- hDevice->get()) == CUDA_SUCCESS); -+ 
detail::ur::assertion(cuDeviceGetAttribute(&IsIntegrated, -+ CU_DEVICE_ATTRIBUTE_INTEGRATED, -+ hDevice->get()) == CUDA_SUCCESS); - -- sycl::detail::ur::assertion((IsIntegrated == 0) | (IsIntegrated == 1)); -+ detail::ur::assertion((IsIntegrated == 0) | (IsIntegrated == 1)); - auto result = static_cast(IsIntegrated); - return ReturnValue(result); - } -@@ -620,9 +620,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, - case UR_DEVICE_INFO_NAME: { - static constexpr size_t MaxDeviceNameLength = 256u; - char Name[MaxDeviceNameLength]; -- sycl::detail::ur::assertion( -- cuDeviceGetName(Name, MaxDeviceNameLength, hDevice->get()) == -- CUDA_SUCCESS); -+ detail::ur::assertion(cuDeviceGetName(Name, MaxDeviceNameLength, -+ hDevice->get()) == CUDA_SUCCESS); - return ReturnValue(Name, strlen(Name) + 1); - } - case UR_DEVICE_INFO_VENDOR: { -@@ -641,13 +640,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, - case UR_DEVICE_INFO_VERSION: { - std::stringstream SS; - int Major; -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&Major, - CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, - hDevice->get()) == CUDA_SUCCESS); - SS << Major; - int Minor; -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&Minor, - CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, - hDevice->get()) == CUDA_SUCCESS); -@@ -666,11 +665,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, - int Major = 0; - int Minor = 0; - -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&Major, - CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, - hDevice->get()) == CUDA_SUCCESS); -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&Minor, - CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, - hDevice->get()) == CUDA_SUCCESS); -@@ -847,27 +846,27 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t 
hDevice, - case UR_DEVICE_INFO_GLOBAL_MEM_FREE: { - size_t FreeMemory = 0; - size_t TotalMemory = 0; -- sycl::detail::ur::assertion(cuMemGetInfo(&FreeMemory, &TotalMemory) == -- CUDA_SUCCESS, -- "failed cuMemGetInfo() API."); -+ detail::ur::assertion(cuMemGetInfo(&FreeMemory, &TotalMemory) == -+ CUDA_SUCCESS, -+ "failed cuMemGetInfo() API."); - return ReturnValue(FreeMemory); - } - case UR_DEVICE_INFO_MEMORY_CLOCK_RATE: { - int Value = 0; -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&Value, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, - hDevice->get()) == CUDA_SUCCESS); -- sycl::detail::ur::assertion(Value >= 0); -+ detail::ur::assertion(Value >= 0); - // Convert kilohertz to megahertz when returning. - return ReturnValue(Value / 1000); - } - case UR_DEVICE_INFO_MEMORY_BUS_WIDTH: { - int Value = 0; -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&Value, - CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, - hDevice->get()) == CUDA_SUCCESS); -- sycl::detail::ur::assertion(Value >= 0); -+ detail::ur::assertion(Value >= 0); - return ReturnValue(Value); - } - case UR_DEVICE_INFO_MAX_COMPUTE_QUEUE_INDICES: { -@@ -875,20 +874,20 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, - } - case UR_DEVICE_INFO_DEVICE_ID: { - int Value = 0; -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&Value, CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID, - hDevice->get()) == CUDA_SUCCESS); -- sycl::detail::ur::assertion(Value >= 0); -+ detail::ur::assertion(Value >= 0); - return ReturnValue(Value); - } - case UR_DEVICE_INFO_UUID: { - CUuuid UUID; - #if (CUDA_VERSION >= 11040) -- sycl::detail::ur::assertion(cuDeviceGetUuid_v2(&UUID, hDevice->get()) == -- CUDA_SUCCESS); -+ detail::ur::assertion(cuDeviceGetUuid_v2(&UUID, hDevice->get()) == -+ CUDA_SUCCESS); - #else -- sycl::detail::ur::assertion(cuDeviceGetUuid(&UUID, hDevice->get()) == -- CUDA_SUCCESS); -+ 
detail::ur::assertion(cuDeviceGetUuid(&UUID, hDevice->get()) == -+ CUDA_SUCCESS); - #endif - std::array Name; - std::copy(UUID.bytes, UUID.bytes + 16, Name.begin()); -@@ -896,13 +895,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, - } - case UR_DEVICE_INFO_MAX_MEMORY_BANDWIDTH: { - int Major = 0; -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&Major, - CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, - hDevice->get()) == CUDA_SUCCESS); - - int Minor = 0; -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&Minor, - CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, - hDevice->get()) == CUDA_SUCCESS); -@@ -918,7 +917,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, - } else if (IsOrinAGX) { - MemoryClockKHz = 3200000; - } else { -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&MemoryClockKHz, - CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, - hDevice->get()) == CUDA_SUCCESS); -@@ -928,7 +927,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, - if (IsOrinAGX) { - MemoryBusWidth = 256; - } else { -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&MemoryBusWidth, - CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, - hDevice->get()) == CUDA_SUCCESS); -@@ -973,7 +972,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, - &MaxRegisters, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, - hDevice->get())); - -- sycl::detail::ur::assertion(MaxRegisters >= 0); -+ detail::ur::assertion(MaxRegisters >= 0); - - return ReturnValue(static_cast(MaxRegisters)); - } -@@ -984,12 +983,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, - case UR_DEVICE_INFO_PCI_ADDRESS: { - constexpr size_t AddressBufferSize = 13; - char AddressBuffer[AddressBufferSize]; -- sycl::detail::ur::assertion( -- 
cuDeviceGetPCIBusId(AddressBuffer, AddressBufferSize, hDevice->get()) == -- CUDA_SUCCESS); -+ detail::ur::assertion(cuDeviceGetPCIBusId(AddressBuffer, AddressBufferSize, -+ hDevice->get()) == CUDA_SUCCESS); - // CUDA API (8.x - 12.1) guarantees 12 bytes + \0 are written -- sycl::detail::ur::assertion(strnlen(AddressBuffer, AddressBufferSize) == -- 12); -+ detail::ur::assertion(strnlen(AddressBuffer, AddressBufferSize) == 12); - return ReturnValue(AddressBuffer, - strnlen(AddressBuffer, AddressBufferSize - 1) + 1); - } -diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp -index 1cfc5cc40a4a..792f69092682 100644 ---- a/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp -+++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp -@@ -794,7 +794,7 @@ static size_t imageElementByteSize(CUDA_ARRAY_DESCRIPTOR ArrayDesc) { - case CU_AD_FORMAT_FLOAT: - return 4; - default: -- sycl::detail::ur::die("Invalid image format."); -+ detail::ur::die("Invalid image format."); - return 0; - } - } -diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/event.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/event.cpp -index 8916197b73f1..066c0498f1d0 100644 ---- a/sycl/plugins/unified_runtime/ur/adapters/cuda/event.cpp -+++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/event.cpp -@@ -119,7 +119,7 @@ ur_result_t ur_event_handle_t_::record() { - try { - EventID = Queue->getNextEventID(); - if (EventID == 0) { -- sycl::detail::ur::die( -+ detail::ur::die( - "Unrecoverable program state reached in event identifier overflow"); - } - Result = UR_CHECK_ERROR(cuEventRecord(EvEnd, Stream)); -@@ -182,7 +182,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventGetInfo(ur_event_handle_t hEvent, - case UR_EVENT_INFO_CONTEXT: - return ReturnValue(hEvent->getContext()); - default: -- sycl::detail::ur::die("Event info request not implemented"); -+ detail::ur::die("Event info request not implemented"); - } - 
- return UR_RESULT_ERROR_INVALID_ENUMERATION; -@@ -213,7 +213,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventGetProfilingInfo( - default: - break; - } -- sycl::detail::ur::die("Event Profiling info request not implemented"); -+ detail::ur::die("Event Profiling info request not implemented"); - return {}; - } - -@@ -221,7 +221,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventSetCallback(ur_event_handle_t, - ur_execution_info_t, - ur_event_callback_t, - void *) { -- sycl::detail::ur::die("Event Callback not implemented in CUDA adapter"); -+ detail::ur::die("Event Callback not implemented in CUDA adapter"); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; - } - -@@ -254,8 +254,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventRetain(ur_event_handle_t hEvent) { - - const auto RefCount = hEvent->incrementReferenceCount(); - -- sycl::detail::ur::assertion( -- RefCount != 0, "Reference count overflow detected in urEventRetain."); -+ detail::ur::assertion(RefCount != 0, -+ "Reference count overflow detected in urEventRetain."); - - return UR_RESULT_SUCCESS; - } -@@ -265,9 +265,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventRelease(ur_event_handle_t hEvent) { - - // double delete or someone is messing with the ref count. - // either way, cannot safely proceed. -- sycl::detail::ur::assertion( -- hEvent->getReferenceCount() != 0, -- "Reference count overflow detected in urEventRelease."); -+ detail::ur::assertion(hEvent->getReferenceCount() != 0, -+ "Reference count overflow detected in urEventRelease."); - - // decrement ref count. If it is 0, delete the event. 
- if (hEvent->decrementReferenceCount() == 0) { -diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.cpp -index 358f59c499e1..7d46ce039bab 100644 ---- a/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.cpp -+++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.cpp -@@ -73,24 +73,24 @@ urKernelGetGroupInfo(ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, - size_t GlobalWorkSize[3] = {0, 0, 0}; - - int MaxBlockDimX{0}, MaxBlockDimY{0}, MaxBlockDimZ{0}; -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&MaxBlockDimX, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, - hDevice->get()) == CUDA_SUCCESS); -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&MaxBlockDimY, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, - hDevice->get()) == CUDA_SUCCESS); -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&MaxBlockDimZ, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, - hDevice->get()) == CUDA_SUCCESS); - - int MaxGridDimX{0}, MaxGridDimY{0}, MaxGridDimZ{0}; -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&MaxGridDimX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, - hDevice->get()) == CUDA_SUCCESS); -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&MaxGridDimY, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, - hDevice->get()) == CUDA_SUCCESS); -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuDeviceGetAttribute(&MaxGridDimZ, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, - hDevice->get()) == CUDA_SUCCESS); - -@@ -101,7 +101,7 @@ urKernelGetGroupInfo(ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, - } - case UR_KERNEL_GROUP_INFO_WORK_GROUP_SIZE: { - int MaxThreads = 0; -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuFuncGetAttribute(&MaxThreads, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, - hKernel->get()) == CUDA_SUCCESS); - return ReturnValue(size_t(MaxThreads)); -@@ 
-122,7 +122,7 @@ urKernelGetGroupInfo(ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, - case UR_KERNEL_GROUP_INFO_LOCAL_MEM_SIZE: { - // OpenCL LOCAL == CUDA SHARED - int Bytes = 0; -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuFuncGetAttribute(&Bytes, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, - hKernel->get()) == CUDA_SUCCESS); - return ReturnValue(uint64_t(Bytes)); -@@ -130,17 +130,17 @@ urKernelGetGroupInfo(ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, - case UR_KERNEL_GROUP_INFO_PREFERRED_WORK_GROUP_SIZE_MULTIPLE: { - // Work groups should be multiples of the warp size - int WarpSize = 0; -- sycl::detail::ur::assertion( -- cuDeviceGetAttribute(&WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, -- hDevice->get()) == CUDA_SUCCESS); -+ detail::ur::assertion(cuDeviceGetAttribute(&WarpSize, -+ CU_DEVICE_ATTRIBUTE_WARP_SIZE, -+ hDevice->get()) == CUDA_SUCCESS); - return ReturnValue(static_cast(WarpSize)); - } - case UR_KERNEL_GROUP_INFO_PRIVATE_MEM_SIZE: { - // OpenCL PRIVATE == CUDA LOCAL - int Bytes = 0; -- sycl::detail::ur::assertion( -- cuFuncGetAttribute(&Bytes, CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES, -- hKernel->get()) == CUDA_SUCCESS); -+ detail::ur::assertion(cuFuncGetAttribute(&Bytes, -+ CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES, -+ hKernel->get()) == CUDA_SUCCESS); - return ReturnValue(uint64_t(Bytes)); - } - default: -@@ -231,9 +231,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelGetInfo(ur_kernel_handle_t hKernel, - return ReturnValue(""); - case UR_KERNEL_INFO_NUM_REGS: { - int NumRegs = 0; -- sycl::detail::ur::assertion( -- cuFuncGetAttribute(&NumRegs, CU_FUNC_ATTRIBUTE_NUM_REGS, -- hKernel->get()) == CUDA_SUCCESS); -+ detail::ur::assertion(cuFuncGetAttribute(&NumRegs, -+ CU_FUNC_ATTRIBUTE_NUM_REGS, -+ hKernel->get()) == CUDA_SUCCESS); - return ReturnValue(static_cast(NumRegs)); - } - default: -@@ -254,15 +254,15 @@ urKernelGetSubGroupInfo(ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, - case 
UR_KERNEL_SUB_GROUP_INFO_MAX_SUB_GROUP_SIZE: { - // Sub-group size is equivalent to warp size - int WarpSize = 0; -- sycl::detail::ur::assertion( -- cuDeviceGetAttribute(&WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, -- hDevice->get()) == CUDA_SUCCESS); -+ detail::ur::assertion(cuDeviceGetAttribute(&WarpSize, -+ CU_DEVICE_ATTRIBUTE_WARP_SIZE, -+ hDevice->get()) == CUDA_SUCCESS); - return ReturnValue(static_cast(WarpSize)); - } - case UR_KERNEL_SUB_GROUP_INFO_MAX_NUM_SUB_GROUPS: { - // Number of sub-groups = max block size / warp size + possible remainder - int MaxThreads = 0; -- sycl::detail::ur::assertion( -+ detail::ur::assertion( - cuFuncGetAttribute(&MaxThreads, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, - hKernel->get()) == CUDA_SUCCESS); - int WarpSize = 0; -diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/memory.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/memory.cpp -index b19acea3159f..f0c276579476 100644 ---- a/sycl/plugins/unified_runtime/ur/adapters/cuda/memory.cpp -+++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/memory.cpp -@@ -162,8 +162,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemRelease(ur_mem_handle_t hMem) { - // error for which it is unclear if the function that reported it succeeded - // or not. Either way, the state of the program is compromised and likely - // unrecoverable. 
-- sycl::detail::ur::die( -- "Unrecoverable program state reached in urMemRelease"); -+ detail::ur::die("Unrecoverable program state reached in urMemRelease"); - } - - return UR_RESULT_SUCCESS; -@@ -331,7 +330,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemImageCreate( - PixelTypeSizeBytes = 4; - break; - default: -- sycl::detail::ur::die( -+ detail::ur::die( - "urMemImageCreate given unsupported image_channel_data_type"); - } - -diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.cpp -index 05443eeed89d..32391fec5c13 100644 ---- a/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.cpp -+++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.cpp -@@ -265,7 +265,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueCreateWithNativeHandle( - else if (CuFlags == CU_STREAM_NON_BLOCKING) - Flags = UR_QUEUE_FLAG_SYNC_WITH_DEFAULT_STREAM; - else -- sycl::detail::ur::die("Unknown cuda stream"); -+ detail::ur::die("Unknown cuda stream"); - - std::vector ComputeCuStreams(1, CuStream); - std::vector TransferCuStreams(0); -diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/sampler.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/sampler.cpp -index 36ec89fb9da3..836e47f988e5 100644 ---- a/sycl/plugins/unified_runtime/ur/adapters/cuda/sampler.cpp -+++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/sampler.cpp -@@ -73,7 +73,7 @@ urSamplerRelease(ur_sampler_handle_t hSampler) { - - // double delete or someone is messing with the ref count. - // either way, cannot safely proceed. 
-- sycl::detail::ur::assertion( -+ detail::ur::assertion( - hSampler->getReferenceCount() != 0, - "Reference count overflow detected in urSamplerRelease."); - --- -2.41.0 - diff --git a/source/adapters/CMakeLists.txt b/source/adapters/CMakeLists.txt index 57df48f95f..aa09004667 100644 --- a/source/adapters/CMakeLists.txt +++ b/source/adapters/CMakeLists.txt @@ -9,15 +9,7 @@ add_subdirectory(null) if(UR_BUILD_ADAPTER_L0 OR UR_BUILD_ADAPTER_CUDA OR UR_BUILD_ADAPTER_HIP) # fetch adapter sources from SYCL set(SYCL_ADAPTER_DIR "${CMAKE_CURRENT_SOURCE_DIR}/external") - FetchSource(https://github.com/intel/llvm.git sycl-nightly/20230713 "sycl/plugins/unified_runtime/ur" ${SYCL_ADAPTER_DIR}) - - get_program_version_major_minor(git GIT_VERSION) - set(GIT_QUIET_OPTION "") - if(GIT_VERSION VERSION_GREATER_EQUAL "3.35.0") - set(GIT_QUIET_OPTION "--quiet") - endif() - execute_process(COMMAND git apply ${GIT_QUIET_OPTION} ../0001-SYCL-CUDA-remove-sycl-namespaces-from-ur-adapter.patch - WORKING_DIRECTORY ${SYCL_ADAPTER_DIR}) + FetchSource(https://github.com/intel/llvm.git sycl-nightly/20230717 "sycl/plugins/unified_runtime/ur" ${SYCL_ADAPTER_DIR}) endif() if(UR_BUILD_ADAPTER_L0) diff --git a/source/adapters/hip/CMakeLists.txt b/source/adapters/hip/CMakeLists.txt index 82ab485119..7db8bbf3fc 100644 --- a/source/adapters/hip/CMakeLists.txt +++ b/source/adapters/hip/CMakeLists.txt @@ -75,6 +75,12 @@ add_library(${TARGET_NAME} ${HIP_DIR}/../../usm_allocator_config.hpp ) +if (NOT MSVC) + target_compile_options(${TARGET_NAME} PRIVATE + -Wno-deprecated-declarations + ) +endif() + set_target_properties(${TARGET_NAME} PROPERTIES VERSION "${PROJECT_VERSION_MAJOR}.${PROJECT_VERSION_MINOR}.${PROJECT_VERSION_PATCH}" SOVERSION "${PROJECT_VERSION_MAJOR}" From e97c22d1ed6fc1905a1f44dfce415729634cf16a Mon Sep 17 00:00:00 2001 From: pbalcer Date: Tue, 18 Jul 2023 14:14:31 +0200 Subject: [PATCH 11/16] [adapters] only export get proc table symbols in adapters This adds a version script to restrict 
visibility of symbols in adapters. This is so that adapters don't use loader ur functions when populating proc tables... --- source/adapters/CMakeLists.txt | 24 ++++++++++++++++++++++- source/adapters/adapter.def.in | 20 +++++++++++++++++++ source/adapters/adapter.map.in | 23 ++++++++++++++++++++++ source/adapters/cuda/CMakeLists.txt | 2 +- source/adapters/hip/CMakeLists.txt | 4 ++-- source/adapters/level_zero/CMakeLists.txt | 8 +------- source/adapters/null/CMakeLists.txt | 7 +------ 7 files changed, 71 insertions(+), 17 deletions(-) create mode 100644 source/adapters/adapter.def.in create mode 100644 source/adapters/adapter.map.in diff --git a/source/adapters/CMakeLists.txt b/source/adapters/CMakeLists.txt index aa09004667..8a885923c0 100644 --- a/source/adapters/CMakeLists.txt +++ b/source/adapters/CMakeLists.txt @@ -3,8 +3,30 @@ # See LICENSE.TXT # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -add_subdirectory(null) +function(add_ur_adapter name) + add_library(${name} ${ARGN}) + if(MSVC) + set(TARGET_LIBNAME ${name}) + string(TOUPPER ${TARGET_LIBNAME} TARGET_LIBNAME) + + set(ADAPTER_VERSION_SCRIPT ${name}.def) + + configure_file(${CMAKE_CURRENT_SOURCE_DIR}/../adapter.def.in ${ADAPTER_VERSION_SCRIPT} @ONLY) + set_target_properties(${name} PROPERTIES + LINK_FLAGS "/DEF:${ADAPTER_VERSION_SCRIPT}" + ) + else() + set(TARGET_LIBNAME lib${name}_${PROJECT_VERSION_MAJOR}.0) + string(TOUPPER ${TARGET_LIBNAME} TARGET_LIBNAME) + set(ADAPTER_VERSION_SCRIPT ${name}.map) + + configure_file(${CMAKE_CURRENT_SOURCE_DIR}/../adapter.map.in ${ADAPTER_VERSION_SCRIPT} @ONLY) + target_link_options(${name} PRIVATE "-Wl,--version-script=${ADAPTER_VERSION_SCRIPT}") + endif() +endfunction() + +add_subdirectory(null) if(UR_BUILD_ADAPTER_L0 OR UR_BUILD_ADAPTER_CUDA OR UR_BUILD_ADAPTER_HIP) # fetch adapter sources from SYCL diff --git a/source/adapters/adapter.def.in b/source/adapters/adapter.def.in new file mode 100644 index 0000000000..bfe14a6a03 --- /dev/null +++ 
b/source/adapters/adapter.def.in @@ -0,0 +1,20 @@ +LIBRARY @TARGET_LIBNAME@ +EXPORTS + urGetBindlessImagesExpProcAddrTable + urGetCommandBufferExpProcAddrTable + urGetContextProcAddrTable + urGetDeviceProcAddrTable + urGetEnqueueProcAddrTable + urGetEventProcAddrTable + urGetGlobalProcAddrTable + urGetKernelProcAddrTable + urGetMemProcAddrTable + urGetPhysicalMemProcAddrTable + urGetPlatformProcAddrTable + urGetProgramProcAddrTable + urGetQueueProcAddrTable + urGetSamplerProcAddrTable + urGetUSMExpProcAddrTable + urGetUsmP2PExpProcAddrTable + urGetUSMProcAddrTable + urGetVirtualMemProcAddrTable diff --git a/source/adapters/adapter.map.in b/source/adapters/adapter.map.in new file mode 100644 index 0000000000..cbb5c6c4cb --- /dev/null +++ b/source/adapters/adapter.map.in @@ -0,0 +1,23 @@ +@TARGET_LIBNAME@ { + global: + urGetBindlessImagesExpProcAddrTable; + urGetCommandBufferExpProcAddrTable; + urGetContextProcAddrTable; + urGetDeviceProcAddrTable; + urGetEnqueueProcAddrTable; + urGetEventProcAddrTable; + urGetGlobalProcAddrTable; + urGetKernelProcAddrTable; + urGetMemProcAddrTable; + urGetPhysicalMemProcAddrTable; + urGetPlatformProcAddrTable; + urGetProgramProcAddrTable; + urGetQueueProcAddrTable; + urGetSamplerProcAddrTable; + urGetUSMExpProcAddrTable; + urGetUsmP2PExpProcAddrTable; + urGetUSMProcAddrTable; + urGetVirtualMemProcAddrTable; + local: + *; +}; diff --git a/source/adapters/cuda/CMakeLists.txt b/source/adapters/cuda/CMakeLists.txt index d0bc3fd6b2..29bdc87152 100644 --- a/source/adapters/cuda/CMakeLists.txt +++ b/source/adapters/cuda/CMakeLists.txt @@ -7,7 +7,7 @@ set(CUDA_DIR "${SYCL_ADAPTER_DIR}/sycl/plugins/unified_runtime/ur/adapters/cuda" set(TARGET_NAME ur_adapter_cuda) -add_library(${TARGET_NAME} +add_ur_adapter(${TARGET_NAME} SHARED ${CUDA_DIR}/ur_interface_loader.cpp ${CUDA_DIR}/common.hpp diff --git a/source/adapters/hip/CMakeLists.txt b/source/adapters/hip/CMakeLists.txt index 7db8bbf3fc..a014725268 100644 --- 
a/source/adapters/hip/CMakeLists.txt +++ b/source/adapters/hip/CMakeLists.txt @@ -42,7 +42,7 @@ endif() # Set includes used in added library (rocmdrv) set(HIP_HEADERS "${UR_HIP_INCLUDE_DIR};${UR_HIP_HSA_INCLUDE_DIR}") -add_library(${TARGET_NAME} +add_ur_adapter(${TARGET_NAME} SHARED ${HIP_DIR}/ur_interface_loader.cpp ${HIP_DIR}/common.hpp @@ -75,7 +75,7 @@ add_library(${TARGET_NAME} ${HIP_DIR}/../../usm_allocator_config.hpp ) -if (NOT MSVC) +if(NOT MSVC) target_compile_options(${TARGET_NAME} PRIVATE -Wno-deprecated-declarations ) diff --git a/source/adapters/level_zero/CMakeLists.txt b/source/adapters/level_zero/CMakeLists.txt index 52d22c3ea6..fa089c4b9a 100644 --- a/source/adapters/level_zero/CMakeLists.txt +++ b/source/adapters/level_zero/CMakeLists.txt @@ -71,7 +71,7 @@ target_include_directories(LevelZeroLoader-Headers INTERFACE "${LEVEL_ZERO_INCLUDE_DIR}" ) -add_library(${TARGET_NAME} +add_ur_adapter(${TARGET_NAME} SHARED ${L0_DIR}/ur_interface_loader.cpp ${L0_DIR}/common.hpp @@ -120,9 +120,3 @@ target_include_directories(${TARGET_NAME} PRIVATE ${L0_DIR}/../../../ LevelZeroLoader-Headers ) - -if(UNIX) - set(GCC_COVERAGE_COMPILE_FLAGS "-fvisibility=hidden -fvisibility-inlines-hidden -fno-strict-aliasing") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${GCC_COVERAGE_COMPILE_FLAGS}") -endif() - diff --git a/source/adapters/null/CMakeLists.txt b/source/adapters/null/CMakeLists.txt index 56b8815929..0d4aa13e01 100644 --- a/source/adapters/null/CMakeLists.txt +++ b/source/adapters/null/CMakeLists.txt @@ -5,7 +5,7 @@ set(TARGET_NAME ur_adapter_null) -add_library(${TARGET_NAME} +add_ur_adapter(${TARGET_NAME} SHARED ${CMAKE_CURRENT_SOURCE_DIR}/ur_null.hpp ${CMAKE_CURRENT_SOURCE_DIR}/ur_null.cpp @@ -21,8 +21,3 @@ target_link_libraries(${TARGET_NAME} PRIVATE ${PROJECT_NAME}::headers ${PROJECT_NAME}::common ) - -if(UNIX) - set(GCC_COVERAGE_COMPILE_FLAGS "-fvisibility=hidden -fvisibility-inlines-hidden -fno-strict-aliasing") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} 
${GCC_COVERAGE_COMPILE_FLAGS}") -endif() From 189588362c2420dcc828804e53b9789b32ec2741 Mon Sep 17 00:00:00 2001 From: "Kenneth Benzie (Benie)" Date: Fri, 28 Jul 2023 16:31:54 +0100 Subject: [PATCH 12/16] [ur] Fix adapters linker script path on Linux Use an absolute path instead of just the filename for the `ur_adapter_.map` files to resolve `/usr/bin/ld: cannot open linker script file` errors. --- source/adapters/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/adapters/CMakeLists.txt b/source/adapters/CMakeLists.txt index 8a885923c0..c003671e0c 100644 --- a/source/adapters/CMakeLists.txt +++ b/source/adapters/CMakeLists.txt @@ -19,7 +19,7 @@ function(add_ur_adapter name) set(TARGET_LIBNAME lib${name}_${PROJECT_VERSION_MAJOR}.0) string(TOUPPER ${TARGET_LIBNAME} TARGET_LIBNAME) - set(ADAPTER_VERSION_SCRIPT ${name}.map) + set(ADAPTER_VERSION_SCRIPT ${CMAKE_CURRENT_BINARY_DIR}/${name}.map) configure_file(${CMAKE_CURRENT_SOURCE_DIR}/../adapter.map.in ${ADAPTER_VERSION_SCRIPT} @ONLY) target_link_options(${name} PRIVATE "-Wl,--version-script=${ADAPTER_VERSION_SCRIPT}") From fcded3d087c1218b1172dad74f3cb976005b2f4d Mon Sep 17 00:00:00 2001 From: Petr Vesely Date: Fri, 14 Jul 2023 13:06:41 +0100 Subject: [PATCH 13/16] [UR] Implement CUDA native handle tests --- test/conformance/CMakeLists.txt | 1 + test/conformance/adapters/CMakeLists.txt | 3 ++ test/conformance/adapters/cuda/CMakeLists.txt | 15 ++++++++ .../conformance/adapters/cuda/cuda_fixtures.h | 38 +++++++++++++++++++ .../cuda/cuda_urContextGetNativeHandle.cpp | 13 +++++++ .../cuda_urDeviceCreateWithNativeHandle.cpp | 18 +++++++++ .../cuda/cuda_urDeviceGetNativeHandle.cpp | 15 ++++++++ .../cuda_urEventCreateWithNativeHandle.cpp | 19 ++++++++++ .../cuda/cuda_urEventGetNativeHandle.cpp | 25 ++++++++++++ 9 files changed, 147 insertions(+) create mode 100644 test/conformance/adapters/CMakeLists.txt create mode 100644 test/conformance/adapters/cuda/CMakeLists.txt create mode 100644 
test/conformance/adapters/cuda/cuda_fixtures.h create mode 100644 test/conformance/adapters/cuda/cuda_urContextGetNativeHandle.cpp create mode 100644 test/conformance/adapters/cuda/cuda_urDeviceCreateWithNativeHandle.cpp create mode 100644 test/conformance/adapters/cuda/cuda_urDeviceGetNativeHandle.cpp create mode 100644 test/conformance/adapters/cuda/cuda_urEventCreateWithNativeHandle.cpp create mode 100644 test/conformance/adapters/cuda/cuda_urEventGetNativeHandle.cpp diff --git a/test/conformance/CMakeLists.txt b/test/conformance/CMakeLists.txt index 5c14d3e34a..fe6eb43f9b 100644 --- a/test/conformance/CMakeLists.txt +++ b/test/conformance/CMakeLists.txt @@ -45,6 +45,7 @@ function(add_conformance_test_with_platform_environment name) endfunction() add_subdirectory(testing) +add_subdirectory(adapters) add_subdirectory(platform) add_subdirectory(device) diff --git a/test/conformance/adapters/CMakeLists.txt b/test/conformance/adapters/CMakeLists.txt new file mode 100644 index 0000000000..c49bdfd412 --- /dev/null +++ b/test/conformance/adapters/CMakeLists.txt @@ -0,0 +1,3 @@ +if(UR_BUILD_ADAPTER_CUDA) + add_subdirectory(cuda) +endif() diff --git a/test/conformance/adapters/cuda/CMakeLists.txt b/test/conformance/adapters/cuda/CMakeLists.txt new file mode 100644 index 0000000000..bed37d8070 --- /dev/null +++ b/test/conformance/adapters/cuda/CMakeLists.txt @@ -0,0 +1,15 @@ + +add_conformance_test_with_devices_environment(adapter-cuda + cuda_fixtures.h + cuda_urContextGetNativeHandle.cpp + cuda_urDeviceGetNativeHandle.cpp + cuda_urDeviceCreateWithNativeHandle.cpp + cuda_urEventGetNativeHandle.cpp + cuda_urEventCreateWithNativeHandle.cpp +) +target_link_libraries(test-adapter-cuda PRIVATE cudadrv) + +set_tests_properties(adapter-cuda PROPERTIES + LABELS "conformance:cuda" + ENVIRONMENT "UR_ADAPTERS_FORCE_LOAD=\"$\"" + ) diff --git a/test/conformance/adapters/cuda/cuda_fixtures.h b/test/conformance/adapters/cuda/cuda_fixtures.h new file mode 100644 index 
0000000000..2624abc434 --- /dev/null +++ b/test/conformance/adapters/cuda/cuda_fixtures.h @@ -0,0 +1,38 @@ +#ifndef UR_TEST_CONFORMANCE_ADAPTERS_CUDA_FIXTURES_H_INCLUDED +#define UR_TEST_CONFORMANCE_ADAPTERS_CUDA_FIXTURES_H_INCLUDED +#include +#include + +namespace uur { +struct ResultCuda { + + constexpr ResultCuda(CUresult result) noexcept : value(result) {} + + inline bool operator==(const ResultCuda &rhs) const noexcept { + return rhs.value == value; + } + + CUresult value; +}; +} // namespace uur + +#ifndef ASSERT_EQ_RESULT_CUDA +#define ASSERT_EQ_RESULT_CUDA(EXPECTED, ACTUAL) \ + ASSERT_EQ(uur::ResultCuda(EXPECTED), uur::ResultCuda(ACTUAL)) +#endif // ASSERT_EQ_RESULT_CUDA + +#ifndef ASSERT_SUCCESS_CUDA +#define ASSERT_SUCCESS_CUDA(ACTUAL) ASSERT_EQ_RESULT_CUDA(CUDA_SUCCESS, ACTUAL) +#endif // ASSERT_SUCCESS_CUDA + +#ifndef EXPECT_EQ_RESULT_CUDA +#define EXPECT_EQ_RESULT_CUDA(EXPECTED, ACTUAL) \ + EXPECT_EQ_RESULT_CUDA(uur::ResultCuda(EXPECTED), uur::ResultCuda(ACTUAL)) +#endif // EXPECT_EQ_RESULT_CUDA + +#ifndef EXPECT_SUCCESS_CUDA +#define EXPECT_SUCCESS_CUDA(ACTUAL) \ + EXPECT_EQ_RESULT_CUDA(UR_RESULT_SUCCESS, ACTUAL) +#endif // EXPECT_EQ_RESULT_CUDA + +#endif // UR_TEST_CONFORMANCE_ADAPTERS_CUDA_FIXTURES_H_INCLUDED diff --git a/test/conformance/adapters/cuda/cuda_urContextGetNativeHandle.cpp b/test/conformance/adapters/cuda/cuda_urContextGetNativeHandle.cpp new file mode 100644 index 0000000000..b9d199516d --- /dev/null +++ b/test/conformance/adapters/cuda/cuda_urContextGetNativeHandle.cpp @@ -0,0 +1,13 @@ +#include "cuda_fixtures.h" + +using urCudaContextGetNativeHandle = uur::urContextTest; +UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(urCudaContextGetNativeHandle); + +TEST_P(urCudaContextGetNativeHandle, Success) { + ur_native_handle_t native_context = nullptr; + ASSERT_SUCCESS(urContextGetNativeHandle(context, &native_context)); + CUcontext cuda_context = reinterpret_cast(native_context); + + unsigned int cudaVersion; + 
ASSERT_SUCCESS_CUDA(cuCtxGetApiVersion(cuda_context, &cudaVersion)); +} diff --git a/test/conformance/adapters/cuda/cuda_urDeviceCreateWithNativeHandle.cpp b/test/conformance/adapters/cuda/cuda_urDeviceCreateWithNativeHandle.cpp new file mode 100644 index 0000000000..89fddfaf83 --- /dev/null +++ b/test/conformance/adapters/cuda/cuda_urDeviceCreateWithNativeHandle.cpp @@ -0,0 +1,18 @@ +#include "cuda_fixtures.h" + +using urCudaDeviceCreateWithNativeHandle = uur::urPlatformTest; + +TEST_F(urCudaDeviceCreateWithNativeHandle, Success) { + // get a device from cuda + int nCudaDevices; + ASSERT_SUCCESS_CUDA(cuDeviceGetCount(&nCudaDevices)); + ASSERT_GT(nCudaDevices, 0); + CUdevice cudaDevice; + ASSERT_SUCCESS_CUDA(cuDeviceGet(&cudaDevice, 0)); + + ur_native_handle_t nativeCuda = + reinterpret_cast(cudaDevice); + ur_device_handle_t urDevice; + ASSERT_SUCCESS(urDeviceCreateWithNativeHandle(nativeCuda, platform, nullptr, + &urDevice)); +} diff --git a/test/conformance/adapters/cuda/cuda_urDeviceGetNativeHandle.cpp b/test/conformance/adapters/cuda/cuda_urDeviceGetNativeHandle.cpp new file mode 100644 index 0000000000..cfedae68ad --- /dev/null +++ b/test/conformance/adapters/cuda/cuda_urDeviceGetNativeHandle.cpp @@ -0,0 +1,15 @@ +#include "cuda_fixtures.h" + +using urCudaGetDeviceNativeHandle = uur::urDeviceTest; +UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(urCudaGetDeviceNativeHandle); + +TEST_P(urCudaGetDeviceNativeHandle, Success) { + ur_native_handle_t native_handle; + ASSERT_SUCCESS(urDeviceGetNativeHandle(device, &native_handle)); + + CUdevice cuda_device = *reinterpret_cast(&native_handle); + + char cuda_device_name[256]; + ASSERT_SUCCESS_CUDA(cuDeviceGetName(cuda_device_name, + sizeof(cuda_device_name), cuda_device)); +} diff --git a/test/conformance/adapters/cuda/cuda_urEventCreateWithNativeHandle.cpp b/test/conformance/adapters/cuda/cuda_urEventCreateWithNativeHandle.cpp new file mode 100644 index 0000000000..d85b83902f --- /dev/null +++ 
b/test/conformance/adapters/cuda/cuda_urEventCreateWithNativeHandle.cpp @@ -0,0 +1,19 @@ +#include "cuda_fixtures.h" + +using urCudaEventCreateWithNativeHandleTest = uur::urQueueTest; +UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(urCudaEventCreateWithNativeHandleTest); + +TEST_P(urCudaEventCreateWithNativeHandleTest, Success) { + + CUevent cuda_event; + ASSERT_SUCCESS_CUDA(cuEventCreate(&cuda_event, CU_EVENT_DEFAULT)); + + ur_native_handle_t native_event = + reinterpret_cast(cuda_event); + + ur_event_handle_t event = nullptr; + ASSERT_SUCCESS( + urEventCreateWithNativeHandle(native_event, context, nullptr, &event)); + + ASSERT_SUCCESS(urEventRelease(event)); +} diff --git a/test/conformance/adapters/cuda/cuda_urEventGetNativeHandle.cpp b/test/conformance/adapters/cuda/cuda_urEventGetNativeHandle.cpp new file mode 100644 index 0000000000..484b2e88c6 --- /dev/null +++ b/test/conformance/adapters/cuda/cuda_urEventGetNativeHandle.cpp @@ -0,0 +1,25 @@ +#include "cuda_fixtures.h" + +using urCudaEventGetNativeHandleTest = uur::urQueueTest; +UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(urCudaEventGetNativeHandleTest); + +TEST_P(urCudaEventGetNativeHandleTest, Success) { + constexpr size_t buffer_size = 1024; + ur_mem_handle_t mem = nullptr; + ASSERT_SUCCESS(urMemBufferCreate(context, UR_MEM_FLAG_READ_WRITE, + buffer_size, nullptr, &mem)); + + ur_event_handle_t event = nullptr; + uint8_t pattern = 6; + ASSERT_SUCCESS(urEnqueueMemBufferFill(queue, mem, &pattern, sizeof(pattern), + 0, buffer_size, 0, nullptr, &event)); + + ur_native_handle_t native_event = nullptr; + ASSERT_SUCCESS(urEventGetNativeHandle(event, &native_event)); + CUevent cuda_event = reinterpret_cast(native_event); + + ASSERT_SUCCESS_CUDA(cuEventSynchronize(cuda_event)); + + ASSERT_SUCCESS(urEventRelease(event)); + ASSERT_SUCCESS(urMemRelease(mem)); +} From ca91dc97611bc8abe542e681539be36eb07f854c Mon Sep 17 00:00:00 2001 From: Petr Vesely Date: Tue, 1 Aug 2023 12:52:48 +0100 Subject: [PATCH 14/16] [UR] Add license to missing 
files --- test/conformance/adapters/CMakeLists.txt | 5 +++++ test/conformance/adapters/cuda/CMakeLists.txt | 4 ++++ test/conformance/adapters/cuda/cuda_fixtures.h | 5 +++++ .../adapters/cuda/cuda_urContextGetNativeHandle.cpp | 5 +++++ .../adapters/cuda/cuda_urDeviceCreateWithNativeHandle.cpp | 5 +++++ .../adapters/cuda/cuda_urDeviceGetNativeHandle.cpp | 5 +++++ .../adapters/cuda/cuda_urEventCreateWithNativeHandle.cpp | 5 +++++ .../adapters/cuda/cuda_urEventGetNativeHandle.cpp | 5 +++++ 8 files changed, 39 insertions(+) diff --git a/test/conformance/adapters/CMakeLists.txt b/test/conformance/adapters/CMakeLists.txt index c49bdfd412..7b9324d5c5 100644 --- a/test/conformance/adapters/CMakeLists.txt +++ b/test/conformance/adapters/CMakeLists.txt @@ -1,3 +1,8 @@ +# Copyright (C) 2023 Intel Corporation +# Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. +# See LICENSE.TXT +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + if(UR_BUILD_ADAPTER_CUDA) add_subdirectory(cuda) endif() diff --git a/test/conformance/adapters/cuda/CMakeLists.txt b/test/conformance/adapters/cuda/CMakeLists.txt index bed37d8070..241eb87a8c 100644 --- a/test/conformance/adapters/cuda/CMakeLists.txt +++ b/test/conformance/adapters/cuda/CMakeLists.txt @@ -1,3 +1,7 @@ +# Copyright (C) 2023 Intel Corporation +# Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. +# See LICENSE.TXT +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception add_conformance_test_with_devices_environment(adapter-cuda cuda_fixtures.h diff --git a/test/conformance/adapters/cuda/cuda_fixtures.h b/test/conformance/adapters/cuda/cuda_fixtures.h index 2624abc434..e367a4aa2c 100644 --- a/test/conformance/adapters/cuda/cuda_fixtures.h +++ b/test/conformance/adapters/cuda/cuda_fixtures.h @@ -1,3 +1,8 @@ +// Copyright (C) 2022-2023 Intel Corporation +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + #ifndef UR_TEST_CONFORMANCE_ADAPTERS_CUDA_FIXTURES_H_INCLUDED #define UR_TEST_CONFORMANCE_ADAPTERS_CUDA_FIXTURES_H_INCLUDED #include diff --git a/test/conformance/adapters/cuda/cuda_urContextGetNativeHandle.cpp b/test/conformance/adapters/cuda/cuda_urContextGetNativeHandle.cpp index b9d199516d..0a2c855360 100644 --- a/test/conformance/adapters/cuda/cuda_urContextGetNativeHandle.cpp +++ b/test/conformance/adapters/cuda/cuda_urContextGetNativeHandle.cpp @@ -1,3 +1,8 @@ +// Copyright (C) 2022-2023 Intel Corporation +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. +// See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + #include "cuda_fixtures.h" using urCudaContextGetNativeHandle = uur::urContextTest; diff --git a/test/conformance/adapters/cuda/cuda_urDeviceCreateWithNativeHandle.cpp b/test/conformance/adapters/cuda/cuda_urDeviceCreateWithNativeHandle.cpp index 89fddfaf83..b116c9a5c9 100644 --- a/test/conformance/adapters/cuda/cuda_urDeviceCreateWithNativeHandle.cpp +++ b/test/conformance/adapters/cuda/cuda_urDeviceCreateWithNativeHandle.cpp @@ -1,3 +1,8 @@ +// Copyright (C) 2022-2023 Intel Corporation +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + #include "cuda_fixtures.h" using urCudaDeviceCreateWithNativeHandle = uur::urPlatformTest; diff --git a/test/conformance/adapters/cuda/cuda_urDeviceGetNativeHandle.cpp b/test/conformance/adapters/cuda/cuda_urDeviceGetNativeHandle.cpp index cfedae68ad..3d2cfd33b7 100644 --- a/test/conformance/adapters/cuda/cuda_urDeviceGetNativeHandle.cpp +++ b/test/conformance/adapters/cuda/cuda_urDeviceGetNativeHandle.cpp @@ -1,3 +1,8 @@ +// Copyright (C) 2022-2023 Intel Corporation +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. +// See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + #include "cuda_fixtures.h" using urCudaGetDeviceNativeHandle = uur::urDeviceTest; diff --git a/test/conformance/adapters/cuda/cuda_urEventCreateWithNativeHandle.cpp b/test/conformance/adapters/cuda/cuda_urEventCreateWithNativeHandle.cpp index d85b83902f..94ae9ad80b 100644 --- a/test/conformance/adapters/cuda/cuda_urEventCreateWithNativeHandle.cpp +++ b/test/conformance/adapters/cuda/cuda_urEventCreateWithNativeHandle.cpp @@ -1,3 +1,8 @@ +// Copyright (C) 2022-2023 Intel Corporation +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. +// See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + #include "cuda_fixtures.h" using urCudaEventCreateWithNativeHandleTest = uur::urQueueTest; diff --git a/test/conformance/adapters/cuda/cuda_urEventGetNativeHandle.cpp b/test/conformance/adapters/cuda/cuda_urEventGetNativeHandle.cpp index 484b2e88c6..a6185868fb 100644 --- a/test/conformance/adapters/cuda/cuda_urEventGetNativeHandle.cpp +++ b/test/conformance/adapters/cuda/cuda_urEventGetNativeHandle.cpp @@ -1,3 +1,8 @@ +// Copyright (C) 2022-2023 Intel Corporation +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + #include "cuda_fixtures.h" using urCudaEventGetNativeHandleTest = uur::urQueueTest; From 759e7535732df8f19a3134fc77178a90adf11fdf Mon Sep 17 00:00:00 2001 From: pbalcer Date: Tue, 1 Aug 2023 12:11:49 +0200 Subject: [PATCH 15/16] [ADAPTERS] update to latest from sycl --- CMakeLists.txt | 2 +- source/adapters/CMakeLists.txt | 6 ++++-- source/adapters/cuda/CMakeLists.txt | 10 ++++++---- source/adapters/hip/CMakeLists.txt | 10 ++++++---- source/adapters/level_zero/CMakeLists.txt | 10 ++++++---- .../src/memory_provider_get_last_failed.cpp | 2 +- .../src/memory_provider_internal.h | 2 +- 7 files changed, 25 insertions(+), 17 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 1107f0ac3e..9971d3f40d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -91,7 +91,7 @@ if(UR_ENABLE_TRACING) set(XPTI_DIR ${xpti_SOURCE_DIR}) set(XPTI_ENABLE_TESTS OFF CACHE INTERNAL "Turn off xptifw tests") - FetchContentSparse_Declare(xptifw https://github.com/intel/llvm.git "sycl-nightly/20230304" "xptifw") + FetchContentSparse_Declare(xptifw https://github.com/intel/llvm.git "sycl-nightly/20230703" "xptifw") FetchContent_MakeAvailable(xptifw) diff --git a/source/adapters/CMakeLists.txt b/source/adapters/CMakeLists.txt index c003671e0c..8de581ac68 100644 --- a/source/adapters/CMakeLists.txt +++ b/source/adapters/CMakeLists.txt @@ -4,7 +4,7 @@ # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception function(add_ur_adapter name) - add_library(${name} ${ARGN}) + add_ur_library(${name} ${ARGN}) if(MSVC) set(TARGET_LIBNAME ${name}) string(TOUPPER ${TARGET_LIBNAME} TARGET_LIBNAME) @@ -15,6 +15,8 @@ function(add_ur_adapter name) set_target_properties(${name} PROPERTIES LINK_FLAGS "/DEF:${ADAPTER_VERSION_SCRIPT}" ) + elseif(APPLE) + target_compile_options(${name} PRIVATE "-fvisibility=hidden") else() set(TARGET_LIBNAME lib${name}_${PROJECT_VERSION_MAJOR}.0) string(TOUPPER ${TARGET_LIBNAME} TARGET_LIBNAME) 
@@ -31,7 +33,7 @@ add_subdirectory(null) if(UR_BUILD_ADAPTER_L0 OR UR_BUILD_ADAPTER_CUDA OR UR_BUILD_ADAPTER_HIP) # fetch adapter sources from SYCL set(SYCL_ADAPTER_DIR "${CMAKE_CURRENT_SOURCE_DIR}/external") - FetchSource(https://github.com/intel/llvm.git sycl-nightly/20230717 "sycl/plugins/unified_runtime/ur" ${SYCL_ADAPTER_DIR}) + FetchSource(https://github.com/intel/llvm.git nightly-2023-08-01 "sycl/plugins/unified_runtime/ur" ${SYCL_ADAPTER_DIR}) endif() if(UR_BUILD_ADAPTER_L0) diff --git a/source/adapters/cuda/CMakeLists.txt b/source/adapters/cuda/CMakeLists.txt index 29bdc87152..f85d759c09 100644 --- a/source/adapters/cuda/CMakeLists.txt +++ b/source/adapters/cuda/CMakeLists.txt @@ -10,6 +10,10 @@ set(TARGET_NAME ur_adapter_cuda) add_ur_adapter(${TARGET_NAME} SHARED ${CUDA_DIR}/ur_interface_loader.cpp + ${CUDA_DIR}/adapter.hpp + ${CUDA_DIR}/adapter.cpp + ${CUDA_DIR}/command_buffer.hpp + ${CUDA_DIR}/command_buffer.cpp ${CUDA_DIR}/common.hpp ${CUDA_DIR}/common.cpp ${CUDA_DIR}/context.hpp @@ -33,12 +37,9 @@ add_ur_adapter(${TARGET_NAME} ${CUDA_DIR}/sampler.cpp ${CUDA_DIR}/tracing.cpp ${CUDA_DIR}/usm.cpp + ${CUDA_DIR}/usm_p2p.cpp ${CUDA_DIR}/../../ur.cpp ${CUDA_DIR}/../../ur.hpp - ${CUDA_DIR}/../../usm_allocator.cpp - ${CUDA_DIR}/../../usm_allocator.hpp - ${CUDA_DIR}/../../usm_allocator_config.cpp - ${CUDA_DIR}/../../usm_allocator_config.hpp ) set_target_properties(${TARGET_NAME} PROPERTIES @@ -69,6 +70,7 @@ endif() target_link_libraries(${TARGET_NAME} PRIVATE ${PROJECT_NAME}::headers ${PROJECT_NAME}::common + ${PROJECT_NAME}::unified_malloc_framework Threads::Threads cudadrv ) diff --git a/source/adapters/hip/CMakeLists.txt b/source/adapters/hip/CMakeLists.txt index a014725268..2f205d84e6 100644 --- a/source/adapters/hip/CMakeLists.txt +++ b/source/adapters/hip/CMakeLists.txt @@ -45,6 +45,10 @@ set(HIP_HEADERS "${UR_HIP_INCLUDE_DIR};${UR_HIP_HSA_INCLUDE_DIR}") add_ur_adapter(${TARGET_NAME} SHARED ${HIP_DIR}/ur_interface_loader.cpp + ${HIP_DIR}/adapter.hpp + 
${HIP_DIR}/adapter.cpp + ${HIP_DIR}/command_buffer.hpp + ${HIP_DIR}/command_buffer.cpp ${HIP_DIR}/common.hpp ${HIP_DIR}/common.cpp ${HIP_DIR}/context.hpp @@ -67,12 +71,9 @@ add_ur_adapter(${TARGET_NAME} ${HIP_DIR}/sampler.hpp ${HIP_DIR}/sampler.cpp ${HIP_DIR}/usm.cpp + ${HIP_DIR}/usm_p2p.cpp ${HIP_DIR}/../../ur.cpp ${HIP_DIR}/../../ur.hpp - ${HIP_DIR}/../../usm_allocator.cpp - ${HIP_DIR}/../../usm_allocator.hpp - ${HIP_DIR}/../../usm_allocator_config.cpp - ${HIP_DIR}/../../usm_allocator_config.hpp ) if(NOT MSVC) @@ -100,6 +101,7 @@ if("${UR_HIP_PLATFORM}" STREQUAL "AMD") target_link_libraries(${TARGET_NAME} PRIVATE ${PROJECT_NAME}::headers ${PROJECT_NAME}::common + ${PROJECT_NAME}::unified_malloc_framework rocmdrv ) diff --git a/source/adapters/level_zero/CMakeLists.txt b/source/adapters/level_zero/CMakeLists.txt index fa089c4b9a..c361c230d3 100644 --- a/source/adapters/level_zero/CMakeLists.txt +++ b/source/adapters/level_zero/CMakeLists.txt @@ -74,6 +74,10 @@ target_include_directories(LevelZeroLoader-Headers add_ur_adapter(${TARGET_NAME} SHARED ${L0_DIR}/ur_interface_loader.cpp + ${L0_DIR}/adapter.hpp + ${L0_DIR}/adapter.cpp + ${L0_DIR}/command_buffer.hpp + ${L0_DIR}/command_buffer.cpp ${L0_DIR}/common.hpp ${L0_DIR}/context.hpp ${L0_DIR}/device.hpp @@ -91,6 +95,7 @@ add_ur_adapter(${TARGET_NAME} ${L0_DIR}/device.cpp ${L0_DIR}/event.cpp ${L0_DIR}/usm.cpp + ${L0_DIR}/usm_p2p.cpp ${L0_DIR}/memory.cpp ${L0_DIR}/kernel.cpp ${L0_DIR}/platform.cpp @@ -98,10 +103,6 @@ add_ur_adapter(${TARGET_NAME} ${L0_DIR}/queue.cpp ${L0_DIR}/sampler.cpp ${L0_DIR}/../../ur.cpp - ${L0_DIR}/../../usm_allocator.cpp - ${L0_DIR}/../../usm_allocator.hpp - ${L0_DIR}/../../usm_allocator_config.cpp - ${L0_DIR}/../../usm_allocator_config.hpp ) set_target_properties(${TARGET_NAME} PROPERTIES @@ -112,6 +113,7 @@ set_target_properties(${TARGET_NAME} PROPERTIES target_link_libraries(${TARGET_NAME} PRIVATE ${PROJECT_NAME}::headers ${PROJECT_NAME}::common + ${PROJECT_NAME}::unified_malloc_framework 
LevelZeroLoader LevelZeroLoader-Headers ) diff --git a/source/common/unified_malloc_framework/src/memory_provider_get_last_failed.cpp b/source/common/unified_malloc_framework/src/memory_provider_get_last_failed.cpp index c439213a26..f9af93206a 100644 --- a/source/common/unified_malloc_framework/src/memory_provider_get_last_failed.cpp +++ b/source/common/unified_malloc_framework/src/memory_provider_get_last_failed.cpp @@ -14,7 +14,7 @@ extern "C" { static thread_local umf_memory_provider_handle_t lastFailedProvider = nullptr; -umf_memory_provider_handle_t *umfGetLastFailedMemoryProviderPtr() { +umf_memory_provider_handle_t *umfGetLastFailedMemoryProviderPtr(void) { return &lastFailedProvider; } } diff --git a/source/common/unified_malloc_framework/src/memory_provider_internal.h b/source/common/unified_malloc_framework/src/memory_provider_internal.h index 07befd4b4e..2bad161706 100644 --- a/source/common/unified_malloc_framework/src/memory_provider_internal.h +++ b/source/common/unified_malloc_framework/src/memory_provider_internal.h @@ -18,7 +18,7 @@ extern "C" { #endif void *umfMemoryProviderGetPriv(umf_memory_provider_handle_t hProvider); -umf_memory_provider_handle_t *umfGetLastFailedMemoryProviderPtr(); +umf_memory_provider_handle_t *umfGetLastFailedMemoryProviderPtr(void); #ifdef __cplusplus } From 1984c1ddff5dd6b7d8a01348aa98eca500986b59 Mon Sep 17 00:00:00 2001 From: Patryk Kaminski Date: Fri, 30 Jun 2023 16:21:32 +0200 Subject: [PATCH 16/16] Test parts of API with libFuzzer. 
Co-authored-by: omar.ahmed@codeplay.com --- .github/workflows/cmake.yml | 7 + source/adapters/null/ur_null.cpp | 63 +++++ test/CMakeLists.txt | 3 + test/conformance/CMakeLists.txt | 2 +- test/fuzz/CMakeLists.txt | 35 +++ test/fuzz/urFuzz.cpp | 397 +++++++++++++++++++++++++++++++ test/fuzz/utils.hpp | 199 ++++++++++++++++ 7 files changed, 705 insertions(+), 1 deletion(-) create mode 100644 test/fuzz/CMakeLists.txt create mode 100644 test/fuzz/urFuzz.cpp create mode 100644 test/fuzz/utils.hpp diff --git a/.github/workflows/cmake.yml b/.github/workflows/cmake.yml index 306552a110..a703fd8d84 100644 --- a/.github/workflows/cmake.yml +++ b/.github/workflows/cmake.yml @@ -17,6 +17,7 @@ jobs: build_type: Release compiler: {c: clang, cxx: clang++} libbacktrace: '-DVAL_USE_LIBBACKTRACE_BACKTRACE=OFF' + fuzztest: ON - os: 'ubuntu-22.04' build_type: Release compiler: {c: gcc, cxx: g++} @@ -25,6 +26,7 @@ jobs: build_type: Release compiler: {c: clang, cxx: clang++} libbacktrace: '-DVAL_USE_LIBBACKTRACE_BACKTRACE=ON' + fuzztest: ON - os: 'ubuntu-20.04' build_type: Release compiler: {c: gcc-7, cxx: g++-7} @@ -88,6 +90,11 @@ jobs: working-directory: ${{github.workspace}}/build run: ctest -C ${{matrix.build_type}} --output-on-failure -L "python|umf|loader|validation|tracing|unit|urtrace" + - name: Fuzz test + working-directory: ${{github.workspace}}/build + if: matrix.fuzztest == 'ON' + run: ctest -C ${{matrix.build_type}} --output-on-failure -L "fuzz" + adapter-build: name: Build - Adapters on Ubuntu strategy: diff --git a/source/adapters/null/ur_null.cpp b/source/adapters/null/ur_null.cpp index 18c8d89ef5..5a62761b67 100644 --- a/source/adapters/null/ur_null.cpp +++ b/source/adapters/null/ur_null.cpp @@ -163,5 +163,68 @@ context_t::context_t() { } return UR_RESULT_SUCCESS; }; + + ////////////////////////////////////////////////////////////////////////// + urDdiTable.USM.pfnHostAlloc = + [](ur_context_handle_t hContext, const ur_usm_desc_t *pUSMDesc, + ur_usm_pool_handle_t pool, 
size_t size, void **ppMem) { + if (size == 0) { + *ppMem = nullptr; + return UR_RESULT_ERROR_UNSUPPORTED_SIZE; + } + *ppMem = malloc(size); + if (ppMem == nullptr) { + return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + } + return UR_RESULT_SUCCESS; + }; + + ////////////////////////////////////////////////////////////////////////// + urDdiTable.USM.pfnDeviceAlloc = + [](ur_context_handle_t hContext, ur_device_handle_t hDevice, + const ur_usm_desc_t *pUSMDesc, ur_usm_pool_handle_t pool, + size_t size, void **ppMem) { + if (size == 0) { + *ppMem = nullptr; + return UR_RESULT_ERROR_UNSUPPORTED_SIZE; + } + *ppMem = malloc(size); + if (ppMem == nullptr) { + return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + } + return UR_RESULT_SUCCESS; + }; + + ////////////////////////////////////////////////////////////////////////// + urDdiTable.USM.pfnFree = [](ur_context_handle_t hContext, void *pMem) { + free(pMem); + return UR_RESULT_SUCCESS; + }; + + ////////////////////////////////////////////////////////////////////////// + urDdiTable.USM.pfnGetMemAllocInfo = + [](ur_context_handle_t hContext, const void *pMem, + ur_usm_alloc_info_t propName, size_t propSize, void *pPropValue, + size_t *pPropSizeRet) { + switch (propName) { + case UR_USM_ALLOC_INFO_TYPE: + *reinterpret_cast(pPropValue) = + pMem ? UR_USM_TYPE_DEVICE : UR_USM_TYPE_UNKNOWN; + if (pPropSizeRet != nullptr) { + *pPropSizeRet = sizeof(ur_usm_type_t); + } + break; + case UR_USM_ALLOC_INFO_SIZE: + *reinterpret_cast(pPropValue) = pMem ? 
SIZE_MAX : 0; + if (pPropSizeRet != nullptr) { + *pPropSizeRet = sizeof(size_t); + } + break; + default: + pPropValue = nullptr; + break; + } + return UR_RESULT_SUCCESS; + }; } } // namespace driver diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 79ca48236c..72564667ed 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -24,3 +24,6 @@ add_subdirectory(unit) if(UR_BUILD_TOOLS) add_subdirectory(tools) endif() +if(CMAKE_CXX_COMPILER_ID STREQUAL "Clang") + add_subdirectory(fuzz) +endif() diff --git a/test/conformance/CMakeLists.txt b/test/conformance/CMakeLists.txt index c078168cf9..eec9f37215 100644 --- a/test/conformance/CMakeLists.txt +++ b/test/conformance/CMakeLists.txt @@ -62,7 +62,7 @@ if(DEFINED UR_DPCXX) add_custom_target(generate_device_binaries) set(UR_CONFORMANCE_DEVICE_BINARIES_DIR - "${CMAKE_CURRENT_BINARY_DIR}/device_binaries/") + "${CMAKE_CURRENT_BINARY_DIR}/device_binaries" CACHE INTERNAL UR_CONFORMANCE_DEVICE_BINARIES_DIR) file(MAKE_DIRECTORY ${UR_CONFORMANCE_DEVICE_BINARIES_DIR}) if(DEFINED UR_CONFORMANCE_TARGET_TRIPLES) diff --git a/test/fuzz/CMakeLists.txt b/test/fuzz/CMakeLists.txt new file mode 100644 index 0000000000..8444e67c64 --- /dev/null +++ b/test/fuzz/CMakeLists.txt @@ -0,0 +1,35 @@ +# Copyright (C) 2023 Intel Corporation +# Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. 
+# See LICENSE.TXT +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +function(add_fuzz_test name) + set(TEST_TARGET_NAME fuzztest-${name}) + add_ur_executable(${TEST_TARGET_NAME} + ${ARGN}) + target_link_libraries(${TEST_TARGET_NAME} + PRIVATE + ${PROJECT_NAME}::loader + ${PROJECT_NAME}::headers + ${PROJECT_NAME}::common + -fsanitize=fuzzer -fprofile-instr-generate -fcoverage-mapping) + add_test(NAME ${TEST_TARGET_NAME} + COMMAND ${TEST_TARGET_NAME} -max_total_time=600 -seed=1 -shrink=1 -verbosity=1 + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) + set_tests_properties(${TEST_TARGET_NAME} PROPERTIES + LABELS "fuzz" + ENVIRONMENT + "XPTI_TRACE_ENABLE=1" + "XPTI_FRAMEWORK_DISPATCHER=$" + "XPTI_SUBSCRIBERS=$" + "UR_ENABLE_LAYERS=UR_LAYER_TRACING" + "UR_ADAPTERS_FORCE_LOAD=\"$\"") + target_compile_options(${TEST_TARGET_NAME} PRIVATE -g -fsanitize=fuzzer -fprofile-instr-generate -fcoverage-mapping) + target_compile_definitions(${TEST_TARGET_NAME} PRIVATE -DKERNEL_IL_PATH="${UR_CONFORMANCE_DEVICE_BINARIES_DIR}/bar/sycl_spir641.spv") + target_include_directories(${TEST_TARGET_NAME} PRIVATE ${UR_CONFORMANCE_DEVICE_BINARIES_DIR}) + + add_dependencies(${TEST_TARGET_NAME} generate_device_binaries) +endfunction() + +add_fuzz_test(base + urFuzz.cpp) diff --git a/test/fuzz/urFuzz.cpp b/test/fuzz/urFuzz.cpp new file mode 100644 index 0000000000..f88b496051 --- /dev/null +++ b/test/fuzz/urFuzz.cpp @@ -0,0 +1,397 @@ +// Copyright (C) 2023 Intel Corporation +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "kernel_entry_points.h" +#include "ur_api.h" +#include "utils.hpp" + +namespace fuzz { + +int ur_platform_get(TestState &state) { + ur_result_t res = urPlatformGet( + state.adapters.data(), state.adapters.size(), state.num_entries, + state.platforms.data(), &state.num_platforms); + if (res != UR_RESULT_SUCCESS) { + return -1; + } + if (state.platforms.size() != state.num_platforms) { + state.platforms.resize(state.num_platforms); + } + + return 0; +} + +int ur_device_get(TestState &state) { + if (state.platforms.empty() || + state.platform_num >= state.platforms.size() || + state.platforms[0] == nullptr) { + return -1; + } + + ur_result_t res = urDeviceGet(state.platforms[state.platform_num], + state.device_type, state.num_entries, + state.devices.data(), &state.num_devices); + if (res != UR_RESULT_SUCCESS) { + return -1; + } + if (state.devices.size() != state.num_devices) { + state.devices.resize(state.num_devices); + } + + return 0; +} + +int ur_device_release(TestState &state) { + if (state.devices.empty()) { + return -1; + } + + ur_result_t res = urDeviceRelease(state.devices.back()); + if (res == UR_RESULT_SUCCESS) { + state.devices.pop_back(); + } + + return 0; +} + +int ur_context_create(TestState &state) { + if (!check_device_exists(&state)) { + return -1; + } + + ur_context_handle_t context; + ur_result_t res = urContextCreate(state.devices.size(), + state.devices.data(), nullptr, &context); + if (res == UR_RESULT_SUCCESS) { + state.contexts.push_back(context); + } + + return 0; +} + +int ur_context_release(TestState &state) { + if (!check_context_exists(&state) || + state.contexts[state.context_num] == state.contexts.back()) { + return -1; + } + + ur_result_t res = urContextRelease(state.contexts.back()); + if (res == UR_RESULT_SUCCESS) { + state.contexts.pop_back(); + } + + return 0; +} + +int pool_create( + TestState &state, + std::map> &pool_allocs) { + if 
(!check_context_exists(&state)) { + return -1; + } + + ur_usm_pool_handle_t pool; + ur_usm_pool_desc_t pool_desc{UR_STRUCTURE_TYPE_USM_POOL_DESC, nullptr, + UR_USM_POOL_FLAG_ZERO_INITIALIZE_BLOCK}; + ur_result_t res = + urUSMPoolCreate(state.contexts[state.context_num], &pool_desc, &pool); + if (res == UR_RESULT_SUCCESS) { + pool_allocs[pool] = {}; + } + + return 0; +} + +int ur_usm_pool_create_host(TestState &state) { + return pool_create(state, state.pool_host_allocs); +} + +int ur_usm_pool_create_device(TestState &state) { + return pool_create(state, state.pool_device_allocs); +} + +int pool_release( + TestState &state, + std::map> &pool_allocs) { + if (!check_context_exists(&state)) { + return -1; + } + + uint8_t pool_num; + if (get_next_input_data(&state.input, &pool_num) != 0) { + return -1; + } + if (!check_pool_exists(&pool_allocs, pool_num)) { + return -1; + } + + auto &[pool, allocs] = *get_map_item(&pool_allocs, pool_num); + if (allocs.empty()) { + return -1; + } + + ur_result_t res = urUSMPoolRelease(pool); + if (res == UR_RESULT_SUCCESS) { + pool_allocs.erase(pool); + } + + return 0; +} + +int ur_usm_pool_release_host(TestState &state) { + return pool_release(state, state.pool_host_allocs); +} + +int ur_usm_pool_release_device(TestState &state) { + return pool_release(state, state.pool_device_allocs); +} + +int alloc_setup(TestState &state, uint16_t &alloc_size) { + if (!check_context_exists(&state)) { + return -1; + } + + if (get_next_input_data(&state.input, &alloc_size) != 0) { + return -1; + } + + return 0; +} + +int host_alloc(TestState &state, ur_usm_pool_handle_t pool, + std::vector &allocs) { + void *ptr; + uint16_t alloc_size; + ur_result_t res = UR_RESULT_SUCCESS; + + int ret = alloc_setup(state, alloc_size); + if (ret != 0) { + return -1; + } + + auto &context = state.contexts[state.context_num]; + res = urUSMHostAlloc(context, nullptr, pool, alloc_size, &ptr); + if (res == UR_RESULT_SUCCESS) { + allocs.push_back(ptr); + } + + return 0; +} + 
+int get_alloc_pool( + TestState &state, + std::map> &pool_map, + ur_usm_pool_handle_t pool, std::vector &allocs) { + uint8_t pool_num; + + if (get_next_input_data(&state.input, &pool_num) != 0) { + return -1; + } + if (!check_pool_exists(&pool_map, pool_num)) { + return -1; + } + + auto &[pool_tmp, allocs_tmp] = *get_map_item(&pool_map, pool_num); + pool = pool_tmp; + allocs = allocs_tmp; + + return 0; +} + +int ur_usm_host_alloc_pool(TestState &state) { + ur_usm_pool_handle_t pool = nullptr; + std::vector allocs; + + get_alloc_pool(state, state.pool_host_allocs, pool, allocs); + return host_alloc(state, pool, allocs); +} + +int ur_usm_host_alloc_no_pool(TestState &state) { + return host_alloc(state, nullptr, state.no_pool_host_allocs); +} + +int device_alloc(TestState &state, ur_usm_pool_handle_t pool, + std::vector &allocs) { + void *ptr; + uint16_t alloc_size; + ur_result_t res = UR_RESULT_SUCCESS; + + int ret = alloc_setup(state, alloc_size); + if (ret != 0) { + return -1; + } + + if (!check_device_exists(&state)) { + return -1; + } + + auto &context = state.contexts[state.context_num]; + auto &device = state.devices[state.device_num]; + res = urUSMDeviceAlloc(context, device, nullptr, pool, alloc_size, &ptr); + if (res == UR_RESULT_SUCCESS) { + allocs.push_back(ptr); + } + + return 0; +} + +int ur_usm_device_alloc_pool(TestState &state) { + ur_usm_pool_handle_t pool = nullptr; + std::vector allocs; + + get_alloc_pool(state, state.pool_device_allocs, pool, allocs); + return device_alloc(state, pool, allocs); +} + +int ur_usm_device_alloc_no_pool(TestState &state) { + return device_alloc(state, nullptr, state.no_pool_device_allocs); +} + +int free_pool(TestState &state, + std::map> &pool_map) { + if (pool_map.empty()) { + return -1; + } + + ur_usm_pool_handle_t pool = nullptr; + std::vector allocs; + + int ret = get_alloc_pool(state, pool_map, pool, allocs); + if (ret != 0 || allocs.empty()) { + return -1; + } + + urUSMFree(state.contexts[state.context_num], 
allocs.back()); + allocs.pop_back(); + + return 0; +} + +int ur_usm_free_host_pool(TestState &state) { + return free_pool(state, state.pool_host_allocs); +} + +int ur_usm_free_device_pool(TestState &state) { + return free_pool(state, state.pool_device_allocs); +} + +int free_no_pool(TestState &state, std::vector allocs) { + if (allocs.empty()) { + return -1; + } + + urUSMFree(state.contexts[state.context_num], allocs.back()); + allocs.pop_back(); + + return 0; +} + +int ur_usm_free_host_no_pool(TestState &state) { + return free_no_pool(state, state.no_pool_host_allocs); +} + +int ur_usm_free_device_no_pool(TestState &state) { + return free_no_pool(state, state.no_pool_device_allocs); +} + +// TODO: Extract API calls to separate functions +int ur_program_create_with_il(TestState &state) { + if (!check_context_exists(&state) || !check_device_exists(&state)) { + return -1; + } + + std::vector il_bin; + ur_program_handle_t program = nullptr; + ur_kernel_handle_t kernel = nullptr; + ur_queue_handle_t queue = nullptr; + ur_event_handle_t event = nullptr; + auto &context = state.contexts[state.context_num]; + auto &device = state.devices[state.device_num]; + std::string kernel_name = + uur::device_binaries::program_kernel_map["bar"][0]; + + load_kernel_source(il_bin); + urProgramCreateWithIL(context, il_bin.data(), il_bin.size(), nullptr, + &program); + urProgramBuild(context, program, nullptr); + urKernelCreate(program, kernel_name.data(), &kernel); + urQueueCreate(context, device, nullptr, &queue); + + const uint32_t nDim = 3; + const size_t gWorkOffset[] = {0, 0, 0}; + const size_t gWorkSize[] = {128, 128, 128}; + + urEnqueueKernelLaunch(queue, kernel, nDim, gWorkOffset, gWorkSize, nullptr, + 0, nullptr, &event); + + urEventWait(1, &event); + urEventRelease(event); + urQueueFinish(queue); + urQueueRelease(queue); + urKernelRelease(kernel); + urProgramRelease(program); + + return 0; +} + +extern "C" int LLVMFuzzerTestOneInput(uint8_t *data, size_t size) { + int 
next_api_call; + TestState test_state; + int ret = -1; + + int (*api_wrappers[])(TestState &) = { + ur_platform_get, + ur_device_get, + ur_device_release, + ur_context_create, + ur_context_release, + ur_usm_pool_create_host, + ur_usm_pool_create_device, + ur_usm_pool_release_host, + ur_usm_pool_release_device, + ur_usm_host_alloc_pool, + ur_usm_host_alloc_no_pool, + ur_usm_device_alloc_pool, + ur_usm_device_alloc_no_pool, + ur_usm_free_host_pool, + ur_usm_free_host_no_pool, + ur_usm_free_device_pool, + ur_usm_free_device_no_pool, + ur_program_create_with_il, + }; + + test_state.input = {data, size}; + + ret = init_random_data(&test_state); + if (ret == -1) { + return ret; + } + + urLoaderConfigCreate(&test_state.config); + urLoaderConfigEnableLayer(test_state.config, "UR_LAYER_FULL_VALIDATION"); + ur_result_t res = urInit(0, test_state.config); + if (res != UR_RESULT_SUCCESS) { + return -1; + } + + test_state.adapters.resize(test_state.num_entries); + res = urAdapterGet(test_state.num_entries, test_state.adapters.data(), + &test_state.num_adapters); + if (res != UR_RESULT_SUCCESS || test_state.num_adapters == 0) { + return -1; + } + + while ((next_api_call = get_next_api_call(&test_state.input)) != -1) { + ret = api_wrappers[next_api_call](test_state); + if (ret) { + cleanup(&test_state); + return -1; + } + } + + cleanup(&test_state); + return 0; +} +} // namespace fuzz diff --git a/test/fuzz/utils.hpp b/test/fuzz/utils.hpp new file mode 100644 index 0000000000..330473a4b4 --- /dev/null +++ b/test/fuzz/utils.hpp @@ -0,0 +1,199 @@ +// Copyright (C) 2023 Intel Corporation +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include +#include +#include +#include +#include + +namespace fuzz { + +enum FuzzerAPICall { + UR_PLATFORM_GET, + UR_DEVICE_GET, + UR_DEVICE_RELEASE, + UR_CONTEXT_CREATE, + UR_CONTEXT_RELEASE, + UR_USM_POOL_CREATE_HOST, + UR_USM_POOL_CREATE_DEVICE, + UR_USM_POOL_RELEASE_HOST, + UR_USM_POOL_RELEASE_DEVICE, + UR_USM_HOST_ALLOC_POOL, + UR_USM_HOST_ALLOC_NO_POOL, + UR_USM_DEVICE_ALLOC_POOL, + UR_USM_DEVICE_ALLOC_NO_POOL, + UR_USM_FREE_HOST_POOL, + UR_USM_FREE_HOST_NO_POOL, + UR_USM_FREE_DEVICE_POOL, + UR_USM_FREE_DEVICE_NO_POOL, + UR_PROGRAM_CREATE_WITH_IL, + UR_MAX_FUZZER_API_CALL, +}; + +typedef struct FuzzerInput { + uint8_t *data_ptr; + size_t data_size; +} FuzzerInput; + +typedef struct TestState { + static constexpr uint32_t num_entries = 1; + + FuzzerInput input; + ur_loader_config_handle_t config; + + std::vector adapters; + std::vector platforms; + std::vector devices; + std::vector contexts; + std::map> pool_host_allocs; + std::map> pool_device_allocs; + std::vector no_pool_host_allocs; + std::vector no_pool_device_allocs; + ur_device_type_t device_type = UR_DEVICE_TYPE_ALL; + + uint32_t num_adapters; + uint32_t num_platforms; + uint32_t num_devices; + + uint8_t platform_num; + uint8_t device_num; + uint8_t context_num; + uint8_t device_type_fuzz; +} TestState; + +////////////////////////////////////////////////////////////////////////////// +template int get_next_input_data(FuzzerInput *input, T *out_data) { + size_t out_data_size = sizeof(out_data); + if (input->data_size == 0 || input->data_size < out_data_size) { + return -1; + } + *out_data = *input->data_ptr; + input->data_ptr += out_data_size; + input->data_size -= out_data_size; + + return 0; +} + +int init_random_data(TestState *state) { + if (state->input.data_size < 5) { + return -1; + } + get_next_input_data(&state->input, &state->platform_num); + get_next_input_data(&state->input, &state->device_num); + 
get_next_input_data(&state->input, &state->context_num); + get_next_input_data(&state->input, &state->device_type_fuzz); + if (state->device_type_fuzz < 1 || state->device_type_fuzz > 7) { + return -1; + } + state->device_type = static_cast(state->device_type_fuzz); + + return 0; +} + +int get_next_api_call(FuzzerInput *input) { + uint8_t next_api_call; + if (get_next_input_data(input, &next_api_call) != 0) { + return -1; + } + return next_api_call % UR_MAX_FUZZER_API_CALL; +} + +bool check_device_exists(const TestState *state) { + if (state->devices.empty() || state->device_num >= state->devices.size() || + state->devices[0] == nullptr) { + return false; + } + + return true; +} + +bool check_context_exists(const TestState *state) { + if (state->contexts.empty() || + state->context_num >= state->contexts.size() || + state->contexts[0] == nullptr) { + return false; + } + + return true; +} + +bool check_pool_exists( + const std::map> *map, + const uint8_t pool_num) { + if (pool_num >= map->size()) { + return false; + } + + return true; +} + +auto get_map_item(std::map> *map, + const uint8_t item_index) { + auto map_it = map->begin(); + std::advance(map_it, item_index); + + return map_it; +} + +int load_kernel_source(std::vector &binary_out) { + std::string source_path = KERNEL_IL_PATH; + + std::ifstream source_file; + source_file.open(source_path, + std::ios::binary | std::ios::in | std::ios::ate); + if (!source_file.is_open()) { + std::cerr << "Failed to open a kernel source file: " << source_path + << std::endl; + return -1; + } + + size_t source_size = static_cast(source_file.tellg()); + source_file.seekg(0, std::ios::beg); + + std::vector device_binary(source_size); + source_file.read(device_binary.data(), source_size); + if (!source_file) { + source_file.close(); + std::cerr << "failed reading kernel source data from file: " + << source_path << std::endl; + return -1; + } + source_file.close(); + + binary_out = std::vector(std::move(device_binary)); + + return 
0; +} + +void cleanup(TestState *state) { + urLoaderConfigRelease(state->config); + + for (auto &[pool, allocs] : state->pool_host_allocs) { + for (auto &alloc : allocs) { + urUSMFree(state->contexts[state->context_num], alloc); + } + urUSMPoolRelease(pool); + } + for (auto &[pool, allocs] : state->pool_device_allocs) { + for (auto &alloc : allocs) { + urUSMFree(state->contexts[state->context_num], alloc); + } + urUSMPoolRelease(pool); + } + for (auto &alloc : state->no_pool_host_allocs) { + urUSMFree(state->contexts[state->context_num], alloc); + } + for (auto &alloc : state->no_pool_device_allocs) { + urUSMFree(state->contexts[state->context_num], alloc); + } + for (auto &context : state->contexts) { + urContextRelease(context); + } + for (auto &device : state->devices) { + urDeviceRelease(device); + } +} +} // namespace fuzz