diff --git a/.github/scripts/get_system_info.sh b/.github/scripts/get_system_info.sh index 6ca38a33ef..8301c99099 100755 --- a/.github/scripts/get_system_info.sh +++ b/.github/scripts/get_system_info.sh @@ -53,7 +53,7 @@ function system_info { echo "**********/proc/meminfo**********" cat /proc/meminfo echo "**********build/bin/urinfo**********" - $(dirname "$(readlink -f "$0")")/../../build/bin/urinfo || true + $(dirname "$(readlink -f "$0")")/../../build/bin/urinfo --no-linear-ids --verbose || true echo "******OpenCL*******" # The driver version of OpenCL Graphics is the compute-runtime version clinfo || echo "OpenCL not installed" diff --git a/CMakeLists.txt b/CMakeLists.txt index f25c8c14ac..7dfc5c017b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -3,7 +3,7 @@ # See LICENSE.TXT # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -cmake_minimum_required(VERSION 3.14.0 FATAL_ERROR) +cmake_minimum_required(VERSION 3.20.0 FATAL_ERROR) project(unified-runtime VERSION 0.9.0) include(GNUInstallDirs) diff --git a/README.md b/README.md index db5292a9c3..d7caf11c2b 100644 --- a/README.md +++ b/README.md @@ -13,7 +13,6 @@ ## Table of contents - [Unified Runtime](#unified-runtime) - - [Adapters](#adapters) - [Table of contents](#table-of-contents) - [Contents of the repo](#contents-of-the-repo) - [Integration](#integration) @@ -29,7 +28,7 @@ - [Adapter naming convention](#adapter-naming-convention) - [Source code generation](#source-code-generation) - [Documentation](#documentation) -6. [Release Process](#release-process) + - [Release Process](#release-process) ## Contents of the repo @@ -88,7 +87,7 @@ for more detailed instructions on the correct setup. 
Required packages: - C++ compiler with C++17 support -- [CMake](https://cmake.org/) >= 3.14.0 +- [CMake](https://cmake.org/) >= 3.20.0 - Python v3.6.6 or later ### Windows @@ -141,6 +140,7 @@ List of options provided by CMake: | UR_HIP_PLATFORM | Build HIP adapter for AMD or NVIDIA platform | AMD/NVIDIA | AMD | | UR_ENABLE_COMGR | Enable comgr lib usage | AMD/NVIDIA | AMD | | UR_DPCXX | Path of the DPC++ compiler executable to build CTS device binaries | File path | `""` | +| UR_DEVICE_CODE_EXTRACTOR | Path of the `clang-offload-extract` executable from the DPC++ package, required for CTS device binaries | File path | `"${dirname(UR_DPCXX)}/clang-offload-extract"` | | UR_DPCXX_BUILD_FLAGS | Build flags to pass to DPC++ when compiling device programs | Space-separated options list | `""` | | UR_SYCL_LIBRARY_DIR | Path of the SYCL runtime library directory to build CTS device binaries | Directory path | `""` | | UR_HIP_ROCM_DIR | Path of the default ROCm HIP installation | Directory path | `/opt/rocm` | diff --git a/include/ur_api.h b/include/ur_api.h index 93dd26cd50..3c6dcd75b4 100644 --- a/include/ur_api.h +++ b/include/ur_api.h @@ -222,6 +222,7 @@ typedef enum ur_function_t { UR_FUNCTION_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_EXP = 220, ///< Enumerator for ::urCommandBufferUpdateKernelLaunchExp UR_FUNCTION_COMMAND_BUFFER_GET_INFO_EXP = 221, ///< Enumerator for ::urCommandBufferGetInfoExp UR_FUNCTION_COMMAND_BUFFER_COMMAND_GET_INFO_EXP = 222, ///< Enumerator for ::urCommandBufferCommandGetInfoExp + UR_FUNCTION_ENQUEUE_TIMESTAMP_RECORDING_EXP = 223, ///< Enumerator for ::urEnqueueTimestampRecordingExp /// @cond UR_FUNCTION_FORCE_UINT32 = 0x7fffffff /// @endcond @@ -1641,6 +1642,7 @@ typedef enum ur_device_info_t { ///< backed 3D sampled image data. UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_3D_EXP = 0x2017, ///< [::ur_bool_t] returns true if the device is capable of fetching ///< non-USM backed 3D sampled image data. 
+ UR_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP = 0x2018, ///< [::ur_bool_t] returns true if the device supports timestamp recording /// @cond UR_DEVICE_INFO_FORCE_UINT32 = 0x7fffffff /// @endcond @@ -1666,7 +1668,7 @@ typedef enum ur_device_info_t { /// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE /// + `NULL == hDevice` /// - ::UR_RESULT_ERROR_INVALID_ENUMERATION -/// + `::UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_3D_EXP < propName` +/// + `::UR_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP < propName` /// - ::UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION /// + If `propName` is not supported by the adapter. /// - ::UR_RESULT_ERROR_INVALID_SIZE @@ -5618,6 +5620,7 @@ typedef enum ur_command_t { UR_COMMAND_COMMAND_BUFFER_ENQUEUE_EXP = 0x1000, ///< Event created by ::urCommandBufferEnqueueExp UR_COMMAND_INTEROP_SEMAPHORE_WAIT_EXP = 0x2000, ///< Event created by ::urBindlessImagesWaitExternalSemaphoreExp UR_COMMAND_INTEROP_SEMAPHORE_SIGNAL_EXP = 0x2001, ///< Event created by ::urBindlessImagesSignalExternalSemaphoreExp + UR_COMMAND_TIMESTAMP_RECORDING_EXP = 0x2002, ///< Event created by ::urEnqueueTimestampRecordingExp /// @cond UR_COMMAND_FORCE_UINT32 = 0x7fffffff /// @endcond @@ -8890,6 +8893,46 @@ urKernelSuggestMaxCooperativeGroupCountExp( uint32_t *pGroupCountRet ///< [out] pointer to maximum number of groups ); +#if !defined(__GNUC__) +#pragma endregion +#endif +// Intel 'oneAPI' Unified Runtime Experimental APIs for enqueuing timestamp recordings +#if !defined(__GNUC__) +#pragma region enqueue timestamp recording(experimental) +#endif +/////////////////////////////////////////////////////////////////////////////// +/// @brief Enqueue a command for recording the device timestamp +/// +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_UNINITIALIZED +/// - ::UR_RESULT_ERROR_DEVICE_LOST +/// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC +/// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE +/// + `NULL == hQueue` +/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER +/// + `NULL == 
phEvent` +/// - ::UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST +UR_APIEXPORT ur_result_t UR_APICALL +urEnqueueTimestampRecordingExp( + ur_queue_handle_t hQueue, ///< [in] handle of the queue object + bool blocking, ///< [in] indicates whether the call to this function should block until + ///< the device timestamp recording command has executed on the + ///< device. + uint32_t numEventsInWaitList, ///< [in] size of the event wait list + const ur_event_handle_t *phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the kernel execution. + ///< If nullptr, the numEventsInWaitList must be 0, indicating no wait + ///< events. + ur_event_handle_t *phEvent ///< [in,out] return an event object that identifies this particular kernel + ///< execution instance. Profiling information can be queried + ///< from this event as if `hQueue` had profiling enabled. Querying + ///< `UR_PROFILING_INFO_COMMAND_QUEUED` or `UR_PROFILING_INFO_COMMAND_SUBMIT` + ///< reports the timestamp at the time of the call to this function. + ///< Querying `UR_PROFILING_INFO_COMMAND_START` or `UR_PROFILING_INFO_COMMAND_END` + ///< reports the timestamp recorded when the command is executed on the device. 
+); + #if !defined(__GNUC__) #pragma endregion #endif @@ -10600,6 +10643,18 @@ typedef struct ur_enqueue_cooperative_kernel_launch_exp_params_t { ur_event_handle_t **pphEvent; } ur_enqueue_cooperative_kernel_launch_exp_params_t; +/////////////////////////////////////////////////////////////////////////////// +/// @brief Function parameters for urEnqueueTimestampRecordingExp +/// @details Each entry is a pointer to the parameter passed to the function; +/// allowing the callback the ability to modify the parameter's value +typedef struct ur_enqueue_timestamp_recording_exp_params_t { + ur_queue_handle_t *phQueue; + bool *pblocking; + uint32_t *pnumEventsInWaitList; + const ur_event_handle_t **pphEventWaitList; + ur_event_handle_t **pphEvent; +} ur_enqueue_timestamp_recording_exp_params_t; + /////////////////////////////////////////////////////////////////////////////// /// @brief Function parameters for urBindlessImagesUnsampledImageHandleDestroyExp /// @details Each entry is a pointer to the parameter passed to the function; diff --git a/include/ur_ddi.h b/include/ur_ddi.h index 5a069f0881..dcd8915fa2 100644 --- a/include/ur_ddi.h +++ b/include/ur_ddi.h @@ -1448,10 +1448,20 @@ typedef ur_result_t(UR_APICALL *ur_pfnEnqueueCooperativeKernelLaunchExp_t)( const ur_event_handle_t *, ur_event_handle_t *); +/////////////////////////////////////////////////////////////////////////////// +/// @brief Function-pointer for urEnqueueTimestampRecordingExp +typedef ur_result_t(UR_APICALL *ur_pfnEnqueueTimestampRecordingExp_t)( + ur_queue_handle_t, + bool, + uint32_t, + const ur_event_handle_t *, + ur_event_handle_t *); + /////////////////////////////////////////////////////////////////////////////// /// @brief Table of EnqueueExp functions pointers typedef struct ur_enqueue_exp_dditable_t { ur_pfnEnqueueCooperativeKernelLaunchExp_t pfnCooperativeKernelLaunchExp; + ur_pfnEnqueueTimestampRecordingExp_t pfnTimestampRecordingExp; } ur_enqueue_exp_dditable_t; 
/////////////////////////////////////////////////////////////////////////////// diff --git a/include/ur_print.h b/include/ur_print.h index 3126c1714b..3377980ce7 100644 --- a/include/ur_print.h +++ b/include/ur_print.h @@ -1954,6 +1954,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urPrintEnqueueWriteHostPipeParams(const stru /// - `buff_size < out_size` UR_APIEXPORT ur_result_t UR_APICALL urPrintEnqueueCooperativeKernelLaunchExpParams(const struct ur_enqueue_cooperative_kernel_launch_exp_params_t *params, char *buffer, const size_t buff_size, size_t *out_size); +/////////////////////////////////////////////////////////////////////////////// +/// @brief Print ur_enqueue_timestamp_recording_exp_params_t struct +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_INVALID_SIZE +/// - `buff_size < out_size` +UR_APIEXPORT ur_result_t UR_APICALL urPrintEnqueueTimestampRecordingExpParams(const struct ur_enqueue_timestamp_recording_exp_params_t *params, char *buffer, const size_t buff_size, size_t *out_size); + /////////////////////////////////////////////////////////////////////////////// /// @brief Print ur_bindless_images_unsampled_image_handle_destroy_exp_params_t struct /// @returns diff --git a/include/ur_print.hpp b/include/ur_print.hpp index 3d48ae9a35..5472e4b35f 100644 --- a/include/ur_print.hpp +++ b/include/ur_print.hpp @@ -916,6 +916,9 @@ inline std::ostream &operator<<(std::ostream &os, enum ur_function_t value) { case UR_FUNCTION_COMMAND_BUFFER_COMMAND_GET_INFO_EXP: os << "UR_FUNCTION_COMMAND_BUFFER_COMMAND_GET_INFO_EXP"; break; + case UR_FUNCTION_ENQUEUE_TIMESTAMP_RECORDING_EXP: + os << "UR_FUNCTION_ENQUEUE_TIMESTAMP_RECORDING_EXP"; + break; default: os << "unknown enumerator"; break; @@ -2571,6 +2574,9 @@ inline std::ostream &operator<<(std::ostream &os, enum ur_device_info_t value) { case UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_3D_EXP: os << "UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_3D_EXP"; break; + case 
UR_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP: + os << "UR_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP"; + break; default: os << "unknown enumerator"; break; @@ -4280,6 +4286,18 @@ inline ur_result_t printTagged(std::ostream &os, const void *ptr, ur_device_info os << ")"; } break; + case UR_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP: { + const ur_bool_t *tptr = (const ur_bool_t *)ptr; + if (sizeof(ur_bool_t) > size) { + os << "invalid size (is: " << size << ", expected: >=" << sizeof(ur_bool_t) << ")"; + return UR_RESULT_ERROR_INVALID_SIZE; + } + os << (const void *)(tptr) << " ("; + + os << *tptr; + + os << ")"; + } break; default: os << "unknown enumerator"; return UR_RESULT_ERROR_INVALID_ENUMERATION; @@ -8788,6 +8806,9 @@ inline std::ostream &operator<<(std::ostream &os, enum ur_command_t value) { case UR_COMMAND_INTEROP_SEMAPHORE_SIGNAL_EXP: os << "UR_COMMAND_INTEROP_SEMAPHORE_SIGNAL_EXP"; break; + case UR_COMMAND_TIMESTAMP_RECORDING_EXP: + os << "UR_COMMAND_TIMESTAMP_RECORDING_EXP"; + break; default: os << "unknown enumerator"; break; @@ -14104,6 +14125,48 @@ inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct return os; } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Print operator for the ur_enqueue_timestamp_recording_exp_params_t type +/// @returns +/// std::ostream & +inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct ur_enqueue_timestamp_recording_exp_params_t *params) { + + os << ".hQueue = "; + + ur::details::printPtr(os, + *(params->phQueue)); + + os << ", "; + os << ".blocking = "; + + os << *(params->pblocking); + + os << ", "; + os << ".numEventsInWaitList = "; + + os << *(params->pnumEventsInWaitList); + + os << ", "; + os << ".phEventWaitList = {"; + for (size_t i = 0; *(params->pphEventWaitList) != NULL && i < *params->pnumEventsInWaitList; ++i) { + if (i != 0) { + os << ", "; + } + + ur::details::printPtr(os, + 
(*(params->pphEventWaitList))[i]); + } + os << "}"; + + os << ", "; + os << ".phEvent = "; + + ur::details::printPtr(os, + *(params->pphEvent)); + + return os; +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Print operator for the ur_bindless_images_unsampled_image_handle_destroy_exp_params_t type /// @returns @@ -17126,6 +17189,9 @@ inline ur_result_t UR_APICALL printFunctionParams(std::ostream &os, ur_function_ case UR_FUNCTION_ENQUEUE_COOPERATIVE_KERNEL_LAUNCH_EXP: { os << (const struct ur_enqueue_cooperative_kernel_launch_exp_params_t *)params; } break; + case UR_FUNCTION_ENQUEUE_TIMESTAMP_RECORDING_EXP: { + os << (const struct ur_enqueue_timestamp_recording_exp_params_t *)params; + } break; case UR_FUNCTION_BINDLESS_IMAGES_UNSAMPLED_IMAGE_HANDLE_DESTROY_EXP: { os << (const struct ur_bindless_images_unsampled_image_handle_destroy_exp_params_t *)params; } break; diff --git a/scripts/core/EXP-ENQUEUE-TIMESTAMP-RECORDING.rst b/scripts/core/EXP-ENQUEUE-TIMESTAMP-RECORDING.rst new file mode 100644 index 0000000000..3cf46095b2 --- /dev/null +++ b/scripts/core/EXP-ENQUEUE-TIMESTAMP-RECORDING.rst @@ -0,0 +1,70 @@ +<% + OneApi=tags['$OneApi'] + x=tags['$x'] + X=x.upper() +%> + +.. _experimental-enqueue-timestamp-recording: + +================================================================================ +Enqueue Timestamp Recording +================================================================================ + +.. warning:: + + Experimental features: + + * May be replaced, updated, or removed at any time. + * Do not require maintaining API/ABI stability of their own additions over + time. + * Do not require conformance testing of their own additions. 
+ + +Motivation +-------------------------------------------------------------------------------- +Currently, the only way to get timestamp information is through enabling +profiling on a queue and retrieving the information from events coming from +commands submitted to it. However, not all systems give full control of the +queue construction to the programmer wanting the profiling information. To amend +this, this extension adds the ability to enqueue a timestamp recording on any +queue, with or without profiling enabled. This event can in turn be queried for +the usual profiling information. + + +API +-------------------------------------------------------------------------------- + +Enums +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +* ${x}_device_info_t + * ${X}_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP + +* ${x}_command_t + * ${X}_COMMAND_TIMESTAMP_RECORDING_EXP + +Functions +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +* ${x}EnqueueTimestampRecordingExp + +Changelog +-------------------------------------------------------------------------------- + ++-----------+------------------------+ +| Revision | Changes | ++===========+========================+ +| 1.0 | Initial Draft | ++-----------+------------------------+ + + +Support +-------------------------------------------------------------------------------- + +Adapters which support this experimental feature *must* return true for the new +`${X}_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP` device info query. 
+ + +Contributors +-------------------------------------------------------------------------------- + +* Steffen Larsen `steffen.larsen@intel.com <steffen.larsen@intel.com>`_ diff --git a/scripts/core/exp-enqueue-timestamp-recording.yml b/scripts/core/exp-enqueue-timestamp-recording.yml new file mode 100644 index 0000000000..18316f734e --- /dev/null +++ b/scripts/core/exp-enqueue-timestamp-recording.yml @@ -0,0 +1,66 @@ +# +# Copyright (C) 2024 Intel Corporation +# +# Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. +# See LICENSE.TXT +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# See YaML.md for syntax definition +# +--- #-------------------------------------------------------------------------- +type: header +desc: "Intel $OneApi Unified Runtime Experimental APIs for enqueuing timestamp recordings" +ordinal: "99" +--- #-------------------------------------------------------------------------- +type: enum +extend: true
+typed_etors: true +desc: "Extension enums to $x_device_info_t to support timestamp recordings." +name: $x_device_info_t +etors: + - name: TIMESTAMP_RECORDING_SUPPORT_EXP + value: "0x2018" + desc: "[$x_bool_t] returns true if the device supports timestamp recording" +--- #-------------------------------------------------------------------------- +type: enum +extend: true +desc: "Command Type experimental enumerations." 
+name: $x_command_t +etors: + - name: TIMESTAMP_RECORDING_EXP + value: "0x2002" + desc: Event created by $xEnqueueTimestampRecordingExp +--- #-------------------------------------------------------------------------- +type: function +desc: "Enqueue a command for recording the device timestamp" +class: $xEnqueue +name: TimestampRecordingExp +params: + - type: $x_queue_handle_t + name: hQueue + desc: "[in] handle of the queue object" + - type: bool + name: blocking + desc: | + [in] indicates whether the call to this function should block until + the device timestamp recording command has executed on the + device. + - type: uint32_t + name: numEventsInWaitList + desc: "[in] size of the event wait list" + - type: "const $x_event_handle_t*" + name: phEventWaitList + desc: | + [in][optional][range(0, numEventsInWaitList)] pointer to a list of events that must be complete before the kernel execution. + If nullptr, the numEventsInWaitList must be 0, indicating no wait events. + - type: $x_event_handle_t* + name: phEvent + desc: | + [in,out] return an event object that identifies this particular kernel execution instance. Profiling information can be queried + from this event as if `hQueue` had profiling enabled. Querying `UR_PROFILING_INFO_COMMAND_QUEUED` or `UR_PROFILING_INFO_COMMAND_SUBMIT` + reports the timestamp at the time of the call to this function. Querying `UR_PROFILING_INFO_COMMAND_START` or `UR_PROFILING_INFO_COMMAND_END` + reports the timestamp recorded when the command is executed on the device. 
+returns: + - $X_RESULT_ERROR_INVALID_NULL_HANDLE + - $X_RESULT_ERROR_INVALID_NULL_POINTER + - $X_RESULT_ERROR_INVALID_EVENT_WAIT_LIST diff --git a/scripts/core/registry.yml b/scripts/core/registry.yml index 5ae70f7bd1..6a551d5821 100644 --- a/scripts/core/registry.yml +++ b/scripts/core/registry.yml @@ -580,6 +580,9 @@ etors: - name: COMMAND_BUFFER_COMMAND_GET_INFO_EXP desc: Enumerator for $xCommandBufferCommandGetInfoExp value: '222' +- name: ENQUEUE_TIMESTAMP_RECORDING_EXP + desc: Enumerator for $xEnqueueTimestampRecordingExp + value: '223' --- type: enum desc: Defines structure types diff --git a/source/adapters/cuda/command_buffer.hpp b/source/adapters/cuda/command_buffer.hpp index 84a9e0405b..d83269f2ae 100644 --- a/source/adapters/cuda/command_buffer.hpp +++ b/source/adapters/cuda/command_buffer.hpp @@ -10,155 +10,13 @@ #include #include +#include #include "context.hpp" #include "logger/ur_logger.hpp" #include #include -static inline const char *getUrResultString(ur_result_t Result) { - switch (Result) { - case UR_RESULT_SUCCESS: - return "UR_RESULT_SUCCESS"; - case UR_RESULT_ERROR_INVALID_OPERATION: - return "UR_RESULT_ERROR_INVALID_OPERATION"; - case UR_RESULT_ERROR_INVALID_QUEUE_PROPERTIES: - return "UR_RESULT_ERROR_INVALID_QUEUE_PROPERTIES"; - case UR_RESULT_ERROR_INVALID_QUEUE: - return "UR_RESULT_ERROR_INVALID_QUEUE"; - case UR_RESULT_ERROR_INVALID_VALUE: - return "UR_RESULT_ERROR_INVALID_VALUE"; - case UR_RESULT_ERROR_INVALID_CONTEXT: - return "UR_RESULT_ERROR_INVALID_CONTEXT"; - case UR_RESULT_ERROR_INVALID_PLATFORM: - return "UR_RESULT_ERROR_INVALID_PLATFORM"; - case UR_RESULT_ERROR_INVALID_BINARY: - return "UR_RESULT_ERROR_INVALID_BINARY"; - case UR_RESULT_ERROR_INVALID_PROGRAM: - return "UR_RESULT_ERROR_INVALID_PROGRAM"; - case UR_RESULT_ERROR_INVALID_SAMPLER: - return "UR_RESULT_ERROR_INVALID_SAMPLER"; - case UR_RESULT_ERROR_INVALID_BUFFER_SIZE: - return "UR_RESULT_ERROR_INVALID_BUFFER_SIZE"; - case UR_RESULT_ERROR_INVALID_MEM_OBJECT: - return 
"UR_RESULT_ERROR_INVALID_MEM_OBJECT"; - case UR_RESULT_ERROR_INVALID_EVENT: - return "UR_RESULT_ERROR_INVALID_EVENT"; - case UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST: - return "UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST"; - case UR_RESULT_ERROR_MISALIGNED_SUB_BUFFER_OFFSET: - return "UR_RESULT_ERROR_MISALIGNED_SUB_BUFFER_OFFSET"; - case UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE: - return "UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE"; - case UR_RESULT_ERROR_COMPILER_NOT_AVAILABLE: - return "UR_RESULT_ERROR_COMPILER_NOT_AVAILABLE"; - case UR_RESULT_ERROR_PROFILING_INFO_NOT_AVAILABLE: - return "UR_RESULT_ERROR_PROFILING_INFO_NOT_AVAILABLE"; - case UR_RESULT_ERROR_DEVICE_NOT_FOUND: - return "UR_RESULT_ERROR_DEVICE_NOT_FOUND"; - case UR_RESULT_ERROR_INVALID_DEVICE: - return "UR_RESULT_ERROR_INVALID_DEVICE"; - case UR_RESULT_ERROR_DEVICE_LOST: - return "UR_RESULT_ERROR_DEVICE_LOST"; - case UR_RESULT_ERROR_DEVICE_REQUIRES_RESET: - return "UR_RESULT_ERROR_DEVICE_REQUIRES_RESET"; - case UR_RESULT_ERROR_DEVICE_IN_LOW_POWER_STATE: - return "UR_RESULT_ERROR_DEVICE_IN_LOW_POWER_STATE"; - case UR_RESULT_ERROR_DEVICE_PARTITION_FAILED: - return "UR_RESULT_ERROR_DEVICE_PARTITION_FAILED"; - case UR_RESULT_ERROR_INVALID_DEVICE_PARTITION_COUNT: - return "UR_RESULT_ERROR_INVALID_DEVICE_PARTITION_COUNT"; - case UR_RESULT_ERROR_INVALID_WORK_ITEM_SIZE: - return "UR_RESULT_ERROR_INVALID_WORK_ITEM_SIZE"; - case UR_RESULT_ERROR_INVALID_WORK_DIMENSION: - return "UR_RESULT_ERROR_INVALID_WORK_DIMENSION"; - case UR_RESULT_ERROR_INVALID_KERNEL_ARGS: - return "UR_RESULT_ERROR_INVALID_KERNEL_ARGS"; - case UR_RESULT_ERROR_INVALID_KERNEL: - return "UR_RESULT_ERROR_INVALID_KERNEL"; - case UR_RESULT_ERROR_INVALID_KERNEL_NAME: - return "UR_RESULT_ERROR_INVALID_KERNEL_NAME"; - case UR_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_INDEX: - return "UR_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_INDEX"; - case UR_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_SIZE: - return "UR_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_SIZE"; - case 
UR_RESULT_ERROR_INVALID_KERNEL_ATTRIBUTE_VALUE: - return "UR_RESULT_ERROR_INVALID_KERNEL_ATTRIBUTE_VALUE"; - case UR_RESULT_ERROR_INVALID_IMAGE_SIZE: - return "UR_RESULT_ERROR_INVALID_IMAGE_SIZE"; - case UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR: - return "UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR"; - case UR_RESULT_ERROR_IMAGE_FORMAT_NOT_SUPPORTED: - return "UR_RESULT_ERROR_IMAGE_FORMAT_NOT_SUPPORTED"; - case UR_RESULT_ERROR_MEM_OBJECT_ALLOCATION_FAILURE: - return "UR_RESULT_ERROR_MEM_OBJECT_ALLOCATION_FAILURE"; - case UR_RESULT_ERROR_INVALID_PROGRAM_EXECUTABLE: - return "UR_RESULT_ERROR_INVALID_PROGRAM_EXECUTABLE"; - case UR_RESULT_ERROR_UNINITIALIZED: - return "UR_RESULT_ERROR_UNINITIALIZED"; - case UR_RESULT_ERROR_OUT_OF_HOST_MEMORY: - return "UR_RESULT_ERROR_OUT_OF_HOST_MEMORY"; - case UR_RESULT_ERROR_OUT_OF_DEVICE_MEMORY: - return "UR_RESULT_ERROR_OUT_OF_DEVICE_MEMORY"; - case UR_RESULT_ERROR_OUT_OF_RESOURCES: - return "UR_RESULT_ERROR_OUT_OF_RESOURCES"; - case UR_RESULT_ERROR_PROGRAM_BUILD_FAILURE: - return "UR_RESULT_ERROR_PROGRAM_BUILD_FAILURE"; - case UR_RESULT_ERROR_PROGRAM_LINK_FAILURE: - return "UR_RESULT_ERROR_PROGRAM_LINK_FAILURE"; - case UR_RESULT_ERROR_UNSUPPORTED_VERSION: - return "UR_RESULT_ERROR_UNSUPPORTED_VERSION"; - case UR_RESULT_ERROR_UNSUPPORTED_FEATURE: - return "UR_RESULT_ERROR_UNSUPPORTED_FEATURE"; - case UR_RESULT_ERROR_INVALID_ARGUMENT: - return "UR_RESULT_ERROR_INVALID_ARGUMENT"; - case UR_RESULT_ERROR_INVALID_NULL_HANDLE: - return "UR_RESULT_ERROR_INVALID_NULL_HANDLE"; - case UR_RESULT_ERROR_HANDLE_OBJECT_IN_USE: - return "UR_RESULT_ERROR_HANDLE_OBJECT_IN_USE"; - case UR_RESULT_ERROR_INVALID_NULL_POINTER: - return "UR_RESULT_ERROR_INVALID_NULL_POINTER"; - case UR_RESULT_ERROR_INVALID_SIZE: - return "UR_RESULT_ERROR_INVALID_SIZE"; - case UR_RESULT_ERROR_UNSUPPORTED_SIZE: - return "UR_RESULT_ERROR_UNSUPPORTED_SIZE"; - case UR_RESULT_ERROR_UNSUPPORTED_ALIGNMENT: - return "UR_RESULT_ERROR_UNSUPPORTED_ALIGNMENT"; - case 
UR_RESULT_ERROR_INVALID_SYNCHRONIZATION_OBJECT: - return "UR_RESULT_ERROR_INVALID_SYNCHRONIZATION_OBJECT"; - case UR_RESULT_ERROR_INVALID_ENUMERATION: - return "UR_RESULT_ERROR_INVALID_ENUMERATION"; - case UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION: - return "UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION"; - case UR_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT: - return "UR_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT"; - case UR_RESULT_ERROR_INVALID_NATIVE_BINARY: - return "UR_RESULT_ERROR_INVALID_NATIVE_BINARY"; - case UR_RESULT_ERROR_INVALID_GLOBAL_NAME: - return "UR_RESULT_ERROR_INVALID_GLOBAL_NAME"; - case UR_RESULT_ERROR_INVALID_FUNCTION_NAME: - return "UR_RESULT_ERROR_INVALID_FUNCTION_NAME"; - case UR_RESULT_ERROR_INVALID_GROUP_SIZE_DIMENSION: - return "UR_RESULT_ERROR_INVALID_GROUP_SIZE_DIMENSION"; - case UR_RESULT_ERROR_INVALID_GLOBAL_WIDTH_DIMENSION: - return "UR_RESULT_ERROR_INVALID_GLOBAL_WIDTH_DIMENSION"; - case UR_RESULT_ERROR_PROGRAM_UNLINKED: - return "UR_RESULT_ERROR_PROGRAM_UNLINKED"; - case UR_RESULT_ERROR_OVERLAPPING_REGIONS: - return "UR_RESULT_ERROR_OVERLAPPING_REGIONS"; - case UR_RESULT_ERROR_INVALID_HOST_PTR: - return "UR_RESULT_ERROR_INVALID_HOST_PTR"; - case UR_RESULT_ERROR_INVALID_USM_SIZE: - return "UR_RESULT_ERROR_INVALID_USM_SIZE"; - case UR_RESULT_ERROR_OBJECT_ALLOCATION_FAILURE: - return "UR_RESULT_ERROR_OBJECT_ALLOCATION_FAILURE"; - case UR_RESULT_ERROR_ADAPTER_SPECIFIC: - return "UR_RESULT_ERROR_ADAPTER_SPECIFIC"; - default: - return "UR_RESULT_ERROR_UNKNOWN"; - } -} - // Trace an internal UR call #define UR_TRACE(Call) \ { \ @@ -173,7 +31,7 @@ static inline const char *getUrResultString(ur_result_t Result) { logger::always("UR ---> {}", #Call); \ Result = (Call); \ if (PrintTrace) \ - logger::always("UR <--- {}({})", #Call, getUrResultString(Result)); \ + logger::always("UR <--- {}({})", #Call, Result); \ } // Handle to a kernel command. 
diff --git a/source/adapters/cuda/device.cpp b/source/adapters/cuda/device.cpp index cca00c0b85..949b58666e 100644 --- a/source/adapters/cuda/device.cpp +++ b/source/adapters/cuda/device.cpp @@ -950,6 +950,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, // CUDA does support fetching 3D non-USM sampled image data. return ReturnValue(true); } + case UR_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP: { + // CUDA supports recording timestamp events. + return ReturnValue(true); + } case UR_DEVICE_INFO_DEVICE_ID: { int Value = 0; UR_CHECK_ERROR(cuDeviceGetAttribute( diff --git a/source/adapters/cuda/enqueue.cpp b/source/adapters/cuda/enqueue.cpp index 6f99941095..812a41768e 100644 --- a/source/adapters/cuda/enqueue.cpp +++ b/source/adapters/cuda/enqueue.cpp @@ -1709,3 +1709,33 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueWriteHostPipe( return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueTimestampRecordingExp( + ur_queue_handle_t hQueue, bool blocking, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + + ur_result_t Result = UR_RESULT_SUCCESS; + std::unique_ptr RetImplEvent{nullptr}; + try { + ScopedContext Active(hQueue->getContext()); + CUstream CuStream = hQueue->getNextComputeStream(); + + UR_CHECK_ERROR(enqueueEventsWait(hQueue, CuStream, numEventsInWaitList, + phEventWaitList)); + + RetImplEvent = + std::unique_ptr(ur_event_handle_t_::makeNative( + UR_COMMAND_TIMESTAMP_RECORDING_EXP, hQueue, CuStream)); + UR_CHECK_ERROR(RetImplEvent->start()); + UR_CHECK_ERROR(RetImplEvent->record()); + + if (blocking) { + UR_CHECK_ERROR(cuStreamSynchronize(CuStream)); + } + + *phEvent = RetImplEvent.release(); + } catch (ur_result_t Err) { + Result = Err; + } + return Result; +} diff --git a/source/adapters/cuda/event.cpp b/source/adapters/cuda/event.cpp index ac66bf479e..1e8f2dd384 100644 --- a/source/adapters/cuda/event.cpp +++ 
b/source/adapters/cuda/event.cpp @@ -54,7 +54,7 @@ ur_result_t ur_event_handle_t_::start() { ur_result_t Result = UR_RESULT_SUCCESS; try { - if (Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE) { + if (Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE || isTimestampEvent()) { // NOTE: This relies on the default stream to be unused. UR_CHECK_ERROR(cuEventRecord(EvQueued, 0)); UR_CHECK_ERROR(cuEventRecord(EvStart, Stream)); @@ -149,7 +149,7 @@ ur_result_t ur_event_handle_t_::release() { UR_CHECK_ERROR(cuEventDestroy(EvEnd)); - if (Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE) { + if (Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE || isTimestampEvent()) { UR_CHECK_ERROR(cuEventDestroy(EvQueued)); UR_CHECK_ERROR(cuEventDestroy(EvStart)); } @@ -190,7 +190,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventGetProfilingInfo( UrReturnHelper ReturnValue(propValueSize, pPropValue, pPropValueSizeRet); ur_queue_handle_t Queue = hEvent->getQueue(); - if (Queue == nullptr || !(Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE)) { + if (Queue == nullptr || (!(Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE) && + !hEvent->isTimestampEvent())) { return UR_RESULT_ERROR_PROFILING_INFO_NOT_AVAILABLE; } diff --git a/source/adapters/cuda/event.hpp b/source/adapters/cuda/event.hpp index 390fd7833a..5ed68f0f25 100644 --- a/source/adapters/cuda/event.hpp +++ b/source/adapters/cuda/event.hpp @@ -57,6 +57,10 @@ struct ur_event_handle_t_ { return UR_EVENT_STATUS_COMPLETE; } + bool isTimestampEvent() const noexcept { + return getCommandType() == UR_COMMAND_TIMESTAMP_RECORDING_EXP; + } + ur_context_handle_t getContext() const noexcept { return Context; }; uint32_t incrementReferenceCount() { return ++RefCount; } @@ -83,13 +87,14 @@ struct ur_event_handle_t_ { static ur_event_handle_t makeNative(ur_command_t Type, ur_queue_handle_t Queue, CUstream Stream, uint32_t StreamToken = std::numeric_limits::max()) { - const bool ProfilingEnabled = - Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE; + const 
bool RequiresTimings = + Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE || + Type == UR_COMMAND_TIMESTAMP_RECORDING_EXP; native_type EvEnd = nullptr, EvQueued = nullptr, EvStart = nullptr; UR_CHECK_ERROR(cuEventCreate( - &EvEnd, ProfilingEnabled ? CU_EVENT_DEFAULT : CU_EVENT_DISABLE_TIMING)); + &EvEnd, RequiresTimings ? CU_EVENT_DEFAULT : CU_EVENT_DISABLE_TIMING)); - if (ProfilingEnabled) { + if (RequiresTimings) { UR_CHECK_ERROR(cuEventCreate(&EvQueued, CU_EVENT_DEFAULT)); UR_CHECK_ERROR(cuEventCreate(&EvStart, CU_EVENT_DEFAULT)); } diff --git a/source/adapters/cuda/ur_interface_loader.cpp b/source/adapters/cuda/ur_interface_loader.cpp index 2ffc0755ee..80fd211863 100644 --- a/source/adapters/cuda/ur_interface_loader.cpp +++ b/source/adapters/cuda/ur_interface_loader.cpp @@ -407,6 +407,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable( pDdiTable->pfnCooperativeKernelLaunchExp = urEnqueueCooperativeKernelLaunchExp; + pDdiTable->pfnTimestampRecordingExp = urEnqueueTimestampRecordingExp; return UR_RESULT_SUCCESS; } diff --git a/source/adapters/cuda/usm.cpp b/source/adapters/cuda/usm.cpp index 59d3ba70fd..6faa1e5320 100644 --- a/source/adapters/cuda/usm.cpp +++ b/source/adapters/cuda/usm.cpp @@ -33,7 +33,7 @@ urUSMHostAlloc(ur_context_handle_t hContext, const ur_usm_desc_t *pUSMDesc, UR_RESULT_ERROR_INVALID_VALUE); if (!hPool) { - return USMHostAllocImpl(ppMem, hContext, nullptr, size, alignment); + return USMHostAllocImpl(ppMem, hContext, /* flags */ 0, size, alignment); } auto UMFPool = hPool->HostMemPool.get(); @@ -57,7 +57,7 @@ urUSMDeviceAlloc(ur_context_handle_t hContext, ur_device_handle_t hDevice, UR_RESULT_ERROR_INVALID_VALUE); if (!hPool) { - return USMDeviceAllocImpl(ppMem, hContext, hDevice, nullptr, size, + return USMDeviceAllocImpl(ppMem, hContext, hDevice, /* flags */ 0, size, alignment); } @@ -82,8 +82,8 @@ urUSMSharedAlloc(ur_context_handle_t hContext, ur_device_handle_t hDevice, UR_RESULT_ERROR_INVALID_VALUE); if (!hPool) { - 
return USMSharedAllocImpl(ppMem, hContext, hDevice, nullptr, nullptr, size, - alignment); + return USMSharedAllocImpl(ppMem, hContext, hDevice, /*host flags*/ 0, + /*device flags*/ 0, size, alignment); } auto UMFPool = hPool->SharedMemPool.get(); @@ -132,7 +132,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMFree(ur_context_handle_t hContext, } ur_result_t USMDeviceAllocImpl(void **ResultPtr, ur_context_handle_t Context, - ur_device_handle_t, ur_usm_device_mem_flags_t *, + ur_device_handle_t, ur_usm_device_mem_flags_t, size_t Size, uint32_t Alignment) { try { ScopedContext Active(Context); @@ -151,8 +151,8 @@ ur_result_t USMDeviceAllocImpl(void **ResultPtr, ur_context_handle_t Context, } ur_result_t USMSharedAllocImpl(void **ResultPtr, ur_context_handle_t Context, - ur_device_handle_t, ur_usm_host_mem_flags_t *, - ur_usm_device_mem_flags_t *, size_t Size, + ur_device_handle_t, ur_usm_host_mem_flags_t, + ur_usm_device_mem_flags_t, size_t Size, uint32_t Alignment) { try { ScopedContext Active(Context); @@ -172,7 +172,7 @@ ur_result_t USMSharedAllocImpl(void **ResultPtr, ur_context_handle_t Context, } ur_result_t USMHostAllocImpl(void **ResultPtr, ur_context_handle_t Context, - ur_usm_host_mem_flags_t *, size_t Size, + ur_usm_host_mem_flags_t, size_t Size, uint32_t Alignment) { try { ScopedContext Active(Context); @@ -358,19 +358,19 @@ umf_result_t USMMemoryProvider::get_min_page_size(void *Ptr, size_t *PageSize) { ur_result_t USMSharedMemoryProvider::allocateImpl(void **ResultPtr, size_t Size, uint32_t Alignment) { - return USMSharedAllocImpl(ResultPtr, Context, Device, nullptr, nullptr, Size, - Alignment); + return USMSharedAllocImpl(ResultPtr, Context, Device, /*host flags*/ 0, + /*device flags*/ 0, Size, Alignment); } ur_result_t USMDeviceMemoryProvider::allocateImpl(void **ResultPtr, size_t Size, uint32_t Alignment) { - return USMDeviceAllocImpl(ResultPtr, Context, Device, nullptr, Size, + return USMDeviceAllocImpl(ResultPtr, Context, Device, /* flags */ 0, Size, 
Alignment); } ur_result_t USMHostMemoryProvider::allocateImpl(void **ResultPtr, size_t Size, uint32_t Alignment) { - return USMHostAllocImpl(ResultPtr, Context, nullptr, Size, Alignment); + return USMHostAllocImpl(ResultPtr, Context, /* flags */ 0, Size, Alignment); } ur_usm_pool_handle_t_::ur_usm_pool_handle_t_(ur_context_handle_t Context, diff --git a/source/adapters/cuda/usm.hpp b/source/adapters/cuda/usm.hpp index 2ec3df150f..e5d1f7fbaa 100644 --- a/source/adapters/cuda/usm.hpp +++ b/source/adapters/cuda/usm.hpp @@ -118,15 +118,15 @@ class USMHostMemoryProvider final : public USMMemoryProvider { ur_result_t USMDeviceAllocImpl(void **ResultPtr, ur_context_handle_t Context, ur_device_handle_t Device, - ur_usm_device_mem_flags_t *Flags, size_t Size, + ur_usm_device_mem_flags_t Flags, size_t Size, uint32_t Alignment); ur_result_t USMSharedAllocImpl(void **ResultPtr, ur_context_handle_t Context, ur_device_handle_t Device, - ur_usm_host_mem_flags_t *, - ur_usm_device_mem_flags_t *, size_t Size, + ur_usm_host_mem_flags_t, + ur_usm_device_mem_flags_t, size_t Size, uint32_t Alignment); ur_result_t USMHostAllocImpl(void **ResultPtr, ur_context_handle_t Context, - ur_usm_host_mem_flags_t *Flags, size_t Size, + ur_usm_host_mem_flags_t Flags, size_t Size, uint32_t Alignment); diff --git a/source/adapters/hip/command_buffer.hpp b/source/adapters/hip/command_buffer.hpp index 50fddc5448..751fde3720 100644 --- a/source/adapters/hip/command_buffer.hpp +++ b/source/adapters/hip/command_buffer.hpp @@ -10,154 +10,12 @@ #include #include +#include #include "context.hpp" #include #include -static inline const char *getUrResultString(ur_result_t Result) { - switch (Result) { - case UR_RESULT_SUCCESS: - return "UR_RESULT_SUCCESS"; - case UR_RESULT_ERROR_INVALID_OPERATION: - return "UR_RESULT_ERROR_INVALID_OPERATION"; - case UR_RESULT_ERROR_INVALID_QUEUE_PROPERTIES: - return "UR_RESULT_ERROR_INVALID_QUEUE_PROPERTIES"; - case UR_RESULT_ERROR_INVALID_QUEUE: - return 
"UR_RESULT_ERROR_INVALID_QUEUE"; - case UR_RESULT_ERROR_INVALID_VALUE: - return "UR_RESULT_ERROR_INVALID_VALUE"; - case UR_RESULT_ERROR_INVALID_CONTEXT: - return "UR_RESULT_ERROR_INVALID_CONTEXT"; - case UR_RESULT_ERROR_INVALID_PLATFORM: - return "UR_RESULT_ERROR_INVALID_PLATFORM"; - case UR_RESULT_ERROR_INVALID_BINARY: - return "UR_RESULT_ERROR_INVALID_BINARY"; - case UR_RESULT_ERROR_INVALID_PROGRAM: - return "UR_RESULT_ERROR_INVALID_PROGRAM"; - case UR_RESULT_ERROR_INVALID_SAMPLER: - return "UR_RESULT_ERROR_INVALID_SAMPLER"; - case UR_RESULT_ERROR_INVALID_BUFFER_SIZE: - return "UR_RESULT_ERROR_INVALID_BUFFER_SIZE"; - case UR_RESULT_ERROR_INVALID_MEM_OBJECT: - return "UR_RESULT_ERROR_INVALID_MEM_OBJECT"; - case UR_RESULT_ERROR_INVALID_EVENT: - return "UR_RESULT_ERROR_INVALID_EVENT"; - case UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST: - return "UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST"; - case UR_RESULT_ERROR_MISALIGNED_SUB_BUFFER_OFFSET: - return "UR_RESULT_ERROR_MISALIGNED_SUB_BUFFER_OFFSET"; - case UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE: - return "UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE"; - case UR_RESULT_ERROR_COMPILER_NOT_AVAILABLE: - return "UR_RESULT_ERROR_COMPILER_NOT_AVAILABLE"; - case UR_RESULT_ERROR_PROFILING_INFO_NOT_AVAILABLE: - return "UR_RESULT_ERROR_PROFILING_INFO_NOT_AVAILABLE"; - case UR_RESULT_ERROR_DEVICE_NOT_FOUND: - return "UR_RESULT_ERROR_DEVICE_NOT_FOUND"; - case UR_RESULT_ERROR_INVALID_DEVICE: - return "UR_RESULT_ERROR_INVALID_DEVICE"; - case UR_RESULT_ERROR_DEVICE_LOST: - return "UR_RESULT_ERROR_DEVICE_LOST"; - case UR_RESULT_ERROR_DEVICE_REQUIRES_RESET: - return "UR_RESULT_ERROR_DEVICE_REQUIRES_RESET"; - case UR_RESULT_ERROR_DEVICE_IN_LOW_POWER_STATE: - return "UR_RESULT_ERROR_DEVICE_IN_LOW_POWER_STATE"; - case UR_RESULT_ERROR_DEVICE_PARTITION_FAILED: - return "UR_RESULT_ERROR_DEVICE_PARTITION_FAILED"; - case UR_RESULT_ERROR_INVALID_DEVICE_PARTITION_COUNT: - return "UR_RESULT_ERROR_INVALID_DEVICE_PARTITION_COUNT"; - case 
UR_RESULT_ERROR_INVALID_WORK_ITEM_SIZE: - return "UR_RESULT_ERROR_INVALID_WORK_ITEM_SIZE"; - case UR_RESULT_ERROR_INVALID_WORK_DIMENSION: - return "UR_RESULT_ERROR_INVALID_WORK_DIMENSION"; - case UR_RESULT_ERROR_INVALID_KERNEL_ARGS: - return "UR_RESULT_ERROR_INVALID_KERNEL_ARGS"; - case UR_RESULT_ERROR_INVALID_KERNEL: - return "UR_RESULT_ERROR_INVALID_KERNEL"; - case UR_RESULT_ERROR_INVALID_KERNEL_NAME: - return "UR_RESULT_ERROR_INVALID_KERNEL_NAME"; - case UR_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_INDEX: - return "UR_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_INDEX"; - case UR_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_SIZE: - return "UR_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_SIZE"; - case UR_RESULT_ERROR_INVALID_KERNEL_ATTRIBUTE_VALUE: - return "UR_RESULT_ERROR_INVALID_KERNEL_ATTRIBUTE_VALUE"; - case UR_RESULT_ERROR_INVALID_IMAGE_SIZE: - return "UR_RESULT_ERROR_INVALID_IMAGE_SIZE"; - case UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR: - return "UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR"; - case UR_RESULT_ERROR_IMAGE_FORMAT_NOT_SUPPORTED: - return "UR_RESULT_ERROR_IMAGE_FORMAT_NOT_SUPPORTED"; - case UR_RESULT_ERROR_MEM_OBJECT_ALLOCATION_FAILURE: - return "UR_RESULT_ERROR_MEM_OBJECT_ALLOCATION_FAILURE"; - case UR_RESULT_ERROR_INVALID_PROGRAM_EXECUTABLE: - return "UR_RESULT_ERROR_INVALID_PROGRAM_EXECUTABLE"; - case UR_RESULT_ERROR_UNINITIALIZED: - return "UR_RESULT_ERROR_UNINITIALIZED"; - case UR_RESULT_ERROR_OUT_OF_HOST_MEMORY: - return "UR_RESULT_ERROR_OUT_OF_HOST_MEMORY"; - case UR_RESULT_ERROR_OUT_OF_DEVICE_MEMORY: - return "UR_RESULT_ERROR_OUT_OF_DEVICE_MEMORY"; - case UR_RESULT_ERROR_OUT_OF_RESOURCES: - return "UR_RESULT_ERROR_OUT_OF_RESOURCES"; - case UR_RESULT_ERROR_PROGRAM_BUILD_FAILURE: - return "UR_RESULT_ERROR_PROGRAM_BUILD_FAILURE"; - case UR_RESULT_ERROR_PROGRAM_LINK_FAILURE: - return "UR_RESULT_ERROR_PROGRAM_LINK_FAILURE"; - case UR_RESULT_ERROR_UNSUPPORTED_VERSION: - return "UR_RESULT_ERROR_UNSUPPORTED_VERSION"; - case UR_RESULT_ERROR_UNSUPPORTED_FEATURE: - 
return "UR_RESULT_ERROR_UNSUPPORTED_FEATURE"; - case UR_RESULT_ERROR_INVALID_ARGUMENT: - return "UR_RESULT_ERROR_INVALID_ARGUMENT"; - case UR_RESULT_ERROR_INVALID_NULL_HANDLE: - return "UR_RESULT_ERROR_INVALID_NULL_HANDLE"; - case UR_RESULT_ERROR_HANDLE_OBJECT_IN_USE: - return "UR_RESULT_ERROR_HANDLE_OBJECT_IN_USE"; - case UR_RESULT_ERROR_INVALID_NULL_POINTER: - return "UR_RESULT_ERROR_INVALID_NULL_POINTER"; - case UR_RESULT_ERROR_INVALID_SIZE: - return "UR_RESULT_ERROR_INVALID_SIZE"; - case UR_RESULT_ERROR_UNSUPPORTED_SIZE: - return "UR_RESULT_ERROR_UNSUPPORTED_SIZE"; - case UR_RESULT_ERROR_UNSUPPORTED_ALIGNMENT: - return "UR_RESULT_ERROR_UNSUPPORTED_ALIGNMENT"; - case UR_RESULT_ERROR_INVALID_SYNCHRONIZATION_OBJECT: - return "UR_RESULT_ERROR_INVALID_SYNCHRONIZATION_OBJECT"; - case UR_RESULT_ERROR_INVALID_ENUMERATION: - return "UR_RESULT_ERROR_INVALID_ENUMERATION"; - case UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION: - return "UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION"; - case UR_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT: - return "UR_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT"; - case UR_RESULT_ERROR_INVALID_NATIVE_BINARY: - return "UR_RESULT_ERROR_INVALID_NATIVE_BINARY"; - case UR_RESULT_ERROR_INVALID_GLOBAL_NAME: - return "UR_RESULT_ERROR_INVALID_GLOBAL_NAME"; - case UR_RESULT_ERROR_INVALID_FUNCTION_NAME: - return "UR_RESULT_ERROR_INVALID_FUNCTION_NAME"; - case UR_RESULT_ERROR_INVALID_GROUP_SIZE_DIMENSION: - return "UR_RESULT_ERROR_INVALID_GROUP_SIZE_DIMENSION"; - case UR_RESULT_ERROR_INVALID_GLOBAL_WIDTH_DIMENSION: - return "UR_RESULT_ERROR_INVALID_GLOBAL_WIDTH_DIMENSION"; - case UR_RESULT_ERROR_PROGRAM_UNLINKED: - return "UR_RESULT_ERROR_PROGRAM_UNLINKED"; - case UR_RESULT_ERROR_OVERLAPPING_REGIONS: - return "UR_RESULT_ERROR_OVERLAPPING_REGIONS"; - case UR_RESULT_ERROR_INVALID_HOST_PTR: - return "UR_RESULT_ERROR_INVALID_HOST_PTR"; - case UR_RESULT_ERROR_INVALID_USM_SIZE: - return "UR_RESULT_ERROR_INVALID_USM_SIZE"; - case UR_RESULT_ERROR_OBJECT_ALLOCATION_FAILURE: - return 
"UR_RESULT_ERROR_OBJECT_ALLOCATION_FAILURE"; - case UR_RESULT_ERROR_ADAPTER_SPECIFIC: - return "UR_RESULT_ERROR_ADAPTER_SPECIFIC"; - default: - return "UR_RESULT_ERROR_UNKNOWN"; - } -} - // Trace an internal UR call #define UR_TRACE(Call) \ { \ @@ -169,10 +27,10 @@ static inline const char *getUrResultString(ur_result_t Result) { #define UR_CALL(Call, Result) \ { \ if (PrintTrace) \ - fprintf(stderr, "UR ---> %s\n", #Call); \ + std::cerr << "UR ---> " << #Call << "\n"; \ Result = (Call); \ if (PrintTrace) \ - fprintf(stderr, "UR <--- %s(%s)\n", #Call, getUrResultString(Result)); \ + std::cerr << "UR <--- " << #Call << "(" << Result << ")\n"; \ } // Handle to a kernel command. diff --git a/source/adapters/hip/device.cpp b/source/adapters/hip/device.cpp index ae4dbe159e..b30ae7e7f3 100644 --- a/source/adapters/hip/device.cpp +++ b/source/adapters/hip/device.cpp @@ -841,6 +841,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, case UR_DEVICE_INFO_COMPOSITE_DEVICE: // These two are exclusive of L0. return ReturnValue(0); + case UR_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP: + return ReturnValue(true); // TODO: Investigate if this information is available on HIP. case UR_DEVICE_INFO_GPU_EU_COUNT: diff --git a/source/adapters/hip/enqueue.cpp b/source/adapters/hip/enqueue.cpp index 79522d4c93..e6e3dd73fa 100644 --- a/source/adapters/hip/enqueue.cpp +++ b/source/adapters/hip/enqueue.cpp @@ -1967,3 +1967,36 @@ void setCopyRectParams(ur_rect_region_t Region, const void *SrcPtr, : (DstType == hipMemoryTypeDevice ? 
hipMemcpyHostToDevice : hipMemcpyHostToHost)); } + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueTimestampRecordingExp( + ur_queue_handle_t hQueue, bool blocking, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + + ur_result_t Result = UR_RESULT_SUCCESS; + std::unique_ptr RetImplEvent{nullptr}; + try { + ScopedContext Active(hQueue->getDevice()); + + uint32_t StreamToken; + ur_stream_quard Guard; + hipStream_t HIPStream = hQueue->getNextComputeStream( + numEventsInWaitList, phEventWaitList, Guard, &StreamToken); + UR_CHECK_ERROR(enqueueEventsWait(hQueue, HIPStream, numEventsInWaitList, + phEventWaitList)); + + RetImplEvent = + std::unique_ptr(ur_event_handle_t_::makeNative( + UR_COMMAND_TIMESTAMP_RECORDING_EXP, hQueue, HIPStream)); + UR_CHECK_ERROR(RetImplEvent->start()); + UR_CHECK_ERROR(RetImplEvent->record()); + + if (blocking) { + UR_CHECK_ERROR(hipStreamSynchronize(HIPStream)); + } + + *phEvent = RetImplEvent.release(); + } catch (ur_result_t Err) { + Result = Err; + } + return Result; +} diff --git a/source/adapters/hip/event.cpp b/source/adapters/hip/event.cpp index 7478c0e778..5327c43a3b 100644 --- a/source/adapters/hip/event.cpp +++ b/source/adapters/hip/event.cpp @@ -22,7 +22,8 @@ ur_event_handle_t_::ur_event_handle_t_(ur_command_t Type, StreamToken{StreamToken}, EventId{0}, EvEnd{nullptr}, EvStart{nullptr}, EvQueued{nullptr}, Queue{Queue}, Stream{Stream}, Context{Context} { - bool ProfilingEnabled = Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE; + bool ProfilingEnabled = + Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE || isTimestampEvent(); UR_CHECK_ERROR(hipEventCreateWithFlags( &EvEnd, ProfilingEnabled ? 
hipEventDefault : hipEventDisableTiming)); @@ -58,7 +59,7 @@ ur_result_t ur_event_handle_t_::start() { ur_result_t Result = UR_RESULT_SUCCESS; try { - if (Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE) { + if (Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE || isTimestampEvent()) { // NOTE: This relies on the default stream to be unused. UR_CHECK_ERROR(hipEventRecord(EvQueued, 0)); UR_CHECK_ERROR(hipEventRecord(EvStart, Queue->get())); @@ -177,7 +178,7 @@ ur_result_t ur_event_handle_t_::release() { assert(Queue != nullptr); UR_CHECK_ERROR(hipEventDestroy(EvEnd)); - if (Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE) { + if (Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE || isTimestampEvent()) { UR_CHECK_ERROR(hipEventDestroy(EvQueued)); UR_CHECK_ERROR(hipEventDestroy(EvStart)); } @@ -244,7 +245,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventGetProfilingInfo( UR_ASSERT(!(pPropValue && propValueSize == 0), UR_RESULT_ERROR_INVALID_VALUE); ur_queue_handle_t Queue = hEvent->getQueue(); - if (Queue == nullptr || !(Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE)) { + if (Queue == nullptr || (!(Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE) && + !hEvent->isTimestampEvent())) { return UR_RESULT_ERROR_PROFILING_INFO_NOT_AVAILABLE; } diff --git a/source/adapters/hip/event.hpp b/source/adapters/hip/event.hpp index 50de73b14f..64e8b2d9c8 100644 --- a/source/adapters/hip/event.hpp +++ b/source/adapters/hip/event.hpp @@ -55,6 +55,10 @@ struct ur_event_handle_t_ { return UR_EVENT_STATUS_COMPLETE; } + bool isTimestampEvent() const noexcept { + return getCommandType() == UR_COMMAND_TIMESTAMP_RECORDING_EXP; + } + ur_context_handle_t getContext() const noexcept { return Context; }; uint32_t incrementReferenceCount() { return ++RefCount; } diff --git a/source/adapters/hip/ur_interface_loader.cpp b/source/adapters/hip/ur_interface_loader.cpp index 1cf9431fcb..71979b75b1 100644 --- a/source/adapters/hip/ur_interface_loader.cpp +++ b/source/adapters/hip/ur_interface_loader.cpp @@ 
-377,6 +377,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable( pDdiTable->pfnCooperativeKernelLaunchExp = urEnqueueCooperativeKernelLaunchExp; + pDdiTable->pfnTimestampRecordingExp = urEnqueueTimestampRecordingExp; return UR_RESULT_SUCCESS; } diff --git a/source/adapters/hip/usm.cpp b/source/adapters/hip/usm.cpp index 4068c1d865..e871f394f2 100644 --- a/source/adapters/hip/usm.cpp +++ b/source/adapters/hip/usm.cpp @@ -27,7 +27,7 @@ urUSMHostAlloc(ur_context_handle_t hContext, const ur_usm_desc_t *pUSMDesc, UR_RESULT_ERROR_INVALID_VALUE); if (!hPool) { - return USMHostAllocImpl(ppMem, hContext, nullptr, size, alignment); + return USMHostAllocImpl(ppMem, hContext, /* flags */ 0, size, alignment); } return umfPoolMallocHelper(hPool, ppMem, size, alignment); @@ -43,7 +43,7 @@ urUSMDeviceAlloc(ur_context_handle_t hContext, ur_device_handle_t hDevice, UR_RESULT_ERROR_INVALID_VALUE); if (!hPool) { - return USMDeviceAllocImpl(ppMem, hContext, hDevice, nullptr, size, + return USMDeviceAllocImpl(ppMem, hContext, hDevice, /* flags */ 0, size, alignment); } @@ -60,8 +60,8 @@ urUSMSharedAlloc(ur_context_handle_t hContext, ur_device_handle_t hDevice, UR_RESULT_ERROR_INVALID_VALUE); if (!hPool) { - return USMSharedAllocImpl(ppMem, hContext, hDevice, nullptr, nullptr, size, - alignment); + return USMSharedAllocImpl(ppMem, hContext, hDevice, /*host flags*/ 0, + /*device flags*/ 0, size, alignment); } return umfPoolMallocHelper(hPool, ppMem, size, alignment); @@ -105,7 +105,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMFree(ur_context_handle_t hContext, ur_result_t USMDeviceAllocImpl(void **ResultPtr, ur_context_handle_t, ur_device_handle_t Device, - ur_usm_device_mem_flags_t *, size_t Size, + ur_usm_device_mem_flags_t, size_t Size, [[maybe_unused]] uint32_t Alignment) { try { ScopedContext Active(Device); @@ -120,8 +120,8 @@ ur_result_t USMDeviceAllocImpl(void **ResultPtr, ur_context_handle_t, ur_result_t USMSharedAllocImpl(void **ResultPtr, ur_context_handle_t, 
ur_device_handle_t Device, - ur_usm_host_mem_flags_t *, - ur_usm_device_mem_flags_t *, size_t Size, + ur_usm_host_mem_flags_t, + ur_usm_device_mem_flags_t, size_t Size, [[maybe_unused]] uint32_t Alignment) { try { ScopedContext Active(Device); @@ -136,7 +136,7 @@ ur_result_t USMSharedAllocImpl(void **ResultPtr, ur_context_handle_t, ur_result_t USMHostAllocImpl(void **ResultPtr, [[maybe_unused]] ur_context_handle_t Context, - ur_usm_host_mem_flags_t *, size_t Size, + ur_usm_host_mem_flags_t, size_t Size, [[maybe_unused]] uint32_t Alignment) { try { UR_CHECK_ERROR(hipHostMalloc(ResultPtr, Size)); @@ -309,19 +309,19 @@ umf_result_t USMMemoryProvider::get_min_page_size(void *Ptr, size_t *PageSize) { ur_result_t USMSharedMemoryProvider::allocateImpl(void **ResultPtr, size_t Size, uint32_t Alignment) { - return USMSharedAllocImpl(ResultPtr, Context, Device, nullptr, nullptr, Size, - Alignment); + return USMSharedAllocImpl(ResultPtr, Context, Device, /*host flags*/ 0, + /*device flags*/ 0, Size, Alignment); } ur_result_t USMDeviceMemoryProvider::allocateImpl(void **ResultPtr, size_t Size, uint32_t Alignment) { - return USMDeviceAllocImpl(ResultPtr, Context, Device, nullptr, Size, + return USMDeviceAllocImpl(ResultPtr, Context, Device, /* flags */ 0, Size, Alignment); } ur_result_t USMHostMemoryProvider::allocateImpl(void **ResultPtr, size_t Size, uint32_t Alignment) { - return USMHostAllocImpl(ResultPtr, Context, nullptr, Size, Alignment); + return USMHostAllocImpl(ResultPtr, Context, /* flags */ 0, Size, Alignment); } ur_usm_pool_handle_t_::ur_usm_pool_handle_t_(ur_context_handle_t Context, diff --git a/source/adapters/hip/usm.hpp b/source/adapters/hip/usm.hpp index d02145584f..a1c3964263 100644 --- a/source/adapters/hip/usm.hpp +++ b/source/adapters/hip/usm.hpp @@ -118,17 +118,17 @@ class USMHostMemoryProvider final : public USMMemoryProvider { ur_result_t USMDeviceAllocImpl(void **ResultPtr, ur_context_handle_t Context, ur_device_handle_t Device, - 
ur_usm_device_mem_flags_t *Flags, size_t Size, + ur_usm_device_mem_flags_t Flags, size_t Size, uint32_t Alignment); ur_result_t USMSharedAllocImpl(void **ResultPtr, ur_context_handle_t Context, ur_device_handle_t Device, - ur_usm_host_mem_flags_t *, - ur_usm_device_mem_flags_t *, size_t Size, + ur_usm_host_mem_flags_t, + ur_usm_device_mem_flags_t, size_t Size, uint32_t Alignment); ur_result_t USMHostAllocImpl(void **ResultPtr, ur_context_handle_t Context, - ur_usm_host_mem_flags_t *Flags, size_t Size, + ur_usm_host_mem_flags_t Flags, size_t Size, uint32_t Alignment); bool checkUSMAlignment(uint32_t &alignment, const ur_usm_desc_t *pUSMDesc); diff --git a/source/adapters/level_zero/common.cpp b/source/adapters/level_zero/common.cpp index 7ae7272355..926d5f4ba4 100644 --- a/source/adapters/level_zero/common.cpp +++ b/source/adapters/level_zero/common.cpp @@ -88,7 +88,11 @@ bool setEnvVar(const char *name, const char *value) { ZeUSMImportExtension ZeUSMImport; // This will count the calls to Level-Zero +// TODO: remove the ifdef once +// https://github.com/oneapi-src/unified-runtime/issues/1454 is implemented +#ifndef UR_L0_CALL_COUNT_IN_TESTS std::map *ZeCallCount = nullptr; +#endif inline void zeParseError(ze_result_t ZeError, const char *&ErrorString) { switch (ZeError) { diff --git a/source/adapters/level_zero/context.cpp b/source/adapters/level_zero/context.cpp index 19696142f5..85fe582ec3 100644 --- a/source/adapters/level_zero/context.cpp +++ b/source/adapters/level_zero/context.cpp @@ -679,6 +679,11 @@ ur_result_t ur_context_handle_t_::getAvailableCommandList( if (Queue->hasOpenCommandList(UseCopyEngine)) { if (AllowBatching) { bool batchingAllowed = true; + if (ForcedCmdQueue && + CommandBatch.OpenCommandList->second.ZeQueue != *ForcedCmdQueue) { + // Current open batch doesn't match the forced command queue + batchingAllowed = false; + } if (!UrL0OutOfOrderIntegratedSignalEvent && Queue->Device->isIntegrated()) { batchingAllowed = eventCanBeBatched(Queue, 
UseCopyEngine, diff --git a/source/adapters/level_zero/device.cpp b/source/adapters/level_zero/device.cpp index 7f9e9b499b..9ed752d46c 100644 --- a/source/adapters/level_zero/device.cpp +++ b/source/adapters/level_zero/device.cpp @@ -188,8 +188,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( } case UR_DEVICE_INFO_ATOMIC_64: return ReturnValue( - static_cast(Device->ZeDeviceModuleProperties->flags & - ZE_DEVICE_MODULE_FLAG_INT64_ATOMICS)); + static_cast(Device->ZeDeviceModuleProperties->flags & + ZE_DEVICE_MODULE_FLAG_INT64_ATOMICS)); case UR_DEVICE_INFO_EXTENSIONS: { // Convention adopted from OpenCL: // "Returns a space separated list of extension names (the extension @@ -258,9 +258,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( case UR_DEVICE_INFO_BUILD_ON_SUBDEVICE: return ReturnValue(uint32_t{0}); case UR_DEVICE_INFO_COMPILER_AVAILABLE: - return ReturnValue(static_cast(true)); + return ReturnValue(static_cast(true)); case UR_DEVICE_INFO_LINKER_AVAILABLE: - return ReturnValue(static_cast(true)); + return ReturnValue(static_cast(true)); case UR_DEVICE_INFO_MAX_COMPUTE_UNITS: { uint32_t MaxComputeUnits = Device->ZeDeviceProperties->numEUsPerSubslice * @@ -410,7 +410,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( case UR_EXT_DEVICE_INFO_OPENCL_C_VERSION: return ReturnValue(""); case UR_DEVICE_INFO_PREFERRED_INTEROP_USER_SYNC: - return ReturnValue(static_cast(true)); + return ReturnValue(static_cast(true)); case UR_DEVICE_INFO_PRINTF_BUFFER_SIZE: return ReturnValue( size_t{Device->ZeDeviceModuleProperties->printfBufferSize}); @@ -427,7 +427,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( return ReturnValue(ur_device_exec_capability_flag_t{ UR_DEVICE_EXEC_CAPABILITY_FLAG_NATIVE_KERNEL}); case UR_DEVICE_INFO_ENDIAN_LITTLE: - return ReturnValue(static_cast(true)); + return ReturnValue(static_cast(true)); case UR_DEVICE_INFO_ERROR_CORRECTION_SUPPORT: return ReturnValue(static_cast(Device->ZeDeviceProperties->flags & 
ZE_DEVICE_PROPERTY_FLAG_ECC)); @@ -604,7 +604,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( } case UR_DEVICE_INFO_SUB_GROUP_INDEPENDENT_FORWARD_PROGRESS: { // TODO: Not supported yet. Needs to be updated after support is added. - return ReturnValue(static_cast(false)); + return ReturnValue(static_cast(false)); } case UR_DEVICE_INFO_SUB_GROUP_SIZES_INTEL: { // ze_device_compute_properties.subGroupSizes is in uint32_t whereas the @@ -790,7 +790,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( return UR_RESULT_ERROR_INVALID_VALUE; case UR_DEVICE_INFO_BFLOAT16: { // bfloat16 math functions are not yet supported on Intel GPUs. - return ReturnValue(bool{false}); + return ReturnValue(ur_bool_t{false}); } case UR_DEVICE_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES: { // There are no explicit restrictions in L0 programming guide, so assume all @@ -839,9 +839,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( return ReturnValue(capabilities); } case UR_DEVICE_INFO_MEM_CHANNEL_SUPPORT: - return ReturnValue(uint32_t{false}); + return ReturnValue(ur_bool_t{false}); case UR_DEVICE_INFO_IMAGE_SRGB: - return ReturnValue(uint32_t{false}); + return ReturnValue(ur_bool_t{false}); case UR_DEVICE_INFO_QUEUE_ON_DEVICE_PROPERTIES: case UR_DEVICE_INFO_QUEUE_ON_HOST_PROPERTIES: { @@ -853,6 +853,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( 0)); //__read_write attribute currently undefinde in opencl } case UR_DEVICE_INFO_VIRTUAL_MEMORY_SUPPORT: { + return ReturnValue(static_cast(true)); + } + case UR_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP: { return ReturnValue(static_cast(true)); } @@ -893,9 +896,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( // can know if we are in (a) or (b) by checking if a tile is root device // or not. ur_device_handle_t URDev = Device->Platform->getDeviceFromNativeHandle(d); - if (URDev->isSubDevice()) + if (URDev->isSubDevice()) { // We are in COMPOSITE mode, return an empty list. 
- return ReturnValue(0); + if (pSize) { + *pSize = 0; + } + return UR_RESULT_SUCCESS; + } Res.push_back(URDev); } @@ -1078,7 +1085,7 @@ bool ur_device_handle_t_::useDriverInOrderLists() { static const bool UseDriverInOrderLists = [] { const char *UrRet = std::getenv("UR_L0_USE_DRIVER_INORDER_LISTS"); if (!UrRet) - return true; + return false; return std::atoi(UrRet) != 0; }(); diff --git a/source/adapters/level_zero/event.cpp b/source/adapters/level_zero/event.cpp index e472c2490c..615e887172 100644 --- a/source/adapters/level_zero/event.cpp +++ b/source/adapters/level_zero/event.cpp @@ -368,8 +368,16 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWaitWithBarrier( } // Execute each command list so the barriers can be encountered. - for (ur_command_list_ptr_t &CmdList : CmdLists) + for (ur_command_list_ptr_t &CmdList : CmdLists) { + bool IsCopy = + CmdList->second.isCopy(reinterpret_cast(Queue)); + const auto &CommandBatch = + (IsCopy) ? Queue->CopyCommandBatch : Queue->ComputeCommandBatch; + // Only batch if the matching CmdList is already open. + OkToBatch = CommandBatch.OpenCommandList == CmdList; + UR_CALL(Queue->executeCommandList(CmdList, false, OkToBatch)); + } UR_CALL(Queue->ActiveBarriers.clear()); auto UREvent = reinterpret_cast(*Event); @@ -472,8 +480,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventGetProfilingInfo( ) { std::shared_lock EventLock(Event->Mutex); - if (Event->UrQueue && - (Event->UrQueue->Properties & UR_QUEUE_FLAG_PROFILING_ENABLE) == 0) { + // The event must either have profiling enabled or be recording timestamps. 
+ bool isTimestampedEvent = Event->isTimestamped(); + if (!Event->isProfilingEnabled() && !isTimestampedEvent) { return UR_RESULT_ERROR_PROFILING_INFO_NOT_AVAILABLE; } @@ -486,6 +495,61 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventGetProfilingInfo( UrReturnHelper ReturnValue(PropValueSize, PropValue, PropValueSizeRet); + // For timestamped events we have the timestamps ready directly on the event + // handle, so we short-circuit the return. + if (isTimestampedEvent) { + uint64_t ContextStartTime = Event->RecordEventStartTimestamp; + switch (PropName) { + case UR_PROFILING_INFO_COMMAND_QUEUED: + case UR_PROFILING_INFO_COMMAND_SUBMIT: + return ReturnValue(ContextStartTime); + case UR_PROFILING_INFO_COMMAND_END: + case UR_PROFILING_INFO_COMMAND_START: { + // If RecordEventEndTimestamp on the event is non-zero it means it has + // collected the result of the queue already. In that case it has been + // adjusted and is ready for immediate return. + if (Event->RecordEventEndTimestamp) + return ReturnValue(Event->RecordEventEndTimestamp); + + // Otherwise we need to collect it from the queue. + auto Entry = Event->UrQueue->EndTimeRecordings.find(Event); + + // Unexpected state if there is no end-time record. + if (Entry == Event->UrQueue->EndTimeRecordings.end()) + return UR_RESULT_ERROR_UNKNOWN; + auto &EndTimeRecording = Entry->second; + + // End time needs to be adjusted for resolution and valid bits. + uint64_t ContextEndTime = + (EndTimeRecording.RecordEventEndTimestamp & TimestampMaxValue) * + ZeTimerResolution; + + // If the result is 0, we have not yet gotten results back and so we just + // return it. + if (ContextEndTime == 0) + return ReturnValue(ContextEndTime); + + // Handle a possible wrap-around (the underlying HW counter is < 64-bit). + // Note, it will not report correct time if there were multiple wrap + // arounds, and the longer term plan is to enlarge the capacity of the + // HW timestamps. 
+ if (ContextEndTime < ContextStartTime) + ContextEndTime += TimestampMaxValue * ZeTimerResolution; + + // Now that we have the result, there is no need to keep it in the queue + // anymore, so we cache it on the event and evict the record from the + // queue. + Event->RecordEventEndTimestamp = ContextEndTime; + Event->UrQueue->EndTimeRecordings.erase(Entry); + + return ReturnValue(ContextEndTime); + } + default: + logger::error("urEventGetProfilingInfo: not supported ParamName"); + return UR_RESULT_ERROR_INVALID_VALUE; + } + } + ze_kernel_timestamp_result_t tsResult; // A Command-buffer consists of three command-lists for which only a single @@ -594,6 +658,63 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventGetProfilingInfo( return UR_RESULT_SUCCESS; } +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueTimestampRecordingExp( + ur_queue_handle_t Queue, ///< [in] handle of the queue object + bool Blocking, ///< [in] blocking or non-blocking enqueue + uint32_t NumEventsInWaitList, ///< [in] size of the event wait list + const ur_event_handle_t + *EventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] + ///< pointer to a list of events that must be complete + ///< before this command can be executed. If nullptr, + ///< the numEventsInWaitList must be 0, indicating + ///< that this command does not wait on any event to + ///< complete. + ur_event_handle_t + *OutEvent ///< [in,out] return an event object that identifies + ///< this particular command instance. +) { + // Lock automatically releases when this goes out of scope. 
+ std::scoped_lock lock(Queue->Mutex); + + ur_device_handle_t Device = Queue->Device; + + bool UseCopyEngine = false; + _ur_ze_event_list_t TmpWaitList; + UR_CALL(TmpWaitList.createAndRetainUrZeEventList( + NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine)); + + // Get a new command list to be used on this call + ur_command_list_ptr_t CommandList{}; + UR_CALL(Queue->Context->getAvailableCommandList( + Queue, CommandList, UseCopyEngine, NumEventsInWaitList, EventWaitList, + /* AllowBatching */ false)); + + UR_CALL(createEventAndAssociateQueue( + Queue, OutEvent, UR_COMMAND_TIMESTAMP_RECORDING_EXP, CommandList, + /* IsInternal */ false, /* HostVisible */ true)); + ze_event_handle_t ZeEvent = (*OutEvent)->ZeEvent; + (*OutEvent)->WaitList = TmpWaitList; + + uint64_t DeviceStartTimestamp = 0; + UR_CALL(urDeviceGetGlobalTimestamps(Device, &DeviceStartTimestamp, nullptr)); + (*OutEvent)->RecordEventStartTimestamp = DeviceStartTimestamp; + + // Create a new entry in the queue's recordings. + Queue->EndTimeRecordings[*OutEvent] = + ur_queue_handle_t_::end_time_recording{}; + + ZE2UR_CALL(zeCommandListAppendWriteGlobalTimestamp, + (CommandList->first, + &Queue->EndTimeRecordings[*OutEvent].RecordEventEndTimestamp, + ZeEvent, (*OutEvent)->WaitList.Length, + (*OutEvent)->WaitList.ZeEventList)); + + UR_CALL( + Queue->executeCommandList(CommandList, Blocking, /* OkToBatch */ false)); + + return UR_RESULT_SUCCESS; +} + ur_result_t ur_event_handle_t_::getOrCreateHostVisibleEvent( ze_event_handle_t &ZeHostVisibleEvent) { @@ -903,6 +1024,23 @@ ur_result_t urEventReleaseInternal(ur_event_handle_t Event) { Event->Context->addEventToContextCache(Event); } + // If the event was a timestamp recording, we try to evict its entry in the + // queue. 
+ if (Event->isTimestamped()) { + auto Entry = Queue->EndTimeRecordings.find(Event); + if (Entry != Queue->EndTimeRecordings.end()) { + auto &EndTimeRecording = Entry->second; + if (EndTimeRecording.RecordEventEndTimestamp == 0) { + // If the end time recording has not finished, we tell the queue that + // the event is no longer alive to avoid invalid write-backs. + EndTimeRecording.EventHasDied = true; + } else { + // Otherwise we evict the entry. + Event->UrQueue->EndTimeRecordings.erase(Entry); + } + } + } + // We intentionally incremented the reference counter when an event is // created so that we can avoid ur_queue_handle_t is released before the // associated ur_event_handle_t is released. Here we have to decrement it so @@ -1289,16 +1427,26 @@ ur_result_t _ur_ze_event_list_t::createAndRetainUrZeEventList( } auto Queue = EventList[I]->UrQueue; - if (Queue) { - // The caller of createAndRetainUrZeEventList must already hold - // a lock of the CurQueue. Additionally lock the Queue if it - // is different from CurQueue. - // TODO: rework this to avoid deadlock when another thread is - // locking the same queues but in a different order. - auto Lock = ((Queue == CurQueue) - ? std::unique_lock() - : std::unique_lock(Queue->Mutex)); + auto CurQueueDevice = CurQueue->Device; + std::optional> QueueLock = + std::nullopt; + // The caller of createAndRetainUrZeEventList must already hold + // a lock of the CurQueue. However, if the CurQueue is different + // then the Event's Queue, we need to drop that lock and + // acquire the Event's Queue lock. This is done to avoid a lock + // ordering issue. + // For the rest of this scope, CurQueue cannot be accessed. + // TODO: This solution is very error-prone. This requires a refactor + // to either have fine-granularity locks inside of the queues or + // to move any operations on queues other than CurQueue out + // of this scope. 
+ if (Queue && Queue != CurQueue) { + CurQueue->Mutex.unlock(); + QueueLock = std::unique_lock(Queue->Mutex); + } + + if (Queue) { // If the event that is going to be waited is in an open batch // different from where this next command is going to be added, // then we have to force execute of that open command-list @@ -1341,7 +1489,7 @@ ur_result_t _ur_ze_event_list_t::createAndRetainUrZeEventList( } ur_command_list_ptr_t CommandList; - if (Queue && Queue->Device != CurQueue->Device) { + if (Queue && Queue->Device != CurQueueDevice) { // Get a command list prior to acquiring an event lock. // This prevents a potential deadlock with recursive // event locks. @@ -1351,7 +1499,7 @@ ur_result_t _ur_ze_event_list_t::createAndRetainUrZeEventList( std::shared_lock Lock(EventList[I]->Mutex); - if (Queue && Queue->Device != CurQueue->Device && + if (Queue && Queue->Device != CurQueueDevice && !EventList[I]->IsMultiDevice) { ze_event_handle_t MultiDeviceZeEvent = nullptr; ur_event_handle_t MultiDeviceEvent; @@ -1386,6 +1534,10 @@ ur_result_t _ur_ze_event_list_t::createAndRetainUrZeEventList( this->UrEventList[TmpListLength]->RefCount.increment(); } + if (QueueLock.has_value()) { + QueueLock.reset(); + CurQueue->Mutex.lock(); + } TmpListLength += 1; } } @@ -1464,3 +1616,12 @@ bool ur_event_handle_t_::isProfilingEnabled() const { return !UrQueue || // tentatively assume user events are profiling enabled (UrQueue->Properties & UR_QUEUE_FLAG_PROFILING_ENABLE) != 0; } + +// Tells if this event was created as a timestamp event, allowing profiling +// info even if profiling is not enabled. +bool ur_event_handle_t_::isTimestamped() const { + // If we are recording, the start time of the event will be non-zero. The + // end time might still be missing, depending on whether the corresponding + // enqueue is still running. 
+ return RecordEventStartTimestamp != 0; +} diff --git a/source/adapters/level_zero/event.hpp b/source/adapters/level_zero/event.hpp index 08b4be7969..2d1f536e4e 100644 --- a/source/adapters/level_zero/event.hpp +++ b/source/adapters/level_zero/event.hpp @@ -207,6 +207,14 @@ struct ur_event_handle_t_ : _ur_object { // Indicates within creation of proxy event. bool IsCreatingHostProxyEvent = {false}; + // Indicates the recorded start and end timestamps for the event. These are + // only set for events returned by timestamp recording enqueue functions. + // A non-zero value for RecordEventStartTimestamp indicates the event was the + // result of a timestamp recording. If RecordEventEndTimestamp is non-zero, it + // means the event has fetched the end-timestamp from the queue. + uint64_t RecordEventStartTimestamp = 0; + uint64_t RecordEventEndTimestamp = 0; + // Besides each PI object keeping a total reference count in // _ur_object::RefCount we keep special track of the event *external* // references. This way we are able to tell when the event is not referenced @@ -231,6 +239,10 @@ struct ur_event_handle_t_ : _ur_object { // Tells if this event is with profiling capabilities. bool isProfilingEnabled() const; + // Tells if this event was created as a timestamp event, allowing profiling + // info even if profiling is not enabled. + bool isTimestamped() const; + // Get the host-visible event or create one and enqueue its signal. 
ur_result_t getOrCreateHostVisibleEvent(ze_event_handle_t &HostVisibleEvent); diff --git a/source/adapters/level_zero/queue.cpp b/source/adapters/level_zero/queue.cpp index ab2277d8b7..ad48962375 100644 --- a/source/adapters/level_zero/queue.cpp +++ b/source/adapters/level_zero/queue.cpp @@ -1514,8 +1514,7 @@ ur_queue_handle_t_::resetDiscardedEvent(ur_command_list_ptr_t CommandList) { } ur_result_t ur_queue_handle_t_::addEventToQueueCache(ur_event_handle_t Event) { - if (!Event->IsMultiDevice && Event->UrQueue) { - auto Device = Event->UrQueue->Device; + if (!Event->IsMultiDevice) { auto EventCachesMap = Event->isHostVisible() ? &EventCachesDeviceMap[0] : &EventCachesDeviceMap[1]; if (EventCachesMap->find(Device) == EventCachesMap->end()) { @@ -1542,6 +1541,34 @@ ur_result_t ur_queue_handle_t_::active_barriers::clear() { return UR_RESULT_SUCCESS; } +void ur_queue_handle_t_::clearEndTimeRecordings() { + uint64_t ZeTimerResolution = Device->ZeDeviceProperties->timerResolution; + const uint64_t TimestampMaxValue = + ((1ULL << Device->ZeDeviceProperties->kernelTimestampValidBits) - 1ULL); + + for (auto Entry : EndTimeRecordings) { + auto &Event = Entry.first; + auto &EndTimeRecording = Entry.second; + if (!Entry.second.EventHasDied) { + // Write the result back to the event if it is not dead. + uint64_t ContextEndTime = + (EndTimeRecording.RecordEventEndTimestamp & TimestampMaxValue) * + ZeTimerResolution; + + // Handle a possible wrap-around (the underlying HW counter is < 64-bit). + // Note, it will not report correct time if there were multiple wrap + // arounds, and the longer term plan is to enlarge the capacity of the + // HW timestamps. + if (ContextEndTime < Event->RecordEventStartTimestamp) + ContextEndTime += TimestampMaxValue * ZeTimerResolution; + + // Store it in the event. 
+ Event->RecordEventEndTimestamp = ContextEndTime; + } + } + EndTimeRecordings.clear(); +} + ur_result_t urQueueReleaseInternal(ur_queue_handle_t Queue) { ur_queue_handle_t UrQueue = reinterpret_cast(Queue); @@ -1567,6 +1594,8 @@ ur_result_t urQueueReleaseInternal(ur_queue_handle_t Queue) { } } + Queue->clearEndTimeRecordings(); + logger::debug("urQueueRelease(compute) NumTimesClosedFull {}, " "NumTimesClosedEarly {}", UrQueue->ComputeCommandBatch.NumTimesClosedFull, @@ -1720,6 +1749,11 @@ ur_result_t ur_queue_handle_t_::synchronize() { LastCommandEvent = nullptr; } + // Since all timestamp recordings should have finished with the + // synchronizations, we can clear the map and write the results to the owning + // events. + clearEndTimeRecordings(); + // With the entire queue synchronized, the active barriers must be done so we // can remove them. if (auto Res = ActiveBarriers.clear()) diff --git a/source/adapters/level_zero/queue.hpp b/source/adapters/level_zero/queue.hpp index ed832148ac..799e90e9d9 100644 --- a/source/adapters/level_zero/queue.hpp +++ b/source/adapters/level_zero/queue.hpp @@ -486,6 +486,22 @@ struct ur_queue_handle_t_ : _ur_object { std::vector> EventCachesDeviceMap{2}; + // End-times enqueued are stored on the queue rather than on the event to + // avoid the event objects having been destroyed prior to the write to the + // end-time member. + struct end_time_recording { + // RecordEventEndTimestamp is not adjusted for valid bits nor resolution, as + // it is written asynchronously. + uint64_t RecordEventEndTimestamp = 0; + // The event may die before the recording has been written back. In this + // case the event will mark this for deletion when the queue sees fit. + bool EventHasDied = false; + }; + std::map EndTimeRecordings; + + // Clear the end time recording timestamps entries. + void clearEndTimeRecordings(); + // adjust the queue's batch size, knowing that the current command list // is being closed with a full batch. 
// For copy commands, IsCopy is set to 'true'. diff --git a/source/adapters/level_zero/ur_interface_loader.cpp b/source/adapters/level_zero/ur_interface_loader.cpp index 051db73145..d6d6060ea6 100644 --- a/source/adapters/level_zero/ur_interface_loader.cpp +++ b/source/adapters/level_zero/ur_interface_loader.cpp @@ -456,6 +456,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable( pDdiTable->pfnCooperativeKernelLaunchExp = urEnqueueCooperativeKernelLaunchExp; + pDdiTable->pfnTimestampRecordingExp = urEnqueueTimestampRecordingExp; return UR_RESULT_SUCCESS; } diff --git a/source/adapters/level_zero/usm.cpp b/source/adapters/level_zero/usm.cpp index c4cbfc9d26..19d31bcb9b 100644 --- a/source/adapters/level_zero/usm.cpp +++ b/source/adapters/level_zero/usm.cpp @@ -171,7 +171,7 @@ static ur_result_t USMAllocationMakeResident( static ur_result_t USMDeviceAllocImpl(void **ResultPtr, ur_context_handle_t Context, ur_device_handle_t Device, - ur_usm_device_mem_flags_t *Flags, + ur_usm_device_mem_flags_t Flags, size_t Size, uint32_t Alignment) { std::ignore = Flags; // TODO: translate PI properties to Level Zero flags @@ -213,12 +213,10 @@ static ur_result_t USMDeviceAllocImpl(void **ResultPtr, return UR_RESULT_SUCCESS; } -static ur_result_t USMSharedAllocImpl(void **ResultPtr, - ur_context_handle_t Context, - ur_device_handle_t Device, - ur_usm_host_mem_flags_t *, - ur_usm_device_mem_flags_t *, size_t Size, - uint32_t Alignment) { +static ur_result_t +USMSharedAllocImpl(void **ResultPtr, ur_context_handle_t Context, + ur_device_handle_t Device, ur_usm_host_mem_flags_t, + ur_usm_device_mem_flags_t, size_t Size, uint32_t Alignment) { // TODO: translate PI properties to Level Zero flags ZeStruct ZeHostDesc; @@ -263,7 +261,7 @@ static ur_result_t USMSharedAllocImpl(void **ResultPtr, static ur_result_t USMHostAllocImpl(void **ResultPtr, ur_context_handle_t Context, - ur_usm_host_mem_flags_t *Flags, size_t Size, + ur_usm_host_mem_flags_t Flags, size_t Size, 
uint32_t Alignment) { std::ignore = Flags; // TODO: translate PI properties to Level Zero flags @@ -767,8 +765,8 @@ umf_result_t L0MemoryProvider::get_min_page_size(void *Ptr, size_t *PageSize) { ur_result_t L0SharedMemoryProvider::allocateImpl(void **ResultPtr, size_t Size, uint32_t Alignment) { - return USMSharedAllocImpl(ResultPtr, Context, Device, nullptr, nullptr, Size, - Alignment); + return USMSharedAllocImpl(ResultPtr, Context, Device, /*host flags*/ 0, + /*device flags*/ 0, Size, Alignment); } ur_result_t L0SharedReadOnlyMemoryProvider::allocateImpl(void **ResultPtr, @@ -776,20 +774,19 @@ ur_result_t L0SharedReadOnlyMemoryProvider::allocateImpl(void **ResultPtr, uint32_t Alignment) { ur_usm_device_desc_t UsmDeviceDesc{}; UsmDeviceDesc.flags = UR_USM_DEVICE_MEM_FLAG_DEVICE_READ_ONLY; - ur_usm_host_desc_t UsmHostDesc{}; - return USMSharedAllocImpl(ResultPtr, Context, Device, &UsmDeviceDesc.flags, - &UsmHostDesc.flags, Size, Alignment); + return USMSharedAllocImpl(ResultPtr, Context, Device, UsmDeviceDesc.flags, + /*host flags*/ 0, Size, Alignment); } ur_result_t L0DeviceMemoryProvider::allocateImpl(void **ResultPtr, size_t Size, uint32_t Alignment) { - return USMDeviceAllocImpl(ResultPtr, Context, Device, nullptr, Size, + return USMDeviceAllocImpl(ResultPtr, Context, Device, /* flags */ 0, Size, Alignment); } ur_result_t L0HostMemoryProvider::allocateImpl(void **ResultPtr, size_t Size, uint32_t Alignment) { - return USMHostAllocImpl(ResultPtr, Context, nullptr, Size, Alignment); + return USMHostAllocImpl(ResultPtr, Context, /* flags */ 0, Size, Alignment); } ur_usm_pool_handle_t_::ur_usm_pool_handle_t_(ur_context_handle_t Context, diff --git a/source/adapters/native_cpu/device.cpp b/source/adapters/native_cpu/device.cpp index 1babdb0f10..e7a3e8adf1 100644 --- a/source/adapters/native_cpu/device.cpp +++ b/source/adapters/native_cpu/device.cpp @@ -321,6 +321,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, case 
UR_DEVICE_INFO_COMMAND_BUFFER_UPDATE_SUPPORT_EXP: return ReturnValue(false); + case UR_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP: + return ReturnValue(false); default: DIE_NO_IMPLEMENTATION; } diff --git a/source/adapters/native_cpu/event.cpp b/source/adapters/native_cpu/event.cpp index 112bb553c0..9049e3c1b6 100644 --- a/source/adapters/native_cpu/event.cpp +++ b/source/adapters/native_cpu/event.cpp @@ -87,3 +87,15 @@ urEventSetCallback(ur_event_handle_t hEvent, ur_execution_info_t execStatus, DIE_NO_IMPLEMENTATION; } + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueTimestampRecordingExp( + ur_queue_handle_t hQueue, bool blocking, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + std::ignore = hQueue; + std::ignore = blocking; + std::ignore = numEventsInWaitList; + std::ignore = phEventWaitList; + std::ignore = phEvent; + + DIE_NO_IMPLEMENTATION; +} diff --git a/source/adapters/native_cpu/ur_interface_loader.cpp b/source/adapters/native_cpu/ur_interface_loader.cpp index 1f54a98c80..065012613e 100644 --- a/source/adapters/native_cpu/ur_interface_loader.cpp +++ b/source/adapters/native_cpu/ur_interface_loader.cpp @@ -392,6 +392,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable( } pDdiTable->pfnCooperativeKernelLaunchExp = nullptr; + pDdiTable->pfnTimestampRecordingExp = urEnqueueTimestampRecordingExp; return UR_RESULT_SUCCESS; } diff --git a/source/adapters/null/ur_nullddi.cpp b/source/adapters/null/ur_nullddi.cpp index 37d0ceb15a..a004386baa 100644 --- a/source/adapters/null/ur_nullddi.cpp +++ b/source/adapters/null/ur_nullddi.cpp @@ -5513,6 +5513,47 @@ __urdlllocal ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( return exceptionToResult(std::current_exception()); } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urEnqueueTimestampRecordingExp +__urdlllocal ur_result_t UR_APICALL 
urEnqueueTimestampRecordingExp( + ur_queue_handle_t hQueue, ///< [in] handle of the queue object + bool + blocking, ///< [in] indicates whether the call to this function should block until + ///< until the device timestamp recording command has executed on the + ///< device. + uint32_t numEventsInWaitList, ///< [in] size of the event wait list + const ur_event_handle_t * + phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the kernel execution. + ///< If nullptr, the numEventsInWaitList must be 0, indicating no wait + ///< events. + ur_event_handle_t * + phEvent ///< [in,out] return an event object that identifies this particular kernel + ///< execution instance. Profiling information can be queried + ///< from this event as if `hQueue` had profiling enabled. Querying + ///< `UR_PROFILING_INFO_COMMAND_QUEUED` or `UR_PROFILING_INFO_COMMAND_SUBMIT` + ///< reports the timestamp at the time of the call to this function. + ///< Querying `UR_PROFILING_INFO_COMMAND_START` or `UR_PROFILING_INFO_COMMAND_END` + ///< reports the timestamp recorded when the command is executed on the device. + ) try { + ur_result_t result = UR_RESULT_SUCCESS; + + // if the driver has created a custom function, then call it instead of using the generic path + auto pfnTimestampRecordingExp = + d_context.urDdiTable.EnqueueExp.pfnTimestampRecordingExp; + if (nullptr != pfnTimestampRecordingExp) { + result = pfnTimestampRecordingExp(hQueue, blocking, numEventsInWaitList, + phEventWaitList, phEvent); + } else { + // generic implementation + *phEvent = reinterpret_cast(d_context.get()); + } + + return result; +} catch (...) 
{ + return exceptionToResult(std::current_exception()); +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Intercept function for urProgramBuildExp __urdlllocal ur_result_t UR_APICALL urProgramBuildExp( @@ -6069,6 +6110,9 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable( pDdiTable->pfnCooperativeKernelLaunchExp = driver::urEnqueueCooperativeKernelLaunchExp; + pDdiTable->pfnTimestampRecordingExp = + driver::urEnqueueTimestampRecordingExp; + return result; } catch (...) { return exceptionToResult(std::current_exception()); diff --git a/source/adapters/opencl/device.cpp b/source/adapters/opencl/device.cpp index d89a9492a5..ac79b71876 100644 --- a/source/adapters/opencl/device.cpp +++ b/source/adapters/opencl/device.cpp @@ -507,7 +507,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, cl_adapter::cast(hDevice), {"cl_khr_fp16"}, Supported)); if (!Supported) { - return UR_RESULT_ERROR_INVALID_ENUMERATION; + return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; } } @@ -799,6 +799,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, case UR_DEVICE_INFO_VIRTUAL_MEMORY_SUPPORT: { return ReturnValue(false); } + case UR_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP: { + return ReturnValue(false); + } case UR_DEVICE_INFO_HOST_PIPE_READ_WRITE_SUPPORTED: { bool Supported = false; CL_RETURN_ON_FAILURE(cl_adapter::checkDeviceExtensions( diff --git a/source/adapters/opencl/event.cpp b/source/adapters/opencl/event.cpp index f5af30734c..5141ce8ff0 100644 --- a/source/adapters/opencl/event.cpp +++ b/source/adapters/opencl/event.cpp @@ -257,3 +257,9 @@ urEventSetCallback(ur_event_handle_t hEvent, ur_execution_info_t execStatus, CallbackType, ClCallback, Callback)); return UR_RESULT_SUCCESS; } + +UR_APIEXPORT ur_result_t UR_APICALL +urEnqueueTimestampRecordingExp(ur_queue_handle_t, bool, uint32_t, + const ur_event_handle_t *, ur_event_handle_t *) { + return 
UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} diff --git a/source/adapters/opencl/ur_interface_loader.cpp b/source/adapters/opencl/ur_interface_loader.cpp index 8c2c73d7c8..effb2128c3 100644 --- a/source/adapters/opencl/ur_interface_loader.cpp +++ b/source/adapters/opencl/ur_interface_loader.cpp @@ -398,6 +398,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable( pDdiTable->pfnCooperativeKernelLaunchExp = urEnqueueCooperativeKernelLaunchExp; + pDdiTable->pfnTimestampRecordingExp = urEnqueueTimestampRecordingExp; return UR_RESULT_SUCCESS; } diff --git a/source/common/ur_util.hpp b/source/common/ur_util.hpp index 9cecdbec1e..f588a21a2e 100644 --- a/source/common/ur_util.hpp +++ b/source/common/ur_util.hpp @@ -294,7 +294,7 @@ template struct stype_map {}; // stype_map_impl {}; #include "stype_map_helpers.def" -template constexpr int as_stype() { return stype_map::value; }; +template constexpr int as_stype() { return stype_map::value; } /// Walk a generic UR linked list looking for a node of the given type. If it's /// found, its address is returned, othewise `nullptr`. e.g. 
to find out whether diff --git a/source/loader/layers/sanitizer/asan_interceptor.cpp b/source/loader/layers/sanitizer/asan_interceptor.cpp index c55d752410..b05d7c6348 100644 --- a/source/loader/layers/sanitizer/asan_interceptor.cpp +++ b/source/loader/layers/sanitizer/asan_interceptor.cpp @@ -22,8 +22,6 @@ namespace ur_sanitizer_layer { namespace { -constexpr auto kSPIR_DeviceSanitizerReportMem = "__DeviceSanitizerReportMem"; - uptr MemToShadow_CPU(uptr USM_SHADOW_BASE, uptr UPtr) { return USM_SHADOW_BASE + (UPtr >> 3); } @@ -348,11 +346,14 @@ ur_result_t SanitizerInterceptor::releaseMemory(ur_context_handle_t Context, ur_result_t SanitizerInterceptor::preLaunchKernel(ur_kernel_handle_t Kernel, ur_queue_handle_t Queue, - LaunchInfo &LaunchInfo) { + USMLaunchInfo &LaunchInfo) { auto Context = GetContext(Queue); auto Device = GetDevice(Queue); auto ContextInfo = getContextInfo(Context); auto DeviceInfo = getDeviceInfo(Device); + auto KernelInfo = getKernelInfo(Kernel); + + UR_CALL(LaunchInfo.updateKernelInfo(*KernelInfo.get())); ManagedQueue InternalQueue(Context, Device); if (!InternalQueue) { @@ -370,23 +371,12 @@ ur_result_t SanitizerInterceptor::preLaunchKernel(ur_kernel_handle_t Kernel, ur_result_t SanitizerInterceptor::postLaunchKernel(ur_kernel_handle_t Kernel, ur_queue_handle_t Queue, - ur_event_handle_t &Event, - LaunchInfo &LaunchInfo) { - auto Program = GetProgram(Kernel); - ur_event_handle_t ReadEvent{}; - - // If kernel has defined SPIR_DeviceSanitizerReportMem, then we try to read it - // to host, but it's okay that it isn't defined + USMLaunchInfo &LaunchInfo) { // FIXME: We must use block operation here, until we support urEventSetCallback - auto Result = context.urDdiTable.Enqueue.pfnDeviceGlobalVariableRead( - Queue, Program, kSPIR_DeviceSanitizerReportMem, true, - sizeof(LaunchInfo.SPIR_DeviceSanitizerReportMem), 0, - &LaunchInfo.SPIR_DeviceSanitizerReportMem, 1, &Event, &ReadEvent); + auto Result = context.urDdiTable.Queue.pfnFinish(Queue); if 
(Result == UR_RESULT_SUCCESS) { - Event = ReadEvent; - - const auto &AH = LaunchInfo.SPIR_DeviceSanitizerReportMem; + const auto &AH = LaunchInfo.Data->SanitizerReport; if (!AH.Flag) { return UR_RESULT_SUCCESS; } @@ -627,13 +617,44 @@ ur_result_t SanitizerInterceptor::eraseDevice(ur_device_handle_t Device) { return UR_RESULT_SUCCESS; } +ur_result_t SanitizerInterceptor::insertKernel(ur_kernel_handle_t Kernel) { + std::scoped_lock Guard(m_KernelMapMutex); + if (m_KernelMap.find(Kernel) != m_KernelMap.end()) { + return UR_RESULT_SUCCESS; + } + m_KernelMap.emplace(Kernel, std::make_shared(Kernel)); + return UR_RESULT_SUCCESS; +} + +ur_result_t SanitizerInterceptor::eraseKernel(ur_kernel_handle_t Kernel) { + std::scoped_lock Guard(m_KernelMapMutex); + assert(m_KernelMap.find(Kernel) != m_KernelMap.end()); + m_KernelMap.erase(Kernel); + return UR_RESULT_SUCCESS; +} + ur_result_t SanitizerInterceptor::prepareLaunch( ur_context_handle_t Context, std::shared_ptr &DeviceInfo, ur_queue_handle_t Queue, ur_kernel_handle_t Kernel, - LaunchInfo &LaunchInfo) { + USMLaunchInfo &LaunchInfo) { auto Program = GetProgram(Kernel); do { + // Set launch info argument + auto ArgNums = GetKernelNumArgs(Kernel); + if (ArgNums) { + context.logger.debug( + "launch_info {} (numLocalArgs={}, localArgs={})", + (void *)LaunchInfo.Data, LaunchInfo.Data->NumLocalArgs, + (void *)LaunchInfo.Data->LocalArgs); + ur_result_t URes = context.urDdiTable.Kernel.pfnSetArgPointer( + Kernel, ArgNums - 1, nullptr, &LaunchInfo.Data); + if (URes != UR_RESULT_SUCCESS) { + context.logger.error("Failed to set launch info: {}", URes); + return URes; + } + } + // Write global variable to program auto EnqueueWriteGlobal = [Queue, Program](const char *Name, const void *Value, @@ -723,15 +744,17 @@ ur_result_t SanitizerInterceptor::prepareLaunch( "LocalShadowMemorySize={})", NumWG, LocalMemorySize, LocalShadowMemorySize); - UR_CALL(EnqueueAllocateDevice(LocalShadowMemorySize, - LaunchInfo.LocalShadowOffset)); + 
UR_CALL(EnqueueAllocateDevice( + LocalShadowMemorySize, LaunchInfo.Data->LocalShadowOffset)); - LaunchInfo.LocalShadowOffsetEnd = - LaunchInfo.LocalShadowOffset + LocalShadowMemorySize - 1; + LaunchInfo.Data->LocalShadowOffsetEnd = + LaunchInfo.Data->LocalShadowOffset + LocalShadowMemorySize - + 1; - context.logger.info("ShadowMemory(Local, {} - {})", - (void *)LaunchInfo.LocalShadowOffset, - (void *)LaunchInfo.LocalShadowOffsetEnd); + context.logger.info( + "ShadowMemory(Local, {} - {})", + (void *)LaunchInfo.Data->LocalShadowOffset, + (void *)LaunchInfo.Data->LocalShadowOffsetEnd); } } } while (false); @@ -749,15 +772,61 @@ SanitizerInterceptor::findAllocInfoByAddress(uptr Address) { return --It; } -LaunchInfo::~LaunchInfo() { +ur_result_t USMLaunchInfo::initialize() { + UR_CALL(context.urDdiTable.Context.pfnRetain(Context)); + UR_CALL(context.urDdiTable.Device.pfnRetain(Device)); + UR_CALL(context.urDdiTable.USM.pfnSharedAlloc( + Context, Device, nullptr, nullptr, sizeof(LaunchInfo), (void **)&Data)); + *Data = LaunchInfo{}; + return UR_RESULT_SUCCESS; +} + +ur_result_t USMLaunchInfo::updateKernelInfo(const KernelInfo &KI) { + auto NumArgs = KI.LocalArgs.size(); + if (NumArgs) { + Data->NumLocalArgs = NumArgs; + UR_CALL(context.urDdiTable.USM.pfnSharedAlloc( + Context, Device, nullptr, nullptr, sizeof(LocalArgsInfo) * NumArgs, + (void **)&Data->LocalArgs)); + uint32_t i = 0; + for (auto [ArgIndex, ArgInfo] : KI.LocalArgs) { + Data->LocalArgs[i++] = ArgInfo; + context.logger.debug( + "local_args (argIndex={}, size={}, sizeWithRZ={})", ArgIndex, + ArgInfo.Size, ArgInfo.SizeWithRedZone); + } + } + return UR_RESULT_SUCCESS; +} + +USMLaunchInfo::~USMLaunchInfo() { [[maybe_unused]] ur_result_t Result; - if (LocalShadowOffset) { - Result = - context.urDdiTable.USM.pfnFree(Context, (void *)LocalShadowOffset); + if (Data) { + auto Type = GetDeviceType(Device); + if (Type == DeviceType::GPU_PVC) { + if (Data->PrivateShadowOffset) { + Result = 
context.urDdiTable.USM.pfnFree( + Context, (void *)Data->PrivateShadowOffset); + assert(Result == UR_RESULT_SUCCESS); + } + if (Data->LocalShadowOffset) { + Result = context.urDdiTable.USM.pfnFree( + Context, (void *)Data->LocalShadowOffset); + assert(Result == UR_RESULT_SUCCESS); + } + } + if (Data->LocalArgs) { + Result = context.urDdiTable.USM.pfnFree(Context, + (void *)Data->LocalArgs); + assert(Result == UR_RESULT_SUCCESS); + } + Result = context.urDdiTable.USM.pfnFree(Context, (void *)Data); assert(Result == UR_RESULT_SUCCESS); } Result = context.urDdiTable.Context.pfnRelease(Context); assert(Result == UR_RESULT_SUCCESS); + Result = context.urDdiTable.Device.pfnRelease(Device); + assert(Result == UR_RESULT_SUCCESS); } } // namespace ur_sanitizer_layer diff --git a/source/loader/layers/sanitizer/asan_interceptor.hpp b/source/loader/layers/sanitizer/asan_interceptor.hpp index a691bee7b7..1a699df1f6 100644 --- a/source/loader/layers/sanitizer/asan_interceptor.hpp +++ b/source/loader/layers/sanitizer/asan_interceptor.hpp @@ -79,6 +79,26 @@ struct QueueInfo { } }; +struct KernelInfo { + ur_kernel_handle_t Handle; + + ur_shared_mutex Mutex; + // Need preserve the order of local arguments + std::map LocalArgs; + + explicit KernelInfo(ur_kernel_handle_t Kernel) : Handle(Kernel) { + [[maybe_unused]] auto Result = + context.urDdiTable.Kernel.pfnRetain(Kernel); + assert(Result == UR_RESULT_SUCCESS); + } + + ~KernelInfo() { + [[maybe_unused]] auto Result = + context.urDdiTable.Kernel.pfnRelease(Handle); + assert(Result == UR_RESULT_SUCCESS); + } +}; + struct ContextInfo { ur_context_handle_t Handle; @@ -107,31 +127,30 @@ struct ContextInfo { } }; -struct LaunchInfo { - uptr LocalShadowOffset = 0; - uptr LocalShadowOffsetEnd = 0; - DeviceSanitizerReport SPIR_DeviceSanitizerReportMem; +struct USMLaunchInfo { + LaunchInfo *Data; ur_context_handle_t Context = nullptr; + ur_device_handle_t Device = nullptr; const size_t *GlobalWorkSize = nullptr; const size_t 
*GlobalWorkOffset = nullptr; std::vector LocalWorkSize; uint32_t WorkDim = 0; - LaunchInfo(ur_context_handle_t Context, const size_t *GlobalWorkSize, - const size_t *LocalWorkSize, const size_t *GlobalWorkOffset, - uint32_t WorkDim) - : Context(Context), GlobalWorkSize(GlobalWorkSize), + USMLaunchInfo(ur_context_handle_t Context, ur_device_handle_t Device, + const size_t *GlobalWorkSize, const size_t *LocalWorkSize, + const size_t *GlobalWorkOffset, uint32_t WorkDim) + : Context(Context), Device(Device), GlobalWorkSize(GlobalWorkSize), GlobalWorkOffset(GlobalWorkOffset), WorkDim(WorkDim) { - [[maybe_unused]] auto Result = - context.urDdiTable.Context.pfnRetain(Context); - assert(Result == UR_RESULT_SUCCESS); if (LocalWorkSize) { this->LocalWorkSize = std::vector(LocalWorkSize, LocalWorkSize + WorkDim); } } - ~LaunchInfo(); + ~USMLaunchInfo(); + + ur_result_t initialize(); + ur_result_t updateKernelInfo(const KernelInfo &KI); }; struct DeviceGlobalInfo { @@ -158,12 +177,11 @@ class SanitizerInterceptor { ur_result_t preLaunchKernel(ur_kernel_handle_t Kernel, ur_queue_handle_t Queue, - LaunchInfo &LaunchInfo); + USMLaunchInfo &LaunchInfo); ur_result_t postLaunchKernel(ur_kernel_handle_t Kernel, ur_queue_handle_t Queue, - ur_event_handle_t &Event, - LaunchInfo &LaunchInfo); + USMLaunchInfo &LaunchInfo); ur_result_t insertContext(ur_context_handle_t Context, std::shared_ptr &CI); @@ -173,6 +191,9 @@ class SanitizerInterceptor { std::shared_ptr &CI); ur_result_t eraseDevice(ur_device_handle_t Device); + ur_result_t insertKernel(ur_kernel_handle_t Kernel); + ur_result_t eraseKernel(ur_kernel_handle_t Kernel); + std::optional findAllocInfoByAddress(uptr Address); std::shared_ptr getContextInfo(ur_context_handle_t Context) { @@ -181,6 +202,18 @@ class SanitizerInterceptor { return m_ContextMap[Context]; } + std::shared_ptr getDeviceInfo(ur_device_handle_t Device) { + std::shared_lock Guard(m_DeviceMapMutex); + assert(m_DeviceMap.find(Device) != m_DeviceMap.end()); + return 
m_DeviceMap[Device]; + } + + std::shared_ptr getKernelInfo(ur_kernel_handle_t Kernel) { + std::shared_lock Guard(m_KernelMapMutex); + assert(m_KernelMap.find(Kernel) != m_KernelMap.end()); + return m_KernelMap[Kernel]; + } + private: ur_result_t updateShadowMemory(std::shared_ptr &ContextInfo, std::shared_ptr &DeviceInfo, @@ -195,26 +228,23 @@ class SanitizerInterceptor { std::shared_ptr &DeviceInfo, ur_queue_handle_t Queue, ur_kernel_handle_t Kernel, - LaunchInfo &LaunchInfo); + USMLaunchInfo &LaunchInfo); ur_result_t allocShadowMemory(ur_context_handle_t Context, std::shared_ptr &DeviceInfo); - std::shared_ptr getDeviceInfo(ur_device_handle_t Device) { - std::shared_lock Guard(m_DeviceMapMutex); - assert(m_DeviceMap.find(Device) != m_DeviceMap.end()); - return m_DeviceMap[Device]; - } - private: std::unordered_map> m_ContextMap; ur_shared_mutex m_ContextMapMutex; - std::unordered_map> m_DeviceMap; ur_shared_mutex m_DeviceMapMutex; + std::unordered_map> + m_KernelMap; + ur_shared_mutex m_KernelMapMutex; + /// Assumption: all USM chunks are allocated in one VA AllocationMap m_AllocationMap; ur_shared_mutex m_AllocationMapMutex; diff --git a/source/loader/layers/sanitizer/asan_libdevice.hpp b/source/loader/layers/sanitizer/asan_libdevice.hpp index 46ddee4423..1c8ef24a9d 100644 --- a/source/loader/layers/sanitizer/asan_libdevice.hpp +++ b/source/loader/layers/sanitizer/asan_libdevice.hpp @@ -62,6 +62,23 @@ struct DeviceSanitizerReport { bool IsRecover = false; }; +struct LocalArgsInfo { + uint64_t Size = 0; + uint64_t SizeWithRedZone = 0; +}; + +struct LaunchInfo { + uintptr_t PrivateShadowOffset = + 0; // don't move this field, we use it in AddressSanitizerPass + + uintptr_t LocalShadowOffset = 0; + uintptr_t LocalShadowOffsetEnd = 0; + DeviceSanitizerReport SanitizerReport; + + uint32_t NumLocalArgs = 0; + LocalArgsInfo *LocalArgs = nullptr; // ordered by ArgIndex +}; + constexpr unsigned ASAN_SHADOW_SCALE = 3; constexpr unsigned ASAN_SHADOW_GRANULARITY = 1ULL << 
ASAN_SHADOW_SCALE; diff --git a/source/loader/layers/sanitizer/common.hpp b/source/loader/layers/sanitizer/common.hpp index d5612100aa..1d43c512da 100644 --- a/source/loader/layers/sanitizer/common.hpp +++ b/source/loader/layers/sanitizer/common.hpp @@ -65,6 +65,41 @@ inline constexpr uptr ComputeRZLog(uptr user_requested_size) { return rz_log; } +/// Returns the next integer (mod 2**64) that is greater than or equal to +/// \p Value and is a multiple of \p Align. \p Align must be non-zero. +/// +/// Examples: +/// \code +/// alignTo(5, 8) = 8 +/// alignTo(17, 8) = 24 +/// alignTo(~0LL, 8) = 0 +/// alignTo(321, 255) = 510 +/// \endcode +inline uint64_t AlignTo(uint64_t Value, uint64_t Align) { + assert(Align != 0u && "Align can't be 0."); + return (Value + Align - 1) / Align * Align; +} + +inline uint64_t GetSizeAndRedzoneSizeForLocal(uint64_t Size, + uint64_t Granularity, + uint64_t Alignment) { + uint64_t Res = 0; + if (Size <= 4) { + Res = 16; + } else if (Size <= 16) { + Res = 32; + } else if (Size <= 128) { + Res = Size + 32; + } else if (Size <= 512) { + Res = Size + 64; + } else if (Size <= 4096) { + Res = Size + 128; + } else { + Res = Size + 256; + } + return AlignTo(std::max(Res, 2 * Granularity), Alignment); +} + // ================================================================ // Trace an internal UR call; returns in case of an error. 
diff --git a/source/loader/layers/sanitizer/ur_sanddi.cpp b/source/loader/layers/sanitizer/ur_sanddi.cpp index 53ce5d1c1e..58f54c9338 100644 --- a/source/loader/layers/sanitizer/ur_sanddi.cpp +++ b/source/loader/layers/sanitizer/ur_sanddi.cpp @@ -272,8 +272,10 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunch( context.logger.debug("==== urEnqueueKernelLaunch"); - LaunchInfo LaunchInfo(GetContext(hQueue), pGlobalWorkSize, pLocalWorkSize, - pGlobalWorkOffset, workDim); + USMLaunchInfo LaunchInfo(GetContext(hQueue), GetDevice(hQueue), + pGlobalWorkSize, pLocalWorkSize, pGlobalWorkOffset, + workDim); + UR_CALL(LaunchInfo.initialize()); UR_CALL(context.interceptor->preLaunchKernel(hKernel, hQueue, LaunchInfo)); @@ -283,8 +285,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunch( pLocalWorkSize, numEventsInWaitList, phEventWaitList, &hEvent); if (result == UR_RESULT_SUCCESS) { - UR_CALL(context.interceptor->postLaunchKernel(hKernel, hQueue, hEvent, - LaunchInfo)); + UR_CALL( + context.interceptor->postLaunchKernel(hKernel, hQueue, LaunchInfo)); } if (phEvent) { @@ -374,6 +376,90 @@ __urdlllocal ur_result_t UR_APICALL urContextRelease( return result; } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urKernelCreate +__urdlllocal ur_result_t UR_APICALL urKernelCreate( + ur_program_handle_t hProgram, ///< [in] handle of the program instance + const char *pKernelName, ///< [in] pointer to null-terminated string. + ur_kernel_handle_t + *phKernel ///< [out] pointer to handle of kernel object created. 
+) { + auto pfnCreate = context.urDdiTable.Kernel.pfnCreate; + + if (nullptr == pfnCreate) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + + context.logger.debug("==== urKernelCreate"); + + UR_CALL(pfnCreate(hProgram, pKernelName, phKernel)); + UR_CALL(context.interceptor->insertKernel(*phKernel)); + + return UR_RESULT_SUCCESS; +} + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urKernelRelease +__urdlllocal ur_result_t urKernelRelease( + ur_kernel_handle_t hKernel ///< [in] handle for the Kernel to release +) { + auto pfnRelease = context.urDdiTable.Kernel.pfnRelease; + + if (nullptr == pfnRelease) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + + context.logger.debug("==== urKernelRelease"); + UR_CALL(pfnRelease(hKernel)); + + if (auto KernelInfo = context.interceptor->getKernelInfo(hKernel)) { + uint32_t RefCount; + UR_CALL(context.urDdiTable.Kernel.pfnGetInfo( + hKernel, UR_KERNEL_INFO_REFERENCE_COUNT, sizeof(RefCount), + &RefCount, nullptr)); + if (RefCount == 1) { + UR_CALL(context.interceptor->eraseKernel(hKernel)); + } + } + + return UR_RESULT_SUCCESS; +} + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urKernelSetArgLocal +__urdlllocal ur_result_t UR_APICALL urKernelSetArgLocal( + ur_kernel_handle_t hKernel, ///< [in] handle of the kernel object + uint32_t argIndex, ///< [in] argument index in range [0, num args - 1] + size_t + argSize, ///< [in] size of the local buffer to be allocated by the runtime + const ur_kernel_arg_local_properties_t + *pProperties ///< [in][optional] pointer to local buffer properties. 
+) { + auto pfnSetArgLocal = context.urDdiTable.Kernel.pfnSetArgLocal; + + if (nullptr == pfnSetArgLocal) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + + context.logger.debug("==== urKernelSetArgLocal (argIndex={}, argSize={})", + argIndex, argSize); + + { + auto KI = context.interceptor->getKernelInfo(hKernel); + std::scoped_lock Guard(KI->Mutex); + // TODO: get local variable alignment + auto argSizeWithRZ = GetSizeAndRedzoneSizeForLocal( + argSize, ASAN_SHADOW_GRANULARITY, ASAN_SHADOW_GRANULARITY); + KI->LocalArgs[argIndex] = LocalArgsInfo{argSize, argSizeWithRZ}; + argSize = argSizeWithRZ; + } + + ur_result_t result = + pfnSetArgLocal(hKernel, argIndex, argSize, pProperties); + + return result; +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Exported function for filling application's Context table /// with current process' addresses @@ -470,6 +556,38 @@ __urdlllocal ur_result_t UR_APICALL urGetProgramExpProcAddrTable( return result; } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Exported function for filling application's Kernel table +/// with current process' addresses +/// +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER +/// - ::UR_RESULT_ERROR_UNSUPPORTED_VERSION +__urdlllocal ur_result_t UR_APICALL urGetKernelProcAddrTable( + ur_api_version_t version, ///< [in] API version requested + ur_kernel_dditable_t + *pDdiTable ///< [in,out] pointer to table of DDI function pointers +) { + if (nullptr == pDdiTable) { + return UR_RESULT_ERROR_INVALID_NULL_POINTER; + } + + if (UR_MAJOR_VERSION(ur_sanitizer_layer::context.version) != + UR_MAJOR_VERSION(version) || + UR_MINOR_VERSION(ur_sanitizer_layer::context.version) > + UR_MINOR_VERSION(version)) { + return UR_RESULT_ERROR_UNSUPPORTED_VERSION; + } + + ur_result_t result = UR_RESULT_SUCCESS; + + pDdiTable->pfnCreate = ur_sanitizer_layer::urKernelCreate; + 
pDdiTable->pfnRelease = ur_sanitizer_layer::urKernelRelease; + pDdiTable->pfnSetArgLocal = ur_sanitizer_layer::urKernelSetArgLocal; + + return result; +} /////////////////////////////////////////////////////////////////////////////// /// @brief Exported function for filling application's Enqueue table /// with current process' addresses @@ -570,6 +688,11 @@ ur_result_t context_t::init(ur_dditable_t *dditable, UR_API_VERSION_CURRENT, &dditable->Context); } + if (UR_RESULT_SUCCESS == result) { + result = ur_sanitizer_layer::urGetKernelProcAddrTable( + UR_API_VERSION_CURRENT, &dditable->Kernel); + } + if (UR_RESULT_SUCCESS == result) { result = ur_sanitizer_layer::urGetProgramProcAddrTable( UR_API_VERSION_CURRENT, &dditable->Program); diff --git a/source/loader/layers/tracing/ur_trcddi.cpp b/source/loader/layers/tracing/ur_trcddi.cpp index 771b46c0c0..6b7be288a3 100644 --- a/source/loader/layers/tracing/ur_trcddi.cpp +++ b/source/loader/layers/tracing/ur_trcddi.cpp @@ -6121,6 +6121,52 @@ __urdlllocal ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( return result; } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urEnqueueTimestampRecordingExp +__urdlllocal ur_result_t UR_APICALL urEnqueueTimestampRecordingExp( + ur_queue_handle_t hQueue, ///< [in] handle of the queue object + bool + blocking, ///< [in] indicates whether the call to this function should block until + ///< until the device timestamp recording command has executed on the + ///< device. + uint32_t numEventsInWaitList, ///< [in] size of the event wait list + const ur_event_handle_t * + phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the kernel execution. + ///< If nullptr, the numEventsInWaitList must be 0, indicating no wait + ///< events. 
+ ur_event_handle_t * + phEvent ///< [in,out] return an event object that identifies this particular kernel + ///< execution instance. Profiling information can be queried + ///< from this event as if `hQueue` had profiling enabled. Querying + ///< `UR_PROFILING_INFO_COMMAND_QUEUED` or `UR_PROFILING_INFO_COMMAND_SUBMIT` + ///< reports the timestamp at the time of the call to this function. + ///< Querying `UR_PROFILING_INFO_COMMAND_START` or `UR_PROFILING_INFO_COMMAND_END` + ///< reports the timestamp recorded when the command is executed on the device. +) { + auto pfnTimestampRecordingExp = + context.urDdiTable.EnqueueExp.pfnTimestampRecordingExp; + + if (nullptr == pfnTimestampRecordingExp) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + + ur_enqueue_timestamp_recording_exp_params_t params = { + &hQueue, &blocking, &numEventsInWaitList, &phEventWaitList, &phEvent}; + uint64_t instance = + context.notify_begin(UR_FUNCTION_ENQUEUE_TIMESTAMP_RECORDING_EXP, + "urEnqueueTimestampRecordingExp", ¶ms); + + ur_result_t result = pfnTimestampRecordingExp( + hQueue, blocking, numEventsInWaitList, phEventWaitList, phEvent); + + context.notify_end(UR_FUNCTION_ENQUEUE_TIMESTAMP_RECORDING_EXP, + "urEnqueueTimestampRecordingExp", ¶ms, &result, + instance); + + return result; +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Intercept function for urProgramBuildExp __urdlllocal ur_result_t UR_APICALL urProgramBuildExp( @@ -6828,6 +6874,10 @@ __urdlllocal ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable( pDdiTable->pfnCooperativeKernelLaunchExp = ur_tracing_layer::urEnqueueCooperativeKernelLaunchExp; + dditable.pfnTimestampRecordingExp = pDdiTable->pfnTimestampRecordingExp; + pDdiTable->pfnTimestampRecordingExp = + ur_tracing_layer::urEnqueueTimestampRecordingExp; + return result; } /////////////////////////////////////////////////////////////////////////////// diff --git a/source/loader/layers/validation/ur_valddi.cpp 
b/source/loader/layers/validation/ur_valddi.cpp index 7939ca21b9..cc11494804 100644 --- a/source/loader/layers/validation/ur_valddi.cpp +++ b/source/loader/layers/validation/ur_valddi.cpp @@ -496,7 +496,7 @@ __urdlllocal ur_result_t UR_APICALL urDeviceGetInfo( return UR_RESULT_ERROR_INVALID_NULL_POINTER; } - if (UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_3D_EXP < propName) { + if (UR_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP < propName) { return UR_RESULT_ERROR_INVALID_ENUMERATION; } @@ -8936,6 +8936,65 @@ __urdlllocal ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( return result; } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urEnqueueTimestampRecordingExp +__urdlllocal ur_result_t UR_APICALL urEnqueueTimestampRecordingExp( + ur_queue_handle_t hQueue, ///< [in] handle of the queue object + bool + blocking, ///< [in] indicates whether the call to this function should block until + ///< until the device timestamp recording command has executed on the + ///< device. + uint32_t numEventsInWaitList, ///< [in] size of the event wait list + const ur_event_handle_t * + phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the kernel execution. + ///< If nullptr, the numEventsInWaitList must be 0, indicating no wait + ///< events. + ur_event_handle_t * + phEvent ///< [in,out] return an event object that identifies this particular kernel + ///< execution instance. Profiling information can be queried + ///< from this event as if `hQueue` had profiling enabled. Querying + ///< `UR_PROFILING_INFO_COMMAND_QUEUED` or `UR_PROFILING_INFO_COMMAND_SUBMIT` + ///< reports the timestamp at the time of the call to this function. + ///< Querying `UR_PROFILING_INFO_COMMAND_START` or `UR_PROFILING_INFO_COMMAND_END` + ///< reports the timestamp recorded when the command is executed on the device. 
+) { + auto pfnTimestampRecordingExp = + context.urDdiTable.EnqueueExp.pfnTimestampRecordingExp; + + if (nullptr == pfnTimestampRecordingExp) { + return UR_RESULT_ERROR_UNINITIALIZED; + } + + if (context.enableParameterValidation) { + if (NULL == hQueue) { + return UR_RESULT_ERROR_INVALID_NULL_HANDLE; + } + + if (NULL == phEvent) { + return UR_RESULT_ERROR_INVALID_NULL_POINTER; + } + + if (phEventWaitList != NULL && numEventsInWaitList > 0) { + for (uint32_t i = 0; i < numEventsInWaitList; ++i) { + if (phEventWaitList[i] == NULL) { + return UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST; + } + } + } + } + + if (context.enableLifetimeValidation && + !refCountContext.isReferenceValid(hQueue)) { + refCountContext.logInvalidReference(hQueue); + } + + ur_result_t result = pfnTimestampRecordingExp( + hQueue, blocking, numEventsInWaitList, phEventWaitList, phEvent); + + return result; +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Intercept function for urProgramBuildExp __urdlllocal ur_result_t UR_APICALL urProgramBuildExp( @@ -9745,6 +9804,10 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable( pDdiTable->pfnCooperativeKernelLaunchExp = ur_validation_layer::urEnqueueCooperativeKernelLaunchExp; + dditable.pfnTimestampRecordingExp = pDdiTable->pfnTimestampRecordingExp; + pDdiTable->pfnTimestampRecordingExp = + ur_validation_layer::urEnqueueTimestampRecordingExp; + return result; } diff --git a/source/loader/ur_adapter_registry.hpp b/source/loader/ur_adapter_registry.hpp index 3cfac34647..060a5ae8a9 100644 --- a/source/loader/ur_adapter_registry.hpp +++ b/source/loader/ur_adapter_registry.hpp @@ -114,11 +114,13 @@ class AdapterRegistry { // to load the adapter. 
std::vector> adaptersLoadPaths; - static constexpr std::array knownAdapterNames{ + static constexpr std::array knownAdapterNames{ MAKE_LIBRARY_NAME("ur_adapter_level_zero", "0"), - MAKE_LIBRARY_NAME("ur_adapter_hip", "0"), MAKE_LIBRARY_NAME("ur_adapter_opencl", "0"), - MAKE_LIBRARY_NAME("ur_adapter_cuda", "0")}; + MAKE_LIBRARY_NAME("ur_adapter_cuda", "0"), + MAKE_LIBRARY_NAME("ur_adapter_hip", "0"), + MAKE_LIBRARY_NAME("ur_adapter_native_cpu", "0"), + }; std::optional> getEnvAdapterSearchPaths() { std::optional> pathStringsOpt; diff --git a/source/loader/ur_ldrddi.cpp b/source/loader/ur_ldrddi.cpp index ffb7eec027..72e8014fe3 100644 --- a/source/loader/ur_ldrddi.cpp +++ b/source/loader/ur_ldrddi.cpp @@ -7653,6 +7653,69 @@ __urdlllocal ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( return result; } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urEnqueueTimestampRecordingExp +__urdlllocal ur_result_t UR_APICALL urEnqueueTimestampRecordingExp( + ur_queue_handle_t hQueue, ///< [in] handle of the queue object + bool + blocking, ///< [in] indicates whether the call to this function should block until + ///< until the device timestamp recording command has executed on the + ///< device. + uint32_t numEventsInWaitList, ///< [in] size of the event wait list + const ur_event_handle_t * + phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the kernel execution. + ///< If nullptr, the numEventsInWaitList must be 0, indicating no wait + ///< events. + ur_event_handle_t * + phEvent ///< [in,out] return an event object that identifies this particular kernel + ///< execution instance. Profiling information can be queried + ///< from this event as if `hQueue` had profiling enabled. 
Querying + ///< `UR_PROFILING_INFO_COMMAND_QUEUED` or `UR_PROFILING_INFO_COMMAND_SUBMIT` + ///< reports the timestamp at the time of the call to this function. + ///< Querying `UR_PROFILING_INFO_COMMAND_START` or `UR_PROFILING_INFO_COMMAND_END` + ///< reports the timestamp recorded when the command is executed on the device. +) { + ur_result_t result = UR_RESULT_SUCCESS; + + // extract platform's function pointer table + auto dditable = reinterpret_cast(hQueue)->dditable; + auto pfnTimestampRecordingExp = + dditable->ur.EnqueueExp.pfnTimestampRecordingExp; + if (nullptr == pfnTimestampRecordingExp) { + return UR_RESULT_ERROR_UNINITIALIZED; + } + + // convert loader handle to platform handle + hQueue = reinterpret_cast(hQueue)->handle; + + // convert loader handles to platform handles + auto phEventWaitListLocal = + std::vector(numEventsInWaitList); + for (size_t i = 0; i < numEventsInWaitList; ++i) { + phEventWaitListLocal[i] = + reinterpret_cast(phEventWaitList[i])->handle; + } + + // forward to device-platform + result = pfnTimestampRecordingExp(hQueue, blocking, numEventsInWaitList, + phEventWaitListLocal.data(), phEvent); + + if (UR_RESULT_SUCCESS != result) { + return result; + } + + try { + // convert platform handle to loader handle + *phEvent = reinterpret_cast( + ur_event_factory.getInstance(*phEvent, dditable)); + } catch (std::bad_alloc &) { + result = UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + } + + return result; +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Intercept function for urProgramBuildExp __urdlllocal ur_result_t UR_APICALL urProgramBuildExp( @@ -8373,6 +8436,8 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable( // return pointers to loader's DDIs pDdiTable->pfnCooperativeKernelLaunchExp = ur_loader::urEnqueueCooperativeKernelLaunchExp; + pDdiTable->pfnTimestampRecordingExp = + ur_loader::urEnqueueTimestampRecordingExp; } else { // return pointers directly to platform's DDIs 
*pDdiTable = diff --git a/source/loader/ur_libapi.cpp b/source/loader/ur_libapi.cpp index dba668e61b..baa8173976 100644 --- a/source/loader/ur_libapi.cpp +++ b/source/loader/ur_libapi.cpp @@ -842,7 +842,7 @@ ur_result_t UR_APICALL urDeviceGetSelected( /// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE /// + `NULL == hDevice` /// - ::UR_RESULT_ERROR_INVALID_ENUMERATION -/// + `::UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_3D_EXP < propName` +/// + `::UR_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP < propName` /// - ::UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION /// + If `propName` is not supported by the adapter. /// - ::UR_RESULT_ERROR_INVALID_SIZE @@ -8315,6 +8315,52 @@ ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( return exceptionToResult(std::current_exception()); } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Enqueue a command for recording the device timestamp +/// +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_UNINITIALIZED +/// - ::UR_RESULT_ERROR_DEVICE_LOST +/// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC +/// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE +/// + `NULL == hQueue` +/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER +/// + `NULL == phEvent` +/// - ::UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST +ur_result_t UR_APICALL urEnqueueTimestampRecordingExp( + ur_queue_handle_t hQueue, ///< [in] handle of the queue object + bool + blocking, ///< [in] indicates whether the call to this function should block until + ///< until the device timestamp recording command has executed on the + ///< device. + uint32_t numEventsInWaitList, ///< [in] size of the event wait list + const ur_event_handle_t * + phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the kernel execution. + ///< If nullptr, the numEventsInWaitList must be 0, indicating no wait + ///< events. 
+ ur_event_handle_t * + phEvent ///< [in,out] return an event object that identifies this particular kernel + ///< execution instance. Profiling information can be queried + ///< from this event as if `hQueue` had profiling enabled. Querying + ///< `UR_PROFILING_INFO_COMMAND_QUEUED` or `UR_PROFILING_INFO_COMMAND_SUBMIT` + ///< reports the timestamp at the time of the call to this function. + ///< Querying `UR_PROFILING_INFO_COMMAND_START` or `UR_PROFILING_INFO_COMMAND_END` + ///< reports the timestamp recorded when the command is executed on the device. + ) try { + auto pfnTimestampRecordingExp = + ur_lib::context->urDdiTable.EnqueueExp.pfnTimestampRecordingExp; + if (nullptr == pfnTimestampRecordingExp) { + return UR_RESULT_ERROR_UNINITIALIZED; + } + + return pfnTimestampRecordingExp(hQueue, blocking, numEventsInWaitList, + phEventWaitList, phEvent); +} catch (...) { + return exceptionToResult(std::current_exception()); +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Produces an executable program from one program, negates need for the /// linking step. 
diff --git a/source/loader/ur_print.cpp b/source/loader/ur_print.cpp index 5721ebf3ef..3b144c87ad 100644 --- a/source/loader/ur_print.cpp +++ b/source/loader/ur_print.cpp @@ -1653,6 +1653,14 @@ ur_result_t urPrintEnqueueCooperativeKernelLaunchExpParams( return str_copy(&ss, buffer, buff_size, out_size); } +ur_result_t urPrintEnqueueTimestampRecordingExpParams( + const struct ur_enqueue_timestamp_recording_exp_params_t *params, + char *buffer, const size_t buff_size, size_t *out_size) { + std::stringstream ss; + ss << params; + return str_copy(&ss, buffer, buff_size, out_size); +} + ur_result_t urPrintEventGetInfoParams(const struct ur_event_get_info_params_t *params, char *buffer, const size_t buff_size, diff --git a/source/ur_api.cpp b/source/ur_api.cpp index 7f4746fcb7..d21d81411f 100644 --- a/source/ur_api.cpp +++ b/source/ur_api.cpp @@ -736,7 +736,7 @@ ur_result_t UR_APICALL urDeviceGetSelected( /// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE /// + `NULL == hDevice` /// - ::UR_RESULT_ERROR_INVALID_ENUMERATION -/// + `::UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_3D_EXP < propName` +/// + `::UR_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP < propName` /// - ::UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION /// + If `propName` is not supported by the adapter. 
/// - ::UR_RESULT_ERROR_INVALID_SIZE @@ -7034,6 +7034,44 @@ ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( return result; } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Enqueue a command for recording the device timestamp +/// +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_UNINITIALIZED +/// - ::UR_RESULT_ERROR_DEVICE_LOST +/// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC +/// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE +/// + `NULL == hQueue` +/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER +/// + `NULL == phEvent` +/// - ::UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST +ur_result_t UR_APICALL urEnqueueTimestampRecordingExp( + ur_queue_handle_t hQueue, ///< [in] handle of the queue object + bool + blocking, ///< [in] indicates whether the call to this function should block until + ///< until the device timestamp recording command has executed on the + ///< device. + uint32_t numEventsInWaitList, ///< [in] size of the event wait list + const ur_event_handle_t * + phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the kernel execution. + ///< If nullptr, the numEventsInWaitList must be 0, indicating no wait + ///< events. + ur_event_handle_t * + phEvent ///< [in,out] return an event object that identifies this particular kernel + ///< execution instance. Profiling information can be queried + ///< from this event as if `hQueue` had profiling enabled. Querying + ///< `UR_PROFILING_INFO_COMMAND_QUEUED` or `UR_PROFILING_INFO_COMMAND_SUBMIT` + ///< reports the timestamp at the time of the call to this function. + ///< Querying `UR_PROFILING_INFO_COMMAND_START` or `UR_PROFILING_INFO_COMMAND_END` + ///< reports the timestamp recorded when the command is executed on the device. 
+) { + ur_result_t result = UR_RESULT_SUCCESS; + return result; +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Produces an executable program from one program, negates need for the /// linking step. diff --git a/test/adapters/CMakeLists.txt b/test/adapters/CMakeLists.txt index 5eff6e357a..4ce7d2505d 100644 --- a/test/adapters/CMakeLists.txt +++ b/test/adapters/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (C) 2023 Intel Corporation +# Copyright (C) 2023-2024 Intel Corporation # Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. # See LICENSE.TXT # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception diff --git a/test/adapters/level_zero/CMakeLists.txt b/test/adapters/level_zero/CMakeLists.txt index 45d68594ed..678d26bfe8 100644 --- a/test/adapters/level_zero/CMakeLists.txt +++ b/test/adapters/level_zero/CMakeLists.txt @@ -25,3 +25,22 @@ else() add_dependencies(test-adapter-level_zero generate_device_binaries kernel_names_header) endif() + +if(LINUX) + # Make L0 use CallMap from a seprate shared lib so that we can access the map + # from the tests. This only seems to work on linux + add_library(zeCallMap SHARED zeCallMap.cpp) + target_compile_definitions(ur_adapter_level_zero PRIVATE UR_L0_CALL_COUNT_IN_TESTS) + target_link_libraries(ur_adapter_level_zero PRIVATE zeCallMap) + + add_adapter_test(level_zero + FIXTURE DEVICES + SOURCES + event_cache_tests.cpp + ENVIRONMENT + "UR_ADAPTERS_FORCE_LOAD=\"$\"" + "UR_L0_LEAKS_DEBUG=1" + ) + + target_link_libraries(test-adapter-level_zero PRIVATE zeCallMap) +endif() diff --git a/test/adapters/level_zero/event_cache_tests.cpp b/test/adapters/level_zero/event_cache_tests.cpp new file mode 100644 index 0000000000..53bc39ad96 --- /dev/null +++ b/test/adapters/level_zero/event_cache_tests.cpp @@ -0,0 +1,163 @@ +// Copyright (C) 2024 Intel Corporation +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "ur_print.hpp" +#include "uur/fixtures.h" +#include "uur/raii.h" + +#include +#include + +template auto combineFlags(std::tuple tuple) { + return std::apply([](auto... args) { return (... |= args); }, tuple); +} + +extern std::map *ZeCallCount; + +using FlagsTupleType = std::tuple; + +struct urEventCacheTest : uur::urContextTestWithParam { + void SetUp() override { + UUR_RETURN_ON_FATAL_FAILURE(urContextTestWithParam::SetUp()); + + flags = combineFlags(getParam()); + + ur_queue_properties_t props; + props.flags = flags; + ASSERT_SUCCESS(urQueueCreate(context, device, &props, &queue)); + ASSERT_NE(queue, nullptr); + + ASSERT_SUCCESS(urMemBufferCreate(context, UR_MEM_FLAG_WRITE_ONLY, size, + nullptr, &buffer)); + + (*ZeCallCount)["zeEventCreate"] = 0; + (*ZeCallCount)["zeEventDestroy"] = 0; + } + + void TearDown() override { + if (buffer) { + EXPECT_SUCCESS(urMemRelease(buffer)); + } + if (queue) { + UUR_ASSERT_SUCCESS_OR_UNSUPPORTED(urQueueRelease(queue)); + } + UUR_RETURN_ON_FATAL_FAILURE(urContextTestWithParam::TearDown()); + } + + auto enqueueWork(ur_event_handle_t *hEvent, int data) { + input.assign(count, data); + UUR_ASSERT_SUCCESS_OR_UNSUPPORTED(urEnqueueMemBufferWrite( + queue, buffer, false, 0, size, input.data(), 0, nullptr, hEvent)); + } + + void verifyData() { + std::vector output(count, 1); + UUR_ASSERT_SUCCESS_OR_UNSUPPORTED(urEnqueueMemBufferRead( + queue, buffer, true, 0, size, output.data(), 0, nullptr, nullptr)); + + if (!(flags & UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE)) { + ASSERT_EQ(input, output); + } + } + + const size_t count = 1024; + const size_t size = sizeof(uint32_t) * count; + ur_mem_handle_t buffer = nullptr; + ur_queue_handle_t queue = nullptr; + std::vector input; + ur_queue_flags_t flags; +}; + +TEST_P(urEventCacheTest, eventsReuseNoVisibleEvent) { + static constexpr int numIters = 16; + static constexpr int numEnqueues = 128; + + for 
(int i = 0; i < numIters; i++) { + for (int j = 0; j < numEnqueues; j++) { + enqueueWork(nullptr, i * numEnqueues + j); + } + UUR_ASSERT_SUCCESS_OR_UNSUPPORTED(urQueueFinish(queue)); + verifyData(); + } + + // TODO: why events are not reused for UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE? + if ((flags & UR_QUEUE_FLAG_DISCARD_EVENTS) && + !(flags & UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE)) { + ASSERT_EQ((*ZeCallCount)["zeEventCreate"], 2); + } else { + ASSERT_GE((*ZeCallCount)["zeEventCreate"], numIters * numEnqueues); + } +} + +TEST_P(urEventCacheTest, eventsReuseWithVisibleEvent) { + static constexpr int numIters = 16; + static constexpr int numEnqueues = 128; + + for (int i = 0; i < numIters; i++) { + std::vector events(numEnqueues); + for (int j = 0; j < numEnqueues; j++) { + enqueueWork(events[j].ptr(), i * numEnqueues + j); + } + UUR_ASSERT_SUCCESS_OR_UNSUPPORTED(urQueueFinish(queue)); + verifyData(); + } + + ASSERT_LT((*ZeCallCount)["zeEventCreate"], numIters * numEnqueues); +} + +TEST_P(urEventCacheTest, eventsReuseWithVisibleEventAndWait) { + static constexpr int numIters = 16; + static constexpr int numEnqueues = 128; + static constexpr int waitEveryN = 16; + + for (int i = 0; i < numIters; i++) { + std::vector events; + for (int j = 0; j < numEnqueues; j++) { + events.emplace_back(); + enqueueWork(events.back().ptr(), i * numEnqueues + j); + + if (j > 0 && j % waitEveryN == 0) { + ASSERT_SUCCESS(urEventWait(waitEveryN, + (ur_event_handle_t *)events.data())); + verifyData(); + events.clear(); + } + } + UUR_ASSERT_SUCCESS_OR_UNSUPPORTED(urQueueFinish(queue)); + } + + ASSERT_GE((*ZeCallCount)["zeEventCreate"], waitEveryN); + // TODO: why there are more events than this? 
+ // ASSERT_LE((*ZeCallCount)["zeEventCreate"], waitEveryN * 2 + 2); +} + +template +inline std::string +printFlags(const testing::TestParamInfo &info) { + const auto device_handle = std::get<0>(info.param); + const auto platform_device_name = + uur::GetPlatformAndDeviceName(device_handle); + auto flags = combineFlags(std::get<1>(info.param)); + + std::stringstream ss; + ur::details::printFlag(ss, flags); + + auto str = ss.str(); + std::replace(str.begin(), str.end(), ' ', '_'); + std::replace(str.begin(), str.end(), '|', '_'); + return platform_device_name + "__" + str; +} + +UUR_TEST_SUITE_P( + urEventCacheTest, + ::testing::Combine( + testing::Values(0, UR_QUEUE_FLAG_DISCARD_EVENTS), + testing::Values(0, UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE), + // TODO: why the test fails with UR_QUEUE_FLAG_SUBMISSION_BATCHED? + testing::Values( + UR_QUEUE_FLAG_SUBMISSION_IMMEDIATE /*, UR_QUEUE_FLAG_SUBMISSION_BATCHED */), + testing::Values(0, UR_QUEUE_FLAG_PROFILING_ENABLE)), + printFlags); diff --git a/test/adapters/level_zero/zeCallMap.cpp b/test/adapters/level_zero/zeCallMap.cpp new file mode 100644 index 0000000000..3c6487f36d --- /dev/null +++ b/test/adapters/level_zero/zeCallMap.cpp @@ -0,0 +1,12 @@ +// Copyright (C) 2024 Intel Corporation +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. +// See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include +#include + +// Map used by L0 adapter to count the number of calls to each L0 function +// Lifetime is managed by the adapter, this variable is defined here +// only so that we can read it from the tests. 
+std::map *ZeCallCount = nullptr; diff --git a/test/conformance/CMakeLists.txt b/test/conformance/CMakeLists.txt index a11cc82932..31048cce28 100644 --- a/test/conformance/CMakeLists.txt +++ b/test/conformance/CMakeLists.txt @@ -33,7 +33,7 @@ function(add_test_adapter name adapter) endif() set(TEST_ENV UR_ADAPTERS_FORCE_LOAD="$") - if(NOT UR_CONFORMANCE_TEST_DIR) + if(UR_CONFORMANCE_ENABLE_MATCH_FILES) list(APPEND TEST_ENV GTEST_COLOR=no) endif() set_tests_properties(${TEST_NAME} PROPERTIES @@ -135,6 +135,7 @@ if(UR_DPCXX) add_subdirectory(kernel) add_subdirectory(program) add_subdirectory(enqueue) + add_subdirectory(integration) add_subdirectory(exp_command_buffer) add_subdirectory(exp_usm_p2p) else() diff --git a/test/conformance/device/device_adapter_level_zero.match b/test/conformance/device/device_adapter_level_zero.match index b1bff7376c..96a29ca0f5 100644 --- a/test/conformance/device/device_adapter_level_zero.match +++ b/test/conformance/device/device_adapter_level_zero.match @@ -1,18 +1,9 @@ -{{OPT}}urDeviceGetGlobalTimestampTest.SuccessSynchronizedTime urDeviceGetInfoTest.Success/UR_DEVICE_INFO_GLOBAL_MEM_FREE urDeviceGetInfoTest.Success/UR_DEVICE_INFO_ERROR_CORRECTION_SUPPORT urDeviceGetInfoTest.Success/UR_DEVICE_INFO_HOST_UNIFIED_MEMORY -urDeviceGetInfoTest.Success/UR_DEVICE_INFO_ENDIAN_LITTLE urDeviceGetInfoTest.Success/UR_DEVICE_INFO_AVAILABLE -urDeviceGetInfoTest.Success/UR_DEVICE_INFO_COMPILER_AVAILABLE -urDeviceGetInfoTest.Success/UR_DEVICE_INFO_LINKER_AVAILABLE -urDeviceGetInfoTest.Success/UR_DEVICE_INFO_PREFERRED_INTEROP_USER_SYNC -urDeviceGetInfoTest.Success/UR_DEVICE_INFO_SUB_GROUP_INDEPENDENT_FORWARD_PROGRESS urDeviceGetInfoTest.Success/UR_DEVICE_INFO_MAX_MEMORY_BANDWIDTH -urDeviceGetInfoTest.Success/UR_DEVICE_INFO_IMAGE_SRGB urDeviceGetInfoTest.Success/UR_DEVICE_INFO_BUILD_ON_SUBDEVICE -urDeviceGetInfoTest.Success/UR_DEVICE_INFO_ATOMIC_64 urDeviceGetInfoTest.Success/UR_DEVICE_INFO_ASYNC_BARRIER 
-urDeviceGetInfoTest.Success/UR_DEVICE_INFO_MEM_CHANNEL_SUPPORT urDeviceGetInfoTest.Success/UR_DEVICE_INFO_HOST_PIPE_READ_WRITE_SUPPORTED urDeviceGetInfoTest.Success/UR_DEVICE_INFO_MAX_REGISTERS_PER_WORK_GROUP diff --git a/test/conformance/device/device_adapter_opencl.match b/test/conformance/device/device_adapter_opencl.match index 716ebd54fe..e69de29bb2 100644 --- a/test/conformance/device/device_adapter_opencl.match +++ b/test/conformance/device/device_adapter_opencl.match @@ -1 +0,0 @@ -urDeviceGetInfoTest.Success/UR_DEVICE_INFO_HALF_FP_CONFIG diff --git a/test/conformance/device_code/CMakeLists.txt b/test/conformance/device_code/CMakeLists.txt index 26358d49f6..5b6b99004f 100644 --- a/test/conformance/device_code/CMakeLists.txt +++ b/test/conformance/device_code/CMakeLists.txt @@ -10,6 +10,10 @@ else() set(AMD_ARCH "${UR_CONFORMANCE_AMD_ARCH}") endif() +cmake_path(GET UR_DPCXX EXTENSION EXE) +cmake_path(REPLACE_FILENAME UR_DPCXX "clang-offload-extract${EXE}" OUTPUT_VARIABLE DEFAULT_EXTRACTOR_NAME) +set(UR_DEVICE_CODE_EXTRACTOR "${DEFAULT_EXTRACTOR_NAME}" CACHE PATH "Path to clang-offload-extract") + if("${AMD_ARCH}" STREQUAL "" AND "${TARGET_TRIPLES}" MATCHES "amd") find_package(RocmAgentEnumerator) if(NOT ROCM_AGENT_ENUMERATOR_FOUND) @@ -59,6 +63,8 @@ macro(add_device_binary SOURCE_FILE) foreach(TRIPLE ${TARGET_TRIPLES}) set(EXE_PATH "${DEVICE_BINARY_DIR}/${KERNEL_NAME}_${TRIPLE}") + set(BIN_PATH "${DEVICE_BINARY_DIR}/${TRIPLE}.bin.0") + if(${TRIPLE} MATCHES "amd") set(AMD_TARGET_BACKEND -Xsycl-target-backend=${TRIPLE}) set(AMD_OFFLOAD_ARCH --offload-arch=${AMD_ARCH}) @@ -81,17 +87,17 @@ macro(add_device_binary SOURCE_FILE) continue() endif() - add_custom_command(OUTPUT ${EXE_PATH} + add_custom_command(OUTPUT "${BIN_PATH}" COMMAND ${UR_DPCXX} -fsycl -fsycl-targets=${TRIPLE} -fsycl-device-code-split=off ${AMD_TARGET_BACKEND} ${AMD_OFFLOAD_ARCH} ${AMD_NOGPULIB} ${DPCXX_BUILD_FLAGS_LIST} ${SOURCE_FILE} -o ${EXE_PATH} - COMMAND ${CMAKE_COMMAND} -E env 
${EXTRA_ENV} SYCL_DUMP_IMAGES=true - ${EXE_PATH} || exit 0 + COMMAND ${CMAKE_COMMAND} -E env ${EXTRA_ENV} ${UR_DEVICE_CODE_EXTRACTOR} --stem="${TRIPLE}.bin" ${EXE_PATH} + WORKING_DIRECTORY "${DEVICE_BINARY_DIR}" DEPENDS ${SOURCE_FILE} ) - add_custom_target(generate_${KERNEL_NAME}_${TRIPLE} DEPENDS ${EXE_PATH}) + add_custom_target(generate_${KERNEL_NAME}_${TRIPLE} DEPENDS ${BIN_PATH}) add_dependencies(generate_device_binaries generate_${KERNEL_NAME}_${TRIPLE}) endforeach() list(APPEND DEVICE_CODE_SOURCES ${SOURCE_FILE}) @@ -106,6 +112,9 @@ add_device_binary(${CMAKE_CURRENT_SOURCE_DIR}/fill_usm.cpp) add_device_binary(${CMAKE_CURRENT_SOURCE_DIR}/foo.cpp) add_device_binary(${CMAKE_CURRENT_SOURCE_DIR}/image_copy.cpp) add_device_binary(${CMAKE_CURRENT_SOURCE_DIR}/mean.cpp) +add_device_binary(${CMAKE_CURRENT_SOURCE_DIR}/cpy_and_mult.cpp) +add_device_binary(${CMAKE_CURRENT_SOURCE_DIR}/cpy_and_mult_usm.cpp) +add_device_binary(${CMAKE_CURRENT_SOURCE_DIR}/multiply.cpp) add_device_binary(${CMAKE_CURRENT_SOURCE_DIR}/spec_constant.cpp) add_device_binary(${CMAKE_CURRENT_SOURCE_DIR}/spec_constant_multiple.cpp) add_device_binary(${CMAKE_CURRENT_SOURCE_DIR}/usm_ll.cpp) diff --git a/test/conformance/device_code/cpy_and_mult.cpp b/test/conformance/device_code/cpy_and_mult.cpp new file mode 100644 index 0000000000..a2bdaccf55 --- /dev/null +++ b/test/conformance/device_code/cpy_and_mult.cpp @@ -0,0 +1,29 @@ +// Copyright (C) 2024 Intel Corporation +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include + +int main() { + size_t array_size = 16; + cl::sycl::queue sycl_queue; + std::vector src(array_size, 1); + std::vector dst(array_size, 1); + auto src_buff = + cl::sycl::buffer(src.data(), cl::sycl::range<1>(array_size)); + auto dst_buff = + cl::sycl::buffer(dst.data(), cl::sycl::range<1>(array_size)); + + sycl_queue.submit([&](cl::sycl::handler &cgh) { + auto src_acc = src_buff.get_access(cgh); + auto dst_acc = dst_buff.get_access(cgh); + cgh.parallel_for( + cl::sycl::range<1>{array_size}, + [src_acc, dst_acc](cl::sycl::item<1> itemId) { + auto id = itemId.get_id(0); + dst_acc[id] = src_acc[id] * 2; + }); + }); + return 0; +} diff --git a/test/conformance/device_code/cpy_and_mult_usm.cpp b/test/conformance/device_code/cpy_and_mult_usm.cpp new file mode 100644 index 0000000000..e253dfe14b --- /dev/null +++ b/test/conformance/device_code/cpy_and_mult_usm.cpp @@ -0,0 +1,22 @@ +// Copyright (C) 2024 Intel Corporation +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. +// See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include + +int main() { + size_t array_size = 16; + cl::sycl::queue sycl_queue; + uint32_t *src = cl::sycl::malloc_device(array_size, sycl_queue); + uint32_t *dst = cl::sycl::malloc_device(array_size, sycl_queue); + sycl_queue.submit([&](cl::sycl::handler &cgh) { + cgh.parallel_for( + cl::sycl::range<1>{array_size}, + [src, dst](cl::sycl::item<1> itemId) { + auto id = itemId.get_id(0); + dst[id] = src[id] * 2; + }); + }); + return 0; +} diff --git a/test/conformance/device_code/multiply.cpp b/test/conformance/device_code/multiply.cpp new file mode 100644 index 0000000000..070cef18fd --- /dev/null +++ b/test/conformance/device_code/multiply.cpp @@ -0,0 +1,20 @@ +// Copyright (C) 2024 Intel Corporation +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include + +int main() { + + const size_t inputSize = 1; + sycl::queue sycl_queue; + uint32_t *inputArray = sycl::malloc_shared(inputSize, sycl_queue); + + sycl_queue.submit([&](sycl::handler &cgh) { + cgh.parallel_for( + sycl::range<1>(inputSize), + [=](sycl::id<1> itemID) { inputArray[itemID] *= 2; }); + }); + return 0; +} diff --git a/test/conformance/enqueue/CMakeLists.txt b/test/conformance/enqueue/CMakeLists.txt index 532cab1b85..7cc68203a0 100644 --- a/test/conformance/enqueue/CMakeLists.txt +++ b/test/conformance/enqueue/CMakeLists.txt @@ -29,4 +29,5 @@ add_conformance_test_with_kernels_environment(enqueue urEnqueueUSMPrefetch.cpp urEnqueueReadHostPipe.cpp urEnqueueWriteHostPipe.cpp + urEnqueueTimestampRecording.cpp ) diff --git a/test/conformance/enqueue/urEnqueueTimestampRecording.cpp b/test/conformance/enqueue/urEnqueueTimestampRecording.cpp new file mode 100644 index 0000000000..5fc8ee5547 --- /dev/null +++ b/test/conformance/enqueue/urEnqueueTimestampRecording.cpp @@ -0,0 +1,94 @@ +// Copyright (C) 2024 Intel Corporation +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. +// See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include + +struct urEnqueueTimestampRecordingExpTest : uur::urQueueTest { + void SetUp() override { + UUR_RETURN_ON_FATAL_FAILURE(urQueueTest::SetUp()); + bool timestamp_recording_support = false; + ASSERT_SUCCESS(uur::GetTimestampRecordingSupport( + device, timestamp_recording_support)); + if (!timestamp_recording_support) { + GTEST_SKIP() << "Timestamp recording is not supported"; + } + } + + void TearDown() override { urQueueTest::TearDown(); } +}; +UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(urEnqueueTimestampRecordingExpTest); + +void common_check(ur_event_handle_t event) { + // All successful runs should return a non-zero profiling results. 
+ uint64_t queuedTime = 0, submitTime = 0, startTime = 0, endTime = 0; + ASSERT_SUCCESS( + urEventGetProfilingInfo(event, UR_PROFILING_INFO_COMMAND_QUEUED, + sizeof(uint64_t), &queuedTime, nullptr)); + ASSERT_SUCCESS( + urEventGetProfilingInfo(event, UR_PROFILING_INFO_COMMAND_SUBMIT, + sizeof(uint64_t), &submitTime, nullptr)); + ASSERT_SUCCESS( + urEventGetProfilingInfo(event, UR_PROFILING_INFO_COMMAND_START, + sizeof(uint64_t), &startTime, nullptr)); + ASSERT_SUCCESS(urEventGetProfilingInfo(event, UR_PROFILING_INFO_COMMAND_END, + sizeof(uint64_t), &endTime, + nullptr)); + ASSERT_TRUE(queuedTime > 0); + ASSERT_TRUE(submitTime > 0); + ASSERT_TRUE(startTime > 0); + ASSERT_TRUE(endTime > 0); + ASSERT_TRUE(queuedTime == submitTime); + ASSERT_TRUE(startTime == endTime); + ASSERT_TRUE(endTime >= submitTime); +} + +TEST_P(urEnqueueTimestampRecordingExpTest, Success) { + ur_event_handle_t event = nullptr; + ASSERT_SUCCESS( + urEnqueueTimestampRecordingExp(queue, false, 0, nullptr, &event)); + ASSERT_SUCCESS(urQueueFinish(queue)); + common_check(event); + ASSERT_SUCCESS(urEventRelease(event)); +} + +TEST_P(urEnqueueTimestampRecordingExpTest, SuccessBlocking) { + ur_event_handle_t event = nullptr; + ASSERT_SUCCESS( + urEnqueueTimestampRecordingExp(queue, true, 0, nullptr, &event)); + common_check(event); + ASSERT_SUCCESS(urEventRelease(event)); +} + +TEST_P(urEnqueueTimestampRecordingExpTest, InvalidNullHandleQueue) { + ur_event_handle_t event = nullptr; + ASSERT_EQ_RESULT( + urEnqueueTimestampRecordingExp(nullptr, false, 0, nullptr, &event), + UR_RESULT_ERROR_INVALID_NULL_HANDLE); +} + +TEST_P(urEnqueueTimestampRecordingExpTest, InvalidNullPointerEvent) { + ASSERT_EQ_RESULT( + urEnqueueTimestampRecordingExp(queue, false, 0, nullptr, nullptr), + UR_RESULT_ERROR_INVALID_NULL_POINTER); +} + +TEST_P(urEnqueueTimestampRecordingExpTest, InvalidNullPtrEventWaitList) { + ur_event_handle_t event = nullptr; + ASSERT_EQ_RESULT( + urEnqueueTimestampRecordingExp(queue, true, 1, nullptr, 
&event), + UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST); + + ur_event_handle_t validEvent; + ASSERT_SUCCESS(urEnqueueEventsWait(queue, 0, nullptr, &validEvent)); + ASSERT_EQ_RESULT( + urEnqueueTimestampRecordingExp(queue, true, 0, &validEvent, &event), + UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST); + ASSERT_SUCCESS(urEventRelease(validEvent)); + + ur_event_handle_t invalidEvent = nullptr; + ASSERT_EQ_RESULT( + urEnqueueTimestampRecordingExp(queue, true, 0, &invalidEvent, &event), + UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST); +} diff --git a/test/conformance/integration/CMakeLists.txt b/test/conformance/integration/CMakeLists.txt new file mode 100644 index 0000000000..1689cf04f4 --- /dev/null +++ b/test/conformance/integration/CMakeLists.txt @@ -0,0 +1,10 @@ +# Copyright (C) 2024 Intel Corporation +# Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. +# See LICENSE.TXT +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +add_conformance_test_with_kernels_environment(integration + QueueEmptyStatus.cpp + QueueUSM.cpp + QueueBuffer.cpp + ) diff --git a/test/conformance/integration/QueueBuffer.cpp b/test/conformance/integration/QueueBuffer.cpp new file mode 100644 index 0000000000..d801ebf684 --- /dev/null +++ b/test/conformance/integration/QueueBuffer.cpp @@ -0,0 +1,108 @@ +// Copyright (C) 2024 Intel Corporation +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "fixtures.h" +#include +#include + +struct QueueBufferTestWithParam : uur::IntegrationQueueTestWithParam { + void SetUp() override { + program_name = "cpy_and_mult"; + UUR_RETURN_ON_FATAL_FAILURE( + uur::IntegrationQueueTestWithParam::SetUp()); + } + + void TearDown() override { uur::IntegrationQueueTestWithParam::TearDown(); } + + void verifyResults(ur_mem_handle_t Buffer, uint32_t ExpectedValue) { + uint32_t HostMem[ArraySize] = {}; + ASSERT_SUCCESS(urEnqueueMemBufferRead(Queue, Buffer, true, 0, + sizeof(uint32_t) * ArraySize, + HostMem, 0, nullptr, nullptr)); + + for (uint32_t i : HostMem) { + ASSERT_EQ(i, ExpectedValue); + } + } + + ur_mem_handle_t Buffer1 = nullptr; + ur_mem_handle_t Buffer2 = nullptr; +}; + +UUR_TEST_SUITE_P(QueueBufferTestWithParam, + testing::Values(0, /* In-Order */ + UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE), + uur::IntegrationQueueTestWithParam::paramPrinter); + +/* Submits multiple kernels that interact with each other by accessing and + * writing to the same buffers. + * Checks that when using an IN_ORDER queue, no synchronization is needed + * between calls to urEnqueueKernelLaunch. + * Checks that when using an OUT_OF_ORDER queue, synchronizing using only + * event barriers is enough. 
*/ +TEST_P(QueueBufferTestWithParam, QueueBufferTest) { + + std::vector EventsFill; + ur_event_handle_t Event; + + size_t Buffer1Index; + size_t Buffer2Index; + ASSERT_NO_FATAL_FAILURE( + AddBuffer1DArg(ArraySize * sizeof(uint32_t), &Buffer1, &Buffer1Index)); + ASSERT_NO_FATAL_FAILURE( + AddBuffer1DArg(ArraySize * sizeof(uint32_t), &Buffer2, &Buffer2Index)); + + ASSERT_SUCCESS(urEnqueueMemBufferFill( + Queue, Buffer1, &InitialValue, sizeof(uint32_t), 0, + ArraySize * sizeof(uint32_t), 0, nullptr, &Event)); + EventsFill.push_back(Event); + + ASSERT_SUCCESS(urEnqueueMemBufferFill( + Queue, Buffer2, &InitialValue, sizeof(uint32_t), 0, + ArraySize * sizeof(uint32_t), 0, nullptr, &Event)); + EventsFill.push_back(Event); + + ASSERT_NO_FATAL_FAILURE(submitBarrierIfNeeded(EventsFill)); + + constexpr size_t GlobalOffset = 0; + constexpr size_t NDimensions = 1; + constexpr uint32_t NumIterations = 5; + + uint32_t CurValueMem1 = InitialValue; + uint32_t CurValueMem2 = InitialValue; + for (uint32_t i = 0; i < NumIterations; ++i) { + + /* Copy from DeviceMem1 to DeviceMem2 and multiply by 2 */ + ASSERT_SUCCESS( + urKernelSetArgMemObj(kernel, Buffer2Index, nullptr, Buffer2)); + ASSERT_SUCCESS( + urKernelSetArgMemObj(kernel, Buffer1Index, nullptr, Buffer1)); + + ASSERT_SUCCESS(urEnqueueKernelLaunch(Queue, kernel, NDimensions, + &GlobalOffset, &ArraySize, nullptr, + 0, nullptr, &Event)); + ASSERT_NO_FATAL_FAILURE(submitBarrierIfNeeded(Event)); + + CurValueMem2 = CurValueMem1 * 2; + + /* Copy from DeviceMem1 to DeviceMem2 and multiply by 2 */ + ASSERT_SUCCESS( + urKernelSetArgMemObj(kernel, Buffer1Index, nullptr, Buffer2)); + ASSERT_SUCCESS( + urKernelSetArgMemObj(kernel, Buffer2Index, nullptr, Buffer1)); + + ASSERT_SUCCESS(urEnqueueKernelLaunch(Queue, kernel, NDimensions, + &GlobalOffset, &ArraySize, nullptr, + 0, nullptr, &Event)); + ASSERT_NO_FATAL_FAILURE(submitBarrierIfNeeded(Event)); + + CurValueMem1 = CurValueMem2 * 2; + } + + ASSERT_SUCCESS(urQueueFinish(Queue)); + + 
ASSERT_NO_FATAL_FAILURE(verifyResults(Buffer1, CurValueMem1)); + ASSERT_NO_FATAL_FAILURE(verifyResults(Buffer2, CurValueMem2)); +} diff --git a/test/conformance/integration/QueueEmptyStatus.cpp b/test/conformance/integration/QueueEmptyStatus.cpp new file mode 100644 index 0000000000..b8f1517b70 --- /dev/null +++ b/test/conformance/integration/QueueEmptyStatus.cpp @@ -0,0 +1,107 @@ +// Copyright (C) 2024 Intel Corporation +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. +// See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "fixtures.h" +#include +#include + +struct QueueEmptyStatusTestWithParam : uur::IntegrationQueueTestWithParam { + + void SetUp() override { + + program_name = "multiply"; + UUR_RETURN_ON_FATAL_FAILURE( + uur::IntegrationQueueTestWithParam::SetUp()); + + ur_device_usm_access_capability_flags_t shared_usm_flags = 0; + ASSERT_SUCCESS( + uur::GetDeviceUSMSingleSharedSupport(device, shared_usm_flags)); + if (!(shared_usm_flags & UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS)) { + GTEST_SKIP() << "Shared USM is not supported."; + } + + ASSERT_SUCCESS(urUSMSharedAlloc(context, device, nullptr, nullptr, + ArraySize * sizeof(uint32_t), + &SharedMem)); + } + + void TearDown() override { + ASSERT_SUCCESS(urUSMFree(context, SharedMem)); + uur::IntegrationQueueTestWithParam::TearDown(); + } + + void submitWorkToQueue() { + ur_event_handle_t Event; + ASSERT_SUCCESS( + urEnqueueUSMFill(Queue, SharedMem, sizeof(uint32_t), &InitialValue, + ArraySize * sizeof(uint32_t), 0, nullptr, &Event)); + ASSERT_NO_FATAL_FAILURE(submitBarrierIfNeeded(Event)); + + ASSERT_SUCCESS(urKernelSetArgPointer(kernel, 0, nullptr, &SharedMem)); + + constexpr size_t global_offset = 0; + constexpr size_t n_dimensions = 1; + constexpr uint32_t num_iterations = 5; + for (uint32_t i = 0; i < num_iterations; ++i) { + ASSERT_SUCCESS(urEnqueueKernelLaunch(Queue, kernel, n_dimensions, + &global_offset, &ArraySize, + 
nullptr, 0, nullptr, &Event)); + ASSERT_NO_FATAL_FAILURE(submitBarrierIfNeeded(Event)); + } + + ASSERT_SUCCESS(urQueueFlush(Queue)); + } + + void waitUntilQueueEmpty() const { + + using namespace std::chrono_literals; + + constexpr auto step = 500ms; + constexpr auto maxWait = 5000ms; + + /* Wait a bit until work finishes running. We don't synchronize with + * urQueueFinish() because we want to check if the status is set without + * calling it explicitly. */ + for (auto currentWait = 0ms; currentWait < maxWait; + currentWait += step) { + std::this_thread::sleep_for(step); + + ur_bool_t is_queue_empty; + ASSERT_SUCCESS(urQueueGetInfo(Queue, UR_QUEUE_INFO_EMPTY, + sizeof(ur_bool_t), &is_queue_empty, + nullptr)); + if (is_queue_empty) { + return; + } + } + + /* If we are here, the test failed. Let's call queue finish to avoid + * issues when freeing memory */ + ASSERT_SUCCESS(urQueueFinish(Queue)); + GTEST_FAIL(); + } + + void *SharedMem = nullptr; +}; + +UUR_TEST_SUITE_P(QueueEmptyStatusTestWithParam, + testing::Values(0, /* In-Order */ + UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE), + uur::IntegrationQueueTestWithParam::paramPrinter); + +/* Submits kernels that have a dependency on each other and checks that the + * queue submits all the work in the correct order to the device. 
+ * Explicit synchronization (except for barriers) is avoided in these tests to + * check that the properties of In-Order and OutOfOrder queues are working as + * expected */ +TEST_P(QueueEmptyStatusTestWithParam, QueueEmptyStatusTest) { + ASSERT_NO_FATAL_FAILURE(submitWorkToQueue()); + ASSERT_NO_FATAL_FAILURE(waitUntilQueueEmpty()); + + constexpr size_t expected_value = 3200; + for (uint32_t i = 0; i < ArraySize; ++i) { + ASSERT_EQ(reinterpret_cast(SharedMem)[i], expected_value); + } +} diff --git a/test/conformance/integration/QueueUSM.cpp b/test/conformance/integration/QueueUSM.cpp new file mode 100644 index 0000000000..cc8201453a --- /dev/null +++ b/test/conformance/integration/QueueUSM.cpp @@ -0,0 +1,117 @@ +// Copyright (C) 2024 Intel Corporation +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. +// See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "fixtures.h" +#include +#include + +struct QueueUSMTestWithParam : uur::IntegrationQueueTestWithParam { + void SetUp() override { + program_name = "cpy_and_mult_usm"; + UUR_RETURN_ON_FATAL_FAILURE( + uur::IntegrationQueueTestWithParam::SetUp()); + + ur_device_usm_access_capability_flags_t shared_usm_flags = 0; + ASSERT_SUCCESS( + uur::GetDeviceUSMSingleSharedSupport(device, shared_usm_flags)); + if (!(shared_usm_flags & UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS)) { + GTEST_SKIP() << "Shared USM is not supported."; + } + + ASSERT_SUCCESS(urUSMDeviceAlloc(context, device, nullptr, nullptr, + ArraySize * sizeof(uint32_t), + &DeviceMem1)); + + ASSERT_SUCCESS(urUSMDeviceAlloc(context, device, nullptr, nullptr, + ArraySize * sizeof(uint32_t), + &DeviceMem2)); + } + + void TearDown() override { + ASSERT_SUCCESS(urUSMFree(context, DeviceMem1)); + ASSERT_SUCCESS(urUSMFree(context, DeviceMem2)); + uur::IntegrationQueueTestWithParam::TearDown(); + } + + void verifyResults(void *DeviceMem, uint32_t ExpectedValue) { + uint32_t HostMem[ArraySize] 
= {}; + ASSERT_SUCCESS(urEnqueueUSMMemcpy(Queue, true, HostMem, DeviceMem, + sizeof(uint32_t) * ArraySize, 0, + nullptr, nullptr)); + + for (uint32_t i : HostMem) { + ASSERT_EQ(i, ExpectedValue); + } + } + + void *DeviceMem1 = nullptr; + void *DeviceMem2 = nullptr; +}; + +UUR_TEST_SUITE_P(QueueUSMTestWithParam, + testing::Values(0, /* In-Order */ + UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE), + uur::IntegrationQueueTestWithParam::paramPrinter); + +/* Submits multiple kernels that interact with each other by accessing and + * writing to the same USM memory locations. + * Checks that when using an IN_ORDER queue, no synchronization is needed + * between calls to urEnqueueKernelLaunch. + * Checks that when using an OUT_OF_ORDER queue, synchronizing using only + * event barriers is enough. */ +TEST_P(QueueUSMTestWithParam, QueueUSMTest) { + + std::vector EventsFill; + ur_event_handle_t Event; + ASSERT_SUCCESS(urEnqueueUSMFill(Queue, DeviceMem1, sizeof(uint32_t), + &InitialValue, ArraySize * sizeof(uint32_t), + 0, nullptr, &Event)); + EventsFill.push_back(Event); + + ASSERT_SUCCESS(urEnqueueUSMFill(Queue, DeviceMem2, sizeof(uint32_t), + &InitialValue, ArraySize * sizeof(uint32_t), + 0, nullptr, &Event)); + EventsFill.push_back(Event); + + ASSERT_NO_FATAL_FAILURE(submitBarrierIfNeeded(EventsFill)); + + constexpr size_t GlobalOffset = 0; + constexpr size_t NDimensions = 1; + constexpr uint32_t NumIterations = 5; + + uint32_t CurValueMem1 = InitialValue; + uint32_t CurValueMem2 = InitialValue; + + std::vector EventsKernel; + + for (uint32_t i = 0; i < NumIterations; ++i) { + /* Copy from DeviceMem2 to DeviceMem1 and multiply by 2 */ + ASSERT_SUCCESS(urKernelSetArgPointer(kernel, 0, nullptr, &DeviceMem1)); + ASSERT_SUCCESS(urKernelSetArgPointer(kernel, 1, nullptr, &DeviceMem2)); + + ASSERT_SUCCESS(urEnqueueKernelLaunch(Queue, kernel, NDimensions, + &GlobalOffset, &ArraySize, nullptr, + 0, nullptr, &Event)); + ASSERT_NO_FATAL_FAILURE(submitBarrierIfNeeded(Event)); + + 
CurValueMem2 = CurValueMem1 * 2; + + /* Copy from DeviceMem1 to DeviceMem2 and multiply by 2 */ + ASSERT_SUCCESS(urKernelSetArgPointer(kernel, 0, nullptr, &DeviceMem2)); + ASSERT_SUCCESS(urKernelSetArgPointer(kernel, 1, nullptr, &DeviceMem1)); + + ASSERT_SUCCESS(urEnqueueKernelLaunch(Queue, kernel, NDimensions, + &GlobalOffset, &ArraySize, nullptr, + 0, nullptr, &Event)); + ASSERT_NO_FATAL_FAILURE(submitBarrierIfNeeded(Event)); + + CurValueMem1 = CurValueMem2 * 2; + } + + ASSERT_SUCCESS(urQueueFinish(Queue)); + + ASSERT_NO_FATAL_FAILURE(verifyResults(DeviceMem1, CurValueMem1)); + ASSERT_NO_FATAL_FAILURE(verifyResults(DeviceMem2, CurValueMem2)); +} diff --git a/test/conformance/integration/fixtures.h b/test/conformance/integration/fixtures.h new file mode 100644 index 0000000000..aca70a5245 --- /dev/null +++ b/test/conformance/integration/fixtures.h @@ -0,0 +1,76 @@ +// Copyright (C) 2024 Intel Corporation +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#ifndef UR_CONFORMANCE_INTEGRATION_FIXTURES_H +#define UR_CONFORMANCE_INTEGRATION_FIXTURES_H + +#include + +namespace uur { + +struct IntegrationQueueTestWithParam + : uur::urKernelExecutionTestWithParam { + void SetUp() override { + UUR_RETURN_ON_FATAL_FAILURE( + uur::urKernelExecutionTestWithParam::SetUp()); + + QueueFlags = getParam(); + ur_queue_properties_t queue_properties = { + UR_STRUCTURE_TYPE_QUEUE_PROPERTIES, nullptr, QueueFlags}; + ASSERT_SUCCESS( + urQueueCreate(context, device, &queue_properties, &Queue)); + } + + void TearDown() override { + for (ur_event_handle_t Event : AllEvents) { + ASSERT_SUCCESS(urEventRelease(Event)); + } + + UUR_RETURN_ON_FATAL_FAILURE( + uur::urKernelExecutionTestWithParam::TearDown()); + } + + void submitBarrierIfNeeded(std::vector &(Events)) { + if (QueueFlags == UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE) { + ASSERT_SUCCESS(urEnqueueEventsWaitWithBarrier( + Queue, Events.size(), Events.data(), nullptr)); + AllEvents.insert(AllEvents.end(), Events.begin(), Events.end()); + } + } + + void submitBarrierIfNeeded(ur_event_handle_t Event) { + if (QueueFlags == UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE) { + ASSERT_SUCCESS( + urEnqueueEventsWaitWithBarrier(Queue, 1, &Event, nullptr)); + AllEvents.push_back(Event); + } + } + + std::vector AllEvents; + ur_queue_flags_t QueueFlags{}; + ur_queue_handle_t Queue{}; + static constexpr size_t ArraySize = 100; + static constexpr uint32_t InitialValue = 100; + + static std::string + paramPrinter(const ::testing::TestParamInfo< + std::tuple> &info) { + auto device = std::get<0>(info.param); + auto param = std::get<1>(info.param); + + std::stringstream ss; + if (param == 0) { + ss << "IN_ORDER_QUEUE"; + } + if (param == UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE) { + ss << "OUT_OF_ORDER_QUEUE"; + } + + return uur::GetPlatformAndDeviceName(device) + "__" + ss.str(); + } +}; +} // namespace uur + +#endif 
//UR_CONFORMANCE_INTEGRATION_FIXTURES_H diff --git a/test/conformance/integration/integration_adapter_cuda.match b/test/conformance/integration/integration_adapter_cuda.match new file mode 100644 index 0000000000..e69de29bb2 diff --git a/test/conformance/integration/integration_adapter_hip.match b/test/conformance/integration/integration_adapter_hip.match new file mode 100644 index 0000000000..7016ca68d6 --- /dev/null +++ b/test/conformance/integration/integration_adapter_hip.match @@ -0,0 +1,3 @@ +{{OPT}}QueueEmptyStatusTestWithParam.QueueEmptyStatusTest/AMD_HIP_BACKEND___{{.*}}___IN_ORDER_QUEUE +{{OPT}}QueueEmptyStatusTestWithParam.QueueEmptyStatusTest/AMD_HIP_BACKEND___{{.*}}___OUT_OF_ORDER_QUEUE +{{OPT}}{{Segmentation fault|Aborted}} diff --git a/test/conformance/integration/integration_adapter_level_zero.match b/test/conformance/integration/integration_adapter_level_zero.match new file mode 100644 index 0000000000..905fdea60f --- /dev/null +++ b/test/conformance/integration/integration_adapter_level_zero.match @@ -0,0 +1,6 @@ +{{OPT}}QueueEmptyStatusTestWithParam.QueueEmptyStatusTest/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___IN_ORDER_QUEUE +{{OPT}}QueueEmptyStatusTestWithParam.QueueEmptyStatusTest/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___OUT_OF_ORDER_QUEUE +{{OPT}}QueueUSMTestWithParam.QueueUSMTest/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___IN_ORDER_QUEUE +{{OPT}}QueueUSMTestWithParam.QueueUSMTest/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___OUT_OF_ORDER_QUEUE +{{OPT}}QueueBufferTestWithParam.QueueBufferTest/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___IN_ORDER_QUEUE +{{OPT}}QueueBufferTestWithParam.QueueBufferTest/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___OUT_OF_ORDER_QUEUE diff --git a/test/conformance/integration/integration_adapter_native_cpu.match b/test/conformance/integration/integration_adapter_native_cpu.match new file mode 100644 index 
0000000000..07afde2cef --- /dev/null +++ b/test/conformance/integration/integration_adapter_native_cpu.match @@ -0,0 +1,7 @@ +QueueEmptyStatusTestWithParam.QueueEmptyStatusTest/SYCL_NATIVE_CPU___SYCL_Native_CPU___IN_ORDER_QUEUE +QueueEmptyStatusTestWithParam.QueueEmptyStatusTest/SYCL_NATIVE_CPU___SYCL_Native_CPU___OUT_OF_ORDER_QUEUE +QueueUSMTestWithParam.QueueUSMTest/SYCL_NATIVE_CPU___SYCL_Native_CPU___IN_ORDER_QUEUE +QueueUSMTestWithParam.QueueUSMTest/SYCL_NATIVE_CPU___SYCL_Native_CPU___OUT_OF_ORDER_QUEUE +QueueBufferTestWithParam.QueueBufferTest/SYCL_NATIVE_CPU___SYCL_Native_CPU___IN_ORDER_QUEUE +QueueBufferTestWithParam.QueueBufferTest/SYCL_NATIVE_CPU___SYCL_Native_CPU___OUT_OF_ORDER_QUEUE +{{OPT}}{{Segmentation fault|Aborted}} diff --git a/test/conformance/integration/integration_adapter_opencl.match b/test/conformance/integration/integration_adapter_opencl.match new file mode 100644 index 0000000000..57a5299327 --- /dev/null +++ b/test/conformance/integration/integration_adapter_opencl.match @@ -0,0 +1,6 @@ +QueueEmptyStatusTestWithParam.QueueEmptyStatusTest/Intel_R__OpenCL___{{.*}}___IN_ORDER_QUEUE +QueueEmptyStatusTestWithParam.QueueEmptyStatusTest/Intel_R__OpenCL___{{.*}}___OUT_OF_ORDER_QUEUE +QueueUSMTestWithParam.QueueUSMTest/Intel_R__OpenCL___{{.*}}___IN_ORDER_QUEUE +QueueUSMTestWithParam.QueueUSMTest/Intel_R__OpenCL___{{.*}}___OUT_OF_ORDER_QUEUE +QueueBufferTestWithParam.QueueBufferTest/Intel_R__OpenCL___{{.*}}___IN_ORDER_QUEUE +QueueBufferTestWithParam.QueueBufferTest/Intel_R__OpenCL___{{.*}}___OUT_OF_ORDER_QUEUE diff --git a/test/conformance/program/program_adapter_native_cpu.match b/test/conformance/program/program_adapter_native_cpu.match index c509c67d3e..9a5b0a9830 100644 --- a/test/conformance/program/program_adapter_native_cpu.match +++ b/test/conformance/program/program_adapter_native_cpu.match @@ -138,3 +138,4 @@ {{OPT}}urProgramSetSpecializationConstantsTest.InvalidSizeCount/SYCL_NATIVE_CPU___SYCL_Native_CPU_ 
{{OPT}}urProgramSetMultipleSpecializationConstantsTest.MultipleCalls/SYCL_NATIVE_CPU___SYCL_Native_CPU_ {{OPT}}urProgramSetMultipleSpecializationConstantsTest.SingleCall/SYCL_NATIVE_CPU___SYCL_Native_CPU_ +{{OPT}}{{Segmentation fault|Aborted}} diff --git a/test/conformance/program/urProgramBuild.cpp b/test/conformance/program/urProgramBuild.cpp index f99b94321e..a7e7e4a275 100644 --- a/test/conformance/program/urProgramBuild.cpp +++ b/test/conformance/program/urProgramBuild.cpp @@ -30,8 +30,7 @@ TEST_P(urProgramBuildTest, InvalidNullHandleProgram) { TEST_P(urProgramBuildTest, BuildFailure) { ur_program_handle_t program = nullptr; std::shared_ptr> il_binary; - uur::KernelsEnvironment::instance->LoadSource("build_failure", 0, - il_binary); + uur::KernelsEnvironment::instance->LoadSource("build_failure", il_binary); if (!il_binary) { // The build failure we are testing for happens at SYCL compile time on // AMD and Nvidia, so no binary exists to check for a build failure diff --git a/test/conformance/program/urProgramCreateWithIL.cpp b/test/conformance/program/urProgramCreateWithIL.cpp index 800a43cd5d..3d81d14104 100644 --- a/test/conformance/program/urProgramCreateWithIL.cpp +++ b/test/conformance/program/urProgramCreateWithIL.cpp @@ -17,7 +17,7 @@ struct urProgramCreateWithILTest : uur::urContextTest { if (backend == UR_PLATFORM_BACKEND_HIP) { GTEST_SKIP(); } - uur::KernelsEnvironment::instance->LoadSource("foo", 0, il_binary); + uur::KernelsEnvironment::instance->LoadSource("foo", il_binary); } void TearDown() override { diff --git a/test/conformance/queue/queue_adapter_native_cpu.match b/test/conformance/queue/queue_adapter_native_cpu.match index c2887b1063..a4c2c502a4 100644 --- a/test/conformance/queue/queue_adapter_native_cpu.match +++ b/test/conformance/queue/queue_adapter_native_cpu.match @@ -1,3 +1,5 @@ +urQueueCreateTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU_ +urQueueCreateTest.CheckContext/SYCL_NATIVE_CPU___SYCL_Native_CPU_ 
urQueueCreateWithParamTest.SuccessWithProperties/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE urQueueCreateWithParamTest.SuccessWithProperties/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_QUEUE_FLAG_PROFILING_ENABLE urQueueCreateWithParamTest.SuccessWithProperties/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_QUEUE_FLAG_ON_DEVICE @@ -9,6 +11,17 @@ urQueueCreateWithParamTest.SuccessWithProperties/SYCL_NATIVE_CPU___SYCL_Native_C urQueueCreateWithParamTest.SuccessWithProperties/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_QUEUE_FLAG_SUBMISSION_IMMEDIATE urQueueCreateWithParamTest.SuccessWithProperties/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_QUEUE_FLAG_USE_DEFAULT_STREAM urQueueCreateWithParamTest.SuccessWithProperties/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_QUEUE_FLAG_SYNC_WITH_DEFAULT_STREAM +urQueueCreateWithParamTest.MatchingDeviceHandles/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE +urQueueCreateWithParamTest.MatchingDeviceHandles/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_QUEUE_FLAG_PROFILING_ENABLE +urQueueCreateWithParamTest.MatchingDeviceHandles/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_QUEUE_FLAG_ON_DEVICE +urQueueCreateWithParamTest.MatchingDeviceHandles/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_QUEUE_FLAG_ON_DEVICE_DEFAULT +urQueueCreateWithParamTest.MatchingDeviceHandles/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_QUEUE_FLAG_DISCARD_EVENTS +urQueueCreateWithParamTest.MatchingDeviceHandles/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_QUEUE_FLAG_PRIORITY_LOW +urQueueCreateWithParamTest.MatchingDeviceHandles/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_QUEUE_FLAG_PRIORITY_HIGH +urQueueCreateWithParamTest.MatchingDeviceHandles/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_QUEUE_FLAG_SUBMISSION_BATCHED +urQueueCreateWithParamTest.MatchingDeviceHandles/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_QUEUE_FLAG_SUBMISSION_IMMEDIATE +urQueueCreateWithParamTest.MatchingDeviceHandles/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_QUEUE_FLAG_USE_DEFAULT_STREAM 
+urQueueCreateWithParamTest.MatchingDeviceHandles/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_QUEUE_FLAG_SYNC_WITH_DEFAULT_STREAM urQueueFinishTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU_ urQueueFlushTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU_ urQueueGetInfoTestWithInfoParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_QUEUE_INFO_CONTEXT diff --git a/test/conformance/queue/urQueueCreate.cpp b/test/conformance/queue/urQueueCreate.cpp index 03cda76d50..168285d3f8 100644 --- a/test/conformance/queue/urQueueCreate.cpp +++ b/test/conformance/queue/urQueueCreate.cpp @@ -2,16 +2,24 @@ // Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. // See LICENSE.TXT // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +#include "uur/raii.h" #include using urQueueCreateTest = uur::urContextTest; UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(urQueueCreateTest); TEST_P(urQueueCreateTest, Success) { - ur_queue_handle_t queue = nullptr; - ASSERT_SUCCESS(urQueueCreate(context, device, nullptr, &queue)); + uur::raii::Queue queue = nullptr; + ASSERT_SUCCESS(urQueueCreate(context, device, nullptr, queue.ptr())); ASSERT_NE(nullptr, queue); - ASSERT_SUCCESS(urQueueRelease(queue)); + + ur_queue_info_t queue_flags; + ASSERT_SUCCESS(urQueueGetInfo(queue, UR_QUEUE_INFO_FLAGS, + sizeof(ur_queue_info_t), &queue_flags, + nullptr)); + + /* Check that the queue was created without any flag */ + ASSERT_EQ(queue_flags, 0); } using urQueueCreateWithParamTest = uur::urContextTestWithParam; @@ -54,9 +62,87 @@ TEST_P(urQueueCreateWithParamTest, SuccessWithProperties) { nullptr)); ASSERT_TRUE(queueFlags & queryFlag); + // Check that no other bit is set (i.e. is power of 2) + ASSERT_TRUE(queueFlags != 0 && (queueFlags & (queueFlags - 1)) == 0); + ASSERT_SUCCESS(urQueueRelease(queue)); } +/* Creates two queues with the same platform and device, and checks that the + * queried device and platform of both queues match. 
*/ +TEST_P(urQueueCreateWithParamTest, MatchingDeviceHandles) { + ur_queue_flags_t supportedFlags{}; + ASSERT_SUCCESS(uur::GetDeviceQueueOnHostProperties(device, supportedFlags)); + + ur_queue_flags_t queryFlag = getParam(); + if (!(supportedFlags & queryFlag)) { + GTEST_SKIP() << queryFlag << " : is not supported by the device."; + } + + ur_queue_properties_t props = { + /*.stype =*/UR_STRUCTURE_TYPE_QUEUE_PROPERTIES, + /*.pNext =*/nullptr, + /*.flags =*/queryFlag, + }; + + uur::raii::Queue queue1 = nullptr; + ASSERT_SUCCESS(urQueueCreate(context, device, &props, queue1.ptr())); + ASSERT_NE(queue1, nullptr); + + uur::raii::Queue queue2 = nullptr; + ASSERT_SUCCESS(urQueueCreate(context, device, &props, queue2.ptr())); + ASSERT_NE(queue2, nullptr); + + ur_device_handle_t deviceQueue1; + ASSERT_SUCCESS(urQueueGetInfo(queue1, UR_QUEUE_INFO_DEVICE, + sizeof(ur_device_handle_t), &deviceQueue1, + nullptr)); + + ur_device_handle_t deviceQueue2; + ASSERT_SUCCESS(urQueueGetInfo(queue1, UR_QUEUE_INFO_DEVICE, + sizeof(ur_device_handle_t), &deviceQueue2, + nullptr)); + + ASSERT_EQ(deviceQueue1, deviceQueue2); +} + +/* Create a queue and check that it returns the right context*/ +TEST_P(urQueueCreateTest, CheckContext) { + + uur::raii::Queue queue = nullptr; + ASSERT_SUCCESS(urQueueCreate(context, device, nullptr, queue.ptr())); + ASSERT_NE(queue.ptr(), nullptr); + + ur_context_handle_t returned_context = nullptr; + ASSERT_SUCCESS(urQueueGetInfo(queue, UR_QUEUE_INFO_CONTEXT, + sizeof(ur_context_handle_t), + &returned_context, nullptr)); + + ASSERT_EQ(this->context, returned_context); +} + +using urQueueCreateTestMultipleDevices = uur::urAllDevicesTest; + +/* Create a queue using a context from a different device */ +TEST_F(urQueueCreateTestMultipleDevices, ContextFromWrongDevice) { + + if (devices.size() < 2) { + GTEST_SKIP() << "Test requires at least 2 devices in the system"; + } + ur_device_handle_t device1 = devices[0]; + uur::raii::Context context1 = nullptr; + 
urContextCreate(1, &device1, nullptr, context1.ptr()); + + ur_device_handle_t device2 = devices[1]; + uur::raii::Context context2 = nullptr; + urContextCreate(1, &device2, nullptr, context2.ptr()); + + ur_queue_handle_t queue = nullptr; + ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_DEVICE, + urQueueCreate(context2, device1, nullptr, &queue)); + ASSERT_NE(queue, nullptr); +} + TEST_P(urQueueCreateTest, InvalidNullHandleContext) { ur_queue_handle_t queue = nullptr; ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_NULL_HANDLE, diff --git a/test/conformance/source/environment.cpp b/test/conformance/source/environment.cpp index a58b3ecdbd..a5f83c0d80 100644 --- a/test/conformance/source/environment.cpp +++ b/test/conformance/source/environment.cpp @@ -367,7 +367,7 @@ KernelsEnvironment::parseKernelOptions(int argc, char **argv, return options; } -std::string KernelsEnvironment::getSupportedILPostfix(uint32_t device_index) { +std::string KernelsEnvironment::getTargetName() { std::stringstream IL; if (instance->GetDevices().size() == 0) { @@ -382,66 +382,44 @@ std::string KernelsEnvironment::getSupportedILPostfix(uint32_t device_index) { error = "failed to get backend from platform."; return {}; } - if (backend == UR_PLATFORM_BACKEND_HIP) { - return ".bin"; - } - auto device = instance->GetDevices()[device_index]; - std::string IL_version; - if (uur::GetDeviceILVersion(device, IL_version)) { - error = "failed to get device IL version"; + std::string target = ""; + switch (backend) { + case UR_PLATFORM_BACKEND_OPENCL: + case UR_PLATFORM_BACKEND_LEVEL_ZERO: + return "spir64"; + case UR_PLATFORM_BACKEND_CUDA: + return "nvptx64-nvidia-cuda"; + case UR_PLATFORM_BACKEND_HIP: + return "amdgcn-amd-amdhsa"; + case UR_PLATFORM_BACKEND_NATIVE_CPU: + error = "native_cpu doesn't support kernel tests yet"; return {}; - } - - // TODO: This potentially needs updating as more adapters are tested. 
- if (IL_version.find("SPIR-V") != std::string::npos) { - IL << ".spv"; - } else if (IL_version.find("nvptx") != std::string::npos) { - IL << ".bin"; - } else { - error = "Undefined IL version: " + IL_version; + default: + error = "unknown target."; return {}; } - - return IL.str(); } std::string -KernelsEnvironment::getKernelSourcePath(const std::string &kernel_name, - uint32_t device_index) { +KernelsEnvironment::getKernelSourcePath(const std::string &kernel_name) { std::stringstream path; path << kernel_options.kernel_directory << "/" << kernel_name; - std::string il_postfix = getSupportedILPostfix(device_index); - - if (il_postfix.empty()) { - return {}; - } - - std::string binary_name; - for (const auto &entry : filesystem::directory_iterator(path.str())) { - auto file_name = entry.path().filename().string(); - if (file_name.find(il_postfix) != std::string::npos) { - binary_name = file_name; - break; - } - } - if (binary_name.empty()) { - error = - "failed retrieving kernel source path for kernel: " + kernel_name; + std::string target_name = getTargetName(); + if (target_name.empty()) { return {}; } - path << "/" << binary_name; + path << "/" << target_name << ".bin.0"; return path.str(); } void KernelsEnvironment::LoadSource( - const std::string &kernel_name, uint32_t device_index, + const std::string &kernel_name, std::shared_ptr> &binary_out) { - std::string source_path = - instance->getKernelSourcePath(kernel_name, device_index); + std::string source_path = instance->getKernelSourcePath(kernel_name); if (source_path.empty()) { FAIL() << error; diff --git a/test/conformance/testing/include/uur/environment.h b/test/conformance/testing/include/uur/environment.h index bcb7e46661..acd255a3c2 100644 --- a/test/conformance/testing/include/uur/environment.h +++ b/test/conformance/testing/include/uur/environment.h @@ -72,7 +72,7 @@ struct KernelsEnvironment : DevicesEnvironment { virtual void SetUp() override; virtual void TearDown() override; - void 
LoadSource(const std::string &kernel_name, uint32_t device_index, + void LoadSource(const std::string &kernel_name, std::shared_ptr> &binary_out); ur_result_t CreateProgram(ur_platform_handle_t hPlatform, @@ -89,9 +89,8 @@ struct KernelsEnvironment : DevicesEnvironment { private: KernelOptions parseKernelOptions(int argc, char **argv, const std::string &kernels_default_dir); - std::string getKernelSourcePath(const std::string &kernel_name, - uint32_t device_index); - std::string getSupportedILPostfix(uint32_t device_index); + std::string getKernelSourcePath(const std::string &kernel_name); + std::string getTargetName(); KernelOptions kernel_options; // mapping between kernels (full_path + kernel_name) and their saved source. diff --git a/test/conformance/testing/include/uur/fixtures.h b/test/conformance/testing/include/uur/fixtures.h index 55f3f7c4f4..cf64aa13d3 100644 --- a/test/conformance/testing/include/uur/fixtures.h +++ b/test/conformance/testing/include/uur/fixtures.h @@ -20,6 +20,14 @@ } \ (void)0 +#define UUR_ASSERT_SUCCESS_OR_UNSUPPORTED(ret) \ + auto status = ret; \ + if (status == UR_RESULT_ERROR_UNSUPPORTED_FEATURE) { \ + GTEST_SKIP(); \ + } else { \ + ASSERT_EQ(status, UR_RESULT_SUCCESS); \ + } + namespace uur { struct urPlatformTest : ::testing::Test { @@ -321,7 +329,8 @@ template struct urMemImageTestWithParam : urContextTestWithParam { struct urQueueTest : urContextTest { void SetUp() override { UUR_RETURN_ON_FATAL_FAILURE(urContextTest::SetUp()); - ASSERT_SUCCESS(urQueueCreate(context, device, 0, &queue)); + ASSERT_SUCCESS( + urQueueCreate(context, device, &queue_properties, &queue)); ASSERT_NE(queue, nullptr); } @@ -332,6 +341,8 @@ struct urQueueTest : urContextTest { UUR_RETURN_ON_FATAL_FAILURE(urContextTest::TearDown()); } + ur_queue_properties_t queue_properties = { + UR_STRUCTURE_TYPE_QUEUE_PROPERTIES, nullptr, 0}; ur_queue_handle_t queue = nullptr; }; @@ -339,7 +350,7 @@ struct urHostPipeTest : urQueueTest { void SetUp() override { 
UUR_RETURN_ON_FATAL_FAILURE(urQueueTest::SetUp()); UUR_RETURN_ON_FATAL_FAILURE( - uur::KernelsEnvironment::instance->LoadSource("foo", 0, il_binary)); + uur::KernelsEnvironment::instance->LoadSource("foo", il_binary)); ASSERT_SUCCESS(uur::KernelsEnvironment::instance->CreateProgram( platform, context, device, *il_binary, nullptr, &program)); @@ -382,7 +393,9 @@ struct urHostPipeTest : urQueueTest { template struct urQueueTestWithParam : urContextTestWithParam { void SetUp() override { UUR_RETURN_ON_FATAL_FAILURE(urContextTestWithParam::SetUp()); - ASSERT_SUCCESS(urQueueCreate(this->context, this->device, 0, &queue)); + ASSERT_SUCCESS(urQueueCreate(this->context, this->device, + &queue_properties, &queue)); + ASSERT_NE(queue, nullptr); } void TearDown() override { @@ -391,7 +404,8 @@ template struct urQueueTestWithParam : urContextTestWithParam { } UUR_RETURN_ON_FATAL_FAILURE(urContextTestWithParam::TearDown()); } - + ur_queue_properties_t queue_properties = { + UR_STRUCTURE_TYPE_QUEUE_PROPERTIES, nullptr, 0}; ur_queue_handle_t queue; }; @@ -1121,7 +1135,7 @@ struct urProgramTest : urQueueTest { GTEST_SKIP(); } UUR_RETURN_ON_FATAL_FAILURE( - uur::KernelsEnvironment::instance->LoadSource(program_name, 0, + uur::KernelsEnvironment::instance->LoadSource(program_name, il_binary)); const ur_program_properties_t properties = { @@ -1145,9 +1159,9 @@ struct urProgramTest : urQueueTest { std::vector metadatas{}; }; -template struct urProgramTestWithParam : urContextTestWithParam { +template struct urProgramTestWithParam : urQueueTestWithParam { void SetUp() override { - UUR_RETURN_ON_FATAL_FAILURE(urContextTestWithParam::SetUp()); + UUR_RETURN_ON_FATAL_FAILURE(urQueueTestWithParam::SetUp()); ur_platform_backend_t backend; ASSERT_SUCCESS(urPlatformGetInfo(this->platform, @@ -1160,7 +1174,7 @@ template struct urProgramTestWithParam : urContextTestWithParam { } UUR_RETURN_ON_FATAL_FAILURE( - uur::KernelsEnvironment::instance->LoadSource(program_name, 0, + 
uur::KernelsEnvironment::instance->LoadSource(program_name, il_binary)); ASSERT_SUCCESS(uur::KernelsEnvironment::instance->CreateProgram( this->platform, this->context, this->device, *il_binary, nullptr, @@ -1171,7 +1185,7 @@ template struct urProgramTestWithParam : urContextTestWithParam { if (program) { EXPECT_SUCCESS(urProgramRelease(program)); } - UUR_RETURN_ON_FATAL_FAILURE(urContextTestWithParam::TearDown()); + UUR_RETURN_ON_FATAL_FAILURE(urQueueTestWithParam::TearDown()); } std::shared_ptr> il_binary; @@ -1246,20 +1260,16 @@ template struct urKernelTestWithParam : urBaseKernelTestWithParam { } }; -struct urBaseKernelExecutionTest : urBaseKernelTest { - void SetUp() override { - UUR_RETURN_ON_FATAL_FAILURE(urBaseKernelTest::SetUp()); - } +struct KernelLaunchHelper { - void TearDown() override { - for (auto &buffer : buffer_args) { - ASSERT_SUCCESS(urMemRelease(buffer)); - } - UUR_RETURN_ON_FATAL_FAILURE(urBaseKernelTest::TearDown()); - } + KernelLaunchHelper(ur_platform_handle_t &platform, + ur_context_handle_t &context, ur_kernel_handle_t &kernel, + ur_queue_handle_t &queue) + : platform{platform}, context{context}, kernel{kernel}, queue{queue} {} // Adds a kernel arg representing a sycl buffer constructed with a 1D range. - void AddBuffer1DArg(size_t size, ur_mem_handle_t *out_buffer) { + void AddBuffer1DArg(size_t size, ur_mem_handle_t *out_buffer, + size_t *buffer_index) { ur_mem_handle_t mem_handle = nullptr; ASSERT_SUCCESS(urMemBufferCreate(context, UR_MEM_FLAG_READ_WRITE, size, nullptr, &mem_handle)); @@ -1270,6 +1280,9 @@ struct urBaseKernelExecutionTest : urBaseKernelTest { ASSERT_SUCCESS(urQueueFinish(queue)); ASSERT_SUCCESS(urKernelSetArgMemObj(kernel, current_arg_index, nullptr, mem_handle)); + if (buffer_index) { + *buffer_index = current_arg_index; + } // SYCL device kernels have different interfaces depending on the // backend being used. 
Typically a kernel which takes a buffer argument @@ -1301,8 +1314,6 @@ struct urBaseKernelExecutionTest : urBaseKernelTest { &accessor)); current_arg_index += 2; } - - buffer_args.push_back(mem_handle); *out_buffer = mem_handle; } @@ -1340,7 +1351,11 @@ struct urBaseKernelExecutionTest : urBaseKernelTest { ValidateBuffer(buffer, size, validator); } - std::vector buffer_args; + ur_platform_handle_t &platform; + ur_context_handle_t &context; + ur_kernel_handle_t &kernel; + ur_queue_handle_t &queue; + uint32_t current_arg_index = 0; }; @@ -1349,10 +1364,6 @@ struct urBaseKernelExecutionTestWithParam : urBaseKernelTestWithParam { void SetUp() override { UUR_RETURN_ON_FATAL_FAILURE(urBaseKernelTestWithParam::SetUp()); UUR_RETURN_ON_FATAL_FAILURE(urBaseKernelTestWithParam::Build()); - context = urBaseKernelTestWithParam::context; - kernel = urBaseKernelTestWithParam::kernel; - ASSERT_SUCCESS(urQueueCreate( - context, urBaseKernelTestWithParam::device, 0, &queue)); } void TearDown() override { @@ -1360,91 +1371,85 @@ struct urBaseKernelExecutionTestWithParam : urBaseKernelTestWithParam { ASSERT_SUCCESS(urMemRelease(buffer)); } UUR_RETURN_ON_FATAL_FAILURE(urBaseKernelTestWithParam::TearDown()); - if (queue) { - EXPECT_SUCCESS(urQueueRelease(queue)); - } } - // Adds a kernel arg representing a sycl buffer constructed with a 1D range. 
- void AddBuffer1DArg(size_t size, ur_mem_handle_t *out_buffer) { - ur_mem_handle_t mem_handle = nullptr; - ASSERT_SUCCESS(urMemBufferCreate(context, UR_MEM_FLAG_READ_WRITE, size, - nullptr, &mem_handle)); - char zero = 0; - ASSERT_SUCCESS(urEnqueueMemBufferFill(queue, mem_handle, &zero, - sizeof(zero), 0, size, 0, nullptr, - nullptr)); - ASSERT_SUCCESS(urQueueFinish(queue)); - ASSERT_SUCCESS(urKernelSetArgMemObj(kernel, current_arg_index, nullptr, - mem_handle)); + void AddBuffer1DArg(size_t size, ur_mem_handle_t *out_buffer, + size_t *buffer_index = nullptr) { + helper.AddBuffer1DArg(size, out_buffer, buffer_index); + buffer_args.push_back(*out_buffer); + } - // SYCL device kernels have different interfaces depending on the - // backend being used. Typically a kernel which takes a buffer argument - // will take a pointer to the start of the buffer and a sycl::id param - // which is a struct that encodes the accessor to the buffer. However - // the AMD backend handles this differently and uses three separate - // arguments for each of the three dimensions of the accessor. + template void AddPodArg(K data) { helper.AddPodArg(data); } - ur_platform_backend_t backend; - ASSERT_SUCCESS(urPlatformGetInfo(urBaseKernelTestWithParam::platform, - UR_PLATFORM_INFO_BACKEND, - sizeof(backend), &backend, nullptr)); - if (backend == UR_PLATFORM_BACKEND_HIP) { - // this emulates the three offset params for buffer accessor on AMD. - size_t val = 0; - ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_arg_index + 1, - sizeof(size_t), nullptr, &val)); - ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_arg_index + 2, - sizeof(size_t), nullptr, &val)); - ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_arg_index + 3, - sizeof(size_t), nullptr, &val)); - current_arg_index += 4; - } else { - // This emulates the offset struct sycl adds for a 1D buffer accessor. 
- struct { - size_t offsets[1] = {0}; - } accessor; - ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_arg_index + 1, - sizeof(accessor), nullptr, - &accessor)); - current_arg_index += 2; + void Launch1DRange(size_t global_size, size_t local_size = 1) { + helper.Launch1DRange(global_size, local_size); + } + + template + void ValidateBuffer(ur_mem_handle_t buffer, size_t size, + std::function validator) { + helper.ValidateBuffer(buffer, size, validator); + } + + template + void ValidateBuffer(ur_mem_handle_t buffer, size_t size, K value) { + helper.ValidateBuffer(buffer, size, value); + } + + private: + KernelLaunchHelper helper = KernelLaunchHelper{ + this->platform, this->context, this->kernel, this->queue}; + std::vector buffer_args; +}; + +struct urBaseKernelExecutionTest : urBaseKernelTest { + void SetUp() override { + UUR_RETURN_ON_FATAL_FAILURE(urBaseKernelTest::SetUp()); + } + + void TearDown() override { + for (auto &buffer : buffer_args) { + ASSERT_SUCCESS(urMemRelease(buffer)); } + UUR_RETURN_ON_FATAL_FAILURE(urBaseKernelTest::TearDown()); + } - buffer_args.push_back(mem_handle); - *out_buffer = mem_handle; + void AddBuffer1DArg(size_t size, ur_mem_handle_t *out_buffer, + size_t *buffer_index = nullptr) { + helper.AddBuffer1DArg(size, out_buffer, buffer_index); + buffer_args.push_back(*out_buffer); } - template void AddPodArg(U data) { - ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_arg_index, - sizeof(data), nullptr, &data)); - current_arg_index++; + template void AddPodArg(T data) { helper.AddPodArg(data); } + + void Launch1DRange(size_t global_size, size_t local_size = 1) { + helper.Launch1DRange(global_size, local_size); } - // Validate the contents of `buffer` according to the given validator. 
- template + template void ValidateBuffer(ur_mem_handle_t buffer, size_t size, - std::function validator) { - std::vector read_buffer(size / sizeof(U)); - ASSERT_SUCCESS(urEnqueueMemBufferRead(queue, buffer, true, 0, size, - read_buffer.data(), 0, nullptr, - nullptr)); - ASSERT_TRUE( - std::all_of(read_buffer.begin(), read_buffer.end(), validator)); + std::function validator) { + helper.ValidateBuffer(buffer, size, validator); } - // Helper that uses the generic validate function to check for a given value. - template - void ValidateBuffer(ur_mem_handle_t buffer, size_t size, U value) { - auto validator = [&value](U result) -> bool { return result == value; }; - - ValidateBuffer(buffer, size, validator); + template + void ValidateBuffer(ur_mem_handle_t buffer, size_t size, T value) { + helper.ValidateBuffer(buffer, size, value); } + private: + KernelLaunchHelper helper = + KernelLaunchHelper{platform, context, kernel, queue}; std::vector buffer_args; - uint32_t current_arg_index = 0; - ur_context_handle_t context; - ur_kernel_handle_t kernel; - ur_queue_handle_t queue; +}; + +template +struct urKernelExecutionTestWithParam : urBaseKernelExecutionTestWithParam { + void SetUp() { + UUR_RETURN_ON_FATAL_FAILURE( + urBaseKernelExecutionTestWithParam::SetUp()); + this->Build(); + } }; struct urKernelExecutionTest : urBaseKernelExecutionTest { diff --git a/test/conformance/testing/include/uur/utils.h b/test/conformance/testing/include/uur/utils.h index d2341cc31b..34e5f7768b 100644 --- a/test/conformance/testing/include/uur/utils.h +++ b/test/conformance/testing/include/uur/utils.h @@ -395,6 +395,8 @@ ur_result_t GetDeviceMaxComputeQueueIndices(ur_device_handle_t device, uint32_t &max_indices); ur_result_t GetDeviceHostPipeRWSupported(ur_device_handle_t device, bool &support); +ur_result_t GetTimestampRecordingSupport(ur_device_handle_t device, + bool &support); ur_device_partition_property_t makePartitionByCountsDesc(uint32_t count); diff --git 
a/test/conformance/testing/source/utils.cpp b/test/conformance/testing/source/utils.cpp index 6cb8d3a90c..ab78f64bbc 100644 --- a/test/conformance/testing/source/utils.cpp +++ b/test/conformance/testing/source/utils.cpp @@ -635,6 +635,12 @@ ur_result_t GetDeviceHostPipeRWSupported(ur_device_handle_t device, device, UR_DEVICE_INFO_HOST_PIPE_READ_WRITE_SUPPORTED, support); } +ur_result_t GetTimestampRecordingSupport(ur_device_handle_t device, + bool &support) { + return GetDeviceInfo( + device, UR_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP, support); +} + ur_device_partition_property_t makePartitionByCountsDesc(uint32_t count) { ur_device_partition_property_t desc; desc.type = UR_DEVICE_PARTITION_BY_COUNTS; diff --git a/test/conformance/virtual_memory/virtual_memory_adapter_level_zero.match b/test/conformance/virtual_memory/virtual_memory_adapter_level_zero.match index dbb7cdebd5..9cda954748 100644 --- a/test/conformance/virtual_memory/virtual_memory_adapter_level_zero.match +++ b/test/conformance/virtual_memory/virtual_memory_adapter_level_zero.match @@ -1,88 +1,10 @@ -urPhysicalMemCreateTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1 -urPhysicalMemCreateTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2 -urPhysicalMemCreateTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___3 -urPhysicalMemCreateTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___7 -urPhysicalMemCreateTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___12 +{{OPT}}urPhysicalMemCreateTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___3 +{{OPT}}urPhysicalMemCreateTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___7 +{{OPT}}urPhysicalMemCreateTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___12 urPhysicalMemCreateTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___44 
-urPhysicalMemCreateTest.InvalidNullHandleContext/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1 -urPhysicalMemCreateTest.InvalidNullHandleContext/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2 -urPhysicalMemCreateTest.InvalidNullHandleContext/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___3 -urPhysicalMemCreateTest.InvalidNullHandleContext/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___7 -urPhysicalMemCreateTest.InvalidNullHandleContext/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___12 -urPhysicalMemCreateTest.InvalidNullHandleContext/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___44 -urPhysicalMemCreateTest.InvalidNullHandleDevice/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1 -urPhysicalMemCreateTest.InvalidNullHandleDevice/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2 -urPhysicalMemCreateTest.InvalidNullHandleDevice/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___3 -urPhysicalMemCreateTest.InvalidNullHandleDevice/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___7 -urPhysicalMemCreateTest.InvalidNullHandleDevice/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___12 -urPhysicalMemCreateTest.InvalidNullHandleDevice/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___44 -urPhysicalMemCreateTest.InvalidNullPointerPhysicalMem/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1 -urPhysicalMemCreateTest.InvalidNullPointerPhysicalMem/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2 -urPhysicalMemCreateTest.InvalidNullPointerPhysicalMem/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___3 -urPhysicalMemCreateTest.InvalidNullPointerPhysicalMem/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___7 -urPhysicalMemCreateTest.InvalidNullPointerPhysicalMem/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___12 
-urPhysicalMemCreateTest.InvalidNullPointerPhysicalMem/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___44 urPhysicalMemCreateTest.InvalidSize/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1 urPhysicalMemCreateTest.InvalidSize/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2 urPhysicalMemCreateTest.InvalidSize/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___3 urPhysicalMemCreateTest.InvalidSize/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___7 urPhysicalMemCreateTest.InvalidSize/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___12 urPhysicalMemCreateTest.InvalidSize/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___44 -urPhysicalMemReleaseTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urPhysicalMemReleaseTest.InvalidNullHandlePhysicalMem/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urPhysicalMemRetainTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urPhysicalMemRetainTest.InvalidNullHandlePhysicalMem/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urVirtualMemFreeTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urVirtualMemFreeTest.InvalidNullHandleContext/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urVirtualMemFreeTest.InvalidNullPointerStart/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urVirtualMemGetInfoTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_VIRTUAL_MEM_INFO_ACCESS_MODE -urVirtualMemGetInfoTest.InvalidNullHandleContext/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urVirtualMemGetInfoTest.InvalidNullPointerStart/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urVirtualMemGetInfoTest.InvalidEnumerationInfo/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ 
-urVirtualMemGranularityGetInfoTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_VIRTUAL_MEM_GRANULARITY_INFO_MINIMUM -urVirtualMemGranularityGetInfoTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_VIRTUAL_MEM_GRANULARITY_INFO_RECOMMENDED -urVirtualMemGranularityGetInfoNegativeTest.InvalidNullHandleContext/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urVirtualMemGranularityGetInfoNegativeTest.InvalidEnumeration/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urVirtualMemGranularityGetInfoNegativeTest.InvalidNullPointerPropSizeRet/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urVirtualMemGranularityGetInfoNegativeTest.InvalidNullPointerPropValue/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urVirtualMemGranularityGetInfoNegativeTest.InvalidPropSizeZero/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urVirtualMemGranularityGetInfoNegativeTest.InvalidSizePropSizeSmall/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urVirtualMemMapTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urVirtualMemMapTest.InvalidNullHandleContext/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urVirtualMemMapTest.InvalidNullHandlePhysicalMem/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urVirtualMemMapTest.InvalidNullPointerStart/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urVirtualMemMapTest.InvalidEnumerationFlags/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urVirtualMemReserveTestWithParam.SuccessNoStartPointer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2 -urVirtualMemReserveTestWithParam.SuccessNoStartPointer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___4 -urVirtualMemReserveTestWithParam.SuccessNoStartPointer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___8 
-urVirtualMemReserveTestWithParam.SuccessNoStartPointer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___16 -urVirtualMemReserveTestWithParam.SuccessNoStartPointer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___32 -urVirtualMemReserveTestWithParam.SuccessNoStartPointer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___64 -urVirtualMemReserveTestWithParam.SuccessNoStartPointer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___128 -urVirtualMemReserveTestWithParam.SuccessNoStartPointer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___256 -urVirtualMemReserveTestWithParam.SuccessNoStartPointer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___512 -urVirtualMemReserveTestWithParam.SuccessNoStartPointer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1024 -urVirtualMemReserveTestWithParam.SuccessNoStartPointer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2048 -urVirtualMemReserveTestWithParam.SuccessNoStartPointer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___5000 -urVirtualMemReserveTestWithParam.SuccessNoStartPointer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___100000 -urVirtualMemReserveTestWithParam.SuccessWithStartPointer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2 -urVirtualMemReserveTestWithParam.SuccessWithStartPointer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___4 -urVirtualMemReserveTestWithParam.SuccessWithStartPointer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___8 -urVirtualMemReserveTestWithParam.SuccessWithStartPointer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___16 -urVirtualMemReserveTestWithParam.SuccessWithStartPointer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___32 -urVirtualMemReserveTestWithParam.SuccessWithStartPointer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___64 
-urVirtualMemReserveTestWithParam.SuccessWithStartPointer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___128 -urVirtualMemReserveTestWithParam.SuccessWithStartPointer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___256 -urVirtualMemReserveTestWithParam.SuccessWithStartPointer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___512 -urVirtualMemReserveTestWithParam.SuccessWithStartPointer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1024 -urVirtualMemReserveTestWithParam.SuccessWithStartPointer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2048 -urVirtualMemReserveTestWithParam.SuccessWithStartPointer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___5000 -urVirtualMemReserveTestWithParam.SuccessWithStartPointer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___100000 -urVirtualMemReserveTest.InvalidNullHandleContext/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urVirtualMemReserveTest.InvalidNullPointer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urVirtualMemSetAccessTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urVirtualMemSetAccessTest.InvalidNullHandleContext/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urVirtualMemSetAccessTest.InvalidNullPointerStart/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urVirtualMemUnmapTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urVirtualMemUnmapTest.InvalidNullHandleContext/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urVirtualMemUnmapTest.InvalidNullPointerStart/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ diff --git a/tools/urinfo/urinfo.hpp b/tools/urinfo/urinfo.hpp index 752a3a839f..caf61e86f6 100644 --- a/tools/urinfo/urinfo.hpp +++ b/tools/urinfo/urinfo.hpp @@ -401,5 +401,8 @@ inline void printDeviceInfos(ur_device_handle_t hDevice, std::cout << prefix; printDeviceInfo( hDevice, 
UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_3D_EXP); + std::cout << prefix; + printDeviceInfo(hDevice, + UR_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP); } } // namespace urinfo