From 9aceab4abdbe99f9e2edc2c81dba3fc9147c9e68 Mon Sep 17 00:00:00 2001 From: "Larsen, Steffen" Date: Wed, 17 Jan 2024 06:58:30 -0800 Subject: [PATCH 01/18] Add enqueue timestamp recording extension This commit adds a new extension feature for recording timestamps into events, the information from which can be queried using the existing profiling queries. Signed-off-by: Larsen, Steffen --- include/ur_api.h | 57 ++++++- include/ur_ddi.h | 10 ++ include/ur_print.h | 8 + include/ur_print.hpp | 66 ++++++++ .../core/EXP-ENQUEUE-TIMESTAMP-RECORDING.rst | 70 +++++++++ .../core/exp-enqueue-timestamp-recording.yml | 66 ++++++++ scripts/core/registry.yml | 3 + source/adapters/cuda/device.cpp | 4 + source/adapters/cuda/enqueue.cpp | 30 ++++ source/adapters/cuda/ur_interface_loader.cpp | 1 + source/adapters/hip/device.cpp | 2 + source/adapters/hip/enqueue.cpp | 31 ++++ source/adapters/hip/ur_interface_loader.cpp | 1 + source/adapters/level_zero/device.cpp | 3 + source/adapters/level_zero/event.cpp | 144 +++++++++++++++++- source/adapters/level_zero/event.hpp | 12 ++ source/adapters/level_zero/queue.cpp | 36 +++++ source/adapters/level_zero/queue.hpp | 18 +++ .../level_zero/ur_interface_loader.cpp | 1 + source/adapters/native_cpu/device.cpp | 2 + source/adapters/native_cpu/event.cpp | 12 ++ .../native_cpu/ur_interface_loader.cpp | 1 + source/adapters/null/ur_nullddi.cpp | 44 ++++++ source/adapters/opencl/device.cpp | 3 + source/adapters/opencl/event.cpp | 6 + .../adapters/opencl/ur_interface_loader.cpp | 1 + source/loader/layers/tracing/ur_trcddi.cpp | 50 ++++++ source/loader/layers/validation/ur_valddi.cpp | 65 +++++++- source/loader/ur_ldrddi.cpp | 65 ++++++++ source/loader/ur_libapi.cpp | 48 +++++- source/loader/ur_print.cpp | 8 + source/ur_api.cpp | 40 ++++- test/conformance/enqueue/CMakeLists.txt | 1 + .../enqueue/urEnqueueTimestampRecording.cpp | 94 ++++++++++++ test/conformance/testing/include/uur/utils.h | 2 + test/conformance/testing/source/utils.cpp | 6 + 
tools/urinfo/urinfo.hpp | 3 + 37 files changed, 1008 insertions(+), 6 deletions(-) create mode 100644 scripts/core/EXP-ENQUEUE-TIMESTAMP-RECORDING.rst create mode 100644 scripts/core/exp-enqueue-timestamp-recording.yml create mode 100644 test/conformance/enqueue/urEnqueueTimestampRecording.cpp diff --git a/include/ur_api.h b/include/ur_api.h index 7ba79f4e13..5677f1185a 100644 --- a/include/ur_api.h +++ b/include/ur_api.h @@ -221,6 +221,7 @@ typedef enum ur_function_t { UR_FUNCTION_COMMAND_BUFFER_GET_INFO_EXP = 218, ///< Enumerator for ::urCommandBufferGetInfoExp UR_FUNCTION_COMMAND_BUFFER_COMMAND_GET_INFO_EXP = 219, ///< Enumerator for ::urCommandBufferCommandGetInfoExp UR_FUNCTION_DEVICE_GET_SELECTED = 220, ///< Enumerator for ::urDeviceGetSelected + UR_FUNCTION_ENQUEUE_TIMESTAMP_RECORDING_EXP = 221, ///< Enumerator for ::urEnqueueTimestampRecordingExp /// @cond UR_FUNCTION_FORCE_UINT32 = 0x7fffffff /// @endcond @@ -1622,6 +1623,7 @@ typedef enum ur_device_info_t { ///< semaphore resources UR_DEVICE_INFO_INTEROP_SEMAPHORE_EXPORT_SUPPORT_EXP = 0x200F, ///< [::ur_bool_t] returns true if the device supports exporting internal ///< event resources + UR_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP = 0x2010, ///< [::ur_bool_t] returns true if the device supports timestamp recording /// @cond UR_DEVICE_INFO_FORCE_UINT32 = 0x7fffffff /// @endcond @@ -1647,7 +1649,7 @@ typedef enum ur_device_info_t { /// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE /// + `NULL == hDevice` /// - ::UR_RESULT_ERROR_INVALID_ENUMERATION -/// + `::UR_DEVICE_INFO_INTEROP_SEMAPHORE_EXPORT_SUPPORT_EXP < propName` +/// + `::UR_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP < propName` /// - ::UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION /// + If `propName` is not supported by the adapter. 
/// - ::UR_RESULT_ERROR_INVALID_SIZE @@ -5552,6 +5554,7 @@ typedef enum ur_command_t { UR_COMMAND_COMMAND_BUFFER_ENQUEUE_EXP = 0x1000, ///< Event created by ::urCommandBufferEnqueueExp UR_COMMAND_INTEROP_SEMAPHORE_WAIT_EXP = 0x2000, ///< Event created by ::urBindlessImagesWaitExternalSemaphoreExp UR_COMMAND_INTEROP_SEMAPHORE_SIGNAL_EXP = 0x2001, ///< Event created by ::urBindlessImagesSignalExternalSemaphoreExp + UR_COMMAND_TIMESTAMP_RECORDING_EXP = 0x2002, ///< Event created by ::urEnqueueTimestampRecordingExp /// @cond UR_COMMAND_FORCE_UINT32 = 0x7fffffff /// @endcond @@ -8741,6 +8744,46 @@ urKernelSuggestMaxCooperativeGroupCountExp( uint32_t *pGroupCountRet ///< [out] pointer to maximum number of groups ); +#if !defined(__GNUC__) +#pragma endregion +#endif +// Intel 'oneAPI' Unified Runtime Experimental APIs for enqueuing timestamp recordings +#if !defined(__GNUC__) +#pragma region enqueue timestamp recording(experimental) +#endif +/////////////////////////////////////////////////////////////////////////////// +/// @brief Enqueue a command for recording the device timestamp +/// +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_UNINITIALIZED +/// - ::UR_RESULT_ERROR_DEVICE_LOST +/// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC +/// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE +/// + `NULL == hQueue` +/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER +/// + `NULL == phEvent` +/// - ::UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST +UR_APIEXPORT ur_result_t UR_APICALL +urEnqueueTimestampRecordingExp( + ur_queue_handle_t hQueue, ///< [in] handle of the queue object + bool blocking, ///< [in] indicates whether the call to this function should block + ///< until the device timestamp recording command has executed on the + ///< device. 
+ uint32_t numEventsInWaitList, ///< [in] size of the event wait list + const ur_event_handle_t *phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the kernel execution. + ///< If nullptr, the numEventsInWaitList must be 0, indicating that no wait + ///< event. + ur_event_handle_t *phEvent ///< [in,out] return an event object that identifies this particular kernel + ///< execution instance. Profiling information can be queried + ///< from this event as if `hQueue` had profiling enabled. Querying + ///< `UR_PROFILING_INFO_COMMAND_QUEUED` or `UR_PROFILING_INFO_COMMAND_SUBMIT` + ///< reports the timestamp at the time of the call to this function. + ///< Querying `UR_PROFILING_INFO_COMMAND_START` or `UR_PROFILING_INFO_COMMAND_END` + ///< reports the timestamp recorded when the command is executed on the device. +); + #if !defined(__GNUC__) #pragma endregion #endif @@ -10432,6 +10475,18 @@ typedef struct ur_enqueue_cooperative_kernel_launch_exp_params_t { ur_event_handle_t **pphEvent; } ur_enqueue_cooperative_kernel_launch_exp_params_t; +/////////////////////////////////////////////////////////////////////////////// +/// @brief Function parameters for urEnqueueTimestampRecordingExp +/// @details Each entry is a pointer to the parameter passed to the function; +/// allowing the callback the ability to modify the parameter's value +typedef struct ur_enqueue_timestamp_recording_exp_params_t { + ur_queue_handle_t *phQueue; + bool *pblocking; + uint32_t *pnumEventsInWaitList; + const ur_event_handle_t **pphEventWaitList; + ur_event_handle_t **pphEvent; +} ur_enqueue_timestamp_recording_exp_params_t; + /////////////////////////////////////////////////////////////////////////////// /// @brief Function parameters for urBindlessImagesUnsampledImageHandleDestroyExp /// @details Each entry is a pointer to the parameter passed to the function; diff --git a/include/ur_ddi.h b/include/ur_ddi.h index 
88a37f2ff5..d271e6cc52 100644 --- a/include/ur_ddi.h +++ b/include/ur_ddi.h @@ -1438,10 +1438,20 @@ typedef ur_result_t(UR_APICALL *ur_pfnEnqueueCooperativeKernelLaunchExp_t)( const ur_event_handle_t *, ur_event_handle_t *); +/////////////////////////////////////////////////////////////////////////////// +/// @brief Function-pointer for urEnqueueTimestampRecordingExp +typedef ur_result_t(UR_APICALL *ur_pfnEnqueueTimestampRecordingExp_t)( + ur_queue_handle_t, + bool, + uint32_t, + const ur_event_handle_t *, + ur_event_handle_t *); + /////////////////////////////////////////////////////////////////////////////// /// @brief Table of EnqueueExp functions pointers typedef struct ur_enqueue_exp_dditable_t { ur_pfnEnqueueCooperativeKernelLaunchExp_t pfnCooperativeKernelLaunchExp; + ur_pfnEnqueueTimestampRecordingExp_t pfnTimestampRecordingExp; } ur_enqueue_exp_dditable_t; /////////////////////////////////////////////////////////////////////////////// diff --git a/include/ur_print.h b/include/ur_print.h index b4675aee02..d36c6f68f0 100644 --- a/include/ur_print.h +++ b/include/ur_print.h @@ -1938,6 +1938,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urPrintEnqueueWriteHostPipeParams(const stru /// - `buff_size < out_size` UR_APIEXPORT ur_result_t UR_APICALL urPrintEnqueueCooperativeKernelLaunchExpParams(const struct ur_enqueue_cooperative_kernel_launch_exp_params_t *params, char *buffer, const size_t buff_size, size_t *out_size); +/////////////////////////////////////////////////////////////////////////////// +/// @brief Print ur_enqueue_timestamp_recording_exp_params_t struct +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_INVALID_SIZE +/// - `buff_size < out_size` +UR_APIEXPORT ur_result_t UR_APICALL urPrintEnqueueTimestampRecordingExpParams(const struct ur_enqueue_timestamp_recording_exp_params_t *params, char *buffer, const size_t buff_size, size_t *out_size); + /////////////////////////////////////////////////////////////////////////////// /// @brief 
Print ur_bindless_images_unsampled_image_handle_destroy_exp_params_t struct /// @returns diff --git a/include/ur_print.hpp b/include/ur_print.hpp index 35f0f2e9df..cadae960ac 100644 --- a/include/ur_print.hpp +++ b/include/ur_print.hpp @@ -912,6 +912,9 @@ inline std::ostream &operator<<(std::ostream &os, enum ur_function_t value) { case UR_FUNCTION_DEVICE_GET_SELECTED: os << "UR_FUNCTION_DEVICE_GET_SELECTED"; break; + case UR_FUNCTION_ENQUEUE_TIMESTAMP_RECORDING_EXP: + os << "UR_FUNCTION_ENQUEUE_TIMESTAMP_RECORDING_EXP"; + break; default: os << "unknown enumerator"; break; @@ -2537,6 +2540,9 @@ inline std::ostream &operator<<(std::ostream &os, enum ur_device_info_t value) { case UR_DEVICE_INFO_INTEROP_SEMAPHORE_EXPORT_SUPPORT_EXP: os << "UR_DEVICE_INFO_INTEROP_SEMAPHORE_EXPORT_SUPPORT_EXP"; break; + case UR_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP: + os << "UR_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP"; + break; default: os << "unknown enumerator"; break; @@ -4141,6 +4147,18 @@ inline ur_result_t printTagged(std::ostream &os, const void *ptr, ur_device_info os << ")"; } break; + case UR_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP: { + const ur_bool_t *tptr = (const ur_bool_t *)ptr; + if (sizeof(ur_bool_t) > size) { + os << "invalid size (is: " << size << ", expected: >=" << sizeof(ur_bool_t) << ")"; + return UR_RESULT_ERROR_INVALID_SIZE; + } + os << (const void *)(tptr) << " ("; + + os << *tptr; + + os << ")"; + } break; default: os << "unknown enumerator"; return UR_RESULT_ERROR_INVALID_ENUMERATION; @@ -8646,6 +8664,9 @@ inline std::ostream &operator<<(std::ostream &os, enum ur_command_t value) { case UR_COMMAND_INTEROP_SEMAPHORE_SIGNAL_EXP: os << "UR_COMMAND_INTEROP_SEMAPHORE_SIGNAL_EXP"; break; + case UR_COMMAND_TIMESTAMP_RECORDING_EXP: + os << "UR_COMMAND_TIMESTAMP_RECORDING_EXP"; + break; default: os << "unknown enumerator"; break; @@ -13924,6 +13945,48 @@ inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct return os; } 
+/////////////////////////////////////////////////////////////////////////////// +/// @brief Print operator for the ur_enqueue_timestamp_recording_exp_params_t type +/// @returns +/// std::ostream & +inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct ur_enqueue_timestamp_recording_exp_params_t *params) { + + os << ".hQueue = "; + + ur::details::printPtr(os, + *(params->phQueue)); + + os << ", "; + os << ".blocking = "; + + os << *(params->pblocking); + + os << ", "; + os << ".numEventsInWaitList = "; + + os << *(params->pnumEventsInWaitList); + + os << ", "; + os << ".phEventWaitList = {"; + for (size_t i = 0; *(params->pphEventWaitList) != NULL && i < *params->pnumEventsInWaitList; ++i) { + if (i != 0) { + os << ", "; + } + + ur::details::printPtr(os, + (*(params->pphEventWaitList))[i]); + } + os << "}"; + + os << ", "; + os << ".phEvent = "; + + ur::details::printPtr(os, + *(params->pphEvent)); + + return os; +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Print operator for the ur_bindless_images_unsampled_image_handle_destroy_exp_params_t type /// @returns @@ -16943,6 +17006,9 @@ inline ur_result_t UR_APICALL printFunctionParams(std::ostream &os, ur_function_ case UR_FUNCTION_ENQUEUE_COOPERATIVE_KERNEL_LAUNCH_EXP: { os << (const struct ur_enqueue_cooperative_kernel_launch_exp_params_t *)params; } break; + case UR_FUNCTION_ENQUEUE_TIMESTAMP_RECORDING_EXP: { + os << (const struct ur_enqueue_timestamp_recording_exp_params_t *)params; + } break; case UR_FUNCTION_BINDLESS_IMAGES_UNSAMPLED_IMAGE_HANDLE_DESTROY_EXP: { os << (const struct ur_bindless_images_unsampled_image_handle_destroy_exp_params_t *)params; } break; diff --git a/scripts/core/EXP-ENQUEUE-TIMESTAMP-RECORDING.rst b/scripts/core/EXP-ENQUEUE-TIMESTAMP-RECORDING.rst new file mode 100644 index 0000000000..25407d51d6 --- /dev/null +++ b/scripts/core/EXP-ENQUEUE-TIMESTAMP-RECORDING.rst @@ -0,0 +1,70 @@ +<% + OneApi=tags['$OneApi'] + 
x=tags['$x'] + X=x.upper() +%> + +.. _experimental-enqueue-timestamp-recording: + +================================================================================ +Enqueue Timestamp Recording +================================================================================ + +.. warning:: + + Experimental features: + + * May be replaced, updated, or removed at any time. + * Do not require maintaining API/ABI stability of their own additions over + time. + * Do not require conformance testing of their own additions. + + +Motivation +-------------------------------------------------------------------------------- +Currently, the only way to get timestamp information is to enable +profiling on a queue and retrieve the information from events coming from +commands submitted to it. However, not all systems give full control of the +queue construction to the programmer wanting the profiling information. To address +this, this extension adds the ability to enqueue a timestamp recording on any +queue, with or without profiling enabled. The resulting event can in turn be queried for +the usual profiling information. 
+ + +API +-------------------------------------------------------------------------------- + +Enums +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +* ${x}_device_info_t + * ${X}_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP + +* ${x}_command_t + * ${X}_COMMAND_TIMESTAMP_RECORDING_EXP + +Functions +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +* ${x}EnqueueTimestampRecordingExp + +Changelog +-------------------------------------------------------------------------------- + ++-----------+------------------------+ +| Revision | Changes | ++===========+========================+ +| 1.0 | Initial Draft | ++-----------+------------------------+ + + +Support +-------------------------------------------------------------------------------- + +Adapters which support this experimental feature *must* return true for the new +`${X}_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP` device info query. + + +Contributors +-------------------------------------------------------------------------------- + +* Steffen Larsen `steffen.larsen@intel.com `_ diff --git a/scripts/core/exp-enqueue-timestamp-recording.yml b/scripts/core/exp-enqueue-timestamp-recording.yml new file mode 100644 index 0000000000..16fa700ff2 --- /dev/null +++ b/scripts/core/exp-enqueue-timestamp-recording.yml @@ -0,0 +1,66 @@ +# +# Copyright (C) 2024 Intel Corporation +# +# Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. 
+# See LICENSE.TXT +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# See YaML.md for syntax definition +# +--- #-------------------------------------------------------------------------- +type: header +desc: "Intel $OneApi Unified Runtime Experimental APIs for enqueuing timestamp recordings" +ordinal: "99" +--- #-------------------------------------------------------------------------- +type: enum +extend: true +typed_etors: true +desc: "Extension enums to $x_device_info_t to support timestamp recordings." +name: $x_device_info_t +etors: + - name: TIMESTAMP_RECORDING_SUPPORT_EXP + value: "0x2010" + desc: "[$x_bool_t] returns true if the device supports timestamp recording" +--- #-------------------------------------------------------------------------- +type: enum +extend: true +desc: "Command Type experimental enumerations." +name: $x_command_t +etors: + - name: TIMESTAMP_RECORDING_EXP + value: "0x2002" + desc: Event created by $xEnqueueTimestampRecordingExp +--- #-------------------------------------------------------------------------- +type: function +desc: "Enqueue a command for recording the device timestamp" +class: $xEnqueue +name: TimestampRecordingExp +params: + - type: $x_queue_handle_t + name: hQueue + desc: "[in] handle of the queue object" + - type: bool + name: blocking + desc: | + [in] indicates whether the call to this function should block + until the device timestamp recording command has executed on the + device. + - type: uint32_t + name: numEventsInWaitList + desc: "[in] size of the event wait list" + - type: "const $x_event_handle_t*" + name: phEventWaitList + desc: | + [in][optional][range(0, numEventsInWaitList)] pointer to a list of events that must be complete before the kernel execution. + If nullptr, the numEventsInWaitList must be 0, indicating that no wait event. + - type: $x_event_handle_t* + name: phEvent + desc: | + [in,out] return an event object that identifies this particular kernel execution instance. 
Profiling information can be queried + from this event as if `hQueue` had profiling enabled. Querying `UR_PROFILING_INFO_COMMAND_QUEUED` or `UR_PROFILING_INFO_COMMAND_SUBMIT` + reports the timestamp at the time of the call to this function. Querying `UR_PROFILING_INFO_COMMAND_START` or `UR_PROFILING_INFO_COMMAND_END` + reports the timestamp recorded when the command is executed on the device. +returns: + - $X_RESULT_ERROR_INVALID_NULL_HANDLE + - $X_RESULT_ERROR_INVALID_NULL_POINTER + - $X_RESULT_ERROR_INVALID_EVENT_WAIT_LIST diff --git a/scripts/core/registry.yml b/scripts/core/registry.yml index 3e86e109c3..cbd7cf97ac 100644 --- a/scripts/core/registry.yml +++ b/scripts/core/registry.yml @@ -577,6 +577,9 @@ etors: - name: DEVICE_GET_SELECTED desc: Enumerator for $xDeviceGetSelected value: '220' +- name: ENQUEUE_TIMESTAMP_RECORDING_EXP + desc: Enumerator for $xEnqueueTimestampRecordingExp + value: '221' --- type: enum desc: Defines structure types diff --git a/source/adapters/cuda/device.cpp b/source/adapters/cuda/device.cpp index b33ad6c792..bccf90fed5 100644 --- a/source/adapters/cuda/device.cpp +++ b/source/adapters/cuda/device.cpp @@ -902,6 +902,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, // CUDA does not support exporting semaphores or events. return ReturnValue(false); } + case UR_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP: { + // CUDA supports recording timestamp events. 
+ return ReturnValue(true); + } case UR_DEVICE_INFO_DEVICE_ID: { int Value = 0; UR_CHECK_ERROR(cuDeviceGetAttribute( diff --git a/source/adapters/cuda/enqueue.cpp b/source/adapters/cuda/enqueue.cpp index c8ae115df3..b802aff580 100644 --- a/source/adapters/cuda/enqueue.cpp +++ b/source/adapters/cuda/enqueue.cpp @@ -1756,3 +1756,33 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueWriteHostPipe( return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueTimestampRecordingExp( + ur_queue_handle_t hQueue, bool blocking, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + + ur_result_t Result = UR_RESULT_SUCCESS; + std::unique_ptr RetImplEvent{nullptr}; + try { + ScopedContext Active(hQueue->getContext()); + CUstream CuStream = hQueue->getNextComputeStream(); + + UR_CHECK_ERROR(enqueueEventsWait(hQueue, CuStream, numEventsInWaitList, + phEventWaitList)); + + RetImplEvent = + std::unique_ptr(ur_event_handle_t_::makeNative( + UR_COMMAND_TIMESTAMP_RECORDING_EXP, hQueue, CuStream)); + UR_CHECK_ERROR(RetImplEvent->start()); + UR_CHECK_ERROR(RetImplEvent->record()); + + if (blocking) { + UR_CHECK_ERROR(cuStreamSynchronize(CuStream)); + } + + *phEvent = RetImplEvent.release(); + } catch (ur_result_t Err) { + Result = Err; + } + return Result; +} diff --git a/source/adapters/cuda/ur_interface_loader.cpp b/source/adapters/cuda/ur_interface_loader.cpp index 52e8570b17..7ba61fd713 100644 --- a/source/adapters/cuda/ur_interface_loader.cpp +++ b/source/adapters/cuda/ur_interface_loader.cpp @@ -406,6 +406,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable( pDdiTable->pfnCooperativeKernelLaunchExp = urEnqueueCooperativeKernelLaunchExp; + pDdiTable->pfnTimestampRecordingExp = urEnqueueTimestampRecordingExp; return UR_RESULT_SUCCESS; } diff --git a/source/adapters/hip/device.cpp b/source/adapters/hip/device.cpp index 7c9142f3c7..1cf1ce11ba 100644 --- a/source/adapters/hip/device.cpp 
+++ b/source/adapters/hip/device.cpp @@ -829,6 +829,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, case UR_DEVICE_INFO_COMPOSITE_DEVICE: // These two are exclusive of L0. return ReturnValue(0); + case UR_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP: + return ReturnValue(true); // TODO: Investigate if this information is available on HIP. case UR_DEVICE_INFO_GPU_EU_COUNT: diff --git a/source/adapters/hip/enqueue.cpp b/source/adapters/hip/enqueue.cpp index 24ba905688..e819f119ca 100644 --- a/source/adapters/hip/enqueue.cpp +++ b/source/adapters/hip/enqueue.cpp @@ -1895,3 +1895,34 @@ void setCopyRectParams(ur_rect_region_t Region, const void *SrcPtr, : (DstType == hipMemoryTypeDevice ? hipMemcpyHostToDevice : hipMemcpyHostToHost)); } + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueTimestampRecordingExp( + ur_queue_handle_t hQueue, bool blocking, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + + ur_result_t Result = UR_RESULT_SUCCESS; + std::unique_ptr RetImplEvent{nullptr}; + try { + uint32_t StreamToken; + ur_stream_quard Guard; + hipStream_t HIPStream = hQueue->getNextComputeStream( + numEventsInWaitList, phEventWaitList, Guard, &StreamToken); + UR_CHECK_ERROR(enqueueEventsWait(hQueue, HIPStream, numEventsInWaitList, + phEventWaitList)); + + RetImplEvent = + std::unique_ptr(ur_event_handle_t_::makeNative( + UR_COMMAND_TIMESTAMP_RECORDING_EXP, hQueue, HIPStream)); + UR_CHECK_ERROR(RetImplEvent->start()); + UR_CHECK_ERROR(RetImplEvent->record()); + + if (blocking) { + UR_CHECK_ERROR(hipStreamSynchronize(HIPStream)); + } + + *phEvent = RetImplEvent.release(); + } catch (ur_result_t Err) { + Result = Err; + } + return Result; +} diff --git a/source/adapters/hip/ur_interface_loader.cpp b/source/adapters/hip/ur_interface_loader.cpp index b7e3224b71..4e87821367 100644 --- a/source/adapters/hip/ur_interface_loader.cpp +++ b/source/adapters/hip/ur_interface_loader.cpp @@ -376,6 
+376,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable( pDdiTable->pfnCooperativeKernelLaunchExp = urEnqueueCooperativeKernelLaunchExp; + pDdiTable->pfnTimestampRecordingExp = urEnqueueTimestampRecordingExp; return UR_RESULT_SUCCESS; } diff --git a/source/adapters/level_zero/device.cpp b/source/adapters/level_zero/device.cpp index 918b04400a..7366410f5b 100644 --- a/source/adapters/level_zero/device.cpp +++ b/source/adapters/level_zero/device.cpp @@ -852,6 +852,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( case UR_DEVICE_INFO_VIRTUAL_MEMORY_SUPPORT: { return ReturnValue(static_cast(true)); } + case UR_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP: { + return ReturnValue(static_cast(true)); + } case UR_DEVICE_INFO_ESIMD_SUPPORT: { // ESIMD is only supported by Intel GPUs. diff --git a/source/adapters/level_zero/event.cpp b/source/adapters/level_zero/event.cpp index 57b839a714..cf10128210 100644 --- a/source/adapters/level_zero/event.cpp +++ b/source/adapters/level_zero/event.cpp @@ -461,8 +461,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventGetProfilingInfo( ) { std::shared_lock EventLock(Event->Mutex); - if (Event->UrQueue && - (Event->UrQueue->Properties & UR_QUEUE_FLAG_PROFILING_ENABLE) == 0) { + // The event must either have profiling enabled or be recording timestamps. + bool isTimestampedEvent = Event->isTimestamped(); + if (!Event->isProfilingEnabled() && !isTimestampedEvent) { return UR_RESULT_ERROR_PROFILING_INFO_NOT_AVAILABLE; } @@ -475,6 +476,62 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventGetProfilingInfo( UrReturnHelper ReturnValue(PropValueSize, PropValue, PropValueSizeRet); + // For timestamped events we have the timestamps ready directly on the event + // handle, so we short-circuit the return. 
+ if (isTimestampedEvent) { + uint64_t ContextStartTime = Event->RecordEventStartTimestamp; + switch (PropName) { + case UR_PROFILING_INFO_COMMAND_QUEUED: + case UR_PROFILING_INFO_COMMAND_SUBMIT: + return ReturnValue(ContextStartTime); + case UR_PROFILING_INFO_COMMAND_END: + case UR_PROFILING_INFO_COMMAND_START: { + // If RecordEventEndTimestamp on the event is non-zero it means it has + // collected the result of the queue already. In that case it has been + // adjusted and is ready for immediate return. + if (Event->RecordEventEndTimestamp) + return ReturnValue(Event->RecordEventEndTimestamp); + + // Otherwise we need to collect it from the queue. + auto Entry = Event->UrQueue->EndTimeRecordings.find(Event); + + // Unexpected state if there is no end-time record. + if (Entry == Event->UrQueue->EndTimeRecordings.end()) + return UR_RESULT_ERROR_UNKNOWN; + auto &EndTimeRecording = Entry->second; + + // End time needs to be adjusted for resolution and valid bits. + uint64_t ContextEndTime = + ((*EndTimeRecording.RecordEventEndTimestamp) & TimestampMaxValue) * + ZeTimerResolution; + + // If the result is 0, we have not yet gotten results back and so we just + // return it. + if (ContextEndTime == 0) + return ReturnValue(ContextEndTime); + + // Handle a possible wrap-around (the underlying HW counter is < 64-bit). + // Note, it will not report correct time if there were multiple wrap + // arounds, and the longer term plan is to enlarge the capacity of the + // HW timestamps. + if (ContextEndTime < ContextStartTime) + ContextEndTime += TimestampMaxValue * ZeTimerResolution; + + // Now that we have the result, there is no need to keep it in the queue + // anymore, so we cache it on the event and evict the record from the + // queue. 
+ Event->RecordEventEndTimestamp = ContextEndTime; + free(EndTimeRecording.RecordEventEndTimestamp); + Event->UrQueue->EndTimeRecordings.erase(Entry); + + return ReturnValue(ContextEndTime); + } + default: + urPrint("urEventGetProfilingInfo: not supported ParamName\n"); + return UR_RESULT_ERROR_INVALID_VALUE; + } + } + ze_kernel_timestamp_result_t tsResult; // A Command-buffer consists of three command-lists for which only a single @@ -583,6 +640,62 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventGetProfilingInfo( return UR_RESULT_SUCCESS; } +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueTimestampRecordingExp( + ur_queue_handle_t Queue, ///< [in] handle of the queue object + bool Blocking, ///< [in] blocking or non-blocking enqueue + uint32_t NumEventsInWaitList, ///< [in] size of the event wait list + const ur_event_handle_t + *EventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] + ///< pointer to a list of events that must be complete + ///< before this command can be executed. If nullptr, + ///< the numEventsInWaitList must be 0, indicating + ///< that this command does not wait on any event to + ///< complete. + ur_event_handle_t + *OutEvent ///< [in,out] return an event object that identifies + ///< this particular command instance. +) { + // Lock automatically releases when this goes out of scope. 
+ std::scoped_lock lock(Queue->Mutex); + + ur_device_handle_t Device = Queue->Device; + + uint64_t DeviceStartTimestamp = 0; + UR_CALL(urDeviceGetGlobalTimestamps(Device, &DeviceStartTimestamp, nullptr)); + + bool UseCopyEngine = false; + _ur_ze_event_list_t TmpWaitList; + UR_CALL(TmpWaitList.createAndRetainUrZeEventList( + NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine)); + + // Get a new command list to be used on this call + ur_command_list_ptr_t CommandList{}; + UR_CALL(Queue->Context->getAvailableCommandList( + Queue, CommandList, UseCopyEngine, /* AllowBatching */ false)); + + UR_CALL(createEventAndAssociateQueue( + Queue, OutEvent, UR_COMMAND_TIMESTAMP_RECORDING_EXP, CommandList, + /* IsInternal */ false, /* HostVisible */ true)); + ze_event_handle_t ZeEvent = (*OutEvent)->ZeEvent; + (*OutEvent)->WaitList = TmpWaitList; + (*OutEvent)->RecordEventStartTimestamp = DeviceStartTimestamp; + + // Allocate new entry in the queue's recordings. + uint64_t *EndTimestampPtr = (uint64_t *)malloc(sizeof(uint64_t)); + *EndTimestampPtr = 0; + Queue->EndTimeRecordings[*OutEvent] = ur_queue_handle_t_::end_time_recording{ + EndTimestampPtr, /*EventHasDied=*/false}; + + ZE2UR_CALL(zeCommandListAppendWriteGlobalTimestamp, + (CommandList->first, EndTimestampPtr, ZeEvent, + (*OutEvent)->WaitList.Length, (*OutEvent)->WaitList.ZeEventList)); + + UR_CALL( + Queue->executeCommandList(CommandList, Blocking, /* OkToBatch */ false)); + + return UR_RESULT_SUCCESS; +} + ur_result_t ur_event_handle_t_::getOrCreateHostVisibleEvent( ze_event_handle_t &ZeHostVisibleEvent) { @@ -876,6 +989,24 @@ ur_result_t urEventReleaseInternal(ur_event_handle_t Event) { Event->Context->addEventToContextCache(Event); } + // If the event was a timestamp recording, we try to evict its entry in the + // queue. 
+ if (Event->isTimestamped()) { + auto Entry = Queue->EndTimeRecordings.find(Event); + if (Entry != Queue->EndTimeRecordings.end()) { + auto &EndTimeRecording = Entry->second; + if (*(EndTimeRecording.RecordEventEndTimestamp) == 0) { + // If the end time recording has not finished, we tell the queue that + // the event is no longer alive to avoid invalid write-backs. + EndTimeRecording.EventHasDied = true; + } else { + // Otherwise we evict the entry. + free(EndTimeRecording.RecordEventEndTimestamp); + Event->UrQueue->EndTimeRecordings.erase(Entry); + } + } + } + // We intentionally incremented the reference counter when an event is // created so that we can avoid ur_queue_handle_t is released before the // associated ur_event_handle_t is released. Here we have to decrement it so @@ -1394,3 +1525,12 @@ bool ur_event_handle_t_::isProfilingEnabled() const { return !UrQueue || // tentatively assume user events are profiling enabled (UrQueue->Properties & UR_QUEUE_FLAG_PROFILING_ENABLE) != 0; } + +// Tells if this event was created as a timestamp event, allowing profiling +// info even if profiling is not enabled. +bool ur_event_handle_t_::isTimestamped() const { + // If we are recording, the start time of the event will be non-zero. The + // end time might still be missing, depending on whether the corresponding + // enqueue is still running. + return RecordEventStartTimestamp != 0; +} diff --git a/source/adapters/level_zero/event.hpp b/source/adapters/level_zero/event.hpp index c266de8c0d..abc5f0f038 100644 --- a/source/adapters/level_zero/event.hpp +++ b/source/adapters/level_zero/event.hpp @@ -196,6 +196,14 @@ struct ur_event_handle_t_ : _ur_object { // performance bool IsMultiDevice = {false}; + // Indicates the recorded start and end timestamps for the event. These are + // only set for events returned by timestamp recording enqueue functions. + // A non-zero value for RecordEventStartTimestamp indicates the event was the + // result of a timestamp recording. 
If RecordEventEndTimestamp is non-zero, it + // means the event has fetched the end-timestamp from the queue. + uint64_t RecordEventStartTimestamp = 0; + uint64_t RecordEventEndTimestamp = 0; + // Besides each PI object keeping a total reference count in // _ur_object::RefCount we keep special track of the event *external* // references. This way we are able to tell when the event is not referenced @@ -220,6 +228,10 @@ struct ur_event_handle_t_ : _ur_object { // Tells if this event is with profiling capabilities. bool isProfilingEnabled() const; + // Tells if this event was created as a timestamp event, allowing profiling + // info even if profiling is not enabled. + bool isTimestamped() const; + // Get the host-visible event or create one and enqueue its signal. ur_result_t getOrCreateHostVisibleEvent(ze_event_handle_t &HostVisibleEvent); }; diff --git a/source/adapters/level_zero/queue.cpp b/source/adapters/level_zero/queue.cpp index 2009c3c6f5..f9c29d0a38 100644 --- a/source/adapters/level_zero/queue.cpp +++ b/source/adapters/level_zero/queue.cpp @@ -1295,6 +1295,35 @@ ur_result_t ur_queue_handle_t_::active_barriers::clear() { return UR_RESULT_SUCCESS; } +void ur_queue_handle_t_::clearEndTimeRecordings() { + uint64_t ZeTimerResolution = Device->ZeDeviceProperties->timerResolution; + const uint64_t TimestampMaxValue = + ((1ULL << Device->ZeDeviceProperties->kernelTimestampValidBits) - 1ULL); + + for (auto Entry : EndTimeRecordings) { + auto &Event = Entry.first; + auto &EndTimeRecording = Entry.second; + if (!Entry.second.EventHasDied) { + // Write the result back to the event if it is not dead. + uint64_t ContextEndTime = + ((*EndTimeRecording.RecordEventEndTimestamp) & TimestampMaxValue) * + ZeTimerResolution; + + // Handle a possible wrap-around (the underlying HW counter is < 64-bit). + // Note, it will not report correct time if there were multiple wrap + // arounds, and the longer term plan is to enlarge the capacity of the + // HW timestamps. 
+ if (ContextEndTime < Event->RecordEventStartTimestamp) + ContextEndTime += TimestampMaxValue * ZeTimerResolution; + + // Store it in the event. + Event->RecordEventEndTimestamp = ContextEndTime; + } + free(EndTimeRecording.RecordEventEndTimestamp); + } + EndTimeRecordings.clear(); +} + ur_result_t urQueueReleaseInternal(ur_queue_handle_t Queue) { ur_queue_handle_t UrQueue = reinterpret_cast(Queue); @@ -1320,6 +1349,8 @@ ur_result_t urQueueReleaseInternal(ur_queue_handle_t Queue) { } } + Queue->clearEndTimeRecordings(); + urPrint("urQueueRelease(compute) NumTimesClosedFull %d, " "NumTimesClosedEarly %d\n", UrQueue->ComputeCommandBatch.NumTimesClosedFull, @@ -1450,6 +1481,11 @@ ur_result_t ur_queue_handle_t_::synchronize() { LastCommandEvent = nullptr; } + // Since all timestamp recordings should have finished with the + // synchronizations, we can clear the map and write the results to the owning + // events. + clearEndTimeRecordings(); + // With the entire queue synchronized, the active barriers must be done so we // can remove them. if (auto Res = ActiveBarriers.clear()) diff --git a/source/adapters/level_zero/queue.hpp b/source/adapters/level_zero/queue.hpp index 06751e03c1..b904501290 100644 --- a/source/adapters/level_zero/queue.hpp +++ b/source/adapters/level_zero/queue.hpp @@ -346,6 +346,24 @@ struct ur_queue_handle_t_ : _ur_object { std::vector> EventCachesDeviceMap{2}; + // End-times enqueued are stored on the queue rather than on the event to + // avoid the event objects having been destroyed prior to the write to the + // end-time member. + struct end_time_recording { + // RecordEventEndTimestamp is not adjusted for valid bits nor resolution, as + // it is written asynchronously. We use a heap allocation for these so they + // do not move after the timestamp recording is given the address to write + // the result to. + uint64_t *RecordEventEndTimestamp = nullptr; + // The event may die before the recording has been written back. 
In this + // case the event will mark this for deletion when the queue sees fit. + bool EventHasDied = false; + }; + std::map EndTimeRecordings; + + // Clear the end time recording timestamps entries. + void clearEndTimeRecordings(); + // adjust the queue's batch size, knowing that the current command list // is being closed with a full batch. // For copy commands, IsCopy is set to 'true'. diff --git a/source/adapters/level_zero/ur_interface_loader.cpp b/source/adapters/level_zero/ur_interface_loader.cpp index 3c56010e7c..a0bf082875 100644 --- a/source/adapters/level_zero/ur_interface_loader.cpp +++ b/source/adapters/level_zero/ur_interface_loader.cpp @@ -453,6 +453,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable( pDdiTable->pfnCooperativeKernelLaunchExp = urEnqueueCooperativeKernelLaunchExp; + pDdiTable->pfnTimestampRecordingExp = urEnqueueTimestampRecordingExp; return UR_RESULT_SUCCESS; } diff --git a/source/adapters/native_cpu/device.cpp b/source/adapters/native_cpu/device.cpp index dfabfb81e5..f551ab2750 100644 --- a/source/adapters/native_cpu/device.cpp +++ b/source/adapters/native_cpu/device.cpp @@ -313,6 +313,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, case UR_DEVICE_INFO_COMMAND_BUFFER_UPDATE_SUPPORT_EXP: return ReturnValue(false); + case UR_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP: + return ReturnValue(false); default: DIE_NO_IMPLEMENTATION; } diff --git a/source/adapters/native_cpu/event.cpp b/source/adapters/native_cpu/event.cpp index 112bb553c0..9049e3c1b6 100644 --- a/source/adapters/native_cpu/event.cpp +++ b/source/adapters/native_cpu/event.cpp @@ -87,3 +87,15 @@ urEventSetCallback(ur_event_handle_t hEvent, ur_execution_info_t execStatus, DIE_NO_IMPLEMENTATION; } + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueTimestampRecordingExp( + ur_queue_handle_t hQueue, bool blocking, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + 
std::ignore = hQueue; + std::ignore = blocking; + std::ignore = numEventsInWaitList; + std::ignore = phEventWaitList; + std::ignore = phEvent; + + DIE_NO_IMPLEMENTATION; +} diff --git a/source/adapters/native_cpu/ur_interface_loader.cpp b/source/adapters/native_cpu/ur_interface_loader.cpp index 0c48ee1fb3..7a4dc53453 100644 --- a/source/adapters/native_cpu/ur_interface_loader.cpp +++ b/source/adapters/native_cpu/ur_interface_loader.cpp @@ -391,6 +391,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable( } pDdiTable->pfnCooperativeKernelLaunchExp = nullptr; + pDdiTable->pfnTimestampRecordingExp = urEnqueueTimestampRecordingExp; return UR_RESULT_SUCCESS; } diff --git a/source/adapters/null/ur_nullddi.cpp b/source/adapters/null/ur_nullddi.cpp index 5296123911..ea49a6c441 100644 --- a/source/adapters/null/ur_nullddi.cpp +++ b/source/adapters/null/ur_nullddi.cpp @@ -5468,6 +5468,47 @@ __urdlllocal ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( return exceptionToResult(std::current_exception()); } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urEnqueueTimestampRecordingExp +__urdlllocal ur_result_t UR_APICALL urEnqueueTimestampRecordingExp( + ur_queue_handle_t hQueue, ///< [in] handle of the queue object + bool + blocking, ///< [in] indicates whether the call to this function should block until + ///< until the device timestamp recording command has executed on the + ///< device. + uint32_t numEventsInWaitList, ///< [in] size of the event wait list + const ur_event_handle_t * + phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the kernel execution. + ///< If nullptr, the numEventsInWaitList must be 0, indicating that no wait + ///< event. + ur_event_handle_t * + phEvent ///< [in,out] return an event object that identifies this particular kernel + ///< execution instance. 
Profiling information can be queried + ///< from this event as if `hQueue` had profiling enabled. Querying + ///< `UR_PROFILING_INFO_COMMAND_QUEUED` or `UR_PROFILING_INFO_COMMAND_SUBMIT` + ///< reports the timestamp at the time of the call to this function. + ///< Querying `UR_PROFILING_INFO_COMMAND_START` or `UR_PROFILING_INFO_COMMAND_END` + ///< reports the timestamp recorded when the command is executed on the device. + ) try { + ur_result_t result = UR_RESULT_SUCCESS; + + // if the driver has created a custom function, then call it instead of using the generic path + auto pfnTimestampRecordingExp = + d_context.urDdiTable.EnqueueExp.pfnTimestampRecordingExp; + if (nullptr != pfnTimestampRecordingExp) { + result = pfnTimestampRecordingExp(hQueue, blocking, numEventsInWaitList, + phEventWaitList, phEvent); + } else { + // generic implementation + *phEvent = reinterpret_cast(d_context.get()); + } + + return result; +} catch (...) { + return exceptionToResult(std::current_exception()); +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Intercept function for urProgramBuildExp __urdlllocal ur_result_t UR_APICALL urProgramBuildExp( @@ -6024,6 +6065,9 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable( pDdiTable->pfnCooperativeKernelLaunchExp = driver::urEnqueueCooperativeKernelLaunchExp; + pDdiTable->pfnTimestampRecordingExp = + driver::urEnqueueTimestampRecordingExp; + return result; } catch (...) 
{ return exceptionToResult(std::current_exception()); diff --git a/source/adapters/opencl/device.cpp b/source/adapters/opencl/device.cpp index 115b9b2e09..b2baec8417 100644 --- a/source/adapters/opencl/device.cpp +++ b/source/adapters/opencl/device.cpp @@ -780,6 +780,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, case UR_DEVICE_INFO_VIRTUAL_MEMORY_SUPPORT: { return ReturnValue(false); } + case UR_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP: { + return ReturnValue(false); + } case UR_DEVICE_INFO_HOST_PIPE_READ_WRITE_SUPPORTED: { bool Supported = false; CL_RETURN_ON_FAILURE(cl_adapter::checkDeviceExtensions( diff --git a/source/adapters/opencl/event.cpp b/source/adapters/opencl/event.cpp index d180cfb097..39c84397f7 100644 --- a/source/adapters/opencl/event.cpp +++ b/source/adapters/opencl/event.cpp @@ -263,3 +263,9 @@ urEventSetCallback(ur_event_handle_t hEvent, ur_execution_info_t execStatus, CallbackType, ClCallback, Callback)); return UR_RESULT_SUCCESS; } + +UR_APIEXPORT ur_result_t UR_APICALL +urEnqueueTimestampRecordingExp(ur_queue_handle_t, bool, uint32_t, + const ur_event_handle_t *, ur_event_handle_t *) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} diff --git a/source/adapters/opencl/ur_interface_loader.cpp b/source/adapters/opencl/ur_interface_loader.cpp index dd21d8d0b0..88d5e396f8 100644 --- a/source/adapters/opencl/ur_interface_loader.cpp +++ b/source/adapters/opencl/ur_interface_loader.cpp @@ -397,6 +397,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable( pDdiTable->pfnCooperativeKernelLaunchExp = urEnqueueCooperativeKernelLaunchExp; + pDdiTable->pfnTimestampRecordingExp = urEnqueueTimestampRecordingExp; return UR_RESULT_SUCCESS; } diff --git a/source/loader/layers/tracing/ur_trcddi.cpp b/source/loader/layers/tracing/ur_trcddi.cpp index 0609d93737..a3ce985777 100644 --- a/source/loader/layers/tracing/ur_trcddi.cpp +++ b/source/loader/layers/tracing/ur_trcddi.cpp @@ -6069,6 +6069,52 @@ 
__urdlllocal ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( return result; } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urEnqueueTimestampRecordingExp +__urdlllocal ur_result_t UR_APICALL urEnqueueTimestampRecordingExp( + ur_queue_handle_t hQueue, ///< [in] handle of the queue object + bool + blocking, ///< [in] indicates whether the call to this function should block until + ///< until the device timestamp recording command has executed on the + ///< device. + uint32_t numEventsInWaitList, ///< [in] size of the event wait list + const ur_event_handle_t * + phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the kernel execution. + ///< If nullptr, the numEventsInWaitList must be 0, indicating that no wait + ///< event. + ur_event_handle_t * + phEvent ///< [in,out] return an event object that identifies this particular kernel + ///< execution instance. Profiling information can be queried + ///< from this event as if `hQueue` had profiling enabled. Querying + ///< `UR_PROFILING_INFO_COMMAND_QUEUED` or `UR_PROFILING_INFO_COMMAND_SUBMIT` + ///< reports the timestamp at the time of the call to this function. + ///< Querying `UR_PROFILING_INFO_COMMAND_START` or `UR_PROFILING_INFO_COMMAND_END` + ///< reports the timestamp recorded when the command is executed on the device. 
+) { + auto pfnTimestampRecordingExp = + context.urDdiTable.EnqueueExp.pfnTimestampRecordingExp; + + if (nullptr == pfnTimestampRecordingExp) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + + ur_enqueue_timestamp_recording_exp_params_t params = { + &hQueue, &blocking, &numEventsInWaitList, &phEventWaitList, &phEvent}; + uint64_t instance = + context.notify_begin(UR_FUNCTION_ENQUEUE_TIMESTAMP_RECORDING_EXP, + "urEnqueueTimestampRecordingExp", ¶ms); + + ur_result_t result = pfnTimestampRecordingExp( + hQueue, blocking, numEventsInWaitList, phEventWaitList, phEvent); + + context.notify_end(UR_FUNCTION_ENQUEUE_TIMESTAMP_RECORDING_EXP, + "urEnqueueTimestampRecordingExp", ¶ms, &result, + instance); + + return result; +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Intercept function for urProgramBuildExp __urdlllocal ur_result_t UR_APICALL urProgramBuildExp( @@ -6776,6 +6822,10 @@ __urdlllocal ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable( pDdiTable->pfnCooperativeKernelLaunchExp = ur_tracing_layer::urEnqueueCooperativeKernelLaunchExp; + dditable.pfnTimestampRecordingExp = pDdiTable->pfnTimestampRecordingExp; + pDdiTable->pfnTimestampRecordingExp = + ur_tracing_layer::urEnqueueTimestampRecordingExp; + return result; } /////////////////////////////////////////////////////////////////////////////// diff --git a/source/loader/layers/validation/ur_valddi.cpp b/source/loader/layers/validation/ur_valddi.cpp index 1ad5cede4d..edd2f56506 100644 --- a/source/loader/layers/validation/ur_valddi.cpp +++ b/source/loader/layers/validation/ur_valddi.cpp @@ -496,7 +496,7 @@ __urdlllocal ur_result_t UR_APICALL urDeviceGetInfo( return UR_RESULT_ERROR_INVALID_NULL_POINTER; } - if (UR_DEVICE_INFO_INTEROP_SEMAPHORE_EXPORT_SUPPORT_EXP < propName) { + if (UR_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP < propName) { return UR_RESULT_ERROR_INVALID_ENUMERATION; } @@ -8863,6 +8863,65 @@ __urdlllocal ur_result_t UR_APICALL 
urKernelSuggestMaxCooperativeGroupCountExp( return result; } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urEnqueueTimestampRecordingExp +__urdlllocal ur_result_t UR_APICALL urEnqueueTimestampRecordingExp( + ur_queue_handle_t hQueue, ///< [in] handle of the queue object + bool + blocking, ///< [in] indicates whether the call to this function should block until + ///< until the device timestamp recording command has executed on the + ///< device. + uint32_t numEventsInWaitList, ///< [in] size of the event wait list + const ur_event_handle_t * + phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the kernel execution. + ///< If nullptr, the numEventsInWaitList must be 0, indicating that no wait + ///< event. + ur_event_handle_t * + phEvent ///< [in,out] return an event object that identifies this particular kernel + ///< execution instance. Profiling information can be queried + ///< from this event as if `hQueue` had profiling enabled. Querying + ///< `UR_PROFILING_INFO_COMMAND_QUEUED` or `UR_PROFILING_INFO_COMMAND_SUBMIT` + ///< reports the timestamp at the time of the call to this function. + ///< Querying `UR_PROFILING_INFO_COMMAND_START` or `UR_PROFILING_INFO_COMMAND_END` + ///< reports the timestamp recorded when the command is executed on the device. 
+) { + auto pfnTimestampRecordingExp = + context.urDdiTable.EnqueueExp.pfnTimestampRecordingExp; + + if (nullptr == pfnTimestampRecordingExp) { + return UR_RESULT_ERROR_UNINITIALIZED; + } + + if (context.enableParameterValidation) { + if (NULL == hQueue) { + return UR_RESULT_ERROR_INVALID_NULL_HANDLE; + } + + if (NULL == phEvent) { + return UR_RESULT_ERROR_INVALID_NULL_POINTER; + } + + if (phEventWaitList != NULL && numEventsInWaitList > 0) { + for (uint32_t i = 0; i < numEventsInWaitList; ++i) { + if (phEventWaitList[i] == NULL) { + return UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST; + } + } + } + } + + if (context.enableLifetimeValidation && + !refCountContext.isReferenceValid(hQueue)) { + refCountContext.logInvalidReference(hQueue); + } + + ur_result_t result = pfnTimestampRecordingExp( + hQueue, blocking, numEventsInWaitList, phEventWaitList, phEvent); + + return result; +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Intercept function for urProgramBuildExp __urdlllocal ur_result_t UR_APICALL urProgramBuildExp( @@ -9672,6 +9731,10 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable( pDdiTable->pfnCooperativeKernelLaunchExp = ur_validation_layer::urEnqueueCooperativeKernelLaunchExp; + dditable.pfnTimestampRecordingExp = pDdiTable->pfnTimestampRecordingExp; + pDdiTable->pfnTimestampRecordingExp = + ur_validation_layer::urEnqueueTimestampRecordingExp; + return result; } diff --git a/source/loader/ur_ldrddi.cpp b/source/loader/ur_ldrddi.cpp index 7ca6bc42b9..71e0717a6d 100644 --- a/source/loader/ur_ldrddi.cpp +++ b/source/loader/ur_ldrddi.cpp @@ -7599,6 +7599,69 @@ __urdlllocal ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( return result; } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urEnqueueTimestampRecordingExp +__urdlllocal ur_result_t UR_APICALL urEnqueueTimestampRecordingExp( + ur_queue_handle_t hQueue, ///< 
[in] handle of the queue object + bool + blocking, ///< [in] indicates whether the call to this function should block until + ///< until the device timestamp recording command has executed on the + ///< device. + uint32_t numEventsInWaitList, ///< [in] size of the event wait list + const ur_event_handle_t * + phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the kernel execution. + ///< If nullptr, the numEventsInWaitList must be 0, indicating that no wait + ///< event. + ur_event_handle_t * + phEvent ///< [in,out] return an event object that identifies this particular kernel + ///< execution instance. Profiling information can be queried + ///< from this event as if `hQueue` had profiling enabled. Querying + ///< `UR_PROFILING_INFO_COMMAND_QUEUED` or `UR_PROFILING_INFO_COMMAND_SUBMIT` + ///< reports the timestamp at the time of the call to this function. + ///< Querying `UR_PROFILING_INFO_COMMAND_START` or `UR_PROFILING_INFO_COMMAND_END` + ///< reports the timestamp recorded when the command is executed on the device. 
+) { + ur_result_t result = UR_RESULT_SUCCESS; + + // extract platform's function pointer table + auto dditable = reinterpret_cast(hQueue)->dditable; + auto pfnTimestampRecordingExp = + dditable->ur.EnqueueExp.pfnTimestampRecordingExp; + if (nullptr == pfnTimestampRecordingExp) { + return UR_RESULT_ERROR_UNINITIALIZED; + } + + // convert loader handle to platform handle + hQueue = reinterpret_cast(hQueue)->handle; + + // convert loader handles to platform handles + auto phEventWaitListLocal = + std::vector(numEventsInWaitList); + for (size_t i = 0; i < numEventsInWaitList; ++i) { + phEventWaitListLocal[i] = + reinterpret_cast(phEventWaitList[i])->handle; + } + + // forward to device-platform + result = pfnTimestampRecordingExp(hQueue, blocking, numEventsInWaitList, + phEventWaitListLocal.data(), phEvent); + + if (UR_RESULT_SUCCESS != result) { + return result; + } + + try { + // convert platform handle to loader handle + *phEvent = reinterpret_cast( + ur_event_factory.getInstance(*phEvent, dditable)); + } catch (std::bad_alloc &) { + result = UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + } + + return result; +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Intercept function for urProgramBuildExp __urdlllocal ur_result_t UR_APICALL urProgramBuildExp( @@ -8319,6 +8382,8 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable( // return pointers to loader's DDIs pDdiTable->pfnCooperativeKernelLaunchExp = ur_loader::urEnqueueCooperativeKernelLaunchExp; + pDdiTable->pfnTimestampRecordingExp = + ur_loader::urEnqueueTimestampRecordingExp; } else { // return pointers directly to platform's DDIs *pDdiTable = diff --git a/source/loader/ur_libapi.cpp b/source/loader/ur_libapi.cpp index 7cbbdffb1c..ad6bf59e7c 100644 --- a/source/loader/ur_libapi.cpp +++ b/source/loader/ur_libapi.cpp @@ -842,7 +842,7 @@ ur_result_t UR_APICALL urDeviceGetSelected( /// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE /// + `NULL == hDevice` /// - 
::UR_RESULT_ERROR_INVALID_ENUMERATION -/// + `::UR_DEVICE_INFO_INTEROP_SEMAPHORE_EXPORT_SUPPORT_EXP < propName` +/// + `::UR_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP < propName` /// - ::UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION /// + If `propName` is not supported by the adapter. /// - ::UR_RESULT_ERROR_INVALID_SIZE @@ -8187,6 +8187,52 @@ ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( return exceptionToResult(std::current_exception()); } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Enqueue a command for recording the device timestamp +/// +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_UNINITIALIZED +/// - ::UR_RESULT_ERROR_DEVICE_LOST +/// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC +/// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE +/// + `NULL == hQueue` +/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER +/// + `NULL == phEvent` +/// - ::UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST +ur_result_t UR_APICALL urEnqueueTimestampRecordingExp( + ur_queue_handle_t hQueue, ///< [in] handle of the queue object + bool + blocking, ///< [in] indicates whether the call to this function should block until + ///< until the device timestamp recording command has executed on the + ///< device. + uint32_t numEventsInWaitList, ///< [in] size of the event wait list + const ur_event_handle_t * + phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the kernel execution. + ///< If nullptr, the numEventsInWaitList must be 0, indicating that no wait + ///< event. + ur_event_handle_t * + phEvent ///< [in,out] return an event object that identifies this particular kernel + ///< execution instance. Profiling information can be queried + ///< from this event as if `hQueue` had profiling enabled. 
Querying + ///< `UR_PROFILING_INFO_COMMAND_QUEUED` or `UR_PROFILING_INFO_COMMAND_SUBMIT` + ///< reports the timestamp at the time of the call to this function. + ///< Querying `UR_PROFILING_INFO_COMMAND_START` or `UR_PROFILING_INFO_COMMAND_END` + ///< reports the timestamp recorded when the command is executed on the device. + ) try { + auto pfnTimestampRecordingExp = + ur_lib::context->urDdiTable.EnqueueExp.pfnTimestampRecordingExp; + if (nullptr == pfnTimestampRecordingExp) { + return UR_RESULT_ERROR_UNINITIALIZED; + } + + return pfnTimestampRecordingExp(hQueue, blocking, numEventsInWaitList, + phEventWaitList, phEvent); +} catch (...) { + return exceptionToResult(std::current_exception()); +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Produces an executable program from one program, negates need for the /// linking step. diff --git a/source/loader/ur_print.cpp b/source/loader/ur_print.cpp index 1e8ad88086..e3c95f3b71 100644 --- a/source/loader/ur_print.cpp +++ b/source/loader/ur_print.cpp @@ -1645,6 +1645,14 @@ ur_result_t urPrintEnqueueCooperativeKernelLaunchExpParams( return str_copy(&ss, buffer, buff_size, out_size); } +ur_result_t urPrintEnqueueTimestampRecordingExpParams( + const struct ur_enqueue_timestamp_recording_exp_params_t *params, + char *buffer, const size_t buff_size, size_t *out_size) { + std::stringstream ss; + ss << params; + return str_copy(&ss, buffer, buff_size, out_size); +} + ur_result_t urPrintEventGetInfoParams(const struct ur_event_get_info_params_t *params, char *buffer, const size_t buff_size, diff --git a/source/ur_api.cpp b/source/ur_api.cpp index 665e75548b..760dd0378a 100644 --- a/source/ur_api.cpp +++ b/source/ur_api.cpp @@ -736,7 +736,7 @@ ur_result_t UR_APICALL urDeviceGetSelected( /// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE /// + `NULL == hDevice` /// - ::UR_RESULT_ERROR_INVALID_ENUMERATION -/// + `::UR_DEVICE_INFO_INTEROP_SEMAPHORE_EXPORT_SUPPORT_EXP < propName` +/// + 
`::UR_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP < propName` /// - ::UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION /// + If `propName` is not supported by the adapter. /// - ::UR_RESULT_ERROR_INVALID_SIZE @@ -6915,6 +6915,44 @@ ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( return result; } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Enqueue a command for recording the device timestamp +/// +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_UNINITIALIZED +/// - ::UR_RESULT_ERROR_DEVICE_LOST +/// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC +/// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE +/// + `NULL == hQueue` +/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER +/// + `NULL == phEvent` +/// - ::UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST +ur_result_t UR_APICALL urEnqueueTimestampRecordingExp( + ur_queue_handle_t hQueue, ///< [in] handle of the queue object + bool + blocking, ///< [in] indicates whether the call to this function should block until + ///< until the device timestamp recording command has executed on the + ///< device. + uint32_t numEventsInWaitList, ///< [in] size of the event wait list + const ur_event_handle_t * + phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the kernel execution. + ///< If nullptr, the numEventsInWaitList must be 0, indicating that no wait + ///< event. + ur_event_handle_t * + phEvent ///< [in,out] return an event object that identifies this particular kernel + ///< execution instance. Profiling information can be queried + ///< from this event as if `hQueue` had profiling enabled. Querying + ///< `UR_PROFILING_INFO_COMMAND_QUEUED` or `UR_PROFILING_INFO_COMMAND_SUBMIT` + ///< reports the timestamp at the time of the call to this function. 
+ ///< Querying `UR_PROFILING_INFO_COMMAND_START` or `UR_PROFILING_INFO_COMMAND_END` + ///< reports the timestamp recorded when the command is executed on the device. +) { + ur_result_t result = UR_RESULT_SUCCESS; + return result; +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Produces an executable program from one program, negates need for the /// linking step. diff --git a/test/conformance/enqueue/CMakeLists.txt b/test/conformance/enqueue/CMakeLists.txt index 532cab1b85..7cc68203a0 100644 --- a/test/conformance/enqueue/CMakeLists.txt +++ b/test/conformance/enqueue/CMakeLists.txt @@ -29,4 +29,5 @@ add_conformance_test_with_kernels_environment(enqueue urEnqueueUSMPrefetch.cpp urEnqueueReadHostPipe.cpp urEnqueueWriteHostPipe.cpp + urEnqueueTimestampRecording.cpp ) diff --git a/test/conformance/enqueue/urEnqueueTimestampRecording.cpp b/test/conformance/enqueue/urEnqueueTimestampRecording.cpp new file mode 100644 index 0000000000..5fc8ee5547 --- /dev/null +++ b/test/conformance/enqueue/urEnqueueTimestampRecording.cpp @@ -0,0 +1,94 @@ +// Copyright (C) 2024 Intel Corporation +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. +// See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include + +struct urEnqueueTimestampRecordingExpTest : uur::urQueueTest { + void SetUp() override { + UUR_RETURN_ON_FATAL_FAILURE(urQueueTest::SetUp()); + bool timestamp_recording_support = false; + ASSERT_SUCCESS(uur::GetTimestampRecordingSupport( + device, timestamp_recording_support)); + if (!timestamp_recording_support) { + GTEST_SKIP() << "Timestamp recording is not supported"; + } + } + + void TearDown() override { urQueueTest::TearDown(); } +}; +UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(urEnqueueTimestampRecordingExpTest); + +void common_check(ur_event_handle_t event) { + // All successful runs should return a non-zero profiling results. 
+ uint64_t queuedTime = 0, submitTime = 0, startTime = 0, endTime = 0; + ASSERT_SUCCESS( + urEventGetProfilingInfo(event, UR_PROFILING_INFO_COMMAND_QUEUED, + sizeof(uint64_t), &queuedTime, nullptr)); + ASSERT_SUCCESS( + urEventGetProfilingInfo(event, UR_PROFILING_INFO_COMMAND_SUBMIT, + sizeof(uint64_t), &submitTime, nullptr)); + ASSERT_SUCCESS( + urEventGetProfilingInfo(event, UR_PROFILING_INFO_COMMAND_START, + sizeof(uint64_t), &startTime, nullptr)); + ASSERT_SUCCESS(urEventGetProfilingInfo(event, UR_PROFILING_INFO_COMMAND_END, + sizeof(uint64_t), &endTime, + nullptr)); + ASSERT_TRUE(queuedTime > 0); + ASSERT_TRUE(submitTime > 0); + ASSERT_TRUE(startTime > 0); + ASSERT_TRUE(endTime > 0); + ASSERT_TRUE(queuedTime == submitTime); + ASSERT_TRUE(startTime == endTime); + ASSERT_TRUE(endTime >= submitTime); +} + +TEST_P(urEnqueueTimestampRecordingExpTest, Success) { + ur_event_handle_t event = nullptr; + ASSERT_SUCCESS( + urEnqueueTimestampRecordingExp(queue, false, 0, nullptr, &event)); + ASSERT_SUCCESS(urQueueFinish(queue)); + common_check(event); + ASSERT_SUCCESS(urEventRelease(event)); +} + +TEST_P(urEnqueueTimestampRecordingExpTest, SuccessBlocking) { + ur_event_handle_t event = nullptr; + ASSERT_SUCCESS( + urEnqueueTimestampRecordingExp(queue, true, 0, nullptr, &event)); + common_check(event); + ASSERT_SUCCESS(urEventRelease(event)); +} + +TEST_P(urEnqueueTimestampRecordingExpTest, InvalidNullHandleQueue) { + ur_event_handle_t event = nullptr; + ASSERT_EQ_RESULT( + urEnqueueTimestampRecordingExp(nullptr, false, 0, nullptr, &event), + UR_RESULT_ERROR_INVALID_NULL_HANDLE); +} + +TEST_P(urEnqueueTimestampRecordingExpTest, InvalidNullPointerEvent) { + ASSERT_EQ_RESULT( + urEnqueueTimestampRecordingExp(queue, false, 0, nullptr, nullptr), + UR_RESULT_ERROR_INVALID_NULL_POINTER); +} + +TEST_P(urEnqueueTimestampRecordingExpTest, InvalidNullPtrEventWaitList) { + ur_event_handle_t event = nullptr; + ASSERT_EQ_RESULT( + urEnqueueTimestampRecordingExp(queue, true, 1, nullptr, 
&event), + UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST); + + ur_event_handle_t validEvent; + ASSERT_SUCCESS(urEnqueueEventsWait(queue, 0, nullptr, &validEvent)); + ASSERT_EQ_RESULT( + urEnqueueTimestampRecordingExp(queue, true, 0, &validEvent, &event), + UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST); + ASSERT_SUCCESS(urEventRelease(validEvent)); + + ur_event_handle_t invalidEvent = nullptr; + ASSERT_EQ_RESULT( + urEnqueueTimestampRecordingExp(queue, true, 0, &invalidEvent, &event), + UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST); +} diff --git a/test/conformance/testing/include/uur/utils.h b/test/conformance/testing/include/uur/utils.h index 79620e4b11..a1eab980d4 100644 --- a/test/conformance/testing/include/uur/utils.h +++ b/test/conformance/testing/include/uur/utils.h @@ -395,6 +395,8 @@ ur_result_t GetDeviceMaxComputeQueueIndices(ur_device_handle_t device, uint32_t &max_indices); ur_result_t GetDeviceHostPipeRWSupported(ur_device_handle_t device, bool &support); +ur_result_t GetTimestampRecordingSupport(ur_device_handle_t device, + bool &support); ur_device_partition_property_t makePartitionByCountsDesc(uint32_t count); diff --git a/test/conformance/testing/source/utils.cpp b/test/conformance/testing/source/utils.cpp index de1c33be08..0cff6100d1 100644 --- a/test/conformance/testing/source/utils.cpp +++ b/test/conformance/testing/source/utils.cpp @@ -635,6 +635,12 @@ ur_result_t GetDeviceHostPipeRWSupported(ur_device_handle_t device, device, UR_DEVICE_INFO_HOST_PIPE_READ_WRITE_SUPPORTED, support); } +ur_result_t GetTimestampRecordingSupport(ur_device_handle_t device, + bool &support) { + return GetDeviceInfo( + device, UR_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP, support); +} + ur_device_partition_property_t makePartitionByCountsDesc(uint32_t count) { ur_device_partition_property_t desc; desc.type = UR_DEVICE_PARTITION_BY_COUNTS; diff --git a/tools/urinfo/urinfo.hpp b/tools/urinfo/urinfo.hpp index 15894cafb8..cc51799178 100644 --- a/tools/urinfo/urinfo.hpp +++ 
b/tools/urinfo/urinfo.hpp @@ -378,5 +378,8 @@ inline void printDeviceInfos(ur_device_handle_t hDevice, std::cout << prefix; printDeviceInfo( hDevice, UR_DEVICE_INFO_INTEROP_SEMAPHORE_EXPORT_SUPPORT_EXP); + std::cout << prefix; + printDeviceInfo(hDevice, + UR_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP); } } // namespace urinfo From 578f347076c4aac7acead2269347b36383aa784c Mon Sep 17 00:00:00 2001 From: "Larsen, Steffen" Date: Thu, 29 Feb 2024 09:33:36 -0800 Subject: [PATCH 02/18] Address comments Signed-off-by: Larsen, Steffen --- source/adapters/cuda/enqueue.cpp | 2 +- source/adapters/hip/enqueue.cpp | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/source/adapters/cuda/enqueue.cpp b/source/adapters/cuda/enqueue.cpp index b802aff580..e550190333 100644 --- a/source/adapters/cuda/enqueue.cpp +++ b/source/adapters/cuda/enqueue.cpp @@ -1764,7 +1764,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueTimestampRecordingExp( ur_result_t Result = UR_RESULT_SUCCESS; std::unique_ptr RetImplEvent{nullptr}; try { - ScopedContext Active(hQueue->getContext()); + ScopedContext Active(hQueue->getDevice()); CUstream CuStream = hQueue->getNextComputeStream(); UR_CHECK_ERROR(enqueueEventsWait(hQueue, CuStream, numEventsInWaitList, diff --git a/source/adapters/hip/enqueue.cpp b/source/adapters/hip/enqueue.cpp index e819f119ca..a9c6a5b032 100644 --- a/source/adapters/hip/enqueue.cpp +++ b/source/adapters/hip/enqueue.cpp @@ -1903,6 +1903,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueTimestampRecordingExp( ur_result_t Result = UR_RESULT_SUCCESS; std::unique_ptr RetImplEvent{nullptr}; try { + ScopedContext Active(hQueue->getDevice()); + uint32_t StreamToken; ur_stream_quard Guard; hipStream_t HIPStream = hQueue->getNextComputeStream( From b7b0a6f54444cc189db4dd53416c39d649d94146 Mon Sep 17 00:00:00 2001 From: "Larsen, Steffen" Date: Thu, 29 Feb 2024 10:02:26 -0800 Subject: [PATCH 03/18] Revert CUDA change Signed-off-by: Larsen, Steffen --- 
source/adapters/cuda/enqueue.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/adapters/cuda/enqueue.cpp b/source/adapters/cuda/enqueue.cpp index e550190333..b802aff580 100644 --- a/source/adapters/cuda/enqueue.cpp +++ b/source/adapters/cuda/enqueue.cpp @@ -1764,7 +1764,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueTimestampRecordingExp( ur_result_t Result = UR_RESULT_SUCCESS; std::unique_ptr RetImplEvent{nullptr}; try { - ScopedContext Active(hQueue->getDevice()); + ScopedContext Active(hQueue->getContext()); CUstream CuStream = hQueue->getNextComputeStream(); UR_CHECK_ERROR(enqueueEventsWait(hQueue, CuStream, numEventsInWaitList, From e08bf5256a109d2609f6b38479199cabd4f739b8 Mon Sep 17 00:00:00 2001 From: "Larsen, Steffen" Date: Thu, 29 Feb 2024 22:50:05 -0800 Subject: [PATCH 04/18] Correctly enable recordings in HIP and CUDA for timestamp events Signed-off-by: Larsen, Steffen --- source/adapters/cuda/event.cpp | 7 ++++--- source/adapters/cuda/event.hpp | 4 ++++ source/adapters/hip/event.cpp | 10 ++++++---- source/adapters/hip/event.hpp | 4 ++++ 4 files changed, 18 insertions(+), 7 deletions(-) diff --git a/source/adapters/cuda/event.cpp b/source/adapters/cuda/event.cpp index 804b35a9b7..373183259b 100644 --- a/source/adapters/cuda/event.cpp +++ b/source/adapters/cuda/event.cpp @@ -54,7 +54,7 @@ ur_result_t ur_event_handle_t_::start() { ur_result_t Result = UR_RESULT_SUCCESS; try { - if (Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE) { + if (Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE || isTimestampEvent()) { // NOTE: This relies on the default stream to be unused. 
UR_CHECK_ERROR(cuEventRecord(EvQueued, 0)); UR_CHECK_ERROR(cuEventRecord(EvStart, Stream)); @@ -149,7 +149,7 @@ ur_result_t ur_event_handle_t_::release() { UR_CHECK_ERROR(cuEventDestroy(EvEnd)); - if (Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE) { + if (Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE || isTimestampEvent()) { UR_CHECK_ERROR(cuEventDestroy(EvQueued)); UR_CHECK_ERROR(cuEventDestroy(EvStart)); } @@ -190,7 +190,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventGetProfilingInfo( UrReturnHelper ReturnValue(propValueSize, pPropValue, pPropValueSizeRet); ur_queue_handle_t Queue = hEvent->getQueue(); - if (Queue == nullptr || !(Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE)) { + if (Queue == nullptr || !(Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE) || + !isTimestampEvent()) { return UR_RESULT_ERROR_PROFILING_INFO_NOT_AVAILABLE; } diff --git a/source/adapters/cuda/event.hpp b/source/adapters/cuda/event.hpp index 390fd7833a..8fa5530e1b 100644 --- a/source/adapters/cuda/event.hpp +++ b/source/adapters/cuda/event.hpp @@ -57,6 +57,10 @@ struct ur_event_handle_t_ { return UR_EVENT_STATUS_COMPLETE; } + bool isTimestampEvent() const noexcept { + return getCommandType() == UR_COMMAND_TIMESTAMP_RECORDING_EXP; + } + ur_context_handle_t getContext() const noexcept { return Context; }; uint32_t incrementReferenceCount() { return ++RefCount; } diff --git a/source/adapters/hip/event.cpp b/source/adapters/hip/event.cpp index 313212724a..15d7f80275 100644 --- a/source/adapters/hip/event.cpp +++ b/source/adapters/hip/event.cpp @@ -22,7 +22,8 @@ ur_event_handle_t_::ur_event_handle_t_(ur_command_t Type, StreamToken{StreamToken}, EventId{0}, EvEnd{nullptr}, EvStart{nullptr}, EvQueued{nullptr}, Queue{Queue}, Stream{Stream}, Context{Context} { - bool ProfilingEnabled = Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE; + bool ProfilingEnabled = + Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE || isTimestampEvent(); UR_CHECK_ERROR(hipEventCreateWithFlags( &EvEnd, 
ProfilingEnabled ? hipEventDefault : hipEventDisableTiming)); @@ -58,7 +59,7 @@ ur_result_t ur_event_handle_t_::start() { ur_result_t Result = UR_RESULT_SUCCESS; try { - if (Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE) { + if (Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE || isTimestampEvent()) { // NOTE: This relies on the default stream to be unused. UR_CHECK_ERROR(hipEventRecord(EvQueued, 0)); UR_CHECK_ERROR(hipEventRecord(EvStart, Queue->get())); @@ -177,7 +178,7 @@ ur_result_t ur_event_handle_t_::release() { assert(Queue != nullptr); UR_CHECK_ERROR(hipEventDestroy(EvEnd)); - if (Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE) { + if (Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE || isTimestampEvent()) { UR_CHECK_ERROR(hipEventDestroy(EvQueued)); UR_CHECK_ERROR(hipEventDestroy(EvStart)); } @@ -249,7 +250,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventGetProfilingInfo( UR_ASSERT(!(pPropValue && propValueSize == 0), UR_RESULT_ERROR_INVALID_VALUE); ur_queue_handle_t Queue = hEvent->getQueue(); - if (Queue == nullptr || !(Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE)) { + if (Queue == nullptr || !(Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE) || + !isTimestampEvent()) { return UR_RESULT_ERROR_PROFILING_INFO_NOT_AVAILABLE; } diff --git a/source/adapters/hip/event.hpp b/source/adapters/hip/event.hpp index 50de73b14f..64e8b2d9c8 100644 --- a/source/adapters/hip/event.hpp +++ b/source/adapters/hip/event.hpp @@ -55,6 +55,10 @@ struct ur_event_handle_t_ { return UR_EVENT_STATUS_COMPLETE; } + bool isTimestampEvent() const noexcept { + return getCommandType() == UR_COMMAND_TIMESTAMP_RECORDING_EXP; + } + ur_context_handle_t getContext() const noexcept { return Context; }; uint32_t incrementReferenceCount() { return ++RefCount; } From bf5ea14bfc05af4315b695c201c23f405237d0d8 Mon Sep 17 00:00:00 2001 From: "Larsen, Steffen" Date: Thu, 29 Feb 2024 23:13:41 -0800 Subject: [PATCH 05/18] Fix use of event handle Signed-off-by: Larsen, Steffen --- 
source/adapters/cuda/event.cpp | 2 +- source/adapters/hip/event.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/source/adapters/cuda/event.cpp b/source/adapters/cuda/event.cpp index 373183259b..b6c6680a3d 100644 --- a/source/adapters/cuda/event.cpp +++ b/source/adapters/cuda/event.cpp @@ -191,7 +191,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventGetProfilingInfo( ur_queue_handle_t Queue = hEvent->getQueue(); if (Queue == nullptr || !(Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE) || - !isTimestampEvent()) { + !hEvent->isTimestampEvent()) { return UR_RESULT_ERROR_PROFILING_INFO_NOT_AVAILABLE; } diff --git a/source/adapters/hip/event.cpp b/source/adapters/hip/event.cpp index 15d7f80275..9de1036709 100644 --- a/source/adapters/hip/event.cpp +++ b/source/adapters/hip/event.cpp @@ -251,7 +251,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventGetProfilingInfo( ur_queue_handle_t Queue = hEvent->getQueue(); if (Queue == nullptr || !(Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE) || - !isTimestampEvent()) { + !hEvent->isTimestampEvent()) { return UR_RESULT_ERROR_PROFILING_INFO_NOT_AVAILABLE; } From 206c4b142c04f5473b69cf4440f21ac70e967486 Mon Sep 17 00:00:00 2001 From: "Larsen, Steffen" Date: Thu, 29 Feb 2024 23:27:50 -0800 Subject: [PATCH 06/18] Fix faulty disjunction Signed-off-by: Larsen, Steffen --- source/adapters/cuda/event.cpp | 2 +- source/adapters/hip/event.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/source/adapters/cuda/event.cpp b/source/adapters/cuda/event.cpp index b6c6680a3d..ed73dccf2b 100644 --- a/source/adapters/cuda/event.cpp +++ b/source/adapters/cuda/event.cpp @@ -190,7 +190,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventGetProfilingInfo( UrReturnHelper ReturnValue(propValueSize, pPropValue, pPropValueSizeRet); ur_queue_handle_t Queue = hEvent->getQueue(); - if (Queue == nullptr || !(Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE) || + if (Queue == nullptr || !(Queue->URFlags & 
UR_QUEUE_FLAG_PROFILING_ENABLE) && !hEvent->isTimestampEvent()) { return UR_RESULT_ERROR_PROFILING_INFO_NOT_AVAILABLE; } diff --git a/source/adapters/hip/event.cpp b/source/adapters/hip/event.cpp index 9de1036709..2329efb33a 100644 --- a/source/adapters/hip/event.cpp +++ b/source/adapters/hip/event.cpp @@ -250,7 +250,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventGetProfilingInfo( UR_ASSERT(!(pPropValue && propValueSize == 0), UR_RESULT_ERROR_INVALID_VALUE); ur_queue_handle_t Queue = hEvent->getQueue(); - if (Queue == nullptr || !(Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE) || + if (Queue == nullptr || !(Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE) && !hEvent->isTimestampEvent()) { return UR_RESULT_ERROR_PROFILING_INFO_NOT_AVAILABLE; } From b6909206b464e8b1fd15a25100944ee904236ae4 Mon Sep 17 00:00:00 2001 From: "Larsen, Steffen" Date: Thu, 29 Feb 2024 23:31:51 -0800 Subject: [PATCH 07/18] Fix faulty disjunction 2 Signed-off-by: Larsen, Steffen --- source/adapters/cuda/event.cpp | 4 ++-- source/adapters/hip/event.cpp | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/source/adapters/cuda/event.cpp b/source/adapters/cuda/event.cpp index ed73dccf2b..c2c28f40d8 100644 --- a/source/adapters/cuda/event.cpp +++ b/source/adapters/cuda/event.cpp @@ -190,8 +190,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventGetProfilingInfo( UrReturnHelper ReturnValue(propValueSize, pPropValue, pPropValueSizeRet); ur_queue_handle_t Queue = hEvent->getQueue(); - if (Queue == nullptr || !(Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE) && - !hEvent->isTimestampEvent()) { + if (Queue == nullptr || (!(Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE) && + !hEvent->isTimestampEvent())) { return UR_RESULT_ERROR_PROFILING_INFO_NOT_AVAILABLE; } diff --git a/source/adapters/hip/event.cpp b/source/adapters/hip/event.cpp index 2329efb33a..e871f8dad2 100644 --- a/source/adapters/hip/event.cpp +++ b/source/adapters/hip/event.cpp @@ -250,8 +250,8 @@ UR_APIEXPORT ur_result_t 
UR_APICALL urEventGetProfilingInfo( UR_ASSERT(!(pPropValue && propValueSize == 0), UR_RESULT_ERROR_INVALID_VALUE); ur_queue_handle_t Queue = hEvent->getQueue(); - if (Queue == nullptr || !(Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE) && - !hEvent->isTimestampEvent()) { + if (Queue == nullptr || (!(Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE) && + !hEvent->isTimestampEvent())) { return UR_RESULT_ERROR_PROFILING_INFO_NOT_AVAILABLE; } From 5db5ba85978ac5237faecdc974e314c96b8dc216 Mon Sep 17 00:00:00 2001 From: "Larsen, Steffen" Date: Fri, 1 Mar 2024 02:45:52 -0800 Subject: [PATCH 08/18] Allow event creation to record timing for events Signed-off-by: Larsen, Steffen --- source/adapters/cuda/event.hpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/source/adapters/cuda/event.hpp b/source/adapters/cuda/event.hpp index 8fa5530e1b..5ed68f0f25 100644 --- a/source/adapters/cuda/event.hpp +++ b/source/adapters/cuda/event.hpp @@ -87,13 +87,14 @@ struct ur_event_handle_t_ { static ur_event_handle_t makeNative(ur_command_t Type, ur_queue_handle_t Queue, CUstream Stream, uint32_t StreamToken = std::numeric_limits<uint32_t>::max()) { - const bool ProfilingEnabled = - Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE; + const bool RequiresTimings = + Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE || + Type == UR_COMMAND_TIMESTAMP_RECORDING_EXP; native_type EvEnd = nullptr, EvQueued = nullptr, EvStart = nullptr; UR_CHECK_ERROR(cuEventCreate( - &EvEnd, ProfilingEnabled ? CU_EVENT_DEFAULT : CU_EVENT_DISABLE_TIMING)); + &EvEnd, RequiresTimings ?
CU_EVENT_DEFAULT : CU_EVENT_DISABLE_TIMING)); - if (ProfilingEnabled) { + if (RequiresTimings) { UR_CHECK_ERROR(cuEventCreate(&EvQueued, CU_EVENT_DEFAULT)); UR_CHECK_ERROR(cuEventCreate(&EvStart, CU_EVENT_DEFAULT)); } From 2f0e0509ab644895e7e587d19923b37ad8ca0cd1 Mon Sep 17 00:00:00 2001 From: "Larsen, Steffen" Date: Tue, 5 Mar 2024 01:11:28 -0800 Subject: [PATCH 09/18] Address comments Signed-off-by: Larsen, Steffen --- scripts/core/EXP-ENQUEUE-TIMESTAMP-RECORDING.rst | 2 +- scripts/core/exp-enqueue-timestamp-recording.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/core/EXP-ENQUEUE-TIMESTAMP-RECORDING.rst b/scripts/core/EXP-ENQUEUE-TIMESTAMP-RECORDING.rst index 25407d51d6..3cf46095b2 100644 --- a/scripts/core/EXP-ENQUEUE-TIMESTAMP-RECORDING.rst +++ b/scripts/core/EXP-ENQUEUE-TIMESTAMP-RECORDING.rst @@ -23,7 +23,7 @@ Enqueue Timestamp Recording Motivation -------------------------------------------------------------------------------- Currently, the only way to get timestamp information is through enabling -profiling on a queue and retrieve the information from events coming from +profiling on a queue and retrieving the information from events coming from commands submitted to it. However, not all systems give full control of the queue construction to the programmer wanting the profiling information. To amend this, this extension adds the ability to enqueue a timestamp recording on any diff --git a/scripts/core/exp-enqueue-timestamp-recording.yml b/scripts/core/exp-enqueue-timestamp-recording.yml index 16fa700ff2..cac27ac9bd 100644 --- a/scripts/core/exp-enqueue-timestamp-recording.yml +++ b/scripts/core/exp-enqueue-timestamp-recording.yml @@ -52,7 +52,7 @@ params: name: phEventWaitList desc: | [in][optional][range(0, numEventsInWaitList)] pointer to a list of events that must be complete before the kernel execution. - If nullptr, the numEventsInWaitList must be 0, indicating that no wait event. 
+ If nullptr, the numEventsInWaitList must be 0, indicating no wait events. - type: $x_event_handle_t* name: phEvent desc: | From 5caceaf053cb43414b8ba208635695abea0046e6 Mon Sep 17 00:00:00 2001 From: "Larsen, Steffen" Date: Tue, 12 Mar 2024 09:01:00 -0700 Subject: [PATCH 10/18] Amend comments Signed-off-by: Larsen, Steffen --- include/ur_api.h | 4 ++-- source/adapters/null/ur_nullddi.cpp | 4 ++-- source/loader/layers/tracing/ur_trcddi.cpp | 4 ++-- source/loader/layers/validation/ur_valddi.cpp | 4 ++-- source/loader/ur_ldrddi.cpp | 4 ++-- source/loader/ur_libapi.cpp | 4 ++-- source/ur_api.cpp | 4 ++-- 7 files changed, 14 insertions(+), 14 deletions(-) diff --git a/include/ur_api.h b/include/ur_api.h index ce287105ec..229dc5d5a9 100644 --- a/include/ur_api.h +++ b/include/ur_api.h @@ -8774,8 +8774,8 @@ urEnqueueTimestampRecordingExp( uint32_t numEventsInWaitList, ///< [in] size of the event wait list const ur_event_handle_t *phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of ///< events that must be complete before the kernel execution. - ///< If nullptr, the numEventsInWaitList must be 0, indicating that no wait - ///< event. + ///< If nullptr, the numEventsInWaitList must be 0, indicating no wait + ///< events. ur_event_handle_t *phEvent ///< [in,out] return an event object that identifies this particular kernel ///< execution instance. Profiling information can be queried ///< from this event as if `hQueue` had profiling enabled. Querying diff --git a/source/adapters/null/ur_nullddi.cpp b/source/adapters/null/ur_nullddi.cpp index ea49a6c441..e48ba614e3 100644 --- a/source/adapters/null/ur_nullddi.cpp +++ b/source/adapters/null/ur_nullddi.cpp @@ -5480,8 +5480,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueTimestampRecordingExp( const ur_event_handle_t * phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of ///< events that must be complete before the kernel execution. 
- ///< If nullptr, the numEventsInWaitList must be 0, indicating that no wait - ///< event. + ///< If nullptr, the numEventsInWaitList must be 0, indicating no wait + ///< events. ur_event_handle_t * phEvent ///< [in,out] return an event object that identifies this particular kernel ///< execution instance. Profiling information can be queried diff --git a/source/loader/layers/tracing/ur_trcddi.cpp b/source/loader/layers/tracing/ur_trcddi.cpp index a3ce985777..8b6c321f13 100644 --- a/source/loader/layers/tracing/ur_trcddi.cpp +++ b/source/loader/layers/tracing/ur_trcddi.cpp @@ -6081,8 +6081,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueTimestampRecordingExp( const ur_event_handle_t * phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of ///< events that must be complete before the kernel execution. - ///< If nullptr, the numEventsInWaitList must be 0, indicating that no wait - ///< event. + ///< If nullptr, the numEventsInWaitList must be 0, indicating no wait + ///< events. ur_event_handle_t * phEvent ///< [in,out] return an event object that identifies this particular kernel ///< execution instance. Profiling information can be queried diff --git a/source/loader/layers/validation/ur_valddi.cpp b/source/loader/layers/validation/ur_valddi.cpp index edd2f56506..76c6f40b23 100644 --- a/source/loader/layers/validation/ur_valddi.cpp +++ b/source/loader/layers/validation/ur_valddi.cpp @@ -8875,8 +8875,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueTimestampRecordingExp( const ur_event_handle_t * phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of ///< events that must be complete before the kernel execution. - ///< If nullptr, the numEventsInWaitList must be 0, indicating that no wait - ///< event. + ///< If nullptr, the numEventsInWaitList must be 0, indicating no wait + ///< events. 
ur_event_handle_t * phEvent ///< [in,out] return an event object that identifies this particular kernel ///< execution instance. Profiling information can be queried diff --git a/source/loader/ur_ldrddi.cpp b/source/loader/ur_ldrddi.cpp index 71e0717a6d..5616e9196d 100644 --- a/source/loader/ur_ldrddi.cpp +++ b/source/loader/ur_ldrddi.cpp @@ -7611,8 +7611,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueTimestampRecordingExp( const ur_event_handle_t * phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of ///< events that must be complete before the kernel execution. - ///< If nullptr, the numEventsInWaitList must be 0, indicating that no wait - ///< event. + ///< If nullptr, the numEventsInWaitList must be 0, indicating no wait + ///< events. ur_event_handle_t * phEvent ///< [in,out] return an event object that identifies this particular kernel ///< execution instance. Profiling information can be queried diff --git a/source/loader/ur_libapi.cpp b/source/loader/ur_libapi.cpp index ad6bf59e7c..ff896550b7 100644 --- a/source/loader/ur_libapi.cpp +++ b/source/loader/ur_libapi.cpp @@ -8210,8 +8210,8 @@ ur_result_t UR_APICALL urEnqueueTimestampRecordingExp( const ur_event_handle_t * phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of ///< events that must be complete before the kernel execution. - ///< If nullptr, the numEventsInWaitList must be 0, indicating that no wait - ///< event. + ///< If nullptr, the numEventsInWaitList must be 0, indicating no wait + ///< events. ur_event_handle_t * phEvent ///< [in,out] return an event object that identifies this particular kernel ///< execution instance. 
Profiling information can be queried diff --git a/source/ur_api.cpp b/source/ur_api.cpp index 760dd0378a..d32567a8bb 100644 --- a/source/ur_api.cpp +++ b/source/ur_api.cpp @@ -6938,8 +6938,8 @@ ur_result_t UR_APICALL urEnqueueTimestampRecordingExp( const ur_event_handle_t * phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of ///< events that must be complete before the kernel execution. - ///< If nullptr, the numEventsInWaitList must be 0, indicating that no wait - ///< event. + ///< If nullptr, the numEventsInWaitList must be 0, indicating no wait + ///< events. ur_event_handle_t * phEvent ///< [in,out] return an event object that identifies this particular kernel ///< execution instance. Profiling information can be queried From d5aaba0cb76d3e2f2f35945b688cc50a0337bea1 Mon Sep 17 00:00:00 2001 From: "Larsen, Steffen" Date: Mon, 8 Apr 2024 08:50:39 -0700 Subject: [PATCH 11/18] Fix merge mistake Signed-off-by: Larsen, Steffen --- include/ur_print.hpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/ur_print.hpp b/include/ur_print.hpp index bad74abf12..93c3b056ac 100644 --- a/include/ur_print.hpp +++ b/include/ur_print.hpp @@ -921,6 +921,8 @@ inline std::ostream &operator<<(std::ostream &os, enum ur_function_t value) { os << "UR_FUNCTION_ENQUEUE_TIMESTAMP_RECORDING_EXP"; break; default: + os << "unknown enumerator"; + break; } return os; } From 5fd441e004e492fa4970995f00be5b17566614ea Mon Sep 17 00:00:00 2001 From: "Larsen, Steffen" Date: Tue, 16 Apr 2024 02:16:07 -0700 Subject: [PATCH 12/18] Move timestamp query to after commandlist get Signed-off-by: Larsen, Steffen --- source/adapters/level_zero/event.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/source/adapters/level_zero/event.cpp b/source/adapters/level_zero/event.cpp index bd406311e9..d2f4d3e482 100644 --- a/source/adapters/level_zero/event.cpp +++ b/source/adapters/level_zero/event.cpp @@ -664,9 +664,6 @@ UR_APIEXPORT ur_result_t 
UR_APICALL urEnqueueTimestampRecordingExp( ur_device_handle_t Device = Queue->Device; - uint64_t DeviceStartTimestamp = 0; - UR_CALL(urDeviceGetGlobalTimestamps(Device, &DeviceStartTimestamp, nullptr)); - bool UseCopyEngine = false; _ur_ze_event_list_t TmpWaitList; UR_CALL(TmpWaitList.createAndRetainUrZeEventList( @@ -682,6 +679,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueTimestampRecordingExp( /* IsInternal */ false, /* HostVisible */ true)); ze_event_handle_t ZeEvent = (*OutEvent)->ZeEvent; (*OutEvent)->WaitList = TmpWaitList; + + uint64_t DeviceStartTimestamp = 0; + UR_CALL(urDeviceGetGlobalTimestamps(Device, &DeviceStartTimestamp, nullptr)); (*OutEvent)->RecordEventStartTimestamp = DeviceStartTimestamp; // Allocate new entry in the queue's recordings. From 2404fe647d40777defbd738156294c328f796aa1 Mon Sep 17 00:00:00 2001 From: "Larsen, Steffen" Date: Tue, 16 Apr 2024 02:33:01 -0700 Subject: [PATCH 13/18] Stop making new heap allocations for each recording Signed-off-by: Larsen, Steffen --- source/adapters/level_zero/event.cpp | 20 +++++++++----------- source/adapters/level_zero/queue.cpp | 3 +-- source/adapters/level_zero/queue.hpp | 6 ++---- 3 files changed, 12 insertions(+), 17 deletions(-) diff --git a/source/adapters/level_zero/event.cpp b/source/adapters/level_zero/event.cpp index d2f4d3e482..a6c8b8cc66 100644 --- a/source/adapters/level_zero/event.cpp +++ b/source/adapters/level_zero/event.cpp @@ -506,7 +506,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventGetProfilingInfo( // End time needs to be adjusted for resolution and valid bits. uint64_t ContextEndTime = - ((*EndTimeRecording.RecordEventEndTimestamp) & TimestampMaxValue) * + (EndTimeRecording.RecordEventEndTimestamp & TimestampMaxValue) * ZeTimerResolution; // If the result is 0, we have not yet gotten results back and so we just @@ -525,7 +525,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventGetProfilingInfo( // anymore, so we cache it on the event and evict the record from the // queue. 
Event->RecordEventEndTimestamp = ContextEndTime; - free(EndTimeRecording.RecordEventEndTimestamp); Event->UrQueue->EndTimeRecordings.erase(Entry); return ReturnValue(ContextEndTime); @@ -684,15 +683,15 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueTimestampRecordingExp( UR_CALL(urDeviceGetGlobalTimestamps(Device, &DeviceStartTimestamp, nullptr)); (*OutEvent)->RecordEventStartTimestamp = DeviceStartTimestamp; - // Allocate new entry in the queue's recordings. - uint64_t *EndTimestampPtr = (uint64_t *)malloc(sizeof(uint64_t)); - *EndTimestampPtr = 0; - Queue->EndTimeRecordings[*OutEvent] = ur_queue_handle_t_::end_time_recording{ - EndTimestampPtr, /*EventHasDied=*/false}; + // Create a new entry in the queue's recordings. + Queue->EndTimeRecordings[*OutEvent] = + ur_queue_handle_t_::end_time_recording{}; ZE2UR_CALL(zeCommandListAppendWriteGlobalTimestamp, - (CommandList->first, EndTimestampPtr, ZeEvent, - (*OutEvent)->WaitList.Length, (*OutEvent)->WaitList.ZeEventList)); + (CommandList->first, + &Queue->EndTimeRecordings[*OutEvent].RecordEventEndTimestamp, + ZeEvent, (*OutEvent)->WaitList.Length, + (*OutEvent)->WaitList.ZeEventList)); UR_CALL( Queue->executeCommandList(CommandList, Blocking, /* OkToBatch */ false)); @@ -999,13 +998,12 @@ ur_result_t urEventReleaseInternal(ur_event_handle_t Event) { auto Entry = Queue->EndTimeRecordings.find(Event); if (Entry != Queue->EndTimeRecordings.end()) { auto &EndTimeRecording = Entry->second; - if ((EndTimeRecording.RecordEventEndTimestamp) == 0) { + if (EndTimeRecording.RecordEventEndTimestamp == 0) { // If the end time recording has not finished, we tell the queue that // the event is no longer alive to avoid invalid write-backs. EndTimeRecording.EventHasDied = true; } else { // Otherwise we evict the entry. 
- free(EndTimeRecording.RecordEventEndTimestamp); Event->UrQueue->EndTimeRecordings.erase(Entry); } } diff --git a/source/adapters/level_zero/queue.cpp b/source/adapters/level_zero/queue.cpp index a8765cb6c7..f144353dd1 100644 --- a/source/adapters/level_zero/queue.cpp +++ b/source/adapters/level_zero/queue.cpp @@ -1309,7 +1309,7 @@ void ur_queue_handle_t_::clearEndTimeRecordings() { if (!Entry.second.EventHasDied) { // Write the result back to the event if it is not dead. uint64_t ContextEndTime = - ((*EndTimeRecording.RecordEventEndTimestamp) & TimestampMaxValue) * + (EndTimeRecording.RecordEventEndTimestamp & TimestampMaxValue) * ZeTimerResolution; // Handle a possible wrap-around (the underlying HW counter is < 64-bit). @@ -1322,7 +1322,6 @@ void ur_queue_handle_t_::clearEndTimeRecordings() { // Store it in the event. Event->RecordEventEndTimestamp = ContextEndTime; } - free(EndTimeRecording.RecordEventEndTimestamp); } EndTimeRecordings.clear(); } diff --git a/source/adapters/level_zero/queue.hpp b/source/adapters/level_zero/queue.hpp index 70a3c169db..10ef1d42b5 100644 --- a/source/adapters/level_zero/queue.hpp +++ b/source/adapters/level_zero/queue.hpp @@ -353,10 +353,8 @@ struct ur_queue_handle_t_ : _ur_object { // end-time member. struct end_time_recording { // RecordEventEndTimestamp is not adjusted for valid bits nor resolution, as - // it is written asynchronously. We use a heap allocation for these so they - // do not move after the timestamp recording is given the address to write - // the result to. - uint64_t *RecordEventEndTimestamp = nullptr; + // it is written asynchronously. + uint64_t RecordEventEndTimestamp = 0; // The event may die before the recording has been written back. In this // case the event will mark this for deletion when the queue sees fit. 
bool EventHasDied = false; From 73de142e0dc0a1cc6ed67fb6b6f0130438072b5d Mon Sep 17 00:00:00 2001 From: "Larsen, Steffen" Date: Wed, 17 Apr 2024 06:20:30 -0700 Subject: [PATCH 14/18] Fix diff Signed-off-by: Larsen, Steffen --- source/loader/layers/validation/ur_valddi.cpp | 4 ---- tools/urinfo/urinfo.hpp | 1 + 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/source/loader/layers/validation/ur_valddi.cpp b/source/loader/layers/validation/ur_valddi.cpp index a40927d1d3..0d8d041487 100644 --- a/source/loader/layers/validation/ur_valddi.cpp +++ b/source/loader/layers/validation/ur_valddi.cpp @@ -496,10 +496,6 @@ __urdlllocal ur_result_t UR_APICALL urDeviceGetInfo( return UR_RESULT_ERROR_INVALID_NULL_POINTER; } - if (UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_3D_EXP < propName) { - return UR_RESULT_ERROR_INVALID_ENUMERATION; - } - if (UR_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP < propName) { return UR_RESULT_ERROR_INVALID_ENUMERATION; } diff --git a/tools/urinfo/urinfo.hpp b/tools/urinfo/urinfo.hpp index 2dfd2f37c2..c7d3c46971 100644 --- a/tools/urinfo/urinfo.hpp +++ b/tools/urinfo/urinfo.hpp @@ -401,6 +401,7 @@ inline void printDeviceInfos(ur_device_handle_t hDevice, std::cout << prefix; printDeviceInfo( hDevice, UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_3D_EXP); + std::cout << prefix; printDeviceInfo(hDevice, UR_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP); } From 4f0bf8c0b3625f76358806287dffd68989916c3c Mon Sep 17 00:00:00 2001 From: "Larsen, Steffen" Date: Thu, 18 Apr 2024 06:28:04 -0700 Subject: [PATCH 15/18] Fix diff Signed-off-by: Larsen, Steffen --- include/ur_api.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/ur_api.h b/include/ur_api.h index 16df84b803..602398b1ff 100644 --- a/include/ur_api.h +++ b/include/ur_api.h @@ -1631,7 +1631,7 @@ typedef enum ur_device_info_t { ///< accessing cubemap resources UR_DEVICE_INFO_CUBEMAP_SEAMLESS_FILTERING_SUPPORT_EXP = 0x2011, ///< [::ur_bool_t] returns true if the device 
supports sampling cubemapped ///< images across face boundaries - UR_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP = 0x2012, ///< [::ur_bool_t] returns true if the device supports timestamp recording + UR_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP = 0x2012, ///< [::ur_bool_t] returns true if the device supports timestamp recording /// @cond UR_DEVICE_INFO_FORCE_UINT32 = 0x7fffffff /// @endcond @@ -1657,7 +1657,7 @@ typedef enum ur_device_info_t { /// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE /// + `NULL == hDevice` /// - ::UR_RESULT_ERROR_INVALID_ENUMERATION -/// + `::UR_DEVICE_INFO_CUBEMAP_SEAMLESS_FILTERING_SUPPORT_EXP < propName` +/// + `::UR_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP < propName` /// - ::UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION /// + If `propName` is not supported by the adapter. /// - ::UR_RESULT_ERROR_INVALID_SIZE From ecb6a8248a1c2b458a102454884bc1f78c3f0b7f Mon Sep 17 00:00:00 2001 From: "Larsen, Steffen" Date: Wed, 24 Apr 2024 02:29:26 -0700 Subject: [PATCH 16/18] Remove old use of urPrint Signed-off-by: Larsen, Steffen --- source/adapters/level_zero/event.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/adapters/level_zero/event.cpp b/source/adapters/level_zero/event.cpp index 2f02ea5cf2..873e58eb18 100644 --- a/source/adapters/level_zero/event.cpp +++ b/source/adapters/level_zero/event.cpp @@ -533,7 +533,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventGetProfilingInfo( return ReturnValue(ContextEndTime); } default: - urPrint("urEventGetProfilingInfo: not supported ParamName\n"); + logger::error("urEventGetProfilingInfo: not supported ParamName"); return UR_RESULT_ERROR_INVALID_VALUE; } } From 06432bf0d2e4fa238a949b638324c39471894c33 Mon Sep 17 00:00:00 2001 From: "Larsen, Steffen" Date: Tue, 7 May 2024 02:56:22 -0700 Subject: [PATCH 17/18] Remove trailing ws Signed-off-by: Larsen, Steffen --- source/adapters/level_zero/event.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/source/adapters/level_zero/event.hpp b/source/adapters/level_zero/event.hpp index 0a0f2303b7..2d1f536e4e 100644 --- a/source/adapters/level_zero/event.hpp +++ b/source/adapters/level_zero/event.hpp @@ -206,7 +206,7 @@ struct ur_event_handle_t_ : _ur_object { // Indicates within creation of proxy event. bool IsCreatingHostProxyEvent = {false}; - + // Indicates the recorded start and end timestamps for the event. These are // only set for events returned by timestamp recording enqueue functions. // A non-zero value for RecordEventStartTimestamp indicates the event was the From 84bad6c76c2c7ae49678892f763efd1e3155aca5 Mon Sep 17 00:00:00 2001 From: "Larsen, Steffen" Date: Tue, 7 May 2024 02:59:53 -0700 Subject: [PATCH 18/18] Add wait-list to get-command-list Signed-off-by: Larsen, Steffen --- source/adapters/level_zero/event.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/source/adapters/level_zero/event.cpp b/source/adapters/level_zero/event.cpp index b24f2f395c..9821333547 100644 --- a/source/adapters/level_zero/event.cpp +++ b/source/adapters/level_zero/event.cpp @@ -678,7 +678,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueTimestampRecordingExp( // Get a new command list to be used on this call ur_command_list_ptr_t CommandList{}; UR_CALL(Queue->Context->getAvailableCommandList( - Queue, CommandList, UseCopyEngine, /* AllowBatching */ false)); + Queue, CommandList, UseCopyEngine, NumEventsInWaitList, EventWaitList, + /* AllowBatching */ false)); UR_CALL(createEventAndAssociateQueue( Queue, OutEvent, UR_COMMAND_TIMESTAMP_RECORDING_EXP, CommandList,