diff --git a/source/adapters/level_zero/command_buffer.cpp b/source/adapters/level_zero/command_buffer.cpp index 4b811ab033..bbe49cb705 100644 --- a/source/adapters/level_zero/command_buffer.cpp +++ b/source/adapters/level_zero/command_buffer.cpp @@ -45,13 +45,13 @@ │ Prefix │ Commands added to UR command-buffer by UR user │ Suffix │ └──────────┴────────────────────────────────────────────────┴─────────┘ - ┌───────────────────┬──────────────────────────────┐ - Prefix │Reset signal event │ Barrier waiting on wait event│ - └───────────────────┴──────────────────────────────┘ + ┌───────────────────┬──────────────┐──────────────────────────────┐ + Prefix │Reset signal event │ Reset events │ Barrier waiting on wait event│ + └───────────────────┴──────────────┘──────────────────────────────┘ ┌─────────────────────────────────────────────┐──────────────┐ - Suffix │Barrier waiting on sync-point event, │ Reset events │ - │signalling the UR command-buffer signal event│ │ + Suffix │Barrier waiting on sync-point event, │ Query CMD │ + │signalling the UR command-buffer signal event│ Timestamps │ └─────────────────────────────────────────────┘──────────────┘ For a call to `urCommandBufferEnqueueExp` with an event_list `EL`, @@ -433,6 +433,10 @@ urCommandBufferCreateExp(ur_context_handle_t Context, ur_device_handle_t Device, ZeStruct ZeCommandListDesc; ZeCommandListDesc.commandQueueGroupOrdinal = QueueGroupOrdinal; + // Dependencies between commands are explicitly enforced by sync points when + // enqueuing. Consequently, relax the command ordering in the command list + // can enable the backend to further optimize the workload + ZeCommandListDesc.flags = ZE_COMMAND_LIST_FLAG_RELAXED_ORDERING; ze_command_list_handle_t ZeCommandList; // TODO We could optimize this by pooling both Level Zero command-lists and UR @@ -494,18 +498,6 @@ urCommandBufferFinalizeExp(ur_exp_command_buffer_handle_t CommandBuffer) { (CommandBuffer->ZeCommandList, CommandBuffer->SignalEvent->ZeEvent, NumEvents, WaitEventList.data())); - // Reset the wait-event for the UR command-buffer that is signalled when its - // submission dependencies have been satisfied. - ZE2UR_CALL(zeCommandListAppendEventReset, - (CommandBuffer->ZeCommandList, CommandBuffer->WaitEvent->ZeEvent)); - - // Reset the L0 events we use for command-buffer internal sync-points to the - // non-signalled state - for (auto Event : WaitEventList) { - ZE2UR_CALL(zeCommandListAppendEventReset, - (CommandBuffer->ZeCommandList, Event)); - } - // Close the command list and have it ready for dispatch. ZE2UR_CALL(zeCommandListClose, (CommandBuffer->ZeCommandList)); return UR_RESULT_SUCCESS; @@ -899,14 +891,28 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp( // Create command-list to execute before `CommandListPtr` and will signal // when `EventWaitList` dependencies are complete. ur_command_list_ptr_t WaitCommandList{}; + UR_CALL(Queue->Context->getAvailableCommandList(Queue, WaitCommandList, false, + false)); + + // Create a list of events of all the events that compose the command buffer + // workload. + // This loop also resets the L0 events we use for command-buffer internal + // sync-points to the non-signalled state. + // This is required for multiple submissions. + const size_t NumEvents = CommandBuffer->SyncPoints.size(); + std::vector WaitEventList{NumEvents}; + for (size_t i = 0; i < NumEvents; i++) { + auto ZeEvent = CommandBuffer->SyncPoints[i]->ZeEvent; + WaitEventList[i] = ZeEvent; + ZE2UR_CALL(zeCommandListAppendEventReset, + (WaitCommandList->first, ZeEvent)); + } + if (NumEventsInWaitList) { _ur_ze_event_list_t TmpWaitList; UR_CALL(TmpWaitList.createAndRetainUrZeEventList( NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine)); - UR_CALL(Queue->Context->getAvailableCommandList(Queue, WaitCommandList, - false, false)) - // Update the WaitList of the Wait Event // Events are appended to the WaitList if the WaitList is not empty if (CommandBuffer->WaitEvent->WaitList.isEmpty()) @@ -919,9 +925,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp( CommandBuffer->WaitEvent->WaitList.Length, CommandBuffer->WaitEvent->WaitList.ZeEventList)); } else { - UR_CALL(Queue->Context->getAvailableCommandList(Queue, WaitCommandList, - false, false)); - ZE2UR_CALL(zeCommandListAppendSignalEvent, (WaitCommandList->first, CommandBuffer->WaitEvent->ZeEvent)); } @@ -930,17 +933,43 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp( ur_event_handle_t RetEvent{}; // Create a command-list to signal RetEvent on completion ur_command_list_ptr_t SignalCommandList{}; - if (Event) { - UR_CALL(Queue->Context->getAvailableCommandList(Queue, SignalCommandList, - false, false)); + UR_CALL(Queue->Context->getAvailableCommandList(Queue, SignalCommandList, + false, false)); + // Reset the wait-event for the UR command-buffer that is signalled when its + // submission dependencies have been satisfied. + ZE2UR_CALL(zeCommandListAppendEventReset, + (SignalCommandList->first, CommandBuffer->WaitEvent->ZeEvent)); + if (Event) { UR_CALL(createEventAndAssociateQueue(Queue, &RetEvent, UR_COMMAND_COMMAND_BUFFER_ENQUEUE_EXP, SignalCommandList, false)); - ZE2UR_CALL(zeCommandListAppendBarrier, - (SignalCommandList->first, RetEvent->ZeEvent, 1, - &(CommandBuffer->SignalEvent->ZeEvent))); + if ((Queue->Properties & UR_QUEUE_FLAG_PROFILING_ENABLE)) { + // Multiple submissions of a command buffer implies that we need to save + // the event timestamps before resubmiting the command buffer. We + // therefore copy the these timestamps in a dedicated USM memory section + // before completing the command buffer execution, and then attach this + // memory to the event returned to users to allow to allow the profiling + // engine to recover these timestamps. + command_buffer_profiling_t *Profiling = new command_buffer_profiling_t(); + + Profiling->NumEvents = WaitEventList.size(); + Profiling->Timestamps = + new ze_kernel_timestamp_result_t[Profiling->NumEvents]; + + ZE2UR_CALL(zeCommandListAppendQueryKernelTimestamps, + (SignalCommandList->first, WaitEventList.size(), + WaitEventList.data(), (void *)Profiling->Timestamps, 0, + RetEvent->ZeEvent, 1, + &(CommandBuffer->SignalEvent->ZeEvent))); + + RetEvent->CommandData = static_cast(Profiling); + } else { + ZE2UR_CALL(zeCommandListAppendBarrier, + (SignalCommandList->first, RetEvent->ZeEvent, 1, + &(CommandBuffer->SignalEvent->ZeEvent))); + } } // Execution our command-lists asynchronously diff --git a/source/adapters/level_zero/command_buffer.hpp b/source/adapters/level_zero/command_buffer.hpp index b18f1c3497..a43e9e4c52 100644 --- a/source/adapters/level_zero/command_buffer.hpp +++ b/source/adapters/level_zero/command_buffer.hpp @@ -19,6 +19,11 @@ #include "context.hpp" #include "queue.hpp" +struct command_buffer_profiling_t { + ur_exp_command_buffer_sync_point_t NumEvents; + ze_kernel_timestamp_result_t *Timestamps; +}; + struct ur_exp_command_buffer_handle_t_ : public _ur_object { ur_exp_command_buffer_handle_t_(ur_context_handle_t Context, ur_device_handle_t Device, diff --git a/source/adapters/level_zero/event.cpp b/source/adapters/level_zero/event.cpp index d8af1e674d..2dc74ff5ac 100644 --- a/source/adapters/level_zero/event.cpp +++ b/source/adapters/level_zero/event.cpp @@ -13,6 +13,7 @@ #include #include +#include "command_buffer.hpp" #include "common.hpp" #include "event.hpp" #include "ur_level_zero.hpp" @@ -454,6 +455,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventGetProfilingInfo( ///< bytes returned in propValue ) { std::shared_lock EventLock(Event->Mutex); + if (Event->UrQueue && (Event->UrQueue->Properties & UR_QUEUE_FLAG_PROFILING_ENABLE) == 0) { return UR_RESULT_ERROR_PROFILING_INFO_NOT_AVAILABLE; @@ -470,6 +472,70 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventGetProfilingInfo( ze_kernel_timestamp_result_t tsResult; + // A Command-buffer consists of three command-lists for which only a single + // event is returned to users. The actual profiling information related to the + // command-buffer should therefore be extrated from graph events themsleves. + // The timestamps of these events are saved in a memory region attached to + // event usning CommandData field. The timings must therefore be recovered + // from this memory. + if (Event->CommandType == UR_COMMAND_COMMAND_BUFFER_ENQUEUE_EXP) { + if (Event->CommandData) { + command_buffer_profiling_t *ProfilingsPtr; + switch (PropName) { + case UR_PROFILING_INFO_COMMAND_START: { + ProfilingsPtr = + static_cast(Event->CommandData); + // Sync-point order does not necessarily match to the order of + // execution. We therefore look for the first command executed. + uint64_t MinStart = ProfilingsPtr->Timestamps[0].global.kernelStart; + for (uint64_t i = 1; i < ProfilingsPtr->NumEvents; i++) { + uint64_t Timestamp = ProfilingsPtr->Timestamps[i].global.kernelStart; + if (Timestamp < MinStart) { + MinStart = Timestamp; + } + } + uint64_t ContextStartTime = + (MinStart & TimestampMaxValue) * ZeTimerResolution; + return ReturnValue(ContextStartTime); + } + case UR_PROFILING_INFO_COMMAND_END: { + ProfilingsPtr = + static_cast(Event->CommandData); + // Sync-point order does not necessarily match to the order of + // execution. We therefore look for the last command executed. + uint64_t MaxEnd = ProfilingsPtr->Timestamps[0].global.kernelEnd; + uint64_t LastStart = ProfilingsPtr->Timestamps[0].global.kernelStart; + for (uint64_t i = 1; i < ProfilingsPtr->NumEvents; i++) { + uint64_t Timestamp = ProfilingsPtr->Timestamps[i].global.kernelEnd; + if (Timestamp > MaxEnd) { + MaxEnd = Timestamp; + LastStart = ProfilingsPtr->Timestamps[i].global.kernelStart; + } + } + uint64_t ContextStartTime = (LastStart & TimestampMaxValue); + uint64_t ContextEndTime = (MaxEnd & TimestampMaxValue); + + // + // Handle a possible wrap-around (the underlying HW counter is < + // 64-bit). Note, it will not report correct time if there were multiple + // wrap arounds, and the longer term plan is to enlarge the capacity of + // the HW timestamps. + // + if (ContextEndTime <= ContextStartTime) { + ContextEndTime += TimestampMaxValue; + } + ContextEndTime *= ZeTimerResolution; + return ReturnValue(ContextEndTime); + } + default: + urPrint("urEventGetProfilingInfo: not supported ParamName\n"); + return UR_RESULT_ERROR_INVALID_VALUE; + } + } else { + return UR_RESULT_ERROR_PROFILING_INFO_NOT_AVAILABLE; + } + } + switch (PropName) { case UR_PROFILING_INFO_COMMAND_START: { ZE2UR_CALL(zeEventQueryKernelTimestamp, (Event->ZeEvent, &tsResult)); @@ -763,6 +829,15 @@ ur_result_t urEventReleaseInternal(ur_event_handle_t Event) { return Res; Event->CommandData = nullptr; } + if (Event->CommandType == UR_COMMAND_COMMAND_BUFFER_ENQUEUE_EXP && + Event->CommandData) { + // Free the memory extra event allocated for profiling purposed. + command_buffer_profiling_t *ProfilingPtr = + static_cast(Event->CommandData); + delete[] ProfilingPtr->Timestamps; + delete ProfilingPtr; + Event->CommandData = nullptr; + } if (Event->OwnNativeHandle) { if (DisableEventsCaching) { auto ZeResult = ZE_CALL_NOCHECK(zeEventDestroy, (Event->ZeEvent));