From 367f890c4e17425b485cda265541c3242253f4cb Mon Sep 17 00:00:00 2001 From: Maxime France-Pillois Date: Tue, 20 Feb 2024 16:52:34 +0000 Subject: [PATCH] [EXP][CMDBUF] Move event reset commands to dedicated cmd-list Create a new command-list to reset all the events of the enqueued graph cmd. This allows us to move the enqueueing of reset commands from the command-buffer enqueue function to the finalize function, and allows us to perform this task only once. Move signal event reset from main cmd-list to reset cmd-list. Move the documentation to DPC++ design doc. --- source/adapters/level_zero/command_buffer.cpp | 178 +++++++----------- source/adapters/level_zero/command_buffer.hpp | 18 +- 2 files changed, 82 insertions(+), 114 deletions(-) diff --git a/source/adapters/level_zero/command_buffer.cpp b/source/adapters/level_zero/command_buffer.cpp index 7dc2a42fd6..ced2d0286b 100644 --- a/source/adapters/level_zero/command_buffer.cpp +++ b/source/adapters/level_zero/command_buffer.cpp @@ -10,88 +10,18 @@ #include "command_buffer.hpp" #include "ur_level_zero.hpp" -/* Command-buffer Extension - - The UR interface for submitting a UR command-buffer takes a list - of events to wait on, and returns an event representing the completion of - that particular submission of the command-buffer. - - However, in `zeCommandQueueExecuteCommandLists` there are no parameters to - take a waitlist and also the only sync primitive returned is to block on - host. - - In order to get the UR command-buffer enqueue semantics we want with L0 - this adapter adds extra commands to the L0 command-list representing a - UR command-buffer. - - Prefix - Commands added to the start of the L0 command-list by L0 adapter. - Suffix - Commands added to the end of the L0 command-list by L0 adapter. - - These extra commands operate on L0 event synchronisation primitives used by - the command-list to interact with the external UR wait-list and UR return - event required for the enqueue interface. 
- - The `ur_exp_command_buffer_handle_t` class for this adapter contains a - SignalEvent which signals the completion of the command-list in the suffix, - and is reset in the prefix. This signal is detected by a new UR return event - created on UR command-buffer enqueue. - - There is also a WaitEvent used by the `ur_exp_command_buffer_handle_t` class - in the prefix to wait on any dependencies passed in the enqueue wait-list. - This WaitEvent is reset at the end of the suffix, along with reset commands - to reset the L0 events used to implement the UR sync-points. - - ┌──────────┬────────────────────────────────────────────────┬─────────┐ - │ Prefix │ Commands added to UR command-buffer by UR user │ Suffix │ - └──────────┴────────────────────────────────────────────────┴─────────┘ - - ┌───────────────────┬──────────────┐──────────────────────────────┐ - Prefix │Reset signal event │ Reset events │ Barrier waiting on wait event│ - └───────────────────┴──────────────┘──────────────────────────────┘ - - ┌─────────────────────────────────────────────┐──────────────┐ - Suffix │Barrier waiting on sync-point event, │ Query CMD │ - │signaling the UR command-buffer signal event │ Timestamps │ - └─────────────────────────────────────────────┘──────────────┘ - - For a call to `urCommandBufferEnqueueExp` with an event_list `EL`, - command-buffer `CB`, and return event `RE` our implementation has to create - and submit two new command-lists for the above approach to work. One before - the command-list with extra commands associated with `CB`, and the other - after `CB`. 
- - Command-list created on `urCommandBufferEnqueueExp` to execution before `CB`: - ┌───────────────────────────────────────────────────────────┐ - │Barrier on `EL` than signals `CB` WaitEvent when completed │ - └───────────────────────────────────────────────────────────┘ - - Command-list created on `urCommandBufferEnqueueExp` to execution after `CB`: - ┌─────────────────────────────────────────────────────────────┐ - │Barrier on `CB` SignalEvent that signals `RE` when completed │ - └─────────────────────────────────────────────────────────────┘ - -Drawbacks ---------- - -There are two drawbacks to this approach: - -1. We use 3x the command-list resources, if there are many UR command-buffers -in flight, this may exhaust L0 driver resources. - -2. Each command list is submitted individually with a -`ur_queue_handle_t_::executeCommandList` call which introduces serialization in -the submission pipeline that is heavier than having a barrier or a -waitForEvents on the same list. Resulting in additional latency when executing -graphs. 
- +/* L0 Command-buffer Extension Doc see: +https://github.com/intel/llvm/blob/sycl/sycl/doc/design/CommandGraph.md#level-zero */ ur_exp_command_buffer_handle_t_::ur_exp_command_buffer_handle_t_( ur_context_handle_t Context, ur_device_handle_t Device, ze_command_list_handle_t CommandList, + ze_command_list_handle_t CommandListResetEvents, ZeStruct ZeDesc, const ur_exp_command_buffer_desc_t *Desc) : Context(Context), Device(Device), ZeCommandList(CommandList), + ZeCommandListResetEvents(CommandListResetEvents), ZeCommandListDesc(ZeDesc), ZeFencesList(), QueueProperties(), SyncPoints(), NextSyncPoint(0) { (void)Desc; @@ -114,6 +44,12 @@ ur_exp_command_buffer_handle_t_::~ur_exp_command_buffer_handle_t_() { ZE_CALL_NOCHECK(zeCommandListDestroy, (ZeCommandList)); } + // Release the memory allocated to the CommandListResetEvents stored in the + // command_buffer + if (ZeCommandListResetEvents) { + ZE_CALL_NOCHECK(zeCommandListDestroy, (ZeCommandListResetEvents)); + } + // Release additional signal and wait events used by command_buffer if (SignalEvent) { CleanupCompletedEvent(SignalEvent, false); @@ -123,6 +59,10 @@ ur_exp_command_buffer_handle_t_::~ur_exp_command_buffer_handle_t_() { CleanupCompletedEvent(WaitEvent, false); urEventReleaseInternal(WaitEvent); } + if (AllResetEvent) { + CleanupCompletedEvent(AllResetEvent, false); + urEventReleaseInternal(AllResetEvent); + } // Release events added to the command_buffer for (auto &Sync : SyncPoints) { @@ -434,6 +374,13 @@ urCommandBufferCreateExp(ur_context_handle_t Context, ur_device_handle_t Device, ZeStruct ZeCommandListDesc; ZeCommandListDesc.commandQueueGroupOrdinal = QueueGroupOrdinal; + + ze_command_list_handle_t ZeCommandListResetEvents; + // Create a command-list for resetting the events associated to enqueued cmd. 
+ ZE2UR_CALL(zeCommandListCreate, + (Context->ZeContext, Device->ZeDevice, &ZeCommandListDesc, + &ZeCommandListResetEvents)); + // Dependencies between commands are explicitly enforced by sync points when // enqueuing. Consequently, relax the command ordering in the command list // can enable the backend to further optimize the workload @@ -446,7 +393,8 @@ urCommandBufferCreateExp(ur_context_handle_t Context, ur_device_handle_t Device, &ZeCommandListDesc, &ZeCommandList)); try { *CommandBuffer = new ur_exp_command_buffer_handle_t_( - Context, Device, ZeCommandList, ZeCommandListDesc, CommandBufferDesc); + Context, Device, ZeCommandList, ZeCommandListResetEvents, + ZeCommandListDesc, CommandBufferDesc); } catch (const std::bad_alloc &) { return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; } catch (...) { @@ -460,13 +408,19 @@ urCommandBufferCreateExp(ur_context_handle_t Context, ur_device_handle_t Device, &RetCommandBuffer->SignalEvent)); UR_CALL(EventCreate(Context, nullptr, false, false, &RetCommandBuffer->WaitEvent)); + UR_CALL(EventCreate(Context, nullptr, false, false, + &RetCommandBuffer->AllResetEvent)); // Add prefix commands - ZE2UR_CALL(zeCommandListAppendEventReset, - (ZeCommandList, RetCommandBuffer->SignalEvent->ZeEvent)); + ZE2UR_CALL( + zeCommandListAppendEventReset, + (ZeCommandListResetEvents, RetCommandBuffer->SignalEvent->ZeEvent)); + std::vector PrecondEvents = { + RetCommandBuffer->WaitEvent->ZeEvent, + RetCommandBuffer->AllResetEvent->ZeEvent}; ZE2UR_CALL( zeCommandListAppendBarrier, - (ZeCommandList, nullptr, 1, &RetCommandBuffer->WaitEvent->ZeEvent)); + (ZeCommandList, nullptr, PrecondEvents.size(), PrecondEvents.data())); return UR_RESULT_SUCCESS; } @@ -488,20 +442,29 @@ urCommandBufferReleaseExp(ur_exp_command_buffer_handle_t CommandBuffer) { UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferFinalizeExp(ur_exp_command_buffer_handle_t CommandBuffer) { // Create a list of events for our signal event to wait on + // This loop also resets the L0 events 
we use for command-buffer internal + // sync-points to the non-signaled state. + // This is required for multiple submissions. const size_t NumEvents = CommandBuffer->SyncPoints.size(); - std::vector WaitEventList{NumEvents}; for (size_t i = 0; i < NumEvents; i++) { - WaitEventList[i] = CommandBuffer->SyncPoints[i]->ZeEvent; + auto ZeEvent = CommandBuffer->SyncPoints[i]->ZeEvent; + CommandBuffer->ZeEventsList.push_back(ZeEvent); + ZE2UR_CALL(zeCommandListAppendEventReset, + (CommandBuffer->ZeCommandListResetEvents, ZeEvent)); } + ZE2UR_CALL(zeCommandListAppendSignalEvent, + (CommandBuffer->ZeCommandListResetEvents, + CommandBuffer->AllResetEvent->ZeEvent)); // Wait for all the user added commands to complete, and signal the // command-buffer signal-event when they are done. ZE2UR_CALL(zeCommandListAppendBarrier, (CommandBuffer->ZeCommandList, CommandBuffer->SignalEvent->ZeEvent, - NumEvents, WaitEventList.data())); + NumEvents, CommandBuffer->ZeEventsList.data())); - // Close the command list and have it ready for dispatch. + // Close the command lists and have them ready for dispatch. ZE2UR_CALL(zeCommandListClose, (CommandBuffer->ZeCommandList)); + ZE2UR_CALL(zeCommandListClose, (CommandBuffer->ZeCommandListResetEvents)); return UR_RESULT_SUCCESS; } @@ -875,26 +838,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp( ZE2UR_CALL(zeFenceCreate, (ZeCommandQueue, &ZeFenceDesc, &ZeFence)); CommandBuffer->ZeFencesList.push_back(ZeFence); - // Create command-list to execute before `CommandListPtr` and will signal - // when `EventWaitList` dependencies are complete. - ur_command_list_ptr_t WaitCommandList{}; - UR_CALL(Queue->Context->getAvailableCommandList(Queue, WaitCommandList, false, - false)); - - // Create a list of events of all the events that compose the command buffer - // workload. - // This loop also resets the L0 events we use for command-buffer internal - // sync-points to the non-signaled state. 
- // This is required for multiple submissions. - const size_t NumEvents = CommandBuffer->SyncPoints.size(); - std::vector WaitEventList{NumEvents}; - for (size_t i = 0; i < NumEvents; i++) { - auto ZeEvent = CommandBuffer->SyncPoints[i]->ZeEvent; - WaitEventList[i] = ZeEvent; - ZE2UR_CALL(zeCommandListAppendEventReset, - (WaitCommandList->first, ZeEvent)); - } - bool MustSignalWaitEvent = true; if (NumEventsInWaitList) { _ur_ze_event_list_t TmpWaitList; @@ -909,18 +852,31 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp( CommandBuffer->WaitEvent->WaitList.insert(TmpWaitList); if (!CommandBuffer->WaitEvent->WaitList.isEmpty()) { + // Create command-list to execute before `CommandListPtr` and will signal + // when `EventWaitList` dependencies are complete. + ur_command_list_ptr_t WaitCommandList{}; + UR_CALL(Queue->Context->getAvailableCommandList(Queue, WaitCommandList, + false, false)); + ZE2UR_CALL(zeCommandListAppendBarrier, (WaitCommandList->first, CommandBuffer->WaitEvent->ZeEvent, CommandBuffer->WaitEvent->WaitList.Length, CommandBuffer->WaitEvent->WaitList.ZeEventList)); + Queue->executeCommandList(WaitCommandList, false, false); MustSignalWaitEvent = false; } } if (MustSignalWaitEvent) { - ZE2UR_CALL(zeCommandListAppendSignalEvent, - (WaitCommandList->first, CommandBuffer->WaitEvent->ZeEvent)); + ZE2UR_CALL(zeEventHostSignal, (CommandBuffer->WaitEvent->ZeEvent)); } - Queue->executeCommandList(WaitCommandList, false, false); + + // Submit reset events command-list. This command-list is of a batch + // command-list type, regardless of the UR Queue type. We therefore need to + // submit the list directly using the Level-Zero API to avoid type mismatches + // if using UR functions. + ZE2UR_CALL( + zeCommandQueueExecuteCommandLists, + (ZeCommandQueue, 1, &CommandBuffer->ZeCommandListResetEvents, nullptr)); // Submit main command-list. This command-list is of a batch command-list // type, regardless of the UR Queue type. 
We therefore need to submit the list @@ -940,6 +896,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp( // submission dependencies have been satisfied. ZE2UR_CALL(zeCommandListAppendEventReset, (SignalCommandList->first, CommandBuffer->WaitEvent->ZeEvent)); + // Reset the all-reset-event for the UR command-buffer that is signaled when + // all events of the main command-list have been reset. + ZE2UR_CALL(zeCommandListAppendEventReset, + (SignalCommandList->first, CommandBuffer->AllResetEvent->ZeEvent)); if (Event) { UR_CALL(createEventAndAssociateQueue( @@ -955,14 +915,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp( // engine to recover these timestamps. command_buffer_profiling_t *Profiling = new command_buffer_profiling_t(); - Profiling->NumEvents = WaitEventList.size(); + Profiling->NumEvents = CommandBuffer->ZeEventsList.size(); Profiling->Timestamps = new ze_kernel_timestamp_result_t[Profiling->NumEvents]; ZE2UR_CALL(zeCommandListAppendQueryKernelTimestamps, - (SignalCommandList->first, WaitEventList.size(), - WaitEventList.data(), (void *)Profiling->Timestamps, 0, - RetEvent->ZeEvent, 1, + (SignalCommandList->first, CommandBuffer->ZeEventsList.size(), + CommandBuffer->ZeEventsList.data(), + (void *)Profiling->Timestamps, 0, RetEvent->ZeEvent, 1, &(CommandBuffer->SignalEvent->ZeEvent))); RetEvent->CommandData = static_cast(Profiling); diff --git a/source/adapters/level_zero/command_buffer.hpp b/source/adapters/level_zero/command_buffer.hpp index 9fa0149d22..843d9d3f37 100644 --- a/source/adapters/level_zero/command_buffer.hpp +++ b/source/adapters/level_zero/command_buffer.hpp @@ -25,11 +25,12 @@ struct command_buffer_profiling_t { }; struct ur_exp_command_buffer_handle_t_ : public _ur_object { - ur_exp_command_buffer_handle_t_(ur_context_handle_t Context, - ur_device_handle_t Device, - ze_command_list_handle_t CommandList, - ZeStruct ZeDesc, - const ur_exp_command_buffer_desc_t *Desc); + ur_exp_command_buffer_handle_t_( + 
ur_context_handle_t Context, ur_device_handle_t Device, + ze_command_list_handle_t CommandList, + ze_command_list_handle_t CommandListResetEvents, + ZeStruct ZeDesc, + const ur_exp_command_buffer_desc_t *Desc); ~ur_exp_command_buffer_handle_t_(); @@ -49,6 +50,8 @@ struct ur_exp_command_buffer_handle_t_ : public _ur_object { ur_device_handle_t Device; // Level Zero command list handle ze_command_list_handle_t ZeCommandList; + // Level Zero command list handle used to reset the command-buffer events + ze_command_list_handle_t ZeCommandListResetEvents; // Level Zero command list descriptor ZeStruct ZeCommandListDesc; // List of Level Zero fences created when submitting a graph. @@ -64,10 +67,15 @@ struct ur_exp_command_buffer_handle_t_ : public _ur_object { // Next sync_point value (may need to consider ways to reuse values if 32-bits // is not enough) ur_exp_command_buffer_sync_point_t NextSyncPoint; + // List of Level Zero events associated with submitted commands. + std::vector ZeEventsList; // Event which will signals the most recent execution of the command-buffer // has finished ur_event_handle_t SignalEvent = nullptr; // Event which a command-buffer waits on until the wait-list dependencies // passed to a command-buffer enqueue have been satisfied. ur_event_handle_t WaitEvent = nullptr; + // Event which a command-buffer waits on until the main command-list events + // have been reset. + ur_event_handle_t AllResetEvent = nullptr; };