From 9b3bf624cc4eb32115c474d69676a27305398641 Mon Sep 17 00:00:00 2001 From: Raiyan Latif Date: Thu, 22 Feb 2024 01:15:10 -0800 Subject: [PATCH] [L0] Add support for in-order lists using L0 driver Signed-off-by: Raiyan Latif --- source/adapters/level_zero/command_buffer.cpp | 2 +- source/adapters/level_zero/context.cpp | 10 +-- source/adapters/level_zero/device.cpp | 18 +++++ source/adapters/level_zero/device.hpp | 3 + source/adapters/level_zero/event.cpp | 17 +++-- source/adapters/level_zero/event.hpp | 3 +- source/adapters/level_zero/kernel.cpp | 70 +++++++++++++------ source/adapters/level_zero/memory.cpp | 17 +++-- source/adapters/level_zero/queue.cpp | 15 ++-- 9 files changed, 111 insertions(+), 44 deletions(-) diff --git a/source/adapters/level_zero/command_buffer.cpp b/source/adapters/level_zero/command_buffer.cpp index 7dc2a42fd6..5069fff7b2 100644 --- a/source/adapters/level_zero/command_buffer.cpp +++ b/source/adapters/level_zero/command_buffer.cpp @@ -899,7 +899,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp( if (NumEventsInWaitList) { _ur_ze_event_list_t TmpWaitList; UR_CALL(TmpWaitList.createAndRetainUrZeEventList( - NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine)); + NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine, nullptr)); // Update the WaitList of the Wait Event // Events are appended to the WaitList if the WaitList is not empty diff --git a/source/adapters/level_zero/context.cpp b/source/adapters/level_zero/context.cpp index a094ace868..4a8ed2d2ee 100644 --- a/source/adapters/level_zero/context.cpp +++ b/source/adapters/level_zero/context.cpp @@ -397,10 +397,12 @@ ur_result_t ur_context_handle_t_::finalize() { std::scoped_lock Lock(EventCacheMutex); for (auto EventCache : EventCaches) { for (auto Event : *EventCache) { - auto ZeResult = ZE_CALL_NOCHECK(zeEventDestroy, (Event->ZeEvent)); - // Gracefully handle the case that L0 was already unloaded. 
-        if (ZeResult && ZeResult != ZE_RESULT_ERROR_UNINITIALIZED)
-          return ze2urResult(ZeResult);
+        if (Event->ZeEvent) {
+          auto ZeResult = ZE_CALL_NOCHECK(zeEventDestroy, (Event->ZeEvent));
+          // Gracefully handle the case that L0 was already unloaded.
+          if (ZeResult && ZeResult != ZE_RESULT_ERROR_UNINITIALIZED)
+            return ze2urResult(ZeResult);
+        }
         delete Event;
       }
       EventCache->clear();
diff --git a/source/adapters/level_zero/device.cpp b/source/adapters/level_zero/device.cpp
index 918b04400a..f7d0924eeb 100644
--- a/source/adapters/level_zero/device.cpp
+++ b/source/adapters/level_zero/device.cpp
@@ -1029,6 +1029,24 @@ bool ur_device_handle_t_::useRelaxedAllocationLimits() {
   return EnableRelaxedAllocationLimits;
 }
 
+bool ur_device_handle_t_::useDriverInOrderLists() {
+  // Use in-order lists implementation from L0 driver instead
+  // of adapter's implementation. This is opt-in: it is enabled only when
+  // UR_L0_USE_DRIVER_INORDER_LISTS is set and parses (atoi) to non-zero.
+  //
+  // The result is cached in a function-local static, which is shared by
+  // every device object, so the initializer must not read per-device state
+  // (hence no `this` capture).
+  static const bool UseDriverInOrderLists = [] {
+    const char *UrRet = std::getenv("UR_L0_USE_DRIVER_INORDER_LISTS");
+    if (!UrRet)
+      return false;
+    return std::atoi(UrRet) != 0;
+  }();
+
+  return UseDriverInOrderLists;
+}
+
 ur_result_t ur_device_handle_t_::initialize(int SubSubDeviceOrdinal,
                                             int SubSubDeviceIndex) {
   // Maintain various device properties cache.
diff --git a/source/adapters/level_zero/device.hpp b/source/adapters/level_zero/device.hpp
index 94480336c5..a57a97d38d 100644
--- a/source/adapters/level_zero/device.hpp
+++ b/source/adapters/level_zero/device.hpp
@@ -143,6 +143,9 @@ struct ur_device_handle_t_ : _ur_object {
   // Read env settings to select immediate commandlist mode.
   ImmCmdlistMode useImmediateCommandLists();
 
+  // Whether Adapter uses driver's implementation of in-order lists or not
+  bool useDriverInOrderLists();
+
   // Returns whether immediate command lists are used on this device.
ImmCmdlistMode ImmCommandListUsed{}; diff --git a/source/adapters/level_zero/event.cpp b/source/adapters/level_zero/event.cpp index 57b839a714..08eafb0eb1 100644 --- a/source/adapters/level_zero/event.cpp +++ b/source/adapters/level_zero/event.cpp @@ -64,7 +64,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWait( _ur_ze_event_list_t TmpWaitList = {}; UR_CALL(TmpWaitList.createAndRetainUrZeEventList( - NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine)); + NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine, nullptr)); // Get a new command list to be used on this call ur_command_list_ptr_t CommandList{}; @@ -244,7 +244,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWaitWithBarrier( // Retain the events as they will be owned by the result event. _ur_ze_event_list_t TmpWaitList; UR_CALL(TmpWaitList.createAndRetainUrZeEventList( - NumEventsInWaitList, EventWaitList, Queue, false /*UseCopyEngine=*/)); + NumEventsInWaitList, EventWaitList, Queue, false /*UseCopyEngine=*/, + nullptr)); // Get an arbitrary command-list in the queue. ur_command_list_ptr_t CmdList; @@ -342,7 +343,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWaitWithBarrier( UR_CALL(BaseWaitList.createAndRetainUrZeEventList( EventWaitVector.size(), reinterpret_cast(EventWaitVector.data()), - Queue, ConvergenceCmdList->second.isCopy(Queue))); + Queue, ConvergenceCmdList->second.isCopy(Queue), nullptr)); // Insert a barrier with the events from each command-queue into the // convergence command list. 
The resulting event signals the convergence of @@ -1129,13 +1130,16 @@ ur_result_t ur_event_handle_t_::reset() { if (!isHostVisible()) HostVisibleEvent = nullptr; - ZE2UR_CALL(zeEventHostReset, (ZeEvent)); + if (ZeEvent) { + ZE2UR_CALL(zeEventHostReset, (ZeEvent)); + } return UR_RESULT_SUCCESS; } ur_result_t _ur_ze_event_list_t::createAndRetainUrZeEventList( uint32_t EventListLength, const ur_event_handle_t *EventList, - ur_queue_handle_t CurQueue, bool UseCopyEngine) { + ur_queue_handle_t CurQueue, bool UseCopyEngine, + bool *LastCommandEventIncluded) { this->Length = 0; this->ZeEventList = nullptr; this->UrEventList = nullptr; @@ -1195,6 +1199,9 @@ ur_result_t _ur_ze_event_list_t::createAndRetainUrZeEventList( if (IncludeLastCommandEvent) { this->ZeEventList = new ze_event_handle_t[EventListLength + 1]; this->UrEventList = new ur_event_handle_t[EventListLength + 1]; + if (LastCommandEventIncluded) { + *LastCommandEventIncluded = true; + } std::shared_lock Lock(CurQueue->LastCommandEvent->Mutex); this->ZeEventList[0] = CurQueue->LastCommandEvent->ZeEvent; this->UrEventList[0] = CurQueue->LastCommandEvent; diff --git a/source/adapters/level_zero/event.hpp b/source/adapters/level_zero/event.hpp index c266de8c0d..6377eaed6e 100644 --- a/source/adapters/level_zero/event.hpp +++ b/source/adapters/level_zero/event.hpp @@ -88,7 +88,8 @@ struct _ur_ze_event_list_t { ur_result_t createAndRetainUrZeEventList(uint32_t EventListLength, const ur_event_handle_t *EventList, ur_queue_handle_t CurQueue, - bool UseCopyEngine); + bool UseCopyEngine, + bool *LastCommandEventIncluded); // Add all the events in this object's UrEventList to the end // of the list EventsToBeReleased. 
Destroy ur_ze_event_list_t data diff --git a/source/adapters/level_zero/kernel.cpp b/source/adapters/level_zero/kernel.cpp index 0e5ce3215a..53d97c1e5b 100644 --- a/source/adapters/level_zero/kernel.cpp +++ b/source/adapters/level_zero/kernel.cpp @@ -191,34 +191,62 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( ZE2UR_CALL(zeKernelSetGroupSize, (ZeKernel, WG[0], WG[1], WG[2])); bool UseCopyEngine = false; + bool LastCommandEventIncluded = false; _ur_ze_event_list_t TmpWaitList; UR_CALL(TmpWaitList.createAndRetainUrZeEventList( - NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine)); + NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine, + &LastCommandEventIncluded)); // Get a new command list to be used on this call ur_command_list_ptr_t CommandList{}; UR_CALL(Queue->Context->getAvailableCommandList( Queue, CommandList, UseCopyEngine, true /* AllowBatching */)); + uint32_t WaitListLength = TmpWaitList.Length; + uint32_t WaitEventOffset = 0; + + if (Queue->Device->useDriverInOrderLists() && Queue->isInOrderQueue()) { + LastCommandEventIncluded &= + Queue->LastUsedCommandList != Queue->CommandListMap.end() && + CommandList->first != Queue->LastUsedCommandList->first; + + if (!LastCommandEventIncluded && WaitListLength > 0) { + WaitListLength--; + WaitEventOffset = 1; + if (Queue->LastCommandEvent && Queue->LastCommandEvent->ZeEvent) { + ZE2UR_CALL(zeEventDestroy, (Queue->LastCommandEvent->ZeEvent)); + Queue->LastCommandEvent->ZeEvent = nullptr; + } + } + } + ze_event_handle_t ZeEvent = nullptr; ur_event_handle_t InternalEvent{}; bool IsInternal = OutEvent == nullptr; - ur_event_handle_t *Event = OutEvent ? 
OutEvent : &InternalEvent; + ur_event_handle_t *Event = nullptr; + + bool SkipOutputEventCreate = + (Queue->Device->useDriverInOrderLists() && Queue->isInOrderQueue() && + WaitListLength == 0 && IsInternal); - UR_CALL(createEventAndAssociateQueue(Queue, Event, UR_COMMAND_KERNEL_LAUNCH, - CommandList, IsInternal, false)); - ZeEvent = (*Event)->ZeEvent; - (*Event)->WaitList = TmpWaitList; + if (!SkipOutputEventCreate) { + Event = OutEvent ? OutEvent : &InternalEvent; - // Save the kernel in the event, so that when the event is signalled - // the code can do a urKernelRelease on this kernel. - (*Event)->CommandData = (void *)Kernel; + UR_CALL(createEventAndAssociateQueue(Queue, Event, UR_COMMAND_KERNEL_LAUNCH, + CommandList, IsInternal, false)); + ZeEvent = (*Event)->ZeEvent; + (*Event)->WaitList = TmpWaitList; - // Increment the reference count of the Kernel and indicate that the Kernel is - // in use. Once the event has been signalled, the code in - // CleanupCompletedEvent(Event) will do a urKernelRelease to update the - // reference count on the kernel, using the kernel saved in CommandData. - UR_CALL(urKernelRetain(Kernel)); + // Save the kernel in the event, so that when the event is signalled + // the code can do a urKernelRelease on this kernel. + (*Event)->CommandData = (void *)Kernel; + + // Increment the reference count of the Kernel and indicate that the Kernel + // is in use. Once the event has been signalled, the code in + // CleanupCompletedEvent(Event) will do a urKernelRelease to update the + // reference count on the kernel, using the kernel saved in CommandData. + UR_CALL(urKernelRetain(Kernel)); + } // Add to list of kernels to be submitted if (IndirectAccessTrackingEnabled) @@ -241,7 +269,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( // Add the command to the command list, which implies submission. 
ZE2UR_CALL(zeCommandListAppendLaunchKernel, (CommandList->first, ZeKernel, &ZeThreadGroupDimensions, ZeEvent, - (*Event)->WaitList.Length, (*Event)->WaitList.ZeEventList)); + WaitListLength, &TmpWaitList.ZeEventList[WaitEventOffset])); } else { // Add the command to the command list for later submission. // No lock is needed here, unlike the immediate commandlist case above, @@ -249,13 +277,15 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( // submitted only when the comamndlist is closed. Then, a lock is held. ZE2UR_CALL(zeCommandListAppendLaunchKernel, (CommandList->first, ZeKernel, &ZeThreadGroupDimensions, ZeEvent, - (*Event)->WaitList.Length, (*Event)->WaitList.ZeEventList)); + WaitListLength, &TmpWaitList.ZeEventList[WaitEventOffset])); } - urPrint("calling zeCommandListAppendLaunchKernel() with" - " ZeEvent %#" PRIxPTR "\n", - ur_cast(ZeEvent)); - printZeEventList((*Event)->WaitList); + if (Event) { + urPrint("calling zeCommandListAppendLaunchKernel() with" + " ZeEvent %#" PRIxPTR "\n", + ur_cast(ZeEvent)); + printZeEventList((*Event)->WaitList); + } // Execute command list asynchronously, as the event will be used // to track down its completion. 
diff --git a/source/adapters/level_zero/memory.cpp b/source/adapters/level_zero/memory.cpp index e977d1ac15..a76ffa4f5b 100644 --- a/source/adapters/level_zero/memory.cpp +++ b/source/adapters/level_zero/memory.cpp @@ -52,7 +52,7 @@ ur_result_t enqueueMemCopyHelper(ur_command_t CommandType, _ur_ze_event_list_t TmpWaitList; UR_CALL(TmpWaitList.createAndRetainUrZeEventList( - NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine)); + NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine, nullptr)); // We want to batch these commands to avoid extra submissions (costly) bool OkToBatch = true; @@ -102,7 +102,7 @@ ur_result_t enqueueMemCopyRectHelper( _ur_ze_event_list_t TmpWaitList; UR_CALL(TmpWaitList.createAndRetainUrZeEventList( - NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine)); + NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine, nullptr)); // We want to batch these commands to avoid extra submissions (costly) bool OkToBatch = true; @@ -214,7 +214,7 @@ static ur_result_t enqueueMemFillHelper(ur_command_t CommandType, _ur_ze_event_list_t TmpWaitList; UR_CALL(TmpWaitList.createAndRetainUrZeEventList( - NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine)); + NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine, nullptr)); ur_command_list_ptr_t CommandList{}; // We want to batch these commands to avoid extra submissions (costly) @@ -346,7 +346,7 @@ static ur_result_t enqueueMemImageCommandHelper( _ur_ze_event_list_t TmpWaitList; UR_CALL(TmpWaitList.createAndRetainUrZeEventList( - NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine)); + NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine, nullptr)); // We want to batch these commands to avoid extra submissions (costly) bool OkToBatch = true; @@ -909,7 +909,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferMap( _ur_ze_event_list_t TmpWaitList; UR_CALL(TmpWaitList.createAndRetainUrZeEventList( - NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine)); + 
NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine, nullptr)); UR_CALL(createEventAndAssociateQueue( Queue, Event, UR_COMMAND_MEM_BUFFER_MAP, Queue->CommandListMap.end(), @@ -931,7 +931,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferMap( } else if (MapFlags & UR_MAP_FLAG_WRITE) AccessMode = ur_mem_handle_t_::write_only; } - UR_ASSERT(AccessMode != ur_mem_handle_t_::unknown, UR_RESULT_ERROR_INVALID_VALUE); @@ -1067,7 +1066,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemUnmap( _ur_ze_event_list_t TmpWaitList; UR_CALL(TmpWaitList.createAndRetainUrZeEventList( - NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine)); + NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine, nullptr)); UR_CALL(createEventAndAssociateQueue(Queue, Event, UR_COMMAND_MEM_UNMAP, Queue->CommandListMap.end(), @@ -1247,7 +1246,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMPrefetch( // _ur_ze_event_list_t TmpWaitList; UR_CALL(TmpWaitList.createAndRetainUrZeEventList( - NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine)); + NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine, nullptr)); // Get a new command list to be used on this call ur_command_list_ptr_t CommandList{}; @@ -1302,7 +1301,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMAdvise( _ur_ze_event_list_t TmpWaitList; UR_CALL(TmpWaitList.createAndRetainUrZeEventList(0, nullptr, Queue, - UseCopyEngine)); + UseCopyEngine, nullptr)); // Get a new command list to be used on this call ur_command_list_ptr_t CommandList{}; diff --git a/source/adapters/level_zero/queue.cpp b/source/adapters/level_zero/queue.cpp index 8a9f36a432..b2d9d9f13b 100644 --- a/source/adapters/level_zero/queue.cpp +++ b/source/adapters/level_zero/queue.cpp @@ -1427,7 +1427,9 @@ ur_result_t ur_queue_handle_t_::synchronize() { // zero handle can have device scope, so we can't synchronize the last // event. 
if (isInOrderQueue() && !LastCommandEvent->IsDiscarded) { - ZE2UR_CALL(zeHostSynchronize, (LastCommandEvent->ZeEvent)); + if (LastCommandEvent->ZeEvent) { + ZE2UR_CALL(zeHostSynchronize, (LastCommandEvent->ZeEvent)); + } // clean up all events known to have been completed as well, // so they can be reused later @@ -1892,6 +1894,8 @@ ur_result_t ur_queue_handle_t_::createCommandList( ZeStruct ZeCommandListDesc; ZeCommandListDesc.commandQueueGroupOrdinal = QueueGroupOrdinal; + if (Device->useDriverInOrderLists() && isInOrderQueue()) + ZeCommandListDesc.flags = ZE_COMMAND_LIST_FLAG_IN_ORDER; ZE2UR_CALL(zeCommandListCreate, (Context->ZeContext, Device->ZeDevice, &ZeCommandListDesc, &ZeCommandList)); @@ -1919,7 +1923,7 @@ ur_queue_handle_t_::insertActiveBarriers(ur_command_list_ptr_t &CmdList, _ur_ze_event_list_t ActiveBarriersWaitList; UR_CALL(ActiveBarriersWaitList.createAndRetainUrZeEventList( ActiveBarriers.vector().size(), ActiveBarriers.vector().data(), - reinterpret_cast(this), UseCopyEngine)); + reinterpret_cast(this), UseCopyEngine, nullptr)); // We can now replace active barriers with the ones in the wait list. UR_CALL(ActiveBarriers.clear()); @@ -2006,8 +2010,11 @@ ur_command_list_ptr_t &ur_queue_handle_t_::ur_queue_group_t::getImmCmdList() { Priority = "High"; } - // Evaluate performance of explicit usage for "0" index. - if (QueueIndex != 0) { + if (Queue->Device->useDriverInOrderLists() && Queue->isInOrderQueue()) { + ZeCommandQueueDesc.flags = ZE_COMMAND_QUEUE_FLAG_IN_ORDER; + urPrint("Using in-order driver implementation\n"); + } else if (QueueIndex != 0) { + // Evaluate performance of explicit usage for "0" index. ZeCommandQueueDesc.flags = ZE_COMMAND_QUEUE_FLAG_EXPLICIT_ONLY; }