From ad529c3faaf39cf42ae2667cf9cef3d86bc8b90f Mon Sep 17 00:00:00 2001 From: Raiyan Latif Date: Thu, 29 Feb 2024 05:32:45 -0800 Subject: [PATCH] [L0] Add support for in-order lists using L0 driver Signed-off-by: Raiyan Latif --- source/adapters/level_zero/device.cpp | 13 ++++++ source/adapters/level_zero/device.hpp | 3 ++ source/adapters/level_zero/event.cpp | 59 +++++++++++++++++++++------ source/adapters/level_zero/kernel.cpp | 4 +- source/adapters/level_zero/queue.cpp | 10 ++++- 5 files changed, 72 insertions(+), 17 deletions(-) diff --git a/source/adapters/level_zero/device.cpp b/source/adapters/level_zero/device.cpp index 2f6b3a91ff..fcb4fd392c 100644 --- a/source/adapters/level_zero/device.cpp +++ b/source/adapters/level_zero/device.cpp @@ -1054,6 +1054,19 @@ bool ur_device_handle_t_::useRelaxedAllocationLimits() { return EnableRelaxedAllocationLimits; } +bool ur_device_handle_t_::useDriverInOrderLists() { + // Use in-order lists implementation from L0 driver instead + // of adapter's implementation. + static const bool UseDriverInOrderLists = [] { + const char *UrRet = std::getenv("UR_L0_USE_DRIVER_INORDER_LISTS"); + if (!UrRet) + return false; + return std::atoi(UrRet) != 0; + }(); + + return UseDriverInOrderLists; +} + ur_result_t ur_device_handle_t_::initialize(int SubSubDeviceOrdinal, int SubSubDeviceIndex) { // Maintain various device properties cache. diff --git a/source/adapters/level_zero/device.hpp b/source/adapters/level_zero/device.hpp index 94480336c5..a57a97d38d 100644 --- a/source/adapters/level_zero/device.hpp +++ b/source/adapters/level_zero/device.hpp @@ -143,6 +143,9 @@ struct ur_device_handle_t_ : _ur_object { // Read env settings to select immediate commandlist mode. ImmCmdlistMode useImmediateCommandLists(); + // Whether Adapter uses driver's implementation of in-order lists or not + bool useDriverInOrderLists(); + // Returns whether immediate command lists are used on this device. 
ImmCmdlistMode ImmCommandListUsed{}; diff --git a/source/adapters/level_zero/event.cpp b/source/adapters/level_zero/event.cpp index 57b839a714..856368d742 100644 --- a/source/adapters/level_zero/event.cpp +++ b/source/adapters/level_zero/event.cpp @@ -43,6 +43,19 @@ static const bool UseMultipleCmdlistBarriers = [] { return std::atoi(UseMultipleCmdlistBarriersFlag) > 0; }(); +bool WaitListEmptyOrAllEventsFromSameQueue( + ur_queue_handle_t Queue, uint32_t NumEventsInWaitList, + const ur_event_handle_t *EventWaitList) { + if (!NumEventsInWaitList) + return true; + + for (uint32_t i = 0; i < NumEventsInWaitList; ++i) + if (Queue != EventWaitList[i]->UrQueue) + return false; + + return true; +} + UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWait( ur_queue_handle_t Queue, ///< [in] handle of the queue object uint32_t NumEventsInWaitList, ///< [in] size of the event wait list @@ -206,21 +219,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWaitWithBarrier( bool IsInternal = OutEvent == nullptr; ur_event_handle_t *Event = OutEvent ? OutEvent : &InternalEvent; - auto WaitListEmptyOrAllEventsFromSameQueue = [Queue, NumEventsInWaitList, - EventWaitList]() { - if (!NumEventsInWaitList) - return true; - - for (uint32_t I = 0; I < NumEventsInWaitList; ++I) - if (Queue != EventWaitList[I]->UrQueue) - return false; - - return true; - }; - // For in-order queue and wait-list which is empty or has events from // the same queue just use the last command event as the barrier event. 
- if (Queue->isInOrderQueue() && WaitListEmptyOrAllEventsFromSameQueue() && + if (Queue->isInOrderQueue() && + WaitListEmptyOrAllEventsFromSameQueue(Queue, NumEventsInWaitList, + EventWaitList) && Queue->LastCommandEvent && !Queue->LastCommandEvent->IsDiscarded) { UR_CALL(urEventRetain(Queue->LastCommandEvent)); *Event = Queue->LastCommandEvent; @@ -1189,6 +1192,26 @@ ur_result_t _ur_ze_event_list_t::createAndRetainUrZeEventList( CurQueue->LastCommandEvent && CurQueue->LastCommandEvent->IsDiscarded) IncludeLastCommandEvent = false; + // If we are using L0 native implementation for handling in-order queues, + // then we don't need to add the last enqueued event into the waitlist, as + // the native driver implementation will already ensure in-order semantics. + // The only exception is when a different immediate command list was last + // used on the same UR Queue. + if (CurQueue->Device->useDriverInOrderLists() && CurQueue->isInOrderQueue()) { + if (CurQueue->UsingImmCmdLists) { + auto QueueGroup = CurQueue->getQueueGroup(UseCopyEngine); + uint32_t QueueGroupOrdinal, QueueIndex; + auto NextIndex = QueueGroup.getQueueIndex(&QueueGroupOrdinal, &QueueIndex, + /*QueryOnly */ true); + auto NextImmCmdList = QueueGroup.ImmCmdLists[NextIndex]; + IncludeLastCommandEvent &= + CurQueue->LastUsedCommandList != CurQueue->CommandListMap.end() && + NextImmCmdList != CurQueue->LastUsedCommandList; + } else { + IncludeLastCommandEvent = false; + } + } + try { uint32_t TmpListLength = 0; @@ -1205,6 +1228,16 @@ ur_result_t _ur_ze_event_list_t::createAndRetainUrZeEventList( this->UrEventList = new ur_event_handle_t[EventListLength]; } + // For an in-order queue whose wait-list is empty or has events only from + // the same queue, we don't need to wait on any additional events. + if (CurQueue->Device->useDriverInOrderLists() && + CurQueue->isInOrderQueue() && + WaitListEmptyOrAllEventsFromSameQueue(CurQueue, EventListLength, + EventList)) { + this->Length = TmpListLength; + return
UR_RESULT_SUCCESS; + } + if (EventListLength > 0) { for (uint32_t I = 0; I < EventListLength; I++) { { diff --git a/source/adapters/level_zero/kernel.cpp b/source/adapters/level_zero/kernel.cpp index 0e5ce3215a..c40e4ef0e3 100644 --- a/source/adapters/level_zero/kernel.cpp +++ b/source/adapters/level_zero/kernel.cpp @@ -214,8 +214,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( // the code can do a urKernelRelease on this kernel. (*Event)->CommandData = (void *)Kernel; - // Increment the reference count of the Kernel and indicate that the Kernel is - // in use. Once the event has been signalled, the code in + // Increment the reference count of the Kernel and indicate that the Kernel + // is in use. Once the event has been signalled, the code in // CleanupCompletedEvent(Event) will do a urKernelRelease to update the // reference count on the kernel, using the kernel saved in CommandData. UR_CALL(urKernelRetain(Kernel)); diff --git a/source/adapters/level_zero/queue.cpp b/source/adapters/level_zero/queue.cpp index 2009c3c6f5..832feb628c 100644 --- a/source/adapters/level_zero/queue.cpp +++ b/source/adapters/level_zero/queue.cpp @@ -1870,6 +1870,10 @@ ur_result_t ur_queue_handle_t_::createCommandList( ZeStruct ZeCommandListDesc; ZeCommandListDesc.commandQueueGroupOrdinal = QueueGroupOrdinal; + if (Device->useDriverInOrderLists() && isInOrderQueue()) { + ZeCommandListDesc.flags = ZE_COMMAND_LIST_FLAG_IN_ORDER; + } + ZE2UR_CALL(zeCommandListCreate, (Context->ZeContext, Device->ZeDevice, &ZeCommandListDesc, &ZeCommandList)); @@ -1983,8 +1987,10 @@ ur_command_list_ptr_t &ur_queue_handle_t_::ur_queue_group_t::getImmCmdList() { Priority = "High"; } - // Evaluate performance of explicit usage for "0" index. - if (QueueIndex != 0) { + if (Queue->Device->useDriverInOrderLists() && Queue->isInOrderQueue()) { + ZeCommandQueueDesc.flags = ZE_COMMAND_QUEUE_FLAG_IN_ORDER; + } else if (QueueIndex != 0) { + // Evaluate performance of explicit usage for "0" index. 
ZeCommandQueueDesc.flags = ZE_COMMAND_QUEUE_FLAG_EXPLICIT_ONLY; }