diff --git a/source/adapters/level_zero/device.cpp b/source/adapters/level_zero/device.cpp
index 2f6b3a91ff..6e435b5a65 100644
--- a/source/adapters/level_zero/device.cpp
+++ b/source/adapters/level_zero/device.cpp
@@ -1054,6 +1054,22 @@ bool ur_device_handle_t_::useRelaxedAllocationLimits() {
   return EnableRelaxedAllocationLimits;
 }
 
+// Reports whether the adapter should use the L0 driver's in-order command
+// lists instead of its own in-order implementation. This is opt-in: it is
+// enabled only when the UR_L0_USE_DRIVER_INORDER_LISTS environment variable
+// parses (via atoi) to a non-zero value. The variable is read once, on the
+// first call, and the result is cached for the rest of the process.
+bool ur_device_handle_t_::useDriverInOrderLists() {
+  static const bool UseDriverInOrderLists = [] {
+    const char *UrRet = std::getenv("UR_L0_USE_DRIVER_INORDER_LISTS");
+    if (!UrRet)
+      return false;
+    return std::atoi(UrRet) != 0;
+  }();
+
+  return UseDriverInOrderLists;
+}
+
 ur_result_t ur_device_handle_t_::initialize(int SubSubDeviceOrdinal,
                                             int SubSubDeviceIndex) {
   // Maintain various device properties cache.
diff --git a/source/adapters/level_zero/device.hpp b/source/adapters/level_zero/device.hpp
index 94480336c5..a57a97d38d 100644
--- a/source/adapters/level_zero/device.hpp
+++ b/source/adapters/level_zero/device.hpp
@@ -143,6 +143,9 @@ struct ur_device_handle_t_ : _ur_object {
   // Read env settings to select immediate commandlist mode.
   ImmCmdlistMode useImmediateCommandLists();
 
+  // Whether Adapter uses driver's implementation of in-order lists or not
+  bool useDriverInOrderLists();
+
   // Returns whether immediate command lists are used on this device.
   ImmCmdlistMode ImmCommandListUsed{};
 
diff --git a/source/adapters/level_zero/event.cpp b/source/adapters/level_zero/event.cpp
index 57b839a714..c8f147f5a8 100644
--- a/source/adapters/level_zero/event.cpp
+++ b/source/adapters/level_zero/event.cpp
@@ -1189,6 +1189,26 @@ ur_result_t _ur_ze_event_list_t::createAndRetainUrZeEventList(
       CurQueue->LastCommandEvent && CurQueue->LastCommandEvent->IsDiscarded)
     IncludeLastCommandEvent = false;
 
+  // If we are using L0 native implementation for handling in-order queues,
+  // then we don't need to add the last enqueued event into the waitlist, as
+  // the native driver implementation will already ensure in-order semantics.
+  // The only exception is when a different immediate command was last used on
+  // the same UR Queue.
+  if (CurQueue->Device->useDriverInOrderLists()) {
+    if (CurQueue->UsingImmCmdLists) {
+      auto QueueGroup = CurQueue->getQueueGroup(UseCopyEngine);
+      uint32_t QueueGroupOrdinal, QueueIndex;
+      auto NextIndex = QueueGroup.getQueueIndex(&QueueGroupOrdinal, &QueueIndex,
+                                                /*QueryOnly */ true);
+      auto NextImmCmdList = QueueGroup.ImmCmdLists[NextIndex];
+      IncludeLastCommandEvent &=
+          CurQueue->LastUsedCommandList != CurQueue->CommandListMap.end() &&
+          NextImmCmdList != CurQueue->LastUsedCommandList;
+    } else {
+      IncludeLastCommandEvent = false;
+    }
+  }
+
   try {
     uint32_t TmpListLength = 0;
 
@@ -1205,6 +1225,25 @@ ur_result_t _ur_ze_event_list_t::createAndRetainUrZeEventList(
       this->UrEventList = new ur_event_handle_t[EventListLength];
     }
 
+    auto WaitListEmptyOrAllEventsFromSameQueue = [CurQueue, EventListLength,
+                                                  EventList]() {
+      if (!EventListLength)
+        return true;
+      for (uint32_t i = 0; i < EventListLength; ++i) {
+        if (CurQueue != EventList[i]->UrQueue)
+          return false;
+      }
+      return true;
+    };
+
+    // For in-order queue and wait-list which is empty or has events only from
+    // the same queue then we don't need to wait on any other additional events
+    if (CurQueue->Device->useDriverInOrderLists() &&
+        CurQueue->isInOrderQueue() && WaitListEmptyOrAllEventsFromSameQueue()) {
+      this->Length = TmpListLength;
+      return UR_RESULT_SUCCESS;
+    }
+
     if (EventListLength > 0) {
       for (uint32_t I = 0; I < EventListLength; I++) {
         {
diff --git a/source/adapters/level_zero/kernel.cpp b/source/adapters/level_zero/kernel.cpp
index 0e5ce3215a..c40e4ef0e3 100644
--- a/source/adapters/level_zero/kernel.cpp
+++ b/source/adapters/level_zero/kernel.cpp
@@ -214,8 +214,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
   // the code can do a urKernelRelease on this kernel.
   (*Event)->CommandData = (void *)Kernel;
 
-  // Increment the reference count of the Kernel and indicate that the Kernel is
-  // in use. Once the event has been signalled, the code in
+  // Increment the reference count of the Kernel and indicate that the Kernel
+  // is in use. Once the event has been signalled, the code in
   // CleanupCompletedEvent(Event) will do a urKernelRelease to update the
   // reference count on the kernel, using the kernel saved in CommandData.
   UR_CALL(urKernelRetain(Kernel));
diff --git a/source/adapters/level_zero/queue.cpp b/source/adapters/level_zero/queue.cpp
index 2009c3c6f5..832feb628c 100644
--- a/source/adapters/level_zero/queue.cpp
+++ b/source/adapters/level_zero/queue.cpp
@@ -1870,6 +1870,10 @@ ur_result_t ur_queue_handle_t_::createCommandList(
   ZeStruct ZeCommandListDesc;
   ZeCommandListDesc.commandQueueGroupOrdinal = QueueGroupOrdinal;
 
+  if (Device->useDriverInOrderLists() && isInOrderQueue()) {
+    ZeCommandListDesc.flags = ZE_COMMAND_LIST_FLAG_IN_ORDER;
+  }
+
   ZE2UR_CALL(zeCommandListCreate, (Context->ZeContext, Device->ZeDevice,
                                    &ZeCommandListDesc, &ZeCommandList));
 
@@ -1983,8 +1987,10 @@ ur_command_list_ptr_t &ur_queue_handle_t_::ur_queue_group_t::getImmCmdList() {
     Priority = "High";
   }
 
-  // Evaluate performance of explicit usage for "0" index.
-  if (QueueIndex != 0) {
+  if (Queue->Device->useDriverInOrderLists() && Queue->isInOrderQueue()) {
+    ZeCommandQueueDesc.flags = ZE_COMMAND_QUEUE_FLAG_IN_ORDER;
+  } else if (QueueIndex != 0) {
+    // Evaluate performance of explicit usage for "0" index.
     ZeCommandQueueDesc.flags = ZE_COMMAND_QUEUE_FLAG_EXPLICIT_ONLY;
   }
 