From d0a2e285ed4563033a1b239667bc0f31135fc39d Mon Sep 17 00:00:00 2001 From: Jaime Arteaga Date: Thu, 9 Nov 2023 12:07:08 -0800 Subject: [PATCH] [UR][L0] Add support for in-order lists Resolves: #941 Signed-off-by: Jaime Arteaga --- source/adapters/level_zero/command_buffer.cpp | 2 +- source/adapters/level_zero/common.hpp | 9 +++ source/adapters/level_zero/event.cpp | 64 ++++++++++++++++--- source/adapters/level_zero/event.hpp | 9 +-- source/adapters/level_zero/kernel.cpp | 33 +++++++++- source/adapters/level_zero/memory.cpp | 64 +++++++++++++++---- source/adapters/level_zero/queue.cpp | 25 +++++++- 7 files changed, 177 insertions(+), 29 deletions(-) mode change 100755 => 100644 source/adapters/level_zero/queue.cpp diff --git a/source/adapters/level_zero/command_buffer.cpp b/source/adapters/level_zero/command_buffer.cpp index 7ba3cfae4d..2e886dff7a 100644 --- a/source/adapters/level_zero/command_buffer.cpp +++ b/source/adapters/level_zero/command_buffer.cpp @@ -725,7 +725,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp( if (NumEventsInWaitList) { _ur_ze_event_list_t TmpWaitList; UR_CALL(TmpWaitList.createAndRetainUrZeEventList( - NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine)); + NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine, nullptr)); UR_CALL(Queue->Context->getAvailableCommandList(Queue, WaitCommandList, false, false)) diff --git a/source/adapters/level_zero/common.hpp b/source/adapters/level_zero/common.hpp index 7c2ac7f8be..31b90c0e4a 100644 --- a/source/adapters/level_zero/common.hpp +++ b/source/adapters/level_zero/common.hpp @@ -426,6 +426,15 @@ const bool ExposeCSliceInAffinityPartitioning = [] { return Flag ? std::atoi(Flag) != 0 : false; }(); +// Use in-order lists implementation from L0 driver instead +// of adapter's implementation. +const bool UseDriverInOrderLists = [] { + const char *UrRet = std::getenv("UR_L0_USE_DRIVER_INORDER_LISTS"); + if (!UrRet) + return false; + return std::atoi(UrRet) != 0; +}(); + // TODO: make it into a ur_device_handle_t class member const std::pair getRangeOfAllowedCopyEngines(const ur_device_handle_t &Device); diff --git a/source/adapters/level_zero/event.cpp b/source/adapters/level_zero/event.cpp index b979c8ab15..5e1f744de9 100644 --- a/source/adapters/level_zero/event.cpp +++ b/source/adapters/level_zero/event.cpp @@ -63,7 +63,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWait( _ur_ze_event_list_t TmpWaitList = {}; UR_CALL(TmpWaitList.createAndRetainUrZeEventList( - NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine)); + NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine, nullptr)); // Get a new command list to be used on this call ur_command_list_ptr_t CommandList{}; @@ -235,7 +235,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWaitWithBarrier( // Retain the events as they will be owned by the result event. _ur_ze_event_list_t TmpWaitList; UR_CALL(TmpWaitList.createAndRetainUrZeEventList( - NumEventsInWaitList, EventWaitList, Queue, false /*UseCopyEngine=*/)); + NumEventsInWaitList, EventWaitList, Queue, false /*UseCopyEngine=*/, + nullptr)); // Get an arbitrary command-list in the queue. ur_command_list_ptr_t CmdList; @@ -333,7 +334,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWaitWithBarrier( UR_CALL(BaseWaitList.createAndRetainUrZeEventList( EventWaitVector.size(), reinterpret_cast(EventWaitVector.data()), - Queue, ConvergenceCmdList->second.isCopy(Queue))); + Queue, ConvergenceCmdList->second.isCopy(Queue), nullptr)); // Insert a barrier with the events from each command-queue into the // convergence command list. The resulting event signals the convergence of @@ -1039,12 +1040,18 @@ ur_result_t ur_event_handle_t_::reset() { ur_result_t _ur_ze_event_list_t::createAndRetainUrZeEventList( uint32_t EventListLength, const ur_event_handle_t *EventList, - ur_queue_handle_t CurQueue, bool UseCopyEngine) { + ur_queue_handle_t CurQueue, bool UseCopyEngine, + ur_event_handle_t *LastCommandEventIncluded) { this->Length = 0; this->ZeEventList = nullptr; this->UrEventList = nullptr; if (CurQueue->isInOrderQueue() && CurQueue->LastCommandEvent != nullptr) { + +#if 0 + printf("%s %d CurQueue->LastCommandEvent 0x%lx\n", __FILE__, __LINE__, + (unsigned long int)CurQueue->LastCommandEvent->ZeEvent); +#endif if (CurQueue->UsingImmCmdLists) { if (ReuseDiscardedEvents && CurQueue->isDiscardEvents()) { // If queue is in-order with discarded events and if @@ -1086,6 +1093,32 @@ ur_result_t _ur_ze_event_list_t::createAndRetainUrZeEventList( bool IncludeLastCommandEvent = CurQueue->isInOrderQueue() && CurQueue->LastCommandEvent != nullptr; +#if 0 + if (UseDriverInOrderLists) { + IncludeLastCommandEvent &= + CurQueue->LastUsedCommandList != CurQueue->CommandListMap.end() && + CommandList->first != CurQueue->LastUsedCommandList->first; + } +#endif + +#if 0 + if (CurQueue->LastUsedCommandList != CurQueue->CommandListMap.end() && CurQueue->LastCommandEvent) { + printf("%s %d UseDriverInOrderLists %d CommandList->first 0x%lx CurQueue->LastUsedCommandList->first 0x%lx CurQueue->LastCommandEvent->ZeEvent 0x%lx\n", + __FILE__, + __LINE__, + UseDriverInOrderLists, + (unsigned long int)CommandList->first, + (unsigned long int)CurQueue->LastUsedCommandList->first, + (unsigned long int)CurQueue->LastCommandEvent->ZeEvent + ); + } + + printf("%s %d IncludeLastCommandEvent %d\n", + __FILE__, + __LINE__, + IncludeLastCommandEvent); +#endif + // If the last event is discarded then we already have a barrier waiting for // that event, so must not include the last command event into the wait // list because it will cause waiting for event which was reset. @@ -1097,12 +1130,27 @@ ur_result_t _ur_ze_event_list_t::createAndRetainUrZeEventList( uint32_t TmpListLength = 0; if (IncludeLastCommandEvent) { +#if 0 + printf("JAIME %s %d CurQueue->isInOrderQueue() %d, IncludeLastCommandEvent %d CurQueue 0x%lx LastCommandEvent 0x%lx\n", __FILE__, __LINE__, + CurQueue->isInOrderQueue(), + IncludeLastCommandEvent, + (unsigned long int)CurQueue, + (unsigned long int)CurQueue->LastCommandEvent); +#endif + this->ZeEventList = new ze_event_handle_t[EventListLength + 1]; this->UrEventList = new ur_event_handle_t[EventListLength + 1]; - std::shared_lock Lock(CurQueue->LastCommandEvent->Mutex); - this->ZeEventList[0] = CurQueue->LastCommandEvent->ZeEvent; - this->UrEventList[0] = CurQueue->LastCommandEvent; - this->UrEventList[0]->RefCount.increment(); + if (LastCommandEventIncluded) { + *LastCommandEventIncluded = CurQueue->LastCommandEvent; + // printf("JAIME %s %d IncludeLastCommandEvent %d\n", __FILE__, + // __LINE__, IncludeLastCommandEvent); + } else { + std::shared_lock Lock( + CurQueue->LastCommandEvent->Mutex); + this->ZeEventList[0] = CurQueue->LastCommandEvent->ZeEvent; + this->UrEventList[0] = CurQueue->LastCommandEvent; + this->UrEventList[0]->RefCount.increment(); + } TmpListLength = 1; } else if (EventListLength > 0) { this->ZeEventList = new ze_event_handle_t[EventListLength]; diff --git a/source/adapters/level_zero/event.hpp b/source/adapters/level_zero/event.hpp index d4e975012c..2144da32ef 100644 --- a/source/adapters/level_zero/event.hpp +++ b/source/adapters/level_zero/event.hpp @@ -84,10 +84,11 @@ struct _ur_ze_event_list_t { // event wait-list is for) is going to go to copy or compute // queue. This is used to properly submit the dependent open // command-lists. - ur_result_t createAndRetainUrZeEventList(uint32_t EventListLength, - const ur_event_handle_t *EventList, - ur_queue_handle_t CurQueue, - bool UseCopyEngine); + ur_result_t + createAndRetainUrZeEventList(uint32_t EventListLength, + const ur_event_handle_t *EventList, + ur_queue_handle_t CurQueue, bool UseCopyEngine, + ur_event_handle_t *LastCommandEventIncluded); // Add all the events in this object's UrEventList to the end // of the list EventsToBeReleased. Destroy ur_ze_event_list_t data diff --git a/source/adapters/level_zero/kernel.cpp b/source/adapters/level_zero/kernel.cpp index dfa8915197..bcb75cbf4b 100644 --- a/source/adapters/level_zero/kernel.cpp +++ b/source/adapters/level_zero/kernel.cpp @@ -177,16 +177,43 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( ZE2UR_CALL(zeKernelSetGroupSize, (Kernel->ZeKernel, WG[0], WG[1], WG[2])); + // Get a new command list to be used on this call + ur_command_list_ptr_t CommandList{}; + bool UseCopyEngine = false; + ur_event_handle_t LastCommandEventIncluded = nullptr; _ur_ze_event_list_t TmpWaitList; UR_CALL(TmpWaitList.createAndRetainUrZeEventList( - NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine)); + NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine, + &LastCommandEventIncluded)); - // Get a new command list to be used on this call - ur_command_list_ptr_t CommandList{}; UR_CALL(Queue->Context->getAvailableCommandList( Queue, CommandList, UseCopyEngine, true /* AllowBatching */)); + bool IncludeLastCommandEvent = (LastCommandEventIncluded != nullptr); + if (UseDriverInOrderLists) { + IncludeLastCommandEvent &= + Queue->LastUsedCommandList != Queue->CommandListMap.end() && + CommandList->first != Queue->LastUsedCommandList->first; + } + + // printf("JAIME %s %d LastCommandEventIncluded 0x%lx TmpWaitList.Length %d + // Queue->LastCommandEvent %lx\n", + // __FILE__, __LINE__, + // (unsigned long int)LastCommandEventIncluded, + // TmpWaitList.Length, + // (unsigned long int)Queue->LastCommandEvent); + + if (IncludeLastCommandEvent) { + std::shared_lock Lock(LastCommandEventIncluded->Mutex); + TmpWaitList.ZeEventList[TmpWaitList.Length - 1] = + LastCommandEventIncluded->ZeEvent; + TmpWaitList.UrEventList[TmpWaitList.Length - 1] = LastCommandEventIncluded; + TmpWaitList.UrEventList[TmpWaitList.Length - 1]->RefCount.increment(); + } else if (TmpWaitList.Length > 0) { + TmpWaitList.Length--; + } + ze_event_handle_t ZeEvent = nullptr; ur_event_handle_t InternalEvent{}; bool IsInternal = OutEvent == nullptr; diff --git a/source/adapters/level_zero/memory.cpp b/source/adapters/level_zero/memory.cpp index aefa661dac..f0100c4ca0 100644 --- a/source/adapters/level_zero/memory.cpp +++ b/source/adapters/level_zero/memory.cpp @@ -50,18 +50,59 @@ ur_result_t enqueueMemCopyHelper(ur_command_t CommandType, bool PreferCopyEngine) { bool UseCopyEngine = Queue->useCopyEngine(PreferCopyEngine); + // We want to batch these commands to avoid extra submissions (costly) + bool OkToBatch = true; + ur_command_list_ptr_t CommandList{}; + ur_event_handle_t LastCommandEventIncluded = nullptr; _ur_ze_event_list_t TmpWaitList; UR_CALL(TmpWaitList.createAndRetainUrZeEventList( - NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine)); + NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine, + &LastCommandEventIncluded)); - // We want to batch these commands to avoid extra submissions (costly) - bool OkToBatch = true; + // printf("JAIME %s %d Queue 0x%lx LastCommandEvent 0x%lx\n", + // __FILE__, __LINE__, + // (unsigned long int)Queue, + // (unsigned long int)Queue->LastCommandEvent); // Get a new command list to be used on this call - ur_command_list_ptr_t CommandList{}; UR_CALL(Queue->Context->getAvailableCommandList(Queue, CommandList, UseCopyEngine, OkToBatch)); + // printf("JAIME %s %d Queue 0x%lx LastCommandEvent 0x%lx\n", + // __FILE__, __LINE__, + // (unsigned long int)Queue, + // (unsigned long int)Queue->LastCommandEvent); + + // printf("JAIME %s %d Queue %lx LastCommandEvent %lx\n", + // __FILE__, __LINE__, + // (unsigned long int)Queue, + // (unsigned long int)Queue->LastCommandEvent); + + bool IncludeLastCommandEvent = (LastCommandEventIncluded != nullptr); + if (UseDriverInOrderLists) { + IncludeLastCommandEvent &= + Queue->LastUsedCommandList != Queue->CommandListMap.end() && + CommandList->first != Queue->LastUsedCommandList->first; + } + + // printf("JAIME %s %d LastCommandEventIncluded 0x%lx TmpWaitList.Length %d + // Queue %lx LastCommandEvent %lx\n", + // __FILE__, __LINE__, + // (unsigned long int)LastCommandEventIncluded, + // TmpWaitList.Length, + // (unsigned long int)Queue, + // (unsigned long int)Queue->LastCommandEvent); + + if (IncludeLastCommandEvent) { + std::shared_lock Lock(LastCommandEventIncluded->Mutex); + TmpWaitList.ZeEventList[TmpWaitList.Length - 1] = + LastCommandEventIncluded->ZeEvent; + TmpWaitList.UrEventList[TmpWaitList.Length - 1] = LastCommandEventIncluded; + TmpWaitList.UrEventList[TmpWaitList.Length - 1]->RefCount.increment(); + } else if (TmpWaitList.Length > 0) { + TmpWaitList.Length--; + } + ze_event_handle_t ZeEvent = nullptr; ur_event_handle_t InternalEvent; bool IsInternal = OutEvent == nullptr; @@ -102,7 +143,7 @@ ur_result_t enqueueMemCopyRectHelper( _ur_ze_event_list_t TmpWaitList; UR_CALL(TmpWaitList.createAndRetainUrZeEventList( - NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine)); + NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine, nullptr)); // We want to batch these commands to avoid extra submissions (costly) bool OkToBatch = true; @@ -214,7 +255,7 @@ static ur_result_t enqueueMemFillHelper(ur_command_t CommandType, _ur_ze_event_list_t TmpWaitList; UR_CALL(TmpWaitList.createAndRetainUrZeEventList( - NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine)); + NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine, nullptr)); ur_command_list_ptr_t CommandList{}; // We want to batch these commands to avoid extra submissions (costly) @@ -346,7 +387,7 @@ static ur_result_t enqueueMemImageCommandHelper( _ur_ze_event_list_t TmpWaitList; UR_CALL(TmpWaitList.createAndRetainUrZeEventList( - NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine)); + NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine, nullptr)); // We want to batch these commands to avoid extra submissions (costly) bool OkToBatch = true; @@ -909,7 +950,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferMap( _ur_ze_event_list_t TmpWaitList; UR_CALL(TmpWaitList.createAndRetainUrZeEventList( - NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine)); + NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine, nullptr)); UR_CALL( createEventAndAssociateQueue(Queue, Event, UR_COMMAND_MEM_BUFFER_MAP, @@ -931,7 +972,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferMap( } else if (MapFlags & UR_MAP_FLAG_WRITE) AccessMode = ur_mem_handle_t_::write_only; } - UR_ASSERT(AccessMode != ur_mem_handle_t_::unknown, UR_RESULT_ERROR_INVALID_VALUE); @@ -1067,7 +1107,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemUnmap( _ur_ze_event_list_t TmpWaitList; UR_CALL(TmpWaitList.createAndRetainUrZeEventList( - NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine)); + NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine, nullptr)); UR_CALL(createEventAndAssociateQueue(Queue, Event, UR_COMMAND_MEM_UNMAP, Queue->CommandListMap.end(), @@ -1247,7 +1287,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMPrefetch( // _ur_ze_event_list_t TmpWaitList; UR_CALL(TmpWaitList.createAndRetainUrZeEventList( - NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine)); + NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine, nullptr)); // Get a new command list to be used on this call ur_command_list_ptr_t CommandList{}; @@ -1302,7 +1342,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMAdvise( _ur_ze_event_list_t TmpWaitList; UR_CALL(TmpWaitList.createAndRetainUrZeEventList(0, nullptr, Queue, - UseCopyEngine)); + UseCopyEngine, nullptr)); // Get a new command list to be used on this call ur_command_list_ptr_t CommandList{}; diff --git a/source/adapters/level_zero/queue.cpp b/source/adapters/level_zero/queue.cpp old mode 100755 new mode 100644 index 994f595a5d..d69dfebcb3 --- a/source/adapters/level_zero/queue.cpp +++ b/source/adapters/level_zero/queue.cpp @@ -650,9 +650,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueCreateWithNativeHandle( UR_APIEXPORT ur_result_t UR_APICALL urQueueFinish( ur_queue_handle_t UrQueue ///< [in] handle of the queue to be finished. ) { + // fprintf(stderr, "JAIME %s %d\n", __FILE__, __LINE__); if (UrQueue->UsingImmCmdLists) { // Lock automatically releases when this goes out of scope. std::scoped_lock Lock(UrQueue->Mutex); + // fprintf(stderr, "JAIME %s %d\n", __FILE__, __LINE__); UrQueue->synchronize(); } else { @@ -1398,20 +1400,29 @@ ur_result_t CleanupEventListFromResetCmdList( // runtime. Need to investigate whether relase can be done earlier, at sync // points such as this, to reduce total number of active Events. ur_result_t ur_queue_handle_t_::synchronize() { + // fprintf(stderr, "JAIME %s %d\n", __FILE__, __LINE__); if (!Healthy) return UR_RESULT_SUCCESS; + // fprintf(stderr, "JAIME %s %d\n", __FILE__, __LINE__); + auto syncImmCmdList = [](ur_queue_handle_t_ *Queue, ur_command_list_ptr_t ImmCmdList) { + // fprintf(stderr, "JAIME %s %d\n", __FILE__, __LINE__); if (ImmCmdList == Queue->CommandListMap.end()) return UR_RESULT_SUCCESS; + // fprintf(stderr, "JAIME %s %d ImmCmdList->first 0x%lx\n", __FILE__, + // __LINE__, (unsigned long int)ImmCmdList->first); // wait for all commands previously submitted to this immediate command list ZE2UR_CALL(zeCommandListHostSynchronize, (ImmCmdList->first, UINT64_MAX)); + // fprintf(stderr, "JAIME %s %d\n", __FILE__, __LINE__); // Cleanup all events from the synced command list. CleanupEventListFromResetCmdList(ImmCmdList->second.EventList, true); + // fprintf(stderr, "JAIME %s %d\n", __FILE__, __LINE__); ImmCmdList->second.EventList.clear(); + // fprintf(stderr, "JAIME %s %d\n", __FILE__, __LINE__); return UR_RESULT_SUCCESS; }; @@ -1421,10 +1432,12 @@ ur_result_t ur_queue_handle_t_::synchronize() { // zero handle can have device scope, so we can't synchronize the last // event. if (isInOrderQueue() && !LastCommandEvent->IsDiscarded) { + // fprintf(stderr, "JAIME %s %d\n", __FILE__, __LINE__); ZE2UR_CALL(zeHostSynchronize, (LastCommandEvent->ZeEvent)); // clean up all events known to have been completed as well, // so they can be reused later + // fprintf(stderr, "JAIME %s %d\n", __FILE__, __LINE__); for (auto &QueueMap : {ComputeQueueGroupsByTID, CopyQueueGroupsByTID}) { for (auto &QueueGroup : QueueMap) { if (UsingImmCmdLists) { @@ -1439,7 +1452,9 @@ ur_result_t ur_queue_handle_t_::synchronize() { } } } + // fprintf(stderr, "JAIME %s %d\n", __FILE__, __LINE__); } else { + // fprintf(stderr, "JAIME %s %d\n", __FILE__, __LINE__); // Otherwise sync all L0 queues/immediate command-lists. for (auto &QueueMap : {ComputeQueueGroupsByTID, CopyQueueGroupsByTID}) { for (auto &QueueGroup : QueueMap) { @@ -1453,14 +1468,17 @@ ur_result_t ur_queue_handle_t_::synchronize() { } } } + // fprintf(stderr, "JAIME %s %d\n", __FILE__, __LINE__); } LastCommandEvent = nullptr; } + // fprintf(stderr, "JAIME %s %d\n", __FILE__, __LINE__); // With the entire queue synchronized, the active barriers must be done so we // can remove them. if (auto Res = ActiveBarriers.clear()) return Res; + // fprintf(stderr, "JAIME %s %d\n", __FILE__, __LINE__); return UR_RESULT_SUCCESS; } @@ -1885,7 +1903,7 @@ ur_queue_handle_t_::insertActiveBarriers(ur_command_list_ptr_t &CmdList, _ur_ze_event_list_t ActiveBarriersWaitList; UR_CALL(ActiveBarriersWaitList.createAndRetainUrZeEventList( ActiveBarriers.vector().size(), ActiveBarriers.vector().data(), - reinterpret_cast(this), UseCopyEngine)); + reinterpret_cast(this), UseCopyEngine, nullptr)); // We can now replace active barriers with the ones in the wait list. UR_CALL(ActiveBarriers.clear()); @@ -2004,6 +2022,11 @@ ur_command_list_ptr_t &ur_queue_handle_t_::ur_queue_group_t::getImmCmdList() { } } + if (UseDriverInOrderLists && Queue->isInOrderQueue()) { + ZeCommandQueueDesc.flags = ZE_COMMAND_QUEUE_FLAG_IN_ORDER; + urPrint("Using in-order driver implementation\n"); + } + // If cache didn't contain a command list, create one. if (!ZeCommandList) { urPrint("[getZeQueue]: create queue ordinal = %d, index = %d "