Skip to content

Commit

Permalink
[UR][L0] Add support for in-order lists (2/N)
Browse files Browse the repository at this point in the history
Avoid events in appendLaunchKernels for in-order lists

Resolves: #941

Signed-off-by: Jaime Arteaga <jaime.a.arteaga.molina@intel.com>
  • Loading branch information
Jaime Arteaga committed Nov 23, 2023
1 parent 6100c83 commit 41bd7ee
Show file tree
Hide file tree
Showing 7 changed files with 85 additions and 44 deletions.
2 changes: 1 addition & 1 deletion source/adapters/level_zero/command_buffer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -725,7 +725,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp(
if (NumEventsInWaitList) {
_ur_ze_event_list_t TmpWaitList;
UR_CALL(TmpWaitList.createAndRetainUrZeEventList(
NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine));
NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine, nullptr));

UR_CALL(Queue->Context->getAvailableCommandList(Queue, WaitCommandList,
false, false))
Expand Down
10 changes: 6 additions & 4 deletions source/adapters/level_zero/context.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -400,10 +400,12 @@ ur_result_t ur_context_handle_t_::finalize() {
std::scoped_lock<ur_mutex> Lock(EventCacheMutex);
for (auto &EventCache : EventCaches) {
for (auto &Event : EventCache) {
auto ZeResult = ZE_CALL_NOCHECK(zeEventDestroy, (Event->ZeEvent));
// Gracefully handle the case that L0 was already unloaded.
if (ZeResult && ZeResult != ZE_RESULT_ERROR_UNINITIALIZED)
return ze2urResult(ZeResult);
if (Event->ZeEvent) {
auto ZeResult = ZE_CALL_NOCHECK(zeEventDestroy, (Event->ZeEvent));
// Gracefully handle the case that L0 was already unloaded.
if (ZeResult && ZeResult != ZE_RESULT_ERROR_UNINITIALIZED)
return ze2urResult(ZeResult);
}
delete Event;
}
EventCache.clear();
Expand Down
17 changes: 12 additions & 5 deletions source/adapters/level_zero/event.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWait(

_ur_ze_event_list_t TmpWaitList = {};
UR_CALL(TmpWaitList.createAndRetainUrZeEventList(
NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine));
NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine, nullptr));

// Get a new command list to be used on this call
ur_command_list_ptr_t CommandList{};
Expand Down Expand Up @@ -235,7 +235,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWaitWithBarrier(
// Retain the events as they will be owned by the result event.
_ur_ze_event_list_t TmpWaitList;
UR_CALL(TmpWaitList.createAndRetainUrZeEventList(
NumEventsInWaitList, EventWaitList, Queue, false /*UseCopyEngine=*/));
NumEventsInWaitList, EventWaitList, Queue, false /*UseCopyEngine=*/,
nullptr));

// Get an arbitrary command-list in the queue.
ur_command_list_ptr_t CmdList;
Expand Down Expand Up @@ -333,7 +334,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWaitWithBarrier(
UR_CALL(BaseWaitList.createAndRetainUrZeEventList(
EventWaitVector.size(),
reinterpret_cast<const ur_event_handle_t *>(EventWaitVector.data()),
Queue, ConvergenceCmdList->second.isCopy(Queue)));
Queue, ConvergenceCmdList->second.isCopy(Queue), nullptr));

// Insert a barrier with the events from each command-queue into the
// convergence command list. The resulting event signals the convergence of
Expand Down Expand Up @@ -1033,13 +1034,16 @@ ur_result_t ur_event_handle_t_::reset() {
if (!isHostVisible())
HostVisibleEvent = nullptr;

ZE2UR_CALL(zeEventHostReset, (ZeEvent));
if (ZeEvent) {
ZE2UR_CALL(zeEventHostReset, (ZeEvent));
}
return UR_RESULT_SUCCESS;
}

ur_result_t _ur_ze_event_list_t::createAndRetainUrZeEventList(
uint32_t EventListLength, const ur_event_handle_t *EventList,
ur_queue_handle_t CurQueue, bool UseCopyEngine) {
ur_queue_handle_t CurQueue, bool UseCopyEngine,
bool *LastCommandEventIncluded) {
this->Length = 0;
this->ZeEventList = nullptr;
this->UrEventList = nullptr;
Expand Down Expand Up @@ -1099,6 +1103,9 @@ ur_result_t _ur_ze_event_list_t::createAndRetainUrZeEventList(
if (IncludeLastCommandEvent) {
this->ZeEventList = new ze_event_handle_t[EventListLength + 1];
this->UrEventList = new ur_event_handle_t[EventListLength + 1];
if (LastCommandEventIncluded) {
*LastCommandEventIncluded = true;
}
std::shared_lock<ur_shared_mutex> Lock(CurQueue->LastCommandEvent->Mutex);
this->ZeEventList[0] = CurQueue->LastCommandEvent->ZeEvent;
this->UrEventList[0] = CurQueue->LastCommandEvent;
Expand Down
3 changes: 2 additions & 1 deletion source/adapters/level_zero/event.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,8 @@ struct _ur_ze_event_list_t {
ur_result_t createAndRetainUrZeEventList(uint32_t EventListLength,
const ur_event_handle_t *EventList,
ur_queue_handle_t CurQueue,
bool UseCopyEngine);
bool UseCopyEngine,
bool *LastCommandEventIncluded);

// Add all the events in this object's UrEventList to the end
// of the list EventsToBeReleased. Destroy ur_ze_event_list_t data
Expand Down
74 changes: 52 additions & 22 deletions source/adapters/level_zero/kernel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -178,34 +178,62 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
ZE2UR_CALL(zeKernelSetGroupSize, (Kernel->ZeKernel, WG[0], WG[1], WG[2]));

bool UseCopyEngine = false;
bool LastCommandEventIncluded = false;
_ur_ze_event_list_t TmpWaitList;
UR_CALL(TmpWaitList.createAndRetainUrZeEventList(
NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine));
NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine,
&LastCommandEventIncluded));

// Get a new command list to be used on this call
ur_command_list_ptr_t CommandList{};
UR_CALL(Queue->Context->getAvailableCommandList(
Queue, CommandList, UseCopyEngine, true /* AllowBatching */));

uint32_t WaitListLength = TmpWaitList.Length;
uint32_t WaitEventOffset = 0;

if (Queue->Device->useDriverInOrderLists() && Queue->isInOrderQueue()) {
LastCommandEventIncluded &=
Queue->LastUsedCommandList != Queue->CommandListMap.end() &&
CommandList->first != Queue->LastUsedCommandList->first;

if (!LastCommandEventIncluded && WaitListLength > 0) {
WaitListLength--;
WaitEventOffset = 1;
if (Queue->LastCommandEvent && Queue->LastCommandEvent->ZeEvent) {
ZE2UR_CALL(zeEventDestroy, (Queue->LastCommandEvent->ZeEvent));
Queue->LastCommandEvent->ZeEvent = nullptr;
}
}
}

ze_event_handle_t ZeEvent = nullptr;
ur_event_handle_t InternalEvent{};
bool IsInternal = OutEvent == nullptr;
ur_event_handle_t *Event = OutEvent ? OutEvent : &InternalEvent;
ur_event_handle_t *Event = nullptr;

bool SkipOutputEventCreate =
(Queue->Device->useDriverInOrderLists() && Queue->isInOrderQueue() &&
WaitListLength == 0 && IsInternal);

UR_CALL(createEventAndAssociateQueue(Queue, Event, UR_COMMAND_KERNEL_LAUNCH,
CommandList, IsInternal));
ZeEvent = (*Event)->ZeEvent;
(*Event)->WaitList = TmpWaitList;
if (!SkipOutputEventCreate) {
Event = OutEvent ? OutEvent : &InternalEvent;

// Save the kernel in the event, so that when the event is signalled
// the code can do a urKernelRelease on this kernel.
(*Event)->CommandData = (void *)Kernel;
UR_CALL(createEventAndAssociateQueue(Queue, Event, UR_COMMAND_KERNEL_LAUNCH,
CommandList, IsInternal));
ZeEvent = (*Event)->ZeEvent;
(*Event)->WaitList = TmpWaitList;

// Increment the reference count of the Kernel and indicate that the Kernel is
// in use. Once the event has been signalled, the code in
// CleanupCompletedEvent(Event) will do a urKernelRelease to update the
// reference count on the kernel, using the kernel saved in CommandData.
UR_CALL(urKernelRetain(Kernel));
// Save the kernel in the event, so that when the event is signalled
// the code can do a urKernelRelease on this kernel.
(*Event)->CommandData = (void *)Kernel;

// Increment the reference count of the Kernel and indicate that the Kernel
// is in use. Once the event has been signalled, the code in
// CleanupCompletedEvent(Event) will do a urKernelRelease to update the
// reference count on the kernel, using the kernel saved in CommandData.
UR_CALL(urKernelRetain(Kernel));
}

// Add to list of kernels to be submitted
if (IndirectAccessTrackingEnabled)
Expand All @@ -228,23 +256,25 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
// Add the command to the command list, which implies submission.
ZE2UR_CALL(zeCommandListAppendLaunchKernel,
(CommandList->first, Kernel->ZeKernel, &ZeThreadGroupDimensions,
ZeEvent, (*Event)->WaitList.Length,
(*Event)->WaitList.ZeEventList));
ZeEvent, WaitListLength,
&TmpWaitList.ZeEventList[WaitEventOffset]));
} else {
// Add the command to the command list for later submission.
// No lock is needed here, unlike the immediate commandlist case above,
// because the kernels are not actually submitted yet. Kernels will be
// submitted only when the commandlist is closed. Then, a lock is held.
ZE2UR_CALL(zeCommandListAppendLaunchKernel,
(CommandList->first, Kernel->ZeKernel, &ZeThreadGroupDimensions,
ZeEvent, (*Event)->WaitList.Length,
(*Event)->WaitList.ZeEventList));
ZeEvent, WaitListLength,
&TmpWaitList.ZeEventList[WaitEventOffset]));
}

urPrint("calling zeCommandListAppendLaunchKernel() with"
" ZeEvent %#" PRIxPTR "\n",
ur_cast<std::uintptr_t>(ZeEvent));
printZeEventList((*Event)->WaitList);
if (Event) {
urPrint("calling zeCommandListAppendLaunchKernel() with"
" ZeEvent %#" PRIxPTR "\n",
ur_cast<std::uintptr_t>(ZeEvent));
printZeEventList((*Event)->WaitList);
}

// Execute command list asynchronously, as the event will be used
// to track down its completion.
Expand Down
17 changes: 8 additions & 9 deletions source/adapters/level_zero/memory.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ ur_result_t enqueueMemCopyHelper(ur_command_t CommandType,

_ur_ze_event_list_t TmpWaitList;
UR_CALL(TmpWaitList.createAndRetainUrZeEventList(
NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine));
NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine, nullptr));

// We want to batch these commands to avoid extra submissions (costly)
bool OkToBatch = true;
Expand Down Expand Up @@ -102,7 +102,7 @@ ur_result_t enqueueMemCopyRectHelper(

_ur_ze_event_list_t TmpWaitList;
UR_CALL(TmpWaitList.createAndRetainUrZeEventList(
NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine));
NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine, nullptr));

// We want to batch these commands to avoid extra submissions (costly)
bool OkToBatch = true;
Expand Down Expand Up @@ -214,7 +214,7 @@ static ur_result_t enqueueMemFillHelper(ur_command_t CommandType,

_ur_ze_event_list_t TmpWaitList;
UR_CALL(TmpWaitList.createAndRetainUrZeEventList(
NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine));
NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine, nullptr));

ur_command_list_ptr_t CommandList{};
// We want to batch these commands to avoid extra submissions (costly)
Expand Down Expand Up @@ -346,7 +346,7 @@ static ur_result_t enqueueMemImageCommandHelper(

_ur_ze_event_list_t TmpWaitList;
UR_CALL(TmpWaitList.createAndRetainUrZeEventList(
NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine));
NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine, nullptr));

// We want to batch these commands to avoid extra submissions (costly)
bool OkToBatch = true;
Expand Down Expand Up @@ -909,7 +909,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferMap(

_ur_ze_event_list_t TmpWaitList;
UR_CALL(TmpWaitList.createAndRetainUrZeEventList(
NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine));
NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine, nullptr));

UR_CALL(
createEventAndAssociateQueue(Queue, Event, UR_COMMAND_MEM_BUFFER_MAP,
Expand All @@ -931,7 +931,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferMap(
} else if (MapFlags & UR_MAP_FLAG_WRITE)
AccessMode = ur_mem_handle_t_::write_only;
}

UR_ASSERT(AccessMode != ur_mem_handle_t_::unknown,
UR_RESULT_ERROR_INVALID_VALUE);

Expand Down Expand Up @@ -1067,7 +1066,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemUnmap(

_ur_ze_event_list_t TmpWaitList;
UR_CALL(TmpWaitList.createAndRetainUrZeEventList(
NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine));
NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine, nullptr));

UR_CALL(createEventAndAssociateQueue(Queue, Event, UR_COMMAND_MEM_UNMAP,
Queue->CommandListMap.end(),
Expand Down Expand Up @@ -1247,7 +1246,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMPrefetch(
//
_ur_ze_event_list_t TmpWaitList;
UR_CALL(TmpWaitList.createAndRetainUrZeEventList(
NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine));
NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine, nullptr));

// Get a new command list to be used on this call
ur_command_list_ptr_t CommandList{};
Expand Down Expand Up @@ -1302,7 +1301,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMAdvise(

_ur_ze_event_list_t TmpWaitList;
UR_CALL(TmpWaitList.createAndRetainUrZeEventList(0, nullptr, Queue,
UseCopyEngine));
UseCopyEngine, nullptr));

// Get a new command list to be used on this call
ur_command_list_ptr_t CommandList{};
Expand Down
6 changes: 4 additions & 2 deletions source/adapters/level_zero/queue.cpp
100755 → 100644
Original file line number Diff line number Diff line change
Expand Up @@ -1421,7 +1421,9 @@ ur_result_t ur_queue_handle_t_::synchronize() {
// zero handle can have device scope, so we can't synchronize the last
// event.
if (isInOrderQueue() && !LastCommandEvent->IsDiscarded) {
ZE2UR_CALL(zeHostSynchronize, (LastCommandEvent->ZeEvent));
if (LastCommandEvent->ZeEvent) {
ZE2UR_CALL(zeHostSynchronize, (LastCommandEvent->ZeEvent));
}

// clean up all events known to have been completed as well,
// so they can be reused later
Expand Down Expand Up @@ -1885,7 +1887,7 @@ ur_queue_handle_t_::insertActiveBarriers(ur_command_list_ptr_t &CmdList,
_ur_ze_event_list_t ActiveBarriersWaitList;
UR_CALL(ActiveBarriersWaitList.createAndRetainUrZeEventList(
ActiveBarriers.vector().size(), ActiveBarriers.vector().data(),
reinterpret_cast<ur_queue_handle_t>(this), UseCopyEngine));
reinterpret_cast<ur_queue_handle_t>(this), UseCopyEngine, nullptr));

// We can now replace active barriers with the ones in the wait list.
UR_CALL(ActiveBarriers.clear());
Expand Down

0 comments on commit 41bd7ee

Please sign in to comment.