Skip to content

Commit

Permalink
[L0] Add support for in-order lists using L0 driver
Browse files Browse the repository at this point in the history
Signed-off-by: Raiyan Latif <raiyan.latif@intel.com>
  • Loading branch information
raiyanla committed Feb 29, 2024
1 parent 91c6068 commit 9b3bf62
Show file tree
Hide file tree
Showing 9 changed files with 111 additions and 44 deletions.
2 changes: 1 addition & 1 deletion source/adapters/level_zero/command_buffer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -899,7 +899,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp(
if (NumEventsInWaitList) {
_ur_ze_event_list_t TmpWaitList;
UR_CALL(TmpWaitList.createAndRetainUrZeEventList(
NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine));
NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine, nullptr));

// Update the WaitList of the Wait Event
// Events are appended to the WaitList if the WaitList is not empty
Expand Down
10 changes: 6 additions & 4 deletions source/adapters/level_zero/context.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -397,10 +397,12 @@ ur_result_t ur_context_handle_t_::finalize() {
std::scoped_lock<ur_mutex> Lock(EventCacheMutex);
for (auto EventCache : EventCaches) {
for (auto Event : *EventCache) {
auto ZeResult = ZE_CALL_NOCHECK(zeEventDestroy, (Event->ZeEvent));
// Gracefully handle the case that L0 was already unloaded.
if (ZeResult && ZeResult != ZE_RESULT_ERROR_UNINITIALIZED)
return ze2urResult(ZeResult);
if (Event->ZeEvent) {
auto ZeResult = ZE_CALL_NOCHECK(zeEventDestroy, (Event->ZeEvent));
// Gracefully handle the case that L0 was already unloaded.
if (ZeResult && ZeResult != ZE_RESULT_ERROR_UNINITIALIZED)
return ze2urResult(ZeResult);
}
delete Event;
}
EventCache->clear();
Expand Down
18 changes: 18 additions & 0 deletions source/adapters/level_zero/device.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1029,6 +1029,24 @@ bool ur_device_handle_t_::useRelaxedAllocationLimits() {
return EnableRelaxedAllocationLimits;
}

// Reports whether the adapter should rely on the Level Zero driver's
// in-order list implementation instead of the adapter's own implementation.
//
// Returns true unconditionally.
bool ur_device_handle_t_::useDriverInOrderLists() {
  // Use in-order lists implementation from L0 driver instead
  // of adapter's implementation.
  //
  // The driver implementation is currently always enabled. A disabled draft
  // of this function made it opt-in: it returned false unless the
  // UR_L0_USE_DRIVER_INORDER_LISTS environment variable was set to a non-zero
  // integer and useImmediateCommandLists() was non-zero for the device.
  //
  // NOTE(review): if that opt-in is restored, do not cache the per-device
  // immediate-command-list check in a function-local static. Such a static is
  // initialized once per process, so the first device to call this function
  // would decide the answer for every other device.
  constexpr bool UseDriverInOrderLists = true;
  return UseDriverInOrderLists;
}

ur_result_t ur_device_handle_t_::initialize(int SubSubDeviceOrdinal,
int SubSubDeviceIndex) {
// Maintain various device properties cache.
Expand Down
3 changes: 3 additions & 0 deletions source/adapters/level_zero/device.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,9 @@ struct ur_device_handle_t_ : _ur_object {
// Read env settings to select immediate commandlist mode.
ImmCmdlistMode useImmediateCommandLists();

// Whether Adapter uses driver's implementation of in-order lists or not.
// Returns true when the Level Zero driver's in-order lists should be used,
// false when the adapter's own in-order implementation should be used.
bool useDriverInOrderLists();

// Returns whether immediate command lists are used on this device.
ImmCmdlistMode ImmCommandListUsed{};

Expand Down
17 changes: 12 additions & 5 deletions source/adapters/level_zero/event.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWait(

_ur_ze_event_list_t TmpWaitList = {};
UR_CALL(TmpWaitList.createAndRetainUrZeEventList(
NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine));
NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine, nullptr));

// Get a new command list to be used on this call
ur_command_list_ptr_t CommandList{};
Expand Down Expand Up @@ -244,7 +244,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWaitWithBarrier(
// Retain the events as they will be owned by the result event.
_ur_ze_event_list_t TmpWaitList;
UR_CALL(TmpWaitList.createAndRetainUrZeEventList(
NumEventsInWaitList, EventWaitList, Queue, false /*UseCopyEngine=*/));
NumEventsInWaitList, EventWaitList, Queue, false /*UseCopyEngine=*/,
nullptr));

// Get an arbitrary command-list in the queue.
ur_command_list_ptr_t CmdList;
Expand Down Expand Up @@ -342,7 +343,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWaitWithBarrier(
UR_CALL(BaseWaitList.createAndRetainUrZeEventList(
EventWaitVector.size(),
reinterpret_cast<const ur_event_handle_t *>(EventWaitVector.data()),
Queue, ConvergenceCmdList->second.isCopy(Queue)));
Queue, ConvergenceCmdList->second.isCopy(Queue), nullptr));

// Insert a barrier with the events from each command-queue into the
// convergence command list. The resulting event signals the convergence of
Expand Down Expand Up @@ -1129,13 +1130,16 @@ ur_result_t ur_event_handle_t_::reset() {
if (!isHostVisible())
HostVisibleEvent = nullptr;

ZE2UR_CALL(zeEventHostReset, (ZeEvent));
if (ZeEvent) {
ZE2UR_CALL(zeEventHostReset, (ZeEvent));
}
return UR_RESULT_SUCCESS;
}

ur_result_t _ur_ze_event_list_t::createAndRetainUrZeEventList(
uint32_t EventListLength, const ur_event_handle_t *EventList,
ur_queue_handle_t CurQueue, bool UseCopyEngine) {
ur_queue_handle_t CurQueue, bool UseCopyEngine,
bool *LastCommandEventIncluded) {
this->Length = 0;
this->ZeEventList = nullptr;
this->UrEventList = nullptr;
Expand Down Expand Up @@ -1195,6 +1199,9 @@ ur_result_t _ur_ze_event_list_t::createAndRetainUrZeEventList(
if (IncludeLastCommandEvent) {
this->ZeEventList = new ze_event_handle_t[EventListLength + 1];
this->UrEventList = new ur_event_handle_t[EventListLength + 1];
if (LastCommandEventIncluded) {
*LastCommandEventIncluded = true;
}
std::shared_lock<ur_shared_mutex> Lock(CurQueue->LastCommandEvent->Mutex);
this->ZeEventList[0] = CurQueue->LastCommandEvent->ZeEvent;
this->UrEventList[0] = CurQueue->LastCommandEvent;
Expand Down
3 changes: 2 additions & 1 deletion source/adapters/level_zero/event.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,8 @@ struct _ur_ze_event_list_t {
ur_result_t createAndRetainUrZeEventList(uint32_t EventListLength,
const ur_event_handle_t *EventList,
ur_queue_handle_t CurQueue,
bool UseCopyEngine);
bool UseCopyEngine,
bool *LastCommandEventIncluded);

// Add all the events in this object's UrEventList to the end
// of the list EventsToBeReleased. Destroy ur_ze_event_list_t data
Expand Down
70 changes: 50 additions & 20 deletions source/adapters/level_zero/kernel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -191,34 +191,62 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
ZE2UR_CALL(zeKernelSetGroupSize, (ZeKernel, WG[0], WG[1], WG[2]));

bool UseCopyEngine = false;
bool LastCommandEventIncluded = false;
_ur_ze_event_list_t TmpWaitList;
UR_CALL(TmpWaitList.createAndRetainUrZeEventList(
NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine));
NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine,
&LastCommandEventIncluded));

// Get a new command list to be used on this call
ur_command_list_ptr_t CommandList{};
UR_CALL(Queue->Context->getAvailableCommandList(
Queue, CommandList, UseCopyEngine, true /* AllowBatching */));

uint32_t WaitListLength = TmpWaitList.Length;
uint32_t WaitEventOffset = 0;

if (Queue->Device->useDriverInOrderLists() && Queue->isInOrderQueue()) {
LastCommandEventIncluded &=
Queue->LastUsedCommandList != Queue->CommandListMap.end() &&
CommandList->first != Queue->LastUsedCommandList->first;

if (!LastCommandEventIncluded && WaitListLength > 0) {
WaitListLength--;
WaitEventOffset = 1;
if (Queue->LastCommandEvent && Queue->LastCommandEvent->ZeEvent) {
ZE2UR_CALL(zeEventDestroy, (Queue->LastCommandEvent->ZeEvent));
Queue->LastCommandEvent->ZeEvent = nullptr;
}
}
}

ze_event_handle_t ZeEvent = nullptr;
ur_event_handle_t InternalEvent{};
bool IsInternal = OutEvent == nullptr;
ur_event_handle_t *Event = OutEvent ? OutEvent : &InternalEvent;
ur_event_handle_t *Event = nullptr;

bool SkipOutputEventCreate =
(Queue->Device->useDriverInOrderLists() && Queue->isInOrderQueue() &&
WaitListLength == 0 && IsInternal);

UR_CALL(createEventAndAssociateQueue(Queue, Event, UR_COMMAND_KERNEL_LAUNCH,
CommandList, IsInternal, false));
ZeEvent = (*Event)->ZeEvent;
(*Event)->WaitList = TmpWaitList;
if (!SkipOutputEventCreate) {
Event = OutEvent ? OutEvent : &InternalEvent;

// Save the kernel in the event, so that when the event is signalled
// the code can do a urKernelRelease on this kernel.
(*Event)->CommandData = (void *)Kernel;
UR_CALL(createEventAndAssociateQueue(Queue, Event, UR_COMMAND_KERNEL_LAUNCH,
CommandList, IsInternal, false));
ZeEvent = (*Event)->ZeEvent;
(*Event)->WaitList = TmpWaitList;

// Increment the reference count of the Kernel and indicate that the Kernel is
// in use. Once the event has been signalled, the code in
// CleanupCompletedEvent(Event) will do a urKernelRelease to update the
// reference count on the kernel, using the kernel saved in CommandData.
UR_CALL(urKernelRetain(Kernel));
// Save the kernel in the event, so that when the event is signalled
// the code can do a urKernelRelease on this kernel.
(*Event)->CommandData = (void *)Kernel;

// Increment the reference count of the Kernel and indicate that the Kernel
// is in use. Once the event has been signalled, the code in
// CleanupCompletedEvent(Event) will do a urKernelRelease to update the
// reference count on the kernel, using the kernel saved in CommandData.
UR_CALL(urKernelRetain(Kernel));
}

// Add to list of kernels to be submitted
if (IndirectAccessTrackingEnabled)
Expand All @@ -241,21 +269,23 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
// Add the command to the command list, which implies submission.
ZE2UR_CALL(zeCommandListAppendLaunchKernel,
(CommandList->first, ZeKernel, &ZeThreadGroupDimensions, ZeEvent,
(*Event)->WaitList.Length, (*Event)->WaitList.ZeEventList));
WaitListLength, &TmpWaitList.ZeEventList[WaitEventOffset]));
} else {
// Add the command to the command list for later submission.
// No lock is needed here, unlike the immediate commandlist case above,
// because the kernels are not actually submitted yet. Kernels will be
// submitted only when the commandlist is closed. Then, a lock is held.
ZE2UR_CALL(zeCommandListAppendLaunchKernel,
(CommandList->first, ZeKernel, &ZeThreadGroupDimensions, ZeEvent,
(*Event)->WaitList.Length, (*Event)->WaitList.ZeEventList));
WaitListLength, &TmpWaitList.ZeEventList[WaitEventOffset]));
}

urPrint("calling zeCommandListAppendLaunchKernel() with"
" ZeEvent %#" PRIxPTR "\n",
ur_cast<std::uintptr_t>(ZeEvent));
printZeEventList((*Event)->WaitList);
if (Event) {
urPrint("calling zeCommandListAppendLaunchKernel() with"
" ZeEvent %#" PRIxPTR "\n",
ur_cast<std::uintptr_t>(ZeEvent));
printZeEventList((*Event)->WaitList);
}

// Execute command list asynchronously, as the event will be used
// to track down its completion.
Expand Down
17 changes: 8 additions & 9 deletions source/adapters/level_zero/memory.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ ur_result_t enqueueMemCopyHelper(ur_command_t CommandType,

_ur_ze_event_list_t TmpWaitList;
UR_CALL(TmpWaitList.createAndRetainUrZeEventList(
NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine));
NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine, nullptr));

// We want to batch these commands to avoid extra submissions (costly)
bool OkToBatch = true;
Expand Down Expand Up @@ -102,7 +102,7 @@ ur_result_t enqueueMemCopyRectHelper(

_ur_ze_event_list_t TmpWaitList;
UR_CALL(TmpWaitList.createAndRetainUrZeEventList(
NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine));
NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine, nullptr));

// We want to batch these commands to avoid extra submissions (costly)
bool OkToBatch = true;
Expand Down Expand Up @@ -214,7 +214,7 @@ static ur_result_t enqueueMemFillHelper(ur_command_t CommandType,

_ur_ze_event_list_t TmpWaitList;
UR_CALL(TmpWaitList.createAndRetainUrZeEventList(
NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine));
NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine, nullptr));

ur_command_list_ptr_t CommandList{};
// We want to batch these commands to avoid extra submissions (costly)
Expand Down Expand Up @@ -346,7 +346,7 @@ static ur_result_t enqueueMemImageCommandHelper(

_ur_ze_event_list_t TmpWaitList;
UR_CALL(TmpWaitList.createAndRetainUrZeEventList(
NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine));
NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine, nullptr));

// We want to batch these commands to avoid extra submissions (costly)
bool OkToBatch = true;
Expand Down Expand Up @@ -909,7 +909,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferMap(

_ur_ze_event_list_t TmpWaitList;
UR_CALL(TmpWaitList.createAndRetainUrZeEventList(
NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine));
NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine, nullptr));

UR_CALL(createEventAndAssociateQueue(
Queue, Event, UR_COMMAND_MEM_BUFFER_MAP, Queue->CommandListMap.end(),
Expand All @@ -931,7 +931,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferMap(
} else if (MapFlags & UR_MAP_FLAG_WRITE)
AccessMode = ur_mem_handle_t_::write_only;
}

UR_ASSERT(AccessMode != ur_mem_handle_t_::unknown,
UR_RESULT_ERROR_INVALID_VALUE);

Expand Down Expand Up @@ -1067,7 +1066,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemUnmap(

_ur_ze_event_list_t TmpWaitList;
UR_CALL(TmpWaitList.createAndRetainUrZeEventList(
NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine));
NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine, nullptr));

UR_CALL(createEventAndAssociateQueue(Queue, Event, UR_COMMAND_MEM_UNMAP,
Queue->CommandListMap.end(),
Expand Down Expand Up @@ -1247,7 +1246,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMPrefetch(
//
_ur_ze_event_list_t TmpWaitList;
UR_CALL(TmpWaitList.createAndRetainUrZeEventList(
NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine));
NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine, nullptr));

// Get a new command list to be used on this call
ur_command_list_ptr_t CommandList{};
Expand Down Expand Up @@ -1302,7 +1301,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMAdvise(

_ur_ze_event_list_t TmpWaitList;
UR_CALL(TmpWaitList.createAndRetainUrZeEventList(0, nullptr, Queue,
UseCopyEngine));
UseCopyEngine, nullptr));

// Get a new command list to be used on this call
ur_command_list_ptr_t CommandList{};
Expand Down
15 changes: 11 additions & 4 deletions source/adapters/level_zero/queue.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1427,7 +1427,9 @@ ur_result_t ur_queue_handle_t_::synchronize() {
// zero handle can have device scope, so we can't synchronize the last
// event.
if (isInOrderQueue() && !LastCommandEvent->IsDiscarded) {
ZE2UR_CALL(zeHostSynchronize, (LastCommandEvent->ZeEvent));
if (LastCommandEvent->ZeEvent) {
ZE2UR_CALL(zeHostSynchronize, (LastCommandEvent->ZeEvent));
}

// clean up all events known to have been completed as well,
// so they can be reused later
Expand Down Expand Up @@ -1892,6 +1894,8 @@ ur_result_t ur_queue_handle_t_::createCommandList(

ZeStruct<ze_command_list_desc_t> ZeCommandListDesc;
ZeCommandListDesc.commandQueueGroupOrdinal = QueueGroupOrdinal;
if (Device->useDriverInOrderLists() && isInOrderQueue())
ZeCommandListDesc.flags = ZE_COMMAND_LIST_FLAG_IN_ORDER;

ZE2UR_CALL(zeCommandListCreate, (Context->ZeContext, Device->ZeDevice,
&ZeCommandListDesc, &ZeCommandList));
Expand Down Expand Up @@ -1919,7 +1923,7 @@ ur_queue_handle_t_::insertActiveBarriers(ur_command_list_ptr_t &CmdList,
_ur_ze_event_list_t ActiveBarriersWaitList;
UR_CALL(ActiveBarriersWaitList.createAndRetainUrZeEventList(
ActiveBarriers.vector().size(), ActiveBarriers.vector().data(),
reinterpret_cast<ur_queue_handle_t>(this), UseCopyEngine));
reinterpret_cast<ur_queue_handle_t>(this), UseCopyEngine, nullptr));

// We can now replace active barriers with the ones in the wait list.
UR_CALL(ActiveBarriers.clear());
Expand Down Expand Up @@ -2006,8 +2010,11 @@ ur_command_list_ptr_t &ur_queue_handle_t_::ur_queue_group_t::getImmCmdList() {
Priority = "High";
}

// Evaluate performance of explicit usage for "0" index.
if (QueueIndex != 0) {
if (Queue->Device->useDriverInOrderLists() && Queue->isInOrderQueue()) {
ZeCommandQueueDesc.flags = ZE_COMMAND_QUEUE_FLAG_IN_ORDER;
urPrint("Using in-order driver implementation\n");
} else if (QueueIndex != 0) {
// Evaluate performance of explicit usage for "0" index.
ZeCommandQueueDesc.flags = ZE_COMMAND_QUEUE_FLAG_EXPLICIT_ONLY;
}

Expand Down

0 comments on commit 9b3bf62

Please sign in to comment.