Skip to content

Commit

Permalink
[UR][L0] Add support for in-order lists
Browse files Browse the repository at this point in the history
Resolves: #941

Signed-off-by: Jaime Arteaga <jaime.a.arteaga.molina@intel.com>
  • Loading branch information
Jaime Arteaga committed Nov 15, 2023
1 parent 1e11369 commit d0a2e28
Show file tree
Hide file tree
Showing 7 changed files with 177 additions and 29 deletions.
2 changes: 1 addition & 1 deletion source/adapters/level_zero/command_buffer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -725,7 +725,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp(
if (NumEventsInWaitList) {
_ur_ze_event_list_t TmpWaitList;
UR_CALL(TmpWaitList.createAndRetainUrZeEventList(
NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine));
NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine, nullptr));

UR_CALL(Queue->Context->getAvailableCommandList(Queue, WaitCommandList,
false, false))
Expand Down
9 changes: 9 additions & 0 deletions source/adapters/level_zero/common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -426,6 +426,15 @@ const bool ExposeCSliceInAffinityPartitioning = [] {
return Flag ? std::atoi(Flag) != 0 : false;
}();

// Selects the in-order lists implementation provided by the L0 driver in
// place of the adapter's own implementation.
// Controlled by the UR_L0_USE_DRIVER_INORDER_LISTS environment variable, which
// is read once when this constant is initialized: the flag is off when the
// variable is unset, otherwise it is on iff std::atoi() of its value is
// non-zero.
const bool UseDriverInOrderLists = [] {
  const char *EnvValue = std::getenv("UR_L0_USE_DRIVER_INORDER_LISTS");
  return EnvValue != nullptr && std::atoi(EnvValue) != 0;
}();

// TODO: make it into a ur_device_handle_t class member
const std::pair<int, int>
getRangeOfAllowedCopyEngines(const ur_device_handle_t &Device);
Expand Down
64 changes: 56 additions & 8 deletions source/adapters/level_zero/event.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWait(

_ur_ze_event_list_t TmpWaitList = {};
UR_CALL(TmpWaitList.createAndRetainUrZeEventList(
NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine));
NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine, nullptr));

// Get a new command list to be used on this call
ur_command_list_ptr_t CommandList{};
Expand Down Expand Up @@ -235,7 +235,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWaitWithBarrier(
// Retain the events as they will be owned by the result event.
_ur_ze_event_list_t TmpWaitList;
UR_CALL(TmpWaitList.createAndRetainUrZeEventList(
NumEventsInWaitList, EventWaitList, Queue, false /*UseCopyEngine=*/));
NumEventsInWaitList, EventWaitList, Queue, false /*UseCopyEngine=*/,
nullptr));

// Get an arbitrary command-list in the queue.
ur_command_list_ptr_t CmdList;
Expand Down Expand Up @@ -333,7 +334,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWaitWithBarrier(
UR_CALL(BaseWaitList.createAndRetainUrZeEventList(
EventWaitVector.size(),
reinterpret_cast<const ur_event_handle_t *>(EventWaitVector.data()),
Queue, ConvergenceCmdList->second.isCopy(Queue)));
Queue, ConvergenceCmdList->second.isCopy(Queue), nullptr));

// Insert a barrier with the events from each command-queue into the
// convergence command list. The resulting event signals the convergence of
Expand Down Expand Up @@ -1039,12 +1040,18 @@ ur_result_t ur_event_handle_t_::reset() {

ur_result_t _ur_ze_event_list_t::createAndRetainUrZeEventList(
uint32_t EventListLength, const ur_event_handle_t *EventList,
ur_queue_handle_t CurQueue, bool UseCopyEngine) {
ur_queue_handle_t CurQueue, bool UseCopyEngine,
ur_event_handle_t *LastCommandEventIncluded) {
this->Length = 0;
this->ZeEventList = nullptr;
this->UrEventList = nullptr;

if (CurQueue->isInOrderQueue() && CurQueue->LastCommandEvent != nullptr) {

#if 0
printf("%s %d CurQueue->LastCommandEvent 0x%lx\n", __FILE__, __LINE__,
(unsigned long int)CurQueue->LastCommandEvent->ZeEvent);
#endif
if (CurQueue->UsingImmCmdLists) {
if (ReuseDiscardedEvents && CurQueue->isDiscardEvents()) {
// If queue is in-order with discarded events and if
Expand Down Expand Up @@ -1086,6 +1093,32 @@ ur_result_t _ur_ze_event_list_t::createAndRetainUrZeEventList(
bool IncludeLastCommandEvent =
CurQueue->isInOrderQueue() && CurQueue->LastCommandEvent != nullptr;

#if 0
if (UseDriverInOrderLists) {
IncludeLastCommandEvent &=
CurQueue->LastUsedCommandList != CurQueue->CommandListMap.end() &&
CommandList->first != CurQueue->LastUsedCommandList->first;
}
#endif

#if 0
if (CurQueue->LastUsedCommandList != CurQueue->CommandListMap.end() && CurQueue->LastCommandEvent) {
printf("%s %d UseDriverInOrderLists %d CommandList->first 0x%lx CurQueue->LastUsedCommandList->first 0x%lx CurQueue->LastCommandEvent->ZeEvent 0x%lx\n",
__FILE__,
__LINE__,
UseDriverInOrderLists,
(unsigned long int)CommandList->first,
(unsigned long int)CurQueue->LastUsedCommandList->first,
(unsigned long int)CurQueue->LastCommandEvent->ZeEvent
);
}

printf("%s %d IncludeLastCommandEvent %d\n",
__FILE__,
__LINE__,
IncludeLastCommandEvent);
#endif

// If the last event is discarded then a barrier is already waiting for that
// event, so the last command event must not be included in the wait list:
// doing so would cause a wait on an event which has already been reset.
Expand All @@ -1097,12 +1130,27 @@ ur_result_t _ur_ze_event_list_t::createAndRetainUrZeEventList(
uint32_t TmpListLength = 0;

if (IncludeLastCommandEvent) {
#if 0
printf("JAIME %s %d CurQueue->isInOrderQueue() %d, IncludeLastCommandEvent %d CurQueue 0x%lx LastCommandEvent 0x%lx\n", __FILE__, __LINE__,
CurQueue->isInOrderQueue(),
IncludeLastCommandEvent,
(unsigned long int)CurQueue,
(unsigned long int)CurQueue->LastCommandEvent);
#endif

this->ZeEventList = new ze_event_handle_t[EventListLength + 1];
this->UrEventList = new ur_event_handle_t[EventListLength + 1];
std::shared_lock<ur_shared_mutex> Lock(CurQueue->LastCommandEvent->Mutex);
this->ZeEventList[0] = CurQueue->LastCommandEvent->ZeEvent;
this->UrEventList[0] = CurQueue->LastCommandEvent;
this->UrEventList[0]->RefCount.increment();
if (LastCommandEventIncluded) {
*LastCommandEventIncluded = CurQueue->LastCommandEvent;
// printf("JAIME %s %d IncludeLastCommandEvent %d\n", __FILE__,
// __LINE__, IncludeLastCommandEvent);
} else {
std::shared_lock<ur_shared_mutex> Lock(
CurQueue->LastCommandEvent->Mutex);
this->ZeEventList[0] = CurQueue->LastCommandEvent->ZeEvent;
this->UrEventList[0] = CurQueue->LastCommandEvent;
this->UrEventList[0]->RefCount.increment();
}
TmpListLength = 1;
} else if (EventListLength > 0) {
this->ZeEventList = new ze_event_handle_t[EventListLength];
Expand Down
9 changes: 5 additions & 4 deletions source/adapters/level_zero/event.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -84,10 +84,11 @@ struct _ur_ze_event_list_t {
// event wait-list is for) is going to go to copy or compute
// queue. This is used to properly submit the dependent open
// command-lists.
ur_result_t createAndRetainUrZeEventList(uint32_t EventListLength,
const ur_event_handle_t *EventList,
ur_queue_handle_t CurQueue,
bool UseCopyEngine);
// LastCommandEventIncluded is an optional out-parameter; callers that do not
// need it pass nullptr.
// NOTE(review): going by the implementation hunk in event.cpp, when the
// queue's last command event is to be included in the wait list and this
// pointer is non-null, *LastCommandEventIncluded is set to
// CurQueue->LastCommandEvent and slot 0 of ZeEventList/UrEventList is reserved
// but left unpopulated (and the event is not retained) so the caller can
// decide later; when the pointer is nullptr, slot 0 is filled and the event's
// RefCount is incremented here. Confirm against the full definition.
// NOTE(review): the callers visible in kernel.cpp and memory.cpp populate
// index Length - 1 rather than index 0 -- verify that both sides agree on
// which slot is reserved.
ur_result_t
createAndRetainUrZeEventList(uint32_t EventListLength,
const ur_event_handle_t *EventList,
ur_queue_handle_t CurQueue, bool UseCopyEngine,
ur_event_handle_t *LastCommandEventIncluded);

// Add all the events in this object's UrEventList to the end
// of the list EventsToBeReleased. Destroy ur_ze_event_list_t data
Expand Down
33 changes: 30 additions & 3 deletions source/adapters/level_zero/kernel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -177,16 +177,43 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(

ZE2UR_CALL(zeKernelSetGroupSize, (Kernel->ZeKernel, WG[0], WG[1], WG[2]));

// Get a new command list to be used on this call
ur_command_list_ptr_t CommandList{};

bool UseCopyEngine = false;
ur_event_handle_t LastCommandEventIncluded = nullptr;
_ur_ze_event_list_t TmpWaitList;
UR_CALL(TmpWaitList.createAndRetainUrZeEventList(
NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine));
NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine,
&LastCommandEventIncluded));

// Get a new command list to be used on this call
ur_command_list_ptr_t CommandList{};
UR_CALL(Queue->Context->getAvailableCommandList(
Queue, CommandList, UseCopyEngine, true /* AllowBatching */));

bool IncludeLastCommandEvent = (LastCommandEventIncluded != nullptr);
if (UseDriverInOrderLists) {
IncludeLastCommandEvent &=
Queue->LastUsedCommandList != Queue->CommandListMap.end() &&
CommandList->first != Queue->LastUsedCommandList->first;
}

// printf("JAIME %s %d LastCommandEventIncluded 0x%lx TmpWaitList.Length %d
// Queue->LastCommandEvent %lx\n",
// __FILE__, __LINE__,
// (unsigned long int)LastCommandEventIncluded,
// TmpWaitList.Length,
// (unsigned long int)Queue->LastCommandEvent);

if (IncludeLastCommandEvent) {
std::shared_lock<ur_shared_mutex> Lock(LastCommandEventIncluded->Mutex);
TmpWaitList.ZeEventList[TmpWaitList.Length - 1] =
LastCommandEventIncluded->ZeEvent;
TmpWaitList.UrEventList[TmpWaitList.Length - 1] = LastCommandEventIncluded;
TmpWaitList.UrEventList[TmpWaitList.Length - 1]->RefCount.increment();
} else if (TmpWaitList.Length > 0) {
TmpWaitList.Length--;
}

ze_event_handle_t ZeEvent = nullptr;
ur_event_handle_t InternalEvent{};
bool IsInternal = OutEvent == nullptr;
Expand Down
64 changes: 52 additions & 12 deletions source/adapters/level_zero/memory.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -50,18 +50,59 @@ ur_result_t enqueueMemCopyHelper(ur_command_t CommandType,
bool PreferCopyEngine) {
bool UseCopyEngine = Queue->useCopyEngine(PreferCopyEngine);

// We want to batch these commands to avoid extra submissions (costly)
bool OkToBatch = true;
ur_command_list_ptr_t CommandList{};
ur_event_handle_t LastCommandEventIncluded = nullptr;
_ur_ze_event_list_t TmpWaitList;
UR_CALL(TmpWaitList.createAndRetainUrZeEventList(
NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine));
NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine,
&LastCommandEventIncluded));

// We want to batch these commands to avoid extra submissions (costly)
bool OkToBatch = true;
// printf("JAIME %s %d Queue 0x%lx LastCommandEvent 0x%lx\n",
// __FILE__, __LINE__,
// (unsigned long int)Queue,
// (unsigned long int)Queue->LastCommandEvent);

// Get a new command list to be used on this call
ur_command_list_ptr_t CommandList{};
UR_CALL(Queue->Context->getAvailableCommandList(Queue, CommandList,
UseCopyEngine, OkToBatch));

// printf("JAIME %s %d Queue 0x%lx LastCommandEvent 0x%lx\n",
// __FILE__, __LINE__,
// (unsigned long int)Queue,
// (unsigned long int)Queue->LastCommandEvent);

// printf("JAIME %s %d Queue %lx LastCommandEvent %lx\n",
// __FILE__, __LINE__,
// (unsigned long int)Queue,
// (unsigned long int)Queue->LastCommandEvent);

bool IncludeLastCommandEvent = (LastCommandEventIncluded != nullptr);
if (UseDriverInOrderLists) {
IncludeLastCommandEvent &=
Queue->LastUsedCommandList != Queue->CommandListMap.end() &&
CommandList->first != Queue->LastUsedCommandList->first;
}

// printf("JAIME %s %d LastCommandEventIncluded 0x%lx TmpWaitList.Length %d
// Queue %lx LastCommandEvent %lx\n",
// __FILE__, __LINE__,
// (unsigned long int)LastCommandEventIncluded,
// TmpWaitList.Length,
// (unsigned long int)Queue,
// (unsigned long int)Queue->LastCommandEvent);

if (IncludeLastCommandEvent) {
std::shared_lock<ur_shared_mutex> Lock(LastCommandEventIncluded->Mutex);
TmpWaitList.ZeEventList[TmpWaitList.Length - 1] =
LastCommandEventIncluded->ZeEvent;
TmpWaitList.UrEventList[TmpWaitList.Length - 1] = LastCommandEventIncluded;
TmpWaitList.UrEventList[TmpWaitList.Length - 1]->RefCount.increment();
} else if (TmpWaitList.Length > 0) {
TmpWaitList.Length--;
}

ze_event_handle_t ZeEvent = nullptr;
ur_event_handle_t InternalEvent;
bool IsInternal = OutEvent == nullptr;
Expand Down Expand Up @@ -102,7 +143,7 @@ ur_result_t enqueueMemCopyRectHelper(

_ur_ze_event_list_t TmpWaitList;
UR_CALL(TmpWaitList.createAndRetainUrZeEventList(
NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine));
NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine, nullptr));

// We want to batch these commands to avoid extra submissions (costly)
bool OkToBatch = true;
Expand Down Expand Up @@ -214,7 +255,7 @@ static ur_result_t enqueueMemFillHelper(ur_command_t CommandType,

_ur_ze_event_list_t TmpWaitList;
UR_CALL(TmpWaitList.createAndRetainUrZeEventList(
NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine));
NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine, nullptr));

ur_command_list_ptr_t CommandList{};
// We want to batch these commands to avoid extra submissions (costly)
Expand Down Expand Up @@ -346,7 +387,7 @@ static ur_result_t enqueueMemImageCommandHelper(

_ur_ze_event_list_t TmpWaitList;
UR_CALL(TmpWaitList.createAndRetainUrZeEventList(
NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine));
NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine, nullptr));

// We want to batch these commands to avoid extra submissions (costly)
bool OkToBatch = true;
Expand Down Expand Up @@ -909,7 +950,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferMap(

_ur_ze_event_list_t TmpWaitList;
UR_CALL(TmpWaitList.createAndRetainUrZeEventList(
NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine));
NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine, nullptr));

UR_CALL(
createEventAndAssociateQueue(Queue, Event, UR_COMMAND_MEM_BUFFER_MAP,
Expand All @@ -931,7 +972,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferMap(
} else if (MapFlags & UR_MAP_FLAG_WRITE)
AccessMode = ur_mem_handle_t_::write_only;
}

UR_ASSERT(AccessMode != ur_mem_handle_t_::unknown,
UR_RESULT_ERROR_INVALID_VALUE);

Expand Down Expand Up @@ -1067,7 +1107,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemUnmap(

_ur_ze_event_list_t TmpWaitList;
UR_CALL(TmpWaitList.createAndRetainUrZeEventList(
NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine));
NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine, nullptr));

UR_CALL(createEventAndAssociateQueue(Queue, Event, UR_COMMAND_MEM_UNMAP,
Queue->CommandListMap.end(),
Expand Down Expand Up @@ -1247,7 +1287,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMPrefetch(
//
_ur_ze_event_list_t TmpWaitList;
UR_CALL(TmpWaitList.createAndRetainUrZeEventList(
NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine));
NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine, nullptr));

// Get a new command list to be used on this call
ur_command_list_ptr_t CommandList{};
Expand Down Expand Up @@ -1302,7 +1342,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMAdvise(

_ur_ze_event_list_t TmpWaitList;
UR_CALL(TmpWaitList.createAndRetainUrZeEventList(0, nullptr, Queue,
UseCopyEngine));
UseCopyEngine, nullptr));

// Get a new command list to be used on this call
ur_command_list_ptr_t CommandList{};
Expand Down
Loading

0 comments on commit d0a2e28

Please sign in to comment.