Skip to content

Commit

Permalink
Copy command-buffer event timestamps into a dedicated USM memory region.
Browse files Browse the repository at this point in the history
Get the command-buffer start and end timestamps from this memory.
Move events reset from the middle command list to the first to allow the copy of the profiling info in the last command list and relax command list order.
  • Loading branch information
mfrancepillois committed Nov 9, 2023
1 parent 4808c63 commit 68e1d24
Show file tree
Hide file tree
Showing 3 changed files with 131 additions and 46 deletions.
92 changes: 58 additions & 34 deletions source/adapters/level_zero/command_buffer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -45,13 +45,13 @@
│ Prefix │ Commands added to UR command-buffer by UR user │ Suffix │
└──────────┴────────────────────────────────────────────────┴─────────┘
┌───────────────────┬──────────────────────────────┐
Prefix │Reset signal event │ Barrier waiting on wait event│
└───────────────────┴──────────────────────────────┘
┌───────────────────┬──────────────┐──────────────────────────────┐
Prefix │Reset signal event │ Reset events │ Barrier waiting on wait event│
└───────────────────┴──────────────┘──────────────────────────────┘
┌─────────────────────────────────────────────┐──────────────┐
Suffix │Barrier waiting on sync-point event, │ Reset events
│signalling the UR command-buffer signal event│
Suffix │Barrier waiting on sync-point event, │ Query CMD
│signalling the UR command-buffer signal event│ Timestamps
└─────────────────────────────────────────────┘──────────────┘
For a call to `urCommandBufferEnqueueExp` with an event_list `EL`,
Expand Down Expand Up @@ -391,6 +391,10 @@ urCommandBufferCreateExp(ur_context_handle_t Context, ur_device_handle_t Device,

ZeStruct<ze_command_list_desc_t> ZeCommandListDesc;
ZeCommandListDesc.commandQueueGroupOrdinal = QueueGroupOrdinal;
// Dependencies between commands are explicitly enforced by sync points when
// enqueuing. Consequently, relax the command ordering in the command list
// can enable the backend to further optimize the workload
ZeCommandListDesc.flags = ZE_COMMAND_LIST_FLAG_RELAXED_ORDERING;

ze_command_list_handle_t ZeCommandList;
// TODO We could optimize this by pooling both Level Zero command-lists and UR
Expand Down Expand Up @@ -457,13 +461,6 @@ urCommandBufferFinalizeExp(ur_exp_command_buffer_handle_t CommandBuffer) {
ZE2UR_CALL(zeCommandListAppendEventReset,
(CommandBuffer->ZeCommandList, CommandBuffer->WaitEvent->ZeEvent));

// Reset the L0 events we use for command-buffer internal sync-points to the
// non-signalled state
for (auto Event : WaitEventList) {
ZE2UR_CALL(zeCommandListAppendEventReset,
(CommandBuffer->ZeCommandList, Event));
}

// Close the command list and have it ready for dispatch.
ZE2UR_CALL(zeCommandListClose, (CommandBuffer->ZeCommandList));
return UR_RESULT_SUCCESS;
Expand Down Expand Up @@ -722,14 +719,28 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp(
// Create command-list to execute before `CommandListPtr` and will signal
// when `EventWaitList` dependencies are complete.
ur_command_list_ptr_t WaitCommandList{};
UR_CALL(Queue->Context->getAvailableCommandList(Queue, WaitCommandList, false,
false));

// Create a list of events of all the events that compose the command buffer
// workload.
// This loop also resets the L0 events we use for command-buffer internal
// sync-points to the non-signalled state.
// This is required for multiple submissions.
const size_t NumEvents = CommandBuffer->SyncPoints.size();
std::vector<ze_event_handle_t> WaitEventList{NumEvents};
for (size_t i = 0; i < NumEvents; i++) {
auto ZeEvent = CommandBuffer->SyncPoints[i]->ZeEvent;
WaitEventList[i] = ZeEvent;
ZE2UR_CALL(zeCommandListAppendEventReset,
(WaitCommandList->first, ZeEvent));
}

if (NumEventsInWaitList) {
_ur_ze_event_list_t TmpWaitList;
UR_CALL(TmpWaitList.createAndRetainUrZeEventList(
NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine));

UR_CALL(Queue->Context->getAvailableCommandList(Queue, WaitCommandList,
false, false))

// Update the WaitList of the Wait Event
// Events are appended to the WaitList if the WaitList is not empty
if (CommandBuffer->WaitEvent->WaitList.isEmpty())
Expand All @@ -742,9 +753,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp(
CommandBuffer->WaitEvent->WaitList.Length,
CommandBuffer->WaitEvent->WaitList.ZeEventList));
} else {
UR_CALL(Queue->Context->getAvailableCommandList(Queue, WaitCommandList,
false, false));

ZE2UR_CALL(zeCommandListAppendSignalEvent,
(WaitCommandList->first, CommandBuffer->WaitEvent->ZeEvent));
}
Expand All @@ -766,22 +774,38 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp(
&(CommandBuffer->SignalEvent->ZeEvent)));

if ((Queue->Properties & UR_QUEUE_FLAG_PROFILING_ENABLE)) {
// We create an additional signal specific to the current execution of the
// CommandBuffer. This signal is needed for profiling the execution time
// of the CommandBuffer. It waits for the WaitEvent to be signaled
// which indicates the start of the CommandBuffer actual execution.
// This event is embedded into the Event return to the user to allow
// the profiling engine to retrieve it.
ur_event_handle_t StartEvent{};
UR_CALL(createEventAndAssociateQueue(
Queue, &StartEvent, UR_COMMAND_COMMAND_BUFFER_ENQUEUE_EXP,
WaitCommandList, false));

ZE2UR_CALL(zeCommandListAppendBarrier,
(WaitCommandList->first, StartEvent->ZeEvent, 1,
&(CommandBuffer->WaitEvent->ZeEvent)));

RetEvent->CommandData = StartEvent;
// Multiple submissions of a command buffer implies that we need to save
// the event timestamps before resubmiting the command buffer. We
// therefore copy the these timestamps in a dedicated USM memory section
// before completing the command buffer execution, and then attach this
// memory to the event returned to users to allow to allow the profiling
// engine to recover these timestamps.
ur_usm_desc_t USMDesc{};
ur_usm_device_desc_t UsmDeviceDesc{};
UsmDeviceDesc.stype = UR_STRUCTURE_TYPE_USM_DEVICE_DESC;
ur_usm_host_desc_t UsmHostDesc{};
UsmHostDesc.stype = UR_STRUCTURE_TYPE_USM_HOST_DESC;
UsmDeviceDesc.pNext = &UsmHostDesc;
USMDesc.pNext = &UsmDeviceDesc;
USMDesc.align = 4; // 4byte-aligned

size_t Size = WaitEventList.size() * sizeof(ze_kernel_timestamp_result_t);

struct command_buffer_profiling_t *Profiling =
new command_buffer_profiling_t();

Profiling->NumEvents = WaitEventList.size();

urUSMSharedAlloc(RetEvent->Context, CommandBuffer->Device, &USMDesc,
nullptr, Size, (void **)&Profiling->Timestamps);

ZE2UR_CALL(zeCommandListAppendQueryKernelTimestamps,
(SignalCommandList->first, WaitEventList.size(),
WaitEventList.data(), Profiling->Timestamps, 0,
RetEvent->ZeEvent, 1,
&(CommandBuffer->SignalEvent->ZeEvent)));

RetEvent->CommandData = static_cast<void *>(Profiling);
}
}

Expand Down
5 changes: 5 additions & 0 deletions source/adapters/level_zero/command_buffer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,11 @@
#include "context.hpp"
#include "queue.hpp"

struct command_buffer_profiling_t {
ur_exp_command_buffer_sync_point_t NumEvents;
ze_kernel_timestamp_result_t *Timestamps;
};

struct ur_exp_command_buffer_handle_t_ : public _ur_object {
ur_exp_command_buffer_handle_t_(ur_context_handle_t Context,
ur_device_handle_t Device,
Expand Down
80 changes: 68 additions & 12 deletions source/adapters/level_zero/event.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -449,16 +449,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventGetProfilingInfo(
) {
std::shared_lock<ur_shared_mutex> EventLock(Event->Mutex);

// A Command-buffer consists of three command-lists.
// The start time should therefore be taken from an event associated
// to the first command-list.
if ((Event->CommandType == UR_COMMAND_COMMAND_BUFFER_ENQUEUE_EXP) &&
(PropName == UR_PROFILING_INFO_COMMAND_START) && (Event->CommandData)) {
auto StartEvent = static_cast<ur_event_handle_t>(Event->CommandData);
return urEventGetProfilingInfo(StartEvent, UR_PROFILING_INFO_COMMAND_END,
PropValueSize, PropValue, PropValueSizeRet);
}

if (Event->UrQueue &&
(Event->UrQueue->Properties & UR_QUEUE_FLAG_PROFILING_ENABLE) == 0) {
return UR_RESULT_ERROR_PROFILING_INFO_NOT_AVAILABLE;
Expand All @@ -475,6 +465,70 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventGetProfilingInfo(

ze_kernel_timestamp_result_t tsResult;

// A Command-buffer consists of three command-lists for which only a single
// event is returned to users. The actual profiling information related to the
// command-buffer should therefore be extrated from graph events themsleves.
// The timestamps of these events are saved in a memory region attached to
// event usning CommandData field. The timings must therefore be recovered
// from this memory.
if (Event->CommandType == UR_COMMAND_COMMAND_BUFFER_ENQUEUE_EXP) {
if (Event->CommandData) {
struct command_buffer_profiling_t *ProfilingsPtr;
switch (PropName) {
case UR_PROFILING_INFO_COMMAND_START: {
ProfilingsPtr = static_cast<struct command_buffer_profiling_t *>(
Event->CommandData);
// Sync-point order does not necessarily match to the order of
// execution. We therefore look for the first command executed.
uint64_t MinStart = ProfilingsPtr->Timestamps->global.kernelStart;
for (uint64_t i = 1; i < ProfilingsPtr->NumEvents; i++) {
uint64_t Timestamp = ProfilingsPtr->Timestamps[i].global.kernelStart;
if (Timestamp < MinStart) {
MinStart = Timestamp;
}
}
uint64_t ContextStartTime =
(MinStart & TimestampMaxValue) * ZeTimerResolution;
return ReturnValue(ContextStartTime);
}
case UR_PROFILING_INFO_COMMAND_END: {
ProfilingsPtr = static_cast<struct command_buffer_profiling_t *>(
Event->CommandData);
// Sync-point order does not necessarily match to the order of
// execution. We therefore look for the last command executed.
uint64_t MaxEnd = ProfilingsPtr->Timestamps->global.kernelEnd;
uint64_t LastStart = ProfilingsPtr->Timestamps->global.kernelStart;
for (uint64_t i = 1; i < ProfilingsPtr->NumEvents; i++) {
uint64_t Timestamp = ProfilingsPtr->Timestamps[i].global.kernelEnd;
if (Timestamp > MaxEnd) {
MaxEnd = Timestamp;
LastStart = ProfilingsPtr->Timestamps[i].global.kernelStart;
}
}
uint64_t ContextStartTime = (LastStart & TimestampMaxValue);
uint64_t ContextEndTime = (MaxEnd & TimestampMaxValue);

//
// Handle a possible wrap-around (the underlying HW counter is <
// 64-bit). Note, it will not report correct time if there were multiple
// wrap arounds, and the longer term plan is to enlarge the capacity of
// the HW timestamps.
//
if (ContextEndTime <= ContextStartTime) {
ContextEndTime += TimestampMaxValue;
}
ContextEndTime *= ZeTimerResolution;
return ReturnValue(ContextEndTime);
}
default:
urPrint("urEventGetProfilingInfo: not supported ParamName\n");
return UR_RESULT_ERROR_INVALID_VALUE;
}
} else {
return UR_RESULT_ERROR_PROFILING_INFO_NOT_AVAILABLE;
}
}

switch (PropName) {
case UR_PROFILING_INFO_COMMAND_START: {
ZE2UR_CALL(zeEventQueryKernelTimestamp, (Event->ZeEvent, &tsResult));
Expand Down Expand Up @@ -771,8 +825,10 @@ ur_result_t urEventReleaseInternal(ur_event_handle_t Event) {
if (Event->CommandType == UR_COMMAND_COMMAND_BUFFER_ENQUEUE_EXP &&
Event->CommandData) {
// Free the memory extra event allocated for profiling purposed.
auto AssociateEvent = static_cast<ur_event_handle_t>(Event->CommandData);
urEventRelease(AssociateEvent);
struct command_buffer_profiling_t *ProfilingPtr =
static_cast<struct command_buffer_profiling_t *>(Event->CommandData);
urUSMFree(Event->Context, (void *)ProfilingPtr->Timestamps);
delete ProfilingPtr;
Event->CommandData = nullptr;
}
if (Event->OwnNativeHandle) {
Expand Down

0 comments on commit 68e1d24

Please sign in to comment.