Skip to content

Commit

Permalink
Merge pull request #1035 from Bensuo/cmd-buffer-profiling-l0
Browse files Browse the repository at this point in the history
[EXP][CMDBUF] Add support for recovering CommandBuffer profiling information
  • Loading branch information
kbenzie authored Jan 11, 2024
2 parents e1414e1 + 35b6a5e commit d06ba9d
Show file tree
Hide file tree
Showing 3 changed files with 138 additions and 29 deletions.
87 changes: 58 additions & 29 deletions source/adapters/level_zero/command_buffer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -45,13 +45,13 @@
│ Prefix │ Commands added to UR command-buffer by UR user │ Suffix │
└──────────┴────────────────────────────────────────────────┴─────────┘
┌───────────────────┬──────────────────────────────┐
Prefix │Reset signal event │ Barrier waiting on wait event│
└───────────────────┴──────────────────────────────┘
┌───────────────────┬──────────────┐──────────────────────────────┐
Prefix │Reset signal event │ Reset events │ Barrier waiting on wait event│
└───────────────────┴──────────────┘──────────────────────────────┘
┌─────────────────────────────────────────────┐──────────────┐
Suffix │Barrier waiting on sync-point event, │ Reset events
│signalling the UR command-buffer signal event│
Suffix │Barrier waiting on sync-point event, │ Query CMD
│signalling the UR command-buffer signal event│ Timestamps
└─────────────────────────────────────────────┘──────────────┘
For a call to `urCommandBufferEnqueueExp` with an event_list `EL`,
Expand Down Expand Up @@ -433,6 +433,10 @@ urCommandBufferCreateExp(ur_context_handle_t Context, ur_device_handle_t Device,

ZeStruct<ze_command_list_desc_t> ZeCommandListDesc;
ZeCommandListDesc.commandQueueGroupOrdinal = QueueGroupOrdinal;
// Dependencies between commands are explicitly enforced by sync points when
// enqueuing. Consequently, relax the command ordering in the command list
// can enable the backend to further optimize the workload
ZeCommandListDesc.flags = ZE_COMMAND_LIST_FLAG_RELAXED_ORDERING;

ze_command_list_handle_t ZeCommandList;
// TODO We could optimize this by pooling both Level Zero command-lists and UR
Expand Down Expand Up @@ -494,18 +498,6 @@ urCommandBufferFinalizeExp(ur_exp_command_buffer_handle_t CommandBuffer) {
(CommandBuffer->ZeCommandList, CommandBuffer->SignalEvent->ZeEvent,
NumEvents, WaitEventList.data()));

// Reset the wait-event for the UR command-buffer that is signalled when its
// submission dependencies have been satisfied.
ZE2UR_CALL(zeCommandListAppendEventReset,
(CommandBuffer->ZeCommandList, CommandBuffer->WaitEvent->ZeEvent));

// Reset the L0 events we use for command-buffer internal sync-points to the
// non-signalled state
for (auto Event : WaitEventList) {
ZE2UR_CALL(zeCommandListAppendEventReset,
(CommandBuffer->ZeCommandList, Event));
}

// Close the command list and have it ready for dispatch.
ZE2UR_CALL(zeCommandListClose, (CommandBuffer->ZeCommandList));
return UR_RESULT_SUCCESS;
Expand Down Expand Up @@ -899,14 +891,28 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp(
// Create command-list to execute before `CommandListPtr` and will signal
// when `EventWaitList` dependencies are complete.
ur_command_list_ptr_t WaitCommandList{};
UR_CALL(Queue->Context->getAvailableCommandList(Queue, WaitCommandList, false,
false));

// Create a list of events of all the events that compose the command buffer
// workload.
// This loop also resets the L0 events we use for command-buffer internal
// sync-points to the non-signalled state.
// This is required for multiple submissions.
const size_t NumEvents = CommandBuffer->SyncPoints.size();
std::vector<ze_event_handle_t> WaitEventList{NumEvents};
for (size_t i = 0; i < NumEvents; i++) {
auto ZeEvent = CommandBuffer->SyncPoints[i]->ZeEvent;
WaitEventList[i] = ZeEvent;
ZE2UR_CALL(zeCommandListAppendEventReset,
(WaitCommandList->first, ZeEvent));
}

if (NumEventsInWaitList) {
_ur_ze_event_list_t TmpWaitList;
UR_CALL(TmpWaitList.createAndRetainUrZeEventList(
NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine));

UR_CALL(Queue->Context->getAvailableCommandList(Queue, WaitCommandList,
false, false))

// Update the WaitList of the Wait Event
// Events are appended to the WaitList if the WaitList is not empty
if (CommandBuffer->WaitEvent->WaitList.isEmpty())
Expand All @@ -919,9 +925,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp(
CommandBuffer->WaitEvent->WaitList.Length,
CommandBuffer->WaitEvent->WaitList.ZeEventList));
} else {
UR_CALL(Queue->Context->getAvailableCommandList(Queue, WaitCommandList,
false, false));

ZE2UR_CALL(zeCommandListAppendSignalEvent,
(WaitCommandList->first, CommandBuffer->WaitEvent->ZeEvent));
}
Expand All @@ -930,17 +933,43 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp(
ur_event_handle_t RetEvent{};
// Create a command-list to signal RetEvent on completion
ur_command_list_ptr_t SignalCommandList{};
if (Event) {
UR_CALL(Queue->Context->getAvailableCommandList(Queue, SignalCommandList,
false, false));
UR_CALL(Queue->Context->getAvailableCommandList(Queue, SignalCommandList,
false, false));
// Reset the wait-event for the UR command-buffer that is signalled when its
// submission dependencies have been satisfied.
ZE2UR_CALL(zeCommandListAppendEventReset,
(SignalCommandList->first, CommandBuffer->WaitEvent->ZeEvent));

if (Event) {
UR_CALL(createEventAndAssociateQueue(Queue, &RetEvent,
UR_COMMAND_COMMAND_BUFFER_ENQUEUE_EXP,
SignalCommandList, false));

ZE2UR_CALL(zeCommandListAppendBarrier,
(SignalCommandList->first, RetEvent->ZeEvent, 1,
&(CommandBuffer->SignalEvent->ZeEvent)));
if ((Queue->Properties & UR_QUEUE_FLAG_PROFILING_ENABLE)) {
// Multiple submissions of a command buffer implies that we need to save
// the event timestamps before resubmiting the command buffer. We
// therefore copy the these timestamps in a dedicated USM memory section
// before completing the command buffer execution, and then attach this
// memory to the event returned to users to allow to allow the profiling
// engine to recover these timestamps.
command_buffer_profiling_t *Profiling = new command_buffer_profiling_t();

Profiling->NumEvents = WaitEventList.size();
Profiling->Timestamps =
new ze_kernel_timestamp_result_t[Profiling->NumEvents];

ZE2UR_CALL(zeCommandListAppendQueryKernelTimestamps,
(SignalCommandList->first, WaitEventList.size(),
WaitEventList.data(), (void *)Profiling->Timestamps, 0,
RetEvent->ZeEvent, 1,
&(CommandBuffer->SignalEvent->ZeEvent)));

RetEvent->CommandData = static_cast<void *>(Profiling);
} else {
ZE2UR_CALL(zeCommandListAppendBarrier,
(SignalCommandList->first, RetEvent->ZeEvent, 1,
&(CommandBuffer->SignalEvent->ZeEvent)));
}
}

// Execution our command-lists asynchronously
Expand Down
5 changes: 5 additions & 0 deletions source/adapters/level_zero/command_buffer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,11 @@
#include "context.hpp"
#include "queue.hpp"

struct command_buffer_profiling_t {
ur_exp_command_buffer_sync_point_t NumEvents;
ze_kernel_timestamp_result_t *Timestamps;
};

struct ur_exp_command_buffer_handle_t_ : public _ur_object {
ur_exp_command_buffer_handle_t_(ur_context_handle_t Context,
ur_device_handle_t Device,
Expand Down
75 changes: 75 additions & 0 deletions source/adapters/level_zero/event.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
#include <mutex>
#include <string.h>

#include "command_buffer.hpp"
#include "common.hpp"
#include "event.hpp"
#include "ur_level_zero.hpp"
Expand Down Expand Up @@ -454,6 +455,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventGetProfilingInfo(
///< bytes returned in propValue
) {
std::shared_lock<ur_shared_mutex> EventLock(Event->Mutex);

if (Event->UrQueue &&
(Event->UrQueue->Properties & UR_QUEUE_FLAG_PROFILING_ENABLE) == 0) {
return UR_RESULT_ERROR_PROFILING_INFO_NOT_AVAILABLE;
Expand All @@ -470,6 +472,70 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventGetProfilingInfo(

ze_kernel_timestamp_result_t tsResult;

// A Command-buffer consists of three command-lists for which only a single
// event is returned to users. The actual profiling information related to the
// command-buffer should therefore be extrated from graph events themsleves.
// The timestamps of these events are saved in a memory region attached to
// event usning CommandData field. The timings must therefore be recovered
// from this memory.
if (Event->CommandType == UR_COMMAND_COMMAND_BUFFER_ENQUEUE_EXP) {
if (Event->CommandData) {
command_buffer_profiling_t *ProfilingsPtr;
switch (PropName) {
case UR_PROFILING_INFO_COMMAND_START: {
ProfilingsPtr =
static_cast<command_buffer_profiling_t *>(Event->CommandData);
// Sync-point order does not necessarily match to the order of
// execution. We therefore look for the first command executed.
uint64_t MinStart = ProfilingsPtr->Timestamps[0].global.kernelStart;
for (uint64_t i = 1; i < ProfilingsPtr->NumEvents; i++) {
uint64_t Timestamp = ProfilingsPtr->Timestamps[i].global.kernelStart;
if (Timestamp < MinStart) {
MinStart = Timestamp;
}
}
uint64_t ContextStartTime =
(MinStart & TimestampMaxValue) * ZeTimerResolution;
return ReturnValue(ContextStartTime);
}
case UR_PROFILING_INFO_COMMAND_END: {
ProfilingsPtr =
static_cast<command_buffer_profiling_t *>(Event->CommandData);
// Sync-point order does not necessarily match to the order of
// execution. We therefore look for the last command executed.
uint64_t MaxEnd = ProfilingsPtr->Timestamps[0].global.kernelEnd;
uint64_t LastStart = ProfilingsPtr->Timestamps[0].global.kernelStart;
for (uint64_t i = 1; i < ProfilingsPtr->NumEvents; i++) {
uint64_t Timestamp = ProfilingsPtr->Timestamps[i].global.kernelEnd;
if (Timestamp > MaxEnd) {
MaxEnd = Timestamp;
LastStart = ProfilingsPtr->Timestamps[i].global.kernelStart;
}
}
uint64_t ContextStartTime = (LastStart & TimestampMaxValue);
uint64_t ContextEndTime = (MaxEnd & TimestampMaxValue);

//
// Handle a possible wrap-around (the underlying HW counter is <
// 64-bit). Note, it will not report correct time if there were multiple
// wrap arounds, and the longer term plan is to enlarge the capacity of
// the HW timestamps.
//
if (ContextEndTime <= ContextStartTime) {
ContextEndTime += TimestampMaxValue;
}
ContextEndTime *= ZeTimerResolution;
return ReturnValue(ContextEndTime);
}
default:
urPrint("urEventGetProfilingInfo: not supported ParamName\n");
return UR_RESULT_ERROR_INVALID_VALUE;
}
} else {
return UR_RESULT_ERROR_PROFILING_INFO_NOT_AVAILABLE;
}
}

switch (PropName) {
case UR_PROFILING_INFO_COMMAND_START: {
ZE2UR_CALL(zeEventQueryKernelTimestamp, (Event->ZeEvent, &tsResult));
Expand Down Expand Up @@ -763,6 +829,15 @@ ur_result_t urEventReleaseInternal(ur_event_handle_t Event) {
return Res;
Event->CommandData = nullptr;
}
if (Event->CommandType == UR_COMMAND_COMMAND_BUFFER_ENQUEUE_EXP &&
Event->CommandData) {
// Free the memory extra event allocated for profiling purposed.
command_buffer_profiling_t *ProfilingPtr =
static_cast<command_buffer_profiling_t *>(Event->CommandData);
delete[] ProfilingPtr->Timestamps;
delete ProfilingPtr;
Event->CommandData = nullptr;
}
if (Event->OwnNativeHandle) {
if (DisableEventsCaching) {
auto ZeResult = ZE_CALL_NOCHECK(zeEventDestroy, (Event->ZeEvent));
Expand Down

0 comments on commit d06ba9d

Please sign in to comment.