Skip to content

Commit

Permalink
[SYCL] Support profiling info for event returned by NOP barrier (#12949)
Browse files Browse the repository at this point in the history
Currently if ext_oneapi_barrier without waitlist is submitted to the
in-order queue that doesn't have the last command (empty queue) then we
return default constructed event which doesn't have profiling info
because it is not associated with any queue. Associate such event with
the queue and record submission time which is equal to the start time
and the end time for such event because it basically corresponds to NOP.
  • Loading branch information
againull committed Mar 9, 2024
1 parent 2488da1 commit 200694b
Show file tree
Hide file tree
Showing 7 changed files with 113 additions and 34 deletions.
23 changes: 19 additions & 4 deletions sycl/source/detail/event_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -154,11 +154,15 @@ event_impl::event_impl(sycl::detail::pi::PiEvent Event,
}
}

event_impl::event_impl(const QueueImplPtr &Queue)
: MQueue{Queue},
MIsProfilingEnabled{Queue->is_host() || Queue->MIsProfilingEnabled},
MFallbackProfiling{MIsProfilingEnabled && Queue->isProfilingFallback()} {
event_impl::event_impl(const QueueImplPtr &Queue) {
this->setContextImpl(Queue->getContextImplPtr());
this->associateWithQueue(Queue);
}

void event_impl::associateWithQueue(const QueueImplPtr &Queue) {
MQueue = Queue;
MIsProfilingEnabled = Queue->is_host() || Queue->MIsProfilingEnabled;
MFallbackProfiling = MIsProfilingEnabled && Queue->isProfilingFallback();
if (Queue->is_host()) {
MState.store(HES_NotComplete);
if (Queue->has_property<property::queue::enable_profiling>()) {
Expand Down Expand Up @@ -284,6 +288,7 @@ template <>
uint64_t
event_impl::get_profiling_info<info::event_profiling::command_submit>() {
checkProfilingPreconditions();

// The delay between the submission and the actual start of a CommandBuffer
// can be short. Consequently, the submission time, which is based on
// an estimated clock and not on the real device clock, may be ahead of the
Expand Down Expand Up @@ -312,6 +317,11 @@ template <>
uint64_t
event_impl::get_profiling_info<info::event_profiling::command_start>() {
checkProfilingPreconditions();

// For nop command start time is equal to submission time.
if (isNOP() && MSubmitTime)
return MSubmitTime;

if (!MHostEvent) {
if (MEvent) {
auto StartTime =
Expand Down Expand Up @@ -339,6 +349,11 @@ event_impl::get_profiling_info<info::event_profiling::command_start>() {
template <>
uint64_t event_impl::get_profiling_info<info::event_profiling::command_end>() {
checkProfilingPreconditions();

// For nop command end time is equal to submission time.
if (isNOP() && MSubmitTime)
return MSubmitTime;

if (!MHostEvent) {
if (MEvent) {
auto EndTime =
Expand Down
15 changes: 13 additions & 2 deletions sycl/source/detail/event_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -223,6 +223,17 @@ class event_impl {
MSubmittedQueue = SubmittedQueue;
};

/// Associate event with provided queue.
///
/// @return
void associateWithQueue(const QueueImplPtr &Queue);

/// Indicates if this event is not associated with any command and doesn't
/// have native handle.
///
/// @return true if no associated command and no event handle.
bool isNOP() { return !MCommand && !getHandleRef(); }

/// Calling this function queries the current device timestamp and sets it as
/// submission time for the command associated with this event.
void setSubmissionTime();
Expand Down Expand Up @@ -316,8 +327,8 @@ class event_impl {
std::unique_ptr<HostProfilingInfo> MHostProfilingInfo;
void *MCommand = nullptr;
std::weak_ptr<queue_impl> MQueue;
const bool MIsProfilingEnabled = false;
const bool MFallbackProfiling = false;
bool MIsProfilingEnabled = false;
bool MFallbackProfiling = false;

std::weak_ptr<queue_impl> MWorkerQueue;
std::weak_ptr<queue_impl> MSubmittedQueue;
Expand Down
5 changes: 3 additions & 2 deletions sycl/source/detail/helpers.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,9 @@ getOrWaitEvents(std::vector<sycl::event> DepEvents, ContextImplPtr Context) {
// throwaway events created with empty constructor will not have a context
// (which is set lazily) calling getContextImpl() would set that
// context, which we wish to avoid as it is expensive.
if (!SyclEventImplPtr->isContextInitialized() &&
!SyclEventImplPtr->is_host()) {
if ((!SyclEventImplPtr->isContextInitialized() &&
!SyclEventImplPtr->is_host()) ||
SyclEventImplPtr->isNOP()) {
continue;
}
// The fusion command and its event are associated with a non-host context,
Expand Down
6 changes: 4 additions & 2 deletions sycl/source/detail/queue_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -290,8 +290,10 @@ areEventsSafeForSchedulerBypass(const std::vector<sycl::event> &DepEvents,
// Events that don't have an initialized context are throwaway events that
// don't represent actual dependencies. Calling getContextImpl() would set
// their context, which we wish to avoid as it is expensive.
if (!SyclEventImplPtr->isContextInitialized() &&
!SyclEventImplPtr->is_host()) {
// NOP events also don't represent actual dependencies.
if ((!SyclEventImplPtr->isContextInitialized() &&
!SyclEventImplPtr->is_host()) ||
SyclEventImplPtr->isNOP()) {
return true;
}
if (SyclEventImplPtr->is_host()) {
Expand Down
5 changes: 3 additions & 2 deletions sycl/source/detail/scheduler/commands.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -263,8 +263,9 @@ std::vector<sycl::detail::pi::PiEvent> Command::getPiEventsBlocking(
// Throwaway events created with empty constructor will not have a context
// (which is set lazily) calling getContextImpl() would set that
// context, which we wish to avoid as it is expensive.
// Skip host task also.
if (!EventImpl->isContextInitialized() || EventImpl->is_host())
// Skip host task and NOP events also.
if (!EventImpl->isContextInitialized() || EventImpl->is_host() ||
EventImpl->isNOP())
continue;
// In this path nullptr native event means that the command has not been
// enqueued. It may happen if async enqueue in a host task is involved.
Expand Down
57 changes: 35 additions & 22 deletions sycl/source/queue.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,34 @@ void queue::wait_and_throw_proxy(const detail::code_location &CodeLoc) {
impl->wait_and_throw(CodeLoc);
}

static event
getBarrierEventForInorderQueueHelper(const detail::QueueImplPtr QueueImpl) {
// The last command recorded in the graph is not tracked by the queue but by
// the graph itself. We must therefore search for the last node/event in the
// graph.
if (auto Graph = QueueImpl->getCommandGraph()) {
auto LastEvent =
Graph->getEventForNode(Graph->getLastInorderNode(QueueImpl));
return sycl::detail::createSyclObjFromImpl<event>(LastEvent);
}
auto LastEvent = QueueImpl->getLastEvent();
if (QueueImpl->MDiscardEvents) {
std::cout << "Discard event enabled" << std::endl;
return LastEvent;
}

auto LastEventImpl = detail::getSyclObjImpl(LastEvent);
// If last event is default constructed event then we want to associate it
// with the queue and record submission time if profiling is enabled. Such
// event corresponds to NOP and its submit time is same as start time and
// end time.
if (!LastEventImpl->isContextInitialized()) {
LastEventImpl->associateWithQueue(QueueImpl);
LastEventImpl->setSubmissionTime();
}
return detail::createSyclObjFromImpl<event>(LastEventImpl);
}

/// Prevents any commands submitted afterward to this queue from executing
/// until all commands previously submitted to this queue have entered the
/// complete state.
Expand All @@ -213,16 +241,8 @@ void queue::wait_and_throw_proxy(const detail::code_location &CodeLoc) {
/// \return a SYCL event object, which corresponds to the queue the command
/// group is being enqueued on.
event queue::ext_oneapi_submit_barrier(const detail::code_location &CodeLoc) {
if (is_in_order()) {
// The last command recorded in the graph is not tracked by the queue but by
// the graph itself. We must therefore search for the last node/event in the
// graph.
if (auto Graph = impl->getCommandGraph()) {
auto LastEvent = Graph->getEventForNode(Graph->getLastInorderNode(impl));
return sycl::detail::createSyclObjFromImpl<event>(LastEvent);
}
return impl->getLastEvent();
}
if (is_in_order())
return getBarrierEventForInorderQueueHelper(impl);

return submit([=](handler &CGH) { CGH.ext_oneapi_barrier(); }, CodeLoc);
}
Expand All @@ -238,20 +258,13 @@ event queue::ext_oneapi_submit_barrier(const detail::code_location &CodeLoc) {
/// group is being enqueued on.
event queue::ext_oneapi_submit_barrier(const std::vector<event> &WaitList,
const detail::code_location &CodeLoc) {
bool AllEventsEmpty = std::all_of(
bool AllEventsEmptyOrNop = std::all_of(
begin(WaitList), end(WaitList), [&](const event &Event) -> bool {
return !detail::getSyclObjImpl(Event)->isContextInitialized();
return !detail::getSyclObjImpl(Event)->isContextInitialized() ||
detail::getSyclObjImpl(Event)->isNOP();
});
if (is_in_order() && AllEventsEmpty) {
// The last command recorded in the graph is not tracked by the queue but by
// the graph itself. We must therefore search for the last node/event in the
// graph.
if (auto Graph = impl->getCommandGraph()) {
auto LastEvent = Graph->getEventForNode(Graph->getLastInorderNode(impl));
return sycl::detail::createSyclObjFromImpl<event>(LastEvent);
}
return impl->getLastEvent();
}
if (is_in_order() && AllEventsEmptyOrNop)
return getBarrierEventForInorderQueueHelper(impl);

return submit([=](handler &CGH) { CGH.ext_oneapi_barrier(WaitList); },
CodeLoc);
Expand Down
36 changes: 36 additions & 0 deletions sycl/test-e2e/Regression/nop_event_profiling.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
// RUN: %{build} -o %t.out
// RUN: %{run} %t.out

// Test to check that it is possible to get profiling info from the event
// returned by barrier which turns into NOP.

#include <sycl/sycl.hpp>

int main() {
sycl::event start;
sycl::event stop;
sycl::queue q{sycl::property_list(sycl::property::queue::in_order(),
sycl::property::queue::enable_profiling())};
float elapsed = 0;

start = q.ext_oneapi_submit_barrier();
std::cout << "before parallel_for" << std::endl;
q.parallel_for(
sycl::nd_range<3>(sycl::range<3>(1, 1, 16) * sycl::range<3>(1, 1, 16),
sycl::range<3>(1, 1, 16)),
[=](sycl::nd_item<3> item_ct1) {
double d = 123;
for (int i = 0; i < 10000; i++) {
d = d * i;
}
});
std::cout << "after parallel_for" << std::endl;
stop = q.ext_oneapi_submit_barrier();
stop.wait_and_throw();
elapsed =
(stop.get_profiling_info<sycl::info::event_profiling::command_end>() -
start.get_profiling_info<sycl::info::event_profiling::command_start>()) /
1000000.0f;
std::cout << "elapsed:" << elapsed << std::endl;
return 0;
}

0 comments on commit 200694b

Please sign in to comment.