Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[SYCL] Support profiling info for event returned by NOP barrier #12949

Merged
merged 2 commits into from
Mar 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 19 additions & 4 deletions sycl/source/detail/event_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -154,11 +154,15 @@ event_impl::event_impl(sycl::detail::pi::PiEvent Event,
}
}

event_impl::event_impl(const QueueImplPtr &Queue)
: MQueue{Queue},
MIsProfilingEnabled{Queue->is_host() || Queue->MIsProfilingEnabled},
MFallbackProfiling{MIsProfilingEnabled && Queue->isProfilingFallback()} {
event_impl::event_impl(const QueueImplPtr &Queue) {
this->setContextImpl(Queue->getContextImplPtr());
this->associateWithQueue(Queue);
}

void event_impl::associateWithQueue(const QueueImplPtr &Queue) {
MQueue = Queue;
MIsProfilingEnabled = Queue->is_host() || Queue->MIsProfilingEnabled;
MFallbackProfiling = MIsProfilingEnabled && Queue->isProfilingFallback();
if (Queue->is_host()) {
MState.store(HES_NotComplete);
if (Queue->has_property<property::queue::enable_profiling>()) {
Expand Down Expand Up @@ -284,6 +288,7 @@ template <>
uint64_t
event_impl::get_profiling_info<info::event_profiling::command_submit>() {
checkProfilingPreconditions();

// The delay between the submission and the actual start of a CommandBuffer
// can be short. Consequently, the submission time, which is based on
// an estimated clock and not on the real device clock, may be ahead of the
Expand Down Expand Up @@ -312,6 +317,11 @@ template <>
uint64_t
event_impl::get_profiling_info<info::event_profiling::command_start>() {
checkProfilingPreconditions();

// For nop command start time is equal to submission time.
if (isNOP() && MSubmitTime)
return MSubmitTime;

if (!MHostEvent) {
if (MEvent) {
auto StartTime =
Expand Down Expand Up @@ -339,6 +349,11 @@ event_impl::get_profiling_info<info::event_profiling::command_start>() {
template <>
uint64_t event_impl::get_profiling_info<info::event_profiling::command_end>() {
checkProfilingPreconditions();

// For nop command end time is equal to submission time.
if (isNOP() && MSubmitTime)
return MSubmitTime;

if (!MHostEvent) {
if (MEvent) {
auto EndTime =
Expand Down
15 changes: 13 additions & 2 deletions sycl/source/detail/event_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -223,6 +223,17 @@ class event_impl {
MSubmittedQueue = SubmittedQueue;
};

/// Associate event with provided queue.
///
/// @return
void associateWithQueue(const QueueImplPtr &Queue);

/// Indicates if this event is not associated with any command and doesn't
/// have native handle.
///
/// @return true if no associated command and no event handle.
bool isNOP() { return !MCommand && !getHandleRef(); }

/// Calling this function queries the current device timestamp and sets it as
/// submission time for the command associated with this event.
void setSubmissionTime();
Expand Down Expand Up @@ -316,8 +327,8 @@ class event_impl {
std::unique_ptr<HostProfilingInfo> MHostProfilingInfo;
void *MCommand = nullptr;
std::weak_ptr<queue_impl> MQueue;
const bool MIsProfilingEnabled = false;
const bool MFallbackProfiling = false;
bool MIsProfilingEnabled = false;
bool MFallbackProfiling = false;

std::weak_ptr<queue_impl> MWorkerQueue;
std::weak_ptr<queue_impl> MSubmittedQueue;
Expand Down
5 changes: 3 additions & 2 deletions sycl/source/detail/helpers.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,9 @@ getOrWaitEvents(std::vector<sycl::event> DepEvents, ContextImplPtr Context) {
// throwaway events created with empty constructor will not have a context
// (which is set lazily) calling getContextImpl() would set that
// context, which we wish to avoid as it is expensive.
if (!SyclEventImplPtr->isContextInitialized() &&
!SyclEventImplPtr->is_host()) {
if ((!SyclEventImplPtr->isContextInitialized() &&
!SyclEventImplPtr->is_host()) ||
SyclEventImplPtr->isNOP()) {
continue;
}
// The fusion command and its event are associated with a non-host context,
Expand Down
6 changes: 4 additions & 2 deletions sycl/source/detail/queue_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -290,8 +290,10 @@ areEventsSafeForSchedulerBypass(const std::vector<sycl::event> &DepEvents,
// Events that don't have an initialized context are throwaway events that
// don't represent actual dependencies. Calling getContextImpl() would set
// their context, which we wish to avoid as it is expensive.
if (!SyclEventImplPtr->isContextInitialized() &&
!SyclEventImplPtr->is_host()) {
// NOP events also don't represent actual dependencies.
if ((!SyclEventImplPtr->isContextInitialized() &&
!SyclEventImplPtr->is_host()) ||
SyclEventImplPtr->isNOP()) {
return true;
}
if (SyclEventImplPtr->is_host()) {
Expand Down
5 changes: 3 additions & 2 deletions sycl/source/detail/scheduler/commands.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -263,8 +263,9 @@ std::vector<sycl::detail::pi::PiEvent> Command::getPiEventsBlocking(
// Throwaway events created with empty constructor will not have a context
// (which is set lazily) calling getContextImpl() would set that
// context, which we wish to avoid as it is expensive.
// Skip host task also.
if (!EventImpl->isContextInitialized() || EventImpl->is_host())
// Skip host task and NOP events also.
if (!EventImpl->isContextInitialized() || EventImpl->is_host() ||
EventImpl->isNOP())
continue;
// In this path nullptr native event means that the command has not been
// enqueued. It may happen if async enqueue in a host task is involved.
Expand Down
57 changes: 35 additions & 22 deletions sycl/source/queue.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,34 @@ void queue::wait_and_throw_proxy(const detail::code_location &CodeLoc) {
impl->wait_and_throw(CodeLoc);
}

static event
getBarrierEventForInorderQueueHelper(const detail::QueueImplPtr QueueImpl) {
// The last command recorded in the graph is not tracked by the queue but by
// the graph itself. We must therefore search for the last node/event in the
// graph.
if (auto Graph = QueueImpl->getCommandGraph()) {
auto LastEvent =
Graph->getEventForNode(Graph->getLastInorderNode(QueueImpl));
return sycl::detail::createSyclObjFromImpl<event>(LastEvent);
}
auto LastEvent = QueueImpl->getLastEvent();
if (QueueImpl->MDiscardEvents) {
std::cout << "Discard event enabled" << std::endl;
return LastEvent;
}

auto LastEventImpl = detail::getSyclObjImpl(LastEvent);
// If last event is default constructed event then we want to associate it
// with the queue and record submission time if profiling is enabled. Such
// event corresponds to NOP and its submit time is same as start time and
// end time.
if (!LastEventImpl->isContextInitialized()) {
LastEventImpl->associateWithQueue(QueueImpl);
LastEventImpl->setSubmissionTime();
}
return detail::createSyclObjFromImpl<event>(LastEventImpl);
}

/// Prevents any commands submitted afterward to this queue from executing
/// until all commands previously submitted to this queue have entered the
/// complete state.
Expand All @@ -213,16 +241,8 @@ void queue::wait_and_throw_proxy(const detail::code_location &CodeLoc) {
/// \return a SYCL event object, which corresponds to the queue the command
/// group is being enqueued on.
event queue::ext_oneapi_submit_barrier(const detail::code_location &CodeLoc) {
if (is_in_order()) {
// The last command recorded in the graph is not tracked by the queue but by
// the graph itself. We must therefore search for the last node/event in the
// graph.
if (auto Graph = impl->getCommandGraph()) {
auto LastEvent = Graph->getEventForNode(Graph->getLastInorderNode(impl));
return sycl::detail::createSyclObjFromImpl<event>(LastEvent);
}
return impl->getLastEvent();
}
if (is_in_order())
return getBarrierEventForInorderQueueHelper(impl);

return submit([=](handler &CGH) { CGH.ext_oneapi_barrier(); }, CodeLoc);
}
Expand All @@ -238,20 +258,13 @@ event queue::ext_oneapi_submit_barrier(const detail::code_location &CodeLoc) {
/// group is being enqueued on.
event queue::ext_oneapi_submit_barrier(const std::vector<event> &WaitList,
const detail::code_location &CodeLoc) {
bool AllEventsEmpty = std::all_of(
bool AllEventsEmptyOrNop = std::all_of(
begin(WaitList), end(WaitList), [&](const event &Event) -> bool {
return !detail::getSyclObjImpl(Event)->isContextInitialized();
return !detail::getSyclObjImpl(Event)->isContextInitialized() ||
detail::getSyclObjImpl(Event)->isNOP();
});
if (is_in_order() && AllEventsEmpty) {
// The last command recorded in the graph is not tracked by the queue but by
// the graph itself. We must therefore search for the last node/event in the
// graph.
if (auto Graph = impl->getCommandGraph()) {
auto LastEvent = Graph->getEventForNode(Graph->getLastInorderNode(impl));
return sycl::detail::createSyclObjFromImpl<event>(LastEvent);
}
return impl->getLastEvent();
}
if (is_in_order() && AllEventsEmptyOrNop)
return getBarrierEventForInorderQueueHelper(impl);

return submit([=](handler &CGH) { CGH.ext_oneapi_barrier(WaitList); },
CodeLoc);
Expand Down
36 changes: 36 additions & 0 deletions sycl/test-e2e/Regression/nop_event_profiling.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
// RUN: %{build} -o %t.out
// RUN: %{run} %t.out

// Test to check that it is possible to get profiling info from the event
// returned by barrier which turns into NOP.

#include <sycl/sycl.hpp>

int main() {
sycl::event start;
sycl::event stop;
sycl::queue q{sycl::property_list(sycl::property::queue::in_order(),
sycl::property::queue::enable_profiling())};
float elapsed = 0;

start = q.ext_oneapi_submit_barrier();
std::cout << "before parallel_for" << std::endl;
q.parallel_for(
sycl::nd_range<3>(sycl::range<3>(1, 1, 16) * sycl::range<3>(1, 1, 16),
sycl::range<3>(1, 1, 16)),
[=](sycl::nd_item<3> item_ct1) {
double d = 123;
for (int i = 0; i < 10000; i++) {
d = d * i;
}
});
std::cout << "after parallel_for" << std::endl;
stop = q.ext_oneapi_submit_barrier();
stop.wait_and_throw();
elapsed =
(stop.get_profiling_info<sycl::info::event_profiling::command_end>() -
start.get_profiling_info<sycl::info::event_profiling::command_start>()) /
1000000.0f;
std::cout << "elapsed:" << elapsed << std::endl;
return 0;
}
Loading