From 200694b09b1a7c88bda9d1aee9f35005ab4c8dd2 Mon Sep 17 00:00:00 2001 From: Artur Gainullin Date: Fri, 8 Mar 2024 16:04:21 -0800 Subject: [PATCH] [SYCL] Support profiling info for event returned by NOP barrier (#12949) Currently if ext_oneapi_barrier without waitlist is submitted to the in-order queue that doesn't have the last command (empty queue) then we return default constructed event which doesn't have profiling info because it is not associated with any queue. Associate such event with the queue and record submission time which is equal to the start time and the end time for such event because it basically corresponds to NOP. --- sycl/source/detail/event_impl.cpp | 23 ++++++-- sycl/source/detail/event_impl.hpp | 15 ++++- sycl/source/detail/helpers.cpp | 5 +- sycl/source/detail/queue_impl.cpp | 6 +- sycl/source/detail/scheduler/commands.cpp | 5 +- sycl/source/queue.cpp | 57 ++++++++++++------- .../Regression/nop_event_profiling.cpp | 36 ++++++++++++ 7 files changed, 113 insertions(+), 34 deletions(-) create mode 100644 sycl/test-e2e/Regression/nop_event_profiling.cpp diff --git a/sycl/source/detail/event_impl.cpp b/sycl/source/detail/event_impl.cpp index 05e2481de75bc..2be79be727369 100644 --- a/sycl/source/detail/event_impl.cpp +++ b/sycl/source/detail/event_impl.cpp @@ -154,11 +154,15 @@ event_impl::event_impl(sycl::detail::pi::PiEvent Event, } } -event_impl::event_impl(const QueueImplPtr &Queue) - : MQueue{Queue}, - MIsProfilingEnabled{Queue->is_host() || Queue->MIsProfilingEnabled}, - MFallbackProfiling{MIsProfilingEnabled && Queue->isProfilingFallback()} { +event_impl::event_impl(const QueueImplPtr &Queue) { this->setContextImpl(Queue->getContextImplPtr()); + this->associateWithQueue(Queue); +} + +void event_impl::associateWithQueue(const QueueImplPtr &Queue) { + MQueue = Queue; + MIsProfilingEnabled = Queue->is_host() || Queue->MIsProfilingEnabled; + MFallbackProfiling = MIsProfilingEnabled && Queue->isProfilingFallback(); if (Queue->is_host()) { MState.store(HES_NotComplete); if (Queue->has_property()) { @@ -284,6 +288,7 @@ template <> uint64_t event_impl::get_profiling_info() { checkProfilingPreconditions(); + // The delay between the submission and the actual start of a CommandBuffer // can be short. Consequently, the submission time, which is based on // an estimated clock and not on the real device clock, may be ahead of the @@ -312,6 +317,11 @@ template <> uint64_t event_impl::get_profiling_info() { checkProfilingPreconditions(); + + // For nop command start time is equal to submission time. + if (isNOP() && MSubmitTime) + return MSubmitTime; + if (!MHostEvent) { if (MEvent) { auto StartTime = @@ -339,6 +349,11 @@ event_impl::get_profiling_info() { template <> uint64_t event_impl::get_profiling_info() { checkProfilingPreconditions(); + + // For nop command end time is equal to submission time. + if (isNOP() && MSubmitTime) + return MSubmitTime; + if (!MHostEvent) { if (MEvent) { auto EndTime = diff --git a/sycl/source/detail/event_impl.hpp b/sycl/source/detail/event_impl.hpp index ed5b9ef07ef91..4a7467691127c 100644 --- a/sycl/source/detail/event_impl.hpp +++ b/sycl/source/detail/event_impl.hpp @@ -223,6 +223,17 @@ class event_impl { MSubmittedQueue = SubmittedQueue; }; + /// Associate event with provided queue. + /// + /// @return + void associateWithQueue(const QueueImplPtr &Queue); + + /// Indicates if this event is not associated with any command and doesn't + /// have native handle. + /// + /// @return true if no associated command and no event handle. + bool isNOP() { return !MCommand && !getHandleRef(); } + /// Calling this function queries the current device timestamp and sets it as /// submission time for the command associated with this event. void setSubmissionTime(); @@ -316,8 +327,8 @@ class event_impl { std::unique_ptr MHostProfilingInfo; void *MCommand = nullptr; std::weak_ptr MQueue; - const bool MIsProfilingEnabled = false; - const bool MFallbackProfiling = false; + bool MIsProfilingEnabled = false; + bool MFallbackProfiling = false; std::weak_ptr MWorkerQueue; std::weak_ptr MSubmittedQueue; diff --git a/sycl/source/detail/helpers.cpp b/sycl/source/detail/helpers.cpp index 222b06127207d..1bdb2ddbd4697 100644 --- a/sycl/source/detail/helpers.cpp +++ b/sycl/source/detail/helpers.cpp @@ -31,8 +31,9 @@ getOrWaitEvents(std::vector DepEvents, ContextImplPtr Context) { // throwaway events created with empty constructor will not have a context // (which is set lazily) calling getContextImpl() would set that // context, which we wish to avoid as it is expensive. - if (!SyclEventImplPtr->isContextInitialized() && - !SyclEventImplPtr->is_host()) { + if ((!SyclEventImplPtr->isContextInitialized() && + !SyclEventImplPtr->is_host()) || + SyclEventImplPtr->isNOP()) { continue; } // The fusion command and its event are associated with a non-host context, diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index bc8c26ca88cb0..f819b35053258 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -290,8 +290,10 @@ areEventsSafeForSchedulerBypass(const std::vector &DepEvents, // Events that don't have an initialized context are throwaway events that // don't represent actual dependencies. Calling getContextImpl() would set // their context, which we wish to avoid as it is expensive. - if (!SyclEventImplPtr->isContextInitialized() && - !SyclEventImplPtr->is_host()) { + // NOP events also don't represent actual dependencies. + if ((!SyclEventImplPtr->isContextInitialized() && + !SyclEventImplPtr->is_host()) || + SyclEventImplPtr->isNOP()) { return true; } if (SyclEventImplPtr->is_host()) { diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index b113da757bd0c..8777b82db1f6b 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -263,8 +263,9 @@ std::vector Command::getPiEventsBlocking( // Throwaway events created with empty constructor will not have a context // (which is set lazily) calling getContextImpl() would set that // context, which we wish to avoid as it is expensive. - // Skip host task also. - if (!EventImpl->isContextInitialized() || EventImpl->is_host()) + // Skip host task and NOP events also. + if (!EventImpl->isContextInitialized() || EventImpl->is_host() || + EventImpl->isNOP()) continue; // In this path nullptr native event means that the command has not been // enqueued. It may happen if async enqueue in a host task is involved. diff --git a/sycl/source/queue.cpp b/sycl/source/queue.cpp index 3fee25b7236a2..acaecf2696629 100644 --- a/sycl/source/queue.cpp +++ b/sycl/source/queue.cpp @@ -205,6 +205,34 @@ void queue::wait_and_throw_proxy(const detail::code_location &CodeLoc) { impl->wait_and_throw(CodeLoc); } +static event +getBarrierEventForInorderQueueHelper(const detail::QueueImplPtr QueueImpl) { + // The last command recorded in the graph is not tracked by the queue but by + // the graph itself. We must therefore search for the last node/event in the + // graph. + if (auto Graph = QueueImpl->getCommandGraph()) { + auto LastEvent = + Graph->getEventForNode(Graph->getLastInorderNode(QueueImpl)); + return sycl::detail::createSyclObjFromImpl(LastEvent); + } + auto LastEvent = QueueImpl->getLastEvent(); + if (QueueImpl->MDiscardEvents) { + std::cout << "Discard event enabled" << std::endl; + return LastEvent; + } + + auto LastEventImpl = detail::getSyclObjImpl(LastEvent); + // If last event is default constructed event then we want to associate it + // with the queue and record submission time if profiling is enabled. Such + // event corresponds to NOP and its submit time is same as start time and + // end time. + if (!LastEventImpl->isContextInitialized()) { + LastEventImpl->associateWithQueue(QueueImpl); + LastEventImpl->setSubmissionTime(); + } + return detail::createSyclObjFromImpl(LastEventImpl); +} + /// Prevents any commands submitted afterward to this queue from executing /// until all commands previously submitted to this queue have entered the /// complete state. @@ -213,16 +241,8 @@ void queue::wait_and_throw_proxy(const detail::code_location &CodeLoc) { /// \return a SYCL event object, which corresponds to the queue the command /// group is being enqueued on. event queue::ext_oneapi_submit_barrier(const detail::code_location &CodeLoc) { - if (is_in_order()) { - // The last command recorded in the graph is not tracked by the queue but by - // the graph itself. We must therefore search for the last node/event in the - // graph. - if (auto Graph = impl->getCommandGraph()) { - auto LastEvent = Graph->getEventForNode(Graph->getLastInorderNode(impl)); - return sycl::detail::createSyclObjFromImpl(LastEvent); - } - return impl->getLastEvent(); - } + if (is_in_order()) + return getBarrierEventForInorderQueueHelper(impl); return submit([=](handler &CGH) { CGH.ext_oneapi_barrier(); }, CodeLoc); } @@ -238,20 +258,13 @@ event queue::ext_oneapi_submit_barrier(const detail::code_location &CodeLoc) { /// group is being enqueued on. event queue::ext_oneapi_submit_barrier(const std::vector &WaitList, const detail::code_location &CodeLoc) { - bool AllEventsEmpty = std::all_of( + bool AllEventsEmptyOrNop = std::all_of( begin(WaitList), end(WaitList), [&](const event &Event) -> bool { - return !detail::getSyclObjImpl(Event)->isContextInitialized(); + return !detail::getSyclObjImpl(Event)->isContextInitialized() || + detail::getSyclObjImpl(Event)->isNOP(); }); - if (is_in_order() && AllEventsEmpty) { - // The last command recorded in the graph is not tracked by the queue but by - // the graph itself. We must therefore search for the last node/event in the - // graph. - if (auto Graph = impl->getCommandGraph()) { - auto LastEvent = Graph->getEventForNode(Graph->getLastInorderNode(impl)); - return sycl::detail::createSyclObjFromImpl(LastEvent); - } - return impl->getLastEvent(); - } + if (is_in_order() && AllEventsEmptyOrNop) + return getBarrierEventForInorderQueueHelper(impl); return submit([=](handler &CGH) { CGH.ext_oneapi_barrier(WaitList); }, CodeLoc); diff --git a/sycl/test-e2e/Regression/nop_event_profiling.cpp b/sycl/test-e2e/Regression/nop_event_profiling.cpp new file mode 100644 index 0000000000000..38f2004da8ef4 --- /dev/null +++ b/sycl/test-e2e/Regression/nop_event_profiling.cpp @@ -0,0 +1,36 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out + +// Test to check that it is possible to get profiling info from the event +// returned by barrier which turns into NOP. + +#include + +int main() { + sycl::event start; + sycl::event stop; + sycl::queue q{sycl::property_list(sycl::property::queue::in_order(), + sycl::property::queue::enable_profiling())}; + float elapsed = 0; + + start = q.ext_oneapi_submit_barrier(); + std::cout << "before parallel_for" << std::endl; + q.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, 16) * sycl::range<3>(1, 1, 16), + sycl::range<3>(1, 1, 16)), + [=](sycl::nd_item<3> item_ct1) { + double d = 123; + for (int i = 0; i < 10000; i++) { + d = d * i; + } + }); + std::cout << "after parallel_for" << std::endl; + stop = q.ext_oneapi_submit_barrier(); + stop.wait_and_throw(); + elapsed = + (stop.get_profiling_info() - + start.get_profiling_info()) / + 1000000.0f; + std::cout << "elapsed:" << elapsed << std::endl; + return 0; +}