diff --git a/sycl/source/detail/event_impl.cpp b/sycl/source/detail/event_impl.cpp index 05e2481de75bc..2be79be727369 100644 --- a/sycl/source/detail/event_impl.cpp +++ b/sycl/source/detail/event_impl.cpp @@ -154,11 +154,15 @@ event_impl::event_impl(sycl::detail::pi::PiEvent Event, } } -event_impl::event_impl(const QueueImplPtr &Queue) - : MQueue{Queue}, - MIsProfilingEnabled{Queue->is_host() || Queue->MIsProfilingEnabled}, - MFallbackProfiling{MIsProfilingEnabled && Queue->isProfilingFallback()} { +event_impl::event_impl(const QueueImplPtr &Queue) { this->setContextImpl(Queue->getContextImplPtr()); + this->associateWithQueue(Queue); +} + +void event_impl::associateWithQueue(const QueueImplPtr &Queue) { + MQueue = Queue; + MIsProfilingEnabled = Queue->is_host() || Queue->MIsProfilingEnabled; + MFallbackProfiling = MIsProfilingEnabled && Queue->isProfilingFallback(); if (Queue->is_host()) { MState.store(HES_NotComplete); if (Queue->has_property()) { @@ -284,6 +288,7 @@ template <> uint64_t event_impl::get_profiling_info() { checkProfilingPreconditions(); + // The delay between the submission and the actual start of a CommandBuffer // can be short. Consequently, the submission time, which is based on // an estimated clock and not on the real device clock, may be ahead of the @@ -312,6 +317,11 @@ template <> uint64_t event_impl::get_profiling_info() { checkProfilingPreconditions(); + + // For nop command start time is equal to submission time. + if (isNOP() && MSubmitTime) + return MSubmitTime; + if (!MHostEvent) { if (MEvent) { auto StartTime = @@ -339,6 +349,11 @@ event_impl::get_profiling_info() { template <> uint64_t event_impl::get_profiling_info() { checkProfilingPreconditions(); + + // For nop command end time is equal to submission time. + if (isNOP() && MSubmitTime) + return MSubmitTime; + if (!MHostEvent) { if (MEvent) { auto EndTime = diff --git a/sycl/source/detail/event_impl.hpp b/sycl/source/detail/event_impl.hpp index ed5b9ef07ef91..4a7467691127c 100644 --- a/sycl/source/detail/event_impl.hpp +++ b/sycl/source/detail/event_impl.hpp @@ -223,6 +223,17 @@ class event_impl { MSubmittedQueue = SubmittedQueue; }; + /// Associate event with provided queue. + /// + /// @return + void associateWithQueue(const QueueImplPtr &Queue); + + /// Indicates if this event is not associated with any command and doesn't + /// have native handle. + /// + /// @return true if no associated command and no event handle. + bool isNOP() { return !MCommand && !getHandleRef(); } + /// Calling this function queries the current device timestamp and sets it as /// submission time for the command associated with this event. void setSubmissionTime(); @@ -316,8 +327,8 @@ class event_impl { std::unique_ptr MHostProfilingInfo; void *MCommand = nullptr; std::weak_ptr MQueue; - const bool MIsProfilingEnabled = false; - const bool MFallbackProfiling = false; + bool MIsProfilingEnabled = false; + bool MFallbackProfiling = false; std::weak_ptr MWorkerQueue; std::weak_ptr MSubmittedQueue; diff --git a/sycl/source/detail/helpers.cpp b/sycl/source/detail/helpers.cpp index 222b06127207d..1bdb2ddbd4697 100644 --- a/sycl/source/detail/helpers.cpp +++ b/sycl/source/detail/helpers.cpp @@ -31,8 +31,9 @@ getOrWaitEvents(std::vector DepEvents, ContextImplPtr Context) { // throwaway events created with empty constructor will not have a context // (which is set lazily) calling getContextImpl() would set that // context, which we wish to avoid as it is expensive. - if (!SyclEventImplPtr->isContextInitialized() && - !SyclEventImplPtr->is_host()) { + if ((!SyclEventImplPtr->isContextInitialized() && + !SyclEventImplPtr->is_host()) || + SyclEventImplPtr->isNOP()) { continue; } // The fusion command and its event are associated with a non-host context, diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index bc8c26ca88cb0..f819b35053258 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -290,8 +290,10 @@ areEventsSafeForSchedulerBypass(const std::vector &DepEvents, // Events that don't have an initialized context are throwaway events that // don't represent actual dependencies. Calling getContextImpl() would set // their context, which we wish to avoid as it is expensive. - if (!SyclEventImplPtr->isContextInitialized() && - !SyclEventImplPtr->is_host()) { + // NOP events also don't represent actual dependencies. + if ((!SyclEventImplPtr->isContextInitialized() && + !SyclEventImplPtr->is_host()) || + SyclEventImplPtr->isNOP()) { return true; } if (SyclEventImplPtr->is_host()) { diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index b113da757bd0c..8777b82db1f6b 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -263,8 +263,9 @@ std::vector Command::getPiEventsBlocking( // Throwaway events created with empty constructor will not have a context // (which is set lazily) calling getContextImpl() would set that // context, which we wish to avoid as it is expensive. - // Skip host task also. - if (!EventImpl->isContextInitialized() || EventImpl->is_host()) + // Skip host task and NOP events also. + if (!EventImpl->isContextInitialized() || EventImpl->is_host() || + EventImpl->isNOP()) continue; // In this path nullptr native event means that the command has not been // enqueued. It may happen if async enqueue in a host task is involved. diff --git a/sycl/source/queue.cpp b/sycl/source/queue.cpp index 3fee25b7236a2..acaecf2696629 100644 --- a/sycl/source/queue.cpp +++ b/sycl/source/queue.cpp @@ -205,6 +205,34 @@ void queue::wait_and_throw_proxy(const detail::code_location &CodeLoc) { impl->wait_and_throw(CodeLoc); } +static event +getBarrierEventForInorderQueueHelper(const detail::QueueImplPtr QueueImpl) { + // The last command recorded in the graph is not tracked by the queue but by + // the graph itself. We must therefore search for the last node/event in the + // graph. + if (auto Graph = QueueImpl->getCommandGraph()) { + auto LastEvent = + Graph->getEventForNode(Graph->getLastInorderNode(QueueImpl)); + return sycl::detail::createSyclObjFromImpl(LastEvent); + } + auto LastEvent = QueueImpl->getLastEvent(); + if (QueueImpl->MDiscardEvents) { + std::cout << "Discard event enabled" << std::endl; + return LastEvent; + } + + auto LastEventImpl = detail::getSyclObjImpl(LastEvent); + // If last event is default constructed event then we want to associate it + // with the queue and record submission time if profiling is enabled. Such + // event corresponds to NOP and its submit time is same as start time and + // end time. + if (!LastEventImpl->isContextInitialized()) { + LastEventImpl->associateWithQueue(QueueImpl); + LastEventImpl->setSubmissionTime(); + } + return detail::createSyclObjFromImpl(LastEventImpl); +} + /// Prevents any commands submitted afterward to this queue from executing /// until all commands previously submitted to this queue have entered the /// complete state. @@ -213,16 +241,8 @@ void queue::wait_and_throw_proxy(const detail::code_location &CodeLoc) { /// \return a SYCL event object, which corresponds to the queue the command /// group is being enqueued on. event queue::ext_oneapi_submit_barrier(const detail::code_location &CodeLoc) { - if (is_in_order()) { - // The last command recorded in the graph is not tracked by the queue but by - // the graph itself. We must therefore search for the last node/event in the - // graph. - if (auto Graph = impl->getCommandGraph()) { - auto LastEvent = Graph->getEventForNode(Graph->getLastInorderNode(impl)); - return sycl::detail::createSyclObjFromImpl(LastEvent); - } - return impl->getLastEvent(); - } + if (is_in_order()) + return getBarrierEventForInorderQueueHelper(impl); return submit([=](handler &CGH) { CGH.ext_oneapi_barrier(); }, CodeLoc); } @@ -238,20 +258,13 @@ event queue::ext_oneapi_submit_barrier(const detail::code_location &CodeLoc) { /// group is being enqueued on. event queue::ext_oneapi_submit_barrier(const std::vector &WaitList, const detail::code_location &CodeLoc) { - bool AllEventsEmpty = std::all_of( + bool AllEventsEmptyOrNop = std::all_of( begin(WaitList), end(WaitList), [&](const event &Event) -> bool { - return !detail::getSyclObjImpl(Event)->isContextInitialized(); + return !detail::getSyclObjImpl(Event)->isContextInitialized() || + detail::getSyclObjImpl(Event)->isNOP(); }); - if (is_in_order() && AllEventsEmpty) { - // The last command recorded in the graph is not tracked by the queue but by - // the graph itself. We must therefore search for the last node/event in the - // graph. - if (auto Graph = impl->getCommandGraph()) { - auto LastEvent = Graph->getEventForNode(Graph->getLastInorderNode(impl)); - return sycl::detail::createSyclObjFromImpl(LastEvent); - } - return impl->getLastEvent(); - } + if (is_in_order() && AllEventsEmptyOrNop) + return getBarrierEventForInorderQueueHelper(impl); return submit([=](handler &CGH) { CGH.ext_oneapi_barrier(WaitList); }, CodeLoc); diff --git a/sycl/test-e2e/Regression/nop_event_profiling.cpp b/sycl/test-e2e/Regression/nop_event_profiling.cpp new file mode 100644 index 0000000000000..38f2004da8ef4 --- /dev/null +++ b/sycl/test-e2e/Regression/nop_event_profiling.cpp @@ -0,0 +1,36 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out + +// Test to check that it is possible to get profiling info from the event +// returned by barrier which turns into NOP. + +#include + +int main() { + sycl::event start; + sycl::event stop; + sycl::queue q{sycl::property_list(sycl::property::queue::in_order(), + sycl::property::queue::enable_profiling())}; + float elapsed = 0; + + start = q.ext_oneapi_submit_barrier(); + std::cout << "before parallel_for" << std::endl; + q.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, 16) * sycl::range<3>(1, 1, 16), + sycl::range<3>(1, 1, 16)), + [=](sycl::nd_item<3> item_ct1) { + double d = 123; + for (int i = 0; i < 10000; i++) { + d = d * i; + } + }); + std::cout << "after parallel_for" << std::endl; + stop = q.ext_oneapi_submit_barrier(); + stop.wait_and_throw(); + elapsed = + (stop.get_profiling_info() - + start.get_profiling_info()) / + 1000000.0f; + std::cout << "elapsed:" << elapsed << std::endl; + return 0; +}