diff --git a/sycl/doc/design/CommandGraph.md b/sycl/doc/design/CommandGraph.md index 739a0b3333acd..5932a7df7ec89 100644 --- a/sycl/doc/design/CommandGraph.md +++ b/sycl/doc/design/CommandGraph.md @@ -166,7 +166,12 @@ created on UR command-buffer enqueue. There is also a *WaitEvent* used by the `ur_exp_command_buffer_handle_t` class in the prefix to wait on any dependencies passed in the enqueue wait-list. -![L0 command-buffer diagram](images/L0_UR_command-buffer.svg) +If a command-buffer is about to be submitted to a queue with the profiling +property enabled, a profiling *StartEvent* is added to the end of the prefix. +This event is used to get the command-buffer execution start time on the +device. + +![L0 command-buffer diagram](images/L0_UR_command-buffer-v2.jpg) For a call to `urCommandBufferEnqueueExp` with an `event_list` *EL*, command-buffer *CB*, and return event *RE* our implementation has to submit two diff --git a/sycl/doc/design/images/L0_UR_command-buffer-v2.jpg b/sycl/doc/design/images/L0_UR_command-buffer-v2.jpg new file mode 100644 index 0000000000000..c3b608006e499 Binary files /dev/null and b/sycl/doc/design/images/L0_UR_command-buffer-v2.jpg differ diff --git a/sycl/doc/design/images/L0_UR_command-buffer.svg b/sycl/doc/design/images/L0_UR_command-buffer.svg deleted file mode 100644 index 133fa17b9d711..0000000000000 --- a/sycl/doc/design/images/L0_UR_command-buffer.svg +++ /dev/null @@ -1,17 +0,0 @@ - - - - - - - - PrefixSuffixCommands added to UR command-buffer by userPrefix commandsReset SignalEventBarrier waiting on WaitEventSuffix commandsSignal the UR command-buffer SignalEvent \ No newline at end of file diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/command_buffer.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/command_buffer.cpp index 9c55f67f3d6dc..f19a0f2400f61 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/command_buffer.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/command_buffer.cpp @@ -816,6 +816,25 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp( ZE2UR_CALL(zeCommandListAppendBarrier, (SignalCommandList->first, RetEvent->ZeEvent, 1, &(CommandBuffer->SignalEvent->ZeEvent))); + + if ((Queue->Properties & UR_QUEUE_FLAG_PROFILING_ENABLE)) { + // We create an additional signal specific to the current execution of the + // CommandBuffer. This signal is needed for profiling the execution time + // of the CommandBuffer. It waits for the WaitEvent to be signaled + // which indicates the start of the CommandBuffer actual execution. + // This event is embedded into the Event return to the user to allow + // the profiling engine to retrieve it. + ur_event_handle_t StartEvent{}; + UR_CALL(createEventAndAssociateQueue( + Queue, &StartEvent, UR_COMMAND_COMMAND_BUFFER_ENQUEUE_EXP, + WaitCommandList, false)); + + ZE2UR_CALL(zeCommandListAppendBarrier, + (WaitCommandList->first, StartEvent->ZeEvent, 1, + &(CommandBuffer->WaitEvent->ZeEvent))); + + RetEvent->CommandData = StartEvent; + } } // Execution our command-lists asynchronously diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/event.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/event.cpp index 411e0fce17370..1e6c83dfea436 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/event.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/event.cpp @@ -11,6 +11,7 @@ #include #include +#include "command_buffer.hpp" #include "common.hpp" #include "event.hpp" #include "ur_level_zero.hpp" @@ -446,6 +447,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventGetProfilingInfo( ///< bytes returned in propValue ) { std::shared_lock EventLock(Event->Mutex); + + // A Command-buffer consists of three command-lists. + // The start time should therefore be taken from an event associated + // to the first command-list. + if ((Event->CommandType == UR_COMMAND_COMMAND_BUFFER_ENQUEUE_EXP) && + (PropName == UR_PROFILING_INFO_COMMAND_START) && (Event->CommandData)) { + auto StartEvent = static_cast(Event->CommandData); + return urEventGetProfilingInfo(StartEvent, UR_PROFILING_INFO_COMMAND_END, + PropValueSize, PropValue, PropValueSizeRet); + } + if (Event->UrQueue && (Event->UrQueue->Properties & UR_QUEUE_FLAG_PROFILING_ENABLE) == 0) { return UR_RESULT_ERROR_PROFILING_INFO_NOT_AVAILABLE; @@ -755,6 +767,13 @@ ur_result_t urEventReleaseInternal(ur_event_handle_t Event) { return Res; Event->CommandData = nullptr; } + if (Event->CommandType == UR_COMMAND_COMMAND_BUFFER_ENQUEUE_EXP && + Event->CommandData) { + // Free the memory extra event allocated for profiling purposed. + auto AssociateEvent = static_cast(Event->CommandData); + urEventRelease(AssociateEvent); + Event->CommandData = nullptr; + } if (Event->OwnNativeHandle) { if (DisableEventsCaching) { auto ZeResult = ZE_CALL_NOCHECK(zeEventDestroy, (Event->ZeEvent)); diff --git a/sycl/source/detail/event_impl.cpp b/sycl/source/detail/event_impl.cpp index 24edd3d6e9a1e..a82b0faf22a1a 100644 --- a/sycl/source/detail/event_impl.cpp +++ b/sycl/source/detail/event_impl.cpp @@ -155,8 +155,8 @@ event_impl::event_impl(sycl::detail::pi::PiEvent Event, } event_impl::event_impl(const QueueImplPtr &Queue) - : MQueue{Queue}, - MIsProfilingEnabled{Queue->is_host() || Queue->MIsProfilingEnabled}, + : MQueue{Queue}, MIsProfilingEnabled{Queue->is_host() || + Queue->MIsProfilingEnabled}, MFallbackProfiling{MIsProfilingEnabled && Queue->isProfilingFallback()} { this->setContextImpl(Queue->getContextImplPtr()); if (Queue->is_host()) { @@ -278,17 +278,33 @@ void event_impl::checkProfilingPreconditions() const { "Profiling information is unavailable as the queue associated with " "the event does not have the 'enable_profiling' property."); } - if (MEventFromSubmitedExecCommandBuffer) { - throw sycl::exception(make_error_code(sycl::errc::invalid), - "Profiling information is unavailable for events " - "returned by a graph submission."); - } } template <> uint64_t event_impl::get_profiling_info() { checkProfilingPreconditions(); + // The delay between the submission and the actual start of a CommandBuffer + // can be short. Consequently, the submission time, which is based on + // an estimated clock and not on the real device clock, may be ahead of time + // than the start time, which is based on the actual device clock. + // MSubmitTime is set in a critical performance path. + // Force reading the device clock when setting MSubmitTime may deteriorate + // the performance. + // Since submit time is an estimated time, we implement this little hack + // that allows all profiled time to be meaningful. + // (Note that the observed time deviation between the estimated clock and + // the real device clock is typically less than 0.5ms. The approximation we + // make by forcing the re-sync of submit time to start time is less than + // 0.5ms. These timing values were obainted empirically using an integrated + // Intel GPU). + if (MEventFromSubmittedExecCommandBuffer && !MHostEvent && MEvent) { + uint64_t StartTime = + get_event_profiling_info( + this->getHandleRef(), this->getPlugin()); + if (StartTime < MSubmitTime) + MSubmitTime = StartTime; + } return MSubmitTime; } diff --git a/sycl/source/detail/event_impl.hpp b/sycl/source/detail/event_impl.hpp index 067218f5a8459..493af8f87cc6c 100644 --- a/sycl/source/detail/event_impl.hpp +++ b/sycl/source/detail/event_impl.hpp @@ -282,12 +282,12 @@ class event_impl { return MGraph.lock(); } - void setEventFromSubmitedExecCommandBuffer(bool value) { - MEventFromSubmitedExecCommandBuffer = value; + void setEventFromSubmittedExecCommandBuffer(bool value) { + MEventFromSubmittedExecCommandBuffer = value; } - bool isEventFromSubmitedExecCommandBuffer() const { - return MEventFromSubmitedExecCommandBuffer; + bool isEventFromSubmittedExecCommandBuffer() const { + return MEventFromSubmittedExecCommandBuffer; } protected: @@ -341,7 +341,7 @@ class event_impl { /// This event is also be stored in the graph so a weak_ptr is used. std::weak_ptr MGraph; /// Indicates that the event results from a command graph submission - bool MEventFromSubmitedExecCommandBuffer = false; + bool MEventFromSubmittedExecCommandBuffer = false; // If this event represents a submission to a // sycl::detail::pi::PiExtCommandBuffer the sync point for that submission is diff --git a/sycl/source/detail/graph_impl.cpp b/sycl/source/detail/graph_impl.cpp index 61b7895f6abd5..4a63d9879e878 100644 --- a/sycl/source/detail/graph_impl.cpp +++ b/sycl/source/detail/graph_impl.cpp @@ -615,7 +615,6 @@ exec_graph_impl::enqueue(const std::shared_ptr &Queue, auto NewEvent = std::make_shared(Queue); NewEvent->setContextImpl(Queue->getContextImplPtr()); NewEvent->setStateIncomplete(); - NewEvent->setEventFromSubmitedExecCommandBuffer(true); return NewEvent; }); @@ -667,6 +666,7 @@ exec_graph_impl::enqueue(const std::shared_ptr &Queue, NewEvent = sycl::detail::Scheduler::getInstance().addCG( std::move(CommandGroup), Queue); } + NewEvent->setEventFromSubmittedExecCommandBuffer(true); } else { std::vector> ScheduledEvents; for (auto &NodeImpl : MSchedule) { diff --git a/sycl/test-e2e/Graph/event_profiling_info.cpp b/sycl/test-e2e/Graph/event_profiling_info.cpp new file mode 100644 index 0000000000000..b84fc64251e82 --- /dev/null +++ b/sycl/test-e2e/Graph/event_profiling_info.cpp @@ -0,0 +1,190 @@ +// REQUIRES: cuda || level_zero, gpu +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out 2>&1 +// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} +// +// CHECK-NOT: LEAK + +// This test checks the profiling of an event returned +// from graph submission with event::get_profiling_info(). +// It first tests a graph made exclusively of memory operations, +// then tests a graph made of kernels. +// The second run is to check that there are no leaks reported with the embedded +// ZE_DEBUG=4 testing capability. + +#include "./graph_common.hpp" + +#define GRAPH_TESTS_VERBOSE_PRINT 0 + +#if GRAPH_TESTS_VERBOSE_PRINT +#include +#endif + +bool verifyProfiling(event Event) { + auto Submit = + Event.get_profiling_info(); + auto Start = + Event.get_profiling_info(); + auto End = + Event.get_profiling_info(); + +#if GRAPH_TESTS_VERBOSE_PRINT + std::cout << "Submit = " << Submit << std::endl; + std::cout << "Start = " << Start << std::endl; + std::cout << "End = " << End << " ( " << (End - Start) << " ) " + << " => full ( " << (End - Submit) << " ) " << std::endl; +#endif + + assert((Submit && Start && End) && "Profiling information failed."); + assert(Submit <= Start); + assert(Start < End); + + bool Pass = sycl::info::event_command_status::complete == + Event.get_info(); + + return Pass; +} + +bool compareProfiling(event Event1, event Event2) { + assert(Event1 != Event2); + + auto SubmitEvent1 = + Event1.get_profiling_info(); + auto StartEvent1 = + Event1.get_profiling_info(); + auto EndEvent1 = + Event1.get_profiling_info(); + assert((SubmitEvent1 && StartEvent1 && EndEvent1) && + "Profiling information failed."); + + auto SubmitEvent2 = + Event2.get_profiling_info(); + auto StartEvent2 = + Event2.get_profiling_info(); + auto EndEvent2 = + Event2.get_profiling_info(); + assert((SubmitEvent2 && StartEvent2 && EndEvent2) && + "Profiling information failed."); + + assert(SubmitEvent1 != SubmitEvent2); + assert(StartEvent1 != StartEvent2); + assert(EndEvent1 != EndEvent2); + + bool Pass1 = sycl::info::event_command_status::complete == + Event1.get_info(); + bool Pass2 = sycl::info::event_command_status::complete == + Event2.get_info(); + + return (Pass1 && Pass2); +} + +// The test checks that get_profiling_info waits for command asccociated with +// event to complete execution. +int main() { + device Dev; + queue Queue{Dev, sycl::property::queue::enable_profiling()}; + + const size_t Size = 1000000; + int Data[Size] = {0}; + for (size_t I = 0; I < Size; ++I) { + Data[I] = I; + } + int Values[Size] = {0}; + + buffer BufferFrom(Data, range<1>(Size)); + buffer BufferTo(Values, range<1>(Size)); + + buffer BufferA(Data, range<1>(Size)); + buffer BufferB(Values, range<1>(Size)); + buffer BufferC(Values, range<1>(Size)); + + BufferFrom.set_write_back(false); + BufferTo.set_write_back(false); + BufferA.set_write_back(false); + BufferB.set_write_back(false); + BufferC.set_write_back(false); + { // buffer copy + exp_ext::command_graph CopyGraph{ + Queue.get_context(), + Queue.get_device(), + {exp_ext::property::graph::assume_buffer_outlives_graph{}}}; + CopyGraph.begin_recording(Queue); + + Queue.submit([&](sycl::handler &Cgh) { + accessor AccessorFrom( + BufferFrom, Cgh, range<1>(Size)); + accessor AccessorTo( + BufferTo, Cgh, range<1>(Size)); + Cgh.copy(AccessorFrom, AccessorTo); + }); + + CopyGraph.end_recording(Queue); + + // kernel launch + exp_ext::command_graph KernelGraph{ + Queue.get_context(), + Queue.get_device(), + {exp_ext::property::graph::assume_buffer_outlives_graph{}}}; + KernelGraph.begin_recording(Queue); + + run_kernels(Queue, Size, BufferA, BufferB, BufferC); + + KernelGraph.end_recording(Queue); + + auto CopyGraphExec = CopyGraph.finalize(); + auto KernelGraphExec = KernelGraph.finalize(); + + event CopyEvent, KernelEvent1, KernelEvent2; + // Run graphs +#if GRAPH_TESTS_VERBOSE_PRINT + auto StartCopyGraph = std::chrono::high_resolution_clock::now(); +#endif + CopyEvent = Queue.submit( + [&](handler &CGH) { CGH.ext_oneapi_graph(CopyGraphExec); }); + Queue.wait_and_throw(); +#if GRAPH_TESTS_VERBOSE_PRINT + auto EndCopyGraph = std::chrono::high_resolution_clock::now(); + auto StartKernelSubmit1 = std::chrono::high_resolution_clock::now(); +#endif + KernelEvent1 = Queue.submit( + [&](handler &CGH) { CGH.ext_oneapi_graph(KernelGraphExec); }); + Queue.wait_and_throw(); +#if GRAPH_TESTS_VERBOSE_PRINT + auto endKernelSubmit1 = std::chrono::high_resolution_clock::now(); + auto StartKernelSubmit2 = std::chrono::high_resolution_clock::now(); +#endif + KernelEvent2 = Queue.submit( + [&](handler &CGH) { CGH.ext_oneapi_graph(KernelGraphExec); }); + Queue.wait_and_throw(); +#if GRAPH_TESTS_VERBOSE_PRINT + auto endKernelSubmit2 = std::chrono::high_resolution_clock::now(); + + double DelayCopy = std::chrono::duration_cast( + EndCopyGraph - StartCopyGraph) + .count(); + std::cout << "Copy Graph delay (in ns) : " << DelayCopy << std::endl; + double DelayKernel1 = std::chrono::duration_cast( + endKernelSubmit1 - StartKernelSubmit1) + .count(); + std::cout << "Kernel 1st Execution delay (in ns) : " << DelayKernel1 + << std::endl; + double DelayKernel2 = std::chrono::duration_cast( + endKernelSubmit2 - StartKernelSubmit2) + .count(); + std::cout << "Kernel 2nd Execution delay (in ns) : " << DelayKernel2 + << std::endl; +#endif + + // Checks profiling times + assert(verifyProfiling(CopyEvent) && verifyProfiling(KernelEvent1) && + verifyProfiling(KernelEvent2) && + compareProfiling(KernelEvent1, KernelEvent2)); + } + + host_accessor HostData(BufferTo); + for (size_t I = 0; I < Size; ++I) { + assert(HostData[I] == Values[I]); + } + + return 0; +} diff --git a/sycl/unittests/Extensions/CommandGraph.cpp b/sycl/unittests/Extensions/CommandGraph.cpp index 9fd2b5db8db11..433f90c3b2dc0 100644 --- a/sycl/unittests/Extensions/CommandGraph.cpp +++ b/sycl/unittests/Extensions/CommandGraph.cpp @@ -1965,76 +1965,6 @@ TEST_F(CommandGraphTest, BindlessExceptionCheck) { sycl::free(ImgMemUSM, Ctxt); } -TEST_F(CommandGraphTest, GetProfilingInfoExceptionCheck) { - sycl::context Ctx{Dev}; - sycl::queue QueueProfile{ - Ctx, Dev, sycl::property_list{sycl::property::queue::enable_profiling{}}}; - experimental::command_graph - GraphProfile{QueueProfile.get_context(), Dev}; - - GraphProfile.begin_recording(QueueProfile); - auto Event = QueueProfile.submit( - [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); - - // Checks that exception is thrown when get_profile_info is called on "event" - // returned by a queue in recording mode. - std::error_code ExceptionCode = make_error_code(sycl::errc::success); - try { - Event.get_profiling_info(); - } catch (exception &Exception) { - ExceptionCode = Exception.code(); - } - ASSERT_EQ(ExceptionCode, sycl::errc::invalid); - - ExceptionCode = make_error_code(sycl::errc::success); - try { - Event.get_profiling_info(); - } catch (exception &Exception) { - ExceptionCode = Exception.code(); - } - ASSERT_EQ(ExceptionCode, sycl::errc::invalid); - - ExceptionCode = make_error_code(sycl::errc::success); - try { - Event.get_profiling_info(); - } catch (exception &Exception) { - ExceptionCode = Exception.code(); - } - ASSERT_EQ(ExceptionCode, sycl::errc::invalid); - - GraphProfile.end_recording(); - - auto GraphExec = GraphProfile.finalize(); - auto EventSub = QueueProfile.submit( - [&](sycl::handler &CGH) { CGH.ext_oneapi_graph(GraphExec); }); - - // Checks that exception is thrown when get_profile_info is called on "event" - // returned by a graph submission. - ExceptionCode = make_error_code(sycl::errc::success); - try { - EventSub.get_profiling_info(); - } catch (exception &Exception) { - ExceptionCode = Exception.code(); - } - ASSERT_EQ(ExceptionCode, sycl::errc::invalid); - - ExceptionCode = make_error_code(sycl::errc::success); - try { - EventSub.get_profiling_info(); - } catch (exception &Exception) { - ExceptionCode = Exception.code(); - } - ASSERT_EQ(ExceptionCode, sycl::errc::invalid); - - ExceptionCode = make_error_code(sycl::errc::success); - try { - EventSub.get_profiling_info(); - } catch (exception &Exception) { - ExceptionCode = Exception.code(); - } - ASSERT_EQ(ExceptionCode, sycl::errc::invalid); -} - TEST_F(CommandGraphTest, MakeEdgeErrors) { // Set up some nodes in the graph auto NodeA = Graph.add(