diff --git a/sycl/doc/design/CommandGraph.md b/sycl/doc/design/CommandGraph.md
index 739a0b3333acd..5932a7df7ec89 100644
--- a/sycl/doc/design/CommandGraph.md
+++ b/sycl/doc/design/CommandGraph.md
@@ -166,7 +166,12 @@ created on UR command-buffer enqueue.
There is also a *WaitEvent* used by the `ur_exp_command_buffer_handle_t` class
in the prefix to wait on any dependencies passed in the enqueue wait-list.
-![L0 command-buffer diagram](images/L0_UR_command-buffer.svg)
+If a command-buffer is about to be submitted to a queue with the profiling
+property enabled, a profiling *StartEvent* is added to the end of the prefix.
+This event is used to get the command-buffer execution start time on the
+device.
+
+![L0 command-buffer diagram](images/L0_UR_command-buffer-v2.jpg)
For a call to `urCommandBufferEnqueueExp` with an `event_list` *EL*,
command-buffer *CB*, and return event *RE* our implementation has to submit two
diff --git a/sycl/doc/design/images/L0_UR_command-buffer-v2.jpg b/sycl/doc/design/images/L0_UR_command-buffer-v2.jpg
new file mode 100644
index 0000000000000..c3b608006e499
Binary files /dev/null and b/sycl/doc/design/images/L0_UR_command-buffer-v2.jpg differ
diff --git a/sycl/doc/design/images/L0_UR_command-buffer.svg b/sycl/doc/design/images/L0_UR_command-buffer.svg
deleted file mode 100644
index 133fa17b9d711..0000000000000
--- a/sycl/doc/design/images/L0_UR_command-buffer.svg
+++ /dev/null
@@ -1,17 +0,0 @@
-
\ No newline at end of file
diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/command_buffer.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/command_buffer.cpp
index 9c55f67f3d6dc..f19a0f2400f61 100644
--- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/command_buffer.cpp
+++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/command_buffer.cpp
@@ -816,6 +816,25 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp(
ZE2UR_CALL(zeCommandListAppendBarrier,
(SignalCommandList->first, RetEvent->ZeEvent, 1,
&(CommandBuffer->SignalEvent->ZeEvent)));
+
+ if ((Queue->Properties & UR_QUEUE_FLAG_PROFILING_ENABLE)) {
+ // We create an additional signal specific to the current execution of the
+ // CommandBuffer. This signal is needed for profiling the execution time
+ // of the CommandBuffer. It waits for the WaitEvent to be signaled
+ // which indicates the start of the CommandBuffer actual execution.
+ // This event is embedded into the Event return to the user to allow
+ // the profiling engine to retrieve it.
+ ur_event_handle_t StartEvent{};
+ UR_CALL(createEventAndAssociateQueue(
+ Queue, &StartEvent, UR_COMMAND_COMMAND_BUFFER_ENQUEUE_EXP,
+ WaitCommandList, false));
+
+ ZE2UR_CALL(zeCommandListAppendBarrier,
+ (WaitCommandList->first, StartEvent->ZeEvent, 1,
+ &(CommandBuffer->WaitEvent->ZeEvent)));
+
+ RetEvent->CommandData = StartEvent;
+ }
}
// Execution our command-lists asynchronously
diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/event.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/event.cpp
index 411e0fce17370..1e6c83dfea436 100644
--- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/event.cpp
+++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/event.cpp
@@ -11,6 +11,7 @@
#include
#include
+#include "command_buffer.hpp"
#include "common.hpp"
#include "event.hpp"
#include "ur_level_zero.hpp"
@@ -446,6 +447,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventGetProfilingInfo(
///< bytes returned in propValue
) {
std::shared_lock EventLock(Event->Mutex);
+
+ // A Command-buffer consists of three command-lists.
+ // The start time should therefore be taken from an event associated
+ // to the first command-list.
+ if ((Event->CommandType == UR_COMMAND_COMMAND_BUFFER_ENQUEUE_EXP) &&
+ (PropName == UR_PROFILING_INFO_COMMAND_START) && (Event->CommandData)) {
+ auto StartEvent = static_cast(Event->CommandData);
+ return urEventGetProfilingInfo(StartEvent, UR_PROFILING_INFO_COMMAND_END,
+ PropValueSize, PropValue, PropValueSizeRet);
+ }
+
if (Event->UrQueue &&
(Event->UrQueue->Properties & UR_QUEUE_FLAG_PROFILING_ENABLE) == 0) {
return UR_RESULT_ERROR_PROFILING_INFO_NOT_AVAILABLE;
@@ -755,6 +767,13 @@ ur_result_t urEventReleaseInternal(ur_event_handle_t Event) {
return Res;
Event->CommandData = nullptr;
}
+ if (Event->CommandType == UR_COMMAND_COMMAND_BUFFER_ENQUEUE_EXP &&
+ Event->CommandData) {
+ // Free the memory extra event allocated for profiling purposed.
+ auto AssociateEvent = static_cast(Event->CommandData);
+ urEventRelease(AssociateEvent);
+ Event->CommandData = nullptr;
+ }
if (Event->OwnNativeHandle) {
if (DisableEventsCaching) {
auto ZeResult = ZE_CALL_NOCHECK(zeEventDestroy, (Event->ZeEvent));
diff --git a/sycl/source/detail/event_impl.cpp b/sycl/source/detail/event_impl.cpp
index 24edd3d6e9a1e..a82b0faf22a1a 100644
--- a/sycl/source/detail/event_impl.cpp
+++ b/sycl/source/detail/event_impl.cpp
@@ -155,8 +155,8 @@ event_impl::event_impl(sycl::detail::pi::PiEvent Event,
}
event_impl::event_impl(const QueueImplPtr &Queue)
- : MQueue{Queue},
- MIsProfilingEnabled{Queue->is_host() || Queue->MIsProfilingEnabled},
+ : MQueue{Queue}, MIsProfilingEnabled{Queue->is_host() ||
+ Queue->MIsProfilingEnabled},
MFallbackProfiling{MIsProfilingEnabled && Queue->isProfilingFallback()} {
this->setContextImpl(Queue->getContextImplPtr());
if (Queue->is_host()) {
@@ -278,17 +278,33 @@ void event_impl::checkProfilingPreconditions() const {
"Profiling information is unavailable as the queue associated with "
"the event does not have the 'enable_profiling' property.");
}
- if (MEventFromSubmitedExecCommandBuffer) {
- throw sycl::exception(make_error_code(sycl::errc::invalid),
- "Profiling information is unavailable for events "
- "returned by a graph submission.");
- }
}
template <>
uint64_t
event_impl::get_profiling_info() {
checkProfilingPreconditions();
+ // The delay between the submission and the actual start of a CommandBuffer
+ // can be short. Consequently, the submission time, which is based on
+ // an estimated clock and not on the real device clock, may be ahead of time
+ // than the start time, which is based on the actual device clock.
+ // MSubmitTime is set in a critical performance path.
+ // Force reading the device clock when setting MSubmitTime may deteriorate
+ // the performance.
+ // Since submit time is an estimated time, we implement this little hack
+ // that allows all profiled time to be meaningful.
+ // (Note that the observed time deviation between the estimated clock and
+ // the real device clock is typically less than 0.5ms. The approximation we
+ // make by forcing the re-sync of submit time to start time is less than
+ // 0.5ms. These timing values were obainted empirically using an integrated
+ // Intel GPU).
+ if (MEventFromSubmittedExecCommandBuffer && !MHostEvent && MEvent) {
+ uint64_t StartTime =
+ get_event_profiling_info(
+ this->getHandleRef(), this->getPlugin());
+ if (StartTime < MSubmitTime)
+ MSubmitTime = StartTime;
+ }
return MSubmitTime;
}
diff --git a/sycl/source/detail/event_impl.hpp b/sycl/source/detail/event_impl.hpp
index 067218f5a8459..493af8f87cc6c 100644
--- a/sycl/source/detail/event_impl.hpp
+++ b/sycl/source/detail/event_impl.hpp
@@ -282,12 +282,12 @@ class event_impl {
return MGraph.lock();
}
- void setEventFromSubmitedExecCommandBuffer(bool value) {
- MEventFromSubmitedExecCommandBuffer = value;
+ void setEventFromSubmittedExecCommandBuffer(bool value) {
+ MEventFromSubmittedExecCommandBuffer = value;
}
- bool isEventFromSubmitedExecCommandBuffer() const {
- return MEventFromSubmitedExecCommandBuffer;
+ bool isEventFromSubmittedExecCommandBuffer() const {
+ return MEventFromSubmittedExecCommandBuffer;
}
protected:
@@ -341,7 +341,7 @@ class event_impl {
/// This event is also be stored in the graph so a weak_ptr is used.
std::weak_ptr MGraph;
/// Indicates that the event results from a command graph submission
- bool MEventFromSubmitedExecCommandBuffer = false;
+ bool MEventFromSubmittedExecCommandBuffer = false;
// If this event represents a submission to a
// sycl::detail::pi::PiExtCommandBuffer the sync point for that submission is
diff --git a/sycl/source/detail/graph_impl.cpp b/sycl/source/detail/graph_impl.cpp
index 61b7895f6abd5..4a63d9879e878 100644
--- a/sycl/source/detail/graph_impl.cpp
+++ b/sycl/source/detail/graph_impl.cpp
@@ -615,7 +615,6 @@ exec_graph_impl::enqueue(const std::shared_ptr &Queue,
auto NewEvent = std::make_shared(Queue);
NewEvent->setContextImpl(Queue->getContextImplPtr());
NewEvent->setStateIncomplete();
- NewEvent->setEventFromSubmitedExecCommandBuffer(true);
return NewEvent;
});
@@ -667,6 +666,7 @@ exec_graph_impl::enqueue(const std::shared_ptr &Queue,
NewEvent = sycl::detail::Scheduler::getInstance().addCG(
std::move(CommandGroup), Queue);
}
+ NewEvent->setEventFromSubmittedExecCommandBuffer(true);
} else {
std::vector> ScheduledEvents;
for (auto &NodeImpl : MSchedule) {
diff --git a/sycl/test-e2e/Graph/event_profiling_info.cpp b/sycl/test-e2e/Graph/event_profiling_info.cpp
new file mode 100644
index 0000000000000..b84fc64251e82
--- /dev/null
+++ b/sycl/test-e2e/Graph/event_profiling_info.cpp
@@ -0,0 +1,190 @@
+// REQUIRES: cuda || level_zero, gpu
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out 2>&1
+// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %}
+//
+// CHECK-NOT: LEAK
+
+// This test checks the profiling of an event returned
+// from graph submission with event::get_profiling_info().
+// It first tests a graph made exclusively of memory operations,
+// then tests a graph made of kernels.
+// The second run is to check that there are no leaks reported with the embedded
+// ZE_DEBUG=4 testing capability.
+
+#include "./graph_common.hpp"
+
+#define GRAPH_TESTS_VERBOSE_PRINT 0
+
+#if GRAPH_TESTS_VERBOSE_PRINT
+#include
+#endif
+
+bool verifyProfiling(event Event) {
+ auto Submit =
+ Event.get_profiling_info();
+ auto Start =
+ Event.get_profiling_info();
+ auto End =
+ Event.get_profiling_info();
+
+#if GRAPH_TESTS_VERBOSE_PRINT
+ std::cout << "Submit = " << Submit << std::endl;
+ std::cout << "Start = " << Start << std::endl;
+ std::cout << "End = " << End << " ( " << (End - Start) << " ) "
+ << " => full ( " << (End - Submit) << " ) " << std::endl;
+#endif
+
+ assert((Submit && Start && End) && "Profiling information failed.");
+ assert(Submit <= Start);
+ assert(Start < End);
+
+ bool Pass = sycl::info::event_command_status::complete ==
+ Event.get_info();
+
+ return Pass;
+}
+
+bool compareProfiling(event Event1, event Event2) {
+ assert(Event1 != Event2);
+
+ auto SubmitEvent1 =
+ Event1.get_profiling_info();
+ auto StartEvent1 =
+ Event1.get_profiling_info();
+ auto EndEvent1 =
+ Event1.get_profiling_info();
+ assert((SubmitEvent1 && StartEvent1 && EndEvent1) &&
+ "Profiling information failed.");
+
+ auto SubmitEvent2 =
+ Event2.get_profiling_info();
+ auto StartEvent2 =
+ Event2.get_profiling_info();
+ auto EndEvent2 =
+ Event2.get_profiling_info();
+ assert((SubmitEvent2 && StartEvent2 && EndEvent2) &&
+ "Profiling information failed.");
+
+ assert(SubmitEvent1 != SubmitEvent2);
+ assert(StartEvent1 != StartEvent2);
+ assert(EndEvent1 != EndEvent2);
+
+ bool Pass1 = sycl::info::event_command_status::complete ==
+ Event1.get_info();
+ bool Pass2 = sycl::info::event_command_status::complete ==
+ Event2.get_info();
+
+ return (Pass1 && Pass2);
+}
+
+// The test checks that get_profiling_info waits for command asccociated with
+// event to complete execution.
+int main() {
+ device Dev;
+ queue Queue{Dev, sycl::property::queue::enable_profiling()};
+
+ const size_t Size = 1000000;
+ int Data[Size] = {0};
+ for (size_t I = 0; I < Size; ++I) {
+ Data[I] = I;
+ }
+ int Values[Size] = {0};
+
+ buffer BufferFrom(Data, range<1>(Size));
+ buffer BufferTo(Values, range<1>(Size));
+
+ buffer BufferA(Data, range<1>(Size));
+ buffer BufferB(Values, range<1>(Size));
+ buffer BufferC(Values, range<1>(Size));
+
+ BufferFrom.set_write_back(false);
+ BufferTo.set_write_back(false);
+ BufferA.set_write_back(false);
+ BufferB.set_write_back(false);
+ BufferC.set_write_back(false);
+ { // buffer copy
+ exp_ext::command_graph CopyGraph{
+ Queue.get_context(),
+ Queue.get_device(),
+ {exp_ext::property::graph::assume_buffer_outlives_graph{}}};
+ CopyGraph.begin_recording(Queue);
+
+ Queue.submit([&](sycl::handler &Cgh) {
+ accessor AccessorFrom(
+ BufferFrom, Cgh, range<1>(Size));
+ accessor AccessorTo(
+ BufferTo, Cgh, range<1>(Size));
+ Cgh.copy(AccessorFrom, AccessorTo);
+ });
+
+ CopyGraph.end_recording(Queue);
+
+ // kernel launch
+ exp_ext::command_graph KernelGraph{
+ Queue.get_context(),
+ Queue.get_device(),
+ {exp_ext::property::graph::assume_buffer_outlives_graph{}}};
+ KernelGraph.begin_recording(Queue);
+
+ run_kernels(Queue, Size, BufferA, BufferB, BufferC);
+
+ KernelGraph.end_recording(Queue);
+
+ auto CopyGraphExec = CopyGraph.finalize();
+ auto KernelGraphExec = KernelGraph.finalize();
+
+ event CopyEvent, KernelEvent1, KernelEvent2;
+ // Run graphs
+#if GRAPH_TESTS_VERBOSE_PRINT
+ auto StartCopyGraph = std::chrono::high_resolution_clock::now();
+#endif
+ CopyEvent = Queue.submit(
+ [&](handler &CGH) { CGH.ext_oneapi_graph(CopyGraphExec); });
+ Queue.wait_and_throw();
+#if GRAPH_TESTS_VERBOSE_PRINT
+ auto EndCopyGraph = std::chrono::high_resolution_clock::now();
+ auto StartKernelSubmit1 = std::chrono::high_resolution_clock::now();
+#endif
+ KernelEvent1 = Queue.submit(
+ [&](handler &CGH) { CGH.ext_oneapi_graph(KernelGraphExec); });
+ Queue.wait_and_throw();
+#if GRAPH_TESTS_VERBOSE_PRINT
+ auto endKernelSubmit1 = std::chrono::high_resolution_clock::now();
+ auto StartKernelSubmit2 = std::chrono::high_resolution_clock::now();
+#endif
+ KernelEvent2 = Queue.submit(
+ [&](handler &CGH) { CGH.ext_oneapi_graph(KernelGraphExec); });
+ Queue.wait_and_throw();
+#if GRAPH_TESTS_VERBOSE_PRINT
+ auto endKernelSubmit2 = std::chrono::high_resolution_clock::now();
+
+ double DelayCopy = std::chrono::duration_cast(
+ EndCopyGraph - StartCopyGraph)
+ .count();
+ std::cout << "Copy Graph delay (in ns) : " << DelayCopy << std::endl;
+ double DelayKernel1 = std::chrono::duration_cast(
+ endKernelSubmit1 - StartKernelSubmit1)
+ .count();
+ std::cout << "Kernel 1st Execution delay (in ns) : " << DelayKernel1
+ << std::endl;
+ double DelayKernel2 = std::chrono::duration_cast(
+ endKernelSubmit2 - StartKernelSubmit2)
+ .count();
+ std::cout << "Kernel 2nd Execution delay (in ns) : " << DelayKernel2
+ << std::endl;
+#endif
+
+ // Checks profiling times
+ assert(verifyProfiling(CopyEvent) && verifyProfiling(KernelEvent1) &&
+ verifyProfiling(KernelEvent2) &&
+ compareProfiling(KernelEvent1, KernelEvent2));
+ }
+
+ host_accessor HostData(BufferTo);
+ for (size_t I = 0; I < Size; ++I) {
+ assert(HostData[I] == Values[I]);
+ }
+
+ return 0;
+}
diff --git a/sycl/unittests/Extensions/CommandGraph.cpp b/sycl/unittests/Extensions/CommandGraph.cpp
index 9fd2b5db8db11..433f90c3b2dc0 100644
--- a/sycl/unittests/Extensions/CommandGraph.cpp
+++ b/sycl/unittests/Extensions/CommandGraph.cpp
@@ -1965,76 +1965,6 @@ TEST_F(CommandGraphTest, BindlessExceptionCheck) {
sycl::free(ImgMemUSM, Ctxt);
}
-TEST_F(CommandGraphTest, GetProfilingInfoExceptionCheck) {
- sycl::context Ctx{Dev};
- sycl::queue QueueProfile{
- Ctx, Dev, sycl::property_list{sycl::property::queue::enable_profiling{}}};
- experimental::command_graph
- GraphProfile{QueueProfile.get_context(), Dev};
-
- GraphProfile.begin_recording(QueueProfile);
- auto Event = QueueProfile.submit(
- [&](sycl::handler &cgh) { cgh.single_task>([]() {}); });
-
- // Checks that exception is thrown when get_profile_info is called on "event"
- // returned by a queue in recording mode.
- std::error_code ExceptionCode = make_error_code(sycl::errc::success);
- try {
- Event.get_profiling_info();
- } catch (exception &Exception) {
- ExceptionCode = Exception.code();
- }
- ASSERT_EQ(ExceptionCode, sycl::errc::invalid);
-
- ExceptionCode = make_error_code(sycl::errc::success);
- try {
- Event.get_profiling_info();
- } catch (exception &Exception) {
- ExceptionCode = Exception.code();
- }
- ASSERT_EQ(ExceptionCode, sycl::errc::invalid);
-
- ExceptionCode = make_error_code(sycl::errc::success);
- try {
- Event.get_profiling_info();
- } catch (exception &Exception) {
- ExceptionCode = Exception.code();
- }
- ASSERT_EQ(ExceptionCode, sycl::errc::invalid);
-
- GraphProfile.end_recording();
-
- auto GraphExec = GraphProfile.finalize();
- auto EventSub = QueueProfile.submit(
- [&](sycl::handler &CGH) { CGH.ext_oneapi_graph(GraphExec); });
-
- // Checks that exception is thrown when get_profile_info is called on "event"
- // returned by a graph submission.
- ExceptionCode = make_error_code(sycl::errc::success);
- try {
- EventSub.get_profiling_info();
- } catch (exception &Exception) {
- ExceptionCode = Exception.code();
- }
- ASSERT_EQ(ExceptionCode, sycl::errc::invalid);
-
- ExceptionCode = make_error_code(sycl::errc::success);
- try {
- EventSub.get_profiling_info();
- } catch (exception &Exception) {
- ExceptionCode = Exception.code();
- }
- ASSERT_EQ(ExceptionCode, sycl::errc::invalid);
-
- ExceptionCode = make_error_code(sycl::errc::success);
- try {
- EventSub.get_profiling_info();
- } catch (exception &Exception) {
- ExceptionCode = Exception.code();
- }
- ASSERT_EQ(ExceptionCode, sycl::errc::invalid);
-}
-
TEST_F(CommandGraphTest, MakeEdgeErrors) {
// Set up some nodes in the graph
auto NodeA = Graph.add(