From 52a111796c8cc235b6a5744f34001133b36bf4d0 Mon Sep 17 00:00:00 2001
From: Maxime France-Pillois
Date: Tue, 26 Sep 2023 14:19:55 +0100
Subject: [PATCH] [SYCL][Graph] Fix bug when host-task is submitted to
 in-order queue (#322)

* [SYCL][Graph] Fix bug when host-task is submitted to in-order queue

When a host-task is submitted to an in-order queue, the dependency
between the host-task and its successor is handled explicitly. However,
when an in-order queue is recorded, the recorded CGs are not part of
the regular in-order queue execution sequence; inter-CG dependencies
are instead managed by the graph implementation. This PR implements
this behaviour and ensures that recording an in-order queue does not
impact the normal execution sequence.

Tests (e2e and unit tests) have been added to check this.

* Adds and renames tests (e2e and unit tests)
---
 sycl/source/detail/queue_impl.cpp             |  30 +
 sycl/source/detail/queue_impl.hpp             |   6 +-
 ...r_queue_with_host_managed_dependencies.cpp |  65 ++++
 ..._with_host_managed_dependencies_memcpy.cpp |  63 ++++
 ..._with_host_managed_dependencies_memset.cpp |  61 ++++
 sycl/unittests/Extensions/CommandGraph.cpp    | 280 ++++++++++++++++++
 6 files changed, 499 insertions(+), 6 deletions(-)
 create mode 100644 sycl/test-e2e/Graph/RecordReplay/in_order_queue_with_host_managed_dependencies.cpp
 create mode 100644 sycl/test-e2e/Graph/RecordReplay/in_order_queue_with_host_managed_dependencies_memcpy.cpp
 create mode 100644 sycl/test-e2e/Graph/RecordReplay/in_order_queue_with_host_managed_dependencies_memset.cpp

diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp
index 4f3e1fabe7dfd..dfca3c32e5f0b 100644
--- a/sycl/source/detail/queue_impl.cpp
+++ b/sycl/source/detail/queue_impl.cpp
@@ -106,7 +106,11 @@ event queue_impl::memset(const std::shared_ptr<detail::queue_impl> &Self,
   if (MContext->is_host())
     return MDiscardEvents ? createDiscardedEvent() : event();
 
-  if (isInOrder()) {
+  // When a queue is recorded by a graph, the dependencies are managed in the
+  // graph implementation. Additionally, CGs recorded for a graph are outside
+  // of the in-order queue execution sequence. Therefore, these CGs must not
+  // update MLastEvent.
+  if (isInOrder() && (getCommandGraph() == nullptr)) {
     MLastEvent = ResEvent;
     // We don't create a command group for usm commands, so set it to None.
     // This variable is used to perform explicit dependency management when
@@ -198,7 +202,11 @@ event queue_impl::memcpy(const std::shared_ptr<detail::queue_impl> &Self,
   if (MContext->is_host())
     return MDiscardEvents ? createDiscardedEvent() : event();
 
-  if (isInOrder()) {
+  // When a queue is recorded by a graph, the dependencies are managed in the
+  // graph implementation. Additionally, CGs recorded for a graph are outside
+  // of the in-order queue execution sequence. Therefore, these CGs must not
+  // update MLastEvent.
+  if (isInOrder() && (getCommandGraph() == nullptr)) {
     MLastEvent = ResEvent;
     // We don't create a command group for usm commands, so set it to None.
     // This variable is used to perform explicit dependency management when
@@ -241,7 +249,11 @@ event queue_impl::mem_advise(const std::shared_ptr<detail::queue_impl> &Self,
   if (MContext->is_host())
     return MDiscardEvents ? createDiscardedEvent() : event();
 
-  if (isInOrder()) {
+  // When a queue is recorded by a graph, the dependencies are managed in the
+  // graph implementation. Additionally, CGs recorded for a graph are outside
+  // of the in-order queue execution sequence. Therefore, these CGs must not
+  // update MLastEvent.
+  if (isInOrder() && (getCommandGraph() == nullptr)) {
     MLastEvent = ResEvent;
     // We don't create a command group for usm commands, so set it to None.
     // This variable is used to perform explicit dependency management when
@@ -286,7 +298,11 @@ event queue_impl::memcpyToDeviceGlobal(
   if (MContext->is_host())
     return MDiscardEvents ? createDiscardedEvent() : event();
 
-  if (isInOrder()) {
+  // When a queue is recorded by a graph, the dependencies are managed in the
+  // graph implementation. Additionally, CGs recorded for a graph are outside
+  // of the in-order queue execution sequence. Therefore, these CGs must not
+  // update MLastEvent.
+  if (isInOrder() && (getCommandGraph() == nullptr)) {
     MLastEvent = ResEvent;
     // We don't create a command group for usm commands, so set it to None.
     // This variable is used to perform explicit dependency management when
@@ -331,7 +347,11 @@ event queue_impl::memcpyFromDeviceGlobal(
   if (MContext->is_host())
     return MDiscardEvents ? createDiscardedEvent() : event();
 
-  if (isInOrder()) {
+  // When a queue is recorded by a graph, the dependencies are managed in the
+  // graph implementation. Additionally, CGs recorded for a graph are outside
+  // of the in-order queue execution sequence. Therefore, these CGs must not
+  // update MLastEvent.
+  if (isInOrder() && (getCommandGraph() == nullptr)) {
     MLastEvent = ResEvent;
     // We don't create a command group for usm commands, so set it to None.
     // This variable is used to perform explicit dependency management when
diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp
index 71b4a84d1249c..58858cde5247c 100644
--- a/sycl/source/detail/queue_impl.hpp
+++ b/sycl/source/detail/queue_impl.hpp
@@ -698,7 +698,11 @@ class queue_impl {
   template <typename HandlerType = handler>
   void finalizeHandler(HandlerType &Handler, const CG::CGTYPE &Type,
                        event &EventRet) {
-    if (MIsInorder) {
+    // When a queue is recorded by a graph, the dependencies are managed in
+    // the graph implementation. Additionally, CGs recorded for a graph are
+    // outside of the in-order queue execution sequence. Therefore, these
+    // CGs must not update MLastEvent.
+    if (MIsInorder && (getCommandGraph() == nullptr)) {
       auto IsExpDepManaged = [](const CG::CGTYPE &Type) {
         return Type == CG::CGTYPE::CodeplayHostTask;
       };
diff --git a/sycl/test-e2e/Graph/RecordReplay/in_order_queue_with_host_managed_dependencies.cpp b/sycl/test-e2e/Graph/RecordReplay/in_order_queue_with_host_managed_dependencies.cpp
new file mode 100644
index 0000000000000..4fd6dac38d2d4
--- /dev/null
+++ b/sycl/test-e2e/Graph/RecordReplay/in_order_queue_with_host_managed_dependencies.cpp
@@ -0,0 +1,65 @@
+// REQUIRES: cuda || level_zero, gpu
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+
+// Tests submitting a host task to an in-order queue before recording
+// commands from it.
+
+#include "../graph_common.hpp"
+
+int main() {
+  using T = int;
+
+  queue Queue{{sycl::ext::intel::property::queue::no_immediate_command_list{},
+               sycl::property::queue::in_order{}}};
+
+  // Skip if the device does not support USM shared allocations.
+  if (!Queue.get_device().has(sycl::aspect::usm_shared_allocations))
+    return 0;
+
+  T *TestData = sycl::malloc_shared<T>(Size, Queue);
+
+  ext::oneapi::experimental::command_graph Graph{Queue.get_context(),
+                                                 Queue.get_device()};
+
+  Queue.submit([&](handler &CGH) {
+    CGH.host_task([=]() {
+      for (size_t i = 0; i < Size; i++) {
+        TestData[i] = static_cast<T>(i);
+      }
+    });
+  });
+
+  Graph.begin_recording(Queue);
+
+  auto GraphEvent = Queue.submit([&](handler &CGH) {
+    CGH.single_task([=]() {
+      for (size_t i = 0; i < Size; i++) {
+        TestData[i] += static_cast<T>(i);
+      }
+    });
+  });
+
+  Graph.end_recording(Queue);
+
+  auto GraphExec = Graph.finalize();
+  Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExec); });
+
+  Queue.submit([&](handler &CGH) {
+    CGH.single_task([=]() {
+      for (size_t i = 0; i < Size; i++) {
+        TestData[i] *= static_cast<T>(i);
+      }
+    });
+  });
+
+  Queue.wait_and_throw();
+
+  for (size_t i = 0; i < Size; i++) {
+    assert(TestData[i] == ((i + i) * i));
+  }
+
+  sycl::free(TestData, Queue);
+
+  return 0;
+}
diff --git a/sycl/test-e2e/Graph/RecordReplay/in_order_queue_with_host_managed_dependencies_memcpy.cpp b/sycl/test-e2e/Graph/RecordReplay/in_order_queue_with_host_managed_dependencies_memcpy.cpp
new file mode 100644
index 0000000000000..6730c79d35aed
--- /dev/null
+++ b/sycl/test-e2e/Graph/RecordReplay/in_order_queue_with_host_managed_dependencies_memcpy.cpp
@@ -0,0 +1,63 @@
+// REQUIRES: cuda || level_zero, gpu
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+
+// Tests submitting a memcpy to an in-order queue before recording
+// commands from it.
+
+#include "../graph_common.hpp"
+
+int main() {
+  using T = int;
+
+  queue Queue{{sycl::ext::intel::property::queue::no_immediate_command_list{},
+               sycl::property::queue::in_order{}}};
+
+  // Skip if the device does not support USM shared allocations.
+  if (!Queue.get_device().has(sycl::aspect::usm_shared_allocations))
+    return 0;
+
+  std::vector<T> TestDataIn(Size);
+  T *TestData = sycl::malloc_shared<T>(Size, Queue);
+  T *TestDataOut = sycl::malloc_shared<T>(Size, Queue);
+
+  ext::oneapi::experimental::command_graph Graph{Queue.get_context(),
+                                                 Queue.get_device()};
+
+  std::memset(TestDataIn.data(), 1, Size * sizeof(T));
+  Queue.memcpy(TestData, TestDataIn.data(), Size * sizeof(T));
+
+  Graph.begin_recording(Queue);
+
+  auto GraphEvent = Queue.submit([&](handler &CGH) {
+    CGH.single_task([=]() {
+      for (size_t i = 0; i < Size; i++) {
+        TestData[i] += static_cast<T>(i);
+      }
+    });
+  });
+
+  Graph.end_recording(Queue);
+
+  auto GraphExec = Graph.finalize();
+  Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExec); });
+
+  Queue.memcpy(TestDataOut, TestData, Size * sizeof(T));
+
+  Queue.wait_and_throw();
+
+  std::vector<T> Reference(Size);
+  std::memset(Reference.data(), 1, Size * sizeof(T));
+  for (size_t i = 0; i < Size; i++) {
+    Reference[i] += i;
+  }
+
+  // Check Outputs
+  for (size_t i = 0; i < Size; i++) {
+    assert(TestDataOut[i] == Reference[i]);
+  }
+
+  sycl::free(TestData, Queue);
+
+  return 0;
+}
diff --git a/sycl/test-e2e/Graph/RecordReplay/in_order_queue_with_host_managed_dependencies_memset.cpp b/sycl/test-e2e/Graph/RecordReplay/in_order_queue_with_host_managed_dependencies_memset.cpp
new file mode 100644
index 0000000000000..42c6d12af8e42
--- /dev/null
+++ b/sycl/test-e2e/Graph/RecordReplay/in_order_queue_with_host_managed_dependencies_memset.cpp
@@ -0,0 +1,61 @@
+// REQUIRES: cuda || level_zero, gpu
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+
+// Tests submitting a memset to an in-order queue before recording
+// commands from it.
+
+#include "../graph_common.hpp"
+
+int main() {
+  using T = int;
+
+  queue Queue{{sycl::ext::intel::property::queue::no_immediate_command_list{},
+               sycl::property::queue::in_order{}}};
+
+  // Skip if the device does not support USM shared allocations.
+  if (!Queue.get_device().has(sycl::aspect::usm_shared_allocations))
+    return 0;
+
+  T *TestData = sycl::malloc_shared<T>(Size, Queue);
+  T *TestDataOut = sycl::malloc_shared<T>(Size, Queue);
+
+  ext::oneapi::experimental::command_graph Graph{Queue.get_context(),
+                                                 Queue.get_device()};
+
+  Queue.memset(TestData, 1, Size * sizeof(T));
+
+  Graph.begin_recording(Queue);
+
+  auto GraphEvent = Queue.submit([&](handler &CGH) {
+    CGH.single_task([=]() {
+      for (size_t i = 0; i < Size; i++) {
+        TestData[i] += static_cast<T>(i);
+      }
+    });
+  });
+
+  Graph.end_recording(Queue);
+
+  auto GraphExec = Graph.finalize();
+  Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExec); });
+
+  Queue.memcpy(TestDataOut, TestData, Size * sizeof(T));
+
+  Queue.wait_and_throw();
+
+  std::vector<T> Reference(Size);
+  std::memset(Reference.data(), 1, Size * sizeof(T));
+  for (size_t i = 0; i < Size; i++) {
+    Reference[i] += i;
+  }
+
+  // Check Outputs
+  for (size_t i = 0; i < Size; i++) {
+    assert(TestDataOut[i] == Reference[i]);
+  }
+
+  sycl::free(TestData, Queue);
+
+  return 0;
+}
diff --git a/sycl/unittests/Extensions/CommandGraph.cpp b/sycl/unittests/Extensions/CommandGraph.cpp
index 26d7fb694da9a..9fd2b5db8db11 100644
--- a/sycl/unittests/Extensions/CommandGraph.cpp
+++ b/sycl/unittests/Extensions/CommandGraph.cpp
@@ -1306,6 +1306,286 @@ TEST_F(CommandGraphTest, InOrderQueueWithEmptyLast) {
   ASSERT_EQ(InOrderQueue.get_context(), GraphExecImpl->getContext());
 }
 
+TEST_F(CommandGraphTest, InOrderQueueWithPreviousHostTask) {
+  sycl::property_list Properties{sycl::property::queue::in_order()};
+  sycl::queue InOrderQueue{Dev, Properties};
+  experimental::command_graph<experimental::graph_state::modifiable>
+      InOrderGraph{InOrderQueue.get_context(), InOrderQueue.get_device()};
+
+  auto EventInitial =
+      InOrderQueue.submit([&](handler &CGH) { CGH.host_task([=]() {}); });
+  auto EventInitialImpl = sycl::detail::getSyclObjImpl(EventInitial);
+
+  // Record in-order queue with three nodes
+  InOrderGraph.begin_recording(InOrderQueue);
+  auto Node1Graph = InOrderQueue.submit(
+      [&](sycl::handler &cgh) { cgh.single_task<TestKernel<>>([]() {}); });
+
+  auto PtrNode1 =
+      sycl::detail::getSyclObjImpl(InOrderGraph)
+          ->getLastInorderNode(sycl::detail::getSyclObjImpl(InOrderQueue));
+  ASSERT_NE(PtrNode1, nullptr);
+  ASSERT_TRUE(PtrNode1->MPredecessors.empty());
+
+  auto Node2Graph = InOrderQueue.submit(
+      [&](sycl::handler &cgh) { cgh.single_task<TestKernel<>>([]() {}); });
+
+  auto PtrNode2 =
+      sycl::detail::getSyclObjImpl(InOrderGraph)
+          ->getLastInorderNode(sycl::detail::getSyclObjImpl(InOrderQueue));
+  ASSERT_NE(PtrNode2, nullptr);
+  ASSERT_NE(PtrNode2, PtrNode1);
+  ASSERT_EQ(PtrNode1->MSuccessors.size(), 1lu);
+  ASSERT_EQ(PtrNode1->MSuccessors.front(), PtrNode2);
+  ASSERT_EQ(PtrNode2->MPredecessors.size(), 1lu);
+  ASSERT_EQ(PtrNode2->MPredecessors.front().lock(), PtrNode1);
+
+  auto Node3Graph = InOrderQueue.submit(
+      [&](sycl::handler &cgh) { cgh.single_task<TestKernel<>>([]() {}); });
+
+  auto PtrNode3 =
+      sycl::detail::getSyclObjImpl(InOrderGraph)
+          ->getLastInorderNode(sycl::detail::getSyclObjImpl(InOrderQueue));
+  ASSERT_NE(PtrNode3, nullptr);
+  ASSERT_NE(PtrNode3, PtrNode2);
+  ASSERT_EQ(PtrNode2->MSuccessors.size(), 1lu);
+  ASSERT_EQ(PtrNode2->MSuccessors.front(), PtrNode3);
+  ASSERT_EQ(PtrNode3->MPredecessors.size(), 1lu);
+  ASSERT_EQ(PtrNode3->MPredecessors.front().lock(), PtrNode2);
+
+  InOrderGraph.end_recording(InOrderQueue);
+
+  auto EventLast = InOrderQueue.submit(
+      [&](sycl::handler &cgh) { cgh.single_task<TestKernel<>>([]() {}); });
+
+  auto EventLastImpl = sycl::detail::getSyclObjImpl(EventLast);
+  auto WaitList = EventLastImpl->getWaitList();
+  // The previous task is a host task, so an explicit dependency is needed to
+  // enforce the execution order.
+  ASSERT_EQ(WaitList.size(), 1lu);
+  ASSERT_EQ(WaitList[0], EventInitialImpl);
+}
+
+TEST_F(CommandGraphTest, InOrderQueueHostTaskAndGraph) {
+  sycl::property_list Properties{sycl::property::queue::in_order()};
+  sycl::queue InOrderQueue{Dev, Properties};
+  experimental::command_graph<experimental::graph_state::modifiable>
+      InOrderGraph{InOrderQueue.get_context(), InOrderQueue.get_device()};
+
+  auto EventInitial =
+      InOrderQueue.submit([&](handler &CGH) { CGH.host_task([=]() {}); });
+  auto EventInitialImpl = sycl::detail::getSyclObjImpl(EventInitial);
+
+  // Record in-order queue with three nodes
+  InOrderGraph.begin_recording(InOrderQueue);
+  auto Node1Graph = InOrderQueue.submit(
+      [&](sycl::handler &cgh) { cgh.single_task<TestKernel<>>([]() {}); });
+
+  auto PtrNode1 =
+      sycl::detail::getSyclObjImpl(InOrderGraph)
+          ->getLastInorderNode(sycl::detail::getSyclObjImpl(InOrderQueue));
+  ASSERT_NE(PtrNode1, nullptr);
+  ASSERT_TRUE(PtrNode1->MPredecessors.empty());
+
+  auto Node2Graph = InOrderQueue.submit(
+      [&](sycl::handler &cgh) { cgh.single_task<TestKernel<>>([]() {}); });
+
+  auto PtrNode2 =
+      sycl::detail::getSyclObjImpl(InOrderGraph)
+          ->getLastInorderNode(sycl::detail::getSyclObjImpl(InOrderQueue));
+  ASSERT_NE(PtrNode2, nullptr);
+  ASSERT_NE(PtrNode2, PtrNode1);
+  ASSERT_EQ(PtrNode1->MSuccessors.size(), 1lu);
+  ASSERT_EQ(PtrNode1->MSuccessors.front(), PtrNode2);
+  ASSERT_EQ(PtrNode2->MPredecessors.size(), 1lu);
+  ASSERT_EQ(PtrNode2->MPredecessors.front().lock(), PtrNode1);
+
+  auto Node3Graph = InOrderQueue.submit(
+      [&](sycl::handler &cgh) { cgh.single_task<TestKernel<>>([]() {}); });
+
+  auto PtrNode3 =
+      sycl::detail::getSyclObjImpl(InOrderGraph)
+          ->getLastInorderNode(sycl::detail::getSyclObjImpl(InOrderQueue));
+  ASSERT_NE(PtrNode3, nullptr);
+  ASSERT_NE(PtrNode3, PtrNode2);
+  ASSERT_EQ(PtrNode2->MSuccessors.size(), 1lu);
+  ASSERT_EQ(PtrNode2->MSuccessors.front(), PtrNode3);
+  ASSERT_EQ(PtrNode3->MPredecessors.size(), 1lu);
+  ASSERT_EQ(PtrNode3->MPredecessors.front().lock(), PtrNode2);
+
+  InOrderGraph.end_recording(InOrderQueue);
+
+  auto InOrderGraphExec = InOrderGraph.finalize();
+  auto EventGraph = InOrderQueue.submit(
+      [&](sycl::handler &CGH) { CGH.ext_oneapi_graph(InOrderGraphExec); });
+
+  auto EventGraphImpl = sycl::detail::getSyclObjImpl(EventGraph);
+  auto EventGraphWaitList = EventGraphImpl->getWaitList();
+  // The previous task is a host task, so an explicit dependency is needed to
+  // enforce the execution order.
+  ASSERT_EQ(EventGraphWaitList.size(), 1lu);
+  ASSERT_EQ(EventGraphWaitList[0], EventInitialImpl);
+
+  auto EventLast = InOrderQueue.submit(
+      [&](sycl::handler &cgh) { cgh.single_task<TestKernel<>>([]() {}); });
+  auto EventLastImpl = sycl::detail::getSyclObjImpl(EventLast);
+  auto EventLastWaitList = EventLastImpl->getWaitList();
+  // The previous task is not a host task; in-order queue dependencies are
+  // managed by the backend for non-host-task commands.
+  ASSERT_EQ(EventLastWaitList.size(), 0lu);
+}
+
+TEST_F(CommandGraphTest, InOrderQueueMemsetAndGraph) {
+  sycl::property_list Properties{sycl::property::queue::in_order()};
+  sycl::queue InOrderQueue{Dev, Properties};
+  experimental::command_graph<experimental::graph_state::modifiable>
+      InOrderGraph{InOrderQueue.get_context(), InOrderQueue.get_device()};
+
+  // Skip if the device does not support USM shared allocations.
+  if (!InOrderQueue.get_device().has(sycl::aspect::usm_shared_allocations))
+    return;
+  size_t Size = 128;
+  std::vector<int> TestDataHost(Size);
+  int *TestData = sycl::malloc_shared<int>(Size, InOrderQueue);
+
+  auto EventInitial = InOrderQueue.memset(TestData, 1, Size * sizeof(int));
+  auto EventInitialImpl = sycl::detail::getSyclObjImpl(EventInitial);
+
+  // Record in-order queue with three nodes
+  InOrderGraph.begin_recording(InOrderQueue);
+  auto Node1Graph = InOrderQueue.submit(
+      [&](sycl::handler &cgh) { cgh.single_task<TestKernel<>>([]() {}); });
+
+  auto PtrNode1 =
+      sycl::detail::getSyclObjImpl(InOrderGraph)
+          ->getLastInorderNode(sycl::detail::getSyclObjImpl(InOrderQueue));
+  ASSERT_NE(PtrNode1, nullptr);
+  ASSERT_TRUE(PtrNode1->MPredecessors.empty());
+
+  auto Node2Graph = InOrderQueue.submit(
+      [&](sycl::handler &cgh) { cgh.single_task<TestKernel<>>([]() {}); });
+
+  auto PtrNode2 =
+      sycl::detail::getSyclObjImpl(InOrderGraph)
+          ->getLastInorderNode(sycl::detail::getSyclObjImpl(InOrderQueue));
+  ASSERT_NE(PtrNode2, nullptr);
+  ASSERT_NE(PtrNode2, PtrNode1);
+  ASSERT_EQ(PtrNode1->MSuccessors.size(), 1lu);
+  ASSERT_EQ(PtrNode1->MSuccessors.front(), PtrNode2);
+  ASSERT_EQ(PtrNode2->MPredecessors.size(), 1lu);
+  ASSERT_EQ(PtrNode2->MPredecessors.front().lock(), PtrNode1);
+
+  auto Node3Graph = InOrderQueue.submit(
+      [&](sycl::handler &cgh) { cgh.single_task<TestKernel<>>([]() {}); });
+
+  auto PtrNode3 =
+      sycl::detail::getSyclObjImpl(InOrderGraph)
+          ->getLastInorderNode(sycl::detail::getSyclObjImpl(InOrderQueue));
+  ASSERT_NE(PtrNode3, nullptr);
+  ASSERT_NE(PtrNode3, PtrNode2);
+  ASSERT_EQ(PtrNode2->MSuccessors.size(), 1lu);
+  ASSERT_EQ(PtrNode2->MSuccessors.front(), PtrNode3);
+  ASSERT_EQ(PtrNode3->MPredecessors.size(), 1lu);
+  ASSERT_EQ(PtrNode3->MPredecessors.front().lock(), PtrNode2);
+
+  InOrderGraph.end_recording(InOrderQueue);
+
+  auto InOrderGraphExec = InOrderGraph.finalize();
+  auto EventGraph = InOrderQueue.submit(
+      [&](sycl::handler &CGH) { CGH.ext_oneapi_graph(InOrderGraphExec); });
+
+  auto EventGraphImpl = sycl::detail::getSyclObjImpl(EventGraph);
+  auto EventGraphWaitList = EventGraphImpl->getWaitList();
+  // The previous command is a USM memset, which is tracked via the queue's
+  // last event, so the graph submission must explicitly depend on it.
+  ASSERT_EQ(EventGraphWaitList.size(), 1lu);
+  ASSERT_EQ(EventGraphWaitList[0], EventInitialImpl);
+
+  auto EventLast =
+      InOrderQueue.memcpy(TestData, TestDataHost.data(), Size * sizeof(int));
+  auto EventLastImpl = sycl::detail::getSyclObjImpl(EventLast);
+  auto EventLastWaitList = EventLastImpl->getWaitList();
+  // The previous task is not a host task; in-order queue dependencies are
+  // managed by the backend for non-host-task commands.
+  ASSERT_EQ(EventLastWaitList.size(), 0lu);
+}
+
+TEST_F(CommandGraphTest, InOrderQueueMemcpyAndGraph) {
+  sycl::property_list Properties{sycl::property::queue::in_order()};
+  sycl::queue InOrderQueue{Dev, Properties};
+  experimental::command_graph<experimental::graph_state::modifiable>
+      InOrderGraph{InOrderQueue.get_context(), InOrderQueue.get_device()};
+
+  // Skip if the device does not support USM shared allocations.
+  if (!InOrderQueue.get_device().has(sycl::aspect::usm_shared_allocations))
+    return;
+  size_t Size = 128;
+  std::vector<int> TestDataHost(Size);
+  int *TestData = sycl::malloc_shared<int>(Size, InOrderQueue);
+
+  auto EventInitial =
+      InOrderQueue.memcpy(TestData, TestDataHost.data(), Size * sizeof(int));
+  auto EventInitialImpl = sycl::detail::getSyclObjImpl(EventInitial);
+
+  // Record in-order queue with three nodes
+  InOrderGraph.begin_recording(InOrderQueue);
+  auto Node1Graph = InOrderQueue.submit(
+      [&](sycl::handler &cgh) { cgh.single_task<TestKernel<>>([]() {}); });
+
+  auto PtrNode1 =
+      sycl::detail::getSyclObjImpl(InOrderGraph)
+          ->getLastInorderNode(sycl::detail::getSyclObjImpl(InOrderQueue));
+  ASSERT_NE(PtrNode1, nullptr);
+  ASSERT_TRUE(PtrNode1->MPredecessors.empty());
+
+  auto Node2Graph = InOrderQueue.submit(
+      [&](sycl::handler &cgh) { cgh.single_task<TestKernel<>>([]() {}); });
+
+  auto PtrNode2 =
+      sycl::detail::getSyclObjImpl(InOrderGraph)
+          ->getLastInorderNode(sycl::detail::getSyclObjImpl(InOrderQueue));
+  ASSERT_NE(PtrNode2, nullptr);
+  ASSERT_NE(PtrNode2, PtrNode1);
+  ASSERT_EQ(PtrNode1->MSuccessors.size(), 1lu);
+  ASSERT_EQ(PtrNode1->MSuccessors.front(), PtrNode2);
+  ASSERT_EQ(PtrNode2->MPredecessors.size(), 1lu);
+  ASSERT_EQ(PtrNode2->MPredecessors.front().lock(), PtrNode1);
+
+  auto Node3Graph = InOrderQueue.submit(
+      [&](sycl::handler &cgh) { cgh.single_task<TestKernel<>>([]() {}); });
+
+  auto PtrNode3 =
+      sycl::detail::getSyclObjImpl(InOrderGraph)
+          ->getLastInorderNode(sycl::detail::getSyclObjImpl(InOrderQueue));
+  ASSERT_NE(PtrNode3, nullptr);
+  ASSERT_NE(PtrNode3, PtrNode2);
+  ASSERT_EQ(PtrNode2->MSuccessors.size(), 1lu);
+  ASSERT_EQ(PtrNode2->MSuccessors.front(), PtrNode3);
+  ASSERT_EQ(PtrNode3->MPredecessors.size(), 1lu);
+  ASSERT_EQ(PtrNode3->MPredecessors.front().lock(), PtrNode2);
+
+  InOrderGraph.end_recording(InOrderQueue);
+
+  auto InOrderGraphExec = InOrderGraph.finalize();
+  auto EventGraph = InOrderQueue.submit(
+      [&](sycl::handler &CGH) { CGH.ext_oneapi_graph(InOrderGraphExec); });
+
+  auto EventGraphImpl = sycl::detail::getSyclObjImpl(EventGraph);
+  auto EventGraphWaitList = EventGraphImpl->getWaitList();
+  // The previous command is a USM memcpy, which is tracked via the queue's
+  // last event, so the graph submission must explicitly depend on it.
+  ASSERT_EQ(EventGraphWaitList.size(), 1lu);
+  ASSERT_EQ(EventGraphWaitList[0], EventInitialImpl);
+
+  auto EventLast =
+      InOrderQueue.memcpy(TestData, TestDataHost.data(), Size * sizeof(int));
+  auto EventLastImpl = sycl::detail::getSyclObjImpl(EventLast);
+  auto EventLastWaitList = EventLastImpl->getWaitList();
+  // The previous task is not a host task; in-order queue dependencies are
+  // managed by the backend for non-host-task commands.
+  ASSERT_EQ(EventLastWaitList.size(), 0lu);
+}
+
 TEST_F(CommandGraphTest, ExplicitBarrierException) {
   std::error_code ExceptionCode = make_error_code(sycl::errc::success);
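
For reference, the scenario this patch addresses reduces to the following standalone sketch. It is illustrative only and not part of the patch: it assumes a DPC++ build that implements the experimental sycl_ext_oneapi_graph extension, and the size N is arbitrary. Without the getCommandGraph() check introduced above, the command group recorded into the graph would update the in-order queue's MLastEvent bookkeeping, so later submissions could be ordered against the recorded command group instead of the pending host-task.

// Minimal sketch (illustrative only): a host-task on an in-order queue,
// followed by graph recording on the same queue.
#include <sycl/sycl.hpp>
#include <cassert>

namespace exp_ext = sycl::ext::oneapi::experimental;

int main() {
  sycl::queue Queue{{sycl::property::queue::in_order{}}};

  if (!Queue.get_device().has(sycl::aspect::usm_shared_allocations))
    return 0;

  constexpr size_t N = 1024; // Arbitrary size for illustration.
  int *Data = sycl::malloc_shared<int>(N, Queue);

  // Host-task submitted before recording: the in-order queue tracks it
  // explicitly via its last-event bookkeeping.
  Queue.submit([&](sycl::handler &CGH) {
    CGH.host_task([=]() {
      for (size_t i = 0; i < N; i++)
        Data[i] = static_cast<int>(i);
    });
  });

  exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()};

  // Recorded CGs are outside the queue's execution sequence; with this
  // patch they no longer overwrite the queue's last event.
  Graph.begin_recording(Queue);
  Queue.submit([&](sycl::handler &CGH) {
    CGH.single_task([=]() {
      for (size_t i = 0; i < N; i++)
        Data[i] += static_cast<int>(i);
    });
  });
  Graph.end_recording(Queue);

  auto GraphExec = Graph.finalize();
  // The graph submission must still wait for the pending host-task.
  Queue.submit([&](sycl::handler &CGH) { CGH.ext_oneapi_graph(GraphExec); });
  Queue.wait_and_throw();

  for (size_t i = 0; i < N; i++)
    assert(Data[i] == static_cast<int>(2 * i));

  sycl::free(Data, Queue);
  return 0;
}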