From 52a111796c8cc235b6a5744f34001133b36bf4d0 Mon Sep 17 00:00:00 2001
From: Maxime France-Pillois
Date: Tue, 26 Sep 2023 14:19:55 +0100
Subject: [PATCH] [SYCL][Graph] Fix bug when host-task is submitted to
 in-order queue (#322)

* [SYCL][Graph] Fix bug when host-task is submitted to in-order queue

When a host-task is submitted to an in-order queue, the dependency
between the host-task and its successor is handled explicitly. However,
when an in-order queue is recorded, the recorded CGs are not part of
the regular in-order queue execution sequence; inter-CG dependencies
are instead managed by the graph implementation. This PR implements
this behaviour and ensures that recording an in-order queue does not
impact the normal execution sequence.

Tests (e2e and unit tests) have been added to check this.

* Adds and renames tests (e2e and unit tests)
---
 sycl/source/detail/queue_impl.cpp             |  30 +
 sycl/source/detail/queue_impl.hpp             |   6 +-
 ...r_queue_with_host_managed_dependencies.cpp |  65 ++++
 ..._with_host_managed_dependencies_memcpy.cpp |  63 ++++
 ..._with_host_managed_dependencies_memset.cpp |  61 ++++
 sycl/unittests/Extensions/CommandGraph.cpp    | 280 ++++++++++++++++++
 6 files changed, 499 insertions(+), 6 deletions(-)
 create mode 100644 sycl/test-e2e/Graph/RecordReplay/in_order_queue_with_host_managed_dependencies.cpp
 create mode 100644 sycl/test-e2e/Graph/RecordReplay/in_order_queue_with_host_managed_dependencies_memcpy.cpp
 create mode 100644 sycl/test-e2e/Graph/RecordReplay/in_order_queue_with_host_managed_dependencies_memset.cpp

diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp
index 4f3e1fabe7dfd..dfca3c32e5f0b 100644
--- a/sycl/source/detail/queue_impl.cpp
+++ b/sycl/source/detail/queue_impl.cpp
@@ -106,7 +106,11 @@ event queue_impl::memset(const std::shared_ptr<detail::queue_impl> &Self,
   if (MContext->is_host())
     return MDiscardEvents ? createDiscardedEvent() : event();
 
-  if (isInOrder()) {
+  // When a queue is recorded by a graph, the dependencies are managed in the
+  // graph implementation. Additionally, CGs recorded for a graph are outside
+  // of the in-order queue execution sequence. Therefore, these CGs must not
+  // update MLastEvent.
+  if (isInOrder() && (getCommandGraph() == nullptr)) {
     MLastEvent = ResEvent;
     // We don't create a command group for usm commands, so set it to None.
     // This variable is used to perform explicit dependency management when
@@ -198,7 +202,11 @@ event queue_impl::memcpy(const std::shared_ptr<detail::queue_impl> &Self,
   if (MContext->is_host())
     return MDiscardEvents ? createDiscardedEvent() : event();
 
-  if (isInOrder()) {
+  // When a queue is recorded by a graph, the dependencies are managed in the
+  // graph implementation. Additionally, CGs recorded for a graph are outside
+  // of the in-order queue execution sequence. Therefore, these CGs must not
+  // update MLastEvent.
+  if (isInOrder() && (getCommandGraph() == nullptr)) {
     MLastEvent = ResEvent;
     // We don't create a command group for usm commands, so set it to None.
     // This variable is used to perform explicit dependency management when
@@ -241,7 +249,11 @@ event queue_impl::mem_advise(const std::shared_ptr<detail::queue_impl> &Self,
   if (MContext->is_host())
     return MDiscardEvents ? createDiscardedEvent() : event();
 
-  if (isInOrder()) {
+  // When a queue is recorded by a graph, the dependencies are managed in the
+  // graph implementation. Additionally, CGs recorded for a graph are outside
+  // of the in-order queue execution sequence. Therefore, these CGs must not
+  // update MLastEvent.
+  if (isInOrder() && (getCommandGraph() == nullptr)) {
     MLastEvent = ResEvent;
     // We don't create a command group for usm commands, so set it to None.
     // This variable is used to perform explicit dependency management when
@@ -286,7 +298,11 @@ event queue_impl::memcpyToDeviceGlobal(
   if (MContext->is_host())
     return MDiscardEvents ? createDiscardedEvent() : event();
 
-  if (isInOrder()) {
+  // When a queue is recorded by a graph, the dependencies are managed in the
+  // graph implementation. Additionally, CGs recorded for a graph are outside
+  // of the in-order queue execution sequence. Therefore, these CGs must not
+  // update MLastEvent.
+  if (isInOrder() && (getCommandGraph() == nullptr)) {
     MLastEvent = ResEvent;
     // We don't create a command group for usm commands, so set it to None.
     // This variable is used to perform explicit dependency management when
@@ -331,7 +347,11 @@ event queue_impl::memcpyFromDeviceGlobal(
   if (MContext->is_host())
     return MDiscardEvents ? createDiscardedEvent() : event();
 
-  if (isInOrder()) {
+  // When a queue is recorded by a graph, the dependencies are managed in the
+  // graph implementation. Additionally, CGs recorded for a graph are outside
+  // of the in-order queue execution sequence. Therefore, these CGs must not
+  // update MLastEvent.
+  if (isInOrder() && (getCommandGraph() == nullptr)) {
     MLastEvent = ResEvent;
     // We don't create a command group for usm commands, so set it to None.
     // This variable is used to perform explicit dependency management when
diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp
index 71b4a84d1249c..58858cde5247c 100644
--- a/sycl/source/detail/queue_impl.hpp
+++ b/sycl/source/detail/queue_impl.hpp
@@ -698,7 +698,11 @@ class queue_impl {
   template <typename HandlerType = handler>
   void finalizeHandler(HandlerType &Handler, const CG::CGTYPE &Type,
                        event &EventRet) {
-    if (MIsInorder) {
+    // When a queue is recorded by a graph, the dependencies are managed in
+    // the graph implementation. Additionally, CGs recorded for a graph are
+    // outside of the in-order queue execution sequence. Therefore, these
+    // CGs must not update MLastEvent.
+    if (MIsInorder && (getCommandGraph() == nullptr)) {
       auto IsExpDepManaged = [](const CG::CGTYPE &Type) {
         return Type == CG::CGTYPE::CodeplayHostTask;
       };
diff --git a/sycl/test-e2e/Graph/RecordReplay/in_order_queue_with_host_managed_dependencies.cpp b/sycl/test-e2e/Graph/RecordReplay/in_order_queue_with_host_managed_dependencies.cpp
new file mode 100644
index 0000000000000..4fd6dac38d2d4
--- /dev/null
+++ b/sycl/test-e2e/Graph/RecordReplay/in_order_queue_with_host_managed_dependencies.cpp
@@ -0,0 +1,65 @@
+// REQUIRES: cuda || level_zero, gpu
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+
+// Tests submitting a host task to an in-order queue before recording
+// commands from it.
+
+#include "../graph_common.hpp"
+
+int main() {
+  using T = int;
+
+  queue Queue{{sycl::ext::intel::property::queue::no_immediate_command_list{},
+               sycl::property::queue::in_order{}}};
+
+  // Skip if the device does not support USM shared allocations.
+  if (!Queue.get_device().has(sycl::aspect::usm_shared_allocations))
+    return 0;
+
+  T *TestData = sycl::malloc_shared<T>(Size, Queue);
+
+  ext::oneapi::experimental::command_graph Graph{Queue.get_context(),
+                                                 Queue.get_device()};
+
+  Queue.submit([&](handler &CGH) {
+    CGH.host_task([=]() {
+      for (size_t i = 0; i < Size; i++) {
+        TestData[i] = static_cast<T>(i);
+      }
+    });
+  });
+
+  Graph.begin_recording(Queue);
+
+  auto GraphEvent = Queue.submit([&](handler &CGH) {
+    CGH.single_task([=]() {
+      for (size_t i = 0; i < Size; i++) {
+        TestData[i] += static_cast<T>(i);
+      }
+    });
+  });
+
+  Graph.end_recording(Queue);
+
+  auto GraphExec = Graph.finalize();
+  Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExec); });
+
+  Queue.submit([&](handler &CGH) {
+    CGH.single_task([=]() {
+      for (size_t i = 0; i < Size; i++) {
+        TestData[i] *= static_cast<T>(i);
+      }
+    });
+  });
+
+  Queue.wait_and_throw();
+
+  for (size_t i = 0; i < Size; i++) {
+    assert(TestData[i] == ((i + i) * i));
+  }
+
+  sycl::free(TestData, Queue);
+
+  return 0;
+}
diff --git a/sycl/test-e2e/Graph/RecordReplay/in_order_queue_with_host_managed_dependencies_memcpy.cpp b/sycl/test-e2e/Graph/RecordReplay/in_order_queue_with_host_managed_dependencies_memcpy.cpp
new file mode 100644
index 0000000000000..6730c79d35aed
--- /dev/null
+++ b/sycl/test-e2e/Graph/RecordReplay/in_order_queue_with_host_managed_dependencies_memcpy.cpp
@@ -0,0 +1,63 @@
+// REQUIRES: cuda || level_zero, gpu
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+
+// Tests submitting a memcpy to an in-order queue before recording
+// commands from it.
+
+#include "../graph_common.hpp"
+
+int main() {
+  using T = int;
+
+  queue Queue{{sycl::ext::intel::property::queue::no_immediate_command_list{},
+               sycl::property::queue::in_order{}}};
+
+  // Skip if the device does not support USM shared allocations.
+  if (!Queue.get_device().has(sycl::aspect::usm_shared_allocations))
+    return 0;
+
+  std::vector<T> TestDataIn(Size);
+  T *TestData = sycl::malloc_shared<T>(Size, Queue);
+  T *TestDataOut = sycl::malloc_shared<T>(Size, Queue);
+
+  ext::oneapi::experimental::command_graph Graph{Queue.get_context(),
+                                                 Queue.get_device()};
+
+  std::memset(TestDataIn.data(), 1, Size * sizeof(T));
+  Queue.memcpy(TestData, TestDataIn.data(), Size * sizeof(T));
+
+  Graph.begin_recording(Queue);
+
+  auto GraphEvent = Queue.submit([&](handler &CGH) {
+    CGH.single_task([=]() {
+      for (size_t i = 0; i < Size; i++) {
+        TestData[i] += static_cast<T>(i);
+      }
+    });
+  });
+
+  Graph.end_recording(Queue);
+
+  auto GraphExec = Graph.finalize();
+  Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExec); });
+
+  Queue.memcpy(TestDataOut, TestData, Size * sizeof(T));
+
+  Queue.wait_and_throw();
+
+  std::vector<T> Reference(Size);
+  std::memset(Reference.data(), 1, Size * sizeof(T));
+  for (size_t i = 0; i < Size; i++) {
+    Reference[i] += i;
+  }
+
+  // Check Outputs
+  for (size_t i = 0; i < Size; i++) {
+    assert(TestDataOut[i] == Reference[i]);
+  }
+
+  sycl::free(TestData, Queue);
+
+  return 0;
+}
diff --git a/sycl/test-e2e/Graph/RecordReplay/in_order_queue_with_host_managed_dependencies_memset.cpp b/sycl/test-e2e/Graph/RecordReplay/in_order_queue_with_host_managed_dependencies_memset.cpp
new file mode 100644
index 0000000000000..42c6d12af8e42
--- /dev/null
+++ b/sycl/test-e2e/Graph/RecordReplay/in_order_queue_with_host_managed_dependencies_memset.cpp
@@ -0,0 +1,61 @@
+// REQUIRES: cuda || level_zero, gpu
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+
+// Tests submitting a memset to an in-order queue before recording
+// commands from it.
+
+#include "../graph_common.hpp"
+
+int main() {
+  using T = int;
+
+  queue Queue{{sycl::ext::intel::property::queue::no_immediate_command_list{},
+               sycl::property::queue::in_order{}}};
+
+  // Skip if the device does not support USM shared allocations.
+  if (!Queue.get_device().has(sycl::aspect::usm_shared_allocations))
+    return 0;
+
+  T *TestData = sycl::malloc_shared<T>(Size, Queue);
+  T *TestDataOut = sycl::malloc_shared<T>(Size, Queue);
+
+  ext::oneapi::experimental::command_graph Graph{Queue.get_context(),
+                                                 Queue.get_device()};
+
+  Queue.memset(TestData, 1, Size * sizeof(T));
+
+  Graph.begin_recording(Queue);
+
+  auto GraphEvent = Queue.submit([&](handler &CGH) {
+    CGH.single_task([=]() {
+      for (size_t i = 0; i < Size; i++) {
+        TestData[i] += static_cast<T>(i);
+      }
+    });
+  });
+
+  Graph.end_recording(Queue);
+
+  auto GraphExec = Graph.finalize();
+  Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExec); });
+
+  Queue.memcpy(TestDataOut, TestData, Size * sizeof(T));
+
+  Queue.wait_and_throw();
+
+  std::vector<T> Reference(Size);
+  std::memset(Reference.data(), 1, Size * sizeof(T));
+  for (size_t i = 0; i < Size; i++) {
+    Reference[i] += i;
+  }
+
+  // Check Outputs
+  for (size_t i = 0; i < Size; i++) {
+    assert(TestDataOut[i] == Reference[i]);
+  }
+
+  sycl::free(TestData, Queue);
+
+  return 0;
+}
diff --git a/sycl/unittests/Extensions/CommandGraph.cpp b/sycl/unittests/Extensions/CommandGraph.cpp
index 26d7fb694da9a..9fd2b5db8db11 100644
--- a/sycl/unittests/Extensions/CommandGraph.cpp
+++ b/sycl/unittests/Extensions/CommandGraph.cpp
@@ -1306,6 +1306,286 @@ TEST_F(CommandGraphTest, InOrderQueueWithEmptyLast) {
   ASSERT_EQ(InOrderQueue.get_context(), GraphExecImpl->getContext());
 }
 
+TEST_F(CommandGraphTest, InOrderQueueWithPreviousHostTask) {
+  sycl::property_list Properties{sycl::property::queue::in_order()};
+  sycl::queue InOrderQueue{Dev, Properties};
+  experimental::command_graph<experimental::graph_state::modifiable>
+      InOrderGraph{InOrderQueue.get_context(), InOrderQueue.get_device()};
+
+  auto EventInitial =
+      InOrderQueue.submit([&](handler &CGH) { CGH.host_task([=]() {}); });
+  auto EventInitialImpl = sycl::detail::getSyclObjImpl(EventInitial);
+
+  // Record in-order queue with three nodes
+  InOrderGraph.begin_recording(InOrderQueue);
+  auto Node1Graph = InOrderQueue.submit(
+      [&](sycl::handler &cgh) { cgh.single_task<TestKernel<>>([]() {}); });
+
+  auto PtrNode1 =
+      sycl::detail::getSyclObjImpl(InOrderGraph)
+          ->getLastInorderNode(sycl::detail::getSyclObjImpl(InOrderQueue));
+  ASSERT_NE(PtrNode1, nullptr);
+  ASSERT_TRUE(PtrNode1->MPredecessors.empty());
+
+  auto Node2Graph = InOrderQueue.submit(
+      [&](sycl::handler &cgh) { cgh.single_task<TestKernel<>>([]() {}); });
+
+  auto PtrNode2 =
+      sycl::detail::getSyclObjImpl(InOrderGraph)
+          ->getLastInorderNode(sycl::detail::getSyclObjImpl(InOrderQueue));
+  ASSERT_NE(PtrNode2, nullptr);
+  ASSERT_NE(PtrNode2, PtrNode1);
+  ASSERT_EQ(PtrNode1->MSuccessors.size(), 1lu);
+  ASSERT_EQ(PtrNode1->MSuccessors.front(), PtrNode2);
+  ASSERT_EQ(PtrNode2->MPredecessors.size(), 1lu);
+  ASSERT_EQ(PtrNode2->MPredecessors.front().lock(), PtrNode1);
+
+  auto Node3Graph = InOrderQueue.submit(
+      [&](sycl::handler &cgh) { cgh.single_task<TestKernel<>>([]() {}); });
+
+  auto PtrNode3 =
+      sycl::detail::getSyclObjImpl(InOrderGraph)
+          ->getLastInorderNode(sycl::detail::getSyclObjImpl(InOrderQueue));
+  ASSERT_NE(PtrNode3, nullptr);
+  ASSERT_NE(PtrNode3, PtrNode2);
+  ASSERT_EQ(PtrNode2->MSuccessors.size(), 1lu);
+  ASSERT_EQ(PtrNode2->MSuccessors.front(), PtrNode3);
+  ASSERT_EQ(PtrNode3->MPredecessors.size(), 1lu);
+  ASSERT_EQ(PtrNode3->MPredecessors.front().lock(), PtrNode2);
+
+  InOrderGraph.end_recording(InOrderQueue);
+
+  auto EventLast = InOrderQueue.submit(
+      [&](sycl::handler &cgh) { cgh.single_task<TestKernel<>>([]() {}); });
+
+  auto EventLastImpl = sycl::detail::getSyclObjImpl(EventLast);
+  auto WaitList = EventLastImpl->getWaitList();
+  // The previous task is a host task, so an explicit dependency is needed to
+  // enforce the execution order.
+  ASSERT_EQ(WaitList.size(), 1lu);
+  ASSERT_EQ(WaitList[0], EventInitialImpl);
+}
+
+TEST_F(CommandGraphTest, InOrderQueueHostTaskAndGraph) {
+  sycl::property_list Properties{sycl::property::queue::in_order()};
+  sycl::queue InOrderQueue{Dev, Properties};
+  experimental::command_graph<experimental::graph_state::modifiable>
+      InOrderGraph{InOrderQueue.get_context(), InOrderQueue.get_device()};
+
+  auto EventInitial =
+      InOrderQueue.submit([&](handler &CGH) { CGH.host_task([=]() {}); });
+  auto EventInitialImpl = sycl::detail::getSyclObjImpl(EventInitial);
+
+  // Record in-order queue with three nodes
+  InOrderGraph.begin_recording(InOrderQueue);
+  auto Node1Graph = InOrderQueue.submit(
+      [&](sycl::handler &cgh) { cgh.single_task<TestKernel<>>([]() {}); });
+
+  auto PtrNode1 =
+      sycl::detail::getSyclObjImpl(InOrderGraph)
+          ->getLastInorderNode(sycl::detail::getSyclObjImpl(InOrderQueue));
+  ASSERT_NE(PtrNode1, nullptr);
+  ASSERT_TRUE(PtrNode1->MPredecessors.empty());
+
+  auto Node2Graph = InOrderQueue.submit(
+      [&](sycl::handler &cgh) { cgh.single_task<TestKernel<>>([]() {}); });
+
+  auto PtrNode2 =
+      sycl::detail::getSyclObjImpl(InOrderGraph)
+          ->getLastInorderNode(sycl::detail::getSyclObjImpl(InOrderQueue));
+  ASSERT_NE(PtrNode2, nullptr);
+  ASSERT_NE(PtrNode2, PtrNode1);
+  ASSERT_EQ(PtrNode1->MSuccessors.size(), 1lu);
+  ASSERT_EQ(PtrNode1->MSuccessors.front(), PtrNode2);
+  ASSERT_EQ(PtrNode2->MPredecessors.size(), 1lu);
+  ASSERT_EQ(PtrNode2->MPredecessors.front().lock(), PtrNode1);
+
+  auto Node3Graph = InOrderQueue.submit(
+      [&](sycl::handler &cgh) { cgh.single_task<TestKernel<>>([]() {}); });
+
+  auto PtrNode3 =
+      sycl::detail::getSyclObjImpl(InOrderGraph)
+          ->getLastInorderNode(sycl::detail::getSyclObjImpl(InOrderQueue));
+  ASSERT_NE(PtrNode3, nullptr);
+  ASSERT_NE(PtrNode3, PtrNode2);
+  ASSERT_EQ(PtrNode2->MSuccessors.size(), 1lu);
+  ASSERT_EQ(PtrNode2->MSuccessors.front(), PtrNode3);
+  ASSERT_EQ(PtrNode3->MPredecessors.size(), 1lu);
+  ASSERT_EQ(PtrNode3->MPredecessors.front().lock(), PtrNode2);
+
+  InOrderGraph.end_recording(InOrderQueue);
+
+  auto InOrderGraphExec = InOrderGraph.finalize();
+  auto EventGraph = InOrderQueue.submit(
+      [&](sycl::handler &CGH) { CGH.ext_oneapi_graph(InOrderGraphExec); });
+
+  auto EventGraphImpl = sycl::detail::getSyclObjImpl(EventGraph);
+  auto EventGraphWaitList = EventGraphImpl->getWaitList();
+  // The previous task is a host task, so an explicit dependency is needed to
+  // enforce the execution order.
+  ASSERT_EQ(EventGraphWaitList.size(), 1lu);
+  ASSERT_EQ(EventGraphWaitList[0], EventInitialImpl);
+
+  auto EventLast = InOrderQueue.submit(
+      [&](sycl::handler &cgh) { cgh.single_task<TestKernel<>>([]() {}); });
+  auto EventLastImpl = sycl::detail::getSyclObjImpl(EventLast);
+  auto EventLastWaitList = EventLastImpl->getWaitList();
+  // The previous task is not a host task; in-order queue dependencies are
+  // managed by the backend for non-host-task commands.
+  ASSERT_EQ(EventLastWaitList.size(), 0lu);
+}
+
+TEST_F(CommandGraphTest, InOrderQueueMemsetAndGraph) {
+  sycl::property_list Properties{sycl::property::queue::in_order()};
+  sycl::queue InOrderQueue{Dev, Properties};
+  experimental::command_graph<experimental::graph_state::modifiable>
+      InOrderGraph{InOrderQueue.get_context(), InOrderQueue.get_device()};
+
+  // Skip if the device does not support USM shared allocations.
+  if (!InOrderQueue.get_device().has(sycl::aspect::usm_shared_allocations))
+    return;
+  size_t Size = 128;
+  std::vector<int> TestDataHost(Size);
+  int *TestData = sycl::malloc_shared<int>(Size, InOrderQueue);
+
+  auto EventInitial = InOrderQueue.memset(TestData, 1, Size * sizeof(int));
+  auto EventInitialImpl = sycl::detail::getSyclObjImpl(EventInitial);
+
+  // Record in-order queue with three nodes
+  InOrderGraph.begin_recording(InOrderQueue);
+  auto Node1Graph = InOrderQueue.submit(
+      [&](sycl::handler &cgh) { cgh.single_task<TestKernel<>>([]() {}); });
+
+  auto PtrNode1 =
+      sycl::detail::getSyclObjImpl(InOrderGraph)
+          ->getLastInorderNode(sycl::detail::getSyclObjImpl(InOrderQueue));
+  ASSERT_NE(PtrNode1, nullptr);
+  ASSERT_TRUE(PtrNode1->MPredecessors.empty());
+
+  auto Node2Graph = InOrderQueue.submit(
+      [&](sycl::handler &cgh) { cgh.single_task<TestKernel<>>([]() {}); });
+
+  auto PtrNode2 =
+      sycl::detail::getSyclObjImpl(InOrderGraph)
+          ->getLastInorderNode(sycl::detail::getSyclObjImpl(InOrderQueue));
+  ASSERT_NE(PtrNode2, nullptr);
+  ASSERT_NE(PtrNode2, PtrNode1);
+  ASSERT_EQ(PtrNode1->MSuccessors.size(), 1lu);
+  ASSERT_EQ(PtrNode1->MSuccessors.front(), PtrNode2);
+  ASSERT_EQ(PtrNode2->MPredecessors.size(), 1lu);
+  ASSERT_EQ(PtrNode2->MPredecessors.front().lock(), PtrNode1);
+
+  auto Node3Graph = InOrderQueue.submit(
+      [&](sycl::handler &cgh) { cgh.single_task<TestKernel<>>([]() {}); });
+
+  auto PtrNode3 =
+      sycl::detail::getSyclObjImpl(InOrderGraph)
+          ->getLastInorderNode(sycl::detail::getSyclObjImpl(InOrderQueue));
+  ASSERT_NE(PtrNode3, nullptr);
+  ASSERT_NE(PtrNode3, PtrNode2);
+  ASSERT_EQ(PtrNode2->MSuccessors.size(), 1lu);
+  ASSERT_EQ(PtrNode2->MSuccessors.front(), PtrNode3);
+  ASSERT_EQ(PtrNode3->MPredecessors.size(), 1lu);
+  ASSERT_EQ(PtrNode3->MPredecessors.front().lock(), PtrNode2);
+
+  InOrderGraph.end_recording(InOrderQueue);
+
+  auto InOrderGraphExec = InOrderGraph.finalize();
+  auto EventGraph = InOrderQueue.submit(
+      [&](sycl::handler &CGH) { CGH.ext_oneapi_graph(InOrderGraphExec); });
+
+  auto EventGraphImpl = sycl::detail::getSyclObjImpl(EventGraph);
+  auto EventGraphWaitList = EventGraphImpl->getWaitList();
+  // The previous command is a USM memset, which is tracked via the queue's
+  // last event, so the graph submission must explicitly depend on it.
+  ASSERT_EQ(EventGraphWaitList.size(), 1lu);
+  ASSERT_EQ(EventGraphWaitList[0], EventInitialImpl);
+
+  auto EventLast =
+      InOrderQueue.memcpy(TestData, TestDataHost.data(), Size * sizeof(int));
+  auto EventLastImpl = sycl::detail::getSyclObjImpl(EventLast);
+  auto EventLastWaitList = EventLastImpl->getWaitList();
+  // The previous task is not a host task; in-order queue dependencies are
+  // managed by the backend for non-host-task commands.
+  ASSERT_EQ(EventLastWaitList.size(), 0lu);
+}
+
+TEST_F(CommandGraphTest, InOrderQueueMemcpyAndGraph) {
+  sycl::property_list Properties{sycl::property::queue::in_order()};
+  sycl::queue InOrderQueue{Dev, Properties};
+  experimental::command_graph<experimental::graph_state::modifiable>
+      InOrderGraph{InOrderQueue.get_context(), InOrderQueue.get_device()};
+
+  // Skip if the device does not support USM shared allocations.
+  if (!InOrderQueue.get_device().has(sycl::aspect::usm_shared_allocations))
+    return;
+  size_t Size = 128;
+  std::vector<int> TestDataHost(Size);
+  int *TestData = sycl::malloc_shared<int>(Size, InOrderQueue);
+
+  auto EventInitial =
+      InOrderQueue.memcpy(TestData, TestDataHost.data(), Size * sizeof(int));
+  auto EventInitialImpl = sycl::detail::getSyclObjImpl(EventInitial);
+
+  // Record in-order queue with three nodes
+  InOrderGraph.begin_recording(InOrderQueue);
+  auto Node1Graph = InOrderQueue.submit(
+      [&](sycl::handler &cgh) { cgh.single_task<TestKernel<>>([]() {}); });
+
+  auto PtrNode1 =
+      sycl::detail::getSyclObjImpl(InOrderGraph)
+          ->getLastInorderNode(sycl::detail::getSyclObjImpl(InOrderQueue));
+  ASSERT_NE(PtrNode1, nullptr);
+  ASSERT_TRUE(PtrNode1->MPredecessors.empty());
+
+  auto Node2Graph = InOrderQueue.submit(
+      [&](sycl::handler &cgh) { cgh.single_task<TestKernel<>>([]() {}); });
+
+  auto PtrNode2 =
+      sycl::detail::getSyclObjImpl(InOrderGraph)
+          ->getLastInorderNode(sycl::detail::getSyclObjImpl(InOrderQueue));
+  ASSERT_NE(PtrNode2, nullptr);
+  ASSERT_NE(PtrNode2, PtrNode1);
+  ASSERT_EQ(PtrNode1->MSuccessors.size(), 1lu);
+  ASSERT_EQ(PtrNode1->MSuccessors.front(), PtrNode2);
+  ASSERT_EQ(PtrNode2->MPredecessors.size(), 1lu);
+  ASSERT_EQ(PtrNode2->MPredecessors.front().lock(), PtrNode1);
+
+  auto Node3Graph = InOrderQueue.submit(
+      [&](sycl::handler &cgh) { cgh.single_task<TestKernel<>>([]() {}); });
+
+  auto PtrNode3 =
+      sycl::detail::getSyclObjImpl(InOrderGraph)
+          ->getLastInorderNode(sycl::detail::getSyclObjImpl(InOrderQueue));
+  ASSERT_NE(PtrNode3, nullptr);
+  ASSERT_NE(PtrNode3, PtrNode2);
+  ASSERT_EQ(PtrNode2->MSuccessors.size(), 1lu);
+  ASSERT_EQ(PtrNode2->MSuccessors.front(), PtrNode3);
+  ASSERT_EQ(PtrNode3->MPredecessors.size(), 1lu);
+  ASSERT_EQ(PtrNode3->MPredecessors.front().lock(), PtrNode2);
+
+  InOrderGraph.end_recording(InOrderQueue);
+
+  auto InOrderGraphExec = InOrderGraph.finalize();
+  auto EventGraph = InOrderQueue.submit(
+      [&](sycl::handler &CGH) { CGH.ext_oneapi_graph(InOrderGraphExec); });
+
+  auto EventGraphImpl = sycl::detail::getSyclObjImpl(EventGraph);
+  auto EventGraphWaitList = EventGraphImpl->getWaitList();
+  // The previous command is a USM memcpy, which is tracked via the queue's
+  // last event, so the graph submission must explicitly depend on it.
+  ASSERT_EQ(EventGraphWaitList.size(), 1lu);
+  ASSERT_EQ(EventGraphWaitList[0], EventInitialImpl);
+
+  auto EventLast =
+      InOrderQueue.memcpy(TestData, TestDataHost.data(), Size * sizeof(int));
+  auto EventLastImpl = sycl::detail::getSyclObjImpl(EventLast);
+  auto EventLastWaitList = EventLastImpl->getWaitList();
+  // The previous task is not a host task; in-order queue dependencies are
+  // managed by the backend for non-host-task commands.
+  ASSERT_EQ(EventLastWaitList.size(), 0lu);
+}
+
 TEST_F(CommandGraphTest, ExplicitBarrierException) {
   std::error_code ExceptionCode = make_error_code(sycl::errc::success);
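
For reference, the scenario this patch addresses reduces to the following standalone sketch. It is illustrative only and not part of the patch: it assumes a DPC++ build that implements the experimental sycl_ext_oneapi_graph extension, and the size N is arbitrary. Without the getCommandGraph() check introduced above, the command group recorded into the graph would update the in-order queue's MLastEvent bookkeeping, so later submissions could be ordered against the recorded command group instead of the pending host-task.

// Minimal sketch (illustrative only): a host-task on an in-order queue,
// followed by graph recording on the same queue.
#include <sycl/sycl.hpp>
#include <cassert>

namespace exp_ext = sycl::ext::oneapi::experimental;

int main() {
  sycl::queue Queue{{sycl::property::queue::in_order{}}};

  if (!Queue.get_device().has(sycl::aspect::usm_shared_allocations))
    return 0;

  constexpr size_t N = 1024; // Arbitrary size for illustration.
  int *Data = sycl::malloc_shared<int>(N, Queue);

  // Host-task submitted before recording: the in-order queue tracks it
  // explicitly via its last-event bookkeeping.
  Queue.submit([&](sycl::handler &CGH) {
    CGH.host_task([=]() {
      for (size_t i = 0; i < N; i++)
        Data[i] = static_cast<int>(i);
    });
  });

  exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()};

  // Recorded CGs are outside the queue's execution sequence; with this
  // patch they no longer overwrite the queue's last event.
  Graph.begin_recording(Queue);
  Queue.submit([&](sycl::handler &CGH) {
    CGH.single_task([=]() {
      for (size_t i = 0; i < N; i++)
        Data[i] += static_cast<int>(i);
    });
  });
  Graph.end_recording(Queue);

  auto GraphExec = Graph.finalize();
  // The graph submission must still wait for the pending host-task.
  Queue.submit([&](sycl::handler &CGH) { CGH.ext_oneapi_graph(GraphExec); });
  Queue.wait_and_throw();

  for (size_t i = 0; i < N; i++)
    assert(Data[i] == static_cast<int>(2 * i));

  sycl::free(Data, Queue);
  return 0;
}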