Skip to content

Commit

Permalink
[SYCL][Graph] Fix bug when host-task is submitted to in-order queue (#322)
Browse files Browse the repository at this point in the history

* [SYCL][Graph] Fix bug when host-task is submitted to in-order queue

When a host-task is submitted to an in-order queue, the dependency between this host-task and its successor is explicitly handled.
However, when we record an in-order queue, the recorded CGs are not part of the regular in-order queue execution sequence.
Instead, inter-CG dependencies are managed by the graph implementation.
This PR implements this point and ensures that recording an in-order queue does not impact the normal execution sequence.
Tests (e2e and unit tests) have been added to check it.

 Adds and Renames tests (e2e and unitests)
  • Loading branch information
mfrancepillois committed Sep 26, 2023
1 parent 7429d3f commit 52a1117
Show file tree
Hide file tree
Showing 6 changed files with 499 additions and 6 deletions.
30 changes: 25 additions & 5 deletions sycl/source/detail/queue_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,11 @@ event queue_impl::memset(const std::shared_ptr<detail::queue_impl> &Self,
if (MContext->is_host())
return MDiscardEvents ? createDiscardedEvent() : event();

if (isInOrder()) {
// When a queue is recorded by a graph, the dependencies are managed in the
// graph implementation. Additionally, CGs recorded for a graph are outside of
// the in-order queue execution sequence. Therefore, these CGs must not
// update MLastEvent.
if (isInOrder() && (getCommandGraph() == nullptr)) {
MLastEvent = ResEvent;
// We don't create a command group for usm commands, so set it to None.
// This variable is used to perform explicit dependency management when
Expand Down Expand Up @@ -198,7 +202,11 @@ event queue_impl::memcpy(const std::shared_ptr<detail::queue_impl> &Self,
if (MContext->is_host())
return MDiscardEvents ? createDiscardedEvent() : event();

if (isInOrder()) {
// When a queue is recorded by a graph, the dependencies are managed in the
// graph implementation. Additionally, CGs recorded for a graph are outside of
// the in-order queue execution sequence. Therefore, these CGs must not
// update MLastEvent.
if (isInOrder() && (getCommandGraph() == nullptr)) {
MLastEvent = ResEvent;
// We don't create a command group for usm commands, so set it to None.
// This variable is used to perform explicit dependency management when
Expand Down Expand Up @@ -241,7 +249,11 @@ event queue_impl::mem_advise(const std::shared_ptr<detail::queue_impl> &Self,
if (MContext->is_host())
return MDiscardEvents ? createDiscardedEvent() : event();

if (isInOrder()) {
// When a queue is recorded by a graph, the dependencies are managed in the
// graph implementation. Additionally, CGs recorded for a graph are outside of
// the in-order queue execution sequence. Therefore, these CGs must not
// update MLastEvent.
if (isInOrder() && (getCommandGraph() == nullptr)) {
MLastEvent = ResEvent;
// We don't create a command group for usm commands, so set it to None.
// This variable is used to perform explicit dependency management when
Expand Down Expand Up @@ -286,7 +298,11 @@ event queue_impl::memcpyToDeviceGlobal(
if (MContext->is_host())
return MDiscardEvents ? createDiscardedEvent() : event();

if (isInOrder()) {
// When a queue is recorded by a graph, the dependencies are managed in the
// graph implementation. Additionally, CGs recorded for a graph are outside of
// the in-order queue execution sequence. Therefore, these CGs must not
// update MLastEvent.
if (isInOrder() && (getCommandGraph() == nullptr)) {
MLastEvent = ResEvent;
// We don't create a command group for usm commands, so set it to None.
// This variable is used to perform explicit dependency management when
Expand Down Expand Up @@ -331,7 +347,11 @@ event queue_impl::memcpyFromDeviceGlobal(
if (MContext->is_host())
return MDiscardEvents ? createDiscardedEvent() : event();

if (isInOrder()) {
// When a queue is recorded by a graph, the dependencies are managed in the
// graph implementation. Additionally, CGs recorded for a graph are outside of
// the in-order queue execution sequence. Therefore, these CGs must not
// update MLastEvent.
if (isInOrder() && (getCommandGraph() == nullptr)) {
MLastEvent = ResEvent;
// We don't create a command group for usm commands, so set it to None.
// This variable is used to perform explicit dependency management when
Expand Down
6 changes: 5 additions & 1 deletion sycl/source/detail/queue_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -698,7 +698,11 @@ class queue_impl {
template <typename HandlerType = handler>
void finalizeHandler(HandlerType &Handler, const CG::CGTYPE &Type,
event &EventRet) {
if (MIsInorder) {
// When a queue is recorded by a graph, the dependencies are managed in the
// graph implementation. Additionally, CGs recorded for a graph are outside of
// the in-order queue execution sequence. Therefore, these CGs must not
// update MLastEvent.
if (MIsInorder && (getCommandGraph() == nullptr)) {

auto IsExpDepManaged = [](const CG::CGTYPE &Type) {
return Type == CG::CGTYPE::CodeplayHostTask;
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
// REQUIRES: cuda || level_zero, gpu
// RUN: %{build} -o %t.out
// RUN: %{run} %t.out

// Tests submitting a host-task to an in-order queue before recording
// commands from it.

#include "../graph_common.hpp"

// Verifies that a host-task submitted to an in-order queue before graph
// recording begins does not interfere with the recorded commands nor with
// the queue's normal execution sequence afterwards.
int main() {
  using T = int;

  queue Queue{{sycl::ext::intel::property::queue::no_immediate_command_list{},
               sycl::property::queue::in_order{}}};

  // Devices without shared USM allocations cannot run this test.
  if (!Queue.get_device().has(sycl::aspect::usm_shared_allocations))
    return 0;

  T *Data = sycl::malloc_shared<T>(Size, Queue);

  ext::oneapi::experimental::command_graph Graph{Queue.get_context(),
                                                 Queue.get_device()};

  // Host-task submitted before recording: initializes Data[i] = i.
  Queue.submit([&](handler &CGH) {
    CGH.host_task([=]() {
      for (size_t Idx = 0; Idx < Size; Idx++)
        Data[Idx] = static_cast<T>(Idx);
    });
  });

  Graph.begin_recording(Queue);

  // Recorded command group: Data[i] += i.
  auto GraphEvent = Queue.submit([&](handler &CGH) {
    CGH.single_task<class TestKernel2>([=]() {
      for (size_t Idx = 0; Idx < Size; Idx++)
        Data[Idx] += static_cast<T>(Idx);
    });
  });

  Graph.end_recording(Queue);

  auto GraphExec = Graph.finalize();
  Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExec); });

  // Regular kernel submitted after recording ended: Data[i] *= i.
  Queue.submit([&](handler &CGH) {
    CGH.single_task<class TestKernel3>([=]() {
      for (size_t Idx = 0; Idx < Size; Idx++)
        Data[Idx] *= static_cast<T>(Idx);
    });
  });

  Queue.wait_and_throw();

  // Expected value per element: ((i + i) * i) once the host-task, the
  // executed graph, and the follow-up kernel have all run in order.
  for (size_t Idx = 0; Idx < Size; Idx++)
    assert(Data[Idx] == ((Idx + Idx) * Idx));

  sycl::free(Data, Queue);

  return 0;
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
// REQUIRES: cuda || level_zero, gpu
// RUN: %{build} -o %t.out
// RUN: %{run} %t.out

// Tests submitting memcpy to an in-order queue before recording
// commands from it.

#include "../graph_common.hpp"

// Verifies that a memcpy submitted to an in-order queue *before* graph
// recording begins is correctly ordered with respect to the recorded
// commands and the queue's normal execution sequence.
int main() {
  using T = int;

  queue Queue{{sycl::ext::intel::property::queue::no_immediate_command_list{},
               sycl::property::queue::in_order{}}};

  // Devices without shared USM allocations cannot run this test.
  if (!Queue.get_device().has(sycl::aspect::usm_shared_allocations))
    return 0;

  std::vector<T> TestDataIn(Size);
  T *TestData = sycl::malloc_shared<T>(Size, Queue);
  T *TestDataOut = sycl::malloc_shared<T>(Size, Queue);

  ext::oneapi::experimental::command_graph Graph{Queue.get_context(),
                                                 Queue.get_device()};

  // NOTE: std::memset writes the byte 0x01 into every byte, so each T holds
  // 0x01010101 rather than 1; the reference below uses the same memset, so
  // the final comparison stays consistent.
  std::memset(TestDataIn.data(), 1, Size * sizeof(T));
  Queue.memcpy(TestData, TestDataIn.data(), Size * sizeof(T));

  Graph.begin_recording(Queue);

  // Recorded command group: TestData[i] += i.
  auto GraphEvent = Queue.submit([&](handler &CGH) {
    CGH.single_task<class TestKernel2>([=]() {
      for (size_t i = 0; i < Size; i++) {
        TestData[i] += static_cast<T>(i);
      }
    });
  });

  Graph.end_recording(Queue);

  auto GraphExec = Graph.finalize();
  Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExec); });

  Queue.memcpy(TestDataOut, TestData, Size * sizeof(T));

  Queue.wait_and_throw();

  std::vector<T> Reference(Size);
  std::memset(Reference.data(), 1, Size * sizeof(T));
  for (size_t i = 0; i < Size; i++) {
    Reference[i] += i;
  }

  // Check Outputs
  for (size_t i = 0; i < Size; i++) {
    assert(TestDataOut[i] == Reference[i]);
  }

  sycl::free(TestData, Queue);
  // Fix: TestDataOut was allocated with malloc_shared but never freed.
  sycl::free(TestDataOut, Queue);

  return 0;
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
// REQUIRES: cuda || level_zero, gpu
// RUN: %{build} -o %t.out
// RUN: %{run} %t.out

// Tests submitting memset to an in-order queue before recording
// commands from it.

#include "../graph_common.hpp"

// Verifies that a memset submitted to an in-order queue *before* graph
// recording begins is correctly ordered with respect to the recorded
// commands and the queue's normal execution sequence.
int main() {
  using T = int;

  queue Queue{{sycl::ext::intel::property::queue::no_immediate_command_list{},
               sycl::property::queue::in_order{}}};

  // Devices without shared USM allocations cannot run this test.
  if (!Queue.get_device().has(sycl::aspect::usm_shared_allocations))
    return 0;

  T *TestData = sycl::malloc_shared<T>(Size, Queue);
  T *TestDataOut = sycl::malloc_shared<T>(Size, Queue);

  ext::oneapi::experimental::command_graph Graph{Queue.get_context(),
                                                 Queue.get_device()};

  // NOTE: queue::memset writes the byte 0x01 into every byte, so each T
  // holds 0x01010101 rather than 1; the reference below uses std::memset
  // with the same arguments, so the final comparison stays consistent.
  Queue.memset(TestData, 1, Size * sizeof(T));

  Graph.begin_recording(Queue);

  // Recorded command group: TestData[i] += i.
  auto GraphEvent = Queue.submit([&](handler &CGH) {
    CGH.single_task<class TestKernel2>([=]() {
      for (size_t i = 0; i < Size; i++) {
        TestData[i] += static_cast<T>(i);
      }
    });
  });

  Graph.end_recording(Queue);

  auto GraphExec = Graph.finalize();
  Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExec); });

  Queue.memcpy(TestDataOut, TestData, Size * sizeof(T));

  Queue.wait_and_throw();

  std::vector<T> Reference(Size);
  std::memset(Reference.data(), 1, Size * sizeof(T));
  for (size_t i = 0; i < Size; i++) {
    Reference[i] += i;
  }

  // Check Outputs
  for (size_t i = 0; i < Size; i++) {
    assert(TestDataOut[i] == Reference[i]);
  }

  sycl::free(TestData, Queue);
  // Fix: TestDataOut was allocated with malloc_shared but never freed.
  sycl::free(TestDataOut, Queue);

  return 0;
}
Loading

0 comments on commit 52a1117

Please sign in to comment.