diff --git a/sycl/doc/design/CommandGraph.md b/sycl/doc/design/CommandGraph.md index 9b57dd51c0a9a..d45a53b65d136 100644 --- a/sycl/doc/design/CommandGraph.md +++ b/sycl/doc/design/CommandGraph.md @@ -219,6 +219,13 @@ Level Zero: `waitForEvents` on the same command-list. Resulting in additional latency when executing a UR command-buffer. +3. Dependencies between multiple submissions must be handled by the runtime. + Indeed, when a second submission is performed the signal conditions + of *WaitEvent* are redefined by this second submission. + Therefore, this can lead to an undefined behavior and potential + hangs especially if the conditions of the first submissions were not yet + satisfied and the event has not yet been signaled. + Future work will include exploring L0 API extensions to improve the mapping of UR command-buffer to L0 command-list. diff --git a/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc index 1dd5537a496f8..303af1f0da6c5 100644 --- a/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc +++ b/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc @@ -1059,9 +1059,10 @@ void handler::ext_oneapi_graph(command_graph& graph) ---- -|Invokes the execution of a graph. Only one instance of `graph` may be executing, -or pending execution, at any time. Concurrent graph execution can be achieved by -finalizing a graph in modifiable state into multiple graphs in executable state. +|Invokes the execution of a graph. Only one instance of `graph` will +execute at any time. If `graph` is submitted multiple times, dependencies +are automatically added by the runtime to prevent concurrent executions of +an identical graph. Parameters: @@ -1073,8 +1074,6 @@ Exceptions: to a queue which is associated with a device or context that is different from the device and context used on creation of the graph. -* Throws synchronously with error code `invalid` if a previous submission of - `graph` has yet to complete execution. |=== === Thread Safety @@ -1600,6 +1599,12 @@ outputs of the modifiable graph, a technique called _Whole Graph Update_. The modifiable graph must have the same topology as the graph originally used to create the executable graphs, with the nodes targeting the same devices and added in the same order. +If a graph has been updated since its last submission, the sequential +execution constraint is no longer required. +The automatic addition of dependencies is disabled and updated graphs +can be submitted simultaneously. +Users are therefore responsible for explicitly managing potential dependencies +between these executions to avoid data races. :sycl-kernel-function: https://registry.khronos.org/SYCL/specs/sycl-2020/html/sycl-2020.html#sycl-kernel-function @@ -1729,15 +1734,6 @@ runtime. == Issues -=== Simultaneous Graph Submission - -Enable an instance of a graph in executable state to be submitted for execution -when a previous submission of the same graph has yet to complete execution. - -**UNRESOLVED:** Trending "yes". Backend support for this is inconsistent, but -the runtime could schedule the submissions sequentially for backends which don't -support it. - === Multi Device Graph Allow an executable graph to contain nodes targeting different devices. @@ -1802,6 +1798,9 @@ if used in application code. `sycl::ext::intel::property::queue::no_immediate_command_list` should be set on construction to any queues an executable graph is submitted to. +. Synchronization between multiple executions of the same command-buffer + must be handled in the host for level-zero backend, which may involve + extra latency for subsequent submissions. == Revision History diff --git a/sycl/source/detail/graph_impl.cpp b/sycl/source/detail/graph_impl.cpp index 168d1bc83f253..b30c72510478f 100644 --- a/sycl/source/detail/graph_impl.cpp +++ b/sycl/source/detail/graph_impl.cpp @@ -596,11 +596,40 @@ exec_graph_impl::enqueue(const std::shared_ptr &Queue, sycl::detail::EventImplPtr NewEvent; if (CommandBuffer) { - if (!previousSubmissionCompleted()) { - throw sycl::exception(make_error_code(errc::invalid), - "This Graph cannot be submitted at the moment " - "because the previous run has not yet completed."); + // if previous submissions are incompleted, we automatically + // add completion events of previous submissions as dependencies. + // With Level-Zero backend we cannot resubmit a command-buffer until the + // previous one has already completed. + // Indeed, since a command-list does not accept a list a dependencies at + // submission, we circumvent this lack by adding a barrier that waits on a + // specific event and then define the conditions to signal this event in + // another command-list. Consequently, if a second submission is performed, + // the signal conditions of this single event are redefined by this second + // submission. Thus, this can lead to an undefined behaviour and potential + // hangs. We have therefore to expliclty wait in the host for + // previous submission to complete before resubmitting the command-buffer + // for level-zero backend. + // TODO : add a check to release this constraint and allow multiple + // concurrent submissions if the exec_graph has been updated since the last + // submission. + for (std::vector::iterator It = + MExecutionEvents.begin(); + It != MExecutionEvents.end();) { + auto Event = *It; + if (!Event->isCompleted()) { + if (Queue->get_device().get_backend() == + sycl::backend::ext_oneapi_level_zero) { + Event->wait(Event); + } else { + CGData.MEvents.push_back(Event); + } + ++It; + } else { + // Remove completed events + It = MExecutionEvents.erase(It); + } } + NewEvent = CreateNewEvent(); sycl::detail::pi::PiEvent *OutEvent = &NewEvent->getHandleRef(); // Merge requirements from the nodes into requirements (if any) from the diff --git a/sycl/source/detail/graph_impl.hpp b/sycl/source/detail/graph_impl.hpp index 949b08d480e75..061d564138424 100644 --- a/sycl/source/detail/graph_impl.hpp +++ b/sycl/source/detail/graph_impl.hpp @@ -917,20 +917,6 @@ class exec_graph_impl { /// @return pointer to the graph_impl MGraphImpl const std::shared_ptr &getGraphImpl() const { return MGraphImpl; } - /// Checks if the previous submissions of this graph have been completed - /// This function checks the status of events associated to the previous graph - /// submissions. - /// @return true if all previous submissions have been completed, false - /// otherwise. - bool previousSubmissionCompleted() const { - for (auto Event : MExecutionEvents) { - if (!Event->isCompleted()) { - return false; - } - } - return true; - } - private: /// Create a command-group for the node and add it to command-buffer by going /// through the scheduler. diff --git a/sycl/test-e2e/Graph/Explicit/enqueue_ordering.cpp b/sycl/test-e2e/Graph/Explicit/enqueue_ordering.cpp index d4f0bd4c8045a..e722e9016296b 100644 --- a/sycl/test-e2e/Graph/Explicit/enqueue_ordering.cpp +++ b/sycl/test-e2e/Graph/Explicit/enqueue_ordering.cpp @@ -72,8 +72,6 @@ int main() { }); }); - E4.wait(); - // Buffer elements set to 22 Queue.submit([&](handler &CGH) { CGH.depends_on(E5); diff --git a/sycl/test-e2e/Graph/Inputs/add_nodes_after_finalize.cpp b/sycl/test-e2e/Graph/Inputs/add_nodes_after_finalize.cpp index aa39218bc5f9b..9452b05850d12 100644 --- a/sycl/test-e2e/Graph/Inputs/add_nodes_after_finalize.cpp +++ b/sycl/test-e2e/Graph/Inputs/add_nodes_after_finalize.cpp @@ -64,18 +64,14 @@ int main() { event Event; for (unsigned n = 0; n < Iterations; n++) { Event = Queue.submit([&](handler &CGH) { - CGH.depends_on(Event); CGH.ext_oneapi_graph(GraphExec); }); - Event.wait(); } for (unsigned n = 0; n < Iterations; n++) { Event = Queue.submit([&](handler &CGH) { - CGH.depends_on(Event); CGH.ext_oneapi_graph(GraphExecAdditional); }); - Event.wait(); } Queue.wait_and_throw(); diff --git a/sycl/test-e2e/Graph/Inputs/basic_buffer.cpp b/sycl/test-e2e/Graph/Inputs/basic_buffer.cpp index ea46da424dccf..5511081295424 100644 --- a/sycl/test-e2e/Graph/Inputs/basic_buffer.cpp +++ b/sycl/test-e2e/Graph/Inputs/basic_buffer.cpp @@ -42,10 +42,8 @@ int main() { event Event; for (unsigned n = 0; n < Iterations; n++) { Event = Queue.submit([&](handler &CGH) { - CGH.depends_on(Event); CGH.ext_oneapi_graph(GraphExec); }); - Event.wait(); } Queue.wait_and_throw(); } diff --git a/sycl/test-e2e/Graph/Inputs/basic_usm.cpp b/sycl/test-e2e/Graph/Inputs/basic_usm.cpp index c5674ba6b4f67..068e8fed4bedf 100644 --- a/sycl/test-e2e/Graph/Inputs/basic_usm.cpp +++ b/sycl/test-e2e/Graph/Inputs/basic_usm.cpp @@ -50,10 +50,8 @@ int main() { event Event; for (unsigned n = 0; n < Iterations; n++) { Event = Queue.submit([&](handler &CGH) { - CGH.depends_on(Event); CGH.ext_oneapi_graph(GraphExec); }); - Event.wait(); } Queue.wait_and_throw(); diff --git a/sycl/test-e2e/Graph/Inputs/basic_usm_host.cpp b/sycl/test-e2e/Graph/Inputs/basic_usm_host.cpp index 153ef5ec0b95b..88d5ed57ba1c9 100644 --- a/sycl/test-e2e/Graph/Inputs/basic_usm_host.cpp +++ b/sycl/test-e2e/Graph/Inputs/basic_usm_host.cpp @@ -45,10 +45,8 @@ int main() { event Event; for (unsigned n = 0; n < Iterations; n++) { Event = Queue.submit([&](handler &CGH) { - CGH.depends_on(Event); CGH.ext_oneapi_graph(GraphExec); }); - Event.wait(); } Queue.wait_and_throw(); diff --git a/sycl/test-e2e/Graph/Inputs/basic_usm_mixed.cpp b/sycl/test-e2e/Graph/Inputs/basic_usm_mixed.cpp index 91d8977a6f8c6..dd67f8f3a5286 100644 --- a/sycl/test-e2e/Graph/Inputs/basic_usm_mixed.cpp +++ b/sycl/test-e2e/Graph/Inputs/basic_usm_mixed.cpp @@ -48,10 +48,8 @@ int main() { event Event; for (unsigned n = 0; n < Iterations; n++) { Event = Queue.submit([&](handler &CGH) { - CGH.depends_on(Event); CGH.ext_oneapi_graph(GraphExec); }); - Event.wait(); } Queue.wait_and_throw(); diff --git a/sycl/test-e2e/Graph/Inputs/basic_usm_shared.cpp b/sycl/test-e2e/Graph/Inputs/basic_usm_shared.cpp index 2ae585e095800..0ed6c91e58e40 100644 --- a/sycl/test-e2e/Graph/Inputs/basic_usm_shared.cpp +++ b/sycl/test-e2e/Graph/Inputs/basic_usm_shared.cpp @@ -45,10 +45,8 @@ int main() { event Event; for (unsigned n = 0; n < Iterations; n++) { Event = Queue.submit([&](handler &CGH) { - CGH.depends_on(Event); CGH.ext_oneapi_graph(GraphExec); }); - Event.wait(); } Queue.wait_and_throw(); diff --git a/sycl/test-e2e/Graph/Inputs/basic_usm_system.cpp b/sycl/test-e2e/Graph/Inputs/basic_usm_system.cpp index 07deda4a18b3e..6322b0923b851 100644 --- a/sycl/test-e2e/Graph/Inputs/basic_usm_system.cpp +++ b/sycl/test-e2e/Graph/Inputs/basic_usm_system.cpp @@ -45,7 +45,6 @@ int main() { event Event; for (unsigned n = 0; n < Iterations; n++) { Event = Queue.submit([&](handler &CGH) { - CGH.depends_on(Event); CGH.ext_oneapi_graph(GraphExec); }); } diff --git a/sycl/test-e2e/Graph/Inputs/buffer_copy.cpp b/sycl/test-e2e/Graph/Inputs/buffer_copy.cpp index 26bcb74dfc499..350a4dcb8e7ff 100644 --- a/sycl/test-e2e/Graph/Inputs/buffer_copy.cpp +++ b/sycl/test-e2e/Graph/Inputs/buffer_copy.cpp @@ -111,10 +111,8 @@ int main() { event Event; for (unsigned n = 0; n < Iterations; n++) { Event = Queue.submit([&](handler &CGH) { - CGH.depends_on(Event); CGH.ext_oneapi_graph(GraphExec); }); - Event.wait(); } Queue.wait_and_throw(); } diff --git a/sycl/test-e2e/Graph/Inputs/buffer_copy_2d.cpp b/sycl/test-e2e/Graph/Inputs/buffer_copy_2d.cpp index a29139b997a87..46d046761efab 100644 --- a/sycl/test-e2e/Graph/Inputs/buffer_copy_2d.cpp +++ b/sycl/test-e2e/Graph/Inputs/buffer_copy_2d.cpp @@ -106,10 +106,8 @@ int main() { event Event; for (unsigned n = 0; n < Iterations; n++) { Event = Queue.submit([&](handler &CGH) { - CGH.depends_on(Event); CGH.ext_oneapi_graph(GraphExec); }); - Event.wait(); } Queue.wait_and_throw(); } diff --git a/sycl/test-e2e/Graph/Inputs/buffer_ordering.cpp b/sycl/test-e2e/Graph/Inputs/buffer_ordering.cpp index 2cfe245c9e423..64aab69956562 100644 --- a/sycl/test-e2e/Graph/Inputs/buffer_ordering.cpp +++ b/sycl/test-e2e/Graph/Inputs/buffer_ordering.cpp @@ -90,7 +90,6 @@ int main() { }); }); - Event.wait(); // Buffer elements set to 22 Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(ExecGraph); }); diff --git a/sycl/test-e2e/Graph/Inputs/host_task.cpp b/sycl/test-e2e/Graph/Inputs/host_task.cpp index f7f1d0d30f393..78f1d755a7b12 100644 --- a/sycl/test-e2e/Graph/Inputs/host_task.cpp +++ b/sycl/test-e2e/Graph/Inputs/host_task.cpp @@ -73,7 +73,6 @@ int main() { event Event; for (unsigned n = 0; n < Iterations; n++) { Event = Queue.submit([&](handler &CGH) { - CGH.depends_on(Event); CGH.ext_oneapi_graph(GraphExec); }); } diff --git a/sycl/test-e2e/Graph/Inputs/multiple_exec_graphs.cpp b/sycl/test-e2e/Graph/Inputs/multiple_exec_graphs.cpp index 18a922d91c694..8ea6a923025da 100644 --- a/sycl/test-e2e/Graph/Inputs/multiple_exec_graphs.cpp +++ b/sycl/test-e2e/Graph/Inputs/multiple_exec_graphs.cpp @@ -43,7 +43,6 @@ int main() { for (unsigned n = 0; n < Iterations; n++) { auto GraphExec = Graph.finalize(); Event = Queue.submit([&](handler &CGH) { - CGH.depends_on(Event); CGH.ext_oneapi_graph(GraphExec); }); } diff --git a/sycl/test-e2e/Graph/Inputs/queue_shortcuts.cpp b/sycl/test-e2e/Graph/Inputs/queue_shortcuts.cpp index 0e16ba16242c5..9471ab645eb6b 100644 --- a/sycl/test-e2e/Graph/Inputs/queue_shortcuts.cpp +++ b/sycl/test-e2e/Graph/Inputs/queue_shortcuts.cpp @@ -42,14 +42,12 @@ int main() { // Execute several iterations of the graph using the different shortcuts event Event = Queue.ext_oneapi_graph(GraphExec); - Event.wait(); assert(Iterations > 2); const size_t LoopIterations = Iterations - 2; std::vector Events(LoopIterations); for (unsigned n = 0; n < LoopIterations; n++) { Events[n] = Queue.ext_oneapi_graph(GraphExec, Event); - Events[n].wait(); } Queue.ext_oneapi_graph(GraphExec, Events).wait(); diff --git a/sycl/test-e2e/Graph/Inputs/sub_graph.cpp b/sycl/test-e2e/Graph/Inputs/sub_graph.cpp index 725aacc778d94..a02feb72d164c 100644 --- a/sycl/test-e2e/Graph/Inputs/sub_graph.cpp +++ b/sycl/test-e2e/Graph/Inputs/sub_graph.cpp @@ -104,10 +104,8 @@ int main() { event Event; for (unsigned n = 0; n < Iterations; n++) { Event = Queue.submit([&](handler &CGH) { - CGH.depends_on(Event); CGH.ext_oneapi_graph(MainGraphExec); }); - Event.wait(); } Queue.wait_and_throw(); diff --git a/sycl/test-e2e/Graph/Inputs/temp_buffer_reinterpret.cpp b/sycl/test-e2e/Graph/Inputs/temp_buffer_reinterpret.cpp index 2a251d2edb688..c6bc14cc62e9b 100644 --- a/sycl/test-e2e/Graph/Inputs/temp_buffer_reinterpret.cpp +++ b/sycl/test-e2e/Graph/Inputs/temp_buffer_reinterpret.cpp @@ -49,10 +49,8 @@ int main() { event Event; for (unsigned n = 0; n < Iterations; n++) { Event = Queue.submit([&](handler &CGH) { - CGH.depends_on(Event); CGH.ext_oneapi_graph(GraphExec); }); - Event.wait(); } Queue.copy(BufferA.get_access(), DataA.data()); diff --git a/sycl/test-e2e/Graph/Inputs/usm_copy.cpp b/sycl/test-e2e/Graph/Inputs/usm_copy.cpp index 9904b2da7dd47..d38ae78ae4da4 100644 --- a/sycl/test-e2e/Graph/Inputs/usm_copy.cpp +++ b/sycl/test-e2e/Graph/Inputs/usm_copy.cpp @@ -104,10 +104,8 @@ int main() { event Event; for (unsigned n = 0; n < Iterations; n++) { Event = Queue.submit([&](handler &CGH) { - CGH.depends_on(Event); CGH.ext_oneapi_graph(GraphExec); }); - Event.wait(); } Queue.copy(PtrA, DataA.data(), Size, Event); diff --git a/sycl/test-e2e/Graph/RecordReplay/after_use.cpp b/sycl/test-e2e/Graph/RecordReplay/after_use.cpp index eb0ed27358e04..44cffdb5ad508 100644 --- a/sycl/test-e2e/Graph/RecordReplay/after_use.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/after_use.cpp @@ -54,10 +54,8 @@ int main() { // before graph recording) for (unsigned n = 1; n < Iterations; n++) { Event = Queue.submit([&](handler &CGH) { - CGH.depends_on(Event); CGH.ext_oneapi_graph(GraphExec); }); - Event.wait(); } Queue.wait_and_throw(); diff --git a/sycl/test-e2e/Graph/RecordReplay/barrier_with_work.cpp b/sycl/test-e2e/Graph/RecordReplay/barrier_with_work.cpp index 2935984f433fa..7781bb031287a 100644 --- a/sycl/test-e2e/Graph/RecordReplay/barrier_with_work.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/barrier_with_work.cpp @@ -100,10 +100,8 @@ int main() { event Event; for (unsigned n = 0; n < Iterations; n++) { Event = Queue.submit([&](handler &CGH) { - CGH.depends_on(Event); CGH.ext_oneapi_graph(GraphExec); }); - Event.wait(); } Queue.wait_and_throw(); diff --git a/sycl/test-e2e/Graph/submission_while_executing.cpp b/sycl/test-e2e/Graph/submission_while_executing.cpp deleted file mode 100644 index 80fbb26f8ddae..0000000000000 --- a/sycl/test-e2e/Graph/submission_while_executing.cpp +++ /dev/null @@ -1,128 +0,0 @@ -// RUN: %{build} -o %t.out -// RUN: %{run} %t.out -// RUN: %if ext_oneapi_level_zero %{env UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s %} -// -// CHECK-NOT: LEAK - -// Test calling queue::submit(graph) while the previous submission of graph has -// not been completed. The second run is to check that there are no leaks -// reported with the embedded UR_L0_LEAKS_DEBUG=1 testing capability. - -#include "graph_common.hpp" - -inline bool -isSubmittedOrRunningCommand(sycl::info::event_command_status Status) { - return ((Status == sycl::info::event_command_status::submitted) || - (Status == sycl::info::event_command_status::running)); -} - -int main() { - queue Queue{{sycl::ext::intel::property::queue::no_immediate_command_list{}}}; - - if (!are_graphs_supported(Queue)) { - return 0; - } - - using T = int; - - size_t LargeSize = - 10000; // we use large Size to increase the kernel execution time - size_t NumIterations = 10; - size_t SuccessfulSubmissions = 0; - - std::vector DataA(LargeSize), DataB(LargeSize), DataC(LargeSize); - - std::iota(DataA.begin(), DataA.end(), 1); - std::iota(DataB.begin(), DataB.end(), 10); - std::iota(DataC.begin(), DataC.end(), 1000); - - std::vector ReferenceA(DataA), ReferenceB(DataB), ReferenceC(DataC); - - exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; - - T *PtrA = malloc_device(LargeSize, Queue); - T *PtrB = malloc_device(LargeSize, Queue); - T *PtrC = malloc_device(LargeSize, Queue); - - Queue.copy(DataA.data(), PtrA, LargeSize); - Queue.copy(DataB.data(), PtrB, LargeSize); - Queue.copy(DataC.data(), PtrC, LargeSize); - Queue.wait_and_throw(); - - Graph.begin_recording(Queue); - run_kernels_usm(Queue, LargeSize, PtrA, PtrB, PtrC); - Graph.end_recording(); - - auto GraphExec = Graph.finalize(); - - // Serial Submissions - for (unsigned i = 0; i < NumIterations; ++i) { - Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExec); }); - Queue.wait_and_throw(); - } - - // Concurrent Submissions - sycl::event PreEvent, Event; - sycl::info::event_command_status PreEventInfoStateBefore = - sycl::info::event_command_status::ext_oneapi_unknown; - sycl::info::event_command_status PreEventInfoStateAfter = - sycl::info::event_command_status::ext_oneapi_unknown; - for (unsigned i = 0; i < NumIterations; ++i) { - std::error_code ErrorCode = make_error_code(sycl::errc::success); - PreEventInfoStateBefore = - PreEvent.get_info(); - - // Submit the kernel - try { - Event = - Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExec); }); - } catch (const sycl::exception &e) { - ErrorCode = e.code(); - } - PreEventInfoStateAfter = - PreEvent.get_info(); - - // Check submission status - if (isSubmittedOrRunningCommand(PreEventInfoStateBefore) && - isSubmittedOrRunningCommand(PreEventInfoStateAfter)) { - assert(ErrorCode == sycl::errc::invalid); - } else if (PreEventInfoStateBefore == - sycl::info::event_command_status::complete) { - // Submission has succeeded - SuccessfulSubmissions++; - PreEvent = Event; - } else { - // We cannot be sure of the state of the previous task when the current - // submission occurred because `PreEventInfoStateBefore` and - // `PreEventInfoStateAfter` indicate different status We therefore only - // read the submission status and increment the number of successful - // submissions if the submission was successful - if (ErrorCode == sycl::errc::success) { - SuccessfulSubmissions++; - PreEvent = Event; - } - } - } - Queue.wait_and_throw(); - - Queue.copy(PtrA, DataA.data(), LargeSize); - Queue.copy(PtrB, DataB.data(), LargeSize); - Queue.copy(PtrC, DataC.data(), LargeSize); - Queue.wait_and_throw(); - - free(PtrA, Queue); - free(PtrB, Queue); - free(PtrC, Queue); - - // Compute the reference based on the total number of successful executions - calculate_reference_data(NumIterations + SuccessfulSubmissions, LargeSize, - ReferenceA, ReferenceB, ReferenceC); - - for (size_t i = 0; i < Size; i++) { - assert(check_value(i, ReferenceA[i], DataA[i], "DataA")); - assert(check_value(i, ReferenceB[i], DataB[i], "DataB")); - assert(check_value(i, ReferenceC[i], DataC[i], "DataC")); - } - - return 0; -}