diff --git a/sycl/source/detail/graph_impl.cpp b/sycl/source/detail/graph_impl.cpp index 75466bc71181d..61b7895f6abd5 100644 --- a/sycl/source/detail/graph_impl.cpp +++ b/sycl/source/detail/graph_impl.cpp @@ -623,6 +623,11 @@ exec_graph_impl::enqueue(const std::shared_ptr &Queue, sycl::detail::EventImplPtr NewEvent; if (CommandBuffer) { + if (!previousSubmissionCompleted()) { + throw sycl::exception(make_error_code(errc::invalid), + "This Graph cannot be submitted at the moment " + "because the previous run has not yet completed."); + } NewEvent = CreateNewEvent(); sycl::detail::pi::PiEvent *OutEvent = &NewEvent->getHandleRef(); // Merge requirements from the nodes into requirements (if any) from the diff --git a/sycl/source/detail/graph_impl.hpp b/sycl/source/detail/graph_impl.hpp index 583ae3ec5e5d2..1a725a2a55ba7 100644 --- a/sycl/source/detail/graph_impl.hpp +++ b/sycl/source/detail/graph_impl.hpp @@ -14,6 +14,7 @@ #include #include +#include #include #include @@ -869,6 +870,20 @@ class exec_graph_impl { return false; } + /// Checks if the previous submissions of this graph have been completed + /// This function checks the status of events associated to the previous graph + /// submissions. + /// @return true if all previous submissions have been completed, false + /// otherwise. + bool previousSubmissionCompleted() const { + for (auto Event : MExecutionEvents) { + if (!Event->isCompleted()) { + return false; + } + } + return true; + } + private: /// Create a command-group for the node and add it to command-buffer by /// going through the scheduler. diff --git a/sycl/test-e2e/Graph/Explicit/enqueue_ordering.cpp b/sycl/test-e2e/Graph/Explicit/enqueue_ordering.cpp index 9739828a044f4..3e8c0500bcba1 100644 --- a/sycl/test-e2e/Graph/Explicit/enqueue_ordering.cpp +++ b/sycl/test-e2e/Graph/Explicit/enqueue_ordering.cpp @@ -70,6 +70,8 @@ int main() { }); }); + E4.wait(); + // Buffer elements set to 22 Queue.submit([&](handler &CGH) { CGH.depends_on(E5); diff --git a/sycl/test-e2e/Graph/Inputs/add_nodes_after_finalize.cpp b/sycl/test-e2e/Graph/Inputs/add_nodes_after_finalize.cpp index 432eb2cf33e00..68e1712409281 100644 --- a/sycl/test-e2e/Graph/Inputs/add_nodes_after_finalize.cpp +++ b/sycl/test-e2e/Graph/Inputs/add_nodes_after_finalize.cpp @@ -63,6 +63,7 @@ int main() { CGH.depends_on(Event); CGH.ext_oneapi_graph(GraphExec); }); + Event.wait(); } for (unsigned n = 0; n < Iterations; n++) { @@ -70,6 +71,7 @@ int main() { CGH.depends_on(Event); CGH.ext_oneapi_graph(GraphExecAdditional); }); + Event.wait(); } Queue.wait_and_throw(); diff --git a/sycl/test-e2e/Graph/Inputs/basic_buffer.cpp b/sycl/test-e2e/Graph/Inputs/basic_buffer.cpp index b3c8246feb5a7..e3446e1cc0a60 100644 --- a/sycl/test-e2e/Graph/Inputs/basic_buffer.cpp +++ b/sycl/test-e2e/Graph/Inputs/basic_buffer.cpp @@ -41,6 +41,7 @@ int main() { CGH.depends_on(Event); CGH.ext_oneapi_graph(GraphExec); }); + Event.wait(); } Queue.wait_and_throw(); } diff --git a/sycl/test-e2e/Graph/Inputs/basic_usm.cpp b/sycl/test-e2e/Graph/Inputs/basic_usm.cpp index 833065e1d3756..cf48b955b9459 100644 --- a/sycl/test-e2e/Graph/Inputs/basic_usm.cpp +++ b/sycl/test-e2e/Graph/Inputs/basic_usm.cpp @@ -49,6 +49,7 @@ int main() { CGH.depends_on(Event); CGH.ext_oneapi_graph(GraphExec); }); + Event.wait(); } Queue.wait_and_throw(); diff --git a/sycl/test-e2e/Graph/Inputs/basic_usm_host.cpp b/sycl/test-e2e/Graph/Inputs/basic_usm_host.cpp index c39782e0d6149..44cae7ac52235 100644 --- a/sycl/test-e2e/Graph/Inputs/basic_usm_host.cpp +++ b/sycl/test-e2e/Graph/Inputs/basic_usm_host.cpp @@ -44,6 +44,7 @@ int main() { CGH.depends_on(Event); CGH.ext_oneapi_graph(GraphExec); }); + Event.wait(); } Queue.wait_and_throw(); diff --git a/sycl/test-e2e/Graph/Inputs/basic_usm_mixed.cpp b/sycl/test-e2e/Graph/Inputs/basic_usm_mixed.cpp index 626dd3b1ee950..a11b201d1476b 100644 --- a/sycl/test-e2e/Graph/Inputs/basic_usm_mixed.cpp +++ b/sycl/test-e2e/Graph/Inputs/basic_usm_mixed.cpp @@ -47,6 +47,7 @@ int main() { CGH.depends_on(Event); CGH.ext_oneapi_graph(GraphExec); }); + Event.wait(); } Queue.wait_and_throw(); diff --git a/sycl/test-e2e/Graph/Inputs/basic_usm_shared.cpp b/sycl/test-e2e/Graph/Inputs/basic_usm_shared.cpp index 469394b7883e1..da4d951583c0f 100644 --- a/sycl/test-e2e/Graph/Inputs/basic_usm_shared.cpp +++ b/sycl/test-e2e/Graph/Inputs/basic_usm_shared.cpp @@ -44,6 +44,7 @@ int main() { CGH.depends_on(Event); CGH.ext_oneapi_graph(GraphExec); }); + Event.wait(); } Queue.wait_and_throw(); diff --git a/sycl/test-e2e/Graph/Inputs/buffer_copy.cpp b/sycl/test-e2e/Graph/Inputs/buffer_copy.cpp index 9770e61fe8bc5..3430cc8cf9124 100644 --- a/sycl/test-e2e/Graph/Inputs/buffer_copy.cpp +++ b/sycl/test-e2e/Graph/Inputs/buffer_copy.cpp @@ -110,6 +110,7 @@ int main() { CGH.depends_on(Event); CGH.ext_oneapi_graph(GraphExec); }); + Event.wait(); } Queue.wait_and_throw(); } diff --git a/sycl/test-e2e/Graph/Inputs/buffer_copy_2d.cpp b/sycl/test-e2e/Graph/Inputs/buffer_copy_2d.cpp index 268617721431e..2b2020d68faf9 100644 --- a/sycl/test-e2e/Graph/Inputs/buffer_copy_2d.cpp +++ b/sycl/test-e2e/Graph/Inputs/buffer_copy_2d.cpp @@ -105,6 +105,7 @@ int main() { CGH.depends_on(Event); CGH.ext_oneapi_graph(GraphExec); }); + Event.wait(); } Queue.wait_and_throw(); } diff --git a/sycl/test-e2e/Graph/Inputs/buffer_ordering.cpp b/sycl/test-e2e/Graph/Inputs/buffer_ordering.cpp index c3307d8877752..708a270e7b735 100644 --- a/sycl/test-e2e/Graph/Inputs/buffer_ordering.cpp +++ b/sycl/test-e2e/Graph/Inputs/buffer_ordering.cpp @@ -72,7 +72,8 @@ int main() { }); // Buffer elements set to 10 - Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(ExecGraph); }); + auto Event = + Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(ExecGraph); }); // Buffer elements set to 20 Queue.submit([&](handler &CGH) { @@ -83,6 +84,7 @@ int main() { }); }); + Event.wait(); // Buffer elements set to 22 Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(ExecGraph); }); diff --git a/sycl/test-e2e/Graph/Inputs/queue_shortcuts.cpp b/sycl/test-e2e/Graph/Inputs/queue_shortcuts.cpp index 97abbf1f68d8e..7e5cc9c3a99d4 100644 --- a/sycl/test-e2e/Graph/Inputs/queue_shortcuts.cpp +++ b/sycl/test-e2e/Graph/Inputs/queue_shortcuts.cpp @@ -38,12 +38,14 @@ int main() { // Execute several iterations of the graph using the different shortcuts event Event = Queue.ext_oneapi_graph(GraphExec); + Event.wait(); assert(Iterations > 2); const size_t LoopIterations = Iterations - 2; std::vector Events(LoopIterations); for (unsigned n = 0; n < LoopIterations; n++) { Events[n] = Queue.ext_oneapi_graph(GraphExec, Event); + Events[n].wait(); } Queue.ext_oneapi_graph(GraphExec, Events).wait(); diff --git a/sycl/test-e2e/Graph/Inputs/sub_graph.cpp b/sycl/test-e2e/Graph/Inputs/sub_graph.cpp index c1d5051158ca1..203bb41544fa2 100644 --- a/sycl/test-e2e/Graph/Inputs/sub_graph.cpp +++ b/sycl/test-e2e/Graph/Inputs/sub_graph.cpp @@ -103,6 +103,7 @@ int main() { CGH.depends_on(Event); CGH.ext_oneapi_graph(MainGraphExec); }); + Event.wait(); } Queue.wait_and_throw(); diff --git a/sycl/test-e2e/Graph/Inputs/temp_buffer_reinterpret.cpp b/sycl/test-e2e/Graph/Inputs/temp_buffer_reinterpret.cpp index 5a71c39da70b9..b5b98c4a884cc 100644 --- a/sycl/test-e2e/Graph/Inputs/temp_buffer_reinterpret.cpp +++ b/sycl/test-e2e/Graph/Inputs/temp_buffer_reinterpret.cpp @@ -48,6 +48,7 @@ int main() { CGH.depends_on(Event); CGH.ext_oneapi_graph(GraphExec); }); + Event.wait(); } Queue.copy(BufferA.get_access(), DataA.data()); diff --git a/sycl/test-e2e/Graph/Inputs/usm_copy.cpp b/sycl/test-e2e/Graph/Inputs/usm_copy.cpp index cabf0e61e5ffe..640565364cc52 100644 --- a/sycl/test-e2e/Graph/Inputs/usm_copy.cpp +++ b/sycl/test-e2e/Graph/Inputs/usm_copy.cpp @@ -103,6 +103,7 @@ int main() { CGH.depends_on(Event); CGH.ext_oneapi_graph(GraphExec); }); + Event.wait(); } Queue.copy(PtrA, DataA.data(), Size, Event); diff --git a/sycl/test-e2e/Graph/RecordReplay/after_use.cpp b/sycl/test-e2e/Graph/RecordReplay/after_use.cpp index c08ac268eea9a..bcfa06e6148bd 100644 --- a/sycl/test-e2e/Graph/RecordReplay/after_use.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/after_use.cpp @@ -54,6 +54,7 @@ int main() { CGH.depends_on(Event); CGH.ext_oneapi_graph(GraphExec); }); + Event.wait(); } Queue.wait_and_throw(); diff --git a/sycl/test-e2e/Graph/RecordReplay/barrier_with_work.cpp b/sycl/test-e2e/Graph/RecordReplay/barrier_with_work.cpp index cb2b9ddfa9a53..e09fe4bbf9ef7 100644 --- a/sycl/test-e2e/Graph/RecordReplay/barrier_with_work.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/barrier_with_work.cpp @@ -100,6 +100,7 @@ int main() { CGH.depends_on(Event); CGH.ext_oneapi_graph(GraphExec); }); + Event.wait(); } Queue.wait_and_throw(); diff --git a/sycl/test-e2e/Graph/Threading/submit.cpp b/sycl/test-e2e/Graph/Threading/submit.cpp deleted file mode 100644 index 1276836669259..0000000000000 --- a/sycl/test-e2e/Graph/Threading/submit.cpp +++ /dev/null @@ -1,82 +0,0 @@ -// REQUIRES: cuda || level_zero, gpu -// RUN: %{build_pthread_inc} -o %t.out -// RUN: %{run} %t.out -// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} -// -// CHECK-NOT: LEAK - -// Test calling queue::submit(graph) in a threaded situation. -// The second run is to check that there are no leaks reported with the embedded -// ZE_DEBUG=4 testing capability. - -#include "../graph_common.hpp" - -#include - -int main() { - queue Queue{{sycl::ext::intel::property::queue::no_immediate_command_list{}}}; - - using T = int; - - const unsigned NumThreads = std::thread::hardware_concurrency(); - std::vector DataA(Size), DataB(Size), DataC(Size); - - std::iota(DataA.begin(), DataA.end(), 1); - std::iota(DataB.begin(), DataB.end(), 10); - std::iota(DataC.begin(), DataC.end(), 1000); - - std::vector ReferenceA(DataA), ReferenceB(DataB), ReferenceC(DataC); - calculate_reference_data(NumThreads, Size, ReferenceA, ReferenceB, - ReferenceC); - - exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; - - T *PtrA = malloc_device(Size, Queue); - T *PtrB = malloc_device(Size, Queue); - T *PtrC = malloc_device(Size, Queue); - - Queue.copy(DataA.data(), PtrA, Size); - Queue.copy(DataB.data(), PtrB, Size); - Queue.copy(DataC.data(), PtrC, Size); - Queue.wait_and_throw(); - - Graph.begin_recording(Queue); - run_kernels_usm(Queue, Size, PtrA, PtrB, PtrC); - Graph.end_recording(); - - Barrier SyncPoint{NumThreads}; - - auto GraphExec = Graph.finalize(); - auto SubmitGraph = [&]() { - SyncPoint.wait(); - Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExec); }); - }; - - std::vector Threads; - Threads.reserve(NumThreads); - - for (unsigned i = 0; i < NumThreads; ++i) { - Threads.emplace_back(SubmitGraph); - } - - for (unsigned i = 0; i < NumThreads; ++i) { - Threads[i].join(); - } - - Queue.wait_and_throw(); - - Queue.copy(PtrA, DataA.data(), Size); - Queue.copy(PtrB, DataB.data(), Size); - Queue.copy(PtrC, DataC.data(), Size); - Queue.wait_and_throw(); - - free(PtrA, Queue); - free(PtrB, Queue); - free(PtrC, Queue); - - assert(ReferenceA == DataA); - assert(ReferenceB == DataB); - assert(ReferenceC == DataC); - - return 0; -} diff --git a/sycl/test-e2e/Graph/submission_while_executing.cpp b/sycl/test-e2e/Graph/submission_while_executing.cpp new file mode 100644 index 0000000000000..316bc6f0ecafb --- /dev/null +++ b/sycl/test-e2e/Graph/submission_while_executing.cpp @@ -0,0 +1,96 @@ +// REQUIRES: cuda || level_zero, gpu +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} +// +// CHECK-NOT: LEAK + +// Test calling queue::submit(graph) while the previous submission of graph has +// not been completed. The second run is to check that there are no leaks +// reported with the embedded ZE_DEBUG=4 testing capability. + +#include "graph_common.hpp" + +int main() { + queue Queue{{sycl::ext::intel::property::queue::no_immediate_command_list{}}}; + + using T = int; + + size_t LargeSize = + 1000000; // we use large Size to increase the kernel execution time + size_t NumIterations = 10; + size_t SuccessfulSubmissions = 0; + + std::vector DataA(LargeSize), DataB(LargeSize), DataC(LargeSize); + + std::iota(DataA.begin(), DataA.end(), 1); + std::iota(DataB.begin(), DataB.end(), 10); + std::iota(DataC.begin(), DataC.end(), 1000); + + std::vector ReferenceA(DataA), ReferenceB(DataB), ReferenceC(DataC); + + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + T *PtrA = malloc_device(LargeSize, Queue); + T *PtrB = malloc_device(LargeSize, Queue); + T *PtrC = malloc_device(LargeSize, Queue); + + Queue.copy(DataA.data(), PtrA, LargeSize); + Queue.copy(DataB.data(), PtrB, LargeSize); + Queue.copy(DataC.data(), PtrC, LargeSize); + Queue.wait_and_throw(); + + Graph.begin_recording(Queue); + run_kernels_usm(Queue, LargeSize, PtrA, PtrB, PtrC); + Graph.end_recording(); + + auto GraphExec = Graph.finalize(); + + // Serial Submissions + for (unsigned i = 0; i < NumIterations; ++i) { + Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExec); }); + Queue.wait_and_throw(); + } + + // Concurrent Submissions + sycl::event Event; + sycl::info::event_command_status PreEventInfo = + sycl::info::event_command_status::ext_oneapi_unknown; + std::error_code ErrorCode = make_error_code(sycl::errc::success); + for (unsigned i = 0; i < NumIterations; ++i) { + try { + Event = + Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExec); }); + } catch (const sycl::exception &e) { + ErrorCode = e.code(); + } + if ((PreEventInfo == sycl::info::event_command_status::submitted) || + (PreEventInfo == sycl::info::event_command_status::running)) { + assert(ErrorCode == sycl::errc::invalid); + } else { + // Submission has succeeded + SuccessfulSubmissions++; + } + PreEventInfo = + Event.get_info(); + } + Queue.wait_and_throw(); + + Queue.copy(PtrA, DataA.data(), LargeSize); + Queue.copy(PtrB, DataB.data(), LargeSize); + Queue.copy(PtrC, DataC.data(), LargeSize); + Queue.wait_and_throw(); + + free(PtrA, Queue); + free(PtrB, Queue); + free(PtrC, Queue); + + // Compute the reference based on the total number of successful executions + calculate_reference_data(NumIterations + SuccessfulSubmissions, LargeSize, + ReferenceA, ReferenceB, ReferenceC); + assert(ReferenceA == DataA); + assert(ReferenceB == DataB); + assert(ReferenceC == DataC); + + return 0; +}