Skip to content

Commit

Permalink
[SYCL][Graph] Fix bug when host-task is submitted to in-order queue (#322)
Browse files Browse the repository at this point in the history

* [SYCL][Graph] Fix bug when host-task is submitted to in-order queue

When a host-task is submitted to an in-order queue, the dependency between this host-task and its successor is explicitly handled.
However, when we record an in-order queue, the recorded CGs are not part of the regular in-order queue execution sequence.
Instead, inter-CG dependencies are managed by the graph implementation.
This PR implements this point and ensures that recording an in-order queue does not impact the normal execution sequence.
Tests (e2e and unit tests) have been added to check it.

 Adds and Renames tests (e2e and unitests)
  • Loading branch information
mfrancepillois committed Sep 26, 2023
1 parent 7429d3f commit 52a1117
Show file tree
Hide file tree
Showing 6 changed files with 499 additions and 6 deletions.
30 changes: 25 additions & 5 deletions sycl/source/detail/queue_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,11 @@ event queue_impl::memset(const std::shared_ptr<detail::queue_impl> &Self,
if (MContext->is_host())
return MDiscardEvents ? createDiscardedEvent() : event();

if (isInOrder()) {
// When a queue is recorded by a graph, the dependencies are managed in the
// graph implementation. Additionally, CGs recorded for a graph are outside of
// the in-order queue execution sequence. Therefore, these CGs must not
// update MLastEvent.
if (isInOrder() && (getCommandGraph() == nullptr)) {
MLastEvent = ResEvent;
// We don't create a command group for usm commands, so set it to None.
// This variable is used to perform explicit dependency management when
Expand Down Expand Up @@ -198,7 +202,11 @@ event queue_impl::memcpy(const std::shared_ptr<detail::queue_impl> &Self,
if (MContext->is_host())
return MDiscardEvents ? createDiscardedEvent() : event();

if (isInOrder()) {
// When a queue is recorded by a graph, the dependencies are managed in the
// graph implementation. Additionally, CGs recorded for a graph are outside of
// the in-order queue execution sequence. Therefore, these CGs must not
// update MLastEvent.
if (isInOrder() && (getCommandGraph() == nullptr)) {
MLastEvent = ResEvent;
// We don't create a command group for usm commands, so set it to None.
// This variable is used to perform explicit dependency management when
Expand Down Expand Up @@ -241,7 +249,11 @@ event queue_impl::mem_advise(const std::shared_ptr<detail::queue_impl> &Self,
if (MContext->is_host())
return MDiscardEvents ? createDiscardedEvent() : event();

if (isInOrder()) {
// When a queue is recorded by a graph, the dependencies are managed in the
// graph implementation. Additionally, CGs recorded for a graph are outside of
// the in-order queue execution sequence. Therefore, these CGs must not
// update MLastEvent.
if (isInOrder() && (getCommandGraph() == nullptr)) {
MLastEvent = ResEvent;
// We don't create a command group for usm commands, so set it to None.
// This variable is used to perform explicit dependency management when
Expand Down Expand Up @@ -286,7 +298,11 @@ event queue_impl::memcpyToDeviceGlobal(
if (MContext->is_host())
return MDiscardEvents ? createDiscardedEvent() : event();

if (isInOrder()) {
// When a queue is recorded by a graph, the dependencies are managed in the
// graph implementation. Additionally, CGs recorded for a graph are outside of
// the in-order queue execution sequence. Therefore, these CGs must not
// update MLastEvent.
if (isInOrder() && (getCommandGraph() == nullptr)) {
MLastEvent = ResEvent;
// We don't create a command group for usm commands, so set it to None.
// This variable is used to perform explicit dependency management when
Expand Down Expand Up @@ -331,7 +347,11 @@ event queue_impl::memcpyFromDeviceGlobal(
if (MContext->is_host())
return MDiscardEvents ? createDiscardedEvent() : event();

if (isInOrder()) {
// When a queue is recorded by a graph, the dependencies are managed in the
// graph implementation. Additionally, CGs recorded for a graph are outside of
// the in-order queue execution sequence. Therefore, these CGs must not
// update MLastEvent.
if (isInOrder() && (getCommandGraph() == nullptr)) {
MLastEvent = ResEvent;
// We don't create a command group for usm commands, so set it to None.
// This variable is used to perform explicit dependency management when
Expand Down
6 changes: 5 additions & 1 deletion sycl/source/detail/queue_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -698,7 +698,11 @@ class queue_impl {
template <typename HandlerType = handler>
void finalizeHandler(HandlerType &Handler, const CG::CGTYPE &Type,
event &EventRet) {
if (MIsInorder) {
// When a queue is recorded by a graph, the dependencies are managed in the
// graph implementation. Additionally, CGs recorded for a graph are outside of
// the in-order queue execution sequence. Therefore, these CGs must not
// update MLastEvent.
if (MIsInorder && (getCommandGraph() == nullptr)) {

auto IsExpDepManaged = [](const CG::CGTYPE &Type) {
return Type == CG::CGTYPE::CodeplayHostTask;
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
// REQUIRES: cuda || level_zero, gpu
// RUN: %{build} -o %t.out
// RUN: %{run} %t.out

// Tests submitting a host-task to an in-order queue before recording
// commands from it.

#include "../graph_common.hpp"

// Verifies that a host-task submitted to an in-order queue before graph
// recording begins does not interfere with the recorded commands nor with
// the queue's normal execution sequence afterwards.
int main() {
  using T = int;

  queue Queue{{sycl::ext::intel::property::queue::no_immediate_command_list{},
               sycl::property::queue::in_order{}}};

  // Devices without shared USM allocations cannot run this test.
  if (!Queue.get_device().has(sycl::aspect::usm_shared_allocations))
    return 0;

  T *Data = sycl::malloc_shared<T>(Size, Queue);

  ext::oneapi::experimental::command_graph Graph{Queue.get_context(),
                                                 Queue.get_device()};

  // Host-task submitted before recording: initializes Data[i] = i.
  Queue.submit([&](handler &CGH) {
    CGH.host_task([=]() {
      for (size_t Idx = 0; Idx < Size; Idx++)
        Data[Idx] = static_cast<T>(Idx);
    });
  });

  Graph.begin_recording(Queue);

  // Recorded command group: Data[i] += i.
  auto GraphEvent = Queue.submit([&](handler &CGH) {
    CGH.single_task<class TestKernel2>([=]() {
      for (size_t Idx = 0; Idx < Size; Idx++)
        Data[Idx] += static_cast<T>(Idx);
    });
  });

  Graph.end_recording(Queue);

  auto GraphExec = Graph.finalize();
  Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExec); });

  // Regular kernel submitted after recording ended: Data[i] *= i.
  Queue.submit([&](handler &CGH) {
    CGH.single_task<class TestKernel3>([=]() {
      for (size_t Idx = 0; Idx < Size; Idx++)
        Data[Idx] *= static_cast<T>(Idx);
    });
  });

  Queue.wait_and_throw();

  // Expected value per element: ((i + i) * i) once the host-task, the
  // executed graph, and the follow-up kernel have all run in order.
  for (size_t Idx = 0; Idx < Size; Idx++)
    assert(Data[Idx] == ((Idx + Idx) * Idx));

  sycl::free(Data, Queue);

  return 0;
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
// REQUIRES: cuda || level_zero, gpu
// RUN: %{build} -o %t.out
// RUN: %{run} %t.out

// Tests submitting memcpy to an in-order queue before recording
// commands from it.

#include "../graph_common.hpp"

// Verifies that a memcpy submitted to an in-order queue *before* graph
// recording begins is correctly ordered with respect to the recorded
// commands and the queue's normal execution sequence.
int main() {
  using T = int;

  queue Queue{{sycl::ext::intel::property::queue::no_immediate_command_list{},
               sycl::property::queue::in_order{}}};

  // Devices without shared USM allocations cannot run this test.
  if (!Queue.get_device().has(sycl::aspect::usm_shared_allocations))
    return 0;

  std::vector<T> TestDataIn(Size);
  T *TestData = sycl::malloc_shared<T>(Size, Queue);
  T *TestDataOut = sycl::malloc_shared<T>(Size, Queue);

  ext::oneapi::experimental::command_graph Graph{Queue.get_context(),
                                                 Queue.get_device()};

  // NOTE: std::memset writes the byte 0x01 into every byte, so each T holds
  // 0x01010101 rather than 1; the reference below uses the same memset, so
  // the final comparison stays consistent.
  std::memset(TestDataIn.data(), 1, Size * sizeof(T));
  Queue.memcpy(TestData, TestDataIn.data(), Size * sizeof(T));

  Graph.begin_recording(Queue);

  // Recorded command group: TestData[i] += i.
  auto GraphEvent = Queue.submit([&](handler &CGH) {
    CGH.single_task<class TestKernel2>([=]() {
      for (size_t i = 0; i < Size; i++) {
        TestData[i] += static_cast<T>(i);
      }
    });
  });

  Graph.end_recording(Queue);

  auto GraphExec = Graph.finalize();
  Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExec); });

  Queue.memcpy(TestDataOut, TestData, Size * sizeof(T));

  Queue.wait_and_throw();

  std::vector<T> Reference(Size);
  std::memset(Reference.data(), 1, Size * sizeof(T));
  for (size_t i = 0; i < Size; i++) {
    Reference[i] += i;
  }

  // Check Outputs
  for (size_t i = 0; i < Size; i++) {
    assert(TestDataOut[i] == Reference[i]);
  }

  sycl::free(TestData, Queue);
  // Fix: TestDataOut was allocated with malloc_shared but never freed.
  sycl::free(TestDataOut, Queue);

  return 0;
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
// REQUIRES: cuda || level_zero, gpu
// RUN: %{build} -o %t.out
// RUN: %{run} %t.out

// Tests submitting memset to an in-order queue before recording
// commands from it.

#include "../graph_common.hpp"

// Verifies that a memset submitted to an in-order queue *before* graph
// recording begins is correctly ordered with respect to the recorded
// commands and the queue's normal execution sequence.
int main() {
  using T = int;

  queue Queue{{sycl::ext::intel::property::queue::no_immediate_command_list{},
               sycl::property::queue::in_order{}}};

  // Devices without shared USM allocations cannot run this test.
  if (!Queue.get_device().has(sycl::aspect::usm_shared_allocations))
    return 0;

  T *TestData = sycl::malloc_shared<T>(Size, Queue);
  T *TestDataOut = sycl::malloc_shared<T>(Size, Queue);

  ext::oneapi::experimental::command_graph Graph{Queue.get_context(),
                                                 Queue.get_device()};

  // NOTE: queue::memset writes the byte 0x01 into every byte, so each T
  // holds 0x01010101 rather than 1; the reference below uses std::memset
  // with the same arguments, so the final comparison stays consistent.
  Queue.memset(TestData, 1, Size * sizeof(T));

  Graph.begin_recording(Queue);

  // Recorded command group: TestData[i] += i.
  auto GraphEvent = Queue.submit([&](handler &CGH) {
    CGH.single_task<class TestKernel2>([=]() {
      for (size_t i = 0; i < Size; i++) {
        TestData[i] += static_cast<T>(i);
      }
    });
  });

  Graph.end_recording(Queue);

  auto GraphExec = Graph.finalize();
  Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExec); });

  Queue.memcpy(TestDataOut, TestData, Size * sizeof(T));

  Queue.wait_and_throw();

  std::vector<T> Reference(Size);
  std::memset(Reference.data(), 1, Size * sizeof(T));
  for (size_t i = 0; i < Size; i++) {
    Reference[i] += i;
  }

  // Check Outputs
  for (size_t i = 0; i < Size; i++) {
    assert(TestDataOut[i] == Reference[i]);
  }

  sycl::free(TestData, Queue);
  // Fix: TestDataOut was allocated with malloc_shared but never freed.
  sycl::free(TestDataOut, Queue);

  return 0;
}
Loading

0 comments on commit 52a1117

Please sign in to comment.