Skip to content

Commit

Permalink
[SYCL][Graph] Support for sycl_ext_oneapi_enqueue_barrier extension (#…
Browse files Browse the repository at this point in the history
…301)

* [SYCL][Graph] Support for sycl_ext_oneapi_enqueue_barrier extension

Adds support to handle barrier enqueuing with Record&Replay API.
Barriers are implemented as empty nodes enforcing the required dependencies.

Adds tests that check 1) correctness of graph structure when barriers have been enqueued, 2) processing behavior, 3) exception throwing if barriers are used within explicit API.

Notes:
1) Multi-queues barrier is not supported since it does not make sense with asynchronous graph execution.
2) Barriers can only be used with Record&Replay API, since barriers rely on events to enforce dependencies.

* [SYCL][Graph] Adds unitest with multiple barriers and test-e2e

Adds unitest with multiple barriers and	test-e2e.
Corrects some typos.

* Update sycl/source/detail/graph_impl.cpp

Co-authored-by: Ben Tracy <ben.tracy@codeplay.com>

---------

Co-authored-by: Ben Tracy <ben.tracy@codeplay.com>
  • Loading branch information
mfrancepillois and Bensuo committed Aug 22, 2023
1 parent 5318388 commit adaaaed
Show file tree
Hide file tree
Showing 6 changed files with 377 additions and 106 deletions.
3 changes: 0 additions & 3 deletions sycl/include/sycl/handler.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -2560,9 +2560,6 @@ class __SYCL_EXPORT handler {
/// until all commands previously submitted to this queue have entered the
/// complete state.
void ext_oneapi_barrier() {
throwIfGraphAssociated<
ext::oneapi::experimental::detail::UnsupportedGraphFeatures::
sycl_ext_oneapi_enqueue_barrier>();
throwIfActionIsCreated();
setType(detail::CG::Barrier);
}
Expand Down
40 changes: 38 additions & 2 deletions sycl/source/detail/graph_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -213,11 +213,17 @@ void graph_impl::removeRoot(const std::shared_ptr<node_impl> &Root) {

std::shared_ptr<node_impl>
graph_impl::add(const std::vector<std::shared_ptr<node_impl>> &Dep) {
// Copy deps so we can modify them
auto Deps = Dep;

const std::shared_ptr<node_impl> &NodeImpl = std::make_shared<node_impl>();

// Add any deps from the vector of extra dependencies
Deps.insert(Deps.end(), MExtraDependencies.begin(), MExtraDependencies.end());

// TODO: Encapsulate in separate function to avoid duplication
if (!Dep.empty()) {
for (auto N : Dep) {
if (!Deps.empty()) {
for (auto N : Deps) {
N->registerSuccessor(NodeImpl, N); // register successor
this->removeRoot(NodeImpl); // remove receiver from root node
// list
Expand All @@ -239,6 +245,13 @@ graph_impl::add(const std::shared_ptr<graph_impl> &Impl,
CGF(Handler);
Handler.finalize();

if (Handler.MCGType == sycl::detail::CG::Barrier) {
throw sycl::exception(
make_error_code(errc::invalid),
"The sycl_ext_oneapi_enqueue_barrier feature is not available with "
"SYCL Graph Explicit API. Please use empty nodes instead.");
}

// If the handler recorded a subgraph return that here as the relevant nodes
// have already been added. The node returned here is an empty node with
// dependencies on all the exit nodes of the subgraph.
Expand Down Expand Up @@ -319,6 +332,9 @@ graph_impl::add(sycl::detail::CG::CGTYPE CGType,
// list
Deps.insert(Deps.end(), UniqueDeps.begin(), UniqueDeps.end());

// Add any deps from the extra dependencies vector
Deps.insert(Deps.end(), MExtraDependencies.begin(), MExtraDependencies.end());

const std::shared_ptr<node_impl> &NodeImpl =
std::make_shared<node_impl>(CGType, std::move(CommandGroup));
if (!Deps.empty()) {
Expand All @@ -330,6 +346,12 @@ graph_impl::add(sycl::detail::CG::CGTYPE CGType,
} else {
this->addRoot(NodeImpl);
}

// Set barrier nodes as prerequisites (new start points) for subsequent nodes
if (CGType == sycl::detail::CG::Barrier) {
MExtraDependencies.push_back(NodeImpl);
}

return NodeImpl;
}

Expand Down Expand Up @@ -441,6 +463,20 @@ void graph_impl::makeEdge(std::shared_ptr<node_impl> Src,
removeRoot(Dest); // remove receiver from root node list
}

std::vector<sycl::detail::EventImplPtr> graph_impl::getExitNodesEvents() {
std::vector<sycl::detail::EventImplPtr> Events;
auto EnqueueExitNodesEvents = [&](std::shared_ptr<node_impl> &Node,
std::deque<std::shared_ptr<node_impl>> &) {
if (Node->MSuccessors.size() == 0) {
Events.push_back(getEventForNode(Node));
}
return false;
};

searchDepthFirst(EnqueueExitNodesEvents);
return Events;
}

// Check if nodes are empty and if so loop back through predecessors until we
// find the real dependency.
void exec_graph_impl::findRealDeps(
Expand Down
22 changes: 19 additions & 3 deletions sycl/source/detail/graph_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -160,8 +160,13 @@ class node_impl {
}

/// Query if this is an empty node.
/// Barrier nodes are also considered empty nodes since they do not embed any
/// workload but only dependencies
/// @return True if this is an empty node, false otherwise.
bool isEmpty() const { return MCGType == sycl::detail::CG::None; }
bool isEmpty() const {
return ((MCGType == sycl::detail::CG::None) ||
(MCGType == sycl::detail::CG::Barrier));
}

/// Get a deep copy of this node's command group
/// @return A unique ptr to the new command group object.
Expand Down Expand Up @@ -319,8 +324,8 @@ class node_impl {
printDotCG(Stream);
for (const auto &Dep : MPredecessors) {
auto NodeDep = Dep.lock();
Stream << " \"" << MCommandGroup.get() << "\" -> \""
<< NodeDep->MCommandGroup.get() << "\"" << std::endl;
Stream << " \"" << NodeDep->MCommandGroup.get() << "\" -> \""
<< MCommandGroup.get() << "\"" << std::endl;
}

for (std::shared_ptr<node_impl> Succ : MSuccessors) {
Expand Down Expand Up @@ -677,6 +682,11 @@ class graph_impl {
return NumberOfNodes;
}

/// Traverse the graph recursively to get the events associated with the
/// output nodes of this graph.
/// @return vector of events associated to exit nodes.
std::vector<sycl::detail::EventImplPtr> getExitNodesEvents();

private:
/// Iterate over the graph depth-first and run \p NodeFunc on each node.
/// @param NodeFunc A function which receives as input a node in the graph to
Expand Down Expand Up @@ -738,6 +748,12 @@ class graph_impl {
/// @return An empty node is used to schedule dependencies on this sub-graph.
std::shared_ptr<node_impl>
addNodesToExits(const std::list<std::shared_ptr<node_impl>> &NodeList);

/// List of nodes that must be added as extra dependencies to new nodes when
/// added to this graph.
/// This list is mainly used by barrier nodes which must be considered
/// as predecessors for all nodes subsequently added to the graph.
std::vector<std::shared_ptr<node_impl>> MExtraDependencies;
};

/// Class representing the implementation of command_graph<executable>.
Expand Down
28 changes: 21 additions & 7 deletions sycl/source/handler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -368,11 +368,28 @@ event handler::finalize() {
std::move(MArgs), std::move(CGData), MCGType, MCodeLoc));
break;
case detail::CG::Barrier:
case detail::CG::BarrierWaitlist:
CommandGroup.reset(new detail::CGBarrier(std::move(MEventsWaitWithBarrier),
std::move(CGData), MCGType,
MCodeLoc));
case detail::CG::BarrierWaitlist: {
if (auto GraphImpl = getCommandGraph(); GraphImpl != nullptr) {
// if no event to wait for was specified, we add all the previous
// nodes/events of the graph
if (MEventsWaitWithBarrier.size() == 0) {
MEventsWaitWithBarrier = GraphImpl->getExitNodesEvents();
}
CGData.MEvents.insert(std::end(CGData.MEvents),
std::begin(MEventsWaitWithBarrier),
std::end(MEventsWaitWithBarrier));
// Barrier node is implemented as an empty node in Graph
// but keep the barrier type to help managing dependencies
MCGType = detail::CG::Barrier;
CommandGroup.reset(
new detail::CG(detail::CG::Barrier, std::move(CGData), MCodeLoc));
} else {
CommandGroup.reset(
new detail::CGBarrier(std::move(MEventsWaitWithBarrier),
std::move(CGData), MCGType, MCodeLoc));
}
break;
}
case detail::CG::CopyToDeviceGlobal: {
CommandGroup.reset(new detail::CGCopyToDeviceGlobal(
MSrcPtr, MDstPtr, MImpl->MIsDeviceImageScoped, MLength, MImpl->MOffset,
Expand Down Expand Up @@ -806,9 +823,6 @@ void handler::verifyUsedKernelBundle(const std::string &KernelName) {
}

void handler::ext_oneapi_barrier(const std::vector<event> &WaitList) {
throwIfGraphAssociated<
ext::oneapi::experimental::detail::UnsupportedGraphFeatures::
sycl_ext_oneapi_enqueue_barrier>();
throwIfActionIsCreated();
MCGType = detail::CG::BarrierWaitlist;
MEventsWaitWithBarrier.resize(WaitList.size());
Expand Down
120 changes: 120 additions & 0 deletions sycl/test-e2e/Graph/RecordReplay/barrier_with_work.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
// REQUIRES: level_zero, gpu
// RUN: %{build} -o %t.out
// RUN: %{run} %t.out
// Extra run to check for leaks in Level Zero using ZE_DEBUG
// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %}
//
// CHECK-NOT: LEAK

#include "../graph_common.hpp"

//// Test Explicit API graph construction with USM.
///
/// @param Q Command-queue to make kernel submissions to.
/// @param Size Number of elements in the buffers.
/// @param DataA Pointer to first USM allocation to use in kernels.
/// @param DataB Pointer to second USM allocation to use in kernels.
/// @param DataC Pointer to third USM allocation to use in kernels.
///
/// @return Event corresponding to the exit node of the submission sequence.
template <typename T>
event run_kernels_usm_with_barrier(queue Q, const size_t Size, T *DataA,
T *DataB, T *DataC) {
// Read & write Buffer A
auto EventA = Q.submit([&](handler &CGH) {
CGH.parallel_for(range<1>(Size), [=](item<1> Id) {
auto LinID = Id.get_linear_id();
DataA[LinID]++;
});
});

Q.ext_oneapi_submit_barrier();

// Reads Buffer A
// Read & Write Buffer B
auto EventB = Q.submit([&](handler &CGH) {
CGH.parallel_for(range<1>(Size), [=](item<1> Id) {
auto LinID = Id.get_linear_id();
DataB[LinID] += DataA[LinID];
});
});

// Reads Buffer A
// Read & writes Buffer C
auto EventC = Q.submit([&](handler &CGH) {
CGH.parallel_for(range<1>(Size), [=](item<1> Id) {
auto LinID = Id.get_linear_id();
DataC[LinID] -= DataA[LinID];
});
});

Q.ext_oneapi_submit_barrier();

// Read & write Buffers B and C
auto ExitEvent = Q.submit([&](handler &CGH) {
CGH.parallel_for(range<1>(Size), [=](item<1> Id) {
auto LinID = Id.get_linear_id();
DataB[LinID]--;
DataC[LinID]--;
});
});
return ExitEvent;
}

int main() {
queue Queue;

using T = int;

std::vector<T> DataA(Size), DataB(Size), DataC(Size);

std::iota(DataA.begin(), DataA.end(), 1);
std::iota(DataB.begin(), DataB.end(), 10);
std::iota(DataC.begin(), DataC.end(), 1000);

std::vector<T> ReferenceA(DataA), ReferenceB(DataB), ReferenceC(DataC);
calculate_reference_data(Iterations, Size, ReferenceA, ReferenceB,
ReferenceC);

exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()};

T *PtrA = malloc_device<T>(Size, Queue);
T *PtrB = malloc_device<T>(Size, Queue);
T *PtrC = malloc_device<T>(Size, Queue);

Queue.copy(DataA.data(), PtrA, Size);
Queue.copy(DataB.data(), PtrB, Size);
Queue.copy(DataC.data(), PtrC, Size);
Queue.wait_and_throw();

// Add commands to graph
Graph.begin_recording(Queue);
auto ev = run_kernels_usm_with_barrier(Queue, Size, PtrA, PtrB, PtrC);
Graph.end_recording(Queue);

auto GraphExec = Graph.finalize();

event Event;
for (unsigned n = 0; n < Iterations; n++) {
Event = Queue.submit([&](handler &CGH) {
CGH.depends_on(Event);
CGH.ext_oneapi_graph(GraphExec);
});
}
Queue.wait_and_throw();

Queue.copy(PtrA, DataA.data(), Size);
Queue.copy(PtrB, DataB.data(), Size);
Queue.copy(PtrC, DataC.data(), Size);
Queue.wait_and_throw();

free(PtrA, Queue);
free(PtrB, Queue);
free(PtrC, Queue);

assert(ReferenceA == DataA);
assert(ReferenceB == DataB);
assert(ReferenceC == DataC);

return 0;
}
Loading

0 comments on commit adaaaed

Please sign in to comment.