Skip to content

Commit

Permalink
[SYCL][Graph][Doc] Add SYCL-Graph usage guide and example doc
Browse files Browse the repository at this point in the history
- Move examples from spec to reduce bloat
- Adds some usage guidelines for common scenarios
- Remove reductions from examples since they are not supported
  • Loading branch information
Bensuo committed Aug 6, 2024
1 parent b575283 commit abb785a
Show file tree
Hide file tree
Showing 3 changed files with 519 additions and 351 deletions.
354 changes: 3 additions & 351 deletions sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -1953,358 +1953,10 @@ code `invalid` if a user tries to add them to a graph.
Removing this restriction is something we may look at for future revisions of
`sycl_ext_oneapi_graph`.

== Examples
== Examples and Usage Guide

[NOTE]
====
The examples below demonstrate intended usage of the extension, but may not be
compatible with the proof-of-concept implementation, as the proof-of-concept
implementation is currently under development.
====

Examples for demonstrative purposes only, and may leave out details such as how
input data is set.

=== Dot Product

[source,c++]
----
...
#include <sycl/ext/oneapi/experimental/graph.hpp>
int main() {
namespace sycl_ext = sycl::ext::oneapi::experimental;
const size_t n = 10;
float alpha = 1.0f;
float beta = 2.0f;
float gamma = 3.0f;
sycl::queue q;
sycl_ext::command_graph g(q.get_context(), q.get_device());
float *dotp = sycl::malloc_shared<float>(1, q);
float *x = sycl::malloc_device<float>(n, q);
float *y = sycl::malloc_device<float>(n, q);
float *z = sycl::malloc_device<float>(n, q);
// Add commands to the graph to create the following topology.
//
// i
// / \
// a b
// \ /
// c
/* init data on the device */
auto node_i = g.add([&](sycl::handler& h) {
h.parallel_for(n, [=](sycl::id<1> it){
const size_t i = it[0];
x[i] = 1.0f;
y[i] = 2.0f;
z[i] = 3.0f;
});
});
auto node_a = g.add([&](sycl::handler& h) {
h.parallel_for(sycl::range<1>{n}, [=](sycl::id<1> it) {
const size_t i = it[0];
x[i] = alpha * x[i] + beta * y[i];
});
}, { sycl_ext::property::node::depends_on(node_i)});
auto node_b = g.add([&](sycl::handler& h) {
h.parallel_for(sycl::range<1>{n}, [=](sycl::id<1> it) {
const size_t i = it[0];
z[i] = gamma * z[i] + beta * y[i];
});
}, { sycl_ext::property::node::depends_on(node_i)});
auto node_c = g.add(
[&](sycl::handler& h) {
h.parallel_for(sycl::range<1>{n},
sycl::reduction(dotp, 0.0f, std::plus()),
[=](sycl::id<1> it, auto &sum) {
const size_t i = it[0];
sum += x[i] * z[i];
});
},
{ sycl_ext::property::node::depends_on(node_a, node_b)});
auto exec = g.finalize();
// use queue shortcut for graph submission
q.ext_oneapi_graph(exec).wait();
// memory can be freed inside or outside the graph
sycl::free(x, q);
sycl::free(y, q);
sycl::free(z, q);
sycl::free(dotp, q);
return 0;
}
...
----

=== Diamond Dependency

The following snippet of code shows how a SYCL `queue` can be put into a
recording state, which allows a `command_graph` object to be populated by the
command-groups submitted to the queue. Once the graph is complete, recording
finishes on the queue to put it back into the default executing state. The
graph is then finalized so that no more nodes can be added. Lastly, the graph is
submitted in its entirety for execution via
`handler::ext_oneapi_graph(command_graph<graph_state::executable>)`.

[source, c++]
----
using namespace sycl;
namespace sycl_ext = sycl::ext::oneapi::experimental;
queue q{default_selector{}};
// Lifetime of buffers must exceed the lifetime of graphs they are used in.
buffer<T> bufferA{dataA.data(), range<1>{elements}};
bufferA.set_write_back(false);
buffer<T> bufferB{dataB.data(), range<1>{elements}};
bufferB.set_write_back(false);
buffer<T> bufferC{dataC.data(), range<1>{elements}};
bufferC.set_write_back(false);
{
// New object representing graph of command-groups
sycl_ext::command_graph graph(q.get_context(), q.get_device(),
{sycl_ext::property::graph::assume_buffer_outlives_graph{}});
// `q` will be put in the recording state where commands are recorded to
// `graph` rather than submitted for execution immediately.
graph.begin_recording(q);
// Record commands to `graph` with the following topology.
//
// increment_kernel
// / \
// A->/ A->\
// / \
// add_kernel subtract_kernel
// \ /
// B->\ C->/
// \ /
// decrement_kernel
q.submit([&](handler& cgh) {
auto pData = bufferA.get_access<access::mode::read_write>(cgh);
cgh.parallel_for<increment_kernel>(range<1>(elements),
[=](item<1> id) { pData[id]++; });
});
q.submit([&](handler& cgh) {
auto pData1 = bufferA.get_access<access::mode::read>(cgh);
auto pData2 = bufferB.get_access<access::mode::read_write>(cgh);
cgh.parallel_for<add_kernel>(range<1>(elements),
[=](item<1> id) { pData2[id] += pData1[id]; });
});
q.submit([&](handler& cgh) {
auto pData1 = bufferA.get_access<access::mode::read>(cgh);
auto pData2 = bufferC.get_access<access::mode::read_write>(cgh);
cgh.parallel_for<subtract_kernel>(
range<1>(elements), [=](item<1> id) { pData2[id] -= pData1[id]; });
});
q.submit([&](handler& cgh) {
auto pData1 = bufferB.get_access<access::mode::read_write>(cgh);
auto pData2 = bufferC.get_access<access::mode::read_write>(cgh);
cgh.parallel_for<decrement_kernel>(range<1>(elements), [=](item<1> id) {
pData1[id]--;
pData2[id]--;
});
});
// queue `q` will be returned to the executing state where commands are
// submitted immediately for extension.
graph.end_recording();
// Finalize the modifiable graph to create an executable graph that can be
// submitted for execution.
auto exec_graph = graph.finalize();
// Execute graph
q.submit([&](handler& cgh) {
cgh.ext_oneapi_graph(exec_graph);
}).wait();
}
// Check output using host accessors
host_accessor hostAccA(bufferA);
host_accessor hostAccB(bufferB);
host_accessor hostAccC(bufferC);
...
----

=== Dynamic Parameter Update

Example showing a graph with a single kernel node that is created using a kernel
bundle with `handler::set_args()` and having its node arguments updated.

[source,c++]
----
...
using namespace sycl;
namespace sycl_ext = sycl::ext::oneapi::experimental;
queue myQueue;
auto myContext = myQueue.get_context();
auto myDevice = myQueue.get_device();
// USM allocations for kernel input/output
const size_t n = 1024;
int *ptrX = malloc_shared<int>(n, myQueue);
int *ptrY = malloc_device<int>(n, myQueue);
int *ptrZ = malloc_shared<int>(n, myQueue);
int *ptrQ = malloc_device<int>(n, myQueue);
// Kernel loaded from kernel bundle
const std::vector<kernel_id> builtinKernelIds =
myDevice.get_info<info::device::built_in_kernel_ids>();
kernel_bundle<bundle_state::executable> myBundle =
get_kernel_bundle(myContext, { myDevice }, builtinKernelIds);
kernel builtinKernel = myBundle.get_kernel(builtinKernelIds[0]);
// Graph containing a two kernels node
sycl_ext::command_graph myGraph(myContext, myDevice);
int myScalar = 42;
// Create graph dynamic parameters
dynamic_parameter dynParamInput(myGraph, ptrX);
dynamic_parameter dynParamScalar(myGraph, myScalar);
// First node uses ptrX as an input & output parameter, with operand
// mySclar as another argument.
node nodeA = myGraph.add([&](handler& cgh) {
cgh.set_args(dynParamInput, ptrY, dynParamScalar);
cgh.parallel_for(range {n}, builtinKernel);
});
// Create an executable graph with the updatable property.
auto execGraph = myGraph.finalize({sycl_ext::property::graph::updatable});
// Execute graph, then update without needing to wait for it to complete
myQueue.ext_oneapi_graph(execGraph);
// Change ptrX argument to node A to ptrZ
dynParamInput.update(ptrZ);
// Change myScalar argument to node A to newScalar
int newScalar = 12;
dynParamScalar.update(newScalar);
// Update nodeA in the executable graph with the new parameters
execGraph.update(nodeA);
// Execute graph again
myQueue.ext_oneapi_graph(execGraph);
myQueue.wait();
sycl::free(ptrX, myQueue);
sycl::free(ptrY, myQueue);
sycl::free(ptrZ, myQueue);
sycl::free(ptrQ, myQueue);
----

Example snippet showing how to use accessors with `dynamic_parameter` update:
[source,c++]
----
sycl::buffer bufferA{...};
sycl::buffer bufferB{...};
// Create graph dynamic parameter using a placeholder accessor, since the
// sycl::handler is not available here outside of the command-group scope.
dynamic_parameter dynParamAccessor(myGraph, bufferA.get_access());
node nodeA = myGraph.add([&](handler& cgh) {
// Require the accessor contained in the dynamic paramter
cgh.require(dynParamAccessor);
// Set the arg on the kernel using the dynamic parameter directly
cgh.set_args(dynParamAccessor);
cgh.parallel_for(range {n}, builtinKernel);
});
...
// Update the dynamic parameter with a placeholder accessor from bufferB instead
dynParamAccessor.update(bufferB.get_access());
----

=== Whole Graph Update

Example that shows recording and updating several nodes with different
parameters using <<whole-graph-update, Whole Graph Update>>.

[source,c++]
----
...
using namespace sycl;
namespace sycl_ext = sycl::ext::oneapi::experimental;
// Enqueue several kernels which use inputPtr
void run_kernels(int* inputPtr, queue syclQueue){
event eventA = syclQueue.submit([&](handler& CGH){
CGH.parallel_for(...);
});
event eventB = syclQueue.submit([&](handler& CGH){
CGH.depends_on(eventA);
CGH.parallel_for(...);
});
syclQueue.submit([&](handler& CGH){
CGH.depends_on(eventB);
CGH.parallel_for(...);
});
}
...
queue myQueue;
// USM allocations
const size_t n = 1024;
int *ptrA = malloc_device<int>(n, myQueue);
int *ptrB = malloc_device<int>(n, myQueue);
// Main graph which will be updated later
sycl_ext::command_graph mainGraph(myQueue);
// Record the kernels to mainGraph, using ptrA
mainGraph.begin_recording(myQueue);
run_kernels(ptrA, myQueue);
mainGraph.end_recording();
auto execMainGraph = mainGraph.finalize({sycl_ext::property::graph::updatable});
// Execute execMainGraph
myQueue.ext_oneapi_graph(execMainGraph);
// Record a second graph which records the same kernels, but using ptrB instead
sycl_ext::command_graph updateGraph(myQueue);
updateGraph.begin_recording(myQueue);
run_kernels(ptrB, myQueue);
updateGraph.end_recording();
// Update execMainGraph using updateGraph. We do not need to finalize
// updateGraph (this would be expensive)
execMainGraph.update(updateGraph);
// Execute execMainGraph again, which will now be operating on ptrB instead of
// ptrA
myQueue.ext_oneapi_graph(execMainGraph);
----
Detailed code examples and usage guidelines are provided in the
link:../../SYCLGraphUsageGuide.md[SYCL Graph Usage Guide].

== Future Direction [[future-direction]]

Expand Down
1 change: 1 addition & 0 deletions sycl/doc/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ Using oneAPI DPC++ for Application Development
User API Reference <https://intel.github.io/llvm-docs/doxygen/group__sycl__api.html>
EnvironmentVariables
MultiTileCardWithLevelZero
syclgraph/SYCLGraphUsageGuide

Design Documents for the oneAPI DPC++ Compiler
----------------------------------------------
Expand Down
Loading

0 comments on commit abb785a

Please sign in to comment.