Skip to content

Commit

Permalink
[SYCL][Graph] Update design doc for copy optimization and add test
Browse files Browse the repository at this point in the history
- Update UR tag to include L0 command-buffer copy engine optimization
- Add test which mixes copy and kernel commands
- Update design doc to detail copy engine optimization
  • Loading branch information
mfrancepillois authored and EwanC committed Jun 14, 2024
1 parent 73cf85d commit 0d13b58
Show file tree
Hide file tree
Showing 3 changed files with 122 additions and 8 deletions.
18 changes: 18 additions & 0 deletions sycl/doc/design/CommandGraph.md
Original file line number Diff line number Diff line change
Expand Up @@ -438,6 +438,24 @@ Level Zero:
Future work will include exploring L0 API extensions to improve the mapping of
UR command-buffer to L0 command-list.

#### Copy Engine

For performance considerations, the Unified Runtime Level Zero adapter uses
different Level Zero command-queues to submit compute kernels and memory
operations when the device has a dedicated copy engine. To take advantage of the
copy engine when available, the graph workload can also be split between memory
operations and compute kernels. To achieve this, two graph workload
command-lists live simultaneously in a command-buffer.

When the command-buffer is finalized, memory operations (e.g. buffer copy,
buffer fill, ...) are enqueued in the *copy* command-list while the other
commands are enqueued in the compute command-list. On submission, if not empty,
the *copy* command-list is sent to the main copy command-queue while the compute
command-list is sent to the compute command-queue.

Both are executed concurrently. Synchronization between the command-lists is
handled by Level Zero events.

### CUDA

The SYCL Graph CUDA backend relies on the
Expand Down
10 changes: 2 additions & 8 deletions sycl/plugins/unified_runtime/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -109,14 +109,8 @@ if(SYCL_PI_UR_USE_FETCH_CONTENT)
set(UNIFIED_RUNTIME_TAG b13c5e1f85e01fef7de7568835092f8592ded6e4)

fetch_adapter_source(level_zero
${UNIFIED_RUNTIME_REPO}
# commit 8788bd13cceb3f8e6338538b624652e6249a4543
# Merge: 78d02039 3f502d8f
# Author: Kenneth Benzie (Benie) <k.benzie@codeplay.com>
# Date: Wed Jun 12 13:13:52 2024 +0100
# Merge pull request #1697 from againull/review/againull/l0_loader
# [L0] Add flexibility to change level zero repo
8788bd13cceb3f8e6338538b624652e6249a4543
"https://github.com/bensuo/unified-runtime.git"
"cmd-buf-copy-queue"
)

fetch_adapter_source(opencl
Expand Down
102 changes: 102 additions & 0 deletions sycl/test-e2e/Graph/ValidUsage/linear_graph_copy.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
// RUN: %{build} -o %t.out
// RUN: %{run} %t.out
// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG
// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
// Extra run to check for immediate-command-list in Level Zero
// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
//

// Tests that the optimization to use the L0 Copy Engine for memory commands
// does not interfere with the linear graph optimization

#include "../graph_common.hpp"

#include <sycl/properties/queue_properties.hpp>

int main() {
queue Queue{{sycl::property::queue::in_order{}}};

using T = int;

const T ModValue = 7;
std::vector<T> DataA(Size), DataB(Size), DataC(Size);

std::iota(DataA.begin(), DataA.end(), 1);
std::iota(DataB.begin(), DataB.end(), 10);
std::iota(DataC.begin(), DataC.end(), 1000);

// Create reference data for output
std::vector<T> ReferenceA(DataA), ReferenceB(DataB), ReferenceC(DataC);
for (size_t i = 0; i < Iterations; i++) {
for (size_t j = 0; j < Size; j++) {
ReferenceA[j] += ModValue;
ReferenceB[j] = ReferenceA[j];
ReferenceB[j] -= ModValue;
ReferenceC[j] = ReferenceB[j];
ReferenceC[j] += ModValue;
}
}

ext::oneapi::experimental::command_graph Graph{Queue.get_context(),
Queue.get_device()};

T *PtrA = malloc_device<T>(Size, Queue);
T *PtrB = malloc_device<T>(Size, Queue);
T *PtrC = malloc_device<T>(Size, Queue);

Queue.copy(DataA.data(), PtrA, Size);
Queue.copy(DataB.data(), PtrB, Size);
Queue.copy(DataC.data(), PtrC, Size);
Queue.wait_and_throw();

Graph.begin_recording(Queue);
Queue.submit([&](handler &CGH) {
CGH.parallel_for(range<1>(Size), [=](item<1> id) {
auto LinID = id.get_linear_id();
PtrA[LinID] += ModValue;
});
});

Queue.submit([&](handler &CGH) { CGH.memcpy(PtrB, PtrA, Size * sizeof(T)); });

Queue.submit([&](handler &CGH) {
CGH.parallel_for(range<1>(Size), [=](item<1> id) {
auto LinID = id.get_linear_id();
PtrB[LinID] -= ModValue;
});
});

Queue.submit([&](handler &CGH) { CGH.memcpy(PtrC, PtrB, Size * sizeof(T)); });

Queue.submit([&](handler &CGH) {
CGH.parallel_for(range<1>(Size), [=](item<1> id) {
auto LinID = id.get_linear_id();
PtrC[LinID] += ModValue;
});
});

Graph.end_recording();

auto GraphExec = Graph.finalize();

event Event;
for (unsigned n = 0; n < Iterations; n++) {
Event =
Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExec); });
}

Queue.copy(PtrA, DataA.data(), Size, Event);
Queue.copy(PtrB, DataB.data(), Size, Event);
Queue.copy(PtrC, DataC.data(), Size, Event);
Queue.wait_and_throw();

free(PtrA, Queue);
free(PtrB, Queue);
free(PtrC, Queue);

for (size_t i = 0; i < Size; i++) {
assert(check_value(i, ReferenceA[i], DataA[i], "DataA"));
assert(check_value(i, ReferenceB[i], DataB[i], "DataB"));
assert(check_value(i, ReferenceC[i], DataC[i], "DataC"));
}
}

0 comments on commit 0d13b58

Please sign in to comment.