Skip to content

Commit

Permalink
[SYCL][Graph] Fix memset queue shortcut when queue is recorded (#329)
Browse files Browse the repository at this point in the history
The memset queue shortcut `queue::memset()` performs the memset directly from the host (without going through the normal path, i.e. the handler).
We added a specific case when the queue is recorded to use the normal path instead of the optimized path.
  • Loading branch information
mfrancepillois authored Sep 29, 2023
1 parent 11feebb commit 25ff8ef
Show file tree
Hide file tree
Showing 2 changed files with 58 additions and 0 deletions.
11 changes: 11 additions & 0 deletions sycl/source/detail/queue_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,17 @@ event queue_impl::memset(const std::shared_ptr<detail::queue_impl> &Self,
// Emit a begin/end scope for this call
PrepareNotify.scopedNotify((uint16_t)xpti::trace_point_type_t::task_begin);
#endif
// If we have a command graph set we need to capture the memset through normal
// queue submission rather than execute the memset directly.
if (MGraph.lock()) {
return submit(
[&](handler &CGH) {
CGH.depends_on(DepEvents);
CGH.memset(Ptr, Value, Count);
},
Self, {});
}

if (MHasDiscardEventsSupport) {
MemoryManager::fill_usm(Ptr, Self, Count, Value,
getOrWaitEvents(DepEvents, MContext), nullptr);
Expand Down
47 changes: 47 additions & 0 deletions sycl/test-e2e/Graph/RecordReplay/usm_memset_shortcut.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
// REQUIRES: cuda || level_zero, gpu
// RUN: %{build} -o %t.out
// RUN: %{run} %t.out
// Extra run to check for leaks in Level Zero using ZE_DEBUG
// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %}
//
// CHECK-NOT: LEAK
//
// Tests adding a USM memset queue shortcut operation as a graph node.

#include "../graph_common.hpp"

// Records a `queue::memset()` shortcut call plus a dependent kernel into a
// command graph, replays the graph, and checks the resulting device data.
//
// Expected result: every one of the N bytes is first set to Value by the
// recorded memset and then doubled by the kernel, so each byte must equal
// 2 * Value once the graph has executed.
int main() {

  queue Queue;

  exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()};

  const size_t N = 10;
  unsigned char *Arr = malloc_device<unsigned char>(N, Queue);

  // 2 * Value (154) still fits in an unsigned char, so the doubled result can
  // be compared exactly on the host.
  int Value = 77;
  Graph.begin_recording(Queue);
  // Shortcut under test: issued while the queue is recording.
  auto Init = Queue.memset(Arr, Value, N);
  Queue.submit([&](handler &CGH) {
    // The kernel must observe the bytes written by the memset.
    CGH.depends_on(Init);
    CGH.single_task<class double_dest>([=]() {
      // Iterate over exactly the N elements that were allocated; any other
      // bound would read and write outside the USM allocation.
      for (size_t i = 0; i < N; i++)
        Arr[i] = 2 * Arr[i];
    });
  });

  Graph.end_recording();

  auto ExecGraph = Graph.finalize();

  Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(ExecGraph); }).wait();

  // Copy the device data back to the host and verify every element.
  std::vector<unsigned char> Output(N);
  Queue.memcpy(Output.data(), Arr, N).wait();
  for (size_t i = 0; i < N; i++)
    assert(Output[i] == (Value * 2));

  sycl::free(Arr, Queue);

  return 0;
}

0 comments on commit 25ff8ef

Please sign in to comment.