Skip to content

Commit

Permalink
[SYCL][Fusion] Take auxiliary resources from fused command groups
Browse files Browse the repository at this point in the history
`KernelFusion/Reduction/reduction.cpp` was `XFAIL`ed in #1254 to avoid
CI errors. This patch fixes that error by assigning auxiliary
resources from each fused command group to the placeholder fusion
event.

This event will not be marked as completed after the fused reductions
finish execution either if fusion is completed , cancelled or aborted.

Test is updated to check every algorithm. This way we cover both
cases (fusion taking and not taking place), as some algorithms will
fail to fuse due to the nature of the command groups being
launched (incompatible ND-ranges or incompatible command-group kinds).

Signed-off-by: Victor Perez <victor.perez@codeplay.com>
  • Loading branch information
victor-eds committed Feb 2, 2024
1 parent 4a510b6 commit 67cf773
Show file tree
Hide file tree
Showing 5 changed files with 93 additions and 28 deletions.
4 changes: 2 additions & 2 deletions sycl/source/detail/jit_compiler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -667,8 +667,8 @@ jit_compiler::fuseKernels(QueueImplPtr Queue,
unsigned KernelIndex = 0;
ParamList FusedParams;
PromotionMap PromotedAccs;
// TODO(Lukas, ONNX-399): Collect information about streams and auxiliary
// resources (which contain reductions) and figure out how to fuse them.
// TODO(Lukas, ONNX-399): Collect information about streams and figure out how
// to fuse them.
for (auto &RawCmd : InputKernels) {
auto *KernelCmd = static_cast<ExecCGCommand *>(RawCmd);
auto &CG = KernelCmd->getCG();
Expand Down
3 changes: 3 additions & 0 deletions sycl/source/detail/scheduler/graph_builder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1615,6 +1615,9 @@ Scheduler::GraphBuilder::completeFusion(QueueImplPtr Queue,
auto FusedKernelCmd =
std::make_unique<ExecCGCommand>(std::move(FusedCG), Queue);

// Inherit auxiliary resources from fused command groups
Scheduler::getInstance().takeAuxiliaryResources(FusedKernelCmd->getEvent(),
PlaceholderCmd->getEvent());
assert(PlaceholderCmd->MDeps.empty());
// Next, backwards iterate over all the commands in the fusion list and remove
// them from the graph to restore the state before starting fusion, so we can
Expand Down
33 changes: 29 additions & 4 deletions sycl/source/detail/scheduler/scheduler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -154,11 +154,11 @@ EventImplPtr Scheduler::addCG(
for (const auto &StreamImplPtr : Streams) {
StreamImplPtr->flush(NewEvent);
}

if (!AuxiliaryResources.empty())
registerAuxiliaryResources(NewEvent, std::move(AuxiliaryResources));
}

if (!AuxiliaryResources.empty())
registerAuxiliaryResources(NewEvent, std::move(AuxiliaryResources));

return NewEvent;
}

Expand Down Expand Up @@ -558,10 +558,35 @@ void Scheduler::cleanupDeferredMemObjects(BlockingT Blocking) {
}
}

static void registerAuxiliaryResourcesNoLock(
std::unordered_map<EventImplPtr, std::vector<std::shared_ptr<const void>>>
&AuxiliaryResources,
const EventImplPtr &Event,
std::vector<std::shared_ptr<const void>> &&Resources) {
std::vector<std::shared_ptr<const void>> &StoredResources =
AuxiliaryResources[Event];
StoredResources.insert(StoredResources.end(),
std::make_move_iterator(Resources.begin()),
std::make_move_iterator(Resources.end()));
}

void Scheduler::takeAuxiliaryResources(const EventImplPtr &Dst,
const EventImplPtr &Src) {
std::unique_lock<std::mutex> Lock{MAuxiliaryResourcesMutex};
auto Iter = MAuxiliaryResources.find(Src);
if (Iter == MAuxiliaryResources.end()) {
return;
}
registerAuxiliaryResourcesNoLock(MAuxiliaryResources, Dst,
std::move(Iter->second));
MAuxiliaryResources.erase(Iter);
}

void Scheduler::registerAuxiliaryResources(
EventImplPtr &Event, std::vector<std::shared_ptr<const void>> Resources) {
std::unique_lock<std::mutex> Lock{MAuxiliaryResourcesMutex};
MAuxiliaryResources.insert({Event, std::move(Resources)});
registerAuxiliaryResourcesNoLock(MAuxiliaryResources, Event,
std::move(Resources));
}

void Scheduler::cleanupAuxiliaryResources(BlockingT Blocking) {
Expand Down
2 changes: 2 additions & 0 deletions sycl/source/detail/scheduler/scheduler.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -543,6 +543,8 @@ class Scheduler {
bool ShouldEnqueue;
};

/// Assign \p Src's auxiliary resources to \p Dst.
void takeAuxiliaryResources(const EventImplPtr &Dst, const EventImplPtr &Src);
void registerAuxiliaryResources(
EventImplPtr &Event, std::vector<std::shared_ptr<const void>> Resources);
void cleanupAuxiliaryResources(BlockingT Blocking);
Expand Down
79 changes: 57 additions & 22 deletions sycl/test-e2e/KernelFusion/Reduction/reduction.cpp
Original file line number Diff line number Diff line change
@@ -1,59 +1,94 @@
// RUN: %{build} -fsycl-embed-ir -o %t.out
// RUN: %{run} %t.out
// XFAIL: cpu

// Test fusion works with reductions.
// Test fusion works with reductions. Some algorithms will lead to fusion being
// cancelled in some devices. These should work properly anyway.

#include <sycl/sycl.hpp>
#include <utility>

#include "../helpers.hpp"
#include "sycl/detail/reduction_forward.hpp"

using namespace sycl;

template <typename BinaryOperation> class ReductionTest;
constexpr inline size_t globalSize = 512;

int main() {
constexpr size_t dataSize = 512;
template <typename BinaryOperation> class ReductionTest;

int sumRes = -1;
int maxRes = -1;
template <detail::reduction::strategy Strategy> void test(nd_range<1> ndr) {
std::array<int, globalSize> data;
int sumRes = 0;
int maxRes = 0;

{
queue q{ext::codeplay::experimental::property::queue::enable_fusion{}};

buffer<int> dataBuf{dataSize};
buffer<int> dataBuf{data};
buffer<int> sumBuf{&sumRes, 1};
buffer<int> maxBuf{&maxRes, 1};

ext::codeplay::experimental::fusion_wrapper fw{q};
fw.start_fusion();

fw.start_fusion();
iota(q, dataBuf, 0);

q.submit([&](handler &cgh) {
accessor in(dataBuf, cgh, read_only);
auto sumRed = reduction(sumBuf, cgh, plus<>{},
property::reduction::initialize_to_identity{});
detail::reduction_parallel_for<detail::auto_name, Strategy>(
cgh, ndr, ext::oneapi::experimental::empty_properties_t{}, sumRed,
[=](nd_item<1> Item, auto &Red) {
Red.combine(in[Item.get_global_id()]);
});
});

q.submit([&](handler &cgh) {
accessor in(dataBuf, cgh, read_only);
auto maxRed = reduction(maxBuf, cgh, maximum<>{},
property::reduction::initialize_to_identity{});
cgh.parallel_for(dataSize, sumRed, maxRed,
[=](id<1> i, auto &sum, auto &max) {
sum.combine(in[i]);
max.combine(in[i]);
});
detail::reduction_parallel_for<detail::auto_name, Strategy>(
cgh, ndr, ext::oneapi::experimental::empty_properties_t{}, maxRed,
[=](nd_item<1> Item, auto &Red) {
Red.combine(in[Item.get_global_id()]);
});
});

complete_fusion_with_check(
fw, ext::codeplay::experimental::property::no_barriers{});
fw.complete_fusion(ext::codeplay::experimental::property::no_barriers{});
}

constexpr int expectedMax = dataSize - 1;
constexpr int expectedSum = dataSize * expectedMax / 2;
constexpr int expectedMax = globalSize - 1;
constexpr int expectedSum = globalSize * expectedMax / 2;

std::cerr << sumRes << "\n";
assert(sumRes == expectedSum);
assert(maxRes == expectedMax);
}

assert(maxRes == expectedMax && "Unexpected max value");
assert(sumRes == expectedSum && "Unexpected sum value");
template <detail::reduction::strategy... strategies>
void test_strategies(
std::integer_sequence<detail::reduction::strategy, strategies...>,
size_t localSize) {
((test<strategies>({globalSize, localSize})), ...);
}

return 0;
int main() {
constexpr std::array<std::size_t, 3> localSizes{
globalSize /*Test single work-group*/,
globalSize / 32 /*Test middle-sized work-group*/,
1 /*Test single item work-groups*/};
for (size_t localSize : localSizes) {
test_strategies(
std::integer_sequence<
detail::reduction::strategy,
detail::reduction::strategy::group_reduce_and_last_wg_detection,
detail::reduction::strategy::local_atomic_and_atomic_cross_wg,
detail::reduction::strategy::range_basic,
detail::reduction::strategy::group_reduce_and_atomic_cross_wg,
detail::reduction::strategy::local_mem_tree_and_atomic_cross_wg,
detail::reduction::strategy::group_reduce_and_multiple_kernels,
detail::reduction::strategy::basic,
detail::reduction::strategy::multi>{},
localSize);
}
}

0 comments on commit 67cf773

Please sign in to comment.