From 67cf773cbcb3f338537cec9a42c9584a9ced953a Mon Sep 17 00:00:00 2001 From: Victor Perez Date: Fri, 2 Feb 2024 12:53:14 +0000 Subject: [PATCH] [SYCL][Fusion] Take auxiliary resources from fused command groups `KernelFusion/Reduction/reduction.cpp` was `XFAIL`ed in #1254 to avoid CI errors. This patch fixes that error by assigning auxiliary resources from each fused command group to the placeholder fusion event. This event will not be marked as completed after the fused reductions finish execution either if fusion is completed , cancelled or aborted. Test is updated to check every algorithm. This way we cover both cases (fusion taking and not taking place), as some algorithms will fail to fuse due to the nature of the command groups being launched (incompatible ND-ranges or incompatible command-group kinds). Signed-off-by: Victor Perez --- sycl/source/detail/jit_compiler.cpp | 4 +- .../source/detail/scheduler/graph_builder.cpp | 3 + sycl/source/detail/scheduler/scheduler.cpp | 33 +++++++- sycl/source/detail/scheduler/scheduler.hpp | 2 + .../KernelFusion/Reduction/reduction.cpp | 79 +++++++++++++------ 5 files changed, 93 insertions(+), 28 deletions(-) diff --git a/sycl/source/detail/jit_compiler.cpp b/sycl/source/detail/jit_compiler.cpp index 2c177a8faf76d..e73a5498412df 100644 --- a/sycl/source/detail/jit_compiler.cpp +++ b/sycl/source/detail/jit_compiler.cpp @@ -667,8 +667,8 @@ jit_compiler::fuseKernels(QueueImplPtr Queue, unsigned KernelIndex = 0; ParamList FusedParams; PromotionMap PromotedAccs; - // TODO(Lukas, ONNX-399): Collect information about streams and auxiliary - // resources (which contain reductions) and figure out how to fuse them. + // TODO(Lukas, ONNX-399): Collect information about streams and figure out how + // to fuse them. for (auto &RawCmd : InputKernels) { auto *KernelCmd = static_cast(RawCmd); auto &CG = KernelCmd->getCG(); diff --git a/sycl/source/detail/scheduler/graph_builder.cpp b/sycl/source/detail/scheduler/graph_builder.cpp index b3b4343b530ad..104c38dcdd99b 100644 --- a/sycl/source/detail/scheduler/graph_builder.cpp +++ b/sycl/source/detail/scheduler/graph_builder.cpp @@ -1615,6 +1615,9 @@ Scheduler::GraphBuilder::completeFusion(QueueImplPtr Queue, auto FusedKernelCmd = std::make_unique(std::move(FusedCG), Queue); + // Inherit auxiliary resources from fused command groups + Scheduler::getInstance().takeAuxiliaryResources(FusedKernelCmd->getEvent(), + PlaceholderCmd->getEvent()); assert(PlaceholderCmd->MDeps.empty()); // Next, backwards iterate over all the commands in the fusion list and remove // them from the graph to restore the state before starting fusion, so we can diff --git a/sycl/source/detail/scheduler/scheduler.cpp b/sycl/source/detail/scheduler/scheduler.cpp index c4771a9ed8c59..a83298a628539 100644 --- a/sycl/source/detail/scheduler/scheduler.cpp +++ b/sycl/source/detail/scheduler/scheduler.cpp @@ -154,11 +154,11 @@ EventImplPtr Scheduler::addCG( for (const auto &StreamImplPtr : Streams) { StreamImplPtr->flush(NewEvent); } - - if (!AuxiliaryResources.empty()) - registerAuxiliaryResources(NewEvent, std::move(AuxiliaryResources)); } + if (!AuxiliaryResources.empty()) + registerAuxiliaryResources(NewEvent, std::move(AuxiliaryResources)); + return NewEvent; } @@ -558,10 +558,35 @@ void Scheduler::cleanupDeferredMemObjects(BlockingT Blocking) { } } +static void registerAuxiliaryResourcesNoLock( + std::unordered_map>> + &AuxiliaryResources, + const EventImplPtr &Event, + std::vector> &&Resources) { + std::vector> &StoredResources = + AuxiliaryResources[Event]; + StoredResources.insert(StoredResources.end(), + std::make_move_iterator(Resources.begin()), + std::make_move_iterator(Resources.end())); +} + +void Scheduler::takeAuxiliaryResources(const EventImplPtr &Dst, + const EventImplPtr &Src) { + std::unique_lock Lock{MAuxiliaryResourcesMutex}; + auto Iter = MAuxiliaryResources.find(Src); + if (Iter == MAuxiliaryResources.end()) { + return; + } + registerAuxiliaryResourcesNoLock(MAuxiliaryResources, Dst, + std::move(Iter->second)); + MAuxiliaryResources.erase(Iter); +} + void Scheduler::registerAuxiliaryResources( EventImplPtr &Event, std::vector> Resources) { std::unique_lock Lock{MAuxiliaryResourcesMutex}; - MAuxiliaryResources.insert({Event, std::move(Resources)}); + registerAuxiliaryResourcesNoLock(MAuxiliaryResources, Event, + std::move(Resources)); } void Scheduler::cleanupAuxiliaryResources(BlockingT Blocking) { diff --git a/sycl/source/detail/scheduler/scheduler.hpp b/sycl/source/detail/scheduler/scheduler.hpp index 09ba43dbd1d4e..53ce295626045 100644 --- a/sycl/source/detail/scheduler/scheduler.hpp +++ b/sycl/source/detail/scheduler/scheduler.hpp @@ -543,6 +543,8 @@ class Scheduler { bool ShouldEnqueue; }; + /// Assign \p Src's auxiliary resources to \p Dst. + void takeAuxiliaryResources(const EventImplPtr &Dst, const EventImplPtr &Src); void registerAuxiliaryResources( EventImplPtr &Event, std::vector> Resources); void cleanupAuxiliaryResources(BlockingT Blocking); diff --git a/sycl/test-e2e/KernelFusion/Reduction/reduction.cpp b/sycl/test-e2e/KernelFusion/Reduction/reduction.cpp index 3e2a3932224ed..6f70bcd5a4be9 100644 --- a/sycl/test-e2e/KernelFusion/Reduction/reduction.cpp +++ b/sycl/test-e2e/KernelFusion/Reduction/reduction.cpp @@ -1,59 +1,94 @@ // RUN: %{build} -fsycl-embed-ir -o %t.out // RUN: %{run} %t.out -// XFAIL: cpu -// Test fusion works with reductions. +// Test fusion works with reductions. Some algorithms will lead to fusion being +// cancelled in some devices. These should work properly anyway. #include +#include #include "../helpers.hpp" +#include "sycl/detail/reduction_forward.hpp" using namespace sycl; -template class ReductionTest; +constexpr inline size_t globalSize = 512; -int main() { - constexpr size_t dataSize = 512; +template class ReductionTest; - int sumRes = -1; - int maxRes = -1; +template void test(nd_range<1> ndr) { + std::array data; + int sumRes = 0; + int maxRes = 0; { queue q{ext::codeplay::experimental::property::queue::enable_fusion{}}; - buffer dataBuf{dataSize}; + buffer dataBuf{data}; buffer sumBuf{&sumRes, 1}; buffer maxBuf{&maxRes, 1}; ext::codeplay::experimental::fusion_wrapper fw{q}; - fw.start_fusion(); + fw.start_fusion(); iota(q, dataBuf, 0); q.submit([&](handler &cgh) { accessor in(dataBuf, cgh, read_only); auto sumRed = reduction(sumBuf, cgh, plus<>{}, property::reduction::initialize_to_identity{}); + detail::reduction_parallel_for( + cgh, ndr, ext::oneapi::experimental::empty_properties_t{}, sumRed, + [=](nd_item<1> Item, auto &Red) { + Red.combine(in[Item.get_global_id()]); + }); + }); + + q.submit([&](handler &cgh) { + accessor in(dataBuf, cgh, read_only); auto maxRed = reduction(maxBuf, cgh, maximum<>{}, property::reduction::initialize_to_identity{}); - cgh.parallel_for(dataSize, sumRed, maxRed, - [=](id<1> i, auto &sum, auto &max) { - sum.combine(in[i]); - max.combine(in[i]); - }); + detail::reduction_parallel_for( + cgh, ndr, ext::oneapi::experimental::empty_properties_t{}, maxRed, + [=](nd_item<1> Item, auto &Red) { + Red.combine(in[Item.get_global_id()]); + }); }); - complete_fusion_with_check( - fw, ext::codeplay::experimental::property::no_barriers{}); + fw.complete_fusion(ext::codeplay::experimental::property::no_barriers{}); } - constexpr int expectedMax = dataSize - 1; - constexpr int expectedSum = dataSize * expectedMax / 2; + constexpr int expectedMax = globalSize - 1; + constexpr int expectedSum = globalSize * expectedMax / 2; - std::cerr << sumRes << "\n"; + assert(sumRes == expectedSum); + assert(maxRes == expectedMax); +} - assert(maxRes == expectedMax && "Unexpected max value"); - assert(sumRes == expectedSum && "Unexpected sum value"); +template +void test_strategies( + std::integer_sequence, + size_t localSize) { + ((test({globalSize, localSize})), ...); +} - return 0; +int main() { + constexpr std::array localSizes{ + globalSize /*Test single work-group*/, + globalSize / 32 /*Test middle-sized work-group*/, + 1 /*Test single item work-groups*/}; + for (size_t localSize : localSizes) { + test_strategies( + std::integer_sequence< + detail::reduction::strategy, + detail::reduction::strategy::group_reduce_and_last_wg_detection, + detail::reduction::strategy::local_atomic_and_atomic_cross_wg, + detail::reduction::strategy::range_basic, + detail::reduction::strategy::group_reduce_and_atomic_cross_wg, + detail::reduction::strategy::local_mem_tree_and_atomic_cross_wg, + detail::reduction::strategy::group_reduce_and_multiple_kernels, + detail::reduction::strategy::basic, + detail::reduction::strategy::multi>{}, + localSize); + } }