diff --git a/sycl/source/detail/jit_compiler.cpp b/sycl/source/detail/jit_compiler.cpp index 2c177a8faf76d..e73a5498412df 100644 --- a/sycl/source/detail/jit_compiler.cpp +++ b/sycl/source/detail/jit_compiler.cpp @@ -667,8 +667,8 @@ jit_compiler::fuseKernels(QueueImplPtr Queue, unsigned KernelIndex = 0; ParamList FusedParams; PromotionMap PromotedAccs; - // TODO(Lukas, ONNX-399): Collect information about streams and auxiliary - // resources (which contain reductions) and figure out how to fuse them. + // TODO(Lukas, ONNX-399): Collect information about streams and figure out how + // to fuse them. for (auto &RawCmd : InputKernels) { auto *KernelCmd = static_cast(RawCmd); auto &CG = KernelCmd->getCG(); diff --git a/sycl/source/detail/scheduler/graph_builder.cpp b/sycl/source/detail/scheduler/graph_builder.cpp index b3b4343b530ad..104c38dcdd99b 100644 --- a/sycl/source/detail/scheduler/graph_builder.cpp +++ b/sycl/source/detail/scheduler/graph_builder.cpp @@ -1615,6 +1615,9 @@ Scheduler::GraphBuilder::completeFusion(QueueImplPtr Queue, auto FusedKernelCmd = std::make_unique(std::move(FusedCG), Queue); + // Inherit auxiliary resources from fused command groups + Scheduler::getInstance().takeAuxiliaryResources(FusedKernelCmd->getEvent(), + PlaceholderCmd->getEvent()); assert(PlaceholderCmd->MDeps.empty()); // Next, backwards iterate over all the commands in the fusion list and remove // them from the graph to restore the state before starting fusion, so we can diff --git a/sycl/source/detail/scheduler/scheduler.cpp b/sycl/source/detail/scheduler/scheduler.cpp index c4771a9ed8c59..a83298a628539 100644 --- a/sycl/source/detail/scheduler/scheduler.cpp +++ b/sycl/source/detail/scheduler/scheduler.cpp @@ -154,11 +154,11 @@ EventImplPtr Scheduler::addCG( for (const auto &StreamImplPtr : Streams) { StreamImplPtr->flush(NewEvent); } - - if (!AuxiliaryResources.empty()) - registerAuxiliaryResources(NewEvent, std::move(AuxiliaryResources)); } + if (!AuxiliaryResources.empty()) + registerAuxiliaryResources(NewEvent, std::move(AuxiliaryResources)); + return NewEvent; } @@ -558,10 +558,35 @@ void Scheduler::cleanupDeferredMemObjects(BlockingT Blocking) { } } +static void registerAuxiliaryResourcesNoLock( + std::unordered_map>> + &AuxiliaryResources, + const EventImplPtr &Event, + std::vector> &&Resources) { + std::vector> &StoredResources = + AuxiliaryResources[Event]; + StoredResources.insert(StoredResources.end(), + std::make_move_iterator(Resources.begin()), + std::make_move_iterator(Resources.end())); +} + +void Scheduler::takeAuxiliaryResources(const EventImplPtr &Dst, + const EventImplPtr &Src) { + std::unique_lock Lock{MAuxiliaryResourcesMutex}; + auto Iter = MAuxiliaryResources.find(Src); + if (Iter == MAuxiliaryResources.end()) { + return; + } + registerAuxiliaryResourcesNoLock(MAuxiliaryResources, Dst, + std::move(Iter->second)); + MAuxiliaryResources.erase(Iter); +} + void Scheduler::registerAuxiliaryResources( EventImplPtr &Event, std::vector> Resources) { std::unique_lock Lock{MAuxiliaryResourcesMutex}; - MAuxiliaryResources.insert({Event, std::move(Resources)}); + registerAuxiliaryResourcesNoLock(MAuxiliaryResources, Event, + std::move(Resources)); } void Scheduler::cleanupAuxiliaryResources(BlockingT Blocking) { diff --git a/sycl/source/detail/scheduler/scheduler.hpp b/sycl/source/detail/scheduler/scheduler.hpp index 09ba43dbd1d4e..53ce295626045 100644 --- a/sycl/source/detail/scheduler/scheduler.hpp +++ b/sycl/source/detail/scheduler/scheduler.hpp @@ -543,6 +543,8 @@ class Scheduler { bool ShouldEnqueue; }; + /// Assign \p Src's auxiliary resources to \p Dst. + void takeAuxiliaryResources(const EventImplPtr &Dst, const EventImplPtr &Src); void registerAuxiliaryResources( EventImplPtr &Event, std::vector> Resources); void cleanupAuxiliaryResources(BlockingT Blocking); diff --git a/sycl/test-e2e/KernelFusion/Reduction/reduction.cpp b/sycl/test-e2e/KernelFusion/Reduction/reduction.cpp index 3e2a3932224ed..6f70bcd5a4be9 100644 --- a/sycl/test-e2e/KernelFusion/Reduction/reduction.cpp +++ b/sycl/test-e2e/KernelFusion/Reduction/reduction.cpp @@ -1,59 +1,94 @@ // RUN: %{build} -fsycl-embed-ir -o %t.out // RUN: %{run} %t.out -// XFAIL: cpu -// Test fusion works with reductions. +// Test fusion works with reductions. Some algorithms will lead to fusion being +// cancelled in some devices. These should work properly anyway. #include +#include #include "../helpers.hpp" +#include "sycl/detail/reduction_forward.hpp" using namespace sycl; -template class ReductionTest; +constexpr inline size_t globalSize = 512; -int main() { - constexpr size_t dataSize = 512; +template class ReductionTest; - int sumRes = -1; - int maxRes = -1; +template void test(nd_range<1> ndr) { + std::array data; + int sumRes = 0; + int maxRes = 0; { queue q{ext::codeplay::experimental::property::queue::enable_fusion{}}; - buffer dataBuf{dataSize}; + buffer dataBuf{data}; buffer sumBuf{&sumRes, 1}; buffer maxBuf{&maxRes, 1}; ext::codeplay::experimental::fusion_wrapper fw{q}; - fw.start_fusion(); + fw.start_fusion(); iota(q, dataBuf, 0); q.submit([&](handler &cgh) { accessor in(dataBuf, cgh, read_only); auto sumRed = reduction(sumBuf, cgh, plus<>{}, property::reduction::initialize_to_identity{}); + detail::reduction_parallel_for( + cgh, ndr, ext::oneapi::experimental::empty_properties_t{}, sumRed, + [=](nd_item<1> Item, auto &Red) { + Red.combine(in[Item.get_global_id()]); + }); + }); + + q.submit([&](handler &cgh) { + accessor in(dataBuf, cgh, read_only); auto maxRed = reduction(maxBuf, cgh, maximum<>{}, property::reduction::initialize_to_identity{}); - cgh.parallel_for(dataSize, sumRed, maxRed, - [=](id<1> i, auto &sum, auto &max) { - sum.combine(in[i]); - max.combine(in[i]); - }); + detail::reduction_parallel_for( + cgh, ndr, ext::oneapi::experimental::empty_properties_t{}, maxRed, + [=](nd_item<1> Item, auto &Red) { + Red.combine(in[Item.get_global_id()]); + }); }); - complete_fusion_with_check( - fw, ext::codeplay::experimental::property::no_barriers{}); + fw.complete_fusion(ext::codeplay::experimental::property::no_barriers{}); } - constexpr int expectedMax = dataSize - 1; - constexpr int expectedSum = dataSize * expectedMax / 2; + constexpr int expectedMax = globalSize - 1; + constexpr int expectedSum = globalSize * expectedMax / 2; - std::cerr << sumRes << "\n"; + assert(sumRes == expectedSum); + assert(maxRes == expectedMax); +} - assert(maxRes == expectedMax && "Unexpected max value"); - assert(sumRes == expectedSum && "Unexpected sum value"); +template +void test_strategies( + std::integer_sequence, + size_t localSize) { + ((test({globalSize, localSize})), ...); +} - return 0; +int main() { + constexpr std::array localSizes{ + globalSize /*Test single work-group*/, + globalSize / 32 /*Test middle-sized work-group*/, + 1 /*Test single item work-groups*/}; + for (size_t localSize : localSizes) { + test_strategies( + std::integer_sequence< + detail::reduction::strategy, + detail::reduction::strategy::group_reduce_and_last_wg_detection, + detail::reduction::strategy::local_atomic_and_atomic_cross_wg, + detail::reduction::strategy::range_basic, + detail::reduction::strategy::group_reduce_and_atomic_cross_wg, + detail::reduction::strategy::local_mem_tree_and_atomic_cross_wg, + detail::reduction::strategy::group_reduce_and_multiple_kernels, + detail::reduction::strategy::basic, + detail::reduction::strategy::multi>{}, + localSize); + } }