From 67cf773cbcb3f338537cec9a42c9584a9ced953a Mon Sep 17 00:00:00 2001
From: Victor Perez <victor.perez@codeplay.com>
Date: Fri, 2 Feb 2024 12:53:14 +0000
Subject: [PATCH] [SYCL][Fusion] Take auxiliary resources from fused command
 groups

`KernelFusion/Reduction/reduction.cpp` was `XFAIL`ed in #1254 to avoid
CI errors. This patch fixes that error by assigning auxiliary
resources from each fused command group to the placeholder fusion
event.

This event will not be marked as completed after the fused reductions
finish execution either if fusion is completed , cancelled or aborted.

Test is updated to check every algorithm. This way we cover both
cases (fusion taking and not taking place), as some algorithms will
fail to fuse due to the nature of the command groups being
launched (incompatible ND-ranges or incompatible command-group kinds).

Signed-off-by: Victor Perez <victor.perez@codeplay.com>
---
 sycl/source/detail/jit_compiler.cpp           |  4 +-
 .../source/detail/scheduler/graph_builder.cpp |  3 +
 sycl/source/detail/scheduler/scheduler.cpp    | 33 +++++++-
 sycl/source/detail/scheduler/scheduler.hpp    |  2 +
 .../KernelFusion/Reduction/reduction.cpp      | 79 +++++++++++++------
 5 files changed, 93 insertions(+), 28 deletions(-)
diff --git a/sycl/source/detail/jit_compiler.cpp b/sycl/source/detail/jit_compiler.cpp
index 2c177a8faf76d..e73a5498412df 100644
--- a/sycl/source/detail/jit_compiler.cpp
+++ b/sycl/source/detail/jit_compiler.cpp
@@ -667,8 +667,8 @@ jit_compiler::fuseKernels(QueueImplPtr Queue,
   unsigned KernelIndex = 0;
   ParamList FusedParams;
   PromotionMap PromotedAccs;
-  // TODO(Lukas, ONNX-399): Collect information about streams and auxiliary
-  // resources (which contain reductions) and figure out how to fuse them.
+  // TODO(Lukas, ONNX-399): Collect information about streams and figure out how
+  // to fuse them.
   for (auto &RawCmd : InputKernels) {
     auto *KernelCmd = static_cast<ExecCGCommand *>(RawCmd);
     auto &CG = KernelCmd->getCG();
diff --git a/sycl/source/detail/scheduler/graph_builder.cpp b/sycl/source/detail/scheduler/graph_builder.cpp
index b3b4343b530ad..104c38dcdd99b 100644
--- a/sycl/source/detail/scheduler/graph_builder.cpp
+++ b/sycl/source/detail/scheduler/graph_builder.cpp
@@ -1615,6 +1615,9 @@ Scheduler::GraphBuilder::completeFusion(QueueImplPtr Queue,
   auto FusedKernelCmd =
       std::make_unique<ExecCGCommand>(std::move(FusedCG), Queue);
 
+  // Inherit auxiliary resources from fused command groups
+  Scheduler::getInstance().takeAuxiliaryResources(FusedKernelCmd->getEvent(),
+                                                  PlaceholderCmd->getEvent());
   assert(PlaceholderCmd->MDeps.empty());
   // Next, backwards iterate over all the commands in the fusion list and remove
   // them from the graph to restore the state before starting fusion, so we can
diff --git a/sycl/source/detail/scheduler/scheduler.cpp b/sycl/source/detail/scheduler/scheduler.cpp
index c4771a9ed8c59..a83298a628539 100644
--- a/sycl/source/detail/scheduler/scheduler.cpp
+++ b/sycl/source/detail/scheduler/scheduler.cpp
@@ -154,11 +154,11 @@ EventImplPtr Scheduler::addCG(
     for (const auto &StreamImplPtr : Streams) {
       StreamImplPtr->flush(NewEvent);
     }
-
-    if (!AuxiliaryResources.empty())
-      registerAuxiliaryResources(NewEvent, std::move(AuxiliaryResources));
   }
 
+  if (!AuxiliaryResources.empty())
+    registerAuxiliaryResources(NewEvent, std::move(AuxiliaryResources));
+
   return NewEvent;
 }
 
@@ -558,10 +558,35 @@ void Scheduler::cleanupDeferredMemObjects(BlockingT Blocking) {
   }
 }
 
+static void registerAuxiliaryResourcesNoLock(
+    std::unordered_map<EventImplPtr, std::vector<std::shared_ptr<const void>>>
+        &AuxiliaryResources,
+    const EventImplPtr &Event,
+    std::vector<std::shared_ptr<const void>> &&Resources) {
+  std::vector<std::shared_ptr<const void>> &StoredResources =
+      AuxiliaryResources[Event];
+  StoredResources.insert(StoredResources.end(),
+                         std::make_move_iterator(Resources.begin()),
+                         std::make_move_iterator(Resources.end()));
+}
+
+void Scheduler::takeAuxiliaryResources(const EventImplPtr &Dst,
+                                       const EventImplPtr &Src) {
+  std::unique_lock<std::mutex> Lock{MAuxiliaryResourcesMutex};
+  auto Iter = MAuxiliaryResources.find(Src);
+  if (Iter == MAuxiliaryResources.end()) {
+    return;
+  }
+  registerAuxiliaryResourcesNoLock(MAuxiliaryResources, Dst,
+                                   std::move(Iter->second));
+  MAuxiliaryResources.erase(Iter);
+}
+
 void Scheduler::registerAuxiliaryResources(
     EventImplPtr &Event, std::vector<std::shared_ptr<const void>> Resources) {
   std::unique_lock<std::mutex> Lock{MAuxiliaryResourcesMutex};
-  MAuxiliaryResources.insert({Event, std::move(Resources)});
+  registerAuxiliaryResourcesNoLock(MAuxiliaryResources, Event,
+                                   std::move(Resources));
 }
 
 void Scheduler::cleanupAuxiliaryResources(BlockingT Blocking) {
diff --git a/sycl/source/detail/scheduler/scheduler.hpp b/sycl/source/detail/scheduler/scheduler.hpp
index 09ba43dbd1d4e..53ce295626045 100644
--- a/sycl/source/detail/scheduler/scheduler.hpp
+++ b/sycl/source/detail/scheduler/scheduler.hpp
@@ -543,6 +543,8 @@ class Scheduler {
     bool ShouldEnqueue;
   };
 
+  /// Assign \p Src's auxiliary resources to \p Dst.
+  void takeAuxiliaryResources(const EventImplPtr &Dst, const EventImplPtr &Src);
   void registerAuxiliaryResources(
       EventImplPtr &Event, std::vector<std::shared_ptr<const void>> Resources);
   void cleanupAuxiliaryResources(BlockingT Blocking);
diff --git a/sycl/test-e2e/KernelFusion/Reduction/reduction.cpp b/sycl/test-e2e/KernelFusion/Reduction/reduction.cpp
index 3e2a3932224ed..6f70bcd5a4be9 100644
--- a/sycl/test-e2e/KernelFusion/Reduction/reduction.cpp
+++ b/sycl/test-e2e/KernelFusion/Reduction/reduction.cpp
@@ -1,59 +1,94 @@
 // RUN: %{build} -fsycl-embed-ir -o %t.out
 // RUN: %{run} %t.out
-// XFAIL: cpu
 
-// Test fusion works with reductions.
+// Test fusion works with reductions. Some algorithms will lead to fusion being
+// cancelled in some devices. These should work properly anyway.
 
 #include <sycl/sycl.hpp>
+#include <utility>
 
 #include "../helpers.hpp"
+#include "sycl/detail/reduction_forward.hpp"
 
 using namespace sycl;
 
-template <typename BinaryOperation> class ReductionTest;
+constexpr inline size_t globalSize = 512;
 
-int main() {
-  constexpr size_t dataSize = 512;
+template <typename BinaryOperation> class ReductionTest;
 
-  int sumRes = -1;
-  int maxRes = -1;
+template <detail::reduction::strategy Strategy> void test(nd_range<1> ndr) {
+  std::array<int, globalSize> data;
+  int sumRes = 0;
+  int maxRes = 0;
 
   {
     queue q{ext::codeplay::experimental::property::queue::enable_fusion{}};
 
-    buffer<int> dataBuf{dataSize};
+    buffer<int> dataBuf{data};
     buffer<int> sumBuf{&sumRes, 1};
     buffer<int> maxBuf{&maxRes, 1};
 
     ext::codeplay::experimental::fusion_wrapper fw{q};
-    fw.start_fusion();
 
+    fw.start_fusion();
     iota(q, dataBuf, 0);
 
     q.submit([&](handler &cgh) {
       accessor in(dataBuf, cgh, read_only);
       auto sumRed = reduction(sumBuf, cgh, plus<>{},
                               property::reduction::initialize_to_identity{});
+      detail::reduction_parallel_for<detail::auto_name, Strategy>(
+          cgh, ndr, ext::oneapi::experimental::empty_properties_t{}, sumRed,
+          [=](nd_item<1> Item, auto &Red) {
+            Red.combine(in[Item.get_global_id()]);
+          });
+    });
+
+    q.submit([&](handler &cgh) {
+      accessor in(dataBuf, cgh, read_only);
       auto maxRed = reduction(maxBuf, cgh, maximum<>{},
                               property::reduction::initialize_to_identity{});
-      cgh.parallel_for(dataSize, sumRed, maxRed,
-                       [=](id<1> i, auto &sum, auto &max) {
-                         sum.combine(in[i]);
-                         max.combine(in[i]);
-                       });
+      detail::reduction_parallel_for<detail::auto_name, Strategy>(
+          cgh, ndr, ext::oneapi::experimental::empty_properties_t{}, maxRed,
+          [=](nd_item<1> Item, auto &Red) {
+            Red.combine(in[Item.get_global_id()]);
+          });
     });
 
-    complete_fusion_with_check(
-        fw, ext::codeplay::experimental::property::no_barriers{});
+    fw.complete_fusion(ext::codeplay::experimental::property::no_barriers{});
   }
 
-  constexpr int expectedMax = dataSize - 1;
-  constexpr int expectedSum = dataSize * expectedMax / 2;
+  constexpr int expectedMax = globalSize - 1;
+  constexpr int expectedSum = globalSize * expectedMax / 2;
 
-  std::cerr << sumRes << "\n";
+  assert(sumRes == expectedSum);
+  assert(maxRes == expectedMax);
+}
 
-  assert(maxRes == expectedMax && "Unexpected max value");
-  assert(sumRes == expectedSum && "Unexpected sum value");
+template <detail::reduction::strategy... strategies>
+void test_strategies(
+    std::integer_sequence<detail::reduction::strategy, strategies...>,
+    size_t localSize) {
+  ((test<strategies>({globalSize, localSize})), ...);
+}
 
-  return 0;
+int main() {
+  constexpr std::array<std::size_t, 3> localSizes{
+      globalSize /*Test single work-group*/,
+      globalSize / 32 /*Test middle-sized work-group*/,
+      1 /*Test single item work-groups*/};
+  for (size_t localSize : localSizes) {
+    test_strategies(
+        std::integer_sequence<
+            detail::reduction::strategy,
+            detail::reduction::strategy::group_reduce_and_last_wg_detection,
+            detail::reduction::strategy::local_atomic_and_atomic_cross_wg,
+            detail::reduction::strategy::range_basic,
+            detail::reduction::strategy::group_reduce_and_atomic_cross_wg,
+            detail::reduction::strategy::local_mem_tree_and_atomic_cross_wg,
+            detail::reduction::strategy::group_reduce_and_multiple_kernels,
+            detail::reduction::strategy::basic,
+            detail::reduction::strategy::multi>{},
+        localSize);
+  }
 }