diff --git a/sycl/source/detail/jit_compiler.cpp b/sycl/source/detail/jit_compiler.cpp
index 2c177a8faf76d..e73a5498412df 100644
--- a/sycl/source/detail/jit_compiler.cpp
+++ b/sycl/source/detail/jit_compiler.cpp
@@ -667,8 +667,8 @@ jit_compiler::fuseKernels(QueueImplPtr Queue,
   unsigned KernelIndex = 0;
   ParamList FusedParams;
   PromotionMap PromotedAccs;
-  // TODO(Lukas, ONNX-399): Collect information about streams and auxiliary
-  // resources (which contain reductions) and figure out how to fuse them.
+  // TODO(Lukas, ONNX-399): Collect information about streams and figure out how
+  // to fuse them.
   for (auto &RawCmd : InputKernels) {
     auto *KernelCmd = static_cast<ExecCGCommand *>(RawCmd);
     auto &CG = KernelCmd->getCG();
diff --git a/sycl/source/detail/scheduler/graph_builder.cpp b/sycl/source/detail/scheduler/graph_builder.cpp
index b3b4343b530ad..104c38dcdd99b 100644
--- a/sycl/source/detail/scheduler/graph_builder.cpp
+++ b/sycl/source/detail/scheduler/graph_builder.cpp
@@ -1615,6 +1615,9 @@ Scheduler::GraphBuilder::completeFusion(QueueImplPtr Queue,
   auto FusedKernelCmd =
       std::make_unique<ExecCGCommand>(std::move(FusedCG), Queue);
 
+  // Inherit auxiliary resources from fused command groups
+  Scheduler::getInstance().takeAuxiliaryResources(FusedKernelCmd->getEvent(),
+                                                  PlaceholderCmd->getEvent());
   assert(PlaceholderCmd->MDeps.empty());
   // Next, backwards iterate over all the commands in the fusion list and remove
   // them from the graph to restore the state before starting fusion, so we can
diff --git a/sycl/source/detail/scheduler/scheduler.cpp b/sycl/source/detail/scheduler/scheduler.cpp
index c4771a9ed8c59..a83298a628539 100644
--- a/sycl/source/detail/scheduler/scheduler.cpp
+++ b/sycl/source/detail/scheduler/scheduler.cpp
@@ -154,11 +154,11 @@ EventImplPtr Scheduler::addCG(
     for (const auto &StreamImplPtr : Streams) {
       StreamImplPtr->flush(NewEvent);
     }
-
-    if (!AuxiliaryResources.empty())
-      registerAuxiliaryResources(NewEvent, std::move(AuxiliaryResources));
   }
 
+  if (!AuxiliaryResources.empty())
+    registerAuxiliaryResources(NewEvent, std::move(AuxiliaryResources));
+
   return NewEvent;
 }
 
@@ -558,10 +558,35 @@ void Scheduler::cleanupDeferredMemObjects(BlockingT Blocking) {
   }
 }
 
+static void registerAuxiliaryResourcesNoLock(
+    std::unordered_map<EventImplPtr, std::vector<std::shared_ptr<const void>>>
+        &AuxiliaryResources,
+    const EventImplPtr &Event,
+    std::vector<std::shared_ptr<const void>> &&Resources) {
+  std::vector<std::shared_ptr<const void>> &StoredResources =
+      AuxiliaryResources[Event];
+  StoredResources.insert(StoredResources.end(),
+                         std::make_move_iterator(Resources.begin()),
+                         std::make_move_iterator(Resources.end()));
+}
+
+void Scheduler::takeAuxiliaryResources(const EventImplPtr &Dst,
+                                       const EventImplPtr &Src) {
+  std::unique_lock<std::mutex> Lock{MAuxiliaryResourcesMutex};
+  auto Iter = MAuxiliaryResources.find(Src);
+  if (Iter == MAuxiliaryResources.end()) {
+    return;
+  }
+  registerAuxiliaryResourcesNoLock(MAuxiliaryResources, Dst,
+                                   std::move(Iter->second));
+  MAuxiliaryResources.erase(Iter);
+}
+
 void Scheduler::registerAuxiliaryResources(
     EventImplPtr &Event, std::vector<std::shared_ptr<const void>> Resources) {
   std::unique_lock<std::mutex> Lock{MAuxiliaryResourcesMutex};
-  MAuxiliaryResources.insert({Event, std::move(Resources)});
+  registerAuxiliaryResourcesNoLock(MAuxiliaryResources, Event,
+                                   std::move(Resources));
 }
 
 void Scheduler::cleanupAuxiliaryResources(BlockingT Blocking) {
diff --git a/sycl/source/detail/scheduler/scheduler.hpp b/sycl/source/detail/scheduler/scheduler.hpp
index 09ba43dbd1d4e..53ce295626045 100644
--- a/sycl/source/detail/scheduler/scheduler.hpp
+++ b/sycl/source/detail/scheduler/scheduler.hpp
@@ -543,6 +543,8 @@ class Scheduler {
     bool ShouldEnqueue;
   };
 
+  /// Assign \p Src's auxiliary resources to \p Dst.
+  void takeAuxiliaryResources(const EventImplPtr &Dst, const EventImplPtr &Src);
   void registerAuxiliaryResources(
       EventImplPtr &Event, std::vector<std::shared_ptr<const void>> Resources);
   void cleanupAuxiliaryResources(BlockingT Blocking);
diff --git a/sycl/test-e2e/KernelFusion/Reduction/reduction.cpp b/sycl/test-e2e/KernelFusion/Reduction/reduction.cpp
index 3e2a3932224ed..6f70bcd5a4be9 100644
--- a/sycl/test-e2e/KernelFusion/Reduction/reduction.cpp
+++ b/sycl/test-e2e/KernelFusion/Reduction/reduction.cpp
@@ -1,59 +1,94 @@
 // RUN: %{build} -fsycl-embed-ir -o %t.out
 // RUN: %{run} %t.out
-// XFAIL: cpu
 
-// Test fusion works with reductions.
+// Test fusion works with reductions. Some algorithms will lead to fusion being
+// cancelled in some devices. These should work properly anyway.
 
 #include <sycl/sycl.hpp>
+#include <utility>
 
 #include "../helpers.hpp"
+#include "sycl/detail/reduction_forward.hpp"
 
 using namespace sycl;
 
-template <typename BinaryOperation> class ReductionTest;
+constexpr inline size_t globalSize = 512;
 
-int main() {
-  constexpr size_t dataSize = 512;
+template <typename BinaryOperation> class ReductionTest;
 
-  int sumRes = -1;
-  int maxRes = -1;
+template <detail::reduction::strategy Strategy> void test(nd_range<1> ndr) {
+  std::array<int, globalSize> data;
+  int sumRes = 0;
+  int maxRes = 0;
 
   {
     queue q{ext::codeplay::experimental::property::queue::enable_fusion{}};
 
-    buffer<int> dataBuf{dataSize};
+    buffer<int> dataBuf{data};
     buffer<int> sumBuf{&sumRes, 1};
     buffer<int> maxBuf{&maxRes, 1};
 
     ext::codeplay::experimental::fusion_wrapper fw{q};
-    fw.start_fusion();
 
+    fw.start_fusion();
     iota(q, dataBuf, 0);
 
     q.submit([&](handler &cgh) {
       accessor in(dataBuf, cgh, read_only);
       auto sumRed = reduction(sumBuf, cgh, plus<>{},
                               property::reduction::initialize_to_identity{});
+      detail::reduction_parallel_for<detail::auto_name, Strategy>(
+          cgh, ndr, ext::oneapi::experimental::empty_properties_t{}, sumRed,
+          [=](nd_item<1> Item, auto &Red) {
+            Red.combine(in[Item.get_global_id()]);
+          });
+    });
+
+    q.submit([&](handler &cgh) {
+      accessor in(dataBuf, cgh, read_only);
       auto maxRed = reduction(maxBuf, cgh, maximum<>{},
                               property::reduction::initialize_to_identity{});
-      cgh.parallel_for(dataSize, sumRed, maxRed,
-                       [=](id<1> i, auto &sum, auto &max) {
-                         sum.combine(in[i]);
-                         max.combine(in[i]);
-                       });
+      detail::reduction_parallel_for<detail::auto_name, Strategy>(
+          cgh, ndr, ext::oneapi::experimental::empty_properties_t{}, maxRed,
+          [=](nd_item<1> Item, auto &Red) {
+            Red.combine(in[Item.get_global_id()]);
+          });
     });
 
-    complete_fusion_with_check(
-        fw, ext::codeplay::experimental::property::no_barriers{});
+    fw.complete_fusion(ext::codeplay::experimental::property::no_barriers{});
   }
 
-  constexpr int expectedMax = dataSize - 1;
-  constexpr int expectedSum = dataSize * expectedMax / 2;
+  constexpr int expectedMax = globalSize - 1;
+  constexpr int expectedSum = globalSize * expectedMax / 2;
 
-  std::cerr << sumRes << "\n";
+  assert(sumRes == expectedSum);
+  assert(maxRes == expectedMax);
+}
 
-  assert(maxRes == expectedMax && "Unexpected max value");
-  assert(sumRes == expectedSum && "Unexpected sum value");
+template <detail::reduction::strategy... strategies>
+void test_strategies(
+    std::integer_sequence<detail::reduction::strategy, strategies...>,
+    size_t localSize) {
+  ((test<strategies>({globalSize, localSize})), ...);
+}
 
-  return 0;
+int main() {
+  constexpr std::array<std::size_t, 3> localSizes{
+      globalSize /*Test single work-group*/,
+      globalSize / 32 /*Test middle-sized work-group*/,
+      1 /*Test single item work-groups*/};
+  for (size_t localSize : localSizes) {
+    test_strategies(
+        std::integer_sequence<
+            detail::reduction::strategy,
+            detail::reduction::strategy::group_reduce_and_last_wg_detection,
+            detail::reduction::strategy::local_atomic_and_atomic_cross_wg,
+            detail::reduction::strategy::range_basic,
+            detail::reduction::strategy::group_reduce_and_atomic_cross_wg,
+            detail::reduction::strategy::local_mem_tree_and_atomic_cross_wg,
+            detail::reduction::strategy::group_reduce_and_multiple_kernels,
+            detail::reduction::strategy::basic,
+            detail::reduction::strategy::multi>{},
+        localSize);
+  }
 }