From 875d93b2d0a324228f80c2c0b74bf5bddb4e3a0c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20Dai=C3=9F?= Date: Wed, 6 Mar 2024 10:27:16 +0100 Subject: [PATCH 01/19] Rework allocator namespace 1 --- include/aggregation_manager.hpp | 6 +- ...l.hpp => aligned_recycling_allocators.hpp} | 10 +-- ...util.hpp => cuda_recycling_allocators.hpp} | 39 +++++------- .../buffer_recycler.hpp} | 63 +++++++++---------- include/detail/config.hpp | 4 +- ..._util.hpp => hip_recycling_allocators.hpp} | 35 ++++++----- ...fer_util.hpp => recycling_kokkos_view.hpp} | 54 ++++++++-------- ...util.hpp => sycl_recycling_allocators.hpp} | 12 ++-- tests/allocator_aligned_test.cpp | 18 +++--- tests/allocator_hpx_test.cpp | 22 +++---- tests/allocator_test.cpp | 16 ++--- 11 files changed, 137 insertions(+), 142 deletions(-) rename include/{aligned_buffer_util.hpp => aligned_recycling_allocators.hpp} (85%) rename include/{cuda_buffer_util.hpp => cuda_recycling_allocators.hpp} (96%) rename include/{buffer_manager.hpp => detail/buffer_recycler.hpp} (95%) rename include/{hip_buffer_util.hpp => hip_recycling_allocators.hpp} (96%) rename include/{kokkos_buffer_util.hpp => recycling_kokkos_view.hpp} (76%) rename include/{sycl_buffer_util.hpp => sycl_recycling_allocators.hpp} (93%) diff --git a/include/aggregation_manager.hpp b/include/aggregation_manager.hpp index 2aa92063..cd1ca74b 100644 --- a/include/aggregation_manager.hpp +++ b/include/aggregation_manager.hpp @@ -592,9 +592,9 @@ template class Aggregated_Executor { #endif // Get shiny and new buffer that will be shared between all slices // Buffer might be recycled from previous allocations by the - // buffer_recycler... + // buffer_interface... T *aggregated_buffer = - recycler::detail::buffer_recycler::get( + recycler::detail::buffer_interface::get( size, manage_content_lifetime, location_id, gpu_id); // Create buffer entry for this buffer buffer_allocations.emplace_back(static_cast(aggregated_buffer), @@ -670,7 +670,7 @@ template class Aggregated_Executor { // it as invalid) if (valid) { assert(buffers_in_use == true); - recycler::detail::buffer_recycler::mark_unused( + recycler::detail::buffer_interface::mark_unused( buffer_pointer, buffer_size, location_id, gpu_id); // mark buffer as invalid to prevent any other slice from marking the // buffer as unused diff --git a/include/aligned_buffer_util.hpp b/include/aligned_recycling_allocators.hpp similarity index 85% rename from include/aligned_buffer_util.hpp rename to include/aligned_recycling_allocators.hpp index d36a994a..b1ed5dce 100644 --- a/include/aligned_buffer_util.hpp +++ b/include/aligned_recycling_allocators.hpp @@ -3,13 +3,13 @@ // Distributed under the Boost Software License, Version 1.0. 
(See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) -#ifndef ALIGNED_BUFFER_UTIL_HPP -#define ALIGNED_BUFFER_UTIL_HPP +#ifndef ALIGNED_RECYCLING_ALLOCATORS_HPP +#define ALIGNED_RECYCLING_ALLOCATORS_HPP -#include "buffer_manager.hpp" +#include "detail/buffer_recycler.hpp" #include -namespace recycler { +namespace cppuddle { namespace device_selection { template struct select_device_functor< @@ -26,6 +26,6 @@ template ::value, int> = 0> using aggressive_recycle_aligned = detail::aggressive_recycle_allocator< T, boost::alignment::aligned_allocator>; -} // namespace recycler +} // namespace cppuddle #endif diff --git a/include/cuda_buffer_util.hpp b/include/cuda_recycling_allocators.hpp similarity index 96% rename from include/cuda_buffer_util.hpp rename to include/cuda_recycling_allocators.hpp index 55d3397a..2452a563 100644 --- a/include/cuda_buffer_util.hpp +++ b/include/cuda_recycling_allocators.hpp @@ -1,24 +1,21 @@ -// Copyright (c) 2020-2023 Gregor Daiß +// Copyright (c) 2020-2024 Gregor Daiß // // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) -#ifndef CUDA_BUFFER_UTIL_HPP -#define CUDA_BUFFER_UTIL_HPP +#ifndef CUDA_RECYCLING_ALLOCATORS_HPP +#define CUDA_RECYCLING_ALLOCATORS_HPP -#include "buffer_manager.hpp" +#include "detail/buffer_recycler.hpp" #include "detail/config.hpp" #include #include #include -namespace recycler { - +namespace cppuddle { namespace detail { - - template struct cuda_pinned_allocator { using value_type = T; cuda_pinned_allocator() noexcept = default; @@ -98,9 +95,19 @@ constexpr bool operator!=(cuda_device_allocator const &, cuda_device_allocator const &) noexcept { return false; } +} // end namespace detail -} // end namespace detail +namespace device_selection { +template +struct select_device_functor> { + void operator()(const size_t device_id) { cudaSetDevice(device_id); } +}; +template +struct select_device_functor> { + void operator()(const size_t device_id) { cudaSetDevice(device_id); } +}; +} // namespace device_selection template ::value, int> = 0> using recycle_allocator_cuda_host = @@ -154,17 +161,5 @@ struct cuda_aggregated_device_buffer { Host_Allocator &alloc; // will stay valid for the entire aggregation region and hence // for the entire lifetime of this buffer }; - -namespace device_selection { -template -struct select_device_functor> { - void operator()(const size_t device_id) { cudaSetDevice(device_id); } -}; -template -struct select_device_functor> { - void operator()(const size_t device_id) { cudaSetDevice(device_id); } -}; -} // namespace device_selection - -} // end namespace recycler +} // end namespace cppuddle #endif diff --git a/include/buffer_manager.hpp b/include/detail/buffer_recycler.hpp similarity index 95% rename from include/buffer_manager.hpp rename to include/detail/buffer_recycler.hpp index 92a5f46b..3ad739d8 100644 --- a/include/buffer_manager.hpp +++ b/include/detail/buffer_recycler.hpp @@ -42,9 +42,9 @@ For better performance configure CPPuddle with CPPUDDLE_WITH_HPX_AWARE_ALLOCATOR #endif #endif -#include "../include/detail/config.hpp" +#include "config.hpp" -namespace recycler { +namespace cppuddle { namespace device_selection { template struct select_device_functor { @@ -63,7 +63,7 @@ template struct select_device_functor> { namespace detail { -class buffer_recycler { +class buffer_interface { public: #if defined(CPPUDDLE_DEACTIVATE_BUFFER_RECYCLING) @@ -172,8 +172,8 @@ For better 
performance configure CPPuddle with CPPUDDLE_WITH_BUFFER_RECYCLING=ON private: /// Singleton instance access - static buffer_recycler& instance() { - static buffer_recycler singleton{}; + static buffer_interface& instance() { + static buffer_interface singleton{}; return singleton; } /// Callbacks for printing the performance counter data @@ -189,7 +189,7 @@ For better performance configure CPPuddle with CPPUDDLE_WITH_BUFFER_RECYCLING=ON std::list> partial_cleanup_callbacks; /// default, private constructor - not automatically constructed due to the /// deleted constructors - buffer_recycler() = default; + buffer_interface() = default; mutex_t callback_protection_mut; /// Add a callback function that gets executed upon cleanup and destruction @@ -217,7 +217,7 @@ For better performance configure CPPuddle with CPPUDDLE_WITH_BUFFER_RECYCLING=ON } public: - ~buffer_recycler() = default; + ~buffer_interface() = default; // Subclasses private: @@ -408,7 +408,7 @@ For better performance configure CPPuddle with CPPUDDLE_WITH_BUFFER_RECYCLING=ON // No unused buffer found -> Create new one and return it try { - recycler::device_selection::select_device_functor{}( + cppuddle::device_selection::select_device_functor{}( device_id); Host_Allocator alloc; T *buffer = alloc.allocate(number_of_elements); @@ -428,13 +428,13 @@ For better performance configure CPPuddle with CPPUDDLE_WITH_BUFFER_RECYCLING=ON std::cerr << "Not enough memory left. Cleaning up unused buffers now..." << std::endl; - buffer_recycler::clean_unused_buffers(); + buffer_interface::clean_unused_buffers(); std::cerr << "Buffers cleaned! Try allocation again..." << std::endl; // If there still isn't enough memory left, the caller has to handle it // We've done all we can in here Host_Allocator alloc; - recycler::device_selection::select_device_functor{}( + cppuddle::device_selection::select_device_functor{}( device_id); T *buffer = alloc.allocate(number_of_elements); instance()[location_id].buffer_map.insert( @@ -649,13 +649,13 @@ For better performance configure CPPuddle with CPPUDDLE_WITH_BUFFER_RECYCLING=ON std::call_once(flag, []() { #endif is_finalized = false; - buffer_recycler::add_total_cleanup_callback(clean); - buffer_recycler::add_partial_cleanup_callback( + buffer_interface::add_total_cleanup_callback(clean); + buffer_interface::add_partial_cleanup_callback( clean_unused_buffers_only); - buffer_recycler::add_finalize_callback( + buffer_interface::add_finalize_callback( finalize); #ifdef CPPUDDLE_HAVE_COUNTERS - buffer_recycler::add_print_callback( + buffer_interface::add_print_callback( print_performance_counters); #endif }); @@ -753,10 +753,10 @@ For better performance configure CPPuddle with CPPUDDLE_WITH_BUFFER_RECYCLING=ON public: // Putting deleted constructors in public gives more useful error messages // Bunch of constructors we don't need - buffer_recycler(buffer_recycler const &other) = delete; - buffer_recycler& operator=(buffer_recycler const &other) = delete; - buffer_recycler(buffer_recycler &&other) = delete; - buffer_recycler& operator=(buffer_recycler &&other) = delete; + buffer_interface(buffer_interface const &other) = delete; + buffer_interface& operator=(buffer_interface const &other) = delete; + buffer_interface(buffer_interface &&other) = delete; + buffer_interface& operator=(buffer_interface &&other) = delete; }; template struct recycle_allocator { @@ -775,11 +775,11 @@ template struct recycle_allocator { recycle_allocator const &other) noexcept : dealloc_hint(std::nullopt), device_id(std::nullopt) {} T 
*allocate(std::size_t n) { - T *data = buffer_recycler::get(n); + T *data = buffer_interface::get(n); return data; } void deallocate(T *p, std::size_t n) { - buffer_recycler::mark_unused(p, n); + buffer_interface::mark_unused(p, n); } #else recycle_allocator() noexcept @@ -792,12 +792,12 @@ template struct recycle_allocator { recycle_allocator const &other) noexcept : dealloc_hint(other.dealloc_hint), device_id(other.device_id) {} T *allocate(std::size_t n) { - T *data = buffer_recycler::get( + T *data = buffer_interface::get( n, false, hpx::get_worker_thread_num() % number_instances, device_id); return data; } void deallocate(T *p, std::size_t n) { - buffer_recycler::mark_unused(p, n, dealloc_hint, + buffer_interface::mark_unused(p, n, dealloc_hint, device_id); } #endif @@ -845,12 +845,12 @@ struct aggressive_recycle_allocator { aggressive_recycle_allocator const &) noexcept : dealloc_hint(std::nullopt), device_id(std::nullopt) {} T *allocate(std::size_t n) { - T *data = buffer_recycler::get( + T *data = buffer_interface::get( n, true); // also initializes the buffer if it isn't reused return data; } void deallocate(T *p, std::size_t n) { - buffer_recycler::mark_unused(p, n); + buffer_interface::mark_unused(p, n); } #else aggressive_recycle_allocator() noexcept @@ -863,13 +863,13 @@ struct aggressive_recycle_allocator { recycle_allocator const &other) noexcept : dealloc_hint(other.dealloc_hint), device_id(other.device_id) {} T *allocate(std::size_t n) { - T *data = buffer_recycler::get( + T *data = buffer_interface::get( n, true, dealloc_hint, device_id); // also initializes the buffer // if it isn't reused return data; } void deallocate(T *p, std::size_t n) { - buffer_recycler::mark_unused(p, n, dealloc_hint, + buffer_interface::mark_unused(p, n, dealloc_hint, device_id); } #endif @@ -914,7 +914,6 @@ operator!=(aggressive_recycle_allocator const &, else return true; } - } // namespace detail template ::value, int> = 0> @@ -923,16 +922,16 @@ template ::value, int> = 0> using aggressive_recycle_std = detail::aggressive_recycle_allocator>; -inline void print_performance_counters() { detail::buffer_recycler::print_performance_counters(); } +inline void print_performance_counters() { detail::buffer_interface::print_performance_counters(); } /// Deletes all buffers (even ones still marked as used), delete the buffer /// managers and the recycler itself -inline void force_cleanup() { detail::buffer_recycler::clean_all(); } +inline void force_cleanup() { detail::buffer_interface::clean_all(); } /// Deletes all buffers currently marked as unused -inline void cleanup() { detail::buffer_recycler::clean_unused_buffers(); } +inline void cleanup() { detail::buffer_interface::clean_unused_buffers(); } /// Deletes all buffers (even ones still marked as used), delete the buffer /// managers and the recycler itself. Disallows further usage. 
-inline void finalize() { detail::buffer_recycler::finalize(); } +inline void finalize() { detail::buffer_interface::finalize(); } -} // end namespace recycler +} // end namespace cppuddle #endif diff --git a/include/detail/config.hpp b/include/detail/config.hpp index 2a06b1af..7115c790 100644 --- a/include/detail/config.hpp +++ b/include/detail/config.hpp @@ -28,7 +28,7 @@ For better performance configure CPPuddle with CPPUDDLE_WITH_HPX_AWARE_ALLOCATOR #endif #endif -namespace recycler { +namespace cppuddle { #if defined(CPPUDDLE_HAVE_HPX) && defined(CPPUDDLE_HAVE_HPX_MUTEX) using mutex_t = hpx::spinlock_no_backoff; @@ -67,6 +67,6 @@ inline size_t get_device_id(const size_t number_gpus) { #endif } -} // end namespace recycler +} // end namespace cppuddle #endif diff --git a/include/hip_buffer_util.hpp b/include/hip_recycling_allocators.hpp similarity index 96% rename from include/hip_buffer_util.hpp rename to include/hip_recycling_allocators.hpp index e2364095..465bd5fe 100644 --- a/include/hip_buffer_util.hpp +++ b/include/hip_recycling_allocators.hpp @@ -3,16 +3,17 @@ // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) -#ifndef HIP_BUFFER_UTIL_HPP -#define HIP_BUFFER_UTIL_HPP +#ifndef HIP_RECYCLING_ALLOCATORS_HPP +#define HIP_RECYCLING_ALLOCATORS_HPP -#include "buffer_manager.hpp" +#include "detail/buffer_recycler.hpp" +#include "detail/config.hpp" #include #include #include -namespace recycler { +namespace cppuddle { namespace detail { @@ -103,6 +104,18 @@ constexpr bool operator!=(hip_device_allocator const &, } // end namespace detail + +namespace device_selection { +template +struct select_device_functor> { + void operator()(const size_t device_id) { hipSetDevice(device_id); } +}; +template +struct select_device_functor> { + void operator()(const size_t device_id) { hipSetDevice(device_id); } +}; +} // namespace device_selection + template ::value, int> = 0> using recycle_allocator_hip_host = detail::aggressive_recycle_allocator>; @@ -110,7 +123,6 @@ template ::value, int> = 0> using recycle_allocator_hip_device = detail::recycle_allocator>; -// TODO Is this even required? (cuda version should work fine...) 
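// --------------------------------------------------------------------------
// Usage sketch (illustration only, not a hunk of this patch): after the
// namespace rework, host-side call sites use cppuddle:: exactly as the updated
// tests in this patch do. Minimal example; header path as of this patch
// (later patches in the series rename the header and the cleanup functions):

#include "detail/buffer_recycler.hpp"
#include <vector>

int main() {
  {
    // allocation goes through the recycler and may hand back a reused buffer
    std::vector<double, cppuddle::recycle_std<double>> buf(1000, 0.0);
  } // destruction only marks the buffer as unused; it stays cached for reuse
  cppuddle::print_performance_counters(); // renamed to print_buffer_counters() in patch 2
  cppuddle::force_cleanup();              // really frees all cached buffers and managers
  return 0;
}
// --------------------------------------------------------------------------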
template ::value, int> = 0> struct hip_device_buffer { recycle_allocator_hip_device allocator; @@ -157,16 +169,5 @@ struct hip_aggregated_device_buffer { // for the entire lifetime of this buffer }; -namespace device_selection { -template -struct select_device_functor> { - void operator()(const size_t device_id) { hipSetDevice(device_id); } -}; -template -struct select_device_functor> { - void operator()(const size_t device_id) { hipSetDevice(device_id); } -}; -} // namespace device_selection - -} // end namespace recycler +} // end namespace cppuddle #endif diff --git a/include/kokkos_buffer_util.hpp b/include/recycling_kokkos_view.hpp similarity index 76% rename from include/kokkos_buffer_util.hpp rename to include/recycling_kokkos_view.hpp index 2945b422..d89dc0c4 100644 --- a/include/kokkos_buffer_util.hpp +++ b/include/recycling_kokkos_view.hpp @@ -7,10 +7,10 @@ #define KOKKOS_BUFFER_UTIL_HPP #include #include -#include +#include #include -namespace recycler { +namespace cppuddle { template struct view_deleter { @@ -24,7 +24,7 @@ struct view_deleter { }; template -class aggregated_recycled_view : public kokkos_type { +class aggregated_recycle_view : public kokkos_type { private: alloc_type allocator; size_t total_elements{0}; @@ -34,7 +34,7 @@ class aggregated_recycled_view : public kokkos_type { public: using view_type = kokkos_type; template - explicit aggregated_recycled_view(alloc_type &alloc, Args... args) + explicit aggregated_recycle_view(alloc_type &alloc, Args... args) : kokkos_type( alloc.allocate(kokkos_type::required_allocation_size(args...) / sizeof(element_type)), @@ -45,15 +45,15 @@ class aggregated_recycled_view : public kokkos_type { data_ref_counter(this->data(), view_deleter( alloc, total_elements)) {} - aggregated_recycled_view( - const aggregated_recycled_view &other) + aggregated_recycle_view( + const aggregated_recycle_view &other) : kokkos_type(other), allocator(other.allocator) { data_ref_counter = other.data_ref_counter; total_elements = other.total_elements; } - aggregated_recycled_view & - operator=(const aggregated_recycled_view &other) { + aggregated_recycle_view & + operator=(const aggregated_recycle_view &other) { data_ref_counter = other.data_ref_counter; allocator = other.allocator; kokkos_type::operator=(other); @@ -61,15 +61,15 @@ class aggregated_recycled_view : public kokkos_type { return *this; } - aggregated_recycled_view( - aggregated_recycled_view &&other) noexcept + aggregated_recycle_view( + aggregated_recycle_view &&other) noexcept : kokkos_type(other), allocator(other.allocator) { data_ref_counter = other.data_ref_counter; total_elements = other.total_elements; } - aggregated_recycled_view &operator=( - aggregated_recycled_view &&other) noexcept { + aggregated_recycle_view &operator=( + aggregated_recycle_view &&other) noexcept { data_ref_counter = other.data_ref_counter; allocator = other.allocator; kokkos_type::operator=(other); @@ -77,12 +77,12 @@ class aggregated_recycled_view : public kokkos_type { return *this; } - ~aggregated_recycled_view() {} + ~aggregated_recycle_view() {} }; template -class recycled_view : public kokkos_type { +class recycle_view : public kokkos_type { private: size_t total_elements{0}; std::shared_ptr data_ref_counter; @@ -92,7 +92,7 @@ class recycled_view : public kokkos_type { static_assert(std::is_same_v); template = true> - recycled_view(Args... args) + recycle_view(Args... args) : kokkos_type( alloc_type{}.allocate(kokkos_type::required_allocation_size(args...) 
/ sizeof(element_type)), @@ -104,7 +104,7 @@ class recycled_view : public kokkos_type { template = true> - recycled_view(const size_t device_id, Args... args) + recycle_view(const size_t device_id, Args... args) : kokkos_type( alloc_type{device_id}.allocate(kokkos_type::required_allocation_size(args...) / sizeof(element_type)), @@ -117,7 +117,7 @@ class recycled_view : public kokkos_type { template < typename layout_t, std::enable_if_t::value, bool> = true> - recycled_view(std::size_t device_id, layout_t layout) + recycle_view(std::size_t device_id, layout_t layout) : kokkos_type( alloc_type{device_id}.allocate(kokkos_type::required_allocation_size(layout) / sizeof(element_type)), @@ -127,41 +127,41 @@ class recycled_view : public kokkos_type { data_ref_counter(this->data(), view_deleter( alloc_type{device_id}, total_elements)) {} - recycled_view( - const recycled_view &other) + recycle_view( + const recycle_view &other) : kokkos_type(other) { total_elements = other.total_elements; data_ref_counter = other.data_ref_counter; } - recycled_view & - operator=(const recycled_view &other) { + recycle_view & + operator=(const recycle_view &other) { data_ref_counter = other.data_ref_counter; kokkos_type::operator=(other); total_elements = other.total_elements; return *this; } - recycled_view( - recycled_view &&other) noexcept + recycle_view( + recycle_view &&other) noexcept : kokkos_type(other) { data_ref_counter = other.data_ref_counter; total_elements = other.total_elements; } - recycled_view &operator=( - recycled_view &&other) noexcept { + recycle_view &operator=( + recycle_view &&other) noexcept { data_ref_counter = other.data_ref_counter; kokkos_type::operator=(other); total_elements = other.total_elements; return *this; } - ~recycled_view() { } + ~recycle_view() { } }; -} // end namespace recycler +} // end namespace cppuddle #endif diff --git a/include/sycl_buffer_util.hpp b/include/sycl_recycling_allocators.hpp similarity index 93% rename from include/sycl_buffer_util.hpp rename to include/sycl_recycling_allocators.hpp index 61d22f8f..63511de5 100644 --- a/include/sycl_buffer_util.hpp +++ b/include/sycl_recycling_allocators.hpp @@ -3,17 +3,17 @@ // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) -#ifndef SYCL_BUFFER_UTIL_HPP -#define SYCL_BUFFER_UTIL_HPP +#ifndef SYCL_RECYCLING_ALLOCATORS_HPP +#define SYCL_RECYCLING_ALLOCATORS_HPP -#include "buffer_manager.hpp" +#include "detail/buffer_recycler.hpp" +#include "detail/config.hpp" #include #include #include -namespace recycler { - +namespace cppuddle { namespace detail { static_assert(max_number_gpus == 1, "CPPuddle currently does not support MultiGPU SYCL builds!"); @@ -79,5 +79,5 @@ template ::value, int> = 0> using recycle_allocator_sycl_device = detail::recycle_allocator>; -} // end namespace recycler +} // end namespace cppuddle #endif diff --git a/tests/allocator_aligned_test.cpp b/tests/allocator_aligned_test.cpp index c3c09217..1bc8cf53 100644 --- a/tests/allocator_aligned_test.cpp +++ b/tests/allocator_aligned_test.cpp @@ -3,8 +3,8 @@ // Distributed under the Boost Software License, Version 1.0. 
(See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) -#include "../include/buffer_manager.hpp" -#include "../include/aligned_buffer_util.hpp" +#include "../include/detail/buffer_recycler.hpp" +#include "../include/aligned_recycling_allocators.hpp" #ifdef CPPUDDLE_HAVE_HPX #include #endif @@ -79,7 +79,7 @@ int main(int argc, char *argv[]) { << std::endl; for (size_t pass = 0; pass < passes; pass++) { auto begin = std::chrono::high_resolution_clock::now(); - std::vector> + std::vector> test1(array_size, double{}); auto end = std::chrono::high_resolution_clock::now(); aggressive_duration += @@ -92,8 +92,8 @@ int main(int argc, char *argv[]) { std::cout << "\n==> Aggressive recycle allocation test took " << aggressive_duration << "ms" << std::endl; } - recycler::print_performance_counters(); - recycler::force_cleanup(); // Cleanup all buffers and the managers for better + cppuddle::print_performance_counters(); + cppuddle::force_cleanup(); // Cleanup all buffers and the managers for better // comparison // Recycle Test: @@ -101,7 +101,7 @@ int main(int argc, char *argv[]) { std::cout << "\nStarting run with recycle allocator: " << std::endl; for (size_t pass = 0; pass < passes; pass++) { auto begin = std::chrono::high_resolution_clock::now(); - std::vector> test1( + std::vector> test1( array_size, double{}); auto end = std::chrono::high_resolution_clock::now(); recycle_duration += @@ -114,8 +114,8 @@ int main(int argc, char *argv[]) { std::cout << "\n\n==> Recycle allocation test took " << recycle_duration << "ms" << std::endl; } - recycler::print_performance_counters(); - recycler::force_cleanup(); // Cleanup all buffers and the managers for better + cppuddle::print_performance_counters(); + cppuddle::force_cleanup(); // Cleanup all buffers and the managers for better // comparison // Same test using std::allocator: @@ -146,7 +146,7 @@ int main(int argc, char *argv[]) { std::cout << "Test information: Aggressive recycler was faster than default allocator!" 
<< std::endl; } - recycler::print_performance_counters(); + cppuddle::print_performance_counters(); #ifdef CPPUDDLE_HAVE_HPX return hpx::finalize(); #else diff --git a/tests/allocator_hpx_test.cpp b/tests/allocator_hpx_test.cpp index 9d8cc44b..d0e632f0 100644 --- a/tests/allocator_hpx_test.cpp +++ b/tests/allocator_hpx_test.cpp @@ -15,7 +15,7 @@ #include -#include "../include/buffer_manager.hpp" +#include "../include/detail/buffer_recycler.hpp" int hpx_main(int argc, char *argv[]) { @@ -112,7 +112,7 @@ int hpx_main(int argc, char *argv[]) { for (size_t pass = 0; pass < passes; pass++) { for (size_t i = 0; i < number_futures; i++) { futs[i] = futs[i].then([&](hpx::shared_future &&predecessor) { - std::vector> test6(array_size, + std::vector> test6(array_size, double{}); }); } @@ -126,20 +126,20 @@ int hpx_main(int argc, char *argv[]) { std::cout << "\n==> Recycle allocation test took " << recycle_duration << "ms" << std::endl; } - recycler::print_performance_counters(); - recycler::force_cleanup(); // Cleanup all buffers and the managers for better + cppuddle::print_performance_counters(); + cppuddle::force_cleanup(); // Cleanup all buffers and the managers for better // comparison // ensure that at least 4 buffers have to created for unit testing { - std::vector> buffer1( + std::vector> buffer1( array_size, double{}); - std::vector> buffer2( + std::vector> buffer2( array_size, double{}); - std::vector> buffer3( + std::vector> buffer3( array_size, double{}); - std::vector> buffer4( + std::vector> buffer4( array_size, double{}); } @@ -153,7 +153,7 @@ int hpx_main(int argc, char *argv[]) { for (size_t pass = 0; pass < passes; pass++) { for (size_t i = 0; i < number_futures; i++) { futs[i] = futs[i].then([&](hpx::shared_future &&predecessor) { - std::vector> test6( + std::vector> test6( array_size, double{}); }); } @@ -167,8 +167,8 @@ int hpx_main(int argc, char *argv[]) { std::cout << "\n==> Aggressive recycle allocation test took " << aggressive_duration << "ms" << std::endl; } - recycler::print_performance_counters(); - recycler::force_cleanup(); // Cleanup all buffers and the managers for better + cppuddle::print_performance_counters(); + cppuddle::force_cleanup(); // Cleanup all buffers and the managers for better // comparison diff --git a/tests/allocator_test.cpp b/tests/allocator_test.cpp index 004368a4..54fb5dee 100644 --- a/tests/allocator_test.cpp +++ b/tests/allocator_test.cpp @@ -3,7 +3,7 @@ // Distributed under the Boost Software License, Version 1.0. 
(See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) -#include "../include/buffer_manager.hpp" +#include "../include/detail/buffer_recycler.hpp" #ifdef CPPUDDLE_HAVE_HPX #include #endif @@ -76,7 +76,7 @@ int main(int argc, char *argv[]) { std::cout << "\nStarting run with aggressive recycle allocator: " << std::endl; for (size_t pass = 0; pass < passes; pass++) { auto begin = std::chrono::high_resolution_clock::now(); - std::vector> test1( + std::vector> test1( array_size, double{}); auto end = std::chrono::high_resolution_clock::now(); aggressive_duration += @@ -88,8 +88,8 @@ int main(int argc, char *argv[]) { std::cout << "\n\n==> Aggressive recycle allocation test took " << aggressive_duration << "ms" << std::endl; } - recycler::print_performance_counters(); - recycler::force_cleanup(); // Cleanup all buffers and the managers for better + cppuddle::print_performance_counters(); + cppuddle::force_cleanup(); // Cleanup all buffers and the managers for better // comparison // Recycle Test: @@ -97,7 +97,7 @@ int main(int argc, char *argv[]) { std::cout << "\nStarting run with recycle allocator: " << std::endl; for (size_t pass = 0; pass < passes; pass++) { auto begin = std::chrono::high_resolution_clock::now(); - std::vector> test1(array_size, double{}); + std::vector> test1(array_size, double{}); auto end = std::chrono::high_resolution_clock::now(); recycle_duration += std::chrono::duration_cast(end - begin) @@ -108,8 +108,8 @@ int main(int argc, char *argv[]) { std::cout << "\n\n==> Recycle allocation test took " << recycle_duration << "ms" << std::endl; } - recycler::print_performance_counters(); - recycler::force_cleanup(); // Cleanup all buffers and the managers for better + cppuddle::print_performance_counters(); + cppuddle::force_cleanup(); // Cleanup all buffers and the managers for better // comparison // Same test using std::allocator: @@ -138,7 +138,7 @@ int main(int argc, char *argv[]) { std::cout << "Test information: Aggressive recycler was faster than default allocator!" 
<< std::endl; } - recycler::print_performance_counters(); + cppuddle::print_performance_counters(); #ifdef CPPUDDLE_HAVE_HPX return hpx::finalize(); #else From d9ae1f82cb30460aa941f9e2bd98c379a1867487 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20Dai=C3=9F?= Date: Wed, 6 Mar 2024 18:09:50 +0100 Subject: [PATCH 02/19] Rework allocator namespace 2 --- include/aligned_recycling_allocators.hpp | 7 ++- include/cuda_recycling_allocators.hpp | 13 +++-- ...fer_recycler.hpp => buffer_management.hpp} | 50 ++++++++----------- include/hip_recycling_allocators.hpp | 14 ++++-- include/recycling_kokkos_view.hpp | 4 +- include/sycl_recycling_allocators.hpp | 14 ++++-- tests/allocator_aligned_test.cpp | 11 ++-- tests/allocator_hpx_test.cpp | 10 ++-- ...llocator_kokkos_executor_for_loop_test.cpp | 20 ++++---- tests/allocator_kokkos_test.cpp | 10 ++-- tests/allocator_test.cpp | 17 ++++--- 11 files changed, 93 insertions(+), 77 deletions(-) rename include/detail/{buffer_recycler.hpp => buffer_management.hpp} (96%) diff --git a/include/aligned_recycling_allocators.hpp b/include/aligned_recycling_allocators.hpp index b1ed5dce..039a19f2 100644 --- a/include/aligned_recycling_allocators.hpp +++ b/include/aligned_recycling_allocators.hpp @@ -6,22 +6,27 @@ #ifndef ALIGNED_RECYCLING_ALLOCATORS_HPP #define ALIGNED_RECYCLING_ALLOCATORS_HPP -#include "detail/buffer_recycler.hpp" #include +#include "buffer_management_interface.hpp" namespace cppuddle { namespace device_selection { template +/// Dummy GPU selector. Needs to be defined for MultiGPU builds as the default / +/// select_device_functor does not compile for > 1 GPU (to make sure all / +/// relevant allocators support multigpu) struct select_device_functor< T, boost::alignment::aligned_allocator> { void operator()(const size_t device_id) {} }; } // namespace device_selection +/// Recycling allocator for boost aligned memory template ::value, int> = 0> using recycle_aligned = detail::recycle_allocator< T, boost::alignment::aligned_allocator>; +/// Recycling allocator for boost aligned memory (reusing previous content as well) template ::value, int> = 0> using aggressive_recycle_aligned = detail::aggressive_recycle_allocator< diff --git a/include/cuda_recycling_allocators.hpp b/include/cuda_recycling_allocators.hpp index 2452a563..b4cf8efb 100644 --- a/include/cuda_recycling_allocators.hpp +++ b/include/cuda_recycling_allocators.hpp @@ -6,16 +6,16 @@ #ifndef CUDA_RECYCLING_ALLOCATORS_HPP #define CUDA_RECYCLING_ALLOCATORS_HPP -#include "detail/buffer_recycler.hpp" -#include "detail/config.hpp" - #include #include #include +#include "buffer_management_interface.hpp" + namespace cppuddle { namespace detail { +/// Underlying host allocator for CUDA pinned memory template struct cuda_pinned_allocator { using value_type = T; cuda_pinned_allocator() noexcept = default; @@ -57,6 +57,7 @@ constexpr bool operator!=(cuda_pinned_allocator const &, return false; } +/// Underlying allocator for CUDA device memory template struct cuda_device_allocator { using value_type = T; cuda_device_allocator() noexcept = default; @@ -99,23 +100,28 @@ constexpr bool operator!=(cuda_device_allocator const &, namespace device_selection { +/// GPU device selector using the CUDA API for pinned host allocations template struct select_device_functor> { void operator()(const size_t device_id) { cudaSetDevice(device_id); } }; +/// GPU selector using the CUDA API for pinned host allocations template struct select_device_functor> { void operator()(const size_t device_id) { 
cudaSetDevice(device_id); } }; } // namespace device_selection +/// Recycling allocator for CUDA pinned host memory template ::value, int> = 0> using recycle_allocator_cuda_host = detail::aggressive_recycle_allocator>; +/// Recycling allocator for CUDA device memory template ::value, int> = 0> using recycle_allocator_cuda_device = detail::recycle_allocator>; +/// RAII wrapper for CUDA device memory template ::value, int> = 0> struct cuda_device_buffer { recycle_allocator_cuda_device allocator; @@ -139,6 +145,7 @@ struct cuda_device_buffer { }; +/// RAII wrapper for CUDA device memory using a passed aggregated allocator template ::value, int> = 0> struct cuda_aggregated_device_buffer { T *device_side_buffer; diff --git a/include/detail/buffer_recycler.hpp b/include/detail/buffer_management.hpp similarity index 96% rename from include/detail/buffer_recycler.hpp rename to include/detail/buffer_management.hpp index 3ad739d8..5d640983 100644 --- a/include/detail/buffer_recycler.hpp +++ b/include/detail/buffer_management.hpp @@ -3,8 +3,8 @@ // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) -#ifndef BUFFER_MANAGER_HPP -#define BUFFER_MANAGER_HPP +#ifndef BUFFER_MANAGEMENT_HPP +#define BUFFER_MANAGEMENT_HPP #include #include @@ -47,6 +47,10 @@ For better performance configure CPPuddle with CPPUDDLE_WITH_HPX_AWARE_ALLOCATOR namespace cppuddle { namespace device_selection { +/// Default device selector - No MultGPU support +/** Throws a runtime error if max_number_gpus > 1 (defined by cmake variable + * CPPUDDLE_WITH_MAX_NUMBER_GPUS). Needs to be specialized for an allocator to + * provide MultiGPU support (see CPPuddle CUDA/HIP allocators for examples) **/ template struct select_device_functor { void operator()(const size_t device_id) { if constexpr (max_number_gpus > 1) @@ -55,14 +59,11 @@ template struct select_device_functor { "(by having a select_device_functor overload"); } }; -template struct select_device_functor> { - void operator()(const size_t device_id) {} -}; } // namespace device_selection namespace detail { - +/// Singleton interface to all buffer_managers class buffer_interface { public: #if defined(CPPUDDLE_DEACTIVATE_BUFFER_RECYCLING) @@ -87,12 +88,14 @@ For better performance configure CPPuddle with CPPUDDLE_WITH_BUFFER_RECYCLING=ON return Host_Allocator{}.deallocate(p, number_elements); } #else - /// Returns and allocated buffer of the requested size - this may be a reused - /// buffer + /// Primary method to allocate a buffer with CPPuddle: Returns and allocated / + /// buffer of the requested size - this may be a reused buffer. The method + /// figures out the correct buffer_manager and gets such a buffer from it. + /// Should be called from an allocator implementation, not directly template static T *get(size_t number_elements, bool manage_content_lifetime = false, - std::optional location_hint = std::nullopt, - std::optional device_id = std::nullopt) { + std::optional location_hint = std::nullopt, + std::optional device_id = std::nullopt) { try { return buffer_manager::get( number_elements, manage_content_lifetime, location_hint, device_id); @@ -102,11 +105,14 @@ For better performance configure CPPuddle with CPPUDDLE_WITH_BUFFER_RECYCLING=ON throw; } } - /// Marks an buffer as unused and fit for reusage + /// Primary method to deallocate a buffer with CPPuddle:Marks an buffer as / + /// unused and fit for reusage. 
The method figures out the correct buffer + /// manager and marks the buffer there. Should be called from an allocator + /// implementation, not directly template static void mark_unused(T *p, size_t number_elements, - std::optional location_hint = std::nullopt, - std::optional device_id = std::nullopt) { + std::optional location_hint = std::nullopt, + std::optional device_id = std::nullopt) { try { return buffer_manager::mark_unused(p, number_elements, location_hint, device_id); @@ -117,6 +123,7 @@ For better performance configure CPPuddle with CPPUDDLE_WITH_BUFFER_RECYCLING=ON } } #endif + /// Register all CPPuddle counters as HPX performance counters template static void register_allocator_counters_with_hpx(void) { #ifdef CPPUDDLE_HAVE_COUNTERS @@ -915,23 +922,6 @@ operator!=(aggressive_recycle_allocator const &, return true; } } // namespace detail - -template ::value, int> = 0> -using recycle_std = detail::recycle_allocator>; -template ::value, int> = 0> -using aggressive_recycle_std = - detail::aggressive_recycle_allocator>; - -inline void print_performance_counters() { detail::buffer_interface::print_performance_counters(); } -/// Deletes all buffers (even ones still marked as used), delete the buffer -/// managers and the recycler itself -inline void force_cleanup() { detail::buffer_interface::clean_all(); } -/// Deletes all buffers currently marked as unused -inline void cleanup() { detail::buffer_interface::clean_unused_buffers(); } -/// Deletes all buffers (even ones still marked as used), delete the buffer -/// managers and the recycler itself. Disallows further usage. -inline void finalize() { detail::buffer_interface::finalize(); } - } // end namespace cppuddle #endif diff --git a/include/hip_recycling_allocators.hpp b/include/hip_recycling_allocators.hpp index 465bd5fe..f540b544 100644 --- a/include/hip_recycling_allocators.hpp +++ b/include/hip_recycling_allocators.hpp @@ -6,17 +6,16 @@ #ifndef HIP_RECYCLING_ALLOCATORS_HPP #define HIP_RECYCLING_ALLOCATORS_HPP -#include "detail/buffer_recycler.hpp" -#include "detail/config.hpp" - #include #include #include -namespace cppuddle { +#include "buffer_management_interface.hpp" +namespace cppuddle { namespace detail { +/// Underlying host allocator for HIP pinned memory template struct hip_pinned_allocator { using value_type = T; hip_pinned_allocator() noexcept = default; @@ -63,6 +62,7 @@ constexpr bool operator!=(hip_pinned_allocator const &, return false; } +/// Underlying allocator for HIP device memory template struct hip_device_allocator { using value_type = T; hip_device_allocator() noexcept = default; @@ -106,23 +106,28 @@ constexpr bool operator!=(hip_device_allocator const &, namespace device_selection { +/// GPU device selector using the HIP API for pinned host allocations template struct select_device_functor> { void operator()(const size_t device_id) { hipSetDevice(device_id); } }; +/// GPU selector using the HIP API for pinned host allocations template struct select_device_functor> { void operator()(const size_t device_id) { hipSetDevice(device_id); } }; } // namespace device_selection +/// Recycling allocator for HIP pinned host memory template ::value, int> = 0> using recycle_allocator_hip_host = detail::aggressive_recycle_allocator>; +/// Recycling allocator for HIP device memory template ::value, int> = 0> using recycle_allocator_hip_device = detail::recycle_allocator>; +/// RAII wrapper for HIP device memory template ::value, int> = 0> struct hip_device_buffer { recycle_allocator_hip_device allocator; @@ -146,6 
+151,7 @@ struct hip_device_buffer { }; +/// RAII wrapper for CUDA device memory using a passed aggregated allocator template ::value, int> = 0> struct hip_aggregated_device_buffer { T *device_side_buffer; diff --git a/include/recycling_kokkos_view.hpp b/include/recycling_kokkos_view.hpp index d89dc0c4..c55d3738 100644 --- a/include/recycling_kokkos_view.hpp +++ b/include/recycling_kokkos_view.hpp @@ -7,9 +7,11 @@ #define KOKKOS_BUFFER_UTIL_HPP #include #include -#include #include +#include "buffer_management_interface.hpp" + + namespace cppuddle { template diff --git a/include/sycl_recycling_allocators.hpp b/include/sycl_recycling_allocators.hpp index 63511de5..66ba1fb8 100644 --- a/include/sycl_recycling_allocators.hpp +++ b/include/sycl_recycling_allocators.hpp @@ -6,18 +6,21 @@ #ifndef SYCL_RECYCLING_ALLOCATORS_HPP #define SYCL_RECYCLING_ALLOCATORS_HPP -#include "detail/buffer_recycler.hpp" -#include "detail/config.hpp" - #include #include #include +#include "buffer_management_interface.hpp" + namespace cppuddle { -namespace detail { +namespace device_selection { +// No MutliGPU support yet, hence no select_device_function required static_assert(max_number_gpus == 1, "CPPuddle currently does not support MultiGPU SYCL builds!"); +} // namespace device_selection +namespace detail { +/// Underlying host allocator for SYCL pinned memory (using the sycl::default_selector{}) template struct sycl_host_default_allocator { using value_type = T; sycl_host_default_allocator() noexcept = default; @@ -44,6 +47,7 @@ constexpr bool operator!=(sycl_host_default_allocator const &, return false; } +/// Underlying allocator for SYCL device memory (using the sycl::default_selector{}) template struct sycl_device_default_allocator { using value_type = T; sycl_device_default_allocator() noexcept = default; @@ -72,9 +76,11 @@ constexpr bool operator!=(sycl_device_default_allocator const &, } // end namespace detail +/// Recycling allocator for SYCL pinned host memory (default device) template ::value, int> = 0> using recycle_allocator_sycl_host = detail::aggressive_recycle_allocator>; +/// Recycling allocator for SYCL device memory (default device) template ::value, int> = 0> using recycle_allocator_sycl_device = detail::recycle_allocator>; diff --git a/tests/allocator_aligned_test.cpp b/tests/allocator_aligned_test.cpp index 1bc8cf53..65d1df64 100644 --- a/tests/allocator_aligned_test.cpp +++ b/tests/allocator_aligned_test.cpp @@ -3,7 +3,6 @@ // Distributed under the Boost Software License, Version 1.0. 
(See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) -#include "../include/detail/buffer_recycler.hpp" #include "../include/aligned_recycling_allocators.hpp" #ifdef CPPUDDLE_HAVE_HPX #include @@ -92,8 +91,8 @@ int main(int argc, char *argv[]) { std::cout << "\n==> Aggressive recycle allocation test took " << aggressive_duration << "ms" << std::endl; } - cppuddle::print_performance_counters(); - cppuddle::force_cleanup(); // Cleanup all buffers and the managers for better + cppuddle::print_buffer_counters(); + cppuddle::force_buffer_cleanup(); // Cleanup all buffers and the managers for better // comparison // Recycle Test: @@ -114,8 +113,8 @@ int main(int argc, char *argv[]) { std::cout << "\n\n==> Recycle allocation test took " << recycle_duration << "ms" << std::endl; } - cppuddle::print_performance_counters(); - cppuddle::force_cleanup(); // Cleanup all buffers and the managers for better + cppuddle::print_buffer_counters(); + cppuddle::force_buffer_cleanup(); // Cleanup all buffers and the managers for better // comparison // Same test using std::allocator: @@ -146,7 +145,7 @@ int main(int argc, char *argv[]) { std::cout << "Test information: Aggressive recycler was faster than default allocator!" << std::endl; } - cppuddle::print_performance_counters(); + cppuddle::print_buffer_counters(); #ifdef CPPUDDLE_HAVE_HPX return hpx::finalize(); #else diff --git a/tests/allocator_hpx_test.cpp b/tests/allocator_hpx_test.cpp index d0e632f0..4af0878c 100644 --- a/tests/allocator_hpx_test.cpp +++ b/tests/allocator_hpx_test.cpp @@ -15,7 +15,7 @@ #include -#include "../include/detail/buffer_recycler.hpp" +#include "std_recycling_allocators.hpp" int hpx_main(int argc, char *argv[]) { @@ -126,8 +126,8 @@ int hpx_main(int argc, char *argv[]) { std::cout << "\n==> Recycle allocation test took " << recycle_duration << "ms" << std::endl; } - cppuddle::print_performance_counters(); - cppuddle::force_cleanup(); // Cleanup all buffers and the managers for better + cppuddle::print_buffer_counters(); + cppuddle::force_buffer_cleanup(); // Cleanup all buffers and the managers for better // comparison @@ -167,8 +167,8 @@ int hpx_main(int argc, char *argv[]) { std::cout << "\n==> Aggressive recycle allocation test took " << aggressive_duration << "ms" << std::endl; } - cppuddle::print_performance_counters(); - cppuddle::force_cleanup(); // Cleanup all buffers and the managers for better + cppuddle::print_buffer_counters(); + cppuddle::force_buffer_cleanup(); // Cleanup all buffers and the managers for better // comparison diff --git a/tests/allocator_kokkos_executor_for_loop_test.cpp b/tests/allocator_kokkos_executor_for_loop_test.cpp index 7708fe56..47fc83f4 100644 --- a/tests/allocator_kokkos_executor_for_loop_test.cpp +++ b/tests/allocator_kokkos_executor_for_loop_test.cpp @@ -18,12 +18,12 @@ #include #include -#include "../include/buffer_manager.hpp" -#include "../include/cuda_buffer_util.hpp" -#include "../include/kokkos_buffer_util.hpp" #include #include +#include "cuda_recycling_allocators.hpp" +#include "recycling_kokkos_view.hpp" + // Assert during Release builds as well for this file: #undef NDEBUG #include // reinclude the header to update the definition of assert() @@ -37,7 +37,7 @@ using kokkos_um_array = Kokkos::View; template using recycled_host_view = - recycler::recycled_view, recycler::recycle_std, T>; + cppuddle::recycled_view, cppuddle::recycle_std, T>; // Device views using recycle allocators @@ -46,8 +46,8 @@ using kokkos_um_device_array = Kokkos::View; 
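// Sketch (illustration only, not a hunk of this patch): with the view aliases
// defined above and just below, recycled views are constructed like ordinary
// Kokkos views; their memory is drawn from, and returned to, the CPPuddle
// recycler once the last copy of the view goes out of scope, e.g.:
//   recycled_host_view<float>   hostView(512, 512);   // rank-2 view, buffer may be reused
//   recycled_device_view<float> deviceView(512, 512); // device buffer via recycle_allocator_cuda_device
//   Kokkos::deep_copy(deviceView, hostView);          // then use them like any other Kokkos::View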
template using recycled_device_view = - recycler::recycled_view, - recycler::recycle_allocator_cuda_device, T>; + cppuddle::recycled_view, + cppuddle::recycle_allocator_cuda_device, T>; // Host views using pinned memory recycle allocators template @@ -56,8 +56,8 @@ using kokkos_um_pinned_array = Kokkos::CudaHostPinnedSpace, Kokkos::MemoryUnmanaged>; template using recycled_pinned_view = - recycler::recycled_view, - recycler::recycle_allocator_cuda_host, T>; + cppuddle::recycled_view, + cppuddle::recycle_allocator_cuda_host, T>; template auto get_iteration_policy(const Executor &&executor, @@ -143,11 +143,11 @@ int main(int argc, char *argv[]) { // otherwise the HPX cuda polling futures won't work hpx::cuda::experimental::detail::unregister_polling(hpx::resource::get_thread_pool(0)); - recycler::print_performance_counters(); + cppuddle::print_buffer_counters(); // Cleanup all cuda views // (otherwise the cuda driver might shut down before this gets done automatically at // the end of the programm) - recycler::force_cleanup(); + cppuddle::force_buffer_cleanup(); return hpx::finalize(); } diff --git a/tests/allocator_kokkos_test.cpp b/tests/allocator_kokkos_test.cpp index e2770458..c8045d3e 100644 --- a/tests/allocator_kokkos_test.cpp +++ b/tests/allocator_kokkos_test.cpp @@ -13,9 +13,6 @@ #include #include -#include "../include/buffer_manager.hpp" -#include "../include/cuda_buffer_util.hpp" -#include "../include/kokkos_buffer_util.hpp" #ifdef CPPUDDLE_HAVE_HPX #include #endif @@ -24,6 +21,9 @@ #include #include +#include "cuda_recycling_allocators.hpp" +#include "recycling_kokkos_view.hpp" + using kokkos_array = Kokkos::View; @@ -33,7 +33,7 @@ using kokkos_um_array = Kokkos::View; template using recycled_host_view = - recycler::recycled_view, recycler::recycle_std, T>; + cppuddle::recycled_view, cppuddle::recycle_std, T>; #ifdef CPPUDDLE_HAVE_HPX int hpx_main(int argc, char *argv[]) { @@ -91,7 +91,7 @@ int main(int argc, char *argv[]) { }); Kokkos::fence(); } - recycler::print_performance_counters(); + cppuddle::print_buffer_counters(); #ifdef CPPUDDLE_HAVE_HPX return hpx::finalize(); #else diff --git a/tests/allocator_test.cpp b/tests/allocator_test.cpp index 54fb5dee..8fc7c5bb 100644 --- a/tests/allocator_test.cpp +++ b/tests/allocator_test.cpp @@ -3,7 +3,6 @@ // Distributed under the Boost Software License, Version 1.0. 
(See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) -#include "../include/detail/buffer_recycler.hpp" #ifdef CPPUDDLE_HAVE_HPX #include #endif @@ -17,6 +16,8 @@ #include #include +#include "std_recycling_allocators.hpp" + #ifdef CPPUDDLE_HAVE_HPX int hpx_main(int argc, char *argv[]) { #else @@ -88,9 +89,9 @@ int main(int argc, char *argv[]) { std::cout << "\n\n==> Aggressive recycle allocation test took " << aggressive_duration << "ms" << std::endl; } - cppuddle::print_performance_counters(); - cppuddle::force_cleanup(); // Cleanup all buffers and the managers for better - // comparison + cppuddle::print_buffer_counters(); + cppuddle::force_buffer_cleanup(); // Cleanup all buffers and the managers for + // better comparison // Recycle Test: { @@ -108,9 +109,9 @@ int main(int argc, char *argv[]) { std::cout << "\n\n==> Recycle allocation test took " << recycle_duration << "ms" << std::endl; } - cppuddle::print_performance_counters(); - cppuddle::force_cleanup(); // Cleanup all buffers and the managers for better - // comparison + cppuddle::print_buffer_counters(); + cppuddle::force_buffer_cleanup(); // Cleanup all buffers and the managers for + // better comparison // Same test using std::allocator: { @@ -138,7 +139,7 @@ int main(int argc, char *argv[]) { std::cout << "Test information: Aggressive recycler was faster than default allocator!" << std::endl; } - cppuddle::print_performance_counters(); + cppuddle::print_buffer_counters(); #ifdef CPPUDDLE_HAVE_HPX return hpx::finalize(); #else From 0857471e17b5b6ab0cf94c762fa2c6ee690a8dc7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20Dai=C3=9F?= Date: Wed, 6 Mar 2024 18:10:07 +0100 Subject: [PATCH 03/19] Add compatibility with old interface Only comes with deprecation warnings... --- include/aligned_buffer_util.hpp | 24 ++++++++++++++ include/buffer_management_interface.hpp | 21 +++++++++++++ include/buffer_manager.hpp | 24 ++++++++++++++ include/cuda_buffer_util.hpp | 42 +++++++++++++++++++++++++ include/hip_buffer_util.hpp | 40 +++++++++++++++++++++++ include/kokkos_buffer_util.hpp | 21 +++++++++++++ include/std_recycling_allocators.hpp | 32 +++++++++++++++++++ include/sycl_buffer_util.hpp | 33 +++++++++++++++++++ 8 files changed, 237 insertions(+) create mode 100644 include/aligned_buffer_util.hpp create mode 100644 include/buffer_management_interface.hpp create mode 100644 include/buffer_manager.hpp create mode 100644 include/cuda_buffer_util.hpp create mode 100644 include/hip_buffer_util.hpp create mode 100644 include/kokkos_buffer_util.hpp create mode 100644 include/std_recycling_allocators.hpp create mode 100644 include/sycl_buffer_util.hpp diff --git a/include/aligned_buffer_util.hpp b/include/aligned_buffer_util.hpp new file mode 100644 index 00000000..4c409521 --- /dev/null +++ b/include/aligned_buffer_util.hpp @@ -0,0 +1,24 @@ +// Copyright (c) 2020-2021 Gregor Daiß +// +// Distributed under the Boost Software License, Version 1.0. 
(See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#ifndef ALIGNED_BUFFER_UTIL_HPP +#define ALIGNED_BUFFER_UTIL_HPP + +#include "aligned_recycling_allocators.hpp" + +namespace recycler { + +[[deprecated("Use from header aligned_recycling_allocators.hpp instead")]] +template ::value, int> = 0> +using recycle_aligned = cppuddle::recycle_aligned; +[[deprecated("Use from header aligned_recycling_allocators.hpp instead")]] +template ::value, int> = 0> +using aggressive_recycle_aligned = cppuddle::aggressive_recycle_aligned; + +} // namespace recycler + +#endif diff --git a/include/buffer_management_interface.hpp b/include/buffer_management_interface.hpp new file mode 100644 index 00000000..b38fc84c --- /dev/null +++ b/include/buffer_management_interface.hpp @@ -0,0 +1,21 @@ +#ifndef BUFFER_MANAGEMENT_INTERFACE_HPP +#define BUFFER_MANAGEMENT_INTERFACE_HPP + +#include "detail/buffer_management.hpp" + +namespace cppuddle { + +/// Print performance counters of all buffer managers to stdout +inline void print_buffer_counters() { detail::buffer_interface::print_performance_counters(); } +/// Deletes all buffers (even ones still marked as used), delete the buffer +/// managers and the recycler itself +inline void force_buffer_cleanup() { detail::buffer_interface::clean_all(); } +/// Deletes all buffers currently marked as unused +inline void unused_buffer_cleanup() { detail::buffer_interface::clean_unused_buffers(); } +/// Deletes all buffers (even ones still marked as used), delete the buffer +/// managers and the recycler itself. Disallows further usage. +inline void finalize() { detail::buffer_interface::finalize(); } + +} // end namespace cppuddle + +#endif diff --git a/include/buffer_manager.hpp b/include/buffer_manager.hpp new file mode 100644 index 00000000..f84c4259 --- /dev/null +++ b/include/buffer_manager.hpp @@ -0,0 +1,24 @@ +#ifndef BUFFER_MANAGER_INTERFACE_HPP +#define BUFFER_MANAGER_HPP + +#include "buffer_management_interface.hpp" + +namespace recycler { + +[[deprecated("Use cppuddle::print_buffer_counters() instead")]] +inline void print_performance_counters() { cppuddle::print_buffer_counters(); } +/// Deletes all buffers (even ones still marked as used), delete the buffer +/// managers and the recycler itself +[[deprecated("Use cppuddle::force_buffer_cleanup() instead")]] +inline void force_cleanup() { cppuddle::force_buffer_cleanup(); } +/// Deletes all buffers currently marked as unused +[[deprecated("Use cppuddle::unused_buffer_cleanup() instead")]] +inline void cleanup() { cppuddle::unused_buffer_cleanup(); } +/// Deletes all buffers (even ones still marked as used), delete the buffer +/// managers and the recycler itself. Disallows further usage. +[[deprecated("Use cppuddle::finalize() instead")]] +inline void finalize() { detail::buffer_interface::finalize(); } + +} // end namespace cppuddle + +#endif diff --git a/include/cuda_buffer_util.hpp b/include/cuda_buffer_util.hpp new file mode 100644 index 00000000..f0db9a7b --- /dev/null +++ b/include/cuda_buffer_util.hpp @@ -0,0 +1,42 @@ +// Copyright (c) 2020-2023 Gregor Daiß +// +// Distributed under the Boost Software License, Version 1.0. 
(See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#ifndef CUDA_BUFFER_UTIL_HPP +#define CUDA_BUFFER_UTIL_HPP + +#include "cuda_recycling_allocators.hpp" +namespace recycler { + +namespace detail { + +[[deprecated("Use from header cuda_recycling_allocators.hpp instead")]] +template +using cuda_pinned_allocator = cppuddle::detail::cuda_pinned_allocator; + +[[deprecated("Use from header cuda_recycling_allocators.hpp instead")]] +template +using cuda_device_allocator = cppuddle::detail::cuda_device_allocator; + +} // end namespace detail + +[[deprecated("Use from header cuda_recycling_allocators.hpp instead")]] +template ::value, int> = 0> +using recycle_allocator_cuda_host = + cppuddle::recycle_allocator_cuda_host; +[[deprecated("Use from header cuda_recycling_allocators.hpp instead")]] +template ::value, int> = 0> +using recycle_allocator_cuda_device = + cppuddle::recycle_allocator_cuda_device; + +[[deprecated("Use from header cuda_recycling_allocators.hpp instead")]] +template ::value, int> = 0> +using cuda_device_buffer = cppuddle::cuda_device_buffer; + +[[deprecated("Use from header cuda_recycling_allocators.hpp instead")]] +template ::value, int> = 0> +using cuda_aggregated_device_buffer = cppuddle::cuda_aggregated_device_buffer; + +} // end namespace recycler +#endif diff --git a/include/hip_buffer_util.hpp b/include/hip_buffer_util.hpp new file mode 100644 index 00000000..a0b6fc05 --- /dev/null +++ b/include/hip_buffer_util.hpp @@ -0,0 +1,40 @@ +// Copyright (c: 2020-2021 Gregor Daiß +// +// Distributed under the Boost Software License, Version 1.0. (See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#ifndef HIP_BUFFER_UTIL_HPP +#define HIP_BUFFER_UTIL_HPP + +#include "hip_recycling_allocators.hpp" + +namespace recycler { + +namespace detail { + +[[deprecated("Use from header hip_recycling_allocators.hpp instead")]] +template +using hip_pinned_allocator = cppuddle::detail::hip_pinned_allocator; + +[[deprecated("Use from header hip_recycling_allocators.hpp instead")]] +template +using hip_device_allocator = cppuddle::detail::hip_device_allocator; +} // end namespace detail + +[[deprecated("Use from header hip_recycling_allocators.hpp instead")]] +template ::value, int> = 0> +using recycle_allocator_hip_host = cppuddle::recycle_allocator_hip_host; +[[deprecated("Use from header hip_recycling_allocators.hpp instead")]] +template ::value, int> = 0> +using recycle_allocator_hip_device = cppuddle::recycle_allocator_hip_device; + +[[deprecated("Use from header hip_recycling_allocators.hpp instead")]] +template ::value, int> = 0> +using hip_device_buffer = cppuddle::hip_device_buffer; + +[[deprecated("Use from header hip_recycling_allocators.hpp instead")]] +template ::value, int> = 0> +using hip_aggregated_device_buffer = cppuddle::hip_aggregated_device_buffer; + +} // end namespace recycler +#endif diff --git a/include/kokkos_buffer_util.hpp b/include/kokkos_buffer_util.hpp new file mode 100644 index 00000000..22fb4d88 --- /dev/null +++ b/include/kokkos_buffer_util.hpp @@ -0,0 +1,21 @@ +// Copyright (c) 2020-2021 Gregor Daiß +// +// Distributed under the Boost Software License, Version 1.0. 
(See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#ifndef KOKKOS_BUFFER_UTIL_HPP +#define KOKKOS_BUFFER_UTIL_HPP +#include "recycling_kokkos_view.hpp" + +[[deprecated("Use aggregated_recycle_view from header recycling_kokkos_view.hpp instead")]] +namespace recycler { +template +using aggregated_recycled_view = cppuddle::aggregated_recycle_view; + +[[deprecated("Use recycle_view from header recycling_kokkos_view.hpp instead")]] +template +using recycled_view = cppuddle::recycle_view; + +} // end namespace recycler + +#endif diff --git a/include/std_recycling_allocators.hpp b/include/std_recycling_allocators.hpp new file mode 100644 index 00000000..a62390dd --- /dev/null +++ b/include/std_recycling_allocators.hpp @@ -0,0 +1,32 @@ +// Copyright (c) 2024 Gregor Daiß +// +// Distributed under the Boost Software License, Version 1.0. (See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#ifndef STD_RECYCLING_ALLOCATORS_HPP +#define STD_RECYCLING_ALLOCATORS_HPP + +#include "buffer_management_interface.hpp" + +namespace cppuddle { +namespace device_selection { +/// Dummy GPU selector. Needs to be defined for MultiGPU builds as the default / +/// select_device_functor does not compile for > 1 GPU (to make sure all / +/// relevant allocators support multigpu) +template struct select_device_functor> { + void operator()(const size_t device_id) {} +}; +} // namespace device_selection + + +/// Recycling allocator for std memory +template ::value, int> = 0> +using recycle_std = detail::recycle_allocator>; +/// Recycling allocator for boost aligned memory (reusing previous content as well) +template ::value, int> = 0> +using aggressive_recycle_std = + detail::aggressive_recycle_allocator>; + +} // namespace cppuddle + +#endif diff --git a/include/sycl_buffer_util.hpp b/include/sycl_buffer_util.hpp new file mode 100644 index 00000000..f7971e1c --- /dev/null +++ b/include/sycl_buffer_util.hpp @@ -0,0 +1,33 @@ +// Copyright (c: 2020-2021 Gregor Daiß +// +// Distributed under the Boost Software License, Version 1.0. 
(See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#ifndef SYCL_BUFFER_UTIL_HPP +#define SYCL_BUFFER_UTIL_HPP + +#include "sycl_recycling_allocators.hpp" + +namespace recycler { + +namespace detail { + +[[deprecated("Use from header sycl_recycling_allocators.hpp instead")]] +template +using sycl_host_default_allocator = cppuddle::detail::sycl_host_default_allocator; + +[[deprecated("Use from header sycl_recycling_allocators.hpp instead")]] +template +using sycl_device_default_allocator = cppuddle::detail::sycl_device_default_allocator; + +} // end namespace detail + +[[deprecated("Use from header sycl_recycling_allocators.hpp instead")]] +template ::value, int> = 0> +using recycle_allocator_sycl_host = cppuddle::recycle_allocator_sycl_host; +[[deprecated("Use from header sycl_recycling_allocators.hpp instead")]] +template ::value, int> = 0> +using recycle_allocator_sycl_device = cppuddle::recycle_allocator_sycl_device; + +} // end namespace recycler +#endif From e10d02166b3d1347bec4f731471fa23b41ba761b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20Dai=C3=9F?= Date: Thu, 7 Mar 2024 11:47:55 +0100 Subject: [PATCH 04/19] Update gitignore --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index 2b25a731..bdc68a8d 100644 --- a/.gitignore +++ b/.gitignore @@ -7,3 +7,5 @@ spack.lock .clangd docs compile_commands.json +spack-build* +spack-configure-args.txt From 0f06ea8c16b93f35634c05814b10878a791631da Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20Dai=C3=9F?= Date: Thu, 7 Mar 2024 11:48:41 +0100 Subject: [PATCH 05/19] Rework allocator namespace 3 --- include/aggregation_manager.hpp | 64 +++++++++---------- include/aligned_buffer_util.hpp | 11 ++-- include/buffer_manager.hpp | 19 +++++- include/cuda_buffer_util.hpp | 40 +++++++----- include/hip_buffer_util.hpp | 42 +++++++----- include/kokkos_buffer_util.hpp | 12 ++-- include/stream_manager.hpp | 32 +++++----- include/sycl_buffer_util.hpp | 25 +++++--- ...llocator_kokkos_executor_for_loop_test.cpp | 19 +++--- tests/allocator_kokkos_test.cpp | 9 +-- 10 files changed, 157 insertions(+), 116 deletions(-) diff --git a/include/aggregation_manager.hpp b/include/aggregation_manager.hpp index cd1ca74b..99468658 100644 --- a/include/aggregation_manager.hpp +++ b/include/aggregation_manager.hpp @@ -58,7 +58,7 @@ #pragma message \ "Consider using CPPUDDLE_WITH_HPX_MUTEX=ON, to make the rest of CPPuddle also use hpx::mutex" #endif -namespace recycler { +namespace cppuddle { using aggregation_mutex_t = hpx::mutex; } @@ -158,7 +158,7 @@ template class aggregated_function_call { std::any function_tuple; /// Stores the string of the first function call for debug output std::string debug_type_information; - recycler::aggregation_mutex_t debug_mut; + cppuddle::aggregation_mutex_t debug_mut; #endif std::vector> potential_async_promises{}; @@ -189,7 +189,7 @@ template class aggregated_function_call { #if !(defined(NDEBUG)) && defined(DEBUG_AGGREGATION_CALLS) // needed for concurrent access to function_tuple and debug_type_information // Not required for normal use - std::lock_guard guard(debug_mut); + std::lock_guard guard(debug_mut); #endif assert(!async_mode); assert(potential_async_promises.empty()); @@ -263,7 +263,7 @@ template class aggregated_function_call { #if !(defined(NDEBUG)) && defined(DEBUG_AGGREGATION_CALLS) // needed for concurrent access to function_tuple and debug_type_information // Not required for normal use - std::lock_guard guard(debug_mut); + 
std::lock_guard guard(debug_mut); #endif assert(async_mode); assert(!potential_async_promises.empty()); @@ -545,7 +545,7 @@ template class Aggregated_Executor { /// slices have called it std::deque> function_calls; /// For synchronizing the access to the function calls list - recycler::aggregation_mutex_t mut; + cppuddle::aggregation_mutex_t mut; /// Data entry for a buffer allocation: void* pointer, size_t for /// buffer-size, atomic for the slice counter, location_id, gpu_id @@ -556,7 +556,7 @@ template class Aggregated_Executor { /// Map pointer to deque index for fast access in the deallocations std::unordered_map buffer_allocations_map; /// For synchronizing the access to the buffer_allocations - recycler::aggregation_mutex_t buffer_mut; + cppuddle::aggregation_mutex_t buffer_mut; std::atomic buffer_counter = 0; /// Get new buffer OR get buffer already allocated by different slice @@ -569,7 +569,7 @@ template class Aggregated_Executor { // First: Check if it already has happened if (buffer_counter <= slice_alloc_counter) { // we might be the first! Lock... - std::lock_guard guard(buffer_mut); + std::lock_guard guard(buffer_mut); // ... and recheck if (buffer_counter <= slice_alloc_counter) { constexpr bool manage_content_lifetime = false; @@ -579,7 +579,7 @@ template class Aggregated_Executor { // many different buffers for different aggregation sizes on different GPUs /* size_t location_id = gpu_id * instances_per_gpu; */ // Use integer conversion to only use 0 16 32 ... as buckets - size_t location_id = ((hpx::get_worker_thread_num() % recycler::number_instances) / 16) * 16; + size_t location_id = ((hpx::get_worker_thread_num() % cppuddle::number_instances) / 16) * 16; #ifdef CPPUDDLE_HAVE_HPX_AWARE_ALLOCATORS if (max_slices == 1) { // get prefered location: aka the current hpx threads location @@ -594,7 +594,7 @@ template class Aggregated_Executor { // Buffer might be recycled from previous allocations by the // buffer_interface... T *aggregated_buffer = - recycler::detail::buffer_interface::get( + cppuddle::detail::buffer_interface::get( size, manage_content_lifetime, location_id, gpu_id); // Create buffer entry for this buffer buffer_allocations.emplace_back(static_cast(aggregated_buffer), @@ -665,12 +665,12 @@ template class Aggregated_Executor { // Check if all slices are done with this buffer? if (buffer_allocation_counter == 0) { // Yes! 
"Deallocate" by telling the recylcer the buffer is fit for reusage - std::lock_guard guard(buffer_mut); + std::lock_guard guard(buffer_mut); // Only mark unused if another buffer has not done so already (and marked // it as invalid) if (valid) { assert(buffers_in_use == true); - recycler::detail::buffer_interface::mark_unused( + cppuddle::detail::buffer_interface::mark_unused( buffer_pointer, buffer_size, location_id, gpu_id); // mark buffer as invalid to prevent any other slice from marking the // buffer as unused @@ -678,7 +678,7 @@ template class Aggregated_Executor { const size_t current_deallocs = ++dealloc_counter; if (current_deallocs == buffer_counter) { - std::lock_guard guard(mut); + std::lock_guard guard(mut); buffers_in_use = false; if (!executor_slices_alive && !buffers_in_use) { slices_exhausted = false; @@ -699,12 +699,12 @@ template class Aggregated_Executor { /// Only meant to be accessed by the slice executors bool sync_aggregation_slices(const size_t slice_launch_counter) { - std::lock_guard guard(mut); + std::lock_guard guard(mut); assert(slices_exhausted == true); assert(executor_wrapper); // Add function call object in case it hasn't happened for this launch yet if (overall_launch_counter <= slice_launch_counter) { - /* std::lock_guard guard(mut); */ + /* std::lock_guard guard(mut); */ if (overall_launch_counter <= slice_launch_counter) { function_calls.emplace_back(current_slices, false, *executor_wrapper); overall_launch_counter = function_calls.size(); @@ -720,12 +720,12 @@ template class Aggregated_Executor { /// Only meant to be accessed by the slice executors template void post(const size_t slice_launch_counter, F &&f, Ts &&...ts) { - std::lock_guard guard(mut); + std::lock_guard guard(mut); assert(slices_exhausted == true); assert(executor_wrapper); // Add function call object in case it hasn't happened for this launch yet if (overall_launch_counter <= slice_launch_counter) { - /* std::lock_guard guard(mut); */ + /* std::lock_guard guard(mut); */ if (overall_launch_counter <= slice_launch_counter) { function_calls.emplace_back(current_slices, false, *executor_wrapper); overall_launch_counter = function_calls.size(); @@ -744,12 +744,12 @@ template class Aggregated_Executor { template hpx::lcos::future async(const size_t slice_launch_counter, F &&f, Ts &&...ts) { - std::lock_guard guard(mut); + std::lock_guard guard(mut); assert(slices_exhausted == true); assert(executor_wrapper); // Add function call object in case it hasn't happened for this launch yet if (overall_launch_counter <= slice_launch_counter) { - /* std::lock_guard guard(mut); */ + /* std::lock_guard guard(mut); */ if (overall_launch_counter <= slice_launch_counter) { function_calls.emplace_back(current_slices, true, *executor_wrapper); overall_launch_counter = function_calls.size(); @@ -765,12 +765,12 @@ template class Aggregated_Executor { template hpx::lcos::shared_future wrap_async(const size_t slice_launch_counter, F &&f, Ts &&...ts) { - std::lock_guard guard(mut); + std::lock_guard guard(mut); assert(slices_exhausted == true); assert(executor_wrapper); // Add function call object in case it hasn't happened for this launch yet if (overall_launch_counter <= slice_launch_counter) { - /* std::lock_guard guard(mut); */ + /* std::lock_guard guard(mut); */ if (overall_launch_counter <= slice_launch_counter) { function_calls.emplace_back(current_slices, true, *executor_wrapper); overall_launch_counter = function_calls.size(); @@ -784,12 +784,12 @@ template class Aggregated_Executor { } bool 
slice_available(void) { - std::lock_guard guard(mut); + std::lock_guard guard(mut); return !slices_exhausted; } std::optional> request_executor_slice() { - std::lock_guard guard(mut); + std::lock_guard guard(mut); if (!slices_exhausted) { const size_t local_slice_id = ++current_slices; if (local_slice_id == 1) { @@ -797,7 +797,7 @@ template class Aggregated_Executor { // TODO still required? Should be clean here already function_calls.clear(); overall_launch_counter = 0; - std::lock_guard guard(buffer_mut); + std::lock_guard guard(buffer_mut); #ifndef NDEBUG for (const auto &buffer_entry : buffer_allocations) { const auto &[buffer_pointer_any, buffer_size, @@ -861,7 +861,7 @@ template class Aggregated_Executor { } // Launch all executor slices within this continuation current_continuation = fut.then([this](auto &&fut) { - std::lock_guard guard(mut); + std::lock_guard guard(mut); slices_exhausted = true; launched_slices = current_slices; size_t id = 0; @@ -898,7 +898,7 @@ template class Aggregated_Executor { } size_t launched_slices; void reduce_usage_counter(void) { - /* std::lock_guard guard(mut); */ + /* std::lock_guard guard(mut); */ assert(slices_exhausted == true); assert(executor_wrapper); assert(executor_slices_alive == true); @@ -908,7 +908,7 @@ template class Aggregated_Executor { // Last slice goes out scope? if (local_slice_id == 0) { // Mark executor fit for reusage - std::lock_guard guard(mut); + std::lock_guard guard(mut); executor_slices_alive = false; if (!executor_slices_alive && !buffers_in_use) { // Release executor @@ -1045,7 +1045,7 @@ class aggregation_pool { std::string("Trying to initialize cppuddle aggregation pool twice") + " Agg pool name: " + std::string(kernelname)); } - if (num_devices > recycler::max_number_gpus) { + if (num_devices > cppuddle::max_number_gpus) { throw std::runtime_error( std::string( "Trying to initialize aggregation with more devices than the " @@ -1055,7 +1055,7 @@ class aggregation_pool { number_devices = num_devices; for (size_t gpu_id = 0; gpu_id < number_devices; gpu_id++) { - std::lock_guard guard(instance()[gpu_id].pool_mutex); + std::lock_guard guard(instance()[gpu_id].pool_mutex); assert(instance()[gpu_id].aggregation_executor_pool.empty()); for (int i = 0; i < number_of_executors; i++) { instance()[gpu_id].aggregation_executor_pool.emplace_back(slices_per_executor, @@ -1074,9 +1074,9 @@ class aggregation_pool { std::string("Trying to use cppuddle aggregation pool without first calling init") + " Agg poolname: " + std::string(kernelname)); } - const size_t gpu_id = recycler::get_device_id(number_devices); + const size_t gpu_id = cppuddle::get_device_id(number_devices); /* const size_t gpu_id = 1; */ - std::lock_guard guard(instance()[gpu_id].pool_mutex); + std::lock_guard guard(instance()[gpu_id].pool_mutex); assert(!instance()[gpu_id].aggregation_executor_pool.empty()); std::optional::Executor_Slice>> @@ -1128,11 +1128,11 @@ class aggregation_pool { private: /// Required for dealing with adding elements to the deque of /// aggregated_executors - recycler::aggregation_mutex_t pool_mutex; + cppuddle::aggregation_mutex_t pool_mutex; /// Global access instance static std::unique_ptr& instance(void) { static std::unique_ptr pool_instances{ - new aggregation_pool[recycler::max_number_gpus]}; + new aggregation_pool[cppuddle::max_number_gpus]}; return pool_instances; } static inline size_t number_devices = 1; diff --git a/include/aligned_buffer_util.hpp b/include/aligned_buffer_util.hpp index 4c409521..0e5f1fb7 100644 --- 
a/include/aligned_buffer_util.hpp +++ b/include/aligned_buffer_util.hpp @@ -10,14 +10,17 @@ namespace recycler { -[[deprecated("Use from header aligned_recycling_allocators.hpp instead")]] template ::value, int> = 0> -using recycle_aligned = cppuddle::recycle_aligned; -[[deprecated("Use from header aligned_recycling_allocators.hpp instead")]] +using recycle_aligned + [[deprecated("Use from header aligned_recycling_allocators.hpp instead")]] = + cppuddle::recycle_aligned; + template ::value, int> = 0> -using aggressive_recycle_aligned = cppuddle::aggressive_recycle_aligned; +using aggressive_recycle_aligned + [[deprecated("Use from header aligned_recycling_allocators.hpp instead")]] = + cppuddle::aggressive_recycle_aligned; } // namespace recycler diff --git a/include/buffer_manager.hpp b/include/buffer_manager.hpp index f84c4259..d9d74c31 100644 --- a/include/buffer_manager.hpp +++ b/include/buffer_manager.hpp @@ -2,11 +2,24 @@ #define BUFFER_MANAGER_HPP #include "buffer_management_interface.hpp" +#include "std_recycling_allocators.hpp" namespace recycler { -[[deprecated("Use cppuddle::print_buffer_counters() instead")]] -inline void print_performance_counters() { cppuddle::print_buffer_counters(); } +template ::value, int> = 0> +using recycle_std + [[deprecated("Use from header std_recycling_allocators.hpp instead")]] = + cppuddle::recycle_std; + +template ::value, int> = 0> +using aggressive_recycle_aligned + [[deprecated("Use from header std_recycling_allocators.hpp instead")]] = + cppuddle::aggressive_recycle_std; + +[[deprecated("Use cppuddle::print_buffer_counters() instead")]] +inline void print_performance_counters() { + cppuddle::print_buffer_counters(); +} /// Deletes all buffers (even ones still marked as used), delete the buffer /// managers and the recycler itself [[deprecated("Use cppuddle::force_buffer_cleanup() instead")]] @@ -17,7 +30,7 @@ inline void cleanup() { cppuddle::unused_buffer_cleanup(); } /// Deletes all buffers (even ones still marked as used), delete the buffer /// managers and the recycler itself. Disallows further usage. 
[[deprecated("Use cppuddle::finalize() instead")]] -inline void finalize() { detail::buffer_interface::finalize(); } +inline void finalize() { cppuddle::finalize(); } } // end namespace cppuddle diff --git a/include/cuda_buffer_util.hpp b/include/cuda_buffer_util.hpp index f0db9a7b..ffe47f8b 100644 --- a/include/cuda_buffer_util.hpp +++ b/include/cuda_buffer_util.hpp @@ -7,36 +7,42 @@ #define CUDA_BUFFER_UTIL_HPP #include "cuda_recycling_allocators.hpp" -namespace recycler { +namespace recycler { namespace detail { -[[deprecated("Use from header cuda_recycling_allocators.hpp instead")]] template -using cuda_pinned_allocator = cppuddle::detail::cuda_pinned_allocator; +using cuda_pinned_allocator + [[deprecated("Use from header cuda_recycling_allocators.hpp instead")]] = + cppuddle::detail::cuda_pinned_allocator; -[[deprecated("Use from header cuda_recycling_allocators.hpp instead")]] template -using cuda_device_allocator = cppuddle::detail::cuda_device_allocator; +using cuda_device_allocator + [[deprecated("Use from header cuda_recycling_allocators.hpp instead")]] = + cppuddle::detail::cuda_device_allocator; } // end namespace detail -[[deprecated("Use from header cuda_recycling_allocators.hpp instead")]] template ::value, int> = 0> -using recycle_allocator_cuda_host = - cppuddle::recycle_allocator_cuda_host; -[[deprecated("Use from header cuda_recycling_allocators.hpp instead")]] -template ::value, int> = 0> -using recycle_allocator_cuda_device = - cppuddle::recycle_allocator_cuda_device; +using recycle_allocator_cuda_host + [[deprecated("Use from header cuda_recycling_allocators.hpp instead")]] = + cppuddle::recycle_allocator_cuda_host; -[[deprecated("Use from header cuda_recycling_allocators.hpp instead")]] template ::value, int> = 0> -using cuda_device_buffer = cppuddle::cuda_device_buffer; +using recycle_allocator_cuda_device + [[deprecated("Use from header cuda_recycling_allocators.hpp instead")]] = + cppuddle::recycle_allocator_cuda_device; -[[deprecated("Use from header cuda_recycling_allocators.hpp instead")]] -template ::value, int> = 0> -using cuda_aggregated_device_buffer = cppuddle::cuda_aggregated_device_buffer; +template ::value, int> = 0> +using cuda_device_buffer + [[deprecated("Use from header cuda_recycling_allocators.hpp instead")]] = + cppuddle::cuda_device_buffer; + +template ::value, int> = 0> +using cuda_aggregated_device_buffer + [[deprecated("Use from header cuda_recycling_allocators.hpp instead")]] = + cppuddle::cuda_aggregated_device_buffer; } // end namespace recycler #endif diff --git a/include/hip_buffer_util.hpp b/include/hip_buffer_util.hpp index a0b6fc05..a2b5ca0c 100644 --- a/include/hip_buffer_util.hpp +++ b/include/hip_buffer_util.hpp @@ -12,29 +12,37 @@ namespace recycler { namespace detail { -[[deprecated("Use from header hip_recycling_allocators.hpp instead")]] -template -using hip_pinned_allocator = cppuddle::detail::hip_pinned_allocator; - -[[deprecated("Use from header hip_recycling_allocators.hpp instead")]] -template -using hip_device_allocator = cppuddle::detail::hip_device_allocator; +template +using hip_pinned_allocator + [[deprecated("Use from header hip_recycling_allocators.hpp instead")]] = + cppuddle::detail::hip_pinned_allocator; + +template +using hip_device_allocator + [[deprecated("Use from header hip_recycling_allocators.hpp instead")]] = + cppuddle::detail::hip_device_allocator; } // end namespace detail -[[deprecated("Use from header hip_recycling_allocators.hpp instead")]] -template ::value, int> = 0> -using 
recycle_allocator_hip_host = cppuddle::recycle_allocator_hip_host; -[[deprecated("Use from header hip_recycling_allocators.hpp instead")]] template ::value, int> = 0> -using recycle_allocator_hip_device = cppuddle::recycle_allocator_hip_device; +using recycle_allocator_hip_host + [[deprecated("Use from header hip_recycling_allocators.hpp instead")]] = + cppuddle::recycle_allocator_hip_host; -[[deprecated("Use from header hip_recycling_allocators.hpp instead")]] template ::value, int> = 0> -using hip_device_buffer = cppuddle::hip_device_buffer; +using recycle_allocator_hip_device + [[deprecated("Use from header hip_recycling_allocators.hpp instead")]] = + cppuddle::recycle_allocator_hip_device; -[[deprecated("Use from header hip_recycling_allocators.hpp instead")]] -template ::value, int> = 0> -using hip_aggregated_device_buffer = cppuddle::hip_aggregated_device_buffer; +template ::value, int> = 0> +using hip_device_buffer + [[deprecated("Use from header hip_recycling_allocators.hpp instead")]] = + cppuddle::hip_device_buffer; + +template ::value, int> = 0> +using hip_aggregated_device_buffer + [[deprecated("Use from header hip_recycling_allocators.hpp instead")]] = + cppuddle::hip_aggregated_device_buffer; } // end namespace recycler #endif diff --git a/include/kokkos_buffer_util.hpp b/include/kokkos_buffer_util.hpp index 22fb4d88..7b267619 100644 --- a/include/kokkos_buffer_util.hpp +++ b/include/kokkos_buffer_util.hpp @@ -7,14 +7,18 @@ #define KOKKOS_BUFFER_UTIL_HPP #include "recycling_kokkos_view.hpp" -[[deprecated("Use aggregated_recycle_view from header recycling_kokkos_view.hpp instead")]] + namespace recycler { template -using aggregated_recycled_view = cppuddle::aggregated_recycle_view; +using aggregated_recycled_view [[deprecated( + "Use aggregated_recycle_view from header recycling_kokkos_view.hpp " + "instead")]] = + cppuddle::aggregated_recycle_view; -[[deprecated("Use recycle_view from header recycling_kokkos_view.hpp instead")]] template -using recycled_view = cppuddle::recycle_view; +using recycled_view [[deprecated( + "Use recycle_view from header recycling_kokkos_view.hpp instead")]] = + cppuddle::recycle_view; } // end namespace recycler diff --git a/include/stream_manager.hpp b/include/stream_manager.hpp index 5b0e3898..bfaba518 100644 --- a/include/stream_manager.hpp +++ b/include/stream_manager.hpp @@ -173,7 +173,7 @@ class stream_pool { template static size_t get_next_device_id(const size_t number_gpus) noexcept { // TODO add round robin and min strategy - return recycler::get_device_id(number_gpus); + return cppuddle::get_device_id(number_gpus); } template @@ -195,11 +195,11 @@ class stream_pool { /// Deprecated! Use init_on_all_gpu or init_on_gpu template static void init(size_t number_of_streams, Ts ... executor_args) { - /* static_assert(sizeof...(Ts) == sizeof...(Ts) && recycler::max_number_gpus == 1, */ + /* static_assert(sizeof...(Ts) == sizeof...(Ts) && cppuddle::max_number_gpus == 1, */ /* "deprecated stream_pool::init does not support multigpu"); */ auto guard = make_scoped_lock_from_array(instance().gpu_mutexes); instance().streampools.emplace_back(number_of_streams, executor_args...); - assert(instance().streampools.size() <= recycler::max_number_gpus); + assert(instance().streampools.size() <= cppuddle::max_number_gpus); } /// Multi-GPU init where executors / interfaces on all GPUs are initialized with the same arguments @@ -207,13 +207,13 @@ class stream_pool { static void init_all_executor_pools(size_t number_of_streams, Ts ... 
executor_args) { auto guard = make_scoped_lock_from_array(instance().gpu_mutexes); if (number_of_streams > 0) { - for (size_t gpu_id = 0; gpu_id < recycler::max_number_gpus; gpu_id++) { + for (size_t gpu_id = 0; gpu_id < cppuddle::max_number_gpus; gpu_id++) { instance().select_gpu_function(gpu_id); instance().streampools.emplace_back(number_of_streams, executor_args...); } } - assert(instance().streampools.size() <= recycler::max_number_gpus); + assert(instance().streampools.size() <= cppuddle::max_number_gpus); } /// Per-GPU init allowing for different init parameters depending on the GPU @@ -226,40 +226,40 @@ class stream_pool { instance().streampools.emplace_back(number_of_streams, executor_args...); } - assert(instance().streampools.size() <= recycler::max_number_gpus); + assert(instance().streampools.size() <= cppuddle::max_number_gpus); } // TODO add/rename into finalize? static void cleanup() { auto guard = make_scoped_lock_from_array(instance().gpu_mutexes); - assert(instance().streampools.size() == recycler::max_number_gpus); + assert(instance().streampools.size() == cppuddle::max_number_gpus); instance().streampools.clear(); } static std::tuple get_interface(const size_t gpu_id = 0) { - std::lock_guard guard(instance().gpu_mutexes[gpu_id]); + std::lock_guard guard(instance().gpu_mutexes[gpu_id]); assert(gpu_id < instance().streampools.size()); return instance().streampools[gpu_id].get_interface(); } static void release_interface(size_t index, const size_t gpu_id = 0) { - std::lock_guard guard(instance().gpu_mutexes[gpu_id]); + std::lock_guard guard(instance().gpu_mutexes[gpu_id]); assert(gpu_id < instance().streampools.size()); instance().streampools[gpu_id].release_interface(index); } static bool interface_available(size_t load_limit, const size_t gpu_id = 0) { - std::lock_guard guard(instance().gpu_mutexes[gpu_id]); + std::lock_guard guard(instance().gpu_mutexes[gpu_id]); assert(gpu_id < instance().streampools.size()); return instance().streampools[gpu_id].interface_available(load_limit); } static size_t get_current_load(const size_t gpu_id = 0) { - std::lock_guard guard(instance().gpu_mutexes[gpu_id]); + std::lock_guard guard(instance().gpu_mutexes[gpu_id]); assert(gpu_id < instance().streampools.size()); return instance().streampools[gpu_id].get_current_load(); } // TODO deprecated! Remove... 
/* static size_t get_next_device_id(const size_t gpu_id = 0) { */ - /* std::lock_guard guard(instance().gpu_mutexes[gpu_id]); */ - /* assert(instance().streampools.size() == recycler::max_number_gpus); */ + /* std::lock_guard guard(instance().gpu_mutexes[gpu_id]); */ + /* assert(instance().streampools.size() == cppuddle::max_number_gpus); */ /* return instance().streampools[gpu_id].get_next_device_id(); */ /* } */ @@ -274,15 +274,15 @@ class stream_pool { private: stream_pool_implementation() = default; - recycler::mutex_t pool_mut{}; + cppuddle::mutex_t pool_mut{}; std::function select_gpu_function = [](size_t gpu_id) { // By default no multi gpu support - assert(recycler::max_number_gpus == 1 || instance().streampools.size() == 1); + assert(cppuddle::max_number_gpus == 1 || instance().streampools.size() == 1); assert(gpu_id == 0); }; std::deque streampools{}; - std::array gpu_mutexes; + std::array gpu_mutexes; static stream_pool_implementation& instance(void) { static stream_pool_implementation pool_instance{}; diff --git a/include/sycl_buffer_util.hpp b/include/sycl_buffer_util.hpp index f7971e1c..4da36df9 100644 --- a/include/sycl_buffer_util.hpp +++ b/include/sycl_buffer_util.hpp @@ -12,22 +12,27 @@ namespace recycler { namespace detail { -[[deprecated("Use from header sycl_recycling_allocators.hpp instead")]] -template -using sycl_host_default_allocator = cppuddle::detail::sycl_host_default_allocator; +template +using sycl_host_default_allocator + [[deprecated("Use from header sycl_recycling_allocators.hpp instead")]] = + cppuddle::detail::sycl_host_default_allocator; -[[deprecated("Use from header sycl_recycling_allocators.hpp instead")]] -template -using sycl_device_default_allocator = cppuddle::detail::sycl_device_default_allocator; +template +using sycl_device_default_allocator + [[deprecated("Use from header sycl_recycling_allocators.hpp instead")]] = + cppuddle::detail::sycl_device_default_allocator; } // end namespace detail -[[deprecated("Use from header sycl_recycling_allocators.hpp instead")]] template ::value, int> = 0> -using recycle_allocator_sycl_host = cppuddle::recycle_allocator_sycl_host; -[[deprecated("Use from header sycl_recycling_allocators.hpp instead")]] +using recycle_allocator_sycl_host + [[deprecated("Use from header sycl_recycling_allocators.hpp instead")]] = + cppuddle::recycle_allocator_sycl_host; + template ::value, int> = 0> -using recycle_allocator_sycl_device = cppuddle::recycle_allocator_sycl_device; +using recycle_allocator_sycl_device + [[deprecated("Use from header sycl_recycling_allocators.hpp instead")]] = + cppuddle::recycle_allocator_sycl_device; } // end namespace recycler #endif diff --git a/tests/allocator_kokkos_executor_for_loop_test.cpp b/tests/allocator_kokkos_executor_for_loop_test.cpp index 47fc83f4..2ac9ea17 100644 --- a/tests/allocator_kokkos_executor_for_loop_test.cpp +++ b/tests/allocator_kokkos_executor_for_loop_test.cpp @@ -21,6 +21,7 @@ #include #include +#include "std_recycling_allocators.hpp" #include "cuda_recycling_allocators.hpp" #include "recycling_kokkos_view.hpp" @@ -36,8 +37,8 @@ template using kokkos_um_array = Kokkos::View; template -using recycled_host_view = - cppuddle::recycled_view, cppuddle::recycle_std, T>; +using recycle_host_view = + cppuddle::recycle_view, cppuddle::recycle_std, T>; // Device views using recycle allocators @@ -45,8 +46,8 @@ template using kokkos_um_device_array = Kokkos::View; template -using recycled_device_view = - cppuddle::recycled_view, +using recycle_device_view = + 
cppuddle::recycle_view, cppuddle::recycle_allocator_cuda_device, T>; // Host views using pinned memory recycle allocators @@ -55,8 +56,8 @@ using kokkos_um_pinned_array = Kokkos::View::array_layout, Kokkos::CudaHostPinnedSpace, Kokkos::MemoryUnmanaged>; template -using recycled_pinned_view = - cppuddle::recycled_view, +using recycle_pinned_view = + cppuddle::recycle_view, cppuddle::recycle_allocator_cuda_host, T>; template @@ -81,7 +82,7 @@ int main(int argc, char *argv[]) { // Host run for (size_t pass = 0; pass < passes; pass++) { // Create view - recycled_host_view hostView(view_size_0, view_size_1); + recycle_host_view hostView(view_size_0, view_size_1); // Create executor hpx::kokkos::serial_executor executor; @@ -109,7 +110,7 @@ int main(int argc, char *argv[]) { // Device run for (size_t pass = 0; pass < passes; pass++) { // Create and init host view - recycled_pinned_view hostView(view_size_0, view_size_1); + recycle_pinned_view hostView(view_size_0, view_size_1); for(size_t i = 0; i < view_size_0; i++) { for(size_t j = 0; j < view_size_1; j++) { hostView(i, j) = 1.0; @@ -120,7 +121,7 @@ int main(int argc, char *argv[]) { hpx::kokkos::cuda_executor executor(hpx::kokkos::execution_space_mode::independent); // Use executor to move the host data to the device - recycled_device_view deviceView(view_size_0, view_size_1); + recycle_device_view deviceView(view_size_0, view_size_1); Kokkos::deep_copy(executor.instance(), deviceView, hostView); auto policy_1 = Kokkos::Experimental::require( diff --git a/tests/allocator_kokkos_test.cpp b/tests/allocator_kokkos_test.cpp index c8045d3e..4826efec 100644 --- a/tests/allocator_kokkos_test.cpp +++ b/tests/allocator_kokkos_test.cpp @@ -21,6 +21,7 @@ #include #include +#include "std_recycling_allocators.hpp" #include "cuda_recycling_allocators.hpp" #include "recycling_kokkos_view.hpp" @@ -32,8 +33,8 @@ template using kokkos_um_array = Kokkos::View; template -using recycled_host_view = - cppuddle::recycled_view, cppuddle::recycle_std, T>; +using recycle_host_view = + cppuddle::recycle_view, cppuddle::recycle_std, T>; #ifdef CPPUDDLE_HAVE_HPX int hpx_main(int argc, char *argv[]) { @@ -74,8 +75,8 @@ int main(int argc, char *argv[]) { hpx::kokkos::ScopeGuard scopeGuard(argc, argv); Kokkos::print_configuration(std::cout); - using test_view = recycled_host_view; - using test_double_view = recycled_host_view; + using test_view = recycle_host_view; + using test_double_view = recycle_host_view; constexpr size_t passes = 100; for (size_t pass = 0; pass < passes; pass++) { From d5a3f79b9c14b870fce4ba6e4b7634be92f9dc5d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20Dai=C3=9F?= Date: Thu, 7 Mar 2024 17:37:31 +0100 Subject: [PATCH 06/19] Rework allocator namespace 4 --- include/aggregation_manager.hpp | 10 ++-- include/aligned_buffer_util.hpp | 4 +- include/aligned_recycling_allocators.hpp | 3 ++ include/buffer_management_interface.hpp | 15 ++++-- include/buffer_manager.hpp | 22 ++++---- include/cuda_buffer_util.hpp | 12 ++--- include/cuda_recycling_allocators.hpp | 5 +- include/detail/buffer_management.hpp | 10 ++-- include/hip_buffer_util.hpp | 12 ++--- include/hip_recycling_allocators.hpp | 4 +- include/recycling_kokkos_view.hpp | 52 ++++++++++--------- include/std_recycling_allocators.hpp | 2 + include/sycl_recycling_allocators.hpp | 2 + tests/allocator_aligned_test.cpp | 21 ++++---- tests/allocator_hpx_test.cpp | 26 +++++----- ...llocator_kokkos_executor_for_loop_test.cpp | 21 ++++---- tests/allocator_kokkos_test.cpp | 6 +-- tests/allocator_test.cpp | 18 
++++--- 18 files changed, 138 insertions(+), 107 deletions(-) diff --git a/include/aggregation_manager.hpp b/include/aggregation_manager.hpp index 99468658..1cbe09db 100644 --- a/include/aggregation_manager.hpp +++ b/include/aggregation_manager.hpp @@ -594,8 +594,9 @@ template class Aggregated_Executor { // Buffer might be recycled from previous allocations by the // buffer_interface... T *aggregated_buffer = - cppuddle::detail::buffer_interface::get( - size, manage_content_lifetime, location_id, gpu_id); + cppuddle::memory_recycling::detail::buffer_interface::get< + T, Host_Allocator>(size, manage_content_lifetime, location_id, + gpu_id); // Create buffer entry for this buffer buffer_allocations.emplace_back(static_cast(aggregated_buffer), size, 1, true, location_id, gpu_id); @@ -670,8 +671,9 @@ template class Aggregated_Executor { // it as invalid) if (valid) { assert(buffers_in_use == true); - cppuddle::detail::buffer_interface::mark_unused( - buffer_pointer, buffer_size, location_id, gpu_id); + cppuddle::memory_recycling::detail::buffer_interface::mark_unused< + T, Host_Allocator>(buffer_pointer, buffer_size, location_id, + gpu_id); // mark buffer as invalid to prevent any other slice from marking the // buffer as unused valid = false; diff --git a/include/aligned_buffer_util.hpp b/include/aligned_buffer_util.hpp index 0e5f1fb7..e4ef7990 100644 --- a/include/aligned_buffer_util.hpp +++ b/include/aligned_buffer_util.hpp @@ -14,13 +14,13 @@ template ::value, int> = 0> using recycle_aligned [[deprecated("Use from header aligned_recycling_allocators.hpp instead")]] = - cppuddle::recycle_aligned; + cppuddle::memory_recycling::recycle_aligned; template ::value, int> = 0> using aggressive_recycle_aligned [[deprecated("Use from header aligned_recycling_allocators.hpp instead")]] = - cppuddle::aggressive_recycle_aligned; + cppuddle::memory_recycling::aggressive_recycle_aligned; } // namespace recycler diff --git a/include/aligned_recycling_allocators.hpp b/include/aligned_recycling_allocators.hpp index 039a19f2..ee0182bb 100644 --- a/include/aligned_recycling_allocators.hpp +++ b/include/aligned_recycling_allocators.hpp @@ -10,6 +10,7 @@ #include "buffer_management_interface.hpp" namespace cppuddle { +namespace memory_recycling { namespace device_selection { template /// Dummy GPU selector. 
Needs to be defined for MultiGPU builds as the default / @@ -31,6 +32,8 @@ template ::value, int> = 0> using aggressive_recycle_aligned = detail::aggressive_recycle_allocator< T, boost::alignment::aligned_allocator>; + +} // namespace memory_recycling } // namespace cppuddle #endif diff --git a/include/buffer_management_interface.hpp b/include/buffer_management_interface.hpp index b38fc84c..8614568b 100644 --- a/include/buffer_management_interface.hpp +++ b/include/buffer_management_interface.hpp @@ -4,18 +4,25 @@ #include "detail/buffer_management.hpp" namespace cppuddle { +namespace memory_recycling { /// Print performance counters of all buffer managers to stdout -inline void print_buffer_counters() { detail::buffer_interface::print_performance_counters(); } -/// Deletes all buffers (even ones still marked as used), delete the buffer +inline void print_buffer_counters() { + detail::buffer_interface::print_performance_counters(); +} +/// Deletes all buffers (even ones still marked as used), delete the buffer /// managers and the recycler itself inline void force_buffer_cleanup() { detail::buffer_interface::clean_all(); } + /// Deletes all buffers currently marked as unused -inline void unused_buffer_cleanup() { detail::buffer_interface::clean_unused_buffers(); } -/// Deletes all buffers (even ones still marked as used), delete the buffer +inline void unused_buffer_cleanup() { + detail::buffer_interface::clean_unused_buffers(); +} +/// Deletes all buffers (even ones still marked as used), delete the buffer /// managers and the recycler itself. Disallows further usage. inline void finalize() { detail::buffer_interface::finalize(); } +} // namespace memory_recycling } // end namespace cppuddle #endif diff --git a/include/buffer_manager.hpp b/include/buffer_manager.hpp index d9d74c31..25e5ce00 100644 --- a/include/buffer_manager.hpp +++ b/include/buffer_manager.hpp @@ -9,29 +9,29 @@ namespace recycler { template ::value, int> = 0> using recycle_std [[deprecated("Use from header std_recycling_allocators.hpp instead")]] = - cppuddle::recycle_std; + cppuddle::memory_recycling::recycle_std; template ::value, int> = 0> using aggressive_recycle_aligned [[deprecated("Use from header std_recycling_allocators.hpp instead")]] = - cppuddle::aggressive_recycle_std; + cppuddle::memory_recycling::aggressive_recycle_std; -[[deprecated("Use cppuddle::print_buffer_counters() instead")]] +[[deprecated("Use cppuddle::memory_recycling::print_buffer_counters() instead")]] inline void print_performance_counters() { - cppuddle::print_buffer_counters(); + cppuddle::memory_recycling::print_buffer_counters(); } /// Deletes all buffers (even ones still marked as used), delete the buffer /// managers and the recycler itself -[[deprecated("Use cppuddle::force_buffer_cleanup() instead")]] -inline void force_cleanup() { cppuddle::force_buffer_cleanup(); } +[[deprecated("Use cppuddle::memory_recycling::force_buffer_cleanup() instead")]] +inline void force_cleanup() { cppuddle::memory_recycling::force_buffer_cleanup(); } /// Deletes all buffers currently marked as unused -[[deprecated("Use cppuddle::unused_buffer_cleanup() instead")]] -inline void cleanup() { cppuddle::unused_buffer_cleanup(); } +[[deprecated("Use cppuddle::memory_recycling::unused_buffer_cleanup() instead")]] +inline void cleanup() { cppuddle::memory_recycling::unused_buffer_cleanup(); } /// Deletes all buffers (even ones still marked as used), delete the buffer /// managers and the recycler itself. Disallows further usage. 
-[[deprecated("Use cppuddle::finalize() instead")]] -inline void finalize() { cppuddle::finalize(); } +[[deprecated("Use cppuddle::memory_recycling::finalize() instead")]] +inline void finalize() { cppuddle::memory_recycling::finalize(); } -} // end namespace cppuddle +} // namespace recycler #endif diff --git a/include/cuda_buffer_util.hpp b/include/cuda_buffer_util.hpp index ffe47f8b..6334da8a 100644 --- a/include/cuda_buffer_util.hpp +++ b/include/cuda_buffer_util.hpp @@ -14,35 +14,35 @@ namespace detail { template using cuda_pinned_allocator [[deprecated("Use from header cuda_recycling_allocators.hpp instead")]] = - cppuddle::detail::cuda_pinned_allocator; + cppuddle::memory_recycling::detail::cuda_pinned_allocator; template using cuda_device_allocator [[deprecated("Use from header cuda_recycling_allocators.hpp instead")]] = - cppuddle::detail::cuda_device_allocator; + cppuddle::memory_recycling::detail::cuda_device_allocator; } // end namespace detail template ::value, int> = 0> using recycle_allocator_cuda_host [[deprecated("Use from header cuda_recycling_allocators.hpp instead")]] = - cppuddle::recycle_allocator_cuda_host; + cppuddle::memory_recycling::recycle_allocator_cuda_host; template ::value, int> = 0> using recycle_allocator_cuda_device [[deprecated("Use from header cuda_recycling_allocators.hpp instead")]] = - cppuddle::recycle_allocator_cuda_device; + cppuddle::memory_recycling::recycle_allocator_cuda_device; template ::value, int> = 0> using cuda_device_buffer [[deprecated("Use from header cuda_recycling_allocators.hpp instead")]] = - cppuddle::cuda_device_buffer; + cppuddle::memory_recycling::cuda_device_buffer; template ::value, int> = 0> using cuda_aggregated_device_buffer [[deprecated("Use from header cuda_recycling_allocators.hpp instead")]] = - cppuddle::cuda_aggregated_device_buffer; + cppuddle::memory_recycling::cuda_aggregated_device_buffer; } // end namespace recycler #endif diff --git a/include/cuda_recycling_allocators.hpp b/include/cuda_recycling_allocators.hpp index b4cf8efb..911948a3 100644 --- a/include/cuda_recycling_allocators.hpp +++ b/include/cuda_recycling_allocators.hpp @@ -13,8 +13,9 @@ #include "buffer_management_interface.hpp" namespace cppuddle { -namespace detail { +namespace memory_recycling { +namespace detail { /// Underlying host allocator for CUDA pinned memory template struct cuda_pinned_allocator { using value_type = T; @@ -168,5 +169,7 @@ struct cuda_aggregated_device_buffer { Host_Allocator &alloc; // will stay valid for the entire aggregation region and hence // for the entire lifetime of this buffer }; + +} // namespace memory_recycling } // end namespace cppuddle #endif diff --git a/include/detail/buffer_management.hpp b/include/detail/buffer_management.hpp index 5d640983..98504d21 100644 --- a/include/detail/buffer_management.hpp +++ b/include/detail/buffer_management.hpp @@ -45,6 +45,7 @@ For better performance configure CPPuddle with CPPUDDLE_WITH_HPX_AWARE_ALLOCATOR #include "config.hpp" namespace cppuddle { +namespace memory_recycling { namespace device_selection { /// Default device selector - No MultGPU support @@ -415,8 +416,8 @@ For better performance configure CPPuddle with CPPUDDLE_WITH_BUFFER_RECYCLING=ON // No unused buffer found -> Create new one and return it try { - cppuddle::device_selection::select_device_functor{}( - device_id); + cppuddle::memory_recycling::device_selection::select_device_functor< + T, Host_Allocator>{}(device_id); Host_Allocator alloc; T *buffer = alloc.allocate(number_of_elements); 
instance()[location_id].buffer_map.insert( @@ -441,8 +442,8 @@ For better performance configure CPPuddle with CPPUDDLE_WITH_BUFFER_RECYCLING=ON // If there still isn't enough memory left, the caller has to handle it // We've done all we can in here Host_Allocator alloc; - cppuddle::device_selection::select_device_functor{}( - device_id); + cppuddle::memory_recycling::device_selection::select_device_functor< + T, Host_Allocator>{}(device_id); T *buffer = alloc.allocate(number_of_elements); instance()[location_id].buffer_map.insert( {buffer, std::make_tuple(buffer, number_of_elements, 1, @@ -922,6 +923,7 @@ operator!=(aggressive_recycle_allocator const &, return true; } } // namespace detail +} // namespace memory_recycling } // end namespace cppuddle #endif diff --git a/include/hip_buffer_util.hpp b/include/hip_buffer_util.hpp index a2b5ca0c..eadedc07 100644 --- a/include/hip_buffer_util.hpp +++ b/include/hip_buffer_util.hpp @@ -15,34 +15,34 @@ namespace detail { template using hip_pinned_allocator [[deprecated("Use from header hip_recycling_allocators.hpp instead")]] = - cppuddle::detail::hip_pinned_allocator; + cppuddle::memory_recycling::detail::hip_pinned_allocator; template using hip_device_allocator [[deprecated("Use from header hip_recycling_allocators.hpp instead")]] = - cppuddle::detail::hip_device_allocator; + cppuddle::memory_recycling::detail::hip_device_allocator; } // end namespace detail template ::value, int> = 0> using recycle_allocator_hip_host [[deprecated("Use from header hip_recycling_allocators.hpp instead")]] = - cppuddle::recycle_allocator_hip_host; + cppuddle::memory_recycling::recycle_allocator_hip_host; template ::value, int> = 0> using recycle_allocator_hip_device [[deprecated("Use from header hip_recycling_allocators.hpp instead")]] = - cppuddle::recycle_allocator_hip_device; + cppuddle::memory_recycling::recycle_allocator_hip_device; template ::value, int> = 0> using hip_device_buffer [[deprecated("Use from header hip_recycling_allocators.hpp instead")]] = - cppuddle::hip_device_buffer; + cppuddle::memory_recycling::hip_device_buffer; template ::value, int> = 0> using hip_aggregated_device_buffer [[deprecated("Use from header hip_recycling_allocators.hpp instead")]] = - cppuddle::hip_aggregated_device_buffer; + cppuddle::memory_recycling::hip_aggregated_device_buffer; } // end namespace recycler #endif diff --git a/include/hip_recycling_allocators.hpp b/include/hip_recycling_allocators.hpp index f540b544..274fbb68 100644 --- a/include/hip_recycling_allocators.hpp +++ b/include/hip_recycling_allocators.hpp @@ -13,8 +13,9 @@ #include "buffer_management_interface.hpp" namespace cppuddle { -namespace detail { +namespace memory_recycling { +namespace detail { /// Underlying host allocator for HIP pinned memory template struct hip_pinned_allocator { using value_type = T; @@ -175,5 +176,6 @@ struct hip_aggregated_device_buffer { // for the entire lifetime of this buffer }; +} // namespace memory_recycling } // end namespace cppuddle #endif diff --git a/include/recycling_kokkos_view.hpp b/include/recycling_kokkos_view.hpp index c55d3738..86085fc8 100644 --- a/include/recycling_kokkos_view.hpp +++ b/include/recycling_kokkos_view.hpp @@ -13,6 +13,8 @@ namespace cppuddle { +namespace memory_recycling { + template struct view_deleter { @@ -26,7 +28,7 @@ struct view_deleter { }; template -class aggregated_recycle_view : public kokkos_type { +class aggregated_recycling_view : public kokkos_type { private: alloc_type allocator; size_t total_elements{0}; @@ -36,7 +38,7 @@ 
class aggregated_recycle_view : public kokkos_type { public: using view_type = kokkos_type; template - explicit aggregated_recycle_view(alloc_type &alloc, Args... args) + explicit aggregated_recycling_view(alloc_type &alloc, Args... args) : kokkos_type( alloc.allocate(kokkos_type::required_allocation_size(args...) / sizeof(element_type)), @@ -47,15 +49,15 @@ class aggregated_recycle_view : public kokkos_type { data_ref_counter(this->data(), view_deleter( alloc, total_elements)) {} - aggregated_recycle_view( - const aggregated_recycle_view &other) + aggregated_recycling_view( + const aggregated_recycling_view &other) : kokkos_type(other), allocator(other.allocator) { data_ref_counter = other.data_ref_counter; total_elements = other.total_elements; } - aggregated_recycle_view & - operator=(const aggregated_recycle_view &other) { + aggregated_recycling_view & + operator=(const aggregated_recycling_view &other) { data_ref_counter = other.data_ref_counter; allocator = other.allocator; kokkos_type::operator=(other); @@ -63,15 +65,15 @@ class aggregated_recycle_view : public kokkos_type { return *this; } - aggregated_recycle_view( - aggregated_recycle_view &&other) noexcept + aggregated_recycling_view( + aggregated_recycling_view &&other) noexcept : kokkos_type(other), allocator(other.allocator) { data_ref_counter = other.data_ref_counter; total_elements = other.total_elements; } - aggregated_recycle_view &operator=( - aggregated_recycle_view &&other) noexcept { + aggregated_recycling_view &operator=( + aggregated_recycling_view &&other) noexcept { data_ref_counter = other.data_ref_counter; allocator = other.allocator; kokkos_type::operator=(other); @@ -79,12 +81,12 @@ class aggregated_recycle_view : public kokkos_type { return *this; } - ~aggregated_recycle_view() {} + ~aggregated_recycling_view() {} }; template -class recycle_view : public kokkos_type { +class recycling_view : public kokkos_type { private: size_t total_elements{0}; std::shared_ptr data_ref_counter; @@ -94,7 +96,7 @@ class recycle_view : public kokkos_type { static_assert(std::is_same_v); template = true> - recycle_view(Args... args) + recycling_view(Args... args) : kokkos_type( alloc_type{}.allocate(kokkos_type::required_allocation_size(args...) / sizeof(element_type)), @@ -106,7 +108,7 @@ class recycle_view : public kokkos_type { template = true> - recycle_view(const size_t device_id, Args... args) + recycling_view(const size_t device_id, Args... args) : kokkos_type( alloc_type{device_id}.allocate(kokkos_type::required_allocation_size(args...) 
/ sizeof(element_type)), @@ -119,7 +121,7 @@ class recycle_view : public kokkos_type { template < typename layout_t, std::enable_if_t::value, bool> = true> - recycle_view(std::size_t device_id, layout_t layout) + recycling_view(std::size_t device_id, layout_t layout) : kokkos_type( alloc_type{device_id}.allocate(kokkos_type::required_allocation_size(layout) / sizeof(element_type)), @@ -129,41 +131,41 @@ class recycle_view : public kokkos_type { data_ref_counter(this->data(), view_deleter( alloc_type{device_id}, total_elements)) {} - recycle_view( - const recycle_view &other) + recycling_view( + const recycling_view &other) : kokkos_type(other) { total_elements = other.total_elements; data_ref_counter = other.data_ref_counter; } - recycle_view & - operator=(const recycle_view &other) { + recycling_view & + operator=(const recycling_view &other) { data_ref_counter = other.data_ref_counter; kokkos_type::operator=(other); total_elements = other.total_elements; return *this; } - recycle_view( - recycle_view &&other) noexcept + recycling_view( + recycling_view &&other) noexcept : kokkos_type(other) { data_ref_counter = other.data_ref_counter; total_elements = other.total_elements; } - recycle_view &operator=( - recycle_view &&other) noexcept { + recycling_view &operator=( + recycling_view &&other) noexcept { data_ref_counter = other.data_ref_counter; kokkos_type::operator=(other); total_elements = other.total_elements; return *this; } - ~recycle_view() { } + ~recycling_view() { } }; - +} // namespace memory_recycling } // end namespace cppuddle #endif diff --git a/include/std_recycling_allocators.hpp b/include/std_recycling_allocators.hpp index a62390dd..141b0874 100644 --- a/include/std_recycling_allocators.hpp +++ b/include/std_recycling_allocators.hpp @@ -9,6 +9,7 @@ #include "buffer_management_interface.hpp" namespace cppuddle { +namespace memory_recycling { namespace device_selection { /// Dummy GPU selector. 
Needs to be defined for MultiGPU builds as the default / /// select_device_functor does not compile for > 1 GPU (to make sure all / @@ -27,6 +28,7 @@ template ::value, int> = 0> using aggressive_recycle_std = detail::aggressive_recycle_allocator>; +} // namespace memory_recycling } // namespace cppuddle #endif diff --git a/include/sycl_recycling_allocators.hpp b/include/sycl_recycling_allocators.hpp index 66ba1fb8..c4f47c31 100644 --- a/include/sycl_recycling_allocators.hpp +++ b/include/sycl_recycling_allocators.hpp @@ -13,6 +13,7 @@ #include "buffer_management_interface.hpp" namespace cppuddle { +namespace memory_recycling { namespace device_selection { // No MutliGPU support yet, hence no select_device_function required @@ -85,5 +86,6 @@ template ::value, int> = 0> using recycle_allocator_sycl_device = detail::recycle_allocator>; +} // namespace memory_recycling } // end namespace cppuddle #endif diff --git a/tests/allocator_aligned_test.cpp b/tests/allocator_aligned_test.cpp index 65d1df64..510ac9cd 100644 --- a/tests/allocator_aligned_test.cpp +++ b/tests/allocator_aligned_test.cpp @@ -78,7 +78,9 @@ int main(int argc, char *argv[]) { << std::endl; for (size_t pass = 0; pass < passes; pass++) { auto begin = std::chrono::high_resolution_clock::now(); - std::vector> + std::vector< + double, + cppuddle::memory_recycling::aggressive_recycle_aligned> test1(array_size, double{}); auto end = std::chrono::high_resolution_clock::now(); aggressive_duration += @@ -91,8 +93,8 @@ int main(int argc, char *argv[]) { std::cout << "\n==> Aggressive recycle allocation test took " << aggressive_duration << "ms" << std::endl; } - cppuddle::print_buffer_counters(); - cppuddle::force_buffer_cleanup(); // Cleanup all buffers and the managers for better + cppuddle::memory_recycling::print_buffer_counters(); + cppuddle::memory_recycling::force_buffer_cleanup(); // Cleanup all buffers and the managers for better // comparison // Recycle Test: @@ -100,8 +102,9 @@ int main(int argc, char *argv[]) { std::cout << "\nStarting run with recycle allocator: " << std::endl; for (size_t pass = 0; pass < passes; pass++) { auto begin = std::chrono::high_resolution_clock::now(); - std::vector> test1( - array_size, double{}); + std::vector> + test1(array_size, double{}); auto end = std::chrono::high_resolution_clock::now(); recycle_duration += std::chrono::duration_cast(end - begin) @@ -113,8 +116,8 @@ int main(int argc, char *argv[]) { std::cout << "\n\n==> Recycle allocation test took " << recycle_duration << "ms" << std::endl; } - cppuddle::print_buffer_counters(); - cppuddle::force_buffer_cleanup(); // Cleanup all buffers and the managers for better + cppuddle::memory_recycling::print_buffer_counters(); + cppuddle::memory_recycling::force_buffer_cleanup(); // Cleanup all buffers and the managers for better // comparison // Same test using std::allocator: @@ -123,7 +126,7 @@ int main(int argc, char *argv[]) { for (size_t pass = 0; pass < passes; pass++) { auto begin = std::chrono::high_resolution_clock::now(); std::vector> - test2(array_size, double{}); + test2(array_size, double{}); auto end = std::chrono::high_resolution_clock::now(); default_duration += std::chrono::duration_cast(end - begin) @@ -145,7 +148,7 @@ int main(int argc, char *argv[]) { std::cout << "Test information: Aggressive recycler was faster than default allocator!" 
<< std::endl; } - cppuddle::print_buffer_counters(); + cppuddle::memory_recycling::print_buffer_counters(); #ifdef CPPUDDLE_HAVE_HPX return hpx::finalize(); #else diff --git a/tests/allocator_hpx_test.cpp b/tests/allocator_hpx_test.cpp index 4af0878c..b94e305e 100644 --- a/tests/allocator_hpx_test.cpp +++ b/tests/allocator_hpx_test.cpp @@ -112,8 +112,8 @@ int hpx_main(int argc, char *argv[]) { for (size_t pass = 0; pass < passes; pass++) { for (size_t i = 0; i < number_futures; i++) { futs[i] = futs[i].then([&](hpx::shared_future &&predecessor) { - std::vector> test6(array_size, - double{}); + std::vector> + test6(array_size, double{}); }); } } @@ -126,20 +126,20 @@ int hpx_main(int argc, char *argv[]) { std::cout << "\n==> Recycle allocation test took " << recycle_duration << "ms" << std::endl; } - cppuddle::print_buffer_counters(); - cppuddle::force_buffer_cleanup(); // Cleanup all buffers and the managers for better + cppuddle::memory_recycling::print_buffer_counters(); + cppuddle::memory_recycling::force_buffer_cleanup(); // Cleanup all buffers and the managers for better // comparison // ensure that at least 4 buffers have to created for unit testing { - std::vector> buffer1( + std::vector> buffer1( array_size, double{}); - std::vector> buffer2( + std::vector> buffer2( array_size, double{}); - std::vector> buffer3( + std::vector> buffer3( array_size, double{}); - std::vector> buffer4( + std::vector> buffer4( array_size, double{}); } @@ -153,8 +153,10 @@ int hpx_main(int argc, char *argv[]) { for (size_t pass = 0; pass < passes; pass++) { for (size_t i = 0; i < number_futures; i++) { futs[i] = futs[i].then([&](hpx::shared_future &&predecessor) { - std::vector> test6( - array_size, double{}); + std::vector< + double, + cppuddle::memory_recycling::aggressive_recycle_std> + test6(array_size, double{}); }); } } @@ -167,8 +169,8 @@ int hpx_main(int argc, char *argv[]) { std::cout << "\n==> Aggressive recycle allocation test took " << aggressive_duration << "ms" << std::endl; } - cppuddle::print_buffer_counters(); - cppuddle::force_buffer_cleanup(); // Cleanup all buffers and the managers for better + cppuddle::memory_recycling::print_buffer_counters(); + cppuddle::memory_recycling::force_buffer_cleanup(); // Cleanup all buffers and the managers for better // comparison diff --git a/tests/allocator_kokkos_executor_for_loop_test.cpp b/tests/allocator_kokkos_executor_for_loop_test.cpp index 2ac9ea17..439eb374 100644 --- a/tests/allocator_kokkos_executor_for_loop_test.cpp +++ b/tests/allocator_kokkos_executor_for_loop_test.cpp @@ -37,18 +37,17 @@ template using kokkos_um_array = Kokkos::View; template -using recycle_host_view = - cppuddle::recycle_view, cppuddle::recycle_std, T>; - +using recycle_host_view = cppuddle::memory_recycling::recycling_view< + kokkos_um_array, cppuddle::memory_recycling::recycle_std, T>; // Device views using recycle allocators template using kokkos_um_device_array = Kokkos::View; template -using recycle_device_view = - cppuddle::recycle_view, - cppuddle::recycle_allocator_cuda_device, T>; +using recycle_device_view = cppuddle::memory_recycling::recycling_view< + kokkos_um_device_array, + cppuddle::memory_recycling::recycle_allocator_cuda_device, T>; // Host views using pinned memory recycle allocators template @@ -56,9 +55,9 @@ using kokkos_um_pinned_array = Kokkos::View::array_layout, Kokkos::CudaHostPinnedSpace, Kokkos::MemoryUnmanaged>; template -using recycle_pinned_view = - cppuddle::recycle_view, - cppuddle::recycle_allocator_cuda_host, T>; +using 
recycle_pinned_view = cppuddle::memory_recycling::recycling_view< + kokkos_um_pinned_array, + cppuddle::memory_recycling::recycle_allocator_cuda_host, T>; template auto get_iteration_policy(const Executor &&executor, @@ -144,11 +143,11 @@ int main(int argc, char *argv[]) { // otherwise the HPX cuda polling futures won't work hpx::cuda::experimental::detail::unregister_polling(hpx::resource::get_thread_pool(0)); - cppuddle::print_buffer_counters(); + cppuddle::memory_recycling::print_buffer_counters(); // Cleanup all cuda views // (otherwise the cuda driver might shut down before this gets done automatically at // the end of the programm) - cppuddle::force_buffer_cleanup(); + cppuddle::memory_recycling::force_buffer_cleanup(); return hpx::finalize(); } diff --git a/tests/allocator_kokkos_test.cpp b/tests/allocator_kokkos_test.cpp index 4826efec..b513289f 100644 --- a/tests/allocator_kokkos_test.cpp +++ b/tests/allocator_kokkos_test.cpp @@ -33,8 +33,8 @@ template using kokkos_um_array = Kokkos::View; template -using recycle_host_view = - cppuddle::recycle_view, cppuddle::recycle_std, T>; +using recycle_host_view = cppuddle::memory_recycling::recycling_view< + kokkos_um_array, cppuddle::memory_recycling::recycle_std, T>; #ifdef CPPUDDLE_HAVE_HPX int hpx_main(int argc, char *argv[]) { @@ -92,7 +92,7 @@ int main(int argc, char *argv[]) { }); Kokkos::fence(); } - cppuddle::print_buffer_counters(); + cppuddle::memory_recycling::print_buffer_counters(); #ifdef CPPUDDLE_HAVE_HPX return hpx::finalize(); #else diff --git a/tests/allocator_test.cpp b/tests/allocator_test.cpp index 8fc7c5bb..8e13a6df 100644 --- a/tests/allocator_test.cpp +++ b/tests/allocator_test.cpp @@ -77,8 +77,9 @@ int main(int argc, char *argv[]) { std::cout << "\nStarting run with aggressive recycle allocator: " << std::endl; for (size_t pass = 0; pass < passes; pass++) { auto begin = std::chrono::high_resolution_clock::now(); - std::vector> test1( - array_size, double{}); + std::vector> + test1(array_size, double{}); auto end = std::chrono::high_resolution_clock::now(); aggressive_duration += std::chrono::duration_cast(end - begin) @@ -89,8 +90,8 @@ int main(int argc, char *argv[]) { std::cout << "\n\n==> Aggressive recycle allocation test took " << aggressive_duration << "ms" << std::endl; } - cppuddle::print_buffer_counters(); - cppuddle::force_buffer_cleanup(); // Cleanup all buffers and the managers for + cppuddle::memory_recycling::print_buffer_counters(); + cppuddle::memory_recycling::force_buffer_cleanup(); // Cleanup all buffers and the managers for // better comparison // Recycle Test: @@ -98,7 +99,8 @@ int main(int argc, char *argv[]) { std::cout << "\nStarting run with recycle allocator: " << std::endl; for (size_t pass = 0; pass < passes; pass++) { auto begin = std::chrono::high_resolution_clock::now(); - std::vector> test1(array_size, double{}); + std::vector> + test1(array_size, double{}); auto end = std::chrono::high_resolution_clock::now(); recycle_duration += std::chrono::duration_cast(end - begin) @@ -109,8 +111,8 @@ int main(int argc, char *argv[]) { std::cout << "\n\n==> Recycle allocation test took " << recycle_duration << "ms" << std::endl; } - cppuddle::print_buffer_counters(); - cppuddle::force_buffer_cleanup(); // Cleanup all buffers and the managers for + cppuddle::memory_recycling::print_buffer_counters(); + cppuddle::memory_recycling::force_buffer_cleanup(); // Cleanup all buffers and the managers for // better comparison // Same test using std::allocator: @@ -139,7 +141,7 @@ int main(int argc, char 
*argv[]) { std::cout << "Test information: Aggressive recycler was faster than default allocator!" << std::endl; } - cppuddle::print_buffer_counters(); + cppuddle::memory_recycling::print_buffer_counters(); #ifdef CPPUDDLE_HAVE_HPX return hpx::finalize(); #else From 38925759dda5bc75018a4d642042fb19d37115ae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20Dai=C3=9F?= Date: Fri, 8 Mar 2024 01:17:01 +0100 Subject: [PATCH 07/19] Move memory headers into new directory --- include/aggregation_manager.hpp | 2 +- include/aligned_buffer_util.hpp | 2 +- include/buffer_manager.hpp | 6 +++--- .../{detail => cppuddle/common}/config.hpp | 0 .../aligned_recycling_allocators.hpp | 1 + .../buffer_management_interface.hpp | 0 .../cuda_recycling_allocators.hpp | 0 .../detail/buffer_management.hpp | 2 +- .../hip_recycling_allocators.hpp | 0 .../recycling_kokkos_view.hpp | 0 .../std_recycling_allocators.hpp | 1 + .../sycl_recycling_allocators.hpp | 0 include/cuda_buffer_util.hpp | 3 ++- include/hip_buffer_util.hpp | 2 +- include/kokkos_buffer_util.hpp | 2 +- include/stream_manager.hpp | 20 +++++++++++++------ include/sycl_buffer_util.hpp | 2 +- tests/allocator_aligned_test.cpp | 3 ++- tests/allocator_hpx_test.cpp | 2 +- ...llocator_kokkos_executor_for_loop_test.cpp | 6 +++--- tests/allocator_kokkos_test.cpp | 6 +++--- tests/allocator_test.cpp | 2 +- tests/stream_test.hpp | 4 ++-- 23 files changed, 39 insertions(+), 27 deletions(-) rename include/{detail => cppuddle/common}/config.hpp (100%) rename include/{ => cppuddle/memory_recycling}/aligned_recycling_allocators.hpp (99%) rename include/{ => cppuddle/memory_recycling}/buffer_management_interface.hpp (100%) rename include/{ => cppuddle/memory_recycling}/cuda_recycling_allocators.hpp (100%) rename include/{ => cppuddle/memory_recycling}/detail/buffer_management.hpp (99%) rename include/{ => cppuddle/memory_recycling}/hip_recycling_allocators.hpp (100%) rename include/{ => cppuddle/memory_recycling}/recycling_kokkos_view.hpp (100%) rename include/{ => cppuddle/memory_recycling}/std_recycling_allocators.hpp (99%) rename include/{ => cppuddle/memory_recycling}/sycl_recycling_allocators.hpp (100%) diff --git a/include/aggregation_manager.hpp b/include/aggregation_manager.hpp index 1cbe09db..9e40797c 100644 --- a/include/aggregation_manager.hpp +++ b/include/aggregation_manager.hpp @@ -50,7 +50,7 @@ #include "../include/buffer_manager.hpp" #include "../include/stream_manager.hpp" -#include "../include/detail/config.hpp" +#include "cppuddle/common/config.hpp" #ifndef CPPUDDLE_HAVE_HPX_MUTEX #pragma message \ diff --git a/include/aligned_buffer_util.hpp b/include/aligned_buffer_util.hpp index e4ef7990..84b9be19 100644 --- a/include/aligned_buffer_util.hpp +++ b/include/aligned_buffer_util.hpp @@ -6,7 +6,7 @@ #ifndef ALIGNED_BUFFER_UTIL_HPP #define ALIGNED_BUFFER_UTIL_HPP -#include "aligned_recycling_allocators.hpp" +#include "cppuddle/memory_recycling/aligned_recycling_allocators.hpp" namespace recycler { diff --git a/include/buffer_manager.hpp b/include/buffer_manager.hpp index 25e5ce00..186808d2 100644 --- a/include/buffer_manager.hpp +++ b/include/buffer_manager.hpp @@ -1,8 +1,8 @@ -#ifndef BUFFER_MANAGER_INTERFACE_HPP +#ifndef BUFFER_MANAGER_HPP #define BUFFER_MANAGER_HPP -#include "buffer_management_interface.hpp" -#include "std_recycling_allocators.hpp" +#include "cppuddle/memory_recycling/buffer_management_interface.hpp" +#include "cppuddle/memory_recycling/std_recycling_allocators.hpp" namespace recycler { diff --git a/include/detail/config.hpp 
b/include/cppuddle/common/config.hpp similarity index 100% rename from include/detail/config.hpp rename to include/cppuddle/common/config.hpp diff --git a/include/aligned_recycling_allocators.hpp b/include/cppuddle/memory_recycling/aligned_recycling_allocators.hpp similarity index 99% rename from include/aligned_recycling_allocators.hpp rename to include/cppuddle/memory_recycling/aligned_recycling_allocators.hpp index ee0182bb..8a9df8ec 100644 --- a/include/aligned_recycling_allocators.hpp +++ b/include/cppuddle/memory_recycling/aligned_recycling_allocators.hpp @@ -11,6 +11,7 @@ namespace cppuddle { namespace memory_recycling { + namespace device_selection { template /// Dummy GPU selector. Needs to be defined for MultiGPU builds as the default / diff --git a/include/buffer_management_interface.hpp b/include/cppuddle/memory_recycling/buffer_management_interface.hpp similarity index 100% rename from include/buffer_management_interface.hpp rename to include/cppuddle/memory_recycling/buffer_management_interface.hpp diff --git a/include/cuda_recycling_allocators.hpp b/include/cppuddle/memory_recycling/cuda_recycling_allocators.hpp similarity index 100% rename from include/cuda_recycling_allocators.hpp rename to include/cppuddle/memory_recycling/cuda_recycling_allocators.hpp diff --git a/include/detail/buffer_management.hpp b/include/cppuddle/memory_recycling/detail/buffer_management.hpp similarity index 99% rename from include/detail/buffer_management.hpp rename to include/cppuddle/memory_recycling/detail/buffer_management.hpp index 98504d21..6d95ab8f 100644 --- a/include/detail/buffer_management.hpp +++ b/include/cppuddle/memory_recycling/detail/buffer_management.hpp @@ -42,7 +42,7 @@ For better performance configure CPPuddle with CPPUDDLE_WITH_HPX_AWARE_ALLOCATOR #endif #endif -#include "config.hpp" +#include "cppuddle/common/config.hpp" namespace cppuddle { namespace memory_recycling { diff --git a/include/hip_recycling_allocators.hpp b/include/cppuddle/memory_recycling/hip_recycling_allocators.hpp similarity index 100% rename from include/hip_recycling_allocators.hpp rename to include/cppuddle/memory_recycling/hip_recycling_allocators.hpp diff --git a/include/recycling_kokkos_view.hpp b/include/cppuddle/memory_recycling/recycling_kokkos_view.hpp similarity index 100% rename from include/recycling_kokkos_view.hpp rename to include/cppuddle/memory_recycling/recycling_kokkos_view.hpp diff --git a/include/std_recycling_allocators.hpp b/include/cppuddle/memory_recycling/std_recycling_allocators.hpp similarity index 99% rename from include/std_recycling_allocators.hpp rename to include/cppuddle/memory_recycling/std_recycling_allocators.hpp index 141b0874..21fd5c2c 100644 --- a/include/std_recycling_allocators.hpp +++ b/include/cppuddle/memory_recycling/std_recycling_allocators.hpp @@ -10,6 +10,7 @@ namespace cppuddle { namespace memory_recycling { + namespace device_selection { /// Dummy GPU selector. 
Needs to be defined for MultiGPU builds as the default / /// select_device_functor does not compile for > 1 GPU (to make sure all / diff --git a/include/sycl_recycling_allocators.hpp b/include/cppuddle/memory_recycling/sycl_recycling_allocators.hpp similarity index 100% rename from include/sycl_recycling_allocators.hpp rename to include/cppuddle/memory_recycling/sycl_recycling_allocators.hpp diff --git a/include/cuda_buffer_util.hpp b/include/cuda_buffer_util.hpp index 6334da8a..7589993d 100644 --- a/include/cuda_buffer_util.hpp +++ b/include/cuda_buffer_util.hpp @@ -6,7 +6,8 @@ #ifndef CUDA_BUFFER_UTIL_HPP #define CUDA_BUFFER_UTIL_HPP -#include "cuda_recycling_allocators.hpp" +#include "buffer_manager.hpp" +#include "cppuddle/memory_recycling/cuda_recycling_allocators.hpp" namespace recycler { namespace detail { diff --git a/include/hip_buffer_util.hpp b/include/hip_buffer_util.hpp index eadedc07..2912666f 100644 --- a/include/hip_buffer_util.hpp +++ b/include/hip_buffer_util.hpp @@ -6,7 +6,7 @@ #ifndef HIP_BUFFER_UTIL_HPP #define HIP_BUFFER_UTIL_HPP -#include "hip_recycling_allocators.hpp" +#include "cppuddle/memory_recycling/hip_recycling_allocators.hpp" namespace recycler { diff --git a/include/kokkos_buffer_util.hpp b/include/kokkos_buffer_util.hpp index 7b267619..fc66e539 100644 --- a/include/kokkos_buffer_util.hpp +++ b/include/kokkos_buffer_util.hpp @@ -5,7 +5,7 @@ #ifndef KOKKOS_BUFFER_UTIL_HPP #define KOKKOS_BUFFER_UTIL_HPP -#include "recycling_kokkos_view.hpp" +#include "cppuddle/memory_recycling/recycling_kokkos_view.hpp" namespace recycler { diff --git a/include/stream_manager.hpp b/include/stream_manager.hpp index bfaba518..4d681e1a 100644 --- a/include/stream_manager.hpp +++ b/include/stream_manager.hpp @@ -17,7 +17,7 @@ #include #include -#include "../include/detail/config.hpp" +#include "cppuddle/common/config.hpp" // Need to cuda/hip definitions for default params when NOT // drawing from an executor pool @@ -39,6 +39,10 @@ enum class execution_space_mode { global, independent }; #endif #endif +/* namespace cppuddle { */ +/* namespace executor_recycling { */ + +namespace detail { /// Turns a std::array_mutex into an scoped lock template auto make_scoped_lock_from_array(mutex_array_t& mutexes) @@ -46,6 +50,7 @@ auto make_scoped_lock_from_array(mutex_array_t& mutexes) return std::apply([](auto&... mutexes) { return std::scoped_lock{mutexes...}; }, mutexes); } +} // namespace detail template class round_robin_pool { private: @@ -197,7 +202,7 @@ class stream_pool { static void init(size_t number_of_streams, Ts ... executor_args) { /* static_assert(sizeof...(Ts) == sizeof...(Ts) && cppuddle::max_number_gpus == 1, */ /* "deprecated stream_pool::init does not support multigpu"); */ - auto guard = make_scoped_lock_from_array(instance().gpu_mutexes); + auto guard = detail::make_scoped_lock_from_array(instance().gpu_mutexes); instance().streampools.emplace_back(number_of_streams, executor_args...); assert(instance().streampools.size() <= cppuddle::max_number_gpus); } @@ -205,7 +210,7 @@ class stream_pool { /// Multi-GPU init where executors / interfaces on all GPUs are initialized with the same arguments template static void init_all_executor_pools(size_t number_of_streams, Ts ... 
executor_args) { - auto guard = make_scoped_lock_from_array(instance().gpu_mutexes); + auto guard = detail::make_scoped_lock_from_array(instance().gpu_mutexes); if (number_of_streams > 0) { for (size_t gpu_id = 0; gpu_id < cppuddle::max_number_gpus; gpu_id++) { instance().select_gpu_function(gpu_id); @@ -220,7 +225,7 @@ class stream_pool { /// (useful for executor that expect an GPU-id during construction) template static void init_executor_pool(size_t gpu_id, size_t number_of_streams, Ts ... executor_args) { - auto guard = make_scoped_lock_from_array(instance().gpu_mutexes); + auto guard = detail::make_scoped_lock_from_array(instance().gpu_mutexes); if (number_of_streams > 0) { instance().select_gpu_function(gpu_id); instance().streampools.emplace_back(number_of_streams, @@ -231,7 +236,7 @@ class stream_pool { // TODO add/rename into finalize? static void cleanup() { - auto guard = make_scoped_lock_from_array(instance().gpu_mutexes); + auto guard = detail::make_scoped_lock_from_array(instance().gpu_mutexes); assert(instance().streampools.size() == cppuddle::max_number_gpus); instance().streampools.clear(); } @@ -264,7 +269,7 @@ class stream_pool { /* } */ static void set_device_selector(std::function select_gpu_function) { - auto guard = make_scoped_lock_from_array(instance().gpu_mutexes); + auto guard = detail::make_scoped_lock_from_array(instance().gpu_mutexes); instance().select_gpu_function = select_gpu_function; } @@ -410,4 +415,7 @@ template class stream_interface { }; #endif +/* } // namespace executor_recycling */ +/* } // namespace cppuddle */ + #endif diff --git a/include/sycl_buffer_util.hpp b/include/sycl_buffer_util.hpp index 4da36df9..7c88a4df 100644 --- a/include/sycl_buffer_util.hpp +++ b/include/sycl_buffer_util.hpp @@ -6,7 +6,7 @@ #ifndef SYCL_BUFFER_UTIL_HPP #define SYCL_BUFFER_UTIL_HPP -#include "sycl_recycling_allocators.hpp" +#include "cppuddle/memory_recycling/sycl_recycling_allocators.hpp" namespace recycler { diff --git a/tests/allocator_aligned_test.cpp b/tests/allocator_aligned_test.cpp index 510ac9cd..ea9ce9a4 100644 --- a/tests/allocator_aligned_test.cpp +++ b/tests/allocator_aligned_test.cpp @@ -3,7 +3,6 @@ // Distributed under the Boost Software License, Version 1.0. 
(See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) -#include "../include/aligned_recycling_allocators.hpp" #ifdef CPPUDDLE_HAVE_HPX #include #endif @@ -17,6 +16,8 @@ #include #include +#include "cppuddle/memory_recycling/aligned_recycling_allocators.hpp" + #ifdef CPPUDDLE_HAVE_HPX int hpx_main(int argc, char *argv[]) { #else diff --git a/tests/allocator_hpx_test.cpp b/tests/allocator_hpx_test.cpp index b94e305e..21c4baed 100644 --- a/tests/allocator_hpx_test.cpp +++ b/tests/allocator_hpx_test.cpp @@ -15,7 +15,7 @@ #include -#include "std_recycling_allocators.hpp" +#include "cppuddle/memory_recycling/std_recycling_allocators.hpp" int hpx_main(int argc, char *argv[]) { diff --git a/tests/allocator_kokkos_executor_for_loop_test.cpp b/tests/allocator_kokkos_executor_for_loop_test.cpp index 439eb374..c38294d7 100644 --- a/tests/allocator_kokkos_executor_for_loop_test.cpp +++ b/tests/allocator_kokkos_executor_for_loop_test.cpp @@ -21,9 +21,9 @@ #include #include -#include "std_recycling_allocators.hpp" -#include "cuda_recycling_allocators.hpp" -#include "recycling_kokkos_view.hpp" +#include "cppuddle/memory_recycling/std_recycling_allocators.hpp" +#include "cppuddle/memory_recycling/cuda_recycling_allocators.hpp" +#include "cppuddle/memory_recycling/recycling_kokkos_view.hpp" // Assert during Release builds as well for this file: #undef NDEBUG diff --git a/tests/allocator_kokkos_test.cpp b/tests/allocator_kokkos_test.cpp index b513289f..5fb780e5 100644 --- a/tests/allocator_kokkos_test.cpp +++ b/tests/allocator_kokkos_test.cpp @@ -21,9 +21,9 @@ #include #include -#include "std_recycling_allocators.hpp" -#include "cuda_recycling_allocators.hpp" -#include "recycling_kokkos_view.hpp" +#include "cppuddle/memory_recycling/std_recycling_allocators.hpp" +#include "cppuddle/memory_recycling/cuda_recycling_allocators.hpp" +#include "cppuddle/memory_recycling/recycling_kokkos_view.hpp" using kokkos_array = Kokkos::View; diff --git a/tests/allocator_test.cpp b/tests/allocator_test.cpp index 8e13a6df..9a44664f 100644 --- a/tests/allocator_test.cpp +++ b/tests/allocator_test.cpp @@ -16,7 +16,7 @@ #include #include -#include "std_recycling_allocators.hpp" +#include "cppuddle/memory_recycling/std_recycling_allocators.hpp" #ifdef CPPUDDLE_HAVE_HPX int hpx_main(int argc, char *argv[]) { diff --git a/tests/stream_test.hpp b/tests/stream_test.hpp index 07de4c44..2cfc5b07 100644 --- a/tests/stream_test.hpp +++ b/tests/stream_test.hpp @@ -9,8 +9,8 @@ #include #include #include -#include "../include/buffer_manager.hpp" -#include "../include/cuda_buffer_util.hpp" +#include "cppuddle/memory_recycling/cuda_recycling_allocators.hpp" +#include "cuda_buffer_util.hpp" template void test_pool_memcpy(const size_t stream_parameter, Ts &&... 
ts) { From e2da83dd51511a82985659fedc5574df027e027c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20Dai=C3=9F?= Date: Fri, 8 Mar 2024 02:50:12 +0100 Subject: [PATCH 08/19] Executor pool interface refactoring --- include/aggregation_manager.hpp | 2 +- include/aligned_buffer_util.hpp | 2 +- include/buffer_manager.hpp | 5 + include/cppuddle/common/config.hpp | 2 +- .../executor_pools_management.hpp | 421 +++++++++++++++++ .../aligned_recycling_allocators.hpp | 2 +- .../buffer_management_interface.hpp | 5 + .../detail/buffer_management.hpp | 2 +- .../hip_recycling_allocators.hpp | 2 +- .../recycling_kokkos_view.hpp | 2 +- .../sycl_recycling_allocators.hpp | 2 +- include/cuda_buffer_util.hpp | 2 +- include/hip_buffer_util.hpp | 2 +- include/kokkos_buffer_util.hpp | 2 +- include/stream_manager.hpp | 437 +----------------- include/sycl_buffer_util.hpp | 2 +- tests/stream_test.cpp | 48 +- tests/stream_test.hpp | 168 ++++--- 18 files changed, 606 insertions(+), 502 deletions(-) create mode 100644 include/cppuddle/executor_recycling/executor_pools_management.hpp diff --git a/include/aggregation_manager.hpp b/include/aggregation_manager.hpp index 9e40797c..70acfd61 100644 --- a/include/aggregation_manager.hpp +++ b/include/aggregation_manager.hpp @@ -1,4 +1,4 @@ -// Copyright (c) 2022-2023 Gregor Daiß +// Copyright (c) 2022-2024 Gregor Daiß // // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) diff --git a/include/aligned_buffer_util.hpp b/include/aligned_buffer_util.hpp index 84b9be19..02a57104 100644 --- a/include/aligned_buffer_util.hpp +++ b/include/aligned_buffer_util.hpp @@ -1,4 +1,4 @@ -// Copyright (c) 2020-2021 Gregor Daiß +// Copyright (c) 2024 Gregor Daiß // // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) diff --git a/include/buffer_manager.hpp b/include/buffer_manager.hpp index 186808d2..fb253990 100644 --- a/include/buffer_manager.hpp +++ b/include/buffer_manager.hpp @@ -1,3 +1,8 @@ +// Copyright (c) 2024 Gregor Daiß +// +// Distributed under the Boost Software License, Version 1.0. (See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + #ifndef BUFFER_MANAGER_HPP #define BUFFER_MANAGER_HPP diff --git a/include/cppuddle/common/config.hpp b/include/cppuddle/common/config.hpp index 7115c790..c9a5f736 100644 --- a/include/cppuddle/common/config.hpp +++ b/include/cppuddle/common/config.hpp @@ -1,4 +1,4 @@ -// Copyright (c) 2023-2023 Gregor Daiß +// Copyright (c) 2023-2024 Gregor Daiß // // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) diff --git a/include/cppuddle/executor_recycling/executor_pools_management.hpp b/include/cppuddle/executor_recycling/executor_pools_management.hpp new file mode 100644 index 00000000..16776031 --- /dev/null +++ b/include/cppuddle/executor_recycling/executor_pools_management.hpp @@ -0,0 +1,421 @@ +// Copyright (c) 2020-2024 Gregor Daiß +// +// Distributed under the Boost Software License, Version 1.0. 
(See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#ifndef EXECUTOR_POOLS_MANAGEMENT_HPP +#define EXECUTOR_POOLS_MANAGEMENT_HPP + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cppuddle/common/config.hpp" + +// Need to cuda/hip definitions for default params when NOT +// drawing from an executor pool +#if defined(CPPUDDLE_DEACTIVATE_EXECUTOR_RECYCLING) +#include +#if defined(HPX_HAVE_CUDA) || defined(HPX_HAVE_HIP) +#include +#endif +#endif + +// Redefintion required for non-recycling executors +// Without it, default constructing the executors (independent) would not work +#if defined(CPPUDDLE_DEACTIVATE_EXECUTOR_RECYCLING) +// Do only define if Kokkos is not found +#ifndef KOKKOS_ENABLE_SERIAL +namespace hpx { namespace kokkos { +enum class execution_space_mode { global, independent }; +}} +#endif +#endif + +namespace cppuddle { +namespace executor_recycling { + +namespace detail { +/// Turns a std::array_mutex into an scoped lock +template +auto make_scoped_lock_from_array(mutex_array_t& mutexes) +{ + return std::apply([](auto&... mutexes) { return std::scoped_lock{mutexes...}; }, + mutexes); +} +} // namespace detail + +template class round_robin_pool_impl { +private: + std::deque pool{}; + std::vector ref_counters{}; + size_t current_interface{0}; + +public: + template + round_robin_pool_impl(size_t number_of_executors, Ts... executor_args) { + ref_counters.reserve(number_of_executors); + for (int i = 0; i < number_of_executors; i++) { + pool.emplace_back(executor_args...); + ref_counters.emplace_back(0); + } + } + // return a tuple with the interface and its index (to release it later) + std::tuple get_interface() { + assert(!(pool.empty())); + size_t last_interface = current_interface; + current_interface = (current_interface + 1) % pool.size(); + ref_counters[last_interface]++; + std::tuple ret(pool[last_interface], last_interface); + return ret; + } + void release_interface(size_t index) { ref_counters[index]--; } + bool interface_available(size_t load_limit) { + return *(std::min_element(std::begin(ref_counters), + std::end(ref_counters))) < load_limit; + } + size_t get_current_load() { + return *( + std::min_element(std::begin(ref_counters), std::end(ref_counters))); + } + // TODO Remove + /* size_t get_next_device_id() { */ + /* return 0; // single gpu pool */ + /* } */ +}; + +template class priority_pool_impl { +private: + std::deque pool{}; + std::vector ref_counters{}; // Ref counters + std::vector priorities{}; // Ref counters +public: + template + priority_pool_impl(size_t number_of_executors, Ts... 
executor_args) { + ref_counters.reserve(number_of_executors); + priorities.reserve(number_of_executors); + for (auto i = 0; i < number_of_executors; i++) { + pool.emplace_back(executor_args...); + ref_counters.emplace_back(0); + priorities.emplace_back(i); + } + } + // return a tuple with the interface and its index (to release it later) + std::tuple get_interface() { + auto &interface = pool[priorities[0]]; + ref_counters[priorities[0]]++; + std::tuple ret(interface, priorities[0]); + std::make_heap(std::begin(priorities), std::end(priorities), + [this](const size_t &first, const size_t &second) -> bool { + return ref_counters[first] > ref_counters[second]; + }); + return ret; + } + void release_interface(size_t index) { + ref_counters[index]--; + std::make_heap(std::begin(priorities), std::end(priorities), + [this](const size_t &first, const size_t &second) -> bool { + return ref_counters[first] > ref_counters[second]; + }); + } + bool interface_available(size_t load_limit) { + return ref_counters[priorities[0]] < load_limit; + } + size_t get_current_load() { return ref_counters[priorities[0]]; } + // TODO remove + /* size_t get_next_device_id() { */ + /* return 0; // single gpu pool */ + /* } */ +}; + +/// Access/Concurrency Control for executor pool implementation +class executor_pool { +public: + template + static void init(size_t number_of_executors, Ts ... executor_args) { + executor_pool_implementation::init(number_of_executors, + executor_args...); + } + template + static void init_all_executor_pools(size_t number_of_executors, Ts ... executor_args) { + executor_pool_implementation::init_all_executor_pools(number_of_executors, + executor_args...); + } + template + static void init_executor_pool(size_t pool_id, size_t number_of_executors, Ts ... executor_args) { + executor_pool_implementation::init_executor_pool(pool_id, number_of_executors, + executor_args...); + } + template static void cleanup() { + executor_pool_implementation::cleanup(); + } + template + static std::tuple get_interface(const size_t gpu_id) { + return executor_pool_implementation::get_interface(gpu_id); + } + template + static void release_interface(size_t index, const size_t gpu_id) noexcept { + executor_pool_implementation::release_interface(index, + gpu_id); + } + template + static bool interface_available(size_t load_limit, const size_t gpu_id) noexcept { + return executor_pool_implementation::interface_available( + load_limit, gpu_id); + } + template + static size_t get_current_load(const size_t gpu_id = 0) noexcept { + return executor_pool_implementation::get_current_load( + gpu_id); + } + template + static size_t get_next_device_id(const size_t number_gpus) noexcept { + // TODO add round robin and min strategy + return cppuddle::get_device_id(number_gpus); + } + + template + static void set_device_selector(std::function select_gpu_function) { + executor_pool_implementation::set_device_selector(select_gpu_function); + } + + template + static void select_device(size_t gpu_id) { + executor_pool_implementation::select_device(gpu_id); + } + +private: + executor_pool() = default; + +private: + template class executor_pool_implementation { + public: + /// Deprecated! Use init_on_all_gpu or init_on_gpu + template + static void init(size_t number_of_executors, Ts ... 
executor_args) { + /* static_assert(sizeof...(Ts) == sizeof...(Ts) && cppuddle::max_number_gpus == 1, */ + /* "deprecated executor_pool::init does not support multigpu"); */ + auto guard = detail::make_scoped_lock_from_array(instance().gpu_mutexes); + instance().executorpools.emplace_back(number_of_executors, executor_args...); + assert(instance().executorpools.size() <= cppuddle::max_number_gpus); + } + + /// Multi-GPU init where executors / interfaces on all GPUs are initialized with the same arguments + template + static void init_all_executor_pools(size_t number_of_executors, Ts ... executor_args) { + auto guard = detail::make_scoped_lock_from_array(instance().gpu_mutexes); + if (number_of_executors > 0) { + for (size_t gpu_id = 0; gpu_id < cppuddle::max_number_gpus; gpu_id++) { + instance().select_gpu_function(gpu_id); + instance().executorpools.emplace_back(number_of_executors, + executor_args...); + } + } + assert(instance().executorpools.size() <= cppuddle::max_number_gpus); + } + + /// Per-GPU init allowing for different init parameters depending on the GPU + /// (useful for executor that expect an GPU-id during construction) + template + static void init_executor_pool(size_t gpu_id, size_t number_of_executors, Ts ... executor_args) { + auto guard = detail::make_scoped_lock_from_array(instance().gpu_mutexes); + if (number_of_executors > 0) { + instance().select_gpu_function(gpu_id); + instance().executorpools.emplace_back(number_of_executors, + executor_args...); + } + assert(instance().executorpools.size() <= cppuddle::max_number_gpus); + } + + // TODO add/rename into finalize? + static void cleanup() { + auto guard = detail::make_scoped_lock_from_array(instance().gpu_mutexes); + assert(instance().executorpools.size() == cppuddle::max_number_gpus); + instance().executorpools.clear(); + } + + static std::tuple get_interface(const size_t gpu_id = 0) { + std::lock_guard guard(instance().gpu_mutexes[gpu_id]); + assert(gpu_id < instance().executorpools.size()); + return instance().executorpools[gpu_id].get_interface(); + } + static void release_interface(size_t index, const size_t gpu_id = 0) { + std::lock_guard guard(instance().gpu_mutexes[gpu_id]); + assert(gpu_id < instance().executorpools.size()); + instance().executorpools[gpu_id].release_interface(index); + } + static bool interface_available(size_t load_limit, const size_t gpu_id = 0) { + std::lock_guard guard(instance().gpu_mutexes[gpu_id]); + assert(gpu_id < instance().executorpools.size()); + return instance().executorpools[gpu_id].interface_available(load_limit); + } + static size_t get_current_load(const size_t gpu_id = 0) { + std::lock_guard guard(instance().gpu_mutexes[gpu_id]); + assert(gpu_id < instance().executorpools.size()); + return instance().executorpools[gpu_id].get_current_load(); + } + // TODO deprecated! Remove... 
+ /* static size_t get_next_device_id(const size_t gpu_id = 0) { */ + /* std::lock_guard guard(instance().gpu_mutexes[gpu_id]); */ + /* assert(instance().executorpools.size() == cppuddle::max_number_gpus); */ + /* return instance().executorpools[gpu_id].get_next_device_id(); */ + /* } */ + + static void set_device_selector(std::function select_gpu_function) { + auto guard = detail::make_scoped_lock_from_array(instance().gpu_mutexes); + instance().select_gpu_function = select_gpu_function; + } + + static void select_device(size_t gpu_id) { + instance().select_gpu_function(gpu_id); + } + + private: + executor_pool_implementation() = default; + cppuddle::mutex_t pool_mut{}; + std::function select_gpu_function = [](size_t gpu_id) { + // By default no multi gpu support + assert(cppuddle::max_number_gpus == 1 || instance().executorpools.size() == 1); + assert(gpu_id == 0); + }; + + std::deque executorpools{}; + std::array gpu_mutexes; + + static executor_pool_implementation& instance(void) { + static executor_pool_implementation pool_instance{}; + return pool_instance; + } + + public: + ~executor_pool_implementation() = default; + // Bunch of constructors we don't need + executor_pool_implementation(executor_pool_implementation const &other) = + delete; + executor_pool_implementation & + operator=(executor_pool_implementation const &other) = delete; + executor_pool_implementation(executor_pool_implementation &&other) = delete; + executor_pool_implementation & + operator=(executor_pool_implementation &&other) = delete; + }; + +public: + ~executor_pool() = default; + // Bunch of constructors we don't need + executor_pool(executor_pool const &other) = delete; + executor_pool &operator=(executor_pool const &other) = delete; + executor_pool(executor_pool &&other) = delete; + executor_pool &operator=(executor_pool &&other) = delete; +}; + +#if defined(CPPUDDLE_DEACTIVATE_EXECUTOR_RECYCLING) + +// Warn about suboptimal performance without recycling +#pragma message \ +"Warning: Building without executor recycling! Use only for performance testing! \ +For better performance configure CPPuddle with CPPUDDLE_WITH_EXECUTOR_RECYCLING=ON!" + +/// Slow version of the executor_interface that does not draw its +/// executors (Interface) from the pool but creates them instead. +/// Only meant for performance comparisons and only works with cuda/kokkos executors +template class executor_interface { +public: + + template + explicit executor_interface(size_t gpu_id, + std::enable_if_t::value, size_t> = 0) + : gpu_id(gpu_id), interface(gpu_id) {} + template + explicit executor_interface(std::enable_if_t::value, size_t> = 0) + : gpu_id(gpu_id), interface(hpx::kokkos::execution_space_mode::independent) {} + + executor_interface(const executor_interface &other) = delete; + executor_interface &operator=(const executor_interface &other) = delete; + executor_interface(executor_interface &&other) = delete; + executor_interface &operator=(executor_interface &&other) = delete; + ~executor_interface() { + } + + template + inline decltype(auto) post(F &&f, Ts &&... ts) { + return interface.post(std::forward(f), std::forward(ts)...); + } + + template + inline decltype(auto) async_execute(F &&f, Ts &&... 
ts) { + return interface.async_execute(std::forward(f), std::forward(ts)...); + } + + inline decltype(auto) get_future() { + return interface.get_future(); + } + + // allow implict conversion + operator Interface &() { // NOLINT + return interface; + } + +private: + size_t gpu_id; + +public: + Interface interface; +}; +#else +/// Stream interface for RAII purposes +/// Draws executor from the executor pool and releases it upon +/// destruction +template class executor_interface { +public: + explicit executor_interface(size_t gpu_id) + : t(executor_pool::get_interface(gpu_id)), + interface(std::get<0>(t)), interface_index(std::get<1>(t)), gpu_id(gpu_id) {} + + executor_interface(const executor_interface &other) = delete; + executor_interface &operator=(const executor_interface &other) = delete; + executor_interface(executor_interface &&other) = delete; + executor_interface &operator=(executor_interface &&other) = delete; + ~executor_interface() { + executor_pool::release_interface(interface_index, gpu_id); + } + + template + inline decltype(auto) post(F &&f, Ts &&... ts) { + return interface.post(std::forward(f), std::forward(ts)...); + } + + template + inline decltype(auto) async_execute(F &&f, Ts &&... ts) { + return interface.async_execute(std::forward(f), std::forward(ts)...); + } + + inline decltype(auto) get_future() { + return interface.get_future(); + } + + // allow implict conversion + operator Interface &() { // NOLINT + return interface; + } + +private: + std::tuple t; + size_t interface_index; + size_t gpu_id; + +public: + Interface &interface; +}; +#endif + +} // namespace executor_recycling +} // namespace cppuddle + +#endif diff --git a/include/cppuddle/memory_recycling/aligned_recycling_allocators.hpp b/include/cppuddle/memory_recycling/aligned_recycling_allocators.hpp index 8a9df8ec..a824e7e0 100644 --- a/include/cppuddle/memory_recycling/aligned_recycling_allocators.hpp +++ b/include/cppuddle/memory_recycling/aligned_recycling_allocators.hpp @@ -1,4 +1,4 @@ -// Copyright (c) 2020-2021 Gregor Daiß +// Copyright (c) 2020-2024 Gregor Daiß // // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) diff --git a/include/cppuddle/memory_recycling/buffer_management_interface.hpp b/include/cppuddle/memory_recycling/buffer_management_interface.hpp index 8614568b..c5fa44cd 100644 --- a/include/cppuddle/memory_recycling/buffer_management_interface.hpp +++ b/include/cppuddle/memory_recycling/buffer_management_interface.hpp @@ -1,3 +1,8 @@ +// Copyright (c) 2024 Gregor Daiß +// +// Distributed under the Boost Software License, Version 1.0. (See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + #ifndef BUFFER_MANAGEMENT_INTERFACE_HPP #define BUFFER_MANAGEMENT_INTERFACE_HPP diff --git a/include/cppuddle/memory_recycling/detail/buffer_management.hpp b/include/cppuddle/memory_recycling/detail/buffer_management.hpp index 6d95ab8f..7c30c781 100644 --- a/include/cppuddle/memory_recycling/detail/buffer_management.hpp +++ b/include/cppuddle/memory_recycling/detail/buffer_management.hpp @@ -1,4 +1,4 @@ -// Copyright (c) 2020-2023 Gregor Daiß +// Copyright (c) 2020-2024 Gregor Daiß // // Distributed under the Boost Software License, Version 1.0. 
(See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) diff --git a/include/cppuddle/memory_recycling/hip_recycling_allocators.hpp b/include/cppuddle/memory_recycling/hip_recycling_allocators.hpp index 274fbb68..d4b2da3c 100644 --- a/include/cppuddle/memory_recycling/hip_recycling_allocators.hpp +++ b/include/cppuddle/memory_recycling/hip_recycling_allocators.hpp @@ -1,4 +1,4 @@ -// Copyright (c: 2020-2021 Gregor Daiß +// Copyright (c: 2020-2024 Gregor Daiß // // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) diff --git a/include/cppuddle/memory_recycling/recycling_kokkos_view.hpp b/include/cppuddle/memory_recycling/recycling_kokkos_view.hpp index 86085fc8..98ce2799 100644 --- a/include/cppuddle/memory_recycling/recycling_kokkos_view.hpp +++ b/include/cppuddle/memory_recycling/recycling_kokkos_view.hpp @@ -1,4 +1,4 @@ -// Copyright (c) 2020-2021 Gregor Daiß +// Copyright (c) 2020-2024 Gregor Daiß // // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) diff --git a/include/cppuddle/memory_recycling/sycl_recycling_allocators.hpp b/include/cppuddle/memory_recycling/sycl_recycling_allocators.hpp index c4f47c31..233afe71 100644 --- a/include/cppuddle/memory_recycling/sycl_recycling_allocators.hpp +++ b/include/cppuddle/memory_recycling/sycl_recycling_allocators.hpp @@ -1,4 +1,4 @@ -// Copyright (c: 2020-2021 Gregor Daiß +// Copyright (c: 2020-2024 Gregor Daiß // // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) diff --git a/include/cuda_buffer_util.hpp b/include/cuda_buffer_util.hpp index 7589993d..8d004bef 100644 --- a/include/cuda_buffer_util.hpp +++ b/include/cuda_buffer_util.hpp @@ -1,4 +1,4 @@ -// Copyright (c) 2020-2023 Gregor Daiß +// Copyright (c) 2024 Gregor Daiß // // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) diff --git a/include/hip_buffer_util.hpp b/include/hip_buffer_util.hpp index 2912666f..9bc8ccc3 100644 --- a/include/hip_buffer_util.hpp +++ b/include/hip_buffer_util.hpp @@ -1,4 +1,4 @@ -// Copyright (c: 2020-2021 Gregor Daiß +// Copyright (c): 2024 Gregor Daiß // // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) diff --git a/include/kokkos_buffer_util.hpp b/include/kokkos_buffer_util.hpp index fc66e539..54736ebe 100644 --- a/include/kokkos_buffer_util.hpp +++ b/include/kokkos_buffer_util.hpp @@ -1,4 +1,4 @@ -// Copyright (c) 2020-2021 Gregor Daiß +// Copyright (c) 2024 Gregor Daiß // // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) diff --git a/include/stream_manager.hpp b/include/stream_manager.hpp index 4d681e1a..940620d5 100644 --- a/include/stream_manager.hpp +++ b/include/stream_manager.hpp @@ -1,4 +1,4 @@ -// Copyright (c) 2020-2023 Gregor Daiß +// Copyright (c) 2024 Gregor Daiß // // Distributed under the Boost Software License, Version 1.0. 
(See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) @@ -6,416 +6,29 @@ #ifndef STREAM_MANAGER_HPP #define STREAM_MANAGER_HPP -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "cppuddle/common/config.hpp" - -// Need to cuda/hip definitions for default params when NOT -// drawing from an executor pool -#if defined(CPPUDDLE_DEACTIVATE_EXECUTOR_RECYCLING) -#include -#if defined(HPX_HAVE_CUDA) || defined(HPX_HAVE_HIP) -#include -#endif -#endif - -// Redefintion required for non-recycling executors -// Without it, default constructing the executors (independent) would not work -#if defined(CPPUDDLE_DEACTIVATE_EXECUTOR_RECYCLING) -// Do only define if Kokkos is not found -#ifndef KOKKOS_ENABLE_SERIAL -namespace hpx { namespace kokkos { -enum class execution_space_mode { global, independent }; -}} -#endif -#endif - -/* namespace cppuddle { */ -/* namespace executor_recycling { */ - -namespace detail { -/// Turns a std::array_mutex into an scoped lock -template -auto make_scoped_lock_from_array(mutex_array_t& mutexes) -{ - return std::apply([](auto&... mutexes) { return std::scoped_lock{mutexes...}; }, - mutexes); -} -} // namespace detail - -template class round_robin_pool { -private: - std::deque pool{}; - std::vector ref_counters{}; - size_t current_interface{0}; - -public: - template - round_robin_pool(size_t number_of_streams, Ts... executor_args) { - ref_counters.reserve(number_of_streams); - for (int i = 0; i < number_of_streams; i++) { - pool.emplace_back(executor_args...); - ref_counters.emplace_back(0); - } - } - // return a tuple with the interface and its index (to release it later) - std::tuple get_interface() { - assert(!(pool.empty())); - size_t last_interface = current_interface; - current_interface = (current_interface + 1) % pool.size(); - ref_counters[last_interface]++; - std::tuple ret(pool[last_interface], last_interface); - return ret; - } - void release_interface(size_t index) { ref_counters[index]--; } - bool interface_available(size_t load_limit) { - return *(std::min_element(std::begin(ref_counters), - std::end(ref_counters))) < load_limit; - } - size_t get_current_load() { - return *( - std::min_element(std::begin(ref_counters), std::end(ref_counters))); - } - // TODO Remove - /* size_t get_next_device_id() { */ - /* return 0; // single gpu pool */ - /* } */ -}; - -template class priority_pool { -private: - std::deque pool{}; - std::vector ref_counters{}; // Ref counters - std::vector priorities{}; // Ref counters -public: - template - priority_pool(size_t number_of_streams, Ts... 
executor_args) { - ref_counters.reserve(number_of_streams); - priorities.reserve(number_of_streams); - for (auto i = 0; i < number_of_streams; i++) { - pool.emplace_back(executor_args...); - ref_counters.emplace_back(0); - priorities.emplace_back(i); - } - } - // return a tuple with the interface and its index (to release it later) - std::tuple get_interface() { - auto &interface = pool[priorities[0]]; - ref_counters[priorities[0]]++; - std::tuple ret(interface, priorities[0]); - std::make_heap(std::begin(priorities), std::end(priorities), - [this](const size_t &first, const size_t &second) -> bool { - return ref_counters[first] > ref_counters[second]; - }); - return ret; - } - void release_interface(size_t index) { - ref_counters[index]--; - std::make_heap(std::begin(priorities), std::end(priorities), - [this](const size_t &first, const size_t &second) -> bool { - return ref_counters[first] > ref_counters[second]; - }); - } - bool interface_available(size_t load_limit) { - return ref_counters[priorities[0]] < load_limit; - } - size_t get_current_load() { return ref_counters[priorities[0]]; } - // TODO remove - /* size_t get_next_device_id() { */ - /* return 0; // single gpu pool */ - /* } */ -}; - -/// Access/Concurrency Control for stream pool implementation -class stream_pool { -public: - template - static void init(size_t number_of_streams, Ts ... executor_args) { - stream_pool_implementation::init(number_of_streams, - executor_args...); - } - template - static void init_all_executor_pools(size_t number_of_streams, Ts ... executor_args) { - stream_pool_implementation::init_all_executor_pools(number_of_streams, - executor_args...); - } - template - static void init_executor_pool(size_t pool_id, size_t number_of_streams, Ts ... executor_args) { - stream_pool_implementation::init_executor_pool(pool_id, number_of_streams, - executor_args...); - } - template static void cleanup() { - stream_pool_implementation::cleanup(); - } - template - static std::tuple get_interface(const size_t gpu_id) { - return stream_pool_implementation::get_interface(gpu_id); - } - template - static void release_interface(size_t index, const size_t gpu_id) noexcept { - stream_pool_implementation::release_interface(index, - gpu_id); - } - template - static bool interface_available(size_t load_limit, const size_t gpu_id) noexcept { - return stream_pool_implementation::interface_available( - load_limit, gpu_id); - } - template - static size_t get_current_load(const size_t gpu_id = 0) noexcept { - return stream_pool_implementation::get_current_load( - gpu_id); - } - template - static size_t get_next_device_id(const size_t number_gpus) noexcept { - // TODO add round robin and min strategy - return cppuddle::get_device_id(number_gpus); - } - - template - static void set_device_selector(std::function select_gpu_function) { - stream_pool_implementation::set_device_selector(select_gpu_function); - } - - template - static void select_device(size_t gpu_id) { - stream_pool_implementation::select_device(gpu_id); - } - -private: - stream_pool() = default; - -private: - template class stream_pool_implementation { - public: - /// Deprecated! Use init_on_all_gpu or init_on_gpu - template - static void init(size_t number_of_streams, Ts ... 
executor_args) { - /* static_assert(sizeof...(Ts) == sizeof...(Ts) && cppuddle::max_number_gpus == 1, */ - /* "deprecated stream_pool::init does not support multigpu"); */ - auto guard = detail::make_scoped_lock_from_array(instance().gpu_mutexes); - instance().streampools.emplace_back(number_of_streams, executor_args...); - assert(instance().streampools.size() <= cppuddle::max_number_gpus); - } - - /// Multi-GPU init where executors / interfaces on all GPUs are initialized with the same arguments - template - static void init_all_executor_pools(size_t number_of_streams, Ts ... executor_args) { - auto guard = detail::make_scoped_lock_from_array(instance().gpu_mutexes); - if (number_of_streams > 0) { - for (size_t gpu_id = 0; gpu_id < cppuddle::max_number_gpus; gpu_id++) { - instance().select_gpu_function(gpu_id); - instance().streampools.emplace_back(number_of_streams, - executor_args...); - } - } - assert(instance().streampools.size() <= cppuddle::max_number_gpus); - } - - /// Per-GPU init allowing for different init parameters depending on the GPU - /// (useful for executor that expect an GPU-id during construction) - template - static void init_executor_pool(size_t gpu_id, size_t number_of_streams, Ts ... executor_args) { - auto guard = detail::make_scoped_lock_from_array(instance().gpu_mutexes); - if (number_of_streams > 0) { - instance().select_gpu_function(gpu_id); - instance().streampools.emplace_back(number_of_streams, - executor_args...); - } - assert(instance().streampools.size() <= cppuddle::max_number_gpus); - } - - // TODO add/rename into finalize? - static void cleanup() { - auto guard = detail::make_scoped_lock_from_array(instance().gpu_mutexes); - assert(instance().streampools.size() == cppuddle::max_number_gpus); - instance().streampools.clear(); - } - - static std::tuple get_interface(const size_t gpu_id = 0) { - std::lock_guard guard(instance().gpu_mutexes[gpu_id]); - assert(gpu_id < instance().streampools.size()); - return instance().streampools[gpu_id].get_interface(); - } - static void release_interface(size_t index, const size_t gpu_id = 0) { - std::lock_guard guard(instance().gpu_mutexes[gpu_id]); - assert(gpu_id < instance().streampools.size()); - instance().streampools[gpu_id].release_interface(index); - } - static bool interface_available(size_t load_limit, const size_t gpu_id = 0) { - std::lock_guard guard(instance().gpu_mutexes[gpu_id]); - assert(gpu_id < instance().streampools.size()); - return instance().streampools[gpu_id].interface_available(load_limit); - } - static size_t get_current_load(const size_t gpu_id = 0) { - std::lock_guard guard(instance().gpu_mutexes[gpu_id]); - assert(gpu_id < instance().streampools.size()); - return instance().streampools[gpu_id].get_current_load(); - } - // TODO deprecated! Remove... 
- /* static size_t get_next_device_id(const size_t gpu_id = 0) { */ - /* std::lock_guard guard(instance().gpu_mutexes[gpu_id]); */ - /* assert(instance().streampools.size() == cppuddle::max_number_gpus); */ - /* return instance().streampools[gpu_id].get_next_device_id(); */ - /* } */ - - static void set_device_selector(std::function select_gpu_function) { - auto guard = detail::make_scoped_lock_from_array(instance().gpu_mutexes); - instance().select_gpu_function = select_gpu_function; - } - - static void select_device(size_t gpu_id) { - instance().select_gpu_function(gpu_id); - } - - private: - stream_pool_implementation() = default; - cppuddle::mutex_t pool_mut{}; - std::function select_gpu_function = [](size_t gpu_id) { - // By default no multi gpu support - assert(cppuddle::max_number_gpus == 1 || instance().streampools.size() == 1); - assert(gpu_id == 0); - }; - - std::deque streampools{}; - std::array gpu_mutexes; - - static stream_pool_implementation& instance(void) { - static stream_pool_implementation pool_instance{}; - return pool_instance; - } - - public: - ~stream_pool_implementation() = default; - // Bunch of constructors we don't need - stream_pool_implementation(stream_pool_implementation const &other) = - delete; - stream_pool_implementation & - operator=(stream_pool_implementation const &other) = delete; - stream_pool_implementation(stream_pool_implementation &&other) = delete; - stream_pool_implementation & - operator=(stream_pool_implementation &&other) = delete; - }; - -public: - ~stream_pool() = default; - // Bunch of constructors we don't need - stream_pool(stream_pool const &other) = delete; - stream_pool &operator=(stream_pool const &other) = delete; - stream_pool(stream_pool &&other) = delete; - stream_pool &operator=(stream_pool &&other) = delete; -}; - -#if defined(CPPUDDLE_DEACTIVATE_EXECUTOR_RECYCLING) - -// Warn about suboptimal performance without recycling -#pragma message \ -"Warning: Building without executor recycling! Use only for performance testing! \ -For better performance configure CPPuddle with CPPUDDLE_WITH_EXECUTOR_RECYCLING=ON!" - -/// Slow version of the stream_interface that does not draw its -/// executors (Interface) from the pool but creates them instead. -/// Only meant for performance comparisons and only works with cuda/kokkos executors -template class stream_interface { -public: - - template - explicit stream_interface(size_t gpu_id, - std::enable_if_t::value, size_t> = 0) - : gpu_id(gpu_id), interface(gpu_id) {} - template - explicit stream_interface(std::enable_if_t::value, size_t> = 0) - : gpu_id(gpu_id), interface(hpx::kokkos::execution_space_mode::independent) {} - - stream_interface(const stream_interface &other) = delete; - stream_interface &operator=(const stream_interface &other) = delete; - stream_interface(stream_interface &&other) = delete; - stream_interface &operator=(stream_interface &&other) = delete; - ~stream_interface() { - } - - template - inline decltype(auto) post(F &&f, Ts &&... ts) { - return interface.post(std::forward(f), std::forward(ts)...); - } - - template - inline decltype(auto) async_execute(F &&f, Ts &&... 
ts) { - return interface.async_execute(std::forward(f), std::forward(ts)...); - } - - inline decltype(auto) get_future() { - return interface.get_future(); - } - - // allow implict conversion - operator Interface &() { // NOLINT - return interface; - } - -private: - size_t gpu_id; - -public: - Interface interface; -}; -#else -/// Stream interface for RAII purposes -/// Draws executor from the stream pool and releases it upon -/// destruction -template class stream_interface { -public: - explicit stream_interface(size_t gpu_id) - : t(stream_pool::get_interface(gpu_id)), - interface(std::get<0>(t)), interface_index(std::get<1>(t)), gpu_id(gpu_id) {} - - stream_interface(const stream_interface &other) = delete; - stream_interface &operator=(const stream_interface &other) = delete; - stream_interface(stream_interface &&other) = delete; - stream_interface &operator=(stream_interface &&other) = delete; - ~stream_interface() { - stream_pool::release_interface(interface_index, gpu_id); - } - - template - inline decltype(auto) post(F &&f, Ts &&... ts) { - return interface.post(std::forward(f), std::forward(ts)...); - } - - template - inline decltype(auto) async_execute(F &&f, Ts &&... ts) { - return interface.async_execute(std::forward(f), std::forward(ts)...); - } - - inline decltype(auto) get_future() { - return interface.get_future(); - } - - // allow implict conversion - operator Interface &() { // NOLINT - return interface; - } - -private: - std::tuple t; - size_t interface_index; - size_t gpu_id; - -public: - Interface &interface; -}; -#endif - -/* } // namespace executor_recycling */ -/* } // namespace cppuddle */ +#include "cppuddle/executor_recycling/executor_pools_management.hpp" + +template +using round_robin_pool + [[deprecated("Use cppuddle::executor_recycling::round_robin_pool_impl from " + "header executor_pools_management.hpp instead")]] = + cppuddle::executor_recycling::round_robin_pool_impl; + +template +using priority_pool + [[deprecated("Use cppuddle::executor_recycling::priority_pool_impl from " + "header executor_pools_management.hpp instead")]] = + cppuddle::executor_recycling::priority_pool_impl; + +using stream_pool + [[deprecated("Use cppuddle::executor_recycling::executor_pool from " + "header executor_pools_management.hpp instead")]] = + cppuddle::executor_recycling::executor_pool; + +template +using stream_interface + [[deprecated("Use cppuddle::executor_recycling::executor_interface from " + "header executor_pools_management.hpp instead")]] = + cppuddle::executor_recycling::executor_interface; #endif diff --git a/include/sycl_buffer_util.hpp b/include/sycl_buffer_util.hpp index 7c88a4df..ad64a9dc 100644 --- a/include/sycl_buffer_util.hpp +++ b/include/sycl_buffer_util.hpp @@ -1,4 +1,4 @@ -// Copyright (c: 2020-2021 Gregor Daiß +// Copyright (c: 2024 Gregor Daiß // // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) diff --git a/tests/stream_test.cpp b/tests/stream_test.cpp index 96599759..2e3ebf4c 100644 --- a/tests/stream_test.cpp +++ b/tests/stream_test.cpp @@ -1,10 +1,9 @@ -// Copyright (c) 2020-2021 Gregor Daiß +// Copyright (c) 2020-2024 Gregor Daiß // // Distributed under the Boost Software License, Version 1.0. 
(See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) #define USE_HPX_MAIN -#include "../include/stream_manager.hpp" #include #ifdef USE_HPX_MAIN #include @@ -26,46 +25,49 @@ int main(int argc, char *argv[]) { #endif std::cout << "Starting ref counting tests ..." << std::endl; test_pool_ref_counting>( - 2, 0, false); - test_pool_ref_counting< - hpx::cuda::experimental::cuda_executor, - round_robin_pool>(2, 0, false); + cppuddle::executor_recycling::priority_pool_impl< + hpx::cuda::experimental::cuda_executor>>(2, 0, + false); + test_pool_ref_counting>(2, 0, + false); std::cout << "Finished ref counting tests!" << std::endl; - std::cout << "Starting wrapper objects tests ..." << std::endl; test_pool_wrappers>( - 2, 0, false); + cppuddle::executor_recycling::priority_pool_impl< + hpx::cuda::experimental::cuda_executor>>(2, 0, false); test_pool_wrappers>( - 2, 0, false); + cppuddle::executor_recycling::round_robin_pool_impl< + hpx::cuda::experimental::cuda_executor>>(2, 0, false); std::cout << "Finished wrapper objects tests!" << std::endl; std::cout << "Starting memcpy tests... " << std::endl; test_pool_memcpy>( - 2, 0, false); + cppuddle::executor_recycling::round_robin_pool_impl< + hpx::cuda::experimental::cuda_executor>>(2, 0, false); test_pool_memcpy>( - 2, 0, false); + cppuddle::executor_recycling::priority_pool_impl< + hpx::cuda::experimental::cuda_executor>>(2, 0, false); std::cout << "Finished memcpy tests! " << std::endl; std::cout << "Starting memcpy polling tests... " << std::endl; { // hpx::cuda::experimental::enable_user_polling polling_scope; - hpx::cuda::experimental::detail::register_polling(hpx::resource::get_thread_pool(0)); + hpx::cuda::experimental::detail::register_polling( + hpx::resource::get_thread_pool(0)); test_pool_memcpy>( - 2, 0, true); + cppuddle::executor_recycling::round_robin_pool_impl< + hpx::cuda::experimental::cuda_executor>>(2, 0, true); test_pool_memcpy>( - 2, 0, true); - hpx::cuda::experimental::detail::unregister_polling(hpx::resource::get_thread_pool(0)); + cppuddle::executor_recycling::priority_pool_impl< + hpx::cuda::experimental::cuda_executor>>(2, 0, true); + hpx::cuda::experimental::detail::unregister_polling( + hpx::resource::get_thread_pool(0)); } - recycler::force_cleanup(); + cppuddle::memory_recycling::force_buffer_cleanup(); std::cout << "Finished memcpy tests! " << std::endl; return hpx::finalize(); } diff --git a/tests/stream_test.hpp b/tests/stream_test.hpp index 2cfc5b07..1dfa60db 100644 --- a/tests/stream_test.hpp +++ b/tests/stream_test.hpp @@ -1,4 +1,4 @@ -// Copyright (c) 2020-2021 Gregor Daiß +// Copyright (c) 2020-2024 Gregor Daiß // // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) @@ -10,122 +10,180 @@ #include #include #include "cppuddle/memory_recycling/cuda_recycling_allocators.hpp" -#include "cuda_buffer_util.hpp" +#include "cppuddle/executor_recycling/executor_pools_management.hpp" template -void test_pool_memcpy(const size_t stream_parameter, Ts &&... 
ts) { - std::vector> hostbuffer( - 512); - recycler::cuda_device_buffer devicebuffer(512); - stream_pool::init(stream_parameter, std::forward(ts)...); +void test_pool_memcpy(const size_t executor_parameter, Ts &&...ts) { + std::vector> + hostbuffer(512); + cppuddle::memory_recycling::cuda_device_buffer devicebuffer(512); + cppuddle::executor_recycling::executor_pool::init( + executor_parameter, std::forward(ts)...); // without interface wrapper { - auto test1 = stream_pool::get_interface(0); + auto test1 = + cppuddle::executor_recycling::executor_pool::get_interface(0); Interface test1_interface = std::get<0>(test1); size_t interface_id = std::get<1>(test1); - hpx::apply(test1_interface, cudaMemcpyAsync, devicebuffer.device_side_buffer, - hostbuffer.data(), 512 * sizeof(double), - cudaMemcpyHostToDevice); - auto fut1 = hpx::async(test1_interface, - cudaMemcpyAsync, hostbuffer.data(), devicebuffer.device_side_buffer, - 512 * sizeof(double), cudaMemcpyDeviceToHost); + hpx::apply(test1_interface, cudaMemcpyAsync, + devicebuffer.device_side_buffer, hostbuffer.data(), + 512 * sizeof(double), cudaMemcpyHostToDevice); + auto fut1 = hpx::async(test1_interface, cudaMemcpyAsync, hostbuffer.data(), + devicebuffer.device_side_buffer, + 512 * sizeof(double), cudaMemcpyDeviceToHost); fut1.get(); - stream_pool::release_interface(interface_id, 0); + cppuddle::executor_recycling::executor_pool::release_interface( + interface_id, 0); } // with interface wrapper { - stream_interface test1_interface{0}; + cppuddle::executor_recycling::executor_interface + test1_interface{0}; // hpx::cuda::cuda_executor test1_interface(0, false); - hpx::apply(test1_interface.interface, cudaMemcpyAsync, devicebuffer.device_side_buffer, - hostbuffer.data(), 512 * sizeof(double), - cudaMemcpyHostToDevice); - auto fut1 = hpx::async(test1_interface.interface, - cudaMemcpyAsync, hostbuffer.data(), devicebuffer.device_side_buffer, - 512 * sizeof(double), cudaMemcpyDeviceToHost); + hpx::apply(test1_interface.interface, cudaMemcpyAsync, + devicebuffer.device_side_buffer, hostbuffer.data(), + 512 * sizeof(double), cudaMemcpyHostToDevice); + auto fut1 = hpx::async(test1_interface.interface, cudaMemcpyAsync, + hostbuffer.data(), devicebuffer.device_side_buffer, + 512 * sizeof(double), cudaMemcpyDeviceToHost); fut1.get(); } - stream_pool::cleanup(); + cppuddle::executor_recycling::executor_pool::cleanup(); } template -void test_pool_ref_counting(const size_t stream_parameter, Ts &&... 
ts) { +void test_pool_ref_counting(const size_t executor_parameter, Ts &&...ts) { // init ppol - stream_pool::init(stream_parameter, std::forward(ts)...); + cppuddle::executor_recycling::executor_pool::init( + executor_parameter, std::forward(ts)...); { // Allocating - auto test1 = stream_pool::get_interface(0); - auto load1 = stream_pool::get_current_load(0); + auto test1 = + cppuddle::executor_recycling::executor_pool::get_interface(0); + auto load1 = + cppuddle::executor_recycling::executor_pool::get_current_load(0); assert(load1 == 0); Interface test1_interface = std::get<0>(test1); size_t test1_index = std::get<1>(test1); - auto test2 = stream_pool::get_interface(0); - auto load2 = stream_pool::get_current_load(0); + auto test2 = + cppuddle::executor_recycling::executor_pool::get_interface(0); + auto load2 = + cppuddle::executor_recycling::executor_pool::get_current_load(0); assert(load2 == 1); Interface test2_interface = std::get<0>(test2); // auto fut = test2_interface.get_future(); size_t test2_index = std::get<1>(test2); - auto test3 = stream_pool::get_interface(0); - auto load3 = stream_pool::get_current_load(0); + auto test3 = + cppuddle::executor_recycling::executor_pool::get_interface(0); + auto load3 = + cppuddle::executor_recycling::executor_pool::get_current_load(0); assert(load3 == 1); Interface test3_interface = std::get<0>(test3); size_t test3_index = std::get<1>(test3); - auto test4 = stream_pool::get_interface(0); - auto load4 = stream_pool::get_current_load(0); + auto test4 = + cppuddle::executor_recycling::executor_pool::get_interface(0); + auto load4 = + cppuddle::executor_recycling::executor_pool::get_current_load(0); Interface test4_interface = std::get<0>(test4); size_t test4_index = std::get<1>(test4); assert(load4 == 2); // Releasing - stream_pool::release_interface(test4_index, 0); - load4 = stream_pool::get_current_load(0); + cppuddle::executor_recycling::executor_pool::release_interface( + test4_index, 0); + load4 = + cppuddle::executor_recycling::executor_pool::get_current_load(0); assert(load4 == 1); - stream_pool::release_interface(test3_index, 0); - load3 = stream_pool::get_current_load(0); + cppuddle::executor_recycling::executor_pool::release_interface( + test3_index, 0); + load3 = + cppuddle::executor_recycling::executor_pool::get_current_load(0); assert(load3 == 1); - stream_pool::release_interface(test2_index, 0); - load2 = stream_pool::get_current_load(0); + cppuddle::executor_recycling::executor_pool::release_interface( + test2_index, 0); + load2 = + cppuddle::executor_recycling::executor_pool::get_current_load(0); assert(load2 == 0); - stream_pool::release_interface(test1_index, 0); - load1 = stream_pool::get_current_load(0); + cppuddle::executor_recycling::executor_pool::release_interface( + test1_index, 0); + load1 = + cppuddle::executor_recycling::executor_pool::get_current_load(0); assert(load1 == 0); } // Clear - auto load0 = stream_pool::get_current_load(0); + auto load0 = + cppuddle::executor_recycling::executor_pool::get_current_load(0); assert(load0 == 0); - stream_pool::cleanup(); + cppuddle::executor_recycling::executor_pool::cleanup(); } template -void test_pool_wrappers(const size_t stream_parameter, Ts &&... 
ts) { - using wrapper_type = stream_interface; +void test_pool_wrappers(const size_t executor_parameter, Ts &&...ts) { + using wrapper_type = + cppuddle::executor_recycling::executor_interface; // init ppol - stream_pool::init(stream_parameter, std::forward(ts)...); + cppuddle::executor_recycling::executor_pool::init( + executor_parameter, std::forward(ts)...); { wrapper_type test1{0}; - auto load = stream_pool::get_current_load(0); + auto load = + cppuddle::executor_recycling::executor_pool::get_current_load(0); assert(load == 0); wrapper_type test2{0}; - load = stream_pool::get_current_load(0); + load = + cppuddle::executor_recycling::executor_pool::get_current_load(0); // auto fut = test2.get_future(); assert(load == 1); wrapper_type test3{0}; - load = stream_pool::get_current_load(0); + load = + cppuddle::executor_recycling::executor_pool::get_current_load(0); assert(load == 1); wrapper_type test4{0}; - load = stream_pool::get_current_load(0); + load = + cppuddle::executor_recycling::executor_pool::get_current_load(0); assert(load == 2); // Check availability method: - bool avail = stream_pool::interface_available(1, 0); + bool avail = + cppuddle::executor_recycling::executor_pool::interface_available< + Interface, Pool>(1, 0); assert(avail == false); // NOLINT - avail = stream_pool::interface_available(2, 0); + avail = cppuddle::executor_recycling::executor_pool::interface_available< + Interface, Pool>(2, 0); assert(avail == false); // NOLINT - avail = stream_pool::interface_available(3, 0); + avail = cppuddle::executor_recycling::executor_pool::interface_available< + Interface, Pool>(3, 0); assert(avail == true); // NOLINT } - auto load0 = stream_pool::get_current_load(0); + auto load0 = + cppuddle::executor_recycling::executor_pool::get_current_load(0); assert(load0 == 0); - stream_pool::cleanup(); + cppuddle::executor_recycling::executor_pool::cleanup(); } #endif From 3f88250b69a1c979312d1bc11e7252438718ee14 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20Dai=C3=9F?= Date: Fri, 8 Mar 2024 12:01:16 +0100 Subject: [PATCH 09/19] Move pool implementation into details --- .../executor_pools_management.hpp | 14 ++++---- .../executor_pools_interface.hpp | 32 +++++++++++++++++++ .../hip_recycling_allocators.hpp | 2 +- .../sycl_recycling_allocators.hpp | 2 +- include/hip_buffer_util.hpp | 2 +- include/stream_manager.hpp | 2 +- include/sycl_buffer_util.hpp | 2 +- tests/stream_test.hpp | 2 +- 8 files changed, 45 insertions(+), 13 deletions(-) rename include/cppuddle/executor_recycling/{ => detail}/executor_pools_management.hpp (97%) create mode 100644 include/cppuddle/executor_recycling/executor_pools_interface.hpp diff --git a/include/cppuddle/executor_recycling/executor_pools_management.hpp b/include/cppuddle/executor_recycling/detail/executor_pools_management.hpp similarity index 97% rename from include/cppuddle/executor_recycling/executor_pools_management.hpp rename to include/cppuddle/executor_recycling/detail/executor_pools_management.hpp index 16776031..6a89025b 100644 --- a/include/cppuddle/executor_recycling/executor_pools_management.hpp +++ b/include/cppuddle/executor_recycling/detail/executor_pools_management.hpp @@ -41,8 +41,8 @@ enum class execution_space_mode { global, independent }; namespace cppuddle { namespace executor_recycling { - namespace detail { + /// Turns a std::array_mutex into an scoped lock template auto make_scoped_lock_from_array(mutex_array_t& mutexes) @@ -50,7 +50,6 @@ auto make_scoped_lock_from_array(mutex_array_t& mutexes) return std::apply([](auto&... 
mutexes) { return std::scoped_lock{mutexes...}; }, mutexes); } -} // namespace detail template class round_robin_pool_impl { private: @@ -202,7 +201,7 @@ class executor_pool { static void init(size_t number_of_executors, Ts ... executor_args) { /* static_assert(sizeof...(Ts) == sizeof...(Ts) && cppuddle::max_number_gpus == 1, */ /* "deprecated executor_pool::init does not support multigpu"); */ - auto guard = detail::make_scoped_lock_from_array(instance().gpu_mutexes); + auto guard = make_scoped_lock_from_array(instance().gpu_mutexes); instance().executorpools.emplace_back(number_of_executors, executor_args...); assert(instance().executorpools.size() <= cppuddle::max_number_gpus); } @@ -210,7 +209,7 @@ class executor_pool { /// Multi-GPU init where executors / interfaces on all GPUs are initialized with the same arguments template static void init_all_executor_pools(size_t number_of_executors, Ts ... executor_args) { - auto guard = detail::make_scoped_lock_from_array(instance().gpu_mutexes); + auto guard = make_scoped_lock_from_array(instance().gpu_mutexes); if (number_of_executors > 0) { for (size_t gpu_id = 0; gpu_id < cppuddle::max_number_gpus; gpu_id++) { instance().select_gpu_function(gpu_id); @@ -225,7 +224,7 @@ class executor_pool { /// (useful for executor that expect an GPU-id during construction) template static void init_executor_pool(size_t gpu_id, size_t number_of_executors, Ts ... executor_args) { - auto guard = detail::make_scoped_lock_from_array(instance().gpu_mutexes); + auto guard = make_scoped_lock_from_array(instance().gpu_mutexes); if (number_of_executors > 0) { instance().select_gpu_function(gpu_id); instance().executorpools.emplace_back(number_of_executors, @@ -236,7 +235,7 @@ class executor_pool { // TODO add/rename into finalize? static void cleanup() { - auto guard = detail::make_scoped_lock_from_array(instance().gpu_mutexes); + auto guard = make_scoped_lock_from_array(instance().gpu_mutexes); assert(instance().executorpools.size() == cppuddle::max_number_gpus); instance().executorpools.clear(); } @@ -269,7 +268,7 @@ class executor_pool { /* } */ static void set_device_selector(std::function select_gpu_function) { - auto guard = detail::make_scoped_lock_from_array(instance().gpu_mutexes); + auto guard = make_scoped_lock_from_array(instance().gpu_mutexes); instance().select_gpu_function = select_gpu_function; } @@ -415,6 +414,7 @@ template class executor_interface { }; #endif +} // namespace detail } // namespace executor_recycling } // namespace cppuddle diff --git a/include/cppuddle/executor_recycling/executor_pools_interface.hpp b/include/cppuddle/executor_recycling/executor_pools_interface.hpp new file mode 100644 index 00000000..dac9f170 --- /dev/null +++ b/include/cppuddle/executor_recycling/executor_pools_interface.hpp @@ -0,0 +1,32 @@ +// Copyright (c) 2024 Gregor Daiß +// +// Distributed under the Boost Software License, Version 1.0. 
(See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#ifndef EXECUTOR_POOLS_INTERFACE_HPP +#define EXECUTOR_POOLS_INTERFACE_HPP + +#include "cppuddle/executor_recycling/detail/executor_pools_management.hpp" + +namespace cppuddle { +namespace executor_recycling { + +template +using round_robin_pool_impl = + detail::round_robin_pool_impl; + +template +using priority_pool_impl = + detail::priority_pool_impl; + +using executor_pool = + detail::executor_pool; + +template +using executor_interface = + detail::executor_interface; + +} +} + +#endif diff --git a/include/cppuddle/memory_recycling/hip_recycling_allocators.hpp b/include/cppuddle/memory_recycling/hip_recycling_allocators.hpp index d4b2da3c..36432820 100644 --- a/include/cppuddle/memory_recycling/hip_recycling_allocators.hpp +++ b/include/cppuddle/memory_recycling/hip_recycling_allocators.hpp @@ -1,4 +1,4 @@ -// Copyright (c: 2020-2024 Gregor Daiß +// Copyright (c) 2020-2024 Gregor Daiß // // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) diff --git a/include/cppuddle/memory_recycling/sycl_recycling_allocators.hpp b/include/cppuddle/memory_recycling/sycl_recycling_allocators.hpp index 233afe71..7ea9999c 100644 --- a/include/cppuddle/memory_recycling/sycl_recycling_allocators.hpp +++ b/include/cppuddle/memory_recycling/sycl_recycling_allocators.hpp @@ -1,4 +1,4 @@ -// Copyright (c: 2020-2024 Gregor Daiß +// Copyright (c) 2020-2024 Gregor Daiß // // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) diff --git a/include/hip_buffer_util.hpp b/include/hip_buffer_util.hpp index 9bc8ccc3..3f0b3034 100644 --- a/include/hip_buffer_util.hpp +++ b/include/hip_buffer_util.hpp @@ -1,4 +1,4 @@ -// Copyright (c): 2024 Gregor Daiß +// Copyright (c) 2024 Gregor Daiß // // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) diff --git a/include/stream_manager.hpp b/include/stream_manager.hpp index 940620d5..25c4a080 100644 --- a/include/stream_manager.hpp +++ b/include/stream_manager.hpp @@ -6,7 +6,7 @@ #ifndef STREAM_MANAGER_HPP #define STREAM_MANAGER_HPP -#include "cppuddle/executor_recycling/executor_pools_management.hpp" +#include "cppuddle/executor_recycling/executor_pools_interface.hpp" template using round_robin_pool diff --git a/include/sycl_buffer_util.hpp b/include/sycl_buffer_util.hpp index ad64a9dc..7ce66d93 100644 --- a/include/sycl_buffer_util.hpp +++ b/include/sycl_buffer_util.hpp @@ -1,4 +1,4 @@ -// Copyright (c: 2024 Gregor Daiß +// Copyright (c) 2024 Gregor Daiß // // Distributed under the Boost Software License, Version 1.0. 
(See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) diff --git a/tests/stream_test.hpp b/tests/stream_test.hpp index 1dfa60db..b793fe9c 100644 --- a/tests/stream_test.hpp +++ b/tests/stream_test.hpp @@ -10,7 +10,7 @@ #include #include #include "cppuddle/memory_recycling/cuda_recycling_allocators.hpp" -#include "cppuddle/executor_recycling/executor_pools_management.hpp"" +#include "cppuddle/executor_recycling/executor_pools_interface.hpp"" template void test_pool_memcpy(const size_t executor_parameter, Ts &&...ts) { From 5740925f7d9b785ba9a500b0f08f58abd0fb8557 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20Dai=C3=9F?= Date: Fri, 8 Mar 2024 15:55:08 +0100 Subject: [PATCH 10/19] Move aggregation functionality into namespace --- include/aggregation_manager.hpp | 1154 +--------------- .../executor_pools_interface.hpp | 4 +- .../kernel_aggregation_management.hpp | 1161 +++++++++++++++++ 3 files changed, 1177 insertions(+), 1142 deletions(-) create mode 100644 include/cppuddle/kernel_aggregation/kernel_aggregation_management.hpp diff --git a/include/aggregation_manager.hpp b/include/aggregation_manager.hpp index 70acfd61..030150f9 100644 --- a/include/aggregation_manager.hpp +++ b/include/aggregation_manager.hpp @@ -1,1153 +1,27 @@ -// Copyright (c) 2022-2024 Gregor Daiß +// Copyright (c) 2024 Gregor Daiß // // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) -#ifndef WORK_AGGREGATION_MANAGER -#define WORK_AGGREGATION_MANAGER +#ifndef AGGREGATION_MANAGER_HPP +#define AGGREGATION_MANAGER_HPP -#ifndef CPPUDDLE_HAVE_HPX -#error "Work aggregation allocators/executors require CPPUDDLE_WITH_HPX=ON" -#endif - -#include -//#define DEBUG_AGGREGATION_CALLS 1 - -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include - -#if defined(HPX_HAVE_CUDA) || defined(HPX_HAVE_HIP) -// required for defining type traits using cuda executor as underlying -// aggregation executors -#include -#endif - -#include -#include - -#include "../include/buffer_manager.hpp" -#include "../include/stream_manager.hpp" -#include "cppuddle/common/config.hpp" - -#ifndef CPPUDDLE_HAVE_HPX_MUTEX -#pragma message \ - "Work aggregation will use hpx::mutex internally, despite CPPUDDLE_WITH_HPX_MUTEX=OFF" -#pragma message \ - "Consider using CPPUDDLE_WITH_HPX_MUTEX=ON, to make the rest of CPPuddle also use hpx::mutex" -#endif -namespace cppuddle { - using aggregation_mutex_t = hpx::mutex; -} - -//=============================================================================== -//=============================================================================== -// Helper functions/classes - -/// Constructs a tuple with copies (to store temporaries in aggregated function -/// calls) yet also supporting references (on the users own risk...) 
-template -std::tuple make_tuple_supporting_references(Ts &&...ts) { - return std::tuple{std::forward(ts)...}; -} - -/// Print some specific values that we can, but don't bother for most types -/// (such as vector) -template std::string print_if_possible(T val) { - if constexpr (std::is_convertible_v) { - return val; - } else if constexpr (std::is_integral_v || std::is_floating_point_v) { - return std::to_string(val); - } else if constexpr (std::is_pointer_v) { - // Pretty printing pointer sort of only works well with %p - // TODO Try using std::format as soon as we can move to C++20 - std::unique_ptr debug_string(new char[128]()); - snprintf(debug_string.get(), 128, "%p", val); - return std::string(debug_string.get()); - } else { - return std::string("cannot print value"); - } -} - -/// Helper class for the helper class that prints tuples -- do not use this -/// directly -template -void print_tuple(const TupType &_tup, std::index_sequence) { - (..., (hpx::cout << (I == 0 ? "" : ", ") - << print_if_possible(std::get(_tup)))); -} - -/// Helper class for printing tuples (first component should be a function -/// pointer, remaining components the function arguments) -template void print_tuple(const std::tuple &_tup) { - // Use pointer and sprintf as boost::format refused to NOT cast the pointer - // address to 1... - // TODO Try using std::format as soon as we can move to C++20 - std::unique_ptr debug_string(new char[128]()); - snprintf(debug_string.get(), 128, "Function address: %p -- Arguments: (", - std::get<0>(_tup)); - hpx::cout << debug_string.get(); - print_tuple(_tup, std::make_index_sequence()); - hpx::cout << ")"; -} - -//=============================================================================== -//=============================================================================== -template -void exec_post_wrapper(Executor & exec, F &&f, Ts &&...ts) { - hpx::apply(exec, std::forward(f), std::forward(ts)...); -} - -template -hpx::lcos::future exec_async_wrapper(Executor & exec, F &&f, Ts &&...ts) { - return hpx::async(exec, std::forward(f), std::forward(ts)...); -} - -/// Manages the launch conditions for aggregated function calls -/// type/value-errors -/** Launch conditions: All slice executors must have called the same function - * (tracked by future all_slices_ready) - * AND - * Previous aggregated_function_call on the same Executor must have been - * launched (tracked by future stream_future) - * All function calls received from the slice executors are checked if they - * match the first one in both types and values (throws exception otherwise) - */ - -template class aggregated_function_call { -private: - std::atomic slice_counter = 0; - - /// Promise to be set when all slices have visited this function call - /* hpx::lcos::local::promise slices_ready_promise; */ - /// Tracks if all slices have visited this function call - /* hpx::lcos::future all_slices_ready = slices_ready_promise.get_future(); */ - /// How many slices can we expect? - const size_t number_slices; - const bool async_mode; - - Executor &underlying_executor; - -#if !(defined(NDEBUG)) && defined(DEBUG_AGGREGATION_CALLS) -#pragma message \ - "Building slow work aggegator build with additional runtime checks! Build with NDEBUG defined for fast build..." 
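// Note on when the checks in this block are active (not taken from this patch):
// the mismatch detection is only compiled if the translation unit defines
// DEBUG_AGGREGATION_CALLS (see the commented-out define near the top of this
// header) and NDEBUG is not set; optimized release builds skip it entirely.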
- /// Stores the function call of the first slice as reference for error - /// checking - std::any function_tuple; - /// Stores the string of the first function call for debug output - std::string debug_type_information; - cppuddle::aggregation_mutex_t debug_mut; -#endif +#include "cppuddle/kernel_aggregation/kernel_aggregation_management.hpp" - std::vector> potential_async_promises{}; +using Aggregated_Executor_Modes = + cppuddle::kernel_aggregation::aggregated_executor_modes; -public: - aggregated_function_call(const size_t number_slices, bool async_mode, Executor &exec) - : number_slices(number_slices), async_mode(async_mode), underlying_executor(exec) { - if (async_mode) - potential_async_promises.resize(number_slices); - } - ~aggregated_function_call(void) { - // All slices should have done this call - assert(slice_counter == number_slices); - // assert(!all_slices_ready.valid()); - } - /// Returns true if all required slices have visited this point - bool sync_aggregation_slices(hpx::lcos::future &stream_future) { - assert(!async_mode); - assert(potential_async_promises.empty()); - const size_t local_counter = slice_counter++; - if (local_counter == number_slices - 1) { - return true; - } - else return false; - } - template - void post_when(hpx::lcos::future &stream_future, F &&f, Ts &&...ts) { -#if !(defined(NDEBUG)) && defined(DEBUG_AGGREGATION_CALLS) - // needed for concurrent access to function_tuple and debug_type_information - // Not required for normal use - std::lock_guard guard(debug_mut); -#endif - assert(!async_mode); - assert(potential_async_promises.empty()); - const size_t local_counter = slice_counter++; - - if (local_counter == 0) { -#if !(defined(NDEBUG)) && defined(DEBUG_AGGREGATION_CALLS) - auto tmp_tuple = - make_tuple_supporting_references(f, std::forward(ts)...); - function_tuple = tmp_tuple; - debug_type_information = typeid(decltype(tmp_tuple)).name(); -#endif - - } else { - // - // This scope checks if both the type and the values of the current call - // match the original call To be used in debug build... 
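// Illustrative failure case, not taken from this patch (kernel name and sizes
// are placeholders): if one slice posts my_kernel with a length of 512 while
// another slice of the same aggregated call posts it with 1024, the value
// comparison below fails and a "Mismatched values error" is reported; if the
// argument types differ instead, the std::any_cast fails first and a
// "Mismatched types error" is reported.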
- // -#if !(defined(NDEBUG)) && defined(DEBUG_AGGREGATION_CALLS) - auto comparison_tuple = - make_tuple_supporting_references(f, std::forward(ts)...); - try { - auto orig_call_tuple = - std::any_cast(function_tuple); - if (comparison_tuple != orig_call_tuple) { - throw std::runtime_error( - "Values of post function arguments (or function " - "itself) do not match "); - } - } catch (const std::bad_any_cast &e) { - hpx::cout - << "\nMismatched types error in aggregated post call of executor " - << ": " << e.what() << "\n"; - hpx::cout << "Expected types:\t\t " - << boost::core::demangle(debug_type_information.c_str()); - hpx::cout << "\nGot types:\t\t " - << boost::core::demangle( - typeid(decltype(comparison_tuple)).name()) - << "\n" - << std::endl; - // throw; - } catch (const std::runtime_error &e) { - hpx::cout - << "\nMismatched values error in aggregated post call of executor " - << ": " << e.what() << std::endl; - hpx::cout << "Types (matched):\t " - << boost::core::demangle(debug_type_information.c_str()); - auto orig_call_tuple = - std::any_cast(function_tuple); - hpx::cout << "\nExpected values:\t "; - print_tuple(orig_call_tuple); - hpx::cout << "\nGot values:\t\t "; - print_tuple(comparison_tuple); - hpx::cout << std::endl << std::endl; - // throw; - } -#endif - } - assert(local_counter < number_slices); - assert(slice_counter < number_slices + 1); - // Check exit criteria: Launch function call continuation by setting the - // slices promise - if (local_counter == number_slices - 1) { - exec_post_wrapper(underlying_executor, std::forward(f), std::forward(ts)...); - //slices_ready_promise.set_value(); - } - } - template - hpx::lcos::future async_when(hpx::lcos::future &stream_future, - F &&f, Ts &&...ts) { -#if !(defined(NDEBUG)) && defined(DEBUG_AGGREGATION_CALLS) - // needed for concurrent access to function_tuple and debug_type_information - // Not required for normal use - std::lock_guard guard(debug_mut); -#endif - assert(async_mode); - assert(!potential_async_promises.empty()); - const size_t local_counter = slice_counter++; - if (local_counter == 0) { -#if !(defined(NDEBUG)) && defined(DEBUG_AGGREGATION_CALLS) - auto tmp_tuple = - make_tuple_supporting_references(f, std::forward(ts)...); - function_tuple = tmp_tuple; - debug_type_information = typeid(decltype(tmp_tuple)).name(); -#endif - } else { - // - // This scope checks if both the type and the values of the current call - // match the original call To be used in debug build... 
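// Note, not taken from this patch: unlike post_when() above, every slice that
// calls async_when() receives its own future from potential_async_promises;
// all of those futures become ready together once the single launch issued by
// the last arriving slice has completed.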
- // -#if !(defined(NDEBUG)) && defined(DEBUG_AGGREGATION_CALLS) - auto comparison_tuple = - make_tuple_supporting_references(f, std::forward(ts)...); - try { - auto orig_call_tuple = - std::any_cast(function_tuple); - if (comparison_tuple != orig_call_tuple) { - throw std::runtime_error( - "Values of async function arguments (or function " - "itself) do not match "); - } - } catch (const std::bad_any_cast &e) { - hpx::cout - << "\nMismatched types error in aggregated async call of executor " - << ": " << e.what() << "\n"; - hpx::cout << "Expected types:\t\t " - << boost::core::demangle(debug_type_information.c_str()); - hpx::cout << "\nGot types:\t\t " - << boost::core::demangle( - typeid(decltype(comparison_tuple)).name()) - << "\n" - << std::endl; - // throw; - } catch (const std::runtime_error &e) { - hpx::cout - << "\nMismatched values error in aggregated async call of executor " - << ": " << e.what() << std::endl; - hpx::cout << "Types (matched):\t " - << boost::core::demangle(debug_type_information.c_str()); - auto orig_call_tuple = - std::any_cast(function_tuple); - hpx::cout << "\nExpected values:\t "; - print_tuple(orig_call_tuple); - hpx::cout << "\nGot values:\t\t "; - print_tuple(comparison_tuple); - hpx::cout << std::endl << std::endl; - // throw; - } -#endif - } - assert(local_counter < number_slices); - assert(slice_counter < number_slices + 1); - assert(potential_async_promises.size() == number_slices); - hpx::lcos::future ret_fut = - potential_async_promises[local_counter].get_future(); - if (local_counter == number_slices - 1) { - /* slices_ready_promise.set_value(); */ - auto fut = exec_async_wrapper( - underlying_executor, std::forward(f), std::forward(ts)...); - fut.then([this](auto &&fut) { - for (auto &promise : potential_async_promises) { - promise.set_value(); - } - }); - } - // Check exit criteria: Launch function call continuation by setting the - // slices promise - return ret_fut; - } - template - hpx::lcos::shared_future wrap_async(hpx::lcos::future &stream_future, - F &&f, Ts &&...ts) { - assert(async_mode); - assert(!potential_async_promises.empty()); - const size_t local_counter = slice_counter++; - assert(local_counter < number_slices); - assert(slice_counter < number_slices + 1); - assert(potential_async_promises.size() == number_slices); - hpx::lcos::shared_future ret_fut = - potential_async_promises[local_counter].get_shared_future(); - if (local_counter == number_slices - 1) { - auto fut = f(std::forward(ts)...); - fut.then([this](auto &&fut) { - // TODO just use one promise - for (auto &promise : potential_async_promises) { - promise.set_value(); - } - }); - } - return ret_fut; - } - // We need to be able to copy or no-except move for std::vector.. - aggregated_function_call(const aggregated_function_call &other) = default; - aggregated_function_call & - operator=(const aggregated_function_call &other) = default; - aggregated_function_call(aggregated_function_call &&other) = default; - aggregated_function_call & - operator=(aggregated_function_call &&other) = default; -}; - -//=============================================================================== -//=============================================================================== - -enum class Aggregated_Executor_Modes { EAGER = 1, STRICT, ENDLESS }; -/// Declaration since the actual allocator is only defined after the Executors template -class Allocator_Slice; - -/// Executor Class that aggregates function calls for specific kernels -/** Executor is not meant to be used directly. 
Instead it yields multiple - * Executor_Slice objects. These serve as interfaces. Slices from the same - * Aggregated_Executor are meant to execute the same function calls but on - * different data (i.e. different tasks) - */ -template class Aggregated_Executor { -private: - //=============================================================================== - // Misc private avariables: - // - std::atomic slices_exhausted; - - std::atomic executor_slices_alive; - std::atomic buffers_in_use; - std::atomic dealloc_counter; - - const Aggregated_Executor_Modes mode; - const size_t max_slices; - std::atomic current_slices; - /// Wrapper to the executor interface from the stream pool - /// Automatically hooks into the stream_pools reference counting - /// for cpu/gpu load balancing - std::unique_ptr>> executor_wrapper; - -public: - size_t gpu_id; - // Subclasses - - /// Slice class - meant as a scope interface to the aggregated executor - class Executor_Slice { - public: - Aggregated_Executor &parent; - private: - /// Executor is a slice of this aggregated_executor - /// How many functions have been called - required to enforce sequential - /// behaviour of kernel launches - size_t launch_counter{0}; - size_t buffer_counter{0}; - bool notify_parent_about_destruction{true}; - - public: - /// How many slices are there overall - required to check the launch - /// criteria - const size_t number_slices; - const size_t id; - using executor_t = Executor; - Executor_Slice(Aggregated_Executor &parent, const size_t slice_id, - const size_t number_slices) - : parent(parent), notify_parent_about_destruction(true), - number_slices(number_slices), id(slice_id) { - } - ~Executor_Slice(void) { - // Don't notify parent if we moved away from this executor_slice - if (notify_parent_about_destruction) { - // Executor should be done by the time of destruction - // -> check here before notifying parent - - // parent still in execution mode? - assert(parent.slices_exhausted == true); - // all kernel launches done? 
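// ---------------------------------------------------------------------------
// Illustrative usage sketch, not taken from this patch: how calling code is
// meant to drive the slice interface described above. The CUDA executor, the
// std::allocator used as host allocator, and the buffer size are placeholder
// choices; request_executor_slice(), make_allocator(), async() and the mode
// enum come from this file, and the executor pool is assumed to have been
// initialized beforehand as in stream_test.hpp further up. Production code
// would usually obtain a slice via aggregation_pool<...>::request_executor_slice()
// instead of holding an Aggregated_Executor directly. Error handling omitted.
using executor_t = hpx::cuda::experimental::cuda_executor;
Aggregated_Executor<executor_t> agg_exec{
    /*number_slices=*/4, Aggregated_Executor_Modes::EAGER, /*gpu_id=*/0};
auto slice_fut = agg_exec.request_executor_slice(); // optional<future<Executor_Slice>>
if (slice_fut) {
  auto slice = slice_fut->get(); // ready once the launch condition is met
  auto alloc = slice.make_allocator<double, std::allocator<double>>();
  std::vector<double, decltype(alloc)> host_buf(512, 0.0, alloc);
  cppuddle::memory_recycling::cuda_device_buffer<double> device_buf(512);
  // Every slice issues the same call; only the last arriving slice triggers
  // the actual cudaMemcpyAsync on the shared underlying executor.
  auto done = slice.async(cudaMemcpyAsync, device_buf.device_side_buffer,
                          host_buf.data(), 512 * sizeof(double),
                          cudaMemcpyHostToDevice);
  done.get();
}
// ---------------------------------------------------------------------------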
- assert(launch_counter == parent.function_calls.size()); - // Notifiy parent that this aggregation slice is one - parent.reduce_usage_counter(); - } - } - Executor_Slice(const Executor_Slice &other) = delete; - Executor_Slice &operator=(const Executor_Slice &other) = delete; - Executor_Slice(Executor_Slice &&other) - : parent(other.parent), launch_counter(std::move(other.launch_counter)), - buffer_counter(std::move(other.buffer_counter)), - number_slices(std::move(other.number_slices)), - id(std::move(other.id)) { - other.notify_parent_about_destruction = false; - } - Executor_Slice &operator=(Executor_Slice &&other) { - parent = other.parent; - launch_counter = std::move(other.launch_counter); - buffer_counter = std::move(other.buffer_counter); - number_slices = std::move(other.number_slices); - id = std::move(other.id); - other.notify_parent_about_destruction = false; - } - template - Allocator_Slice make_allocator() { - return Allocator_Slice(*this); - } - bool sync_aggregation_slices() { - assert(parent.slices_exhausted == true); - auto ret = parent.sync_aggregation_slices(launch_counter); - launch_counter++; - return ret; - } - template void post(F &&f, Ts &&...ts) { - // we should only execute function calls once all slices - // have been given away (-> Executor Slices start) - assert(parent.slices_exhausted == true); - parent.post(launch_counter, std::forward(f), std::forward(ts)...); - launch_counter++; - } - template - hpx::lcos::future async(F &&f, Ts &&...ts) { - // we should only execute function calls once all slices - // have been given away (-> Executor Slices start) - assert(parent.slices_exhausted == true); - hpx::lcos::future ret_fut = parent.async( - launch_counter, std::forward(f), std::forward(ts)...); - launch_counter++; - return ret_fut; - } - - // OneWay Execution - template - friend decltype(auto) tag_invoke(hpx::parallel::execution::post_t, - Executor_Slice& exec, F&& f, Ts&&... ts) - { - return exec.post(std::forward(f), std::forward(ts)...); - } - - // TwoWay Execution - template - friend decltype(auto) tag_invoke( - hpx::parallel::execution::async_execute_t, Executor_Slice& exec, - F&& f, Ts&&... 
ts) - { - return exec.async( - std::forward(f), std::forward(ts)...); - } - - template - hpx::lcos::shared_future wrap_async(F &&f, Ts &&...ts) { - // we should only execute function calls once all slices - // have been given away (-> Executor Slices start) - assert(parent.slices_exhausted == true); - hpx::lcos::shared_future ret_fut = parent.wrap_async( - launch_counter, std::forward(f), std::forward(ts)...); - launch_counter++; - return ret_fut; - } - - /// Get new aggregated buffer (might have already been allocated been - /// allocated by different slice) - template T *get(const size_t size) { - assert(parent.slices_exhausted == true); - T *aggregated_buffer = - parent.get(size, buffer_counter); - buffer_counter++; - assert(buffer_counter > 0); - return aggregated_buffer; - } - - Executor& get_underlying_executor(void) { - assert(parent.executor_wrapper); - return *(parent.executor_wrapper); - } - }; - - //=============================================================================== +using Allocator_Slice = + cppuddle::kernel_aggregation::allocator_slice; - hpx::lcos::local::promise slices_full_promise; - /// Promises with the slice executors -- to be set when the starting criteria - /// is met - std::vector> executor_slices; - /// List of aggregated function calls - function will be launched when all - /// slices have called it - std::deque> function_calls; - /// For synchronizing the access to the function calls list - cppuddle::aggregation_mutex_t mut; - - /// Data entry for a buffer allocation: void* pointer, size_t for - /// buffer-size, atomic for the slice counter, location_id, gpu_id - using buffer_entry_t = - std::tuple, bool, const size_t, size_t>; - /// Keeps track of the aggregated buffer allocations done in all the slices - std::deque buffer_allocations; - /// Map pointer to deque index for fast access in the deallocations - std::unordered_map buffer_allocations_map; - /// For synchronizing the access to the buffer_allocations - cppuddle::aggregation_mutex_t buffer_mut; - std::atomic buffer_counter = 0; - - /// Get new buffer OR get buffer already allocated by different slice - template - T *get(const size_t size, const size_t slice_alloc_counter) { - assert(slices_exhausted == true); - assert(executor_wrapper); - assert(executor_slices_alive == true); - // Add aggreated buffer entry in case it hasn't happened yet for this call - // First: Check if it already has happened - if (buffer_counter <= slice_alloc_counter) { - // we might be the first! Lock... - std::lock_guard guard(buffer_mut); - // ... and recheck - if (buffer_counter <= slice_alloc_counter) { - constexpr bool manage_content_lifetime = false; - buffers_in_use = true; - - // Default location -- useful for GPU builds as we otherwise create way too - // many different buffers for different aggregation sizes on different GPUs - /* size_t location_id = gpu_id * instances_per_gpu; */ - // Use integer conversion to only use 0 16 32 ... as buckets - size_t location_id = ((hpx::get_worker_thread_num() % cppuddle::number_instances) / 16) * 16; -#ifdef CPPUDDLE_HAVE_HPX_AWARE_ALLOCATORS - if (max_slices == 1) { - // get prefered location: aka the current hpx threads location - // Usually handy for CPU builds where we want to use the buffers - // close to the current CPU core - /* location_id = (hpx::get_worker_thread_num() / instances_per_gpu) * instances_per_gpu; */ - /* location_id = (gpu_id) * instances_per_gpu; */ - // division makes sure that we always use the same instance to store our gpu buffers. 
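// Worked example for the bucketing above (numbers are illustrative): with
// cppuddle::number_instances == 128, worker threads 0..15 all yield
// location_id 0, threads 16..31 yield 16, and thread 37 yields
// ((37 % 128) / 16) * 16 == 32. Each group of 16 HPX worker threads therefore
// shares one buffer bucket, which keeps the number of distinct aggregated
// buffer pools small.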
- } -#endif - // Get shiny and new buffer that will be shared between all slices - // Buffer might be recycled from previous allocations by the - // buffer_interface... - T *aggregated_buffer = - cppuddle::memory_recycling::detail::buffer_interface::get< - T, Host_Allocator>(size, manage_content_lifetime, location_id, - gpu_id); - // Create buffer entry for this buffer - buffer_allocations.emplace_back(static_cast(aggregated_buffer), - size, 1, true, location_id, gpu_id); - -#ifndef NDEBUG - // if previousely used the buffer should not be in usage anymore - const auto exists = buffer_allocations_map.count( - static_cast(aggregated_buffer)); - if (exists > 0) { - const auto previous_usage_id = - buffer_allocations_map[static_cast(aggregated_buffer)]; - const auto &valid = - std::get<3>(buffer_allocations[previous_usage_id]); - assert(!valid); - } -#endif - buffer_allocations_map.insert_or_assign(static_cast(aggregated_buffer), - buffer_counter); - - assert (buffer_counter == slice_alloc_counter); - buffer_counter = buffer_allocations.size(); - - // Return buffer - return aggregated_buffer; - } - } - assert(buffers_in_use == true); - assert(std::get<3>(buffer_allocations[slice_alloc_counter])); // valid - assert(std::get<2>(buffer_allocations[slice_alloc_counter]) >= 1); - - // Buffer entry should already exist: - T *aggregated_buffer = static_cast( - std::get<0>(buffer_allocations[slice_alloc_counter])); - // Error handling: Size is wrong? - assert(size == std::get<1>(buffer_allocations[slice_alloc_counter])); - // Notify that one more slice has visited this buffer allocation - std::get<2>(buffer_allocations[slice_alloc_counter])++; - return aggregated_buffer; - } - - /// Notify buffer list that one slice is done with the buffer - template - void mark_unused(T *p, const size_t size) { - assert(slices_exhausted == true); - assert(executor_wrapper); - - void *ptr_key = static_cast(p); - size_t slice_alloc_counter = buffer_allocations_map[p]; - - assert(slice_alloc_counter < buffer_allocations.size()); - /*auto &[buffer_pointer_any, buffer_size, buffer_allocation_counter, valid] = - buffer_allocations[slice_alloc_counter];*/ - auto buffer_pointer_void = std::get<0>(buffer_allocations[slice_alloc_counter]); - const auto buffer_size = std::get<1>(buffer_allocations[slice_alloc_counter]); - auto &buffer_allocation_counter = std::get<2>(buffer_allocations[slice_alloc_counter]); - auto &valid = std::get<3>(buffer_allocations[slice_alloc_counter]); - const auto &location_id = std::get<4>(buffer_allocations[slice_alloc_counter]); - const auto &gpu_id = std::get<5>(buffer_allocations[slice_alloc_counter]); - assert(valid); - T *buffer_pointer = static_cast(buffer_pointer_void); - - assert(buffer_size == size); - assert(p == buffer_pointer); - // assert(buffer_pointer == p || buffer_pointer == nullptr); - // Slice is done with this buffer - buffer_allocation_counter--; - // Check if all slices are done with this buffer? - if (buffer_allocation_counter == 0) { - // Yes! 
"Deallocate" by telling the recylcer the buffer is fit for reusage - std::lock_guard guard(buffer_mut); - // Only mark unused if another buffer has not done so already (and marked - // it as invalid) - if (valid) { - assert(buffers_in_use == true); - cppuddle::memory_recycling::detail::buffer_interface::mark_unused< - T, Host_Allocator>(buffer_pointer, buffer_size, location_id, - gpu_id); - // mark buffer as invalid to prevent any other slice from marking the - // buffer as unused - valid = false; - - const size_t current_deallocs = ++dealloc_counter; - if (current_deallocs == buffer_counter) { - std::lock_guard guard(mut); - buffers_in_use = false; - if (!executor_slices_alive && !buffers_in_use) { - slices_exhausted = false; - // Release executor - executor_wrapper.reset(nullptr); - } - } - } - } - } - - //=============================================================================== - // Public Interface -public: - hpx::lcos::future current_continuation; - hpx::lcos::future last_stream_launch_done; - std::atomic overall_launch_counter = 0; - - /// Only meant to be accessed by the slice executors - bool sync_aggregation_slices(const size_t slice_launch_counter) { - std::lock_guard guard(mut); - assert(slices_exhausted == true); - assert(executor_wrapper); - // Add function call object in case it hasn't happened for this launch yet - if (overall_launch_counter <= slice_launch_counter) { - /* std::lock_guard guard(mut); */ - if (overall_launch_counter <= slice_launch_counter) { - function_calls.emplace_back(current_slices, false, *executor_wrapper); - overall_launch_counter = function_calls.size(); - return function_calls[slice_launch_counter].sync_aggregation_slices( - last_stream_launch_done); - } - } - - return function_calls[slice_launch_counter].sync_aggregation_slices( - last_stream_launch_done); - } - - /// Only meant to be accessed by the slice executors - template - void post(const size_t slice_launch_counter, F &&f, Ts &&...ts) { - std::lock_guard guard(mut); - assert(slices_exhausted == true); - assert(executor_wrapper); - // Add function call object in case it hasn't happened for this launch yet - if (overall_launch_counter <= slice_launch_counter) { - /* std::lock_guard guard(mut); */ - if (overall_launch_counter <= slice_launch_counter) { - function_calls.emplace_back(current_slices, false, *executor_wrapper); - overall_launch_counter = function_calls.size(); - function_calls[slice_launch_counter].post_when( - last_stream_launch_done, std::forward(f), std::forward(ts)...); - return; - } - } - - function_calls[slice_launch_counter].post_when( - last_stream_launch_done, std::forward(f), std::forward(ts)...); - return; - } - - /// Only meant to be accessed by the slice executors - template - hpx::lcos::future async(const size_t slice_launch_counter, F &&f, - Ts &&...ts) { - std::lock_guard guard(mut); - assert(slices_exhausted == true); - assert(executor_wrapper); - // Add function call object in case it hasn't happened for this launch yet - if (overall_launch_counter <= slice_launch_counter) { - /* std::lock_guard guard(mut); */ - if (overall_launch_counter <= slice_launch_counter) { - function_calls.emplace_back(current_slices, true, *executor_wrapper); - overall_launch_counter = function_calls.size(); - return function_calls[slice_launch_counter].async_when( - last_stream_launch_done, std::forward(f), std::forward(ts)...); - } - } - - return function_calls[slice_launch_counter].async_when( - last_stream_launch_done, std::forward(f), std::forward(ts)...); - } - /// Only meant 
to be accessed by the slice executors - template - hpx::lcos::shared_future wrap_async(const size_t slice_launch_counter, F &&f, - Ts &&...ts) { - std::lock_guard guard(mut); - assert(slices_exhausted == true); - assert(executor_wrapper); - // Add function call object in case it hasn't happened for this launch yet - if (overall_launch_counter <= slice_launch_counter) { - /* std::lock_guard guard(mut); */ - if (overall_launch_counter <= slice_launch_counter) { - function_calls.emplace_back(current_slices, true, *executor_wrapper); - overall_launch_counter = function_calls.size(); - return function_calls[slice_launch_counter].wrap_async( - last_stream_launch_done, std::forward(f), std::forward(ts)...); - } - } - - return function_calls[slice_launch_counter].wrap_async( - last_stream_launch_done, std::forward(f), std::forward(ts)...); - } - - bool slice_available(void) { - std::lock_guard guard(mut); - return !slices_exhausted; - } - - std::optional> request_executor_slice() { - std::lock_guard guard(mut); - if (!slices_exhausted) { - const size_t local_slice_id = ++current_slices; - if (local_slice_id == 1) { - // Cleanup leftovers from last run if any - // TODO still required? Should be clean here already - function_calls.clear(); - overall_launch_counter = 0; - std::lock_guard guard(buffer_mut); -#ifndef NDEBUG - for (const auto &buffer_entry : buffer_allocations) { - const auto &[buffer_pointer_any, buffer_size, - buffer_allocation_counter, valid, location_id, device_id] = - buffer_entry; - assert(!valid); - } -#endif - buffer_allocations.clear(); - buffer_allocations_map.clear(); - buffer_counter = 0; - - assert(executor_slices_alive == false); - assert(buffers_in_use == false); - executor_slices_alive = true; - buffers_in_use = false; - dealloc_counter = 0; - - if (mode == Aggregated_Executor_Modes::STRICT ) { - slices_full_promise = hpx::lcos::local::promise{}; - } - } - - // Create Executor Slice future -- that will be returned later - hpx::lcos::future ret_fut; - if (local_slice_id < max_slices) { - executor_slices.emplace_back(hpx::lcos::local::promise{}); - ret_fut = - executor_slices[local_slice_id - 1].get_future(); - } else { - launched_slices = current_slices; - ret_fut = hpx::make_ready_future(Executor_Slice{*this, - executor_slices.size(), launched_slices}); - } - - // Are we the first slice? If yes, add continuation set the - // Executor_Slice - // futures to ready if the launch conditions are met - if (local_slice_id == 1) { - // Redraw executor - assert(!executor_wrapper); - stream_pool::select_device>(gpu_id); - executor_wrapper.reset( - new stream_interface>(gpu_id)); - // Renew promise that all slices will be ready as the primary launch - // criteria... 
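// The two launch conditions handled below, summarized (not taken from this
// patch): in EAGER and ENDLESS mode the slices are released as soon as the
// underlying executor future is ready, even if fewer than max_slices callers
// joined; in STRICT mode they are only released once slices_full_promise is
// set, i.e. after all max_slices slices have been requested.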
- hpx::lcos::shared_future fut; - if (mode == Aggregated_Executor_Modes::EAGER || - mode == Aggregated_Executor_Modes::ENDLESS) { - // Fallback launch condidtion: Launch as soon as the underlying stream - // is ready - /* auto slices_full_fut = slices_full_promise.get_future(); */ - stream_pool::select_device>(gpu_id); - auto exec_fut = (*executor_wrapper).get_future(); - /* auto fut = hpx::when_any(exec_fut, slices_full_fut); */ - fut = std::move(exec_fut); - } else { - auto slices_full_fut = slices_full_promise.get_shared_future(); - // Just use the slices launch condition - fut = std::move(slices_full_fut); - } - // Launch all executor slices within this continuation - current_continuation = fut.then([this](auto &&fut) { - std::lock_guard guard(mut); - slices_exhausted = true; - launched_slices = current_slices; - size_t id = 0; - for (auto &slice_promise : executor_slices) { - slice_promise.set_value( - Executor_Slice{*this, id, launched_slices}); - id++; - } - executor_slices.clear(); - }); - } - if (local_slice_id >= max_slices && - mode != Aggregated_Executor_Modes::ENDLESS) { - slices_exhausted = true; // prevents any more threads from entering - // before the continuation is launched - /* launched_slices = current_slices; */ - /* size_t id = 0; */ - /* for (auto &slice_promise : executor_slices) { */ - /* slice_promise.set_value( */ - /* Executor_Slice{*this, id, launched_slices}); */ - /* id++; */ - /* } */ - /* executor_slices.clear(); */ - if (mode == Aggregated_Executor_Modes::STRICT ) { - slices_full_promise.set_value(); // Trigger slices launch condition continuation - } - // that continuation will set all executor slices so far handed out to ready - } - return ret_fut; - } else { - // Return empty optional as failure - return std::optional>{}; - } - } - size_t launched_slices; - void reduce_usage_counter(void) { - /* std::lock_guard guard(mut); */ - assert(slices_exhausted == true); - assert(executor_wrapper); - assert(executor_slices_alive == true); - assert(launched_slices >= 1); - assert(current_slices >= 0 && current_slices <= launched_slices); - const size_t local_slice_id = --current_slices; - // Last slice goes out scope? 
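// Clarifying note, not taken from this patch: the underlying executor (and its
// slot in the executor pool) is only released once both conditions hold, i.e.
// the last Executor_Slice has gone out of scope here AND every aggregated
// buffer has been handed back via mark_unused(); whichever of the two happens
// last resets executor_wrapper and clears slices_exhausted so the aggregated
// executor can be reused.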
- if (local_slice_id == 0) { - // Mark executor fit for reusage - std::lock_guard guard(mut); - executor_slices_alive = false; - if (!executor_slices_alive && !buffers_in_use) { - // Release executor - slices_exhausted = false; - executor_wrapper.reset(nullptr); - } - } - } - ~Aggregated_Executor(void) { - - assert(current_slices == 0); - assert(executor_slices_alive == false); - assert(buffers_in_use == false); - - if (mode != Aggregated_Executor_Modes::STRICT ) { - slices_full_promise.set_value(); // Trigger slices launch condition continuation - } - - // Cleanup leftovers from last run if any - function_calls.clear(); - overall_launch_counter = 0; -#ifndef NDEBUG - for (const auto &buffer_entry : buffer_allocations) { - const auto &[buffer_pointer_any, buffer_size, buffer_allocation_counter, - valid, location_id, device_id] = buffer_entry; - assert(!valid); - } -#endif - buffer_allocations.clear(); - buffer_allocations_map.clear(); - buffer_counter = 0; - - assert(buffer_allocations.empty()); - assert(buffer_allocations_map.empty()); - } - - Aggregated_Executor(const size_t number_slices, - Aggregated_Executor_Modes mode, const size_t gpu_id = 0) - : max_slices(number_slices), current_slices(0), slices_exhausted(false), - dealloc_counter(0), mode(mode), executor_slices_alive(false), - buffers_in_use(false), gpu_id(gpu_id), - executor_wrapper(nullptr), - current_continuation(hpx::make_ready_future()), - last_stream_launch_done(hpx::make_ready_future()) {} - // Not meant to be copied or moved - Aggregated_Executor(const Aggregated_Executor &other) = delete; - Aggregated_Executor &operator=(const Aggregated_Executor &other) = delete; - Aggregated_Executor(Aggregated_Executor &&other) = delete; - Aggregated_Executor &operator=(Aggregated_Executor &&other) = delete; -}; - -template -class Allocator_Slice { -private: - typename Aggregated_Executor::Executor_Slice &executor_reference; - Aggregated_Executor &executor_parent; - -public: - using value_type = T; - Allocator_Slice( - typename Aggregated_Executor::Executor_Slice &executor) - : executor_reference(executor), executor_parent(executor.parent) {} - template - explicit Allocator_Slice( - Allocator_Slice const &) noexcept {} - T *allocate(std::size_t n) { - T *data = executor_reference.template get(n); - return data; - } - void deallocate(T *p, std::size_t n) { - /* executor_reference.template mark_unused(p, n); */ - executor_parent.template mark_unused(p, n); - } - template - inline void construct(T *p, Args... args) noexcept { - // Do nothing here - we reuse the content of the last owner - } - void destroy(T *p) { - // Do nothing here - Contents will be destroyed when the buffer manager is - // destroyed, not before - } -}; -template -constexpr bool -operator==(Allocator_Slice const &, - Allocator_Slice const &) noexcept { - return false; -} -template -constexpr bool -operator!=(Allocator_Slice const &, - Allocator_Slice const &) noexcept { - return true; -} - -namespace hpx { namespace parallel { namespace execution { - // TODO Unfortunately does not work that way! 
Create trait that works for Executor Slices with - // compatible unlying executor types - /* template */ - /* struct is_one_way_executor::Executor_Slice> */ - /* : std::true_type */ - /* {}; */ - /* template */ - /* struct is_two_way_executor::Executor_Slice> */ - /* : std::true_type */ - /* {}; */ - -#if defined(HPX_HAVE_CUDA) || defined(HPX_HAVE_HIP) - // Workaround for the meantime: Manually create traits for compatible types: - template<> - struct is_one_way_executor::Executor_Slice> - : std::true_type - {}; - template<> - struct is_two_way_executor::Executor_Slice> - : std::true_type - {}; -#endif -}}} - -//=============================================================================== -//=============================================================================== -// Pool Strategy: +template +using Aggregated_Executor = + cppuddle::kernel_aggregation::Aggregated_Executor; template -class aggregation_pool { -public: - /// interface - template - static void init(size_t number_of_executors, size_t slices_per_executor, - Aggregated_Executor_Modes mode, size_t num_devices = 1) { - if (is_initialized) { - throw std::runtime_error( - std::string("Trying to initialize cppuddle aggregation pool twice") + - " Agg pool name: " + std::string(kernelname)); - } - if (num_devices > cppuddle::max_number_gpus) { - throw std::runtime_error( - std::string( - "Trying to initialize aggregation with more devices than the " - "maximum number of GPUs given at compiletime") + - " Agg pool name: " + std::string(kernelname)); - } - number_devices = num_devices; - for (size_t gpu_id = 0; gpu_id < number_devices; gpu_id++) { - - std::lock_guard guard(instance()[gpu_id].pool_mutex); - assert(instance()[gpu_id].aggregation_executor_pool.empty()); - for (int i = 0; i < number_of_executors; i++) { - instance()[gpu_id].aggregation_executor_pool.emplace_back(slices_per_executor, - mode, gpu_id); - } - instance()[gpu_id].slices_per_executor = slices_per_executor; - instance()[gpu_id].mode = mode; - } - is_initialized = true; - } - - /// Will always return a valid executor slice - static decltype(auto) request_executor_slice(void) { - if (!is_initialized) { - throw std::runtime_error( - std::string("Trying to use cppuddle aggregation pool without first calling init") + - " Agg poolname: " + std::string(kernelname)); - } - const size_t gpu_id = cppuddle::get_device_id(number_devices); - /* const size_t gpu_id = 1; */ - std::lock_guard guard(instance()[gpu_id].pool_mutex); - assert(!instance()[gpu_id].aggregation_executor_pool.empty()); - std::optional::Executor_Slice>> - ret; - size_t local_id = (instance()[gpu_id].current_interface) % - instance()[gpu_id].aggregation_executor_pool.size(); - ret = instance()[gpu_id].aggregation_executor_pool[local_id].request_executor_slice(); - // Expected case: current aggregation executor is free - if (ret.has_value()) { - return ret; - } - // current interface is bad -> find free one - size_t abort_counter = 0; - const size_t abort_number = instance()[gpu_id].aggregation_executor_pool.size() + 1; - do { - local_id = (++(instance()[gpu_id].current_interface)) % // increment interface - instance()[gpu_id].aggregation_executor_pool.size(); - ret = - instance()[gpu_id].aggregation_executor_pool[local_id].request_executor_slice(); - if (ret.has_value()) { - return ret; - } - abort_counter++; - } while (abort_counter <= abort_number); - // Everything's busy -> create new aggregation executor (growing pool) OR - // return empty optional - if (instance()[gpu_id].growing_pool) { - 
instance()[gpu_id].aggregation_executor_pool.emplace_back( - instance()[gpu_id].slices_per_executor, instance()[gpu_id].mode, gpu_id); - instance()[gpu_id].current_interface = - instance()[gpu_id].aggregation_executor_pool.size() - 1; - assert(instance()[gpu_id].aggregation_executor_pool.size() < 20480); - ret = instance()[gpu_id] - .aggregation_executor_pool[instance()[gpu_id].current_interface] - .request_executor_slice(); - assert(ret.has_value()); // fresh executor -- should always have slices - // available - } - return ret; - } - -private: - std::deque> aggregation_executor_pool; - std::atomic current_interface{0}; - size_t slices_per_executor; - Aggregated_Executor_Modes mode; - bool growing_pool{true}; - -private: - /// Required for dealing with adding elements to the deque of - /// aggregated_executors - cppuddle::aggregation_mutex_t pool_mutex; - /// Global access instance - static std::unique_ptr& instance(void) { - static std::unique_ptr pool_instances{ - new aggregation_pool[cppuddle::max_number_gpus]}; - return pool_instances; - } - static inline size_t number_devices = 1; - static inline bool is_initialized = false; - aggregation_pool() = default; - -public: - ~aggregation_pool() = default; - // Bunch of constructors we don't need - aggregation_pool(aggregation_pool const &other) = delete; - aggregation_pool &operator=(aggregation_pool const &other) = delete; - aggregation_pool(aggregation_pool &&other) = delete; - aggregation_pool &operator=(aggregation_pool &&other) = delete; -}; +using aggregation_pool = + cppuddle::kernel_aggregation::aggregation_pool; #endif diff --git a/include/cppuddle/executor_recycling/executor_pools_interface.hpp b/include/cppuddle/executor_recycling/executor_pools_interface.hpp index dac9f170..49a6d42d 100644 --- a/include/cppuddle/executor_recycling/executor_pools_interface.hpp +++ b/include/cppuddle/executor_recycling/executor_pools_interface.hpp @@ -26,7 +26,7 @@ template using executor_interface = detail::executor_interface; -} -} +} // end namespace executor_recycling +} // end namespace cppuddle #endif diff --git a/include/cppuddle/kernel_aggregation/kernel_aggregation_management.hpp b/include/cppuddle/kernel_aggregation/kernel_aggregation_management.hpp new file mode 100644 index 00000000..fd5a8e77 --- /dev/null +++ b/include/cppuddle/kernel_aggregation/kernel_aggregation_management.hpp @@ -0,0 +1,1161 @@ +// Copyright (c) 2022-2024 Gregor Daiß +// +// Distributed under the Boost Software License, Version 1.0. 
(See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#ifndef KERNEL_AGGREGATION_MANAGEMENT_HPP +#define KERNEL_AGGREGATION_MANAGEMENT_HPP + +#ifndef CPPUDDLE_HAVE_HPX +#error "Work aggregation allocators/executors require CPPUDDLE_WITH_HPX=ON" +#endif + +#include +//#define DEBUG_AGGREGATION_CALLS 1 + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#if defined(HPX_HAVE_CUDA) || defined(HPX_HAVE_HIP) +// required for defining type traits using cuda executor as underlying +// aggregation executors +#include +#endif + +#include +#include + +#include "../include/buffer_manager.hpp" +#include "../include/stream_manager.hpp" +#include "cppuddle/common/config.hpp" + +#ifndef CPPUDDLE_HAVE_HPX_MUTEX +#pragma message \ + "Work aggregation will use hpx::mutex internally, despite CPPUDDLE_WITH_HPX_MUTEX=OFF" +#pragma message \ + "Consider using CPPUDDLE_WITH_HPX_MUTEX=ON, to make the rest of CPPuddle also use hpx::mutex" +#endif +namespace cppuddle { +namespace kernel_aggregation { + using aggregation_mutex_t = hpx::mutex; + +//=============================================================================== +//=============================================================================== +// Helper functions/classes + +/// Constructs a tuple with copies (to store temporaries in aggregated function +/// calls) yet also supporting references (on the users own risk...) +template +std::tuple make_tuple_supporting_references(Ts &&...ts) { + return std::tuple{std::forward(ts)...}; +} + +/// Print some specific values that we can, but don't bother for most types +/// (such as vector) +template std::string print_if_possible(T val) { + if constexpr (std::is_convertible_v) { + return val; + } else if constexpr (std::is_integral_v || std::is_floating_point_v) { + return std::to_string(val); + } else if constexpr (std::is_pointer_v) { + // Pretty printing pointer sort of only works well with %p + // TODO Try using std::format as soon as we can move to C++20 + std::unique_ptr debug_string(new char[128]()); + snprintf(debug_string.get(), 128, "%p", val); + return std::string(debug_string.get()); + } else { + return std::string("cannot print value"); + } +} + +/// Helper class for the helper class that prints tuples -- do not use this +/// directly +template +void print_tuple(const TupType &_tup, std::index_sequence) { + (..., (hpx::cout << (I == 0 ? "" : ", ") + << print_if_possible(std::get(_tup)))); +} + +/// Helper class for printing tuples (first component should be a function +/// pointer, remaining components the function arguments) +template void print_tuple(const std::tuple &_tup) { + // Use pointer and sprintf as boost::format refused to NOT cast the pointer + // address to 1... 
+ // TODO Try using std::format as soon as we can move to C++20 + std::unique_ptr debug_string(new char[128]()); + snprintf(debug_string.get(), 128, "Function address: %p -- Arguments: (", + std::get<0>(_tup)); + hpx::cout << debug_string.get(); + print_tuple(_tup, std::make_index_sequence()); + hpx::cout << ")"; +} + +//=============================================================================== +//=============================================================================== +template +void exec_post_wrapper(Executor & exec, F &&f, Ts &&...ts) { + hpx::apply(exec, std::forward(f), std::forward(ts)...); +} + +template +hpx::lcos::future exec_async_wrapper(Executor & exec, F &&f, Ts &&...ts) { + return hpx::async(exec, std::forward(f), std::forward(ts)...); +} + +/// Manages the launch conditions for aggregated function calls +/// type/value-errors +/** Launch conditions: All slice executors must have called the same function + * (tracked by future all_slices_ready) + * AND + * Previous aggregated_function_call on the same Executor must have been + * launched (tracked by future stream_future) + * All function calls received from the slice executors are checked if they + * match the first one in both types and values (throws exception otherwise) + */ + +template class aggregated_function_call { +private: + std::atomic slice_counter = 0; + + /// Promise to be set when all slices have visited this function call + /* hpx::lcos::local::promise slices_ready_promise; */ + /// Tracks if all slices have visited this function call + /* hpx::lcos::future all_slices_ready = slices_ready_promise.get_future(); */ + /// How many slices can we expect? + const size_t number_slices; + const bool async_mode; + + Executor &underlying_executor; + +#if !(defined(NDEBUG)) && defined(DEBUG_AGGREGATION_CALLS) +#pragma message \ + "Building slow work aggegator build with additional runtime checks! Build with NDEBUG defined for fast build..." 
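// Note on the debug-only members below: they implement the argument checking
// described in the class comment above. The first slice's call is captured
// type-erased in function_tuple (std::any) together with its typeid name in
// debug_type_information; every later slice's post_when/async_when call is
// compared against that reference, and any mismatch in types or values is
// reported via hpx::cout (see the std::bad_any_cast / std::runtime_error
// handlers further down).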
+ /// Stores the function call of the first slice as reference for error + /// checking + std::any function_tuple; + /// Stores the string of the first function call for debug output + std::string debug_type_information; + aggregation_mutex_t debug_mut; +#endif + + std::vector> potential_async_promises{}; + +public: + aggregated_function_call(const size_t number_slices, bool async_mode, Executor &exec) + : number_slices(number_slices), async_mode(async_mode), underlying_executor(exec) { + if (async_mode) + potential_async_promises.resize(number_slices); + } + ~aggregated_function_call(void) { + // All slices should have done this call + assert(slice_counter == number_slices); + // assert(!all_slices_ready.valid()); + } + /// Returns true if all required slices have visited this point + bool sync_aggregation_slices(hpx::lcos::future &stream_future) { + assert(!async_mode); + assert(potential_async_promises.empty()); + const size_t local_counter = slice_counter++; + if (local_counter == number_slices - 1) { + return true; + } + else return false; + } + template + void post_when(hpx::lcos::future &stream_future, F &&f, Ts &&...ts) { +#if !(defined(NDEBUG)) && defined(DEBUG_AGGREGATION_CALLS) + // needed for concurrent access to function_tuple and debug_type_information + // Not required for normal use + std::lock_guard guard(debug_mut); +#endif + assert(!async_mode); + assert(potential_async_promises.empty()); + const size_t local_counter = slice_counter++; + + if (local_counter == 0) { +#if !(defined(NDEBUG)) && defined(DEBUG_AGGREGATION_CALLS) + auto tmp_tuple = + make_tuple_supporting_references(f, std::forward(ts)...); + function_tuple = tmp_tuple; + debug_type_information = typeid(decltype(tmp_tuple)).name(); +#endif + + } else { + // + // This scope checks if both the type and the values of the current call + // match the original call To be used in debug build... 
+ // +#if !(defined(NDEBUG)) && defined(DEBUG_AGGREGATION_CALLS) + auto comparison_tuple = + make_tuple_supporting_references(f, std::forward(ts)...); + try { + auto orig_call_tuple = + std::any_cast(function_tuple); + if (comparison_tuple != orig_call_tuple) { + throw std::runtime_error( + "Values of post function arguments (or function " + "itself) do not match "); + } + } catch (const std::bad_any_cast &e) { + hpx::cout + << "\nMismatched types error in aggregated post call of executor " + << ": " << e.what() << "\n"; + hpx::cout << "Expected types:\t\t " + << boost::core::demangle(debug_type_information.c_str()); + hpx::cout << "\nGot types:\t\t " + << boost::core::demangle( + typeid(decltype(comparison_tuple)).name()) + << "\n" + << std::endl; + // throw; + } catch (const std::runtime_error &e) { + hpx::cout + << "\nMismatched values error in aggregated post call of executor " + << ": " << e.what() << std::endl; + hpx::cout << "Types (matched):\t " + << boost::core::demangle(debug_type_information.c_str()); + auto orig_call_tuple = + std::any_cast(function_tuple); + hpx::cout << "\nExpected values:\t "; + print_tuple(orig_call_tuple); + hpx::cout << "\nGot values:\t\t "; + print_tuple(comparison_tuple); + hpx::cout << std::endl << std::endl; + // throw; + } +#endif + } + assert(local_counter < number_slices); + assert(slice_counter < number_slices + 1); + // Check exit criteria: Launch function call continuation by setting the + // slices promise + if (local_counter == number_slices - 1) { + exec_post_wrapper(underlying_executor, std::forward(f), std::forward(ts)...); + //slices_ready_promise.set_value(); + } + } + template + hpx::lcos::future async_when(hpx::lcos::future &stream_future, + F &&f, Ts &&...ts) { +#if !(defined(NDEBUG)) && defined(DEBUG_AGGREGATION_CALLS) + // needed for concurrent access to function_tuple and debug_type_information + // Not required for normal use + std::lock_guard guard(debug_mut); +#endif + assert(async_mode); + assert(!potential_async_promises.empty()); + const size_t local_counter = slice_counter++; + if (local_counter == 0) { +#if !(defined(NDEBUG)) && defined(DEBUG_AGGREGATION_CALLS) + auto tmp_tuple = + make_tuple_supporting_references(f, std::forward(ts)...); + function_tuple = tmp_tuple; + debug_type_information = typeid(decltype(tmp_tuple)).name(); +#endif + } else { + // + // This scope checks if both the type and the values of the current call + // match the original call To be used in debug build... 
+ // +#if !(defined(NDEBUG)) && defined(DEBUG_AGGREGATION_CALLS) + auto comparison_tuple = + make_tuple_supporting_references(f, std::forward(ts)...); + try { + auto orig_call_tuple = + std::any_cast(function_tuple); + if (comparison_tuple != orig_call_tuple) { + throw std::runtime_error( + "Values of async function arguments (or function " + "itself) do not match "); + } + } catch (const std::bad_any_cast &e) { + hpx::cout + << "\nMismatched types error in aggregated async call of executor " + << ": " << e.what() << "\n"; + hpx::cout << "Expected types:\t\t " + << boost::core::demangle(debug_type_information.c_str()); + hpx::cout << "\nGot types:\t\t " + << boost::core::demangle( + typeid(decltype(comparison_tuple)).name()) + << "\n" + << std::endl; + // throw; + } catch (const std::runtime_error &e) { + hpx::cout + << "\nMismatched values error in aggregated async call of executor " + << ": " << e.what() << std::endl; + hpx::cout << "Types (matched):\t " + << boost::core::demangle(debug_type_information.c_str()); + auto orig_call_tuple = + std::any_cast(function_tuple); + hpx::cout << "\nExpected values:\t "; + print_tuple(orig_call_tuple); + hpx::cout << "\nGot values:\t\t "; + print_tuple(comparison_tuple); + hpx::cout << std::endl << std::endl; + // throw; + } +#endif + } + assert(local_counter < number_slices); + assert(slice_counter < number_slices + 1); + assert(potential_async_promises.size() == number_slices); + hpx::lcos::future ret_fut = + potential_async_promises[local_counter].get_future(); + if (local_counter == number_slices - 1) { + /* slices_ready_promise.set_value(); */ + auto fut = exec_async_wrapper( + underlying_executor, std::forward(f), std::forward(ts)...); + fut.then([this](auto &&fut) { + for (auto &promise : potential_async_promises) { + promise.set_value(); + } + }); + } + // Check exit criteria: Launch function call continuation by setting the + // slices promise + return ret_fut; + } + template + hpx::lcos::shared_future wrap_async(hpx::lcos::future &stream_future, + F &&f, Ts &&...ts) { + assert(async_mode); + assert(!potential_async_promises.empty()); + const size_t local_counter = slice_counter++; + assert(local_counter < number_slices); + assert(slice_counter < number_slices + 1); + assert(potential_async_promises.size() == number_slices); + hpx::lcos::shared_future ret_fut = + potential_async_promises[local_counter].get_shared_future(); + if (local_counter == number_slices - 1) { + auto fut = f(std::forward(ts)...); + fut.then([this](auto &&fut) { + // TODO just use one promise + for (auto &promise : potential_async_promises) { + promise.set_value(); + } + }); + } + return ret_fut; + } + // We need to be able to copy or no-except move for std::vector.. + aggregated_function_call(const aggregated_function_call &other) = default; + aggregated_function_call & + operator=(const aggregated_function_call &other) = default; + aggregated_function_call(aggregated_function_call &&other) = default; + aggregated_function_call & + operator=(aggregated_function_call &&other) = default; +}; + +//=============================================================================== +//=============================================================================== + +enum class aggregated_executor_modes { EAGER = 1, STRICT, ENDLESS }; +/// Declaration since the actual allocator is only defined after the Executors +template +class allocator_slice; + +/// Executor Class that aggregates function calls for specific kernels +/** Executor is not meant to be used directly. 
Instead it yields multiple + * executor_slice objects. These serve as interfaces. Slices from the same + * Aggregated_Executor are meant to execute the same function calls but on + * different data (i.e. different tasks) + */ +template class Aggregated_Executor { +private: + //=============================================================================== + // Misc private avariables: + // + std::atomic slices_exhausted; + + std::atomic executor_slices_alive; + std::atomic buffers_in_use; + std::atomic dealloc_counter; + + const aggregated_executor_modes mode; + const size_t max_slices; + std::atomic current_slices; + /// Wrapper to the executor interface from the stream pool + /// Automatically hooks into the stream_pools reference counting + /// for cpu/gpu load balancing + std::unique_ptr>> executor_wrapper; + +public: + size_t gpu_id; + // Subclasses + + /// Slice class - meant as a scope interface to the aggregated executor + class executor_slice { + public: + Aggregated_Executor &parent; + private: + /// Executor is a slice of this aggregated_executor + /// How many functions have been called - required to enforce sequential + /// behaviour of kernel launches + size_t launch_counter{0}; + size_t buffer_counter{0}; + bool notify_parent_about_destruction{true}; + + public: + /// How many slices are there overall - required to check the launch + /// criteria + const size_t number_slices; + const size_t id; + using executor_t = Executor; + executor_slice(Aggregated_Executor &parent, const size_t slice_id, + const size_t number_slices) + : parent(parent), notify_parent_about_destruction(true), + number_slices(number_slices), id(slice_id) { + } + ~executor_slice(void) { + // Don't notify parent if we moved away from this executor_slice + if (notify_parent_about_destruction) { + // Executor should be done by the time of destruction + // -> check here before notifying parent + + // parent still in execution mode? + assert(parent.slices_exhausted == true); + // all kernel launches done? 
+ assert(launch_counter == parent.function_calls.size()); + // Notifiy parent that this aggregation slice is one + parent.reduce_usage_counter(); + } + } + executor_slice(const executor_slice &other) = delete; + executor_slice &operator=(const executor_slice &other) = delete; + executor_slice(executor_slice &&other) + : parent(other.parent), launch_counter(std::move(other.launch_counter)), + buffer_counter(std::move(other.buffer_counter)), + number_slices(std::move(other.number_slices)), + id(std::move(other.id)) { + other.notify_parent_about_destruction = false; + } + executor_slice &operator=(executor_slice &&other) { + parent = other.parent; + launch_counter = std::move(other.launch_counter); + buffer_counter = std::move(other.buffer_counter); + number_slices = std::move(other.number_slices); + id = std::move(other.id); + other.notify_parent_about_destruction = false; + } + template + allocator_slice make_allocator() { + return allocator_slice(*this); + } + bool sync_aggregation_slices() { + assert(parent.slices_exhausted == true); + auto ret = parent.sync_aggregation_slices(launch_counter); + launch_counter++; + return ret; + } + template void post(F &&f, Ts &&...ts) { + // we should only execute function calls once all slices + // have been given away (-> Executor Slices start) + assert(parent.slices_exhausted == true); + parent.post(launch_counter, std::forward(f), std::forward(ts)...); + launch_counter++; + } + template + hpx::lcos::future async(F &&f, Ts &&...ts) { + // we should only execute function calls once all slices + // have been given away (-> Executor Slices start) + assert(parent.slices_exhausted == true); + hpx::lcos::future ret_fut = parent.async( + launch_counter, std::forward(f), std::forward(ts)...); + launch_counter++; + return ret_fut; + } + + // OneWay Execution + template + friend decltype(auto) tag_invoke(hpx::parallel::execution::post_t, + executor_slice& exec, F&& f, Ts&&... ts) + { + return exec.post(std::forward(f), std::forward(ts)...); + } + + // TwoWay Execution + template + friend decltype(auto) tag_invoke( + hpx::parallel::execution::async_execute_t, executor_slice& exec, + F&& f, Ts&&... ts) + { + return exec.async( + std::forward(f), std::forward(ts)...); + } + + template + hpx::lcos::shared_future wrap_async(F &&f, Ts &&...ts) { + // we should only execute function calls once all slices + // have been given away (-> Executor Slices start) + assert(parent.slices_exhausted == true); + hpx::lcos::shared_future ret_fut = parent.wrap_async( + launch_counter, std::forward(f), std::forward(ts)...); + launch_counter++; + return ret_fut; + } + + /// Get new aggregated buffer (might have already been allocated been + /// allocated by different slice) + template T *get(const size_t size) { + assert(parent.slices_exhausted == true); + T *aggregated_buffer = + parent.get(size, buffer_counter); + buffer_counter++; + assert(buffer_counter > 0); + return aggregated_buffer; + } + + Executor& get_underlying_executor(void) { + assert(parent.executor_wrapper); + return *(parent.executor_wrapper); + } + }; + + // deprecated name... 
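// Illustrative usage sketch for an executor_slice (agg_exec stands for an
// aggregated executor instance; the element type, host allocator, kernel
// function and n are placeholders). An aggregated call is only forwarded to
// the underlying executor once every participating slice has issued it:
//
//   auto maybe_slice = agg_exec.request_executor_slice();
//   if (maybe_slice) {                    // std::optional of hpx::future<executor_slice>
//     auto slice = maybe_slice->get();    // becomes ready once the launch criteria are met
//     auto alloc = slice.make_allocator<float, std::allocator<float>>();
//     std::vector<float, decltype(alloc)> buf(n, 0.0f, alloc); // aggregated buffer, shared across slices
//     slice.post(example_kernel, buf.data(), n);               // fire-and-forget, aggregated
//     auto fut = slice.async(example_kernel, buf.data(), n);   // two-way, aggregated
//     fut.get();
//   }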
+ /* using Executor_Slice = executor_slice; */ + + //=============================================================================== + + hpx::lcos::local::promise slices_full_promise; + /// Promises with the slice executors -- to be set when the starting criteria + /// is met + std::vector> executor_slices; + /// List of aggregated function calls - function will be launched when all + /// slices have called it + std::deque> function_calls; + /// For synchronizing the access to the function calls list + aggregation_mutex_t mut; + + /// Data entry for a buffer allocation: void* pointer, size_t for + /// buffer-size, atomic for the slice counter, location_id, gpu_id + using buffer_entry_t = + std::tuple, bool, const size_t, size_t>; + /// Keeps track of the aggregated buffer allocations done in all the slices + std::deque buffer_allocations; + /// Map pointer to deque index for fast access in the deallocations + std::unordered_map buffer_allocations_map; + /// For synchronizing the access to the buffer_allocations + aggregation_mutex_t buffer_mut; + std::atomic buffer_counter = 0; + + /// Get new buffer OR get buffer already allocated by different slice + template + T *get(const size_t size, const size_t slice_alloc_counter) { + assert(slices_exhausted == true); + assert(executor_wrapper); + assert(executor_slices_alive == true); + // Add aggreated buffer entry in case it hasn't happened yet for this call + // First: Check if it already has happened + if (buffer_counter <= slice_alloc_counter) { + // we might be the first! Lock... + std::lock_guard guard(buffer_mut); + // ... and recheck + if (buffer_counter <= slice_alloc_counter) { + constexpr bool manage_content_lifetime = false; + buffers_in_use = true; + + // Default location -- useful for GPU builds as we otherwise create way too + // many different buffers for different aggregation sizes on different GPUs + /* size_t location_id = gpu_id * instances_per_gpu; */ + // Use integer conversion to only use 0 16 32 ... as buckets + size_t location_id = ((hpx::get_worker_thread_num() % cppuddle::number_instances) / 16) * 16; +#ifdef CPPUDDLE_HAVE_HPX_AWARE_ALLOCATORS + if (max_slices == 1) { + // get prefered location: aka the current hpx threads location + // Usually handy for CPU builds where we want to use the buffers + // close to the current CPU core + /* location_id = (hpx::get_worker_thread_num() / instances_per_gpu) * instances_per_gpu; */ + /* location_id = (gpu_id) * instances_per_gpu; */ + // division makes sure that we always use the same instance to store our gpu buffers. + } +#endif + // Get shiny and new buffer that will be shared between all slices + // Buffer might be recycled from previous allocations by the + // buffer_interface... 
+ T *aggregated_buffer = + cppuddle::memory_recycling::detail::buffer_interface::get< + T, Host_Allocator>(size, manage_content_lifetime, location_id, + gpu_id); + // Create buffer entry for this buffer + buffer_allocations.emplace_back(static_cast(aggregated_buffer), + size, 1, true, location_id, gpu_id); + +#ifndef NDEBUG + // if previousely used the buffer should not be in usage anymore + const auto exists = buffer_allocations_map.count( + static_cast(aggregated_buffer)); + if (exists > 0) { + const auto previous_usage_id = + buffer_allocations_map[static_cast(aggregated_buffer)]; + const auto &valid = + std::get<3>(buffer_allocations[previous_usage_id]); + assert(!valid); + } +#endif + buffer_allocations_map.insert_or_assign(static_cast(aggregated_buffer), + buffer_counter); + + assert (buffer_counter == slice_alloc_counter); + buffer_counter = buffer_allocations.size(); + + // Return buffer + return aggregated_buffer; + } + } + assert(buffers_in_use == true); + assert(std::get<3>(buffer_allocations[slice_alloc_counter])); // valid + assert(std::get<2>(buffer_allocations[slice_alloc_counter]) >= 1); + + // Buffer entry should already exist: + T *aggregated_buffer = static_cast( + std::get<0>(buffer_allocations[slice_alloc_counter])); + // Error handling: Size is wrong? + assert(size == std::get<1>(buffer_allocations[slice_alloc_counter])); + // Notify that one more slice has visited this buffer allocation + std::get<2>(buffer_allocations[slice_alloc_counter])++; + return aggregated_buffer; + } + + /// Notify buffer list that one slice is done with the buffer + template + void mark_unused(T *p, const size_t size) { + assert(slices_exhausted == true); + assert(executor_wrapper); + + void *ptr_key = static_cast(p); + size_t slice_alloc_counter = buffer_allocations_map[p]; + + assert(slice_alloc_counter < buffer_allocations.size()); + /*auto &[buffer_pointer_any, buffer_size, buffer_allocation_counter, valid] = + buffer_allocations[slice_alloc_counter];*/ + auto buffer_pointer_void = std::get<0>(buffer_allocations[slice_alloc_counter]); + const auto buffer_size = std::get<1>(buffer_allocations[slice_alloc_counter]); + auto &buffer_allocation_counter = std::get<2>(buffer_allocations[slice_alloc_counter]); + auto &valid = std::get<3>(buffer_allocations[slice_alloc_counter]); + const auto &location_id = std::get<4>(buffer_allocations[slice_alloc_counter]); + const auto &gpu_id = std::get<5>(buffer_allocations[slice_alloc_counter]); + assert(valid); + T *buffer_pointer = static_cast(buffer_pointer_void); + + assert(buffer_size == size); + assert(p == buffer_pointer); + // assert(buffer_pointer == p || buffer_pointer == nullptr); + // Slice is done with this buffer + buffer_allocation_counter--; + // Check if all slices are done with this buffer? + if (buffer_allocation_counter == 0) { + // Yes! 
"Deallocate" by telling the recylcer the buffer is fit for reusage + std::lock_guard guard(buffer_mut); + // Only mark unused if another buffer has not done so already (and marked + // it as invalid) + if (valid) { + assert(buffers_in_use == true); + cppuddle::memory_recycling::detail::buffer_interface::mark_unused< + T, Host_Allocator>(buffer_pointer, buffer_size, location_id, + gpu_id); + // mark buffer as invalid to prevent any other slice from marking the + // buffer as unused + valid = false; + + const size_t current_deallocs = ++dealloc_counter; + if (current_deallocs == buffer_counter) { + std::lock_guard guard(mut); + buffers_in_use = false; + if (!executor_slices_alive && !buffers_in_use) { + slices_exhausted = false; + // Release executor + executor_wrapper.reset(nullptr); + } + } + } + } + } + + //=============================================================================== + // Public Interface +public: + hpx::lcos::future current_continuation; + hpx::lcos::future last_stream_launch_done; + std::atomic overall_launch_counter = 0; + + /// Only meant to be accessed by the slice executors + bool sync_aggregation_slices(const size_t slice_launch_counter) { + std::lock_guard guard(mut); + assert(slices_exhausted == true); + assert(executor_wrapper); + // Add function call object in case it hasn't happened for this launch yet + if (overall_launch_counter <= slice_launch_counter) { + /* std::lock_guard guard(mut); */ + if (overall_launch_counter <= slice_launch_counter) { + function_calls.emplace_back(current_slices, false, *executor_wrapper); + overall_launch_counter = function_calls.size(); + return function_calls[slice_launch_counter].sync_aggregation_slices( + last_stream_launch_done); + } + } + + return function_calls[slice_launch_counter].sync_aggregation_slices( + last_stream_launch_done); + } + + /// Only meant to be accessed by the slice executors + template + void post(const size_t slice_launch_counter, F &&f, Ts &&...ts) { + std::lock_guard guard(mut); + assert(slices_exhausted == true); + assert(executor_wrapper); + // Add function call object in case it hasn't happened for this launch yet + if (overall_launch_counter <= slice_launch_counter) { + /* std::lock_guard guard(mut); */ + if (overall_launch_counter <= slice_launch_counter) { + function_calls.emplace_back(current_slices, false, *executor_wrapper); + overall_launch_counter = function_calls.size(); + function_calls[slice_launch_counter].post_when( + last_stream_launch_done, std::forward(f), std::forward(ts)...); + return; + } + } + + function_calls[slice_launch_counter].post_when( + last_stream_launch_done, std::forward(f), std::forward(ts)...); + return; + } + + /// Only meant to be accessed by the slice executors + template + hpx::lcos::future async(const size_t slice_launch_counter, F &&f, + Ts &&...ts) { + std::lock_guard guard(mut); + assert(slices_exhausted == true); + assert(executor_wrapper); + // Add function call object in case it hasn't happened for this launch yet + if (overall_launch_counter <= slice_launch_counter) { + /* std::lock_guard guard(mut); */ + if (overall_launch_counter <= slice_launch_counter) { + function_calls.emplace_back(current_slices, true, *executor_wrapper); + overall_launch_counter = function_calls.size(); + return function_calls[slice_launch_counter].async_when( + last_stream_launch_done, std::forward(f), std::forward(ts)...); + } + } + + return function_calls[slice_launch_counter].async_when( + last_stream_launch_done, std::forward(f), std::forward(ts)...); + } + /// Only meant 
to be accessed by the slice executors + template + hpx::lcos::shared_future wrap_async(const size_t slice_launch_counter, F &&f, + Ts &&...ts) { + std::lock_guard guard(mut); + assert(slices_exhausted == true); + assert(executor_wrapper); + // Add function call object in case it hasn't happened for this launch yet + if (overall_launch_counter <= slice_launch_counter) { + /* std::lock_guard guard(mut); */ + if (overall_launch_counter <= slice_launch_counter) { + function_calls.emplace_back(current_slices, true, *executor_wrapper); + overall_launch_counter = function_calls.size(); + return function_calls[slice_launch_counter].wrap_async( + last_stream_launch_done, std::forward(f), std::forward(ts)...); + } + } + + return function_calls[slice_launch_counter].wrap_async( + last_stream_launch_done, std::forward(f), std::forward(ts)...); + } + + bool slice_available(void) { + std::lock_guard guard(mut); + return !slices_exhausted; + } + + std::optional> request_executor_slice() { + std::lock_guard guard(mut); + if (!slices_exhausted) { + const size_t local_slice_id = ++current_slices; + if (local_slice_id == 1) { + // Cleanup leftovers from last run if any + // TODO still required? Should be clean here already + function_calls.clear(); + overall_launch_counter = 0; + std::lock_guard guard(buffer_mut); +#ifndef NDEBUG + for (const auto &buffer_entry : buffer_allocations) { + const auto &[buffer_pointer_any, buffer_size, + buffer_allocation_counter, valid, location_id, device_id] = + buffer_entry; + assert(!valid); + } +#endif + buffer_allocations.clear(); + buffer_allocations_map.clear(); + buffer_counter = 0; + + assert(executor_slices_alive == false); + assert(buffers_in_use == false); + executor_slices_alive = true; + buffers_in_use = false; + dealloc_counter = 0; + + if (mode == aggregated_executor_modes::STRICT ) { + slices_full_promise = hpx::lcos::local::promise{}; + } + } + + // Create Executor Slice future -- that will be returned later + hpx::lcos::future ret_fut; + if (local_slice_id < max_slices) { + executor_slices.emplace_back(hpx::lcos::local::promise{}); + ret_fut = + executor_slices[local_slice_id - 1].get_future(); + } else { + launched_slices = current_slices; + ret_fut = hpx::make_ready_future(executor_slice{*this, + executor_slices.size(), launched_slices}); + } + + // Are we the first slice? If yes, add continuation set the + // executor_slice + // futures to ready if the launch conditions are met + if (local_slice_id == 1) { + // Redraw executor + assert(!executor_wrapper); + stream_pool::select_device>(gpu_id); + executor_wrapper.reset( + new stream_interface>(gpu_id)); + // Renew promise that all slices will be ready as the primary launch + // criteria... 
+ hpx::lcos::shared_future fut; + if (mode == aggregated_executor_modes::EAGER || + mode == aggregated_executor_modes::ENDLESS) { + // Fallback launch condidtion: Launch as soon as the underlying stream + // is ready + /* auto slices_full_fut = slices_full_promise.get_future(); */ + stream_pool::select_device>(gpu_id); + auto exec_fut = (*executor_wrapper).get_future(); + /* auto fut = hpx::when_any(exec_fut, slices_full_fut); */ + fut = std::move(exec_fut); + } else { + auto slices_full_fut = slices_full_promise.get_shared_future(); + // Just use the slices launch condition + fut = std::move(slices_full_fut); + } + // Launch all executor slices within this continuation + current_continuation = fut.then([this](auto &&fut) { + std::lock_guard guard(mut); + slices_exhausted = true; + launched_slices = current_slices; + size_t id = 0; + for (auto &slice_promise : executor_slices) { + slice_promise.set_value( + executor_slice{*this, id, launched_slices}); + id++; + } + executor_slices.clear(); + }); + } + if (local_slice_id >= max_slices && + mode != aggregated_executor_modes::ENDLESS) { + slices_exhausted = true; // prevents any more threads from entering + // before the continuation is launched + /* launched_slices = current_slices; */ + /* size_t id = 0; */ + /* for (auto &slice_promise : executor_slices) { */ + /* slice_promise.set_value( */ + /* executor_slice{*this, id, launched_slices}); */ + /* id++; */ + /* } */ + /* executor_slices.clear(); */ + if (mode == aggregated_executor_modes::STRICT ) { + slices_full_promise.set_value(); // Trigger slices launch condition continuation + } + // that continuation will set all executor slices so far handed out to ready + } + return ret_fut; + } else { + // Return empty optional as failure + return std::optional>{}; + } + } + size_t launched_slices; + void reduce_usage_counter(void) { + /* std::lock_guard guard(mut); */ + assert(slices_exhausted == true); + assert(executor_wrapper); + assert(executor_slices_alive == true); + assert(launched_slices >= 1); + assert(current_slices >= 0 && current_slices <= launched_slices); + const size_t local_slice_id = --current_slices; + // Last slice goes out scope? 
+ if (local_slice_id == 0) { + // Mark executor fit for reusage + std::lock_guard guard(mut); + executor_slices_alive = false; + if (!executor_slices_alive && !buffers_in_use) { + // Release executor + slices_exhausted = false; + executor_wrapper.reset(nullptr); + } + } + } + ~Aggregated_Executor(void) { + + assert(current_slices == 0); + assert(executor_slices_alive == false); + assert(buffers_in_use == false); + + if (mode != aggregated_executor_modes::STRICT ) { + slices_full_promise.set_value(); // Trigger slices launch condition continuation + } + + // Cleanup leftovers from last run if any + function_calls.clear(); + overall_launch_counter = 0; +#ifndef NDEBUG + for (const auto &buffer_entry : buffer_allocations) { + const auto &[buffer_pointer_any, buffer_size, buffer_allocation_counter, + valid, location_id, device_id] = buffer_entry; + assert(!valid); + } +#endif + buffer_allocations.clear(); + buffer_allocations_map.clear(); + buffer_counter = 0; + + assert(buffer_allocations.empty()); + assert(buffer_allocations_map.empty()); + } + + Aggregated_Executor(const size_t number_slices, + aggregated_executor_modes mode, const size_t gpu_id = 0) + : max_slices(number_slices), current_slices(0), slices_exhausted(false), + dealloc_counter(0), mode(mode), executor_slices_alive(false), + buffers_in_use(false), gpu_id(gpu_id), + executor_wrapper(nullptr), + current_continuation(hpx::make_ready_future()), + last_stream_launch_done(hpx::make_ready_future()) {} + // Not meant to be copied or moved + Aggregated_Executor(const Aggregated_Executor &other) = delete; + Aggregated_Executor &operator=(const Aggregated_Executor &other) = delete; + Aggregated_Executor(Aggregated_Executor &&other) = delete; + Aggregated_Executor &operator=(Aggregated_Executor &&other) = delete; +}; + +template +class allocator_slice { +private: + typename Aggregated_Executor::executor_slice &executor_reference; + Aggregated_Executor &executor_parent; + +public: + using value_type = T; + allocator_slice( + typename Aggregated_Executor::executor_slice &executor) + : executor_reference(executor), executor_parent(executor.parent) {} + template + explicit allocator_slice( + allocator_slice const &) noexcept {} + T *allocate(std::size_t n) { + T *data = executor_reference.template get(n); + return data; + } + void deallocate(T *p, std::size_t n) { + /* executor_reference.template mark_unused(p, n); */ + executor_parent.template mark_unused(p, n); + } + template + inline void construct(T *p, Args... 
args) noexcept { + // Do nothing here - we reuse the content of the last owner + } + void destroy(T *p) { + // Do nothing here - Contents will be destroyed when the buffer manager is + // destroyed, not before + } +}; +template +constexpr bool +operator==(allocator_slice const &, + allocator_slice const &) noexcept { + return false; +} +template +constexpr bool +operator!=(allocator_slice const &, + allocator_slice const &) noexcept { + return true; +} + +//=============================================================================== +//=============================================================================== +// Pool Strategy: + +template +class aggregation_pool { +public: + /// interface + template + static void init(size_t number_of_executors, size_t slices_per_executor, + aggregated_executor_modes mode, size_t num_devices = 1) { + if (is_initialized) { + throw std::runtime_error( + std::string("Trying to initialize cppuddle aggregation pool twice") + + " Agg pool name: " + std::string(kernelname)); + } + if (num_devices > cppuddle::max_number_gpus) { + throw std::runtime_error( + std::string( + "Trying to initialize aggregation with more devices than the " + "maximum number of GPUs given at compiletime") + + " Agg pool name: " + std::string(kernelname)); + } + number_devices = num_devices; + for (size_t gpu_id = 0; gpu_id < number_devices; gpu_id++) { + + std::lock_guard guard(instance()[gpu_id].pool_mutex); + assert(instance()[gpu_id].aggregation_executor_pool.empty()); + for (int i = 0; i < number_of_executors; i++) { + instance()[gpu_id].aggregation_executor_pool.emplace_back(slices_per_executor, + mode, gpu_id); + } + instance()[gpu_id].slices_per_executor = slices_per_executor; + instance()[gpu_id].mode = mode; + } + is_initialized = true; + } + + /// Will always return a valid executor slice + static decltype(auto) request_executor_slice(void) { + if (!is_initialized) { + throw std::runtime_error( + std::string("Trying to use cppuddle aggregation pool without first calling init") + + " Agg poolname: " + std::string(kernelname)); + } + const size_t gpu_id = cppuddle::get_device_id(number_devices); + /* const size_t gpu_id = 1; */ + std::lock_guard guard(instance()[gpu_id].pool_mutex); + assert(!instance()[gpu_id].aggregation_executor_pool.empty()); + std::optional::executor_slice>> + ret; + size_t local_id = (instance()[gpu_id].current_interface) % + instance()[gpu_id].aggregation_executor_pool.size(); + ret = instance()[gpu_id].aggregation_executor_pool[local_id].request_executor_slice(); + // Expected case: current aggregation executor is free + if (ret.has_value()) { + return ret; + } + // current interface is bad -> find free one + size_t abort_counter = 0; + const size_t abort_number = instance()[gpu_id].aggregation_executor_pool.size() + 1; + do { + local_id = (++(instance()[gpu_id].current_interface)) % // increment interface + instance()[gpu_id].aggregation_executor_pool.size(); + ret = + instance()[gpu_id].aggregation_executor_pool[local_id].request_executor_slice(); + if (ret.has_value()) { + return ret; + } + abort_counter++; + } while (abort_counter <= abort_number); + // Everything's busy -> create new aggregation executor (growing pool) OR + // return empty optional + if (instance()[gpu_id].growing_pool) { + instance()[gpu_id].aggregation_executor_pool.emplace_back( + instance()[gpu_id].slices_per_executor, instance()[gpu_id].mode, gpu_id); + instance()[gpu_id].current_interface = + instance()[gpu_id].aggregation_executor_pool.size() - 1; + 
assert(instance()[gpu_id].aggregation_executor_pool.size() < 20480); + ret = instance()[gpu_id] + .aggregation_executor_pool[instance()[gpu_id].current_interface] + .request_executor_slice(); + assert(ret.has_value()); // fresh executor -- should always have slices + // available + } + return ret; + } + +private: + std::deque> aggregation_executor_pool; + std::atomic current_interface{0}; + size_t slices_per_executor; + aggregated_executor_modes mode; + bool growing_pool{true}; + +private: + /// Required for dealing with adding elements to the deque of + /// aggregated_executors + aggregation_mutex_t pool_mutex; + /// Global access instance + static std::unique_ptr& instance(void) { + static std::unique_ptr pool_instances{ + new aggregation_pool[cppuddle::max_number_gpus]}; + return pool_instances; + } + static inline size_t number_devices = 1; + static inline bool is_initialized = false; + aggregation_pool() = default; + +public: + ~aggregation_pool() = default; + // Bunch of constructors we don't need + aggregation_pool(aggregation_pool const &other) = delete; + aggregation_pool &operator=(aggregation_pool const &other) = delete; + aggregation_pool(aggregation_pool &&other) = delete; + aggregation_pool &operator=(aggregation_pool &&other) = delete; +}; + +} // namespace kernel_aggregation +} // namespace cppuddle + +namespace hpx { namespace parallel { namespace execution { + // TODO Unfortunately does not work that way! Create trait that works for Executor Slices with + // compatible unlying executor types + /* template */ + /* struct is_one_way_executor::executor_slice> */ + /* : std::true_type */ + /* {}; */ + /* template */ + /* struct is_two_way_executor::executor_slice> */ + /* : std::true_type */ + /* {}; */ + +#if defined(HPX_HAVE_CUDA) || defined(HPX_HAVE_HIP) + // Workaround for the meantime: Manually create traits for compatible types: +template <> +struct is_one_way_executor< + typename cppuddle::kernel_aggregation::Aggregated_Executor< + hpx::cuda::experimental::cuda_executor>::executor_slice> + : std::true_type {}; +template <> +struct is_two_way_executor< + typename cppuddle::kernel_aggregation::Aggregated_Executor< + hpx::cuda::experimental::cuda_executor>::executor_slice> + : std::true_type {}; +#endif +}}} + +#endif From 54373c3e7d56366080850cad0e1f23d3dae4ec07 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20Dai=C3=9F?= Date: Fri, 8 Mar 2024 22:45:02 +0100 Subject: [PATCH 11/19] Work aggregation interface refactoring --- include/aggregation_manager.hpp | 22 ++- .../detail/aggregation_executor_pools.hpp | 134 ++++++++++++++ .../aggregation_executors_and_allocators.hpp} | 163 +++--------------- .../kernel_aggregation_interface.hpp | 34 ++++ tests/work_aggregation_cpu_triad.cpp | 16 +- 5 files changed, 215 insertions(+), 154 deletions(-) create mode 100644 include/cppuddle/kernel_aggregation/detail/aggregation_executor_pools.hpp rename include/cppuddle/kernel_aggregation/{kernel_aggregation_management.hpp => detail/aggregation_executors_and_allocators.hpp} (87%) create mode 100644 include/cppuddle/kernel_aggregation/kernel_aggregation_interface.hpp diff --git a/include/aggregation_manager.hpp b/include/aggregation_manager.hpp index 030150f9..bb0fd83f 100644 --- a/include/aggregation_manager.hpp +++ b/include/aggregation_manager.hpp @@ -6,21 +6,29 @@ #ifndef AGGREGATION_MANAGER_HPP #define AGGREGATION_MANAGER_HPP -#include "cppuddle/kernel_aggregation/kernel_aggregation_management.hpp" +#include "cppuddle/kernel_aggregation/kernel_aggregation_interface.hpp" -using 
Aggregated_Executor_Modes = - cppuddle::kernel_aggregation::aggregated_executor_modes; +using Aggregated_Executor_Modes + [[deprecated("Use cppuddle::kernel_aggregation::aggregated_executor_modes " + "from kernel_aggregation_interface.hpp instead")]] = + cppuddle::kernel_aggregation::aggregated_executor_modes; template -using Allocator_Slice = +using Allocator_Slice + [[deprecated("Use cppuddle::kernel_aggregation::allocator_slice " + "from kernel_aggregation_interface.hpp instead")]] = cppuddle::kernel_aggregation::allocator_slice; template -using Aggregated_Executor = - cppuddle::kernel_aggregation::Aggregated_Executor; +using Aggregated_Executor + [[deprecated("Use cppuddle::kernel_aggregation::aggregated_executor " + "from kernel_aggregation_interface.hpp instead")]] = + cppuddle::kernel_aggregation::aggregated_executor; template -using aggregation_pool = +using aggregation_pool + [[deprecated("Use cppuddle::kernel_aggregation::aggregation_pool " + "from kernel_aggregation_interface.hpp instead")]] = cppuddle::kernel_aggregation::aggregation_pool; diff --git a/include/cppuddle/kernel_aggregation/detail/aggregation_executor_pools.hpp b/include/cppuddle/kernel_aggregation/detail/aggregation_executor_pools.hpp new file mode 100644 index 00000000..b9d456cc --- /dev/null +++ b/include/cppuddle/kernel_aggregation/detail/aggregation_executor_pools.hpp @@ -0,0 +1,134 @@ +// Copyright (c) 2022-2024 Gregor Daiß +// +// Distributed under the Boost Software License, Version 1.0. (See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include "cppuddle/kernel_aggregation/detail/aggregation_executors_and_allocators.hpp" + +#ifndef AGGREGATION_EXECUTOR_POOL_HPP +#define AGGREGATION_EXECUTOR_POOL_HPP + +namespace cppuddle { +namespace kernel_aggregation { +namespace detail { + +template +class aggregation_pool { +public: + /// interface + template + static void init(size_t number_of_executors, size_t slices_per_executor, + aggregated_executor_modes mode, size_t num_devices = 1) { + if (is_initialized) { + throw std::runtime_error( + std::string("Trying to initialize cppuddle aggregation pool twice") + + " Agg pool name: " + std::string(kernelname)); + } + if (num_devices > cppuddle::max_number_gpus) { + throw std::runtime_error( + std::string( + "Trying to initialize aggregation with more devices than the " + "maximum number of GPUs given at compiletime") + + " Agg pool name: " + std::string(kernelname)); + } + number_devices = num_devices; + for (size_t gpu_id = 0; gpu_id < number_devices; gpu_id++) { + + std::lock_guard guard(instance()[gpu_id].pool_mutex); + assert(instance()[gpu_id].aggregation_executor_pool.empty()); + for (int i = 0; i < number_of_executors; i++) { + instance()[gpu_id].aggregation_executor_pool.emplace_back(slices_per_executor, + mode, gpu_id); + } + instance()[gpu_id].slices_per_executor = slices_per_executor; + instance()[gpu_id].mode = mode; + } + is_initialized = true; + } + + /// Will always return a valid executor slice + static decltype(auto) request_executor_slice(void) { + if (!is_initialized) { + throw std::runtime_error( + std::string("Trying to use cppuddle aggregation pool without first calling init") + + " Agg poolname: " + std::string(kernelname)); + } + const size_t gpu_id = cppuddle::get_device_id(number_devices); + /* const size_t gpu_id = 1; */ + std::lock_guard guard(instance()[gpu_id].pool_mutex); + assert(!instance()[gpu_id].aggregation_executor_pool.empty()); + std::optional::executor_slice>> + ret; + size_t local_id 
= (instance()[gpu_id].current_interface) % + instance()[gpu_id].aggregation_executor_pool.size(); + ret = instance()[gpu_id].aggregation_executor_pool[local_id].request_executor_slice(); + // Expected case: current aggregation executor is free + if (ret.has_value()) { + return ret; + } + // current interface is bad -> find free one + size_t abort_counter = 0; + const size_t abort_number = instance()[gpu_id].aggregation_executor_pool.size() + 1; + do { + local_id = (++(instance()[gpu_id].current_interface)) % // increment interface + instance()[gpu_id].aggregation_executor_pool.size(); + ret = + instance()[gpu_id].aggregation_executor_pool[local_id].request_executor_slice(); + if (ret.has_value()) { + return ret; + } + abort_counter++; + } while (abort_counter <= abort_number); + // Everything's busy -> create new aggregation executor (growing pool) OR + // return empty optional + if (instance()[gpu_id].growing_pool) { + instance()[gpu_id].aggregation_executor_pool.emplace_back( + instance()[gpu_id].slices_per_executor, instance()[gpu_id].mode, gpu_id); + instance()[gpu_id].current_interface = + instance()[gpu_id].aggregation_executor_pool.size() - 1; + assert(instance()[gpu_id].aggregation_executor_pool.size() < 20480); + ret = instance()[gpu_id] + .aggregation_executor_pool[instance()[gpu_id].current_interface] + .request_executor_slice(); + assert(ret.has_value()); // fresh executor -- should always have slices + // available + } + return ret; + } + +private: + std::deque> aggregation_executor_pool; + std::atomic current_interface{0}; + size_t slices_per_executor; + aggregated_executor_modes mode; + bool growing_pool{true}; + +private: + /// Required for dealing with adding elements to the deque of + /// aggregated_executors + aggregation_mutex_t pool_mutex; + /// Global access instance + static std::unique_ptr& instance(void) { + static std::unique_ptr pool_instances{ + new aggregation_pool[cppuddle::max_number_gpus]}; + return pool_instances; + } + static inline size_t number_devices = 1; + static inline bool is_initialized = false; + aggregation_pool() = default; + +public: + ~aggregation_pool() = default; + // Bunch of constructors we don't need + aggregation_pool(aggregation_pool const &other) = delete; + aggregation_pool &operator=(aggregation_pool const &other) = delete; + aggregation_pool(aggregation_pool &&other) = delete; + aggregation_pool &operator=(aggregation_pool &&other) = delete; +}; + +} // namespace detail +} // namespace kernel_aggregation +} // namespace cppuddle + +#endif diff --git a/include/cppuddle/kernel_aggregation/kernel_aggregation_management.hpp b/include/cppuddle/kernel_aggregation/detail/aggregation_executors_and_allocators.hpp similarity index 87% rename from include/cppuddle/kernel_aggregation/kernel_aggregation_management.hpp rename to include/cppuddle/kernel_aggregation/detail/aggregation_executors_and_allocators.hpp index fd5a8e77..5826c2c3 100644 --- a/include/cppuddle/kernel_aggregation/kernel_aggregation_management.hpp +++ b/include/cppuddle/kernel_aggregation/detail/aggregation_executors_and_allocators.hpp @@ -3,8 +3,8 @@ // Distributed under the Boost Software License, Version 1.0. 
(See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) -#ifndef KERNEL_AGGREGATION_MANAGEMENT_HPP -#define KERNEL_AGGREGATION_MANAGEMENT_HPP +#ifndef AGGREGATION_EXECUTOR_AND_ALLOCATOR_HPP +#define AGGREGATION_EXECUTOR_AND_ALLOCATOR_HPP #ifndef CPPUDDLE_HAVE_HPX #error "Work aggregation allocators/executors require CPPUDDLE_WITH_HPX=ON" @@ -60,6 +60,7 @@ #endif namespace cppuddle { namespace kernel_aggregation { +namespace detail { using aggregation_mutex_t = hpx::mutex; //=============================================================================== @@ -381,10 +382,10 @@ class allocator_slice; /// Executor Class that aggregates function calls for specific kernels /** Executor is not meant to be used directly. Instead it yields multiple * executor_slice objects. These serve as interfaces. Slices from the same - * Aggregated_Executor are meant to execute the same function calls but on + * aggregated_executor are meant to execute the same function calls but on * different data (i.e. different tasks) */ -template class Aggregated_Executor { +template class aggregated_executor { private: //=============================================================================== // Misc private avariables: @@ -410,7 +411,7 @@ template class Aggregated_Executor { /// Slice class - meant as a scope interface to the aggregated executor class executor_slice { public: - Aggregated_Executor &parent; + aggregated_executor &parent; private: /// Executor is a slice of this aggregated_executor /// How many functions have been called - required to enforce sequential @@ -425,7 +426,7 @@ template class Aggregated_Executor { const size_t number_slices; const size_t id; using executor_t = Executor; - executor_slice(Aggregated_Executor &parent, const size_t slice_id, + executor_slice(aggregated_executor &parent, const size_t slice_id, const size_t number_slices) : parent(parent), notify_parent_about_destruction(true), number_slices(number_slices), id(slice_id) { @@ -536,7 +537,7 @@ template class Aggregated_Executor { }; // deprecated name... 
- /* using Executor_Slice = executor_slice; */ + using Executor_Slice [[deprectated("Renamed: Use executor_slice instead")]] = executor_slice; //=============================================================================== @@ -922,7 +923,7 @@ template class Aggregated_Executor { } } } - ~Aggregated_Executor(void) { + ~aggregated_executor(void) { assert(current_slices == 0); assert(executor_slices_alive == false); @@ -950,7 +951,7 @@ template class Aggregated_Executor { assert(buffer_allocations_map.empty()); } - Aggregated_Executor(const size_t number_slices, + aggregated_executor(const size_t number_slices, aggregated_executor_modes mode, const size_t gpu_id = 0) : max_slices(number_slices), current_slices(0), slices_exhausted(false), dealloc_counter(0), mode(mode), executor_slices_alive(false), @@ -959,22 +960,22 @@ template class Aggregated_Executor { current_continuation(hpx::make_ready_future()), last_stream_launch_done(hpx::make_ready_future()) {} // Not meant to be copied or moved - Aggregated_Executor(const Aggregated_Executor &other) = delete; - Aggregated_Executor &operator=(const Aggregated_Executor &other) = delete; - Aggregated_Executor(Aggregated_Executor &&other) = delete; - Aggregated_Executor &operator=(Aggregated_Executor &&other) = delete; + aggregated_executor(const aggregated_executor &other) = delete; + aggregated_executor &operator=(const aggregated_executor &other) = delete; + aggregated_executor(aggregated_executor &&other) = delete; + aggregated_executor &operator=(aggregated_executor &&other) = delete; }; template class allocator_slice { private: - typename Aggregated_Executor::executor_slice &executor_reference; - Aggregated_Executor &executor_parent; + typename aggregated_executor::executor_slice &executor_reference; + aggregated_executor &executor_parent; public: using value_type = T; allocator_slice( - typename Aggregated_Executor::executor_slice &executor) + typename aggregated_executor::executor_slice &executor) : executor_reference(executor), executor_parent(executor.parent) {} template explicit allocator_slice( @@ -1009,137 +1010,21 @@ operator!=(allocator_slice const &, return true; } -//=============================================================================== -//=============================================================================== -// Pool Strategy: - -template -class aggregation_pool { -public: - /// interface - template - static void init(size_t number_of_executors, size_t slices_per_executor, - aggregated_executor_modes mode, size_t num_devices = 1) { - if (is_initialized) { - throw std::runtime_error( - std::string("Trying to initialize cppuddle aggregation pool twice") + - " Agg pool name: " + std::string(kernelname)); - } - if (num_devices > cppuddle::max_number_gpus) { - throw std::runtime_error( - std::string( - "Trying to initialize aggregation with more devices than the " - "maximum number of GPUs given at compiletime") + - " Agg pool name: " + std::string(kernelname)); - } - number_devices = num_devices; - for (size_t gpu_id = 0; gpu_id < number_devices; gpu_id++) { - - std::lock_guard guard(instance()[gpu_id].pool_mutex); - assert(instance()[gpu_id].aggregation_executor_pool.empty()); - for (int i = 0; i < number_of_executors; i++) { - instance()[gpu_id].aggregation_executor_pool.emplace_back(slices_per_executor, - mode, gpu_id); - } - instance()[gpu_id].slices_per_executor = slices_per_executor; - instance()[gpu_id].mode = mode; - } - is_initialized = true; - } - - /// Will always return a valid executor slice - static 
decltype(auto) request_executor_slice(void) { - if (!is_initialized) { - throw std::runtime_error( - std::string("Trying to use cppuddle aggregation pool without first calling init") + - " Agg poolname: " + std::string(kernelname)); - } - const size_t gpu_id = cppuddle::get_device_id(number_devices); - /* const size_t gpu_id = 1; */ - std::lock_guard guard(instance()[gpu_id].pool_mutex); - assert(!instance()[gpu_id].aggregation_executor_pool.empty()); - std::optional::executor_slice>> - ret; - size_t local_id = (instance()[gpu_id].current_interface) % - instance()[gpu_id].aggregation_executor_pool.size(); - ret = instance()[gpu_id].aggregation_executor_pool[local_id].request_executor_slice(); - // Expected case: current aggregation executor is free - if (ret.has_value()) { - return ret; - } - // current interface is bad -> find free one - size_t abort_counter = 0; - const size_t abort_number = instance()[gpu_id].aggregation_executor_pool.size() + 1; - do { - local_id = (++(instance()[gpu_id].current_interface)) % // increment interface - instance()[gpu_id].aggregation_executor_pool.size(); - ret = - instance()[gpu_id].aggregation_executor_pool[local_id].request_executor_slice(); - if (ret.has_value()) { - return ret; - } - abort_counter++; - } while (abort_counter <= abort_number); - // Everything's busy -> create new aggregation executor (growing pool) OR - // return empty optional - if (instance()[gpu_id].growing_pool) { - instance()[gpu_id].aggregation_executor_pool.emplace_back( - instance()[gpu_id].slices_per_executor, instance()[gpu_id].mode, gpu_id); - instance()[gpu_id].current_interface = - instance()[gpu_id].aggregation_executor_pool.size() - 1; - assert(instance()[gpu_id].aggregation_executor_pool.size() < 20480); - ret = instance()[gpu_id] - .aggregation_executor_pool[instance()[gpu_id].current_interface] - .request_executor_slice(); - assert(ret.has_value()); // fresh executor -- should always have slices - // available - } - return ret; - } - -private: - std::deque> aggregation_executor_pool; - std::atomic current_interface{0}; - size_t slices_per_executor; - aggregated_executor_modes mode; - bool growing_pool{true}; - -private: - /// Required for dealing with adding elements to the deque of - /// aggregated_executors - aggregation_mutex_t pool_mutex; - /// Global access instance - static std::unique_ptr& instance(void) { - static std::unique_ptr pool_instances{ - new aggregation_pool[cppuddle::max_number_gpus]}; - return pool_instances; - } - static inline size_t number_devices = 1; - static inline bool is_initialized = false; - aggregation_pool() = default; - -public: - ~aggregation_pool() = default; - // Bunch of constructors we don't need - aggregation_pool(aggregation_pool const &other) = delete; - aggregation_pool &operator=(aggregation_pool const &other) = delete; - aggregation_pool(aggregation_pool &&other) = delete; - aggregation_pool &operator=(aggregation_pool &&other) = delete; -}; - +} // namespace detail } // namespace kernel_aggregation } // namespace cppuddle + + namespace hpx { namespace parallel { namespace execution { // TODO Unfortunately does not work that way! 
Create trait that works for Executor Slices with // compatible unlying executor types /* template */ - /* struct is_one_way_executor::executor_slice> */ + /* struct is_one_way_executor::executor_slice> */ /* : std::true_type */ /* {}; */ /* template */ - /* struct is_two_way_executor::executor_slice> */ + /* struct is_two_way_executor::executor_slice> */ /* : std::true_type */ /* {}; */ @@ -1147,12 +1032,12 @@ namespace hpx { namespace parallel { namespace execution { // Workaround for the meantime: Manually create traits for compatible types: template <> struct is_one_way_executor< - typename cppuddle::kernel_aggregation::Aggregated_Executor< + typename cppuddle::kernel_aggregation::detail::aggregated_executor< hpx::cuda::experimental::cuda_executor>::executor_slice> : std::true_type {}; template <> struct is_two_way_executor< - typename cppuddle::kernel_aggregation::Aggregated_Executor< + typename cppuddle::kernel_aggregation::detail::aggregated_executor< hpx::cuda::experimental::cuda_executor>::executor_slice> : std::true_type {}; #endif diff --git a/include/cppuddle/kernel_aggregation/kernel_aggregation_interface.hpp b/include/cppuddle/kernel_aggregation/kernel_aggregation_interface.hpp new file mode 100644 index 00000000..c7a3b633 --- /dev/null +++ b/include/cppuddle/kernel_aggregation/kernel_aggregation_interface.hpp @@ -0,0 +1,34 @@ +// Copyright (c) 2024 Gregor Daiß +// +// Distributed under the Boost Software License, Version 1.0. (See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#ifndef KERNEL_AGGREGATION_INTERFACE_HPP +#define KERNEL_AGGREGATION_INTERFACE_HPP + +#include "cppuddle/kernel_aggregation/detail/aggregation_executors_and_allocators.hpp" +#include "cppuddle/kernel_aggregation/detail/aggregation_executor_pools.hpp" + +namespace cppuddle { +namespace kernel_aggregation { + +using aggregated_executor_modes = + cppuddle::kernel_aggregation::detail::aggregated_executor_modes; + +template +using allocator_slice = + cppuddle::kernel_aggregation::detail::allocator_slice; + +template +using aggregated_executor = + cppuddle::kernel_aggregation::detail::aggregated_executor; + +template +using aggregation_pool = + cppuddle::kernel_aggregation::detail::aggregation_pool; + +} // namespace kernel_aggregation +} // namespace cppuddle + +#endif diff --git a/tests/work_aggregation_cpu_triad.cpp b/tests/work_aggregation_cpu_triad.cpp index fed34626..7bb455b0 100644 --- a/tests/work_aggregation_cpu_triad.cpp +++ b/tests/work_aggregation_cpu_triad.cpp @@ -5,9 +5,8 @@ #include #undef NDEBUG - -#include "../include/aggregation_manager.hpp" -#include "../include/cuda_buffer_util.hpp" +#include "cppuddle/memory_recycling/cuda_recycling_allocators.hpp" +#include "cppuddle/kernel_aggregation/kernel_aggregation_interface.hpp" #include @@ -101,7 +100,8 @@ int hpx_main(int argc, char *argv[]) { size_t number_underlying_executors{0}; bool print_launch_counter{false}; std::string executor_type_string{}; - Aggregated_Executor_Modes executor_mode{Aggregated_Executor_Modes::EAGER}; + cppuddle::kernel_aggregation::aggregated_executor_modes executor_mode{ + cppuddle::kernel_aggregation::aggregated_executor_modes::EAGER}; std::string filename{}; { try { @@ -161,11 +161,11 @@ int hpx_main(int argc, char *argv[]) { return hpx::finalize(); } if (executor_type_string == "EAGER") { - executor_mode = Aggregated_Executor_Modes::EAGER; + executor_mode = cppuddle::kernel_aggregation::aggregated_executor_modes::EAGER; } else if (executor_type_string == "STRICT") { - 
executor_mode = Aggregated_Executor_Modes::STRICT; + executor_mode = cppuddle::kernel_aggregation::aggregated_executor_modes::STRICT; } else if (executor_type_string == "ENDLESS") { - executor_mode = Aggregated_Executor_Modes::ENDLESS; + executor_mode = cppuddle::kernel_aggregation::aggregated_executor_modes::ENDLESS; } else { std::cerr << "ERROR: Unknown executor mode " << executor_type_string << "\n Valid choices are: EAGER,STRICT,ENDLESS" << std::endl; @@ -183,7 +183,7 @@ int hpx_main(int argc, char *argv[]) { stream_pool::init>( number_underlying_executors); static const char kernelname[] = "cpu_triad"; - using executor_pool = aggregation_pool>; executor_pool::init(number_aggregation_executors, max_slices, executor_mode); From e086ab666921e558169229dc1527f1409a258490 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20Dai=C3=9F?= Date: Sat, 9 Mar 2024 01:21:01 +0100 Subject: [PATCH 12/19] Separate headers for underlying allocators --- .../cuda_recycling_allocators.hpp | 92 +-------------- .../detail/cuda_underlying_allocators.hpp | 101 +++++++++++++++++ .../detail/hip_underlying_allocators.hpp | 107 ++++++++++++++++++ .../detail/sycl_underlying_allocators.hpp | 74 ++++++++++++ .../hip_recycling_allocators.hpp | 98 +--------------- .../sycl_recycling_allocators.hpp | 64 +---------- 6 files changed, 290 insertions(+), 246 deletions(-) create mode 100644 include/cppuddle/memory_recycling/detail/cuda_underlying_allocators.hpp create mode 100644 include/cppuddle/memory_recycling/detail/hip_underlying_allocators.hpp create mode 100644 include/cppuddle/memory_recycling/detail/sycl_underlying_allocators.hpp diff --git a/include/cppuddle/memory_recycling/cuda_recycling_allocators.hpp b/include/cppuddle/memory_recycling/cuda_recycling_allocators.hpp index 911948a3..7297955f 100644 --- a/include/cppuddle/memory_recycling/cuda_recycling_allocators.hpp +++ b/include/cppuddle/memory_recycling/cuda_recycling_allocators.hpp @@ -6,100 +6,14 @@ #ifndef CUDA_RECYCLING_ALLOCATORS_HPP #define CUDA_RECYCLING_ALLOCATORS_HPP -#include -#include -#include - #include "buffer_management_interface.hpp" +// import cuda_pinned_allocator and cuda_device_allocator +#include "detail/cuda_underlying_allocators.hpp" namespace cppuddle { namespace memory_recycling { -namespace detail { -/// Underlying host allocator for CUDA pinned memory -template struct cuda_pinned_allocator { - using value_type = T; - cuda_pinned_allocator() noexcept = default; - template - explicit cuda_pinned_allocator(cuda_pinned_allocator const &) noexcept {} - T *allocate(std::size_t n) { - T *data; - cudaError_t error = - cudaMallocHost(reinterpret_cast(&data), n * sizeof(T)); - if (error != cudaSuccess) { - std::string msg = - std::string( - "cuda_pinned_allocator failed due to cudaMallocHost failure : ") + - std::string(cudaGetErrorString(error)); - throw std::runtime_error(msg); - } - return data; - } - void deallocate(T *p, std::size_t n) { - cudaError_t error = cudaFreeHost(p); - if (error != cudaSuccess) { - std::string msg = - std::string( - "cuda_pinned_allocator failed due to cudaFreeHost failure : ") + - std::string(cudaGetErrorString(error)); - throw std::runtime_error(msg); - } - } -}; - -template -constexpr bool operator==(cuda_pinned_allocator const &, - cuda_pinned_allocator const &) noexcept { - return true; -} -template -constexpr bool operator!=(cuda_pinned_allocator const &, - cuda_pinned_allocator const &) noexcept { - return false; -} - -/// Underlying allocator for CUDA device memory -template struct cuda_device_allocator { - 
using value_type = T; - cuda_device_allocator() noexcept = default; - template - explicit cuda_device_allocator(cuda_device_allocator const &) noexcept {} - T *allocate(std::size_t n) { - T *data; - cudaError_t error = cudaMalloc(&data, n * sizeof(T)); - if (error != cudaSuccess) { - std::string msg = - std::string( - "cuda_device_allocator failed due to cudaMalloc failure : ") + - std::string(cudaGetErrorString(error)); - throw std::runtime_error(msg); - } - return data; - } - void deallocate(T *p, std::size_t n) { - cudaError_t error = cudaFree(p); - if (error != cudaSuccess) { - std::string msg = - std::string( - "cuda_device_allocator failed due to cudaFree failure : ") + - std::string(cudaGetErrorString(error)); - throw std::runtime_error(msg); - } - } -}; -template -constexpr bool operator==(cuda_device_allocator const &, - cuda_device_allocator const &) noexcept { - return true; -} -template -constexpr bool operator!=(cuda_device_allocator const &, - cuda_device_allocator const &) noexcept { - return false; -} -} // end namespace detail - - +// Tell cppuddle how to select the device for the cuda allocators namespace device_selection { /// GPU device selector using the CUDA API for pinned host allocations template diff --git a/include/cppuddle/memory_recycling/detail/cuda_underlying_allocators.hpp b/include/cppuddle/memory_recycling/detail/cuda_underlying_allocators.hpp new file mode 100644 index 00000000..ab1f8681 --- /dev/null +++ b/include/cppuddle/memory_recycling/detail/cuda_underlying_allocators.hpp @@ -0,0 +1,101 @@ +// Copyright (c) 2020-2024 Gregor Daiß +// +// Distributed under the Boost Software License, Version 1.0. (See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#ifndef CUDA_UNDERLYING_ALLOCATORS_HPP +#define CUDA_UNDERLYING_ALLOCATORS_HPP + +#include +#include +#include + +namespace cppuddle { +namespace memory_recycling { +namespace detail { +/// Underlying host allocator for CUDA pinned memory +template struct cuda_pinned_allocator { + using value_type = T; + cuda_pinned_allocator() noexcept = default; + template + explicit cuda_pinned_allocator(cuda_pinned_allocator const &) noexcept {} + T *allocate(std::size_t n) { + T *data; + cudaError_t error = + cudaMallocHost(reinterpret_cast(&data), n * sizeof(T)); + if (error != cudaSuccess) { + std::string msg = + std::string( + "cuda_pinned_allocator failed due to cudaMallocHost failure : ") + + std::string(cudaGetErrorString(error)); + throw std::runtime_error(msg); + } + return data; + } + void deallocate(T *p, std::size_t n) { + cudaError_t error = cudaFreeHost(p); + if (error != cudaSuccess) { + std::string msg = + std::string( + "cuda_pinned_allocator failed due to cudaFreeHost failure : ") + + std::string(cudaGetErrorString(error)); + throw std::runtime_error(msg); + } + } +}; + +template +constexpr bool operator==(cuda_pinned_allocator const &, + cuda_pinned_allocator const &) noexcept { + return true; +} +template +constexpr bool operator!=(cuda_pinned_allocator const &, + cuda_pinned_allocator const &) noexcept { + return false; +} + +/// Underlying allocator for CUDA device memory +template struct cuda_device_allocator { + using value_type = T; + cuda_device_allocator() noexcept = default; + template + explicit cuda_device_allocator(cuda_device_allocator const &) noexcept {} + T *allocate(std::size_t n) { + T *data; + cudaError_t error = cudaMalloc(&data, n * sizeof(T)); + if (error != cudaSuccess) { + std::string msg = + std::string( + "cuda_device_allocator failed 
due to cudaMalloc failure : ") + + std::string(cudaGetErrorString(error)); + throw std::runtime_error(msg); + } + return data; + } + void deallocate(T *p, std::size_t n) { + cudaError_t error = cudaFree(p); + if (error != cudaSuccess) { + std::string msg = + std::string( + "cuda_device_allocator failed due to cudaFree failure : ") + + std::string(cudaGetErrorString(error)); + throw std::runtime_error(msg); + } + } +}; +template +constexpr bool operator==(cuda_device_allocator const &, + cuda_device_allocator const &) noexcept { + return true; +} +template +constexpr bool operator!=(cuda_device_allocator const &, + cuda_device_allocator const &) noexcept { + return false; +} +} // end namespace detail +} // namespace memory_recycling +} // end namespace cppuddle + +#endif diff --git a/include/cppuddle/memory_recycling/detail/hip_underlying_allocators.hpp b/include/cppuddle/memory_recycling/detail/hip_underlying_allocators.hpp new file mode 100644 index 00000000..6668feaf --- /dev/null +++ b/include/cppuddle/memory_recycling/detail/hip_underlying_allocators.hpp @@ -0,0 +1,107 @@ +// Copyright (c) 2020-2024 Gregor Daiß +// +// Distributed under the Boost Software License, Version 1.0. (See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#ifndef HIP_UNDERLYING_ALLOCATORS_HPP +#define HIP_UNDERLYING_ALLOCATORS_HPP + +#include +#include +#include + +namespace cppuddle { +namespace memory_recycling { +namespace detail { +/// Underlying host allocator for HIP pinned memory +template struct hip_pinned_allocator { + using value_type = T; + hip_pinned_allocator() noexcept = default; + template + explicit hip_pinned_allocator(hip_pinned_allocator const &) noexcept {} + T *allocate(std::size_t n) { + T *data; + // hipError_t error = + // hipMallocHost(reinterpret_cast(&data), n * sizeof(T)); + + // Even though marked as deprecated, the HIP docs recommend using hipHostMalloc + // (not hipMallocHost) for async memcpys + // https://rocmdocs.amd.com/en/latest/ROCm_API_References/HIP_API/Memory-Management.html#hipmemcpyasync + hipError_t error = + hipHostMalloc(reinterpret_cast(&data), n * sizeof(T)); + if (error != hipSuccess) { + std::string msg = + std::string( + "hip_pinned_allocator failed due to hipMallocHost failure : ") + + std::string(hipGetErrorString(error)); + throw std::runtime_error(msg); + } + return data; + } + void deallocate(T *p, std::size_t n) { + hipError_t error = hipHostFree(p); + if (error != hipSuccess) { + std::string msg = + std::string( + "hip_pinned_allocator failed due to hipFreeHost failure : ") + + std::string(hipGetErrorString(error)); + throw std::runtime_error(msg); + } + } +}; +template +constexpr bool operator==(hip_pinned_allocator const &, + hip_pinned_allocator const &) noexcept { + return true; +} +template +constexpr bool operator!=(hip_pinned_allocator const &, + hip_pinned_allocator const &) noexcept { + return false; +} + +/// Underlying allocator for HIP device memory +template struct hip_device_allocator { + using value_type = T; + hip_device_allocator() noexcept = default; + template + explicit hip_device_allocator(hip_device_allocator const &) noexcept {} + T *allocate(std::size_t n) { + T *data; + hipError_t error = hipMalloc(&data, n * sizeof(T)); + if (error != hipSuccess) { + std::string msg = + std::string( + "hip_device_allocator failed due to hipMalloc failure : ") + + std::string(hipGetErrorString(error)); + throw std::runtime_error(msg); + } + return data; + } + void deallocate(T *p, std::size_t n) { + 
hipError_t error = hipFree(p); + if (error != hipSuccess) { + std::string msg = + std::string( + "hip_device_allocator failed due to hipFree failure : ") + + std::string(hipGetErrorString(error)); + throw std::runtime_error(msg); + } + } +}; +template +constexpr bool operator==(hip_device_allocator const &, + hip_device_allocator const &) noexcept { + return true; +} +template +constexpr bool operator!=(hip_device_allocator const &, + hip_device_allocator const &) noexcept { + return false; +} + +} // end namespace detail +} // namespace memory_recycling +} // end namespace cppuddle + +#endif diff --git a/include/cppuddle/memory_recycling/detail/sycl_underlying_allocators.hpp b/include/cppuddle/memory_recycling/detail/sycl_underlying_allocators.hpp new file mode 100644 index 00000000..1597eee7 --- /dev/null +++ b/include/cppuddle/memory_recycling/detail/sycl_underlying_allocators.hpp @@ -0,0 +1,74 @@ +// Copyright (c) 2020-2024 Gregor Daiß +// +// Distributed under the Boost Software License, Version 1.0. (See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#ifndef SYCL_UNDERLYING_ALLOCATORS_HPP +#define SYCL_UNDERLYING_ALLOCATORS_HPP + +#include +#include +#include + +namespace cppuddle { +namespace memory_recycling { +namespace detail { +/// Underlying host allocator for SYCL pinned memory (using the sycl::default_selector{}) +template struct sycl_host_default_allocator { + using value_type = T; + sycl_host_default_allocator() noexcept = default; + template + explicit sycl_host_default_allocator(sycl_host_default_allocator const &) noexcept {} + T *allocate(std::size_t n) { + static cl::sycl::queue default_queue(cl::sycl::default_selector{}); + T *data = cl::sycl::malloc_host(n, default_queue); + return data; + } + void deallocate(T *p, std::size_t n) { + static cl::sycl::queue default_queue(cl::sycl::default_selector{}); + cl::sycl::free(p, default_queue); + } +}; +template +constexpr bool operator==(sycl_host_default_allocator const &, + sycl_host_default_allocator const &) noexcept { + return true; +} +template +constexpr bool operator!=(sycl_host_default_allocator const &, + sycl_host_default_allocator const &) noexcept { + return false; +} + +/// Underlying allocator for SYCL device memory (using the sycl::default_selector{}) +template struct sycl_device_default_allocator { + using value_type = T; + sycl_device_default_allocator() noexcept = default; + template + explicit sycl_device_default_allocator(sycl_device_default_allocator const &) noexcept {} + T *allocate(std::size_t n) { + static cl::sycl::queue default_queue(cl::sycl::default_selector{}); + T *data = cl::sycl::malloc_device(n, default_queue); + return data; + } + void deallocate(T *p, std::size_t n) { + static cl::sycl::queue default_queue(cl::sycl::default_selector{}); + cl::sycl::free(p, default_queue); + } +}; +template +constexpr bool operator==(sycl_device_default_allocator const &, + sycl_device_default_allocator const &) noexcept { + return true; +} +template +constexpr bool operator!=(sycl_device_default_allocator const &, + sycl_device_default_allocator const &) noexcept { + return false; +} + +} // end namespace detail +} // namespace memory_recycling +} // end namespace cppuddle + +#endif diff --git a/include/cppuddle/memory_recycling/hip_recycling_allocators.hpp b/include/cppuddle/memory_recycling/hip_recycling_allocators.hpp index 36432820..e506ee2c 100644 --- a/include/cppuddle/memory_recycling/hip_recycling_allocators.hpp +++ 
b/include/cppuddle/memory_recycling/hip_recycling_allocators.hpp @@ -6,106 +6,14 @@ #ifndef HIP_RECYCLING_ALLOCATORS_HPP #define HIP_RECYCLING_ALLOCATORS_HPP -#include -#include -#include - #include "buffer_management_interface.hpp" +// import hip_pinned_allocator and hip_device_allocator +#include "detail/hip_underlying_allocators.hpp" namespace cppuddle { namespace memory_recycling { -namespace detail { -/// Underlying host allocator for HIP pinned memory -template struct hip_pinned_allocator { - using value_type = T; - hip_pinned_allocator() noexcept = default; - template - explicit hip_pinned_allocator(hip_pinned_allocator const &) noexcept {} - T *allocate(std::size_t n) { - T *data; - // hipError_t error = - // hipMallocHost(reinterpret_cast(&data), n * sizeof(T)); - - // Even though marked as deprecated, the HIP docs recommend using hipHostMalloc - // (not hipMallocHost) for async memcpys - // https://rocmdocs.amd.com/en/latest/ROCm_API_References/HIP_API/Memory-Management.html#hipmemcpyasync - hipError_t error = - hipHostMalloc(reinterpret_cast(&data), n * sizeof(T)); - if (error != hipSuccess) { - std::string msg = - std::string( - "hip_pinned_allocator failed due to hipMallocHost failure : ") + - std::string(hipGetErrorString(error)); - throw std::runtime_error(msg); - } - return data; - } - void deallocate(T *p, std::size_t n) { - hipError_t error = hipHostFree(p); - if (error != hipSuccess) { - std::string msg = - std::string( - "hip_pinned_allocator failed due to hipFreeHost failure : ") + - std::string(hipGetErrorString(error)); - throw std::runtime_error(msg); - } - } -}; -template -constexpr bool operator==(hip_pinned_allocator const &, - hip_pinned_allocator const &) noexcept { - return true; -} -template -constexpr bool operator!=(hip_pinned_allocator const &, - hip_pinned_allocator const &) noexcept { - return false; -} - -/// Underlying allocator for HIP device memory -template struct hip_device_allocator { - using value_type = T; - hip_device_allocator() noexcept = default; - template - explicit hip_device_allocator(hip_device_allocator const &) noexcept {} - T *allocate(std::size_t n) { - T *data; - hipError_t error = hipMalloc(&data, n * sizeof(T)); - if (error != hipSuccess) { - std::string msg = - std::string( - "hip_device_allocator failed due to hipMalloc failure : ") + - std::string(hipGetErrorString(error)); - throw std::runtime_error(msg); - } - return data; - } - void deallocate(T *p, std::size_t n) { - hipError_t error = hipFree(p); - if (error != hipSuccess) { - std::string msg = - std::string( - "hip_device_allocator failed due to hipFree failure : ") + - std::string(hipGetErrorString(error)); - throw std::runtime_error(msg); - } - } -}; -template -constexpr bool operator==(hip_device_allocator const &, - hip_device_allocator const &) noexcept { - return true; -} -template -constexpr bool operator!=(hip_device_allocator const &, - hip_device_allocator const &) noexcept { - return false; -} - -} // end namespace detail - - +// Tell cppuddle how to select the device for the hip allocators namespace device_selection { /// GPU device selector using the HIP API for pinned host allocations template diff --git a/include/cppuddle/memory_recycling/sycl_recycling_allocators.hpp b/include/cppuddle/memory_recycling/sycl_recycling_allocators.hpp index 7ea9999c..fd494bca 100644 --- a/include/cppuddle/memory_recycling/sycl_recycling_allocators.hpp +++ b/include/cppuddle/memory_recycling/sycl_recycling_allocators.hpp @@ -6,77 +6,17 @@ #ifndef SYCL_RECYCLING_ALLOCATORS_HPP 
#define SYCL_RECYCLING_ALLOCATORS_HPP -#include -#include -#include - #include "buffer_management_interface.hpp" +#include "detail/sycl_underlying_allocators.hpp" namespace cppuddle { namespace memory_recycling { namespace device_selection { // No MutliGPU support yet, hence no select_device_function required -static_assert(max_number_gpus == 1, "CPPuddle currently does not support MultiGPU SYCL builds!"); +static_assert(max_number_gpus <= 1, "CPPuddle currently does not support MultiGPU SYCL builds!"); } // namespace device_selection -namespace detail { -/// Underlying host allocator for SYCL pinned memory (using the sycl::default_selector{}) -template struct sycl_host_default_allocator { - using value_type = T; - sycl_host_default_allocator() noexcept = default; - template - explicit sycl_host_default_allocator(sycl_host_default_allocator const &) noexcept {} - T *allocate(std::size_t n) { - static cl::sycl::queue default_queue(cl::sycl::default_selector{}); - T *data = cl::sycl::malloc_host(n, default_queue); - return data; - } - void deallocate(T *p, std::size_t n) { - static cl::sycl::queue default_queue(cl::sycl::default_selector{}); - cl::sycl::free(p, default_queue); - } -}; -template -constexpr bool operator==(sycl_host_default_allocator const &, - sycl_host_default_allocator const &) noexcept { - return true; -} -template -constexpr bool operator!=(sycl_host_default_allocator const &, - sycl_host_default_allocator const &) noexcept { - return false; -} - -/// Underlying allocator for SYCL device memory (using the sycl::default_selector{}) -template struct sycl_device_default_allocator { - using value_type = T; - sycl_device_default_allocator() noexcept = default; - template - explicit sycl_device_default_allocator(sycl_device_default_allocator const &) noexcept {} - T *allocate(std::size_t n) { - static cl::sycl::queue default_queue(cl::sycl::default_selector{}); - T *data = cl::sycl::malloc_device(n, default_queue); - return data; - } - void deallocate(T *p, std::size_t n) { - static cl::sycl::queue default_queue(cl::sycl::default_selector{}); - cl::sycl::free(p, default_queue); - } -}; -template -constexpr bool operator==(sycl_device_default_allocator const &, - sycl_device_default_allocator const &) noexcept { - return true; -} -template -constexpr bool operator!=(sycl_device_default_allocator const &, - sycl_device_default_allocator const &) noexcept { - return false; -} - -} // end namespace detail - /// Recycling allocator for SYCL pinned host memory (default device) template ::value, int> = 0> using recycle_allocator_sycl_host = From a3fdeeddfdffde7e58137177dd6022f39df0b824 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20Dai=C3=9F?= Date: Sat, 9 Mar 2024 01:21:21 +0100 Subject: [PATCH 13/19] Adapt aggregation tests to interface changes --- tests/work_aggregation_cpu_triad.cpp | 1 - tests/work_aggregation_cuda_triad.cpp | 46 ++++++++++++++------------- 2 files changed, 24 insertions(+), 23 deletions(-) diff --git a/tests/work_aggregation_cpu_triad.cpp b/tests/work_aggregation_cpu_triad.cpp index 7bb455b0..98159ea0 100644 --- a/tests/work_aggregation_cpu_triad.cpp +++ b/tests/work_aggregation_cpu_triad.cpp @@ -5,7 +5,6 @@ #include #undef NDEBUG -#include "cppuddle/memory_recycling/cuda_recycling_allocators.hpp" #include "cppuddle/kernel_aggregation/kernel_aggregation_interface.hpp" #include diff --git a/tests/work_aggregation_cuda_triad.cpp b/tests/work_aggregation_cuda_triad.cpp index f3f6ec92..63596423 100644 --- a/tests/work_aggregation_cuda_triad.cpp +++ 
b/tests/work_aggregation_cuda_triad.cpp @@ -7,11 +7,11 @@ //#undef NDEBUG #include -#include "../include/aggregation_manager.hpp" -#include "../include/cuda_buffer_util.hpp" - #include +#include "cppuddle/memory_recycling/cuda_recycling_allocators.hpp" +#include "cppuddle/kernel_aggregation/kernel_aggregation_interface.hpp" + //=============================================================================== @@ -19,12 +19,14 @@ // Stream benchmark template -__global__ void __launch_bounds__(1024, 2) triad_kernel(float_t *A, const float_t *B, const float_t *C, const float_t scalar, const size_t start_id, const size_t kernel_size, const size_t problem_size) { +__global__ void __launch_bounds__(1024, 2) + triad_kernel(float_t *A, const float_t *B, const float_t *C, + const float_t scalar, const size_t start_id, + const size_t kernel_size, const size_t problem_size) { const size_t i = start_id + blockIdx.x * blockDim.x + threadIdx.x; A[i] = B[i] + scalar * C[i]; } - //=============================================================================== //=============================================================================== int hpx_main(int argc, char *argv[]) { @@ -37,7 +39,8 @@ int hpx_main(int argc, char *argv[]) { size_t number_underlying_executors{0}; bool print_launch_counter{false}; std::string executor_type_string{}; - Aggregated_Executor_Modes executor_mode{Aggregated_Executor_Modes::EAGER}; + cppuddle::kernel_aggregation::aggregated_executor_modes executor_mode{ + cppuddle::kernel_aggregation::aggregated_executor_modes::EAGER}; std::string filename{}; { try { @@ -97,11 +100,11 @@ int hpx_main(int argc, char *argv[]) { return hpx::finalize(); } if (executor_type_string == "EAGER") { - executor_mode = Aggregated_Executor_Modes::EAGER; + executor_mode = cppuddle::kernel_aggregation::aggregated_executor_modes::EAGER; } else if (executor_type_string == "STRICT") { - executor_mode = Aggregated_Executor_Modes::STRICT; + executor_mode = cppuddle::kernel_aggregation::aggregated_executor_modes::STRICT; } else if (executor_type_string == "ENDLESS") { - executor_mode = Aggregated_Executor_Modes::ENDLESS; + executor_mode = cppuddle::kernel_aggregation::aggregated_executor_modes::ENDLESS; } else { std::cerr << "ERROR: Unknown executor mode " << executor_type_string << "\n Valid choices are: EAGER,STRICT,ENDLESS" << std::endl; @@ -122,7 +125,7 @@ int hpx_main(int argc, char *argv[]) { stream_pool::init>( number_underlying_executors, 0, true); static const char kernelname2[] = "cuda_triad"; - using executor_pool = aggregation_pool>; executor_pool::init(number_aggregation_executors, max_slices, executor_mode); @@ -147,9 +150,9 @@ int hpx_main(int argc, char *argv[]) { std::vector A(problem_size, 0.0); std::vector B(problem_size, 2.0); std::vector C(problem_size, 1.0); - recycler::cuda_device_buffer device_A(problem_size, 0); - recycler::cuda_device_buffer device_B(problem_size, 0); - recycler::cuda_device_buffer device_C(problem_size, 0); + cppuddle::memory_recycling::cuda_device_buffer device_A(problem_size, 0); + cppuddle::memory_recycling::cuda_device_buffer device_B(problem_size, 0); + cppuddle::memory_recycling::cuda_device_buffer device_C(problem_size, 0); cudaMemcpy(device_A.device_side_buffer, A.data(), problem_size * sizeof(float_t), cudaMemcpyHostToDevice); cudaMemcpy(device_B.device_side_buffer, B.data(), @@ -196,17 +199,16 @@ int hpx_main(int argc, char *argv[]) { auto slice_exec = fut.get(); auto alloc_host = slice_exec.template make_allocator< - float_t, 
recycler::detail::cuda_pinned_allocator>(); + float_t, cppuddle::memory_recycling::detail::cuda_pinned_allocator>(); auto alloc_device = slice_exec.template make_allocator< - float_t, recycler::detail::cuda_device_allocator>(); + float_t, cppuddle::memory_recycling::detail::cuda_device_allocator>(); // Start the actual task - // todo -- one slice gets a buffer that's not vaild anymore std::vector local_A( slice_exec.number_slices * kernel_size, float_t{}, alloc_host); - recycler::cuda_aggregated_device_buffer device_A(slice_exec.number_slices * kernel_size, alloc_device); @@ -214,7 +216,7 @@ int hpx_main(int argc, char *argv[]) { std::vector local_B( slice_exec.number_slices * kernel_size, float_t{}, alloc_host); - recycler::cuda_aggregated_device_buffer device_B(slice_exec.number_slices * kernel_size, alloc_device); @@ -222,7 +224,7 @@ int hpx_main(int argc, char *argv[]) { std::vector local_C( slice_exec.number_slices * kernel_size, float_t{}, alloc_host); - recycler::cuda_aggregated_device_buffer device_C(slice_exec.number_slices * kernel_size, alloc_device); @@ -317,9 +319,9 @@ int hpx_main(int argc, char *argv[]) { std::vector A(problem_size, 0.0); std::vector B(problem_size, 2.0); std::vector C(problem_size, 1.0); - recycler::cuda_device_buffer device_A(problem_size, 0); - recycler::cuda_device_buffer device_B(problem_size, 0); - recycler::cuda_device_buffer device_C(problem_size, 0); + cppuddle::memory_recycling::cuda_device_buffer device_A(problem_size, 0); + cppuddle::memory_recycling::cuda_device_buffer device_B(problem_size, 0); + cppuddle::memory_recycling::cuda_device_buffer device_C(problem_size, 0); cudaMemcpy(device_A.device_side_buffer, A.data(), problem_size * sizeof(float_t), cudaMemcpyHostToDevice); cudaMemcpy(device_B.device_side_buffer, B.data(), From 7d8e428a59ec8545f451b19d072cf1739c64ca45 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20Dai=C3=9F?= Date: Sat, 9 Mar 2024 02:19:47 +0100 Subject: [PATCH 14/19] Separate hip/cuda buffer into their own headers Also contains a fix for the aggregation failure test (just re-enabling the test by defining DEBUG_AGGREGATION_CALLS within the test itself) --- .../aggregation_executors_and_allocators.hpp | 4 ++ .../cuda_recycling_allocators.hpp | 48 -------------- .../hip_recycling_allocators.hpp | 48 -------------- .../util/cuda_recycling_device_buffer.hpp | 66 +++++++++++++++++++ .../util/hip_recycling_device_buffer.hpp | 65 ++++++++++++++++++ .../{ => util}/recycling_kokkos_view.hpp | 2 +- include/cuda_buffer_util.hpp | 1 + include/hip_buffer_util.hpp | 3 +- include/kokkos_buffer_util.hpp | 2 +- ...llocator_kokkos_executor_for_loop_test.cpp | 2 +- tests/allocator_kokkos_test.cpp | 2 +- tests/stream_test.hpp | 1 + tests/work_aggregation_cuda_triad.cpp | 3 +- tests/work_aggregation_test.cpp | 46 +++++++------ 14 files changed, 171 insertions(+), 122 deletions(-) create mode 100644 include/cppuddle/memory_recycling/util/cuda_recycling_device_buffer.hpp create mode 100644 include/cppuddle/memory_recycling/util/hip_recycling_device_buffer.hpp rename include/cppuddle/memory_recycling/{ => util}/recycling_kokkos_view.hpp (98%) diff --git a/include/cppuddle/kernel_aggregation/detail/aggregation_executors_and_allocators.hpp b/include/cppuddle/kernel_aggregation/detail/aggregation_executors_and_allocators.hpp index 5826c2c3..43f3c681 100644 --- a/include/cppuddle/kernel_aggregation/detail/aggregation_executors_and_allocators.hpp +++ b/include/cppuddle/kernel_aggregation/detail/aggregation_executors_and_allocators.hpp @@ -11,6 +11,10 @@ 
#endif #include +// When defined, CPPuddle will run more checks +// about the order of aggregated method calls. +// Best defined before including this header when needed +// (hence commented out here) //#define DEBUG_AGGREGATION_CALLS 1 #include diff --git a/include/cppuddle/memory_recycling/cuda_recycling_allocators.hpp b/include/cppuddle/memory_recycling/cuda_recycling_allocators.hpp index 7297955f..b47a4fe2 100644 --- a/include/cppuddle/memory_recycling/cuda_recycling_allocators.hpp +++ b/include/cppuddle/memory_recycling/cuda_recycling_allocators.hpp @@ -36,54 +36,6 @@ template ::value, int> = 0> using recycle_allocator_cuda_device = detail::recycle_allocator>; -/// RAII wrapper for CUDA device memory -template ::value, int> = 0> -struct cuda_device_buffer { - recycle_allocator_cuda_device allocator; - T *device_side_buffer; - size_t number_of_elements; - - cuda_device_buffer(const size_t number_of_elements, const size_t device_id = 0) - : allocator{device_id}, number_of_elements(number_of_elements) { - assert(device_id < max_number_gpus); - device_side_buffer = - allocator.allocate(number_of_elements); - } - ~cuda_device_buffer() { - allocator.deallocate(device_side_buffer, number_of_elements); - } - // not yet implemented - cuda_device_buffer(cuda_device_buffer const &other) = delete; - cuda_device_buffer operator=(cuda_device_buffer const &other) = delete; - cuda_device_buffer(cuda_device_buffer const &&other) = delete; - cuda_device_buffer operator=(cuda_device_buffer const &&other) = delete; - -}; - -/// RAII wrapper for CUDA device memory using a passed aggregated allocator -template ::value, int> = 0> -struct cuda_aggregated_device_buffer { - T *device_side_buffer; - size_t number_of_elements; - cuda_aggregated_device_buffer(size_t number_of_elements, Host_Allocator &alloc) - : number_of_elements(number_of_elements), alloc(alloc) { - device_side_buffer = - alloc.allocate(number_of_elements); - } - ~cuda_aggregated_device_buffer() { - alloc.deallocate(device_side_buffer, number_of_elements); - } - // not yet implemented - cuda_aggregated_device_buffer(cuda_aggregated_device_buffer const &other) = delete; - cuda_aggregated_device_buffer operator=(cuda_aggregated_device_buffer const &other) = delete; - cuda_aggregated_device_buffer(cuda_aggregated_device_buffer const &&other) = delete; - cuda_aggregated_device_buffer operator=(cuda_aggregated_device_buffer const &&other) = delete; - -private: - Host_Allocator &alloc; // will stay valid for the entire aggregation region and hence - // for the entire lifetime of this buffer -}; - } // namespace memory_recycling } // end namespace cppuddle #endif diff --git a/include/cppuddle/memory_recycling/hip_recycling_allocators.hpp b/include/cppuddle/memory_recycling/hip_recycling_allocators.hpp index e506ee2c..13b5241b 100644 --- a/include/cppuddle/memory_recycling/hip_recycling_allocators.hpp +++ b/include/cppuddle/memory_recycling/hip_recycling_allocators.hpp @@ -36,54 +36,6 @@ template ::value, int> = 0> using recycle_allocator_hip_device = detail::recycle_allocator>; -/// RAII wrapper for HIP device memory -template ::value, int> = 0> -struct hip_device_buffer { - recycle_allocator_hip_device allocator; - T *device_side_buffer; - size_t number_of_elements; - - hip_device_buffer(size_t number_of_elements, size_t device_id) - : allocator{device_id}, number_of_elements(number_of_elements) { - assert(device_id < max_number_gpus); - device_side_buffer = - allocator.allocate(number_of_elements); - } - ~hip_device_buffer() { - 
allocator.deallocate(device_side_buffer, number_of_elements); - } - // not yet implemented - hip_device_buffer(hip_device_buffer const &other) = delete; - hip_device_buffer operator=(hip_device_buffer const &other) = delete; - hip_device_buffer(hip_device_buffer const &&other) = delete; - hip_device_buffer operator=(hip_device_buffer const &&other) = delete; - -}; - -/// RAII wrapper for CUDA device memory using a passed aggregated allocator -template ::value, int> = 0> -struct hip_aggregated_device_buffer { - T *device_side_buffer; - size_t number_of_elements; - hip_aggregated_device_buffer(size_t number_of_elements, Host_Allocator &alloc) - : number_of_elements(number_of_elements), alloc(alloc) { - device_side_buffer = - alloc.allocate(number_of_elements); - } - ~hip_aggregated_device_buffer() { - alloc.deallocate(device_side_buffer, number_of_elements); - } - // not yet implemented - hip_aggregated_device_buffer(hip_aggregated_device_buffer const &other) = delete; - hip_aggregated_device_buffer operator=(hip_aggregated_device_buffer const &other) = delete; - hip_aggregated_device_buffer(hip_aggregated_device_buffer const &&other) = delete; - hip_aggregated_device_buffer operator=(hip_aggregated_device_buffer const &&other) = delete; - -private: - Host_Allocator &alloc; // will stay valid for the entire aggregation region and hence - // for the entire lifetime of this buffer -}; - } // namespace memory_recycling } // end namespace cppuddle #endif diff --git a/include/cppuddle/memory_recycling/util/cuda_recycling_device_buffer.hpp b/include/cppuddle/memory_recycling/util/cuda_recycling_device_buffer.hpp new file mode 100644 index 00000000..dbd7e4c8 --- /dev/null +++ b/include/cppuddle/memory_recycling/util/cuda_recycling_device_buffer.hpp @@ -0,0 +1,66 @@ +// Copyright (c) 2020-2024 Gregor Daiß +// +// Distributed under the Boost Software License, Version 1.0. 
(See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#ifndef CUDA_RECYCLING_BUFFER_HPP +#define CUDA_RECYCLING_BUFFER_HPP + +// import recycle_allocator_cuda_device +#include "cppuddle/memory_recycling/cuda_recycling_allocators.hpp" + +namespace cppuddle { +namespace memory_recycling { + + +/// RAII wrapper for CUDA device memory +template ::value, int> = 0> +struct cuda_device_buffer { + recycle_allocator_cuda_device allocator; + T *device_side_buffer; + size_t number_of_elements; + + cuda_device_buffer(const size_t number_of_elements, const size_t device_id = 0) + : allocator{device_id}, number_of_elements(number_of_elements) { + assert(device_id < max_number_gpus); + device_side_buffer = + allocator.allocate(number_of_elements); + } + ~cuda_device_buffer() { + allocator.deallocate(device_side_buffer, number_of_elements); + } + // not yet implemented + cuda_device_buffer(cuda_device_buffer const &other) = delete; + cuda_device_buffer operator=(cuda_device_buffer const &other) = delete; + cuda_device_buffer(cuda_device_buffer const &&other) = delete; + cuda_device_buffer operator=(cuda_device_buffer const &&other) = delete; + +}; + +/// RAII wrapper for CUDA device memory using a passed aggregated allocator +template ::value, int> = 0> +struct cuda_aggregated_device_buffer { + T *device_side_buffer; + size_t number_of_elements; + cuda_aggregated_device_buffer(size_t number_of_elements, Host_Allocator &alloc) + : number_of_elements(number_of_elements), alloc(alloc) { + device_side_buffer = + alloc.allocate(number_of_elements); + } + ~cuda_aggregated_device_buffer() { + alloc.deallocate(device_side_buffer, number_of_elements); + } + // not yet implemented + cuda_aggregated_device_buffer(cuda_aggregated_device_buffer const &other) = delete; + cuda_aggregated_device_buffer operator=(cuda_aggregated_device_buffer const &other) = delete; + cuda_aggregated_device_buffer(cuda_aggregated_device_buffer const &&other) = delete; + cuda_aggregated_device_buffer operator=(cuda_aggregated_device_buffer const &&other) = delete; + +private: + Host_Allocator &alloc; // will stay valid for the entire aggregation region and hence + // for the entire lifetime of this buffer +}; + +} // namespace memory_recycling +} // end namespace cppuddle +#endif diff --git a/include/cppuddle/memory_recycling/util/hip_recycling_device_buffer.hpp b/include/cppuddle/memory_recycling/util/hip_recycling_device_buffer.hpp new file mode 100644 index 00000000..7f04e3f7 --- /dev/null +++ b/include/cppuddle/memory_recycling/util/hip_recycling_device_buffer.hpp @@ -0,0 +1,65 @@ +// Copyright (c) 2020-2024 Gregor Daiß +// +// Distributed under the Boost Software License, Version 1.0.
(See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#ifndef HIP_RECYCLING_BUFFER_HPP +#define HIP_RECYCLING_BUFFER_HPP + +// import recycle_allocator_hip_device +#include "cppuddle/memory_recycling/hip_recycling_allocators.hpp" + +namespace cppuddle { +namespace memory_recycling { + +/// RAII wrapper for HIP device memory +template ::value, int> = 0> +struct hip_device_buffer { + recycle_allocator_hip_device allocator; + T *device_side_buffer; + size_t number_of_elements; + + hip_device_buffer(size_t number_of_elements, size_t device_id) + : allocator{device_id}, number_of_elements(number_of_elements) { + assert(device_id < max_number_gpus); + device_side_buffer = + allocator.allocate(number_of_elements); + } + ~hip_device_buffer() { + allocator.deallocate(device_side_buffer, number_of_elements); + } + // not yet implemented + hip_device_buffer(hip_device_buffer const &other) = delete; + hip_device_buffer operator=(hip_device_buffer const &other) = delete; + hip_device_buffer(hip_device_buffer const &&other) = delete; + hip_device_buffer operator=(hip_device_buffer const &&other) = delete; + +}; + +/// RAII wrapper for HIP device memory using a passed aggregated allocator +template ::value, int> = 0> +struct hip_aggregated_device_buffer { + T *device_side_buffer; + size_t number_of_elements; + hip_aggregated_device_buffer(size_t number_of_elements, Host_Allocator &alloc) + : number_of_elements(number_of_elements), alloc(alloc) { + device_side_buffer = + alloc.allocate(number_of_elements); + } + ~hip_aggregated_device_buffer() { + alloc.deallocate(device_side_buffer, number_of_elements); + } + // not yet implemented + hip_aggregated_device_buffer(hip_aggregated_device_buffer const &other) = delete; + hip_aggregated_device_buffer operator=(hip_aggregated_device_buffer const &other) = delete; + hip_aggregated_device_buffer(hip_aggregated_device_buffer const &&other) = delete; + hip_aggregated_device_buffer operator=(hip_aggregated_device_buffer const &&other) = delete; + +private: + Host_Allocator &alloc; // will stay valid for the entire aggregation region and hence + // for the entire lifetime of this buffer +}; + +} // namespace memory_recycling +} // end namespace cppuddle +#endif diff --git a/include/cppuddle/memory_recycling/recycling_kokkos_view.hpp b/include/cppuddle/memory_recycling/util/recycling_kokkos_view.hpp similarity index 98% rename from include/cppuddle/memory_recycling/recycling_kokkos_view.hpp rename to include/cppuddle/memory_recycling/util/recycling_kokkos_view.hpp index 98ce2799..1f0ed950 100644 --- a/include/cppuddle/memory_recycling/recycling_kokkos_view.hpp +++ b/include/cppuddle/memory_recycling/util/recycling_kokkos_view.hpp @@ -9,7 +9,7 @@ #include #include -#include "buffer_management_interface.hpp" +#include "cppuddle/memory_recycling/buffer_management_interface.hpp" namespace cppuddle { diff --git a/include/cuda_buffer_util.hpp b/include/cuda_buffer_util.hpp index 8d004bef..7aa44c9a 100644 --- a/include/cuda_buffer_util.hpp +++ b/include/cuda_buffer_util.hpp @@ -8,6 +8,7 @@ #include "buffer_manager.hpp" #include "cppuddle/memory_recycling/cuda_recycling_allocators.hpp" +#include "cppuddle/memory_recycling/util/cuda_recycling_device_buffer.hpp" namespace recycler { namespace detail { diff --git a/include/hip_buffer_util.hpp b/include/hip_buffer_util.hpp index 3f0b3034..dfd31cdc 100644 --- a/include/hip_buffer_util.hpp +++ b/include/hip_buffer_util.hpp @@ -6,7 +6,8 @@ #ifndef HIP_BUFFER_UTIL_HPP #define
HIP_BUFFER_UTIL_HPP -#include "/cppuddle/memory_recycling/hip_recycling_allocators.hpp" +#include "cppuddle/memory_recycling/hip_recycling_allocators.hpp" +#include "cppuddle/memory_recycling/util/hip_recycling_device_buffer.hpp" namespace recycler { diff --git a/include/kokkos_buffer_util.hpp b/include/kokkos_buffer_util.hpp index 54736ebe..66d1f8c4 100644 --- a/include/kokkos_buffer_util.hpp +++ b/include/kokkos_buffer_util.hpp @@ -5,7 +5,7 @@ #ifndef KOKKOS_BUFFER_UTIL_HPP #define KOKKOS_BUFFER_UTIL_HPP -#include "cppuddle/memory_recycling/recycling_kokkos_view.hpp" +#include "cppuddle/memory_recycling/util/recycling_kokkos_view.hpp" namespace recycler { diff --git a/tests/allocator_kokkos_executor_for_loop_test.cpp b/tests/allocator_kokkos_executor_for_loop_test.cpp index c38294d7..ad184ff5 100644 --- a/tests/allocator_kokkos_executor_for_loop_test.cpp +++ b/tests/allocator_kokkos_executor_for_loop_test.cpp @@ -23,7 +23,7 @@ #include "cppuddle/memory_recycling/std_recycling_allocators.hpp" #include "cppuddle/memory_recycling/cuda_recycling_allocators.hpp" -#include "cppuddle/memory_recycling/recycling_kokkos_view.hpp" +#include "cppuddle/memory_recycling/util/recycling_kokkos_view.hpp" // Assert during Release builds as well for this file: #undef NDEBUG diff --git a/tests/allocator_kokkos_test.cpp b/tests/allocator_kokkos_test.cpp index 5fb780e5..e231b557 100644 --- a/tests/allocator_kokkos_test.cpp +++ b/tests/allocator_kokkos_test.cpp @@ -23,7 +23,7 @@ #include "cppuddle/memory_recycling/std_recycling_allocators.hpp" #include "cppuddle/memory_recycling/cuda_recycling_allocators.hpp" -#include "cppuddle/memory_recycling/recycling_kokkos_view.hpp" +#include "cppuddle/memory_recycling/util/recycling_kokkos_view.hpp" using kokkos_array = Kokkos::View; diff --git a/tests/stream_test.hpp b/tests/stream_test.hpp index b793fe9c..63f25b27 100644 --- a/tests/stream_test.hpp +++ b/tests/stream_test.hpp @@ -10,6 +10,7 @@ #include #include #include "cppuddle/memory_recycling/cuda_recycling_allocators.hpp" +#include "cppuddle/memory_recycling/util/cuda_recycling_device_buffer.hpp" #include "cppuddle/executor_recycling/executor_pools_interface.hpp"" template diff --git a/tests/work_aggregation_cuda_triad.cpp b/tests/work_aggregation_cuda_triad.cpp index 63596423..f04e04a4 100644 --- a/tests/work_aggregation_cuda_triad.cpp +++ b/tests/work_aggregation_cuda_triad.cpp @@ -10,10 +10,9 @@ #include #include "cppuddle/memory_recycling/cuda_recycling_allocators.hpp" +#include "cppuddle/memory_recycling/util/cuda_recycling_device_buffer.hpp" #include "cppuddle/kernel_aggregation/kernel_aggregation_interface.hpp" - - //=============================================================================== //=============================================================================== // Stream benchmark diff --git a/tests/work_aggregation_test.cpp b/tests/work_aggregation_test.cpp index 25455633..7f5664f5 100644 --- a/tests/work_aggregation_test.cpp +++ b/tests/work_aggregation_test.cpp @@ -1,4 +1,4 @@ -// Copyright (c) 2022-2022 Gregor Daiß +// Copyright (c) 2022-2024 Gregor Daiß // // Distributed under the Boost Software License, Version 1.0. 
(See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) @@ -9,11 +9,15 @@ #include #include #include -#include "../include/aggregation_manager.hpp" -#include "../include/cuda_buffer_util.hpp" #include +#include "cppuddle/memory_recycling/cuda_recycling_allocators.hpp" +#include "cppuddle/memory_recycling/util/cuda_recycling_device_buffer.hpp" +#define DEBUG_AGGREGATION_CALLS 1 // enables checks if aggregated function calls are + // compatible across all participating tasks + // Must be defined before including the aggregation: +#include "cppuddle/kernel_aggregation/kernel_aggregation_interface.hpp" //=============================================================================== //=============================================================================== @@ -114,9 +118,9 @@ namespace hpx { namespace parallel { namespace execution { void sequential_test(void) { static const char kernelname[] = "kernel1"; - using kernel_pool1 = aggregation_pool>; - kernel_pool1::init(8, 2, Aggregated_Executor_Modes::STRICT); + kernel_pool1::init(8, 2, cppuddle::kernel_aggregation::aggregated_executor_modes::STRICT); // Sequential test hpx::cout << "Sequential test with all executor slices" << std::endl; hpx::cout << "----------------------------------------" << std::endl; @@ -260,8 +264,8 @@ void interruption_test(void) { hpx::cout << "Sequential test with interruption:" << std::endl; hpx::cout << "----------------------------------" << std::endl; { - Aggregated_Executor agg_exec{ - 4, Aggregated_Executor_Modes::EAGER}; + cppuddle::kernel_aggregation::aggregated_executor agg_exec{ + 4, cppuddle::kernel_aggregation::aggregated_executor_modes::EAGER}; std::vector> slices_done_futs; auto slice_fut1 = agg_exec.request_executor_slice(); @@ -326,8 +330,8 @@ void failure_test(bool type_error) { hpx::cout << "------------------------------------------------------" << std::endl; { - Aggregated_Executor agg_exec{ - 4, Aggregated_Executor_Modes::STRICT}; + cppuddle::kernel_aggregation::aggregated_executor agg_exec{ + 4, cppuddle::kernel_aggregation::aggregated_executor_modes::STRICT}; auto slice_fut1 = agg_exec.request_executor_slice(); @@ -405,9 +409,10 @@ void pointer_add_test(void) { hpx::cout << "--------------------------------------------------------" << std::endl; static const char kernelname2[] = "kernel2"; - using kernel_pool2 = aggregation_pool>; - kernel_pool2::init(8, 2, Aggregated_Executor_Modes::STRICT); + using kernel_pool2 = cppuddle::kernel_aggregation::aggregation_pool< + kernelname2, Dummy_Executor, round_robin_pool>; + kernel_pool2::init( + 8, 2, cppuddle::kernel_aggregation::aggregated_executor_modes::STRICT); { std::vector erg(512); std::vector> slices_done_futs; @@ -602,10 +607,11 @@ void references_add_test(void) { { /*Aggregated_Executor agg_exec{ 4, Aggregated_Executor_Modes::STRICT};*/ - auto &agg_exec = - std::get<0>(stream_pool::get_interface< - Aggregated_Executor, - round_robin_pool>>(0)); + auto &agg_exec = std::get<0>( + stream_pool::get_interface< + cppuddle::kernel_aggregation::aggregated_executor, + round_robin_pool>>(0)); std::vector erg(512); std::vector> slices_done_futs; @@ -831,9 +837,11 @@ int hpx_main(int argc, char *argv[]) { 8, 0, false); stream_pool::init>(8); - stream_pool::init, - round_robin_pool>>( - 8, 4, Aggregated_Executor_Modes::STRICT); + stream_pool::init< + cppuddle::kernel_aggregation::aggregated_executor, + round_robin_pool< + cppuddle::kernel_aggregation::aggregated_executor>>( + 8, 4, 
cppuddle::kernel_aggregation::aggregated_executor_modes::STRICT); /*hpx::cuda::experimental::cuda_executor executor1 = std::get<0>(stream_pool::get_interface< hpx::cuda::experimental::cuda_executor, From 72b486c3430035e9b05502d6dddc9e5ffb4f30e7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20Dai=C3=9F?= Date: Sat, 9 Mar 2024 02:55:28 +0100 Subject: [PATCH 15/19] Clean legacy calls from aggregation code --- .../aggregation_executors_and_allocators.hpp | 25 ++++++++---- tests/work_aggregation_cpu_triad.cpp | 12 ++++-- tests/work_aggregation_cuda_triad.cpp | 11 ++++-- tests/work_aggregation_test.cpp | 38 +++++++++++-------- 4 files changed, 56 insertions(+), 30 deletions(-) diff --git a/include/cppuddle/kernel_aggregation/detail/aggregation_executors_and_allocators.hpp b/include/cppuddle/kernel_aggregation/detail/aggregation_executors_and_allocators.hpp index 43f3c681..dfc76622 100644 --- a/include/cppuddle/kernel_aggregation/detail/aggregation_executors_and_allocators.hpp +++ b/include/cppuddle/kernel_aggregation/detail/aggregation_executors_and_allocators.hpp @@ -52,9 +52,11 @@ #include #include -#include "../include/buffer_manager.hpp" -#include "../include/stream_manager.hpp" #include "cppuddle/common/config.hpp" +// get direct access to the buffer management +#include "cppuddle/memory_recycling/detail/buffer_management.hpp" +// get normal access to the executor pools +#include "cppuddle/executor_recycling/executor_pools_interface.hpp" #ifndef CPPUDDLE_HAVE_HPX_MUTEX #pragma message \ @@ -406,7 +408,9 @@ template class aggregated_executor { /// Wrapper to the executor interface from the stream pool /// Automatically hooks into the stream_pools reference counting /// for cpu/gpu load balancing - std::unique_ptr>> executor_wrapper; + std::unique_ptr>> + executor_wrapper; public: size_t gpu_id; @@ -849,9 +853,14 @@ template class aggregated_executor { if (local_slice_id == 1) { // Redraw executor assert(!executor_wrapper); - stream_pool::select_device>(gpu_id); + cppuddle::executor_recycling::executor_pool::select_device< + Executor, cppuddle::executor_recycling::round_robin_pool_impl>( + gpu_id); executor_wrapper.reset( - new stream_interface>(gpu_id)); + new cppuddle::executor_recycling::executor_interface< + Executor, + cppuddle::executor_recycling::round_robin_pool_impl>( + gpu_id)); // Renew promise that all slices will be ready as the primary launch // criteria...
hpx::lcos::shared_future fut; @@ -860,8 +869,10 @@ template class aggregated_executor { // Fallback launch condidtion: Launch as soon as the underlying stream // is ready /* auto slices_full_fut = slices_full_promise.get_future(); */ - stream_pool::select_device>(gpu_id); - auto exec_fut = (*executor_wrapper).get_future(); + cppuddle::executor_recycling::executor_pool::select_device< + Executor, + cppuddle::executor_recycling::round_robin_pool_impl>(gpu_id); + auto exec_fut = (*executor_wrapper).get_future(); /* auto fut = hpx::when_any(exec_fut, slices_full_fut); */ fut = std::move(exec_fut); } else { diff --git a/tests/work_aggregation_cpu_triad.cpp b/tests/work_aggregation_cpu_triad.cpp index 98159ea0..d65c9668 100644 --- a/tests/work_aggregation_cpu_triad.cpp +++ b/tests/work_aggregation_cpu_triad.cpp @@ -5,6 +5,7 @@ #include #undef NDEBUG +#include "cppuddle/memory_recycling/std_recycling_allocators.hpp" #include "cppuddle/kernel_aggregation/kernel_aggregation_interface.hpp" #include @@ -179,11 +180,14 @@ int hpx_main(int argc, char *argv[]) { } } - stream_pool::init>( + cppuddle::executor_recycling::executor_pool::init< + Dummy_Executor, + cppuddle::executor_recycling::round_robin_pool_impl>( number_underlying_executors); static const char kernelname[] = "cpu_triad"; - using executor_pool = cppuddle::kernel_aggregation::aggregation_pool>; + using executor_pool = cppuddle::kernel_aggregation::aggregation_pool< + kernelname, Dummy_Executor, + cppuddle::executor_recycling::round_robin_pool_impl>; executor_pool::init(number_aggregation_executors, max_slices, executor_mode); using float_t = float; @@ -289,7 +293,7 @@ int hpx_main(int argc, char *argv[]) { std::flush(hpx::cout); sleep(1); - recycler::force_cleanup(); // Cleanup all buffers and the managers + cppuddle::memory_recycling::force_buffer_cleanup(); // Cleanup all buffers and the managers return hpx::finalize(); } diff --git a/tests/work_aggregation_cuda_triad.cpp b/tests/work_aggregation_cuda_triad.cpp index f04e04a4..75f7ad14 100644 --- a/tests/work_aggregation_cuda_triad.cpp +++ b/tests/work_aggregation_cuda_triad.cpp @@ -121,11 +121,14 @@ int hpx_main(int argc, char *argv[]) { hpx::cuda::experimental::detail::register_polling(hpx::resource::get_thread_pool(0)); using executor_t = hpx::cuda::experimental::cuda_executor; - stream_pool::init>( + cppuddle::executor_recycling::executor_pool::init< + executor_t, + cppuddle::executor_recycling::round_robin_pool_impl>( number_underlying_executors, 0, true); static const char kernelname2[] = "cuda_triad"; - using executor_pool = cppuddle::kernel_aggregation::aggregation_pool>; + using executor_pool = cppuddle::kernel_aggregation::aggregation_pool< + kernelname2, executor_t, + cppuddle::executor_recycling::round_robin_pool_impl>; executor_pool::init(number_aggregation_executors, max_slices, executor_mode); using float_t = float; @@ -418,7 +421,7 @@ int hpx_main(int argc, char *argv[]) { /* sleep(1); */ hpx::cuda::experimental::detail::unregister_polling(hpx::resource::get_thread_pool(0)); - recycler::force_cleanup(); // Cleanup all buffers and the managers + cppuddle::memory_recycling::force_buffer_cleanup(); // Cleanup all buffers and the managers return hpx::finalize(); } diff --git a/tests/work_aggregation_test.cpp b/tests/work_aggregation_test.cpp index 7f5664f5..abe827f4 100644 --- a/tests/work_aggregation_test.cpp +++ b/tests/work_aggregation_test.cpp @@ -14,6 +14,7 @@ #include "cppuddle/memory_recycling/cuda_recycling_allocators.hpp" #include 
"cppuddle/memory_recycling/util/cuda_recycling_device_buffer.hpp" +#include "cppuddle/executor_recycling/executor_pools_interface.hpp"" #define DEBUG_AGGREGATION_CALLS 1 // enables checks if aggregated function calls are // compatible across all participating tasks // Must be defined before including the aggregation: @@ -118,9 +119,11 @@ namespace hpx { namespace parallel { namespace execution { void sequential_test(void) { static const char kernelname[] = "kernel1"; - using kernel_pool1 = cppuddle::kernel_aggregation::aggregation_pool>; - kernel_pool1::init(8, 2, cppuddle::kernel_aggregation::aggregated_executor_modes::STRICT); + using kernel_pool1 = cppuddle::kernel_aggregation::aggregation_pool< + kernelname, Dummy_Executor, + cppuddle::executor_recycling::round_robin_pool_impl>; + kernel_pool1::init( + 8, 2, cppuddle::kernel_aggregation::aggregated_executor_modes::STRICT); // Sequential test hpx::cout << "Sequential test with all executor slices" << std::endl; hpx::cout << "----------------------------------------" << std::endl; @@ -410,7 +413,8 @@ void pointer_add_test(void) { << std::endl; static const char kernelname2[] = "kernel2"; using kernel_pool2 = cppuddle::kernel_aggregation::aggregation_pool< - kernelname2, Dummy_Executor, round_robin_pool>; + kernelname2, Dummy_Executor, + cppuddle::executor_recycling::round_robin_pool_impl>; kernel_pool2::init( 8, 2, cppuddle::kernel_aggregation::aggregated_executor_modes::STRICT); { @@ -608,10 +612,11 @@ void references_add_test(void) { /*Aggregated_Executor agg_exec{ 4, Aggregated_Executor_Modes::STRICT};*/ auto &agg_exec = std::get<0>( - stream_pool::get_interface< + cppuddle::executor_recycling::executor_pool::get_interface< cppuddle::kernel_aggregation::aggregated_executor, - round_robin_pool>>(0)); + cppuddle::executor_recycling::round_robin_pool_impl< + cppuddle::kernel_aggregation::aggregated_executor< + Dummy_Executor>>>(0)); std::vector erg(512); std::vector> slices_done_futs; @@ -832,14 +837,17 @@ int hpx_main(int argc, char *argv[]) { return hpx::finalize(); } - stream_pool::init>( - 8, 0, false); - stream_pool::init>(8); + cppuddle::executor_recycling::executor_pool::init< + hpx::cuda::experimental::cuda_executor, + cppuddle::executor_recycling::round_robin_pool_impl< + hpx::cuda::experimental::cuda_executor>>(8, 0, false); + cppuddle::executor_recycling::executor_pool::init< + Dummy_Executor, + cppuddle::executor_recycling::round_robin_pool_impl>(8); - stream_pool::init< + cppuddle::executor_recycling::executor_pool::init< cppuddle::kernel_aggregation::aggregated_executor, - round_robin_pool< + cppuddle::executor_recycling::round_robin_pool_impl< cppuddle::kernel_aggregation::aggregated_executor>>( 8, 4, cppuddle::kernel_aggregation::aggregated_executor_modes::STRICT); /*hpx::cuda::experimental::cuda_executor executor1 = @@ -871,8 +879,8 @@ int hpx_main(int argc, char *argv[]) { std::flush(hpx::cout); sleep(1); - recycler::print_performance_counters(); - recycler::force_cleanup(); // Cleanup all buffers and the managers + cppuddle::memory_recycling::print_buffer_counters(); + cppuddle::memory_recycling::force_buffer_cleanup(); // Cleanup all buffers and the managers return hpx::finalize(); } From 8ccbed09edab648059ff5d091f23f73050110032 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20Dai=C3=9F?= Date: Sat, 9 Mar 2024 02:59:53 +0100 Subject: [PATCH 16/19] Add deprecation file comments --- include/aggregation_manager.hpp | 4 ++++ include/aligned_buffer_util.hpp | 4 ++++ include/buffer_manager.hpp | 4 ++++ 
include/cuda_buffer_util.hpp | 4 ++++ include/hip_buffer_util.hpp | 4 ++++ include/kokkos_buffer_util.hpp | 4 ++++ include/stream_manager.hpp | 4 ++++ include/sycl_buffer_util.hpp | 4 ++++ 8 files changed, 32 insertions(+) diff --git a/include/aggregation_manager.hpp b/include/aggregation_manager.hpp index bb0fd83f..9b546cab 100644 --- a/include/aggregation_manager.hpp +++ b/include/aggregation_manager.hpp @@ -3,6 +3,10 @@ // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +// DEPRECATED: Do not use this file +// Only intended to make the old interface work a bit longer. +// See deprecation warnings for the new location of the functionality + #ifndef AGGREGATION_MANAGER_HPP #define AGGREGATION_MANAGER_HPP diff --git a/include/aligned_buffer_util.hpp b/include/aligned_buffer_util.hpp index 02a57104..64497a9d 100644 --- a/include/aligned_buffer_util.hpp +++ b/include/aligned_buffer_util.hpp @@ -3,6 +3,10 @@ // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +// DEPRECATED: Do not use this file +// Only intended to make the old interface work a bit longer. +// See deprecation warnings for the new location of the functionality + #ifndef ALIGNED_BUFFER_UTIL_HPP #define ALIGNED_BUFFER_UTIL_HPP diff --git a/include/buffer_manager.hpp b/include/buffer_manager.hpp index fb253990..baf807e4 100644 --- a/include/buffer_manager.hpp +++ b/include/buffer_manager.hpp @@ -3,6 +3,10 @@ // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +// DEPRECATED: Do not use this file +// Only intended to make the old interface work a bit longer. +// See deprecation warnings for the new location of the functionality + #ifndef BUFFER_MANAGER_HPP #define BUFFER_MANAGER_HPP diff --git a/include/cuda_buffer_util.hpp b/include/cuda_buffer_util.hpp index 7aa44c9a..7fbd07be 100644 --- a/include/cuda_buffer_util.hpp +++ b/include/cuda_buffer_util.hpp @@ -3,6 +3,10 @@ // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +// DEPRECATED: Do not use this file +// Only intended to make the old interface work a bit longer. +// See deprecation warnings for the new location of the functionality + #ifndef CUDA_BUFFER_UTIL_HPP #define CUDA_BUFFER_UTIL_HPP diff --git a/include/hip_buffer_util.hpp b/include/hip_buffer_util.hpp index dfd31cdc..720baf70 100644 --- a/include/hip_buffer_util.hpp +++ b/include/hip_buffer_util.hpp @@ -3,6 +3,10 @@ // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +// DEPRECATED: Do not use this file +// Only intended to make the old interface work a bit longer. +// See deprecation warnings for the new location of the functionality + #ifndef HIP_BUFFER_UTIL_HPP #define HIP_BUFFER_UTIL_HPP diff --git a/include/kokkos_buffer_util.hpp b/include/kokkos_buffer_util.hpp index 66d1f8c4..e84be4b6 100644 --- a/include/kokkos_buffer_util.hpp +++ b/include/kokkos_buffer_util.hpp @@ -3,6 +3,10 @@ // Distributed under the Boost Software License, Version 1.0. 
(See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +// DEPRECATED: Do not use this file +// Only intended to make the old interface work a bit longer. +// See deprecation warnings for the new location of the functionality + #ifndef KOKKOS_BUFFER_UTIL_HPP #define KOKKOS_BUFFER_UTIL_HPP #include "cppuddle/memory_recycling/util/recycling_kokkos_view.hpp" diff --git a/include/stream_manager.hpp b/include/stream_manager.hpp index 25c4a080..1e781442 100644 --- a/include/stream_manager.hpp +++ b/include/stream_manager.hpp @@ -3,6 +3,10 @@ // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +// DEPRECATED: Do not use this file +// Only intended to make the old interface work a bit longer. +// See deprecation warnings for the new location of the functionality + #ifndef STREAM_MANAGER_HPP #define STREAM_MANAGER_HPP diff --git a/include/sycl_buffer_util.hpp b/include/sycl_buffer_util.hpp index 7ce66d93..46922d17 100644 --- a/include/sycl_buffer_util.hpp +++ b/include/sycl_buffer_util.hpp @@ -3,6 +3,10 @@ // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +// DEPRECATED: Do not use this file +// Only intended to make the old interface work a bit longer. +// See deprecation warnings for the new location of the functionality + #ifndef SYCL_BUFFER_UTIL_HPP #define SYCL_BUFFER_UTIL_HPP From aefd0f6b999998f81701a366a579e4650437ad97 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20Dai=C3=9F?= Date: Sat, 9 Mar 2024 03:07:02 +0100 Subject: [PATCH 17/19] Begin cmakelist cleanup --- CMakeLists.txt | 18 +++--------------- 1 file changed, 3 insertions(+), 15 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 4a31995f..2212d40b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -21,7 +21,6 @@ set(CPPUDDLE_VERSION_STRING "${CPPUDDLE_VERSION_MAJOR}.${CPPUDDLE_VERSION_MINOR} # GPU-related options option(CPPUDDLE_WITH_CUDA "Enable CUDA tests/examples" OFF) -option(CPPUDDLE_WITH_MULTIGPU_SUPPORT "Enables experimental MultiGPU support" OFF) option(CPPUDDLE_WITH_KOKKOS "Enable KOKKOS tests/examples" OFF) set(CPPUDDLE_WITH_MAX_NUMBER_GPUS "1" CACHE STRING "Number of GPUs that will be used. Should match the number of GPUs used when using the maximum number of HPX worker threads. 
Should be 1 for non-HPX builds.") # HPX-related options @@ -151,6 +150,9 @@ endif() # Define library targets and installation # (also includes various warnings for non-optimal build configurations) +# TODO Cleanup targets: +# this is leftover from the days where cppuddle was not header-only + ## Interface targets add_library(buffer_manager INTERFACE) if (CPPUDDLE_WITH_HPX) @@ -319,9 +321,6 @@ if (CPPUDDLE_WITH_TESTS) COMPONENT_DEPENDENCIES iostreams SOURCES tests/work_aggregation_test.cpp - include/aggregation_manager.hpp - include/buffer_manager.hpp - include/stream_manager.hpp ) add_hpx_executable( @@ -331,9 +330,6 @@ if (CPPUDDLE_WITH_TESTS) COMPONENT_DEPENDENCIES iostreams SOURCES tests/work_aggregation_cpu_triad.cpp - include/aggregation_manager.hpp - include/buffer_manager.hpp - include/stream_manager.hpp ) add_hpx_executable( @@ -343,9 +339,6 @@ if (CPPUDDLE_WITH_TESTS) COMPONENT_DEPENDENCIES iostreams SOURCES tests/work_aggregation_cuda_triad.cpp - include/aggregation_manager.hpp - include/buffer_manager.hpp - include/stream_manager.hpp ) target_compile_definitions(work_aggregation_test PRIVATE CPPUDDLE_HAVE_CUDA) endif() # end WITH KOKKOS @@ -359,11 +352,6 @@ if (CPPUDDLE_WITH_TESTS) add_compile_definitions(CPPUDDLE_WITH_HPX) endif() - if (CPPUDDLE_WITH_MULTIGPU_SUPPORT) - add_compile_definitions(CPPUDDLE_HAVE_MULTIGPU) - message(WARNING, " Multi-GPU Support not yet properly tested!") - endif() - #------------------------------------------------------------------------------------------------------------ # Define actual tests (usually running the binary and checking its output for certain patterns via regex) From a60cccffac4eb0a39f9dff19292d26ecbca3b9f0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20Dai=C3=9F?= Date: Sat, 9 Mar 2024 04:11:01 +0100 Subject: [PATCH 18/19] Fix compatibility layer for deprecations --- include/buffer_manager.hpp | 16 +++++++++++++++- .../detail/hip_underlying_allocators.hpp | 2 +- .../detail/sycl_underlying_allocators.hpp | 2 +- .../util/recycling_kokkos_view.hpp | 4 ++-- include/kokkos_buffer_util.hpp | 4 ++-- 5 files changed, 21 insertions(+), 7 deletions(-) diff --git a/include/buffer_manager.hpp b/include/buffer_manager.hpp index baf807e4..69020e5b 100644 --- a/include/buffer_manager.hpp +++ b/include/buffer_manager.hpp @@ -10,18 +10,27 @@ #ifndef BUFFER_MANAGER_HPP #define BUFFER_MANAGER_HPP +#include "cppuddle/common/config.hpp" #include "cppuddle/memory_recycling/buffer_management_interface.hpp" +#include "cppuddle/memory_recycling/detail/buffer_management.hpp" #include "cppuddle/memory_recycling/std_recycling_allocators.hpp" namespace recycler { +namespace detail { +using buffer_recycler [[deprecated( + "Use buffer_interface from header " + "cppuddle/memory_recycling/detail/buffer_management.hpp instead")]] = + cppuddle::memory_recycling::detail::buffer_interface; +} + template ::value, int> = 0> using recycle_std [[deprecated("Use from header std_recycling_allocators.hpp instead")]] = cppuddle::memory_recycling::recycle_std; template ::value, int> = 0> -using aggressive_recycle_aligned +using aggressive_recycle_std [[deprecated("Use from header std_recycling_allocators.hpp instead")]] = cppuddle::memory_recycling::aggressive_recycle_std; @@ -41,6 +50,11 @@ inline void cleanup() { cppuddle::memory_recycling::unused_buffer_cleanup(); } [[deprecated("Use cppuddle::memory_recycling::finalize() instead")]] inline void finalize() { cppuddle::memory_recycling::finalize(); } +[[deprecated("Use cppuddle::max_number_gpus instead")]] constexpr auto 
max_number_gpus = + cppuddle::max_number_gpus; +[[deprecated("Use cppuddle::number_instances instead")]] constexpr auto number_instances = + cppuddle::number_instances; + } // namespace recycler #endif diff --git a/include/cppuddle/memory_recycling/detail/hip_underlying_allocators.hpp b/include/cppuddle/memory_recycling/detail/hip_underlying_allocators.hpp index 6668feaf..bfd7c2e1 100644 --- a/include/cppuddle/memory_recycling/detail/hip_underlying_allocators.hpp +++ b/include/cppuddle/memory_recycling/detail/hip_underlying_allocators.hpp @@ -1,4 +1,4 @@ -// Copyright (c) 2020-2024 Gregor Daiß +// Copyright (c) 2021-2024 Gregor Daiß // // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) diff --git a/include/cppuddle/memory_recycling/detail/sycl_underlying_allocators.hpp b/include/cppuddle/memory_recycling/detail/sycl_underlying_allocators.hpp index 1597eee7..3e3c9173 100644 --- a/include/cppuddle/memory_recycling/detail/sycl_underlying_allocators.hpp +++ b/include/cppuddle/memory_recycling/detail/sycl_underlying_allocators.hpp @@ -1,4 +1,4 @@ -// Copyright (c) 2020-2024 Gregor Daiß +// Copyright (c) 2023-2024 Gregor Daiß // // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) diff --git a/include/cppuddle/memory_recycling/util/recycling_kokkos_view.hpp b/include/cppuddle/memory_recycling/util/recycling_kokkos_view.hpp index 1f0ed950..b8ca526c 100644 --- a/include/cppuddle/memory_recycling/util/recycling_kokkos_view.hpp +++ b/include/cppuddle/memory_recycling/util/recycling_kokkos_view.hpp @@ -3,8 +3,8 @@ // Distributed under the Boost Software License, Version 1.0. 
(See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) -#ifndef KOKKOS_BUFFER_UTIL_HPP -#define KOKKOS_BUFFER_UTIL_HPP +#ifndef RECYCLING_KOKKOS_VIEW_HPP +#define RECYCLING_KOKKOS_VIEW_HPP #include #include #include diff --git a/include/kokkos_buffer_util.hpp b/include/kokkos_buffer_util.hpp index e84be4b6..716229a0 100644 --- a/include/kokkos_buffer_util.hpp +++ b/include/kokkos_buffer_util.hpp @@ -17,12 +17,12 @@ template using aggregated_recycled_view [[deprecated( "Use aggregated_recycle_view from header recycling_kokkos_view.hpp " "instead")]] = - cppuddle::aggregated_recycle_view; + cppuddle::memory_recycling::aggregated_recycling_view; template using recycled_view [[deprecated( "Use recycle_view from header recycling_kokkos_view.hpp instead")]] = - cppuddle::recycle_view; + cppuddle::memory_recycling::recycling_view; } // end namespace recycler From 1e719e7d913005a95b3f8f99b93d6b6c4a6f3eff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20Dai=C3=9F?= Date: Mon, 11 Mar 2024 13:05:41 +0100 Subject: [PATCH 19/19] Fix sycl namespace --- include/sycl_buffer_util.hpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/sycl_buffer_util.hpp b/include/sycl_buffer_util.hpp index 46922d17..4bf45b3f 100644 --- a/include/sycl_buffer_util.hpp +++ b/include/sycl_buffer_util.hpp @@ -19,24 +19,24 @@ namespace detail { template using sycl_host_default_allocator [[deprecated("Use from header sycl_recycling_allocators.hpp instead")]] = - cppuddle::detail::sycl_host_default_allocator; + cppuddle::memory_recycling::detail::sycl_host_default_allocator; template using sycl_device_default_allocator [[deprecated("Use from header sycl_recycling_allocators.hpp instead")]] = - cppuddle::detail::sycl_device_default_allocator; + cppuddle::memory_recycling::detail::sycl_device_default_allocator; } // end namespace detail template ::value, int> = 0> using recycle_allocator_sycl_host [[deprecated("Use from header sycl_recycling_allocators.hpp instead")]] = - cppuddle::recycle_allocator_sycl_host; + cppuddle::memory_recycling::recycle_allocator_sycl_host; template ::value, int> = 0> using recycle_allocator_sycl_device [[deprecated("Use from header sycl_recycling_allocators.hpp instead")]] = - cppuddle::recycle_allocator_sycl_device; + cppuddle::memory_recycling::recycle_allocator_sycl_device; } // end namespace recycler #endif
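For readers migrating off the deprecated compatibility headers touched above, a minimal usage sketch (illustrative only, not part of the patch series; the double element type, the 512-element size, and the assumption that include/ is on the include path are arbitrary choices made here) of how the old and the new spellings are expected to coexist while the [[deprecated]] aliases remain:

    // migration_sketch.cpp -- hypothetical example, not shipped with CPPuddle
    #include <vector>
    #include "buffer_manager.hpp" // deprecated compatibility header: compiles, but warns
    #include "cppuddle/memory_recycling/std_recycling_allocators.hpp" // new location

    int main() {
      // Old spelling: still valid through the deprecated alias in namespace recycler
      std::vector<double, recycler::recycle_std<double>> old_buf(512);
      // New spelling after the namespace rework
      std::vector<double, cppuddle::memory_recycling::recycle_std<double>> new_buf(512);
      // Counter printing and cleanup moved as well (cf. the hpx_main changes above)
      cppuddle::memory_recycling::print_buffer_counters();
      cppuddle::memory_recycling::force_buffer_cleanup();
      return 0;
    }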