diff --git a/.gitignore b/.gitignore index 2b25a731..bdc68a8d 100644 --- a/.gitignore +++ b/.gitignore @@ -7,3 +7,5 @@ spack.lock .clangd docs compile_commands.json +spack-build* +spack-configure-args.txt diff --git a/CMakeLists.txt b/CMakeLists.txt index 4a31995f..2212d40b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -21,7 +21,6 @@ set(CPPUDDLE_VERSION_STRING "${CPPUDDLE_VERSION_MAJOR}.${CPPUDDLE_VERSION_MINOR} # GPU-related options option(CPPUDDLE_WITH_CUDA "Enable CUDA tests/examples" OFF) -option(CPPUDDLE_WITH_MULTIGPU_SUPPORT "Enables experimental MultiGPU support" OFF) option(CPPUDDLE_WITH_KOKKOS "Enable KOKKOS tests/examples" OFF) set(CPPUDDLE_WITH_MAX_NUMBER_GPUS "1" CACHE STRING "Number of GPUs that will be used. Should match the number of GPUs used when using the maximum number of HPX worker threads. Should be 1 for non-HPX builds.") # HPX-related options @@ -151,6 +150,9 @@ endif() # Define library targets and installation # (also includes various warnings for non-optimal build configurations) +# TODO Cleanup targets: +# this is leftover from the days where cppuddle was not header-only + ## Interface targets add_library(buffer_manager INTERFACE) if (CPPUDDLE_WITH_HPX) @@ -319,9 +321,6 @@ if (CPPUDDLE_WITH_TESTS) COMPONENT_DEPENDENCIES iostreams SOURCES tests/work_aggregation_test.cpp - include/aggregation_manager.hpp - include/buffer_manager.hpp - include/stream_manager.hpp ) add_hpx_executable( @@ -331,9 +330,6 @@ if (CPPUDDLE_WITH_TESTS) COMPONENT_DEPENDENCIES iostreams SOURCES tests/work_aggregation_cpu_triad.cpp - include/aggregation_manager.hpp - include/buffer_manager.hpp - include/stream_manager.hpp ) add_hpx_executable( @@ -343,9 +339,6 @@ if (CPPUDDLE_WITH_TESTS) COMPONENT_DEPENDENCIES iostreams SOURCES tests/work_aggregation_cuda_triad.cpp - include/aggregation_manager.hpp - include/buffer_manager.hpp - include/stream_manager.hpp ) target_compile_definitions(work_aggregation_test PRIVATE CPPUDDLE_HAVE_CUDA) endif() # end WITH KOKKOS @@ -359,11 +352,6 @@ if (CPPUDDLE_WITH_TESTS) add_compile_definitions(CPPUDDLE_WITH_HPX) endif() - if (CPPUDDLE_WITH_MULTIGPU_SUPPORT) - add_compile_definitions(CPPUDDLE_HAVE_MULTIGPU) - message(WARNING, " Multi-GPU Support not yet properly tested!") - endif() - #------------------------------------------------------------------------------------------------------------ # Define actual tests (usually running the binary and checking its output for certain patterns via regex) diff --git a/include/aggregation_manager.hpp b/include/aggregation_manager.hpp index 2aa92063..9b546cab 100644 --- a/include/aggregation_manager.hpp +++ b/include/aggregation_manager.hpp @@ -1,1151 +1,39 @@ -// Copyright (c) 2022-2023 Gregor Daiß +// Copyright (c) 2024 Gregor Daiß // // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) -#ifndef WORK_AGGREGATION_MANAGER -#define WORK_AGGREGATION_MANAGER +// DEPRECATED: Do not use this file +// Only intended to make the old interface work a bit longer. 
+// See deprecation warnings for the new location of the functionality -#ifndef CPPUDDLE_HAVE_HPX -#error "Work aggregation allocators/executors require CPPUDDLE_WITH_HPX=ON" -#endif - -#include -//#define DEBUG_AGGREGATION_CALLS 1 - -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include - -#if defined(HPX_HAVE_CUDA) || defined(HPX_HAVE_HIP) -// required for defining type traits using cuda executor as underlying -// aggregation executors -#include -#endif - -#include -#include - -#include "../include/buffer_manager.hpp" -#include "../include/stream_manager.hpp" -#include "../include/detail/config.hpp" - -#ifndef CPPUDDLE_HAVE_HPX_MUTEX -#pragma message \ - "Work aggregation will use hpx::mutex internally, despite CPPUDDLE_WITH_HPX_MUTEX=OFF" -#pragma message \ - "Consider using CPPUDDLE_WITH_HPX_MUTEX=ON, to make the rest of CPPuddle also use hpx::mutex" -#endif -namespace recycler { - using aggregation_mutex_t = hpx::mutex; -} - -//=============================================================================== -//=============================================================================== -// Helper functions/classes - -/// Constructs a tuple with copies (to store temporaries in aggregated function -/// calls) yet also supporting references (on the users own risk...) -template -std::tuple make_tuple_supporting_references(Ts &&...ts) { - return std::tuple{std::forward(ts)...}; -} - -/// Print some specific values that we can, but don't bother for most types -/// (such as vector) -template std::string print_if_possible(T val) { - if constexpr (std::is_convertible_v) { - return val; - } else if constexpr (std::is_integral_v || std::is_floating_point_v) { - return std::to_string(val); - } else if constexpr (std::is_pointer_v) { - // Pretty printing pointer sort of only works well with %p - // TODO Try using std::format as soon as we can move to C++20 - std::unique_ptr debug_string(new char[128]()); - snprintf(debug_string.get(), 128, "%p", val); - return std::string(debug_string.get()); - } else { - return std::string("cannot print value"); - } -} - -/// Helper class for the helper class that prints tuples -- do not use this -/// directly -template -void print_tuple(const TupType &_tup, std::index_sequence) { - (..., (hpx::cout << (I == 0 ? "" : ", ") - << print_if_possible(std::get(_tup)))); -} - -/// Helper class for printing tuples (first component should be a function -/// pointer, remaining components the function arguments) -template void print_tuple(const std::tuple &_tup) { - // Use pointer and sprintf as boost::format refused to NOT cast the pointer - // address to 1... 
- // TODO Try using std::format as soon as we can move to C++20 - std::unique_ptr debug_string(new char[128]()); - snprintf(debug_string.get(), 128, "Function address: %p -- Arguments: (", - std::get<0>(_tup)); - hpx::cout << debug_string.get(); - print_tuple(_tup, std::make_index_sequence()); - hpx::cout << ")"; -} - -//=============================================================================== -//=============================================================================== -template -void exec_post_wrapper(Executor & exec, F &&f, Ts &&...ts) { - hpx::apply(exec, std::forward(f), std::forward(ts)...); -} - -template -hpx::lcos::future exec_async_wrapper(Executor & exec, F &&f, Ts &&...ts) { - return hpx::async(exec, std::forward(f), std::forward(ts)...); -} - -/// Manages the launch conditions for aggregated function calls -/// type/value-errors -/** Launch conditions: All slice executors must have called the same function - * (tracked by future all_slices_ready) - * AND - * Previous aggregated_function_call on the same Executor must have been - * launched (tracked by future stream_future) - * All function calls received from the slice executors are checked if they - * match the first one in both types and values (throws exception otherwise) - */ - -template class aggregated_function_call { -private: - std::atomic slice_counter = 0; - - /// Promise to be set when all slices have visited this function call - /* hpx::lcos::local::promise slices_ready_promise; */ - /// Tracks if all slices have visited this function call - /* hpx::lcos::future all_slices_ready = slices_ready_promise.get_future(); */ - /// How many slices can we expect? - const size_t number_slices; - const bool async_mode; - - Executor &underlying_executor; - -#if !(defined(NDEBUG)) && defined(DEBUG_AGGREGATION_CALLS) -#pragma message \ - "Building slow work aggegator build with additional runtime checks! Build with NDEBUG defined for fast build..." 
- /// Stores the function call of the first slice as reference for error - /// checking - std::any function_tuple; - /// Stores the string of the first function call for debug output - std::string debug_type_information; - recycler::aggregation_mutex_t debug_mut; -#endif - - std::vector> potential_async_promises{}; - -public: - aggregated_function_call(const size_t number_slices, bool async_mode, Executor &exec) - : number_slices(number_slices), async_mode(async_mode), underlying_executor(exec) { - if (async_mode) - potential_async_promises.resize(number_slices); - } - ~aggregated_function_call(void) { - // All slices should have done this call - assert(slice_counter == number_slices); - // assert(!all_slices_ready.valid()); - } - /// Returns true if all required slices have visited this point - bool sync_aggregation_slices(hpx::lcos::future &stream_future) { - assert(!async_mode); - assert(potential_async_promises.empty()); - const size_t local_counter = slice_counter++; - if (local_counter == number_slices - 1) { - return true; - } - else return false; - } - template - void post_when(hpx::lcos::future &stream_future, F &&f, Ts &&...ts) { -#if !(defined(NDEBUG)) && defined(DEBUG_AGGREGATION_CALLS) - // needed for concurrent access to function_tuple and debug_type_information - // Not required for normal use - std::lock_guard guard(debug_mut); -#endif - assert(!async_mode); - assert(potential_async_promises.empty()); - const size_t local_counter = slice_counter++; - - if (local_counter == 0) { -#if !(defined(NDEBUG)) && defined(DEBUG_AGGREGATION_CALLS) - auto tmp_tuple = - make_tuple_supporting_references(f, std::forward(ts)...); - function_tuple = tmp_tuple; - debug_type_information = typeid(decltype(tmp_tuple)).name(); -#endif - - } else { - // - // This scope checks if both the type and the values of the current call - // match the original call To be used in debug build... 
- // -#if !(defined(NDEBUG)) && defined(DEBUG_AGGREGATION_CALLS) - auto comparison_tuple = - make_tuple_supporting_references(f, std::forward(ts)...); - try { - auto orig_call_tuple = - std::any_cast(function_tuple); - if (comparison_tuple != orig_call_tuple) { - throw std::runtime_error( - "Values of post function arguments (or function " - "itself) do not match "); - } - } catch (const std::bad_any_cast &e) { - hpx::cout - << "\nMismatched types error in aggregated post call of executor " - << ": " << e.what() << "\n"; - hpx::cout << "Expected types:\t\t " - << boost::core::demangle(debug_type_information.c_str()); - hpx::cout << "\nGot types:\t\t " - << boost::core::demangle( - typeid(decltype(comparison_tuple)).name()) - << "\n" - << std::endl; - // throw; - } catch (const std::runtime_error &e) { - hpx::cout - << "\nMismatched values error in aggregated post call of executor " - << ": " << e.what() << std::endl; - hpx::cout << "Types (matched):\t " - << boost::core::demangle(debug_type_information.c_str()); - auto orig_call_tuple = - std::any_cast(function_tuple); - hpx::cout << "\nExpected values:\t "; - print_tuple(orig_call_tuple); - hpx::cout << "\nGot values:\t\t "; - print_tuple(comparison_tuple); - hpx::cout << std::endl << std::endl; - // throw; - } -#endif - } - assert(local_counter < number_slices); - assert(slice_counter < number_slices + 1); - // Check exit criteria: Launch function call continuation by setting the - // slices promise - if (local_counter == number_slices - 1) { - exec_post_wrapper(underlying_executor, std::forward(f), std::forward(ts)...); - //slices_ready_promise.set_value(); - } - } - template - hpx::lcos::future async_when(hpx::lcos::future &stream_future, - F &&f, Ts &&...ts) { -#if !(defined(NDEBUG)) && defined(DEBUG_AGGREGATION_CALLS) - // needed for concurrent access to function_tuple and debug_type_information - // Not required for normal use - std::lock_guard guard(debug_mut); -#endif - assert(async_mode); - assert(!potential_async_promises.empty()); - const size_t local_counter = slice_counter++; - if (local_counter == 0) { -#if !(defined(NDEBUG)) && defined(DEBUG_AGGREGATION_CALLS) - auto tmp_tuple = - make_tuple_supporting_references(f, std::forward(ts)...); - function_tuple = tmp_tuple; - debug_type_information = typeid(decltype(tmp_tuple)).name(); -#endif - } else { - // - // This scope checks if both the type and the values of the current call - // match the original call To be used in debug build... 
- // -#if !(defined(NDEBUG)) && defined(DEBUG_AGGREGATION_CALLS) - auto comparison_tuple = - make_tuple_supporting_references(f, std::forward(ts)...); - try { - auto orig_call_tuple = - std::any_cast(function_tuple); - if (comparison_tuple != orig_call_tuple) { - throw std::runtime_error( - "Values of async function arguments (or function " - "itself) do not match "); - } - } catch (const std::bad_any_cast &e) { - hpx::cout - << "\nMismatched types error in aggregated async call of executor " - << ": " << e.what() << "\n"; - hpx::cout << "Expected types:\t\t " - << boost::core::demangle(debug_type_information.c_str()); - hpx::cout << "\nGot types:\t\t " - << boost::core::demangle( - typeid(decltype(comparison_tuple)).name()) - << "\n" - << std::endl; - // throw; - } catch (const std::runtime_error &e) { - hpx::cout - << "\nMismatched values error in aggregated async call of executor " - << ": " << e.what() << std::endl; - hpx::cout << "Types (matched):\t " - << boost::core::demangle(debug_type_information.c_str()); - auto orig_call_tuple = - std::any_cast(function_tuple); - hpx::cout << "\nExpected values:\t "; - print_tuple(orig_call_tuple); - hpx::cout << "\nGot values:\t\t "; - print_tuple(comparison_tuple); - hpx::cout << std::endl << std::endl; - // throw; - } -#endif - } - assert(local_counter < number_slices); - assert(slice_counter < number_slices + 1); - assert(potential_async_promises.size() == number_slices); - hpx::lcos::future ret_fut = - potential_async_promises[local_counter].get_future(); - if (local_counter == number_slices - 1) { - /* slices_ready_promise.set_value(); */ - auto fut = exec_async_wrapper( - underlying_executor, std::forward(f), std::forward(ts)...); - fut.then([this](auto &&fut) { - for (auto &promise : potential_async_promises) { - promise.set_value(); - } - }); - } - // Check exit criteria: Launch function call continuation by setting the - // slices promise - return ret_fut; - } - template - hpx::lcos::shared_future wrap_async(hpx::lcos::future &stream_future, - F &&f, Ts &&...ts) { - assert(async_mode); - assert(!potential_async_promises.empty()); - const size_t local_counter = slice_counter++; - assert(local_counter < number_slices); - assert(slice_counter < number_slices + 1); - assert(potential_async_promises.size() == number_slices); - hpx::lcos::shared_future ret_fut = - potential_async_promises[local_counter].get_shared_future(); - if (local_counter == number_slices - 1) { - auto fut = f(std::forward(ts)...); - fut.then([this](auto &&fut) { - // TODO just use one promise - for (auto &promise : potential_async_promises) { - promise.set_value(); - } - }); - } - return ret_fut; - } - // We need to be able to copy or no-except move for std::vector.. - aggregated_function_call(const aggregated_function_call &other) = default; - aggregated_function_call & - operator=(const aggregated_function_call &other) = default; - aggregated_function_call(aggregated_function_call &&other) = default; - aggregated_function_call & - operator=(aggregated_function_call &&other) = default; -}; - -//=============================================================================== -//=============================================================================== - -enum class Aggregated_Executor_Modes { EAGER = 1, STRICT, ENDLESS }; -/// Declaration since the actual allocator is only defined after the Executors -template -class Allocator_Slice; - -/// Executor Class that aggregates function calls for specific kernels -/** Executor is not meant to be used directly. 
Instead it yields multiple - * Executor_Slice objects. These serve as interfaces. Slices from the same - * Aggregated_Executor are meant to execute the same function calls but on - * different data (i.e. different tasks) - */ -template class Aggregated_Executor { -private: - //=============================================================================== - // Misc private avariables: - // - std::atomic slices_exhausted; - - std::atomic executor_slices_alive; - std::atomic buffers_in_use; - std::atomic dealloc_counter; - - const Aggregated_Executor_Modes mode; - const size_t max_slices; - std::atomic current_slices; - /// Wrapper to the executor interface from the stream pool - /// Automatically hooks into the stream_pools reference counting - /// for cpu/gpu load balancing - std::unique_ptr>> executor_wrapper; - -public: - size_t gpu_id; - // Subclasses - - /// Slice class - meant as a scope interface to the aggregated executor - class Executor_Slice { - public: - Aggregated_Executor &parent; - private: - /// Executor is a slice of this aggregated_executor - /// How many functions have been called - required to enforce sequential - /// behaviour of kernel launches - size_t launch_counter{0}; - size_t buffer_counter{0}; - bool notify_parent_about_destruction{true}; - - public: - /// How many slices are there overall - required to check the launch - /// criteria - const size_t number_slices; - const size_t id; - using executor_t = Executor; - Executor_Slice(Aggregated_Executor &parent, const size_t slice_id, - const size_t number_slices) - : parent(parent), notify_parent_about_destruction(true), - number_slices(number_slices), id(slice_id) { - } - ~Executor_Slice(void) { - // Don't notify parent if we moved away from this executor_slice - if (notify_parent_about_destruction) { - // Executor should be done by the time of destruction - // -> check here before notifying parent - - // parent still in execution mode? - assert(parent.slices_exhausted == true); - // all kernel launches done? 
- assert(launch_counter == parent.function_calls.size()); - // Notifiy parent that this aggregation slice is one - parent.reduce_usage_counter(); - } - } - Executor_Slice(const Executor_Slice &other) = delete; - Executor_Slice &operator=(const Executor_Slice &other) = delete; - Executor_Slice(Executor_Slice &&other) - : parent(other.parent), launch_counter(std::move(other.launch_counter)), - buffer_counter(std::move(other.buffer_counter)), - number_slices(std::move(other.number_slices)), - id(std::move(other.id)) { - other.notify_parent_about_destruction = false; - } - Executor_Slice &operator=(Executor_Slice &&other) { - parent = other.parent; - launch_counter = std::move(other.launch_counter); - buffer_counter = std::move(other.buffer_counter); - number_slices = std::move(other.number_slices); - id = std::move(other.id); - other.notify_parent_about_destruction = false; - } - template - Allocator_Slice make_allocator() { - return Allocator_Slice(*this); - } - bool sync_aggregation_slices() { - assert(parent.slices_exhausted == true); - auto ret = parent.sync_aggregation_slices(launch_counter); - launch_counter++; - return ret; - } - template void post(F &&f, Ts &&...ts) { - // we should only execute function calls once all slices - // have been given away (-> Executor Slices start) - assert(parent.slices_exhausted == true); - parent.post(launch_counter, std::forward(f), std::forward(ts)...); - launch_counter++; - } - template - hpx::lcos::future async(F &&f, Ts &&...ts) { - // we should only execute function calls once all slices - // have been given away (-> Executor Slices start) - assert(parent.slices_exhausted == true); - hpx::lcos::future ret_fut = parent.async( - launch_counter, std::forward(f), std::forward(ts)...); - launch_counter++; - return ret_fut; - } - - // OneWay Execution - template - friend decltype(auto) tag_invoke(hpx::parallel::execution::post_t, - Executor_Slice& exec, F&& f, Ts&&... ts) - { - return exec.post(std::forward(f), std::forward(ts)...); - } - - // TwoWay Execution - template - friend decltype(auto) tag_invoke( - hpx::parallel::execution::async_execute_t, Executor_Slice& exec, - F&& f, Ts&&... 
ts) - { - return exec.async( - std::forward(f), std::forward(ts)...); - } +#ifndef AGGREGATION_MANAGER_HPP +#define AGGREGATION_MANAGER_HPP - template - hpx::lcos::shared_future wrap_async(F &&f, Ts &&...ts) { - // we should only execute function calls once all slices - // have been given away (-> Executor Slices start) - assert(parent.slices_exhausted == true); - hpx::lcos::shared_future ret_fut = parent.wrap_async( - launch_counter, std::forward(f), std::forward(ts)...); - launch_counter++; - return ret_fut; - } +#include "cppuddle/kernel_aggregation/kernel_aggregation_interface.hpp" - /// Get new aggregated buffer (might have already been allocated been - /// allocated by different slice) - template T *get(const size_t size) { - assert(parent.slices_exhausted == true); - T *aggregated_buffer = - parent.get(size, buffer_counter); - buffer_counter++; - assert(buffer_counter > 0); - return aggregated_buffer; - } - - Executor& get_underlying_executor(void) { - assert(parent.executor_wrapper); - return *(parent.executor_wrapper); - } - }; - - //=============================================================================== - - hpx::lcos::local::promise slices_full_promise; - /// Promises with the slice executors -- to be set when the starting criteria - /// is met - std::vector> executor_slices; - /// List of aggregated function calls - function will be launched when all - /// slices have called it - std::deque> function_calls; - /// For synchronizing the access to the function calls list - recycler::aggregation_mutex_t mut; - - /// Data entry for a buffer allocation: void* pointer, size_t for - /// buffer-size, atomic for the slice counter, location_id, gpu_id - using buffer_entry_t = - std::tuple, bool, const size_t, size_t>; - /// Keeps track of the aggregated buffer allocations done in all the slices - std::deque buffer_allocations; - /// Map pointer to deque index for fast access in the deallocations - std::unordered_map buffer_allocations_map; - /// For synchronizing the access to the buffer_allocations - recycler::aggregation_mutex_t buffer_mut; - std::atomic buffer_counter = 0; - - /// Get new buffer OR get buffer already allocated by different slice - template - T *get(const size_t size, const size_t slice_alloc_counter) { - assert(slices_exhausted == true); - assert(executor_wrapper); - assert(executor_slices_alive == true); - // Add aggreated buffer entry in case it hasn't happened yet for this call - // First: Check if it already has happened - if (buffer_counter <= slice_alloc_counter) { - // we might be the first! Lock... - std::lock_guard guard(buffer_mut); - // ... and recheck - if (buffer_counter <= slice_alloc_counter) { - constexpr bool manage_content_lifetime = false; - buffers_in_use = true; - - // Default location -- useful for GPU builds as we otherwise create way too - // many different buffers for different aggregation sizes on different GPUs - /* size_t location_id = gpu_id * instances_per_gpu; */ - // Use integer conversion to only use 0 16 32 ... 
as buckets - size_t location_id = ((hpx::get_worker_thread_num() % recycler::number_instances) / 16) * 16; -#ifdef CPPUDDLE_HAVE_HPX_AWARE_ALLOCATORS - if (max_slices == 1) { - // get prefered location: aka the current hpx threads location - // Usually handy for CPU builds where we want to use the buffers - // close to the current CPU core - /* location_id = (hpx::get_worker_thread_num() / instances_per_gpu) * instances_per_gpu; */ - /* location_id = (gpu_id) * instances_per_gpu; */ - // division makes sure that we always use the same instance to store our gpu buffers. - } -#endif - // Get shiny and new buffer that will be shared between all slices - // Buffer might be recycled from previous allocations by the - // buffer_recycler... - T *aggregated_buffer = - recycler::detail::buffer_recycler::get( - size, manage_content_lifetime, location_id, gpu_id); - // Create buffer entry for this buffer - buffer_allocations.emplace_back(static_cast(aggregated_buffer), - size, 1, true, location_id, gpu_id); - -#ifndef NDEBUG - // if previousely used the buffer should not be in usage anymore - const auto exists = buffer_allocations_map.count( - static_cast(aggregated_buffer)); - if (exists > 0) { - const auto previous_usage_id = - buffer_allocations_map[static_cast(aggregated_buffer)]; - const auto &valid = - std::get<3>(buffer_allocations[previous_usage_id]); - assert(!valid); - } -#endif - buffer_allocations_map.insert_or_assign(static_cast(aggregated_buffer), - buffer_counter); - - assert (buffer_counter == slice_alloc_counter); - buffer_counter = buffer_allocations.size(); - - // Return buffer - return aggregated_buffer; - } - } - assert(buffers_in_use == true); - assert(std::get<3>(buffer_allocations[slice_alloc_counter])); // valid - assert(std::get<2>(buffer_allocations[slice_alloc_counter]) >= 1); - - // Buffer entry should already exist: - T *aggregated_buffer = static_cast( - std::get<0>(buffer_allocations[slice_alloc_counter])); - // Error handling: Size is wrong? - assert(size == std::get<1>(buffer_allocations[slice_alloc_counter])); - // Notify that one more slice has visited this buffer allocation - std::get<2>(buffer_allocations[slice_alloc_counter])++; - return aggregated_buffer; - } - - /// Notify buffer list that one slice is done with the buffer - template - void mark_unused(T *p, const size_t size) { - assert(slices_exhausted == true); - assert(executor_wrapper); - - void *ptr_key = static_cast(p); - size_t slice_alloc_counter = buffer_allocations_map[p]; - - assert(slice_alloc_counter < buffer_allocations.size()); - /*auto &[buffer_pointer_any, buffer_size, buffer_allocation_counter, valid] = - buffer_allocations[slice_alloc_counter];*/ - auto buffer_pointer_void = std::get<0>(buffer_allocations[slice_alloc_counter]); - const auto buffer_size = std::get<1>(buffer_allocations[slice_alloc_counter]); - auto &buffer_allocation_counter = std::get<2>(buffer_allocations[slice_alloc_counter]); - auto &valid = std::get<3>(buffer_allocations[slice_alloc_counter]); - const auto &location_id = std::get<4>(buffer_allocations[slice_alloc_counter]); - const auto &gpu_id = std::get<5>(buffer_allocations[slice_alloc_counter]); - assert(valid); - T *buffer_pointer = static_cast(buffer_pointer_void); - - assert(buffer_size == size); - assert(p == buffer_pointer); - // assert(buffer_pointer == p || buffer_pointer == nullptr); - // Slice is done with this buffer - buffer_allocation_counter--; - // Check if all slices are done with this buffer? - if (buffer_allocation_counter == 0) { - // Yes! 
"Deallocate" by telling the recylcer the buffer is fit for reusage - std::lock_guard guard(buffer_mut); - // Only mark unused if another buffer has not done so already (and marked - // it as invalid) - if (valid) { - assert(buffers_in_use == true); - recycler::detail::buffer_recycler::mark_unused( - buffer_pointer, buffer_size, location_id, gpu_id); - // mark buffer as invalid to prevent any other slice from marking the - // buffer as unused - valid = false; - - const size_t current_deallocs = ++dealloc_counter; - if (current_deallocs == buffer_counter) { - std::lock_guard guard(mut); - buffers_in_use = false; - if (!executor_slices_alive && !buffers_in_use) { - slices_exhausted = false; - // Release executor - executor_wrapper.reset(nullptr); - } - } - } - } - } - - //=============================================================================== - // Public Interface -public: - hpx::lcos::future current_continuation; - hpx::lcos::future last_stream_launch_done; - std::atomic overall_launch_counter = 0; - - /// Only meant to be accessed by the slice executors - bool sync_aggregation_slices(const size_t slice_launch_counter) { - std::lock_guard guard(mut); - assert(slices_exhausted == true); - assert(executor_wrapper); - // Add function call object in case it hasn't happened for this launch yet - if (overall_launch_counter <= slice_launch_counter) { - /* std::lock_guard guard(mut); */ - if (overall_launch_counter <= slice_launch_counter) { - function_calls.emplace_back(current_slices, false, *executor_wrapper); - overall_launch_counter = function_calls.size(); - return function_calls[slice_launch_counter].sync_aggregation_slices( - last_stream_launch_done); - } - } - - return function_calls[slice_launch_counter].sync_aggregation_slices( - last_stream_launch_done); - } - - /// Only meant to be accessed by the slice executors - template - void post(const size_t slice_launch_counter, F &&f, Ts &&...ts) { - std::lock_guard guard(mut); - assert(slices_exhausted == true); - assert(executor_wrapper); - // Add function call object in case it hasn't happened for this launch yet - if (overall_launch_counter <= slice_launch_counter) { - /* std::lock_guard guard(mut); */ - if (overall_launch_counter <= slice_launch_counter) { - function_calls.emplace_back(current_slices, false, *executor_wrapper); - overall_launch_counter = function_calls.size(); - function_calls[slice_launch_counter].post_when( - last_stream_launch_done, std::forward(f), std::forward(ts)...); - return; - } - } - - function_calls[slice_launch_counter].post_when( - last_stream_launch_done, std::forward(f), std::forward(ts)...); - return; - } - - /// Only meant to be accessed by the slice executors - template - hpx::lcos::future async(const size_t slice_launch_counter, F &&f, - Ts &&...ts) { - std::lock_guard guard(mut); - assert(slices_exhausted == true); - assert(executor_wrapper); - // Add function call object in case it hasn't happened for this launch yet - if (overall_launch_counter <= slice_launch_counter) { - /* std::lock_guard guard(mut); */ - if (overall_launch_counter <= slice_launch_counter) { - function_calls.emplace_back(current_slices, true, *executor_wrapper); - overall_launch_counter = function_calls.size(); - return function_calls[slice_launch_counter].async_when( - last_stream_launch_done, std::forward(f), std::forward(ts)...); - } - } - - return function_calls[slice_launch_counter].async_when( - last_stream_launch_done, std::forward(f), std::forward(ts)...); - } - /// Only meant to be accessed by the slice executors - 
template - hpx::lcos::shared_future wrap_async(const size_t slice_launch_counter, F &&f, - Ts &&...ts) { - std::lock_guard guard(mut); - assert(slices_exhausted == true); - assert(executor_wrapper); - // Add function call object in case it hasn't happened for this launch yet - if (overall_launch_counter <= slice_launch_counter) { - /* std::lock_guard guard(mut); */ - if (overall_launch_counter <= slice_launch_counter) { - function_calls.emplace_back(current_slices, true, *executor_wrapper); - overall_launch_counter = function_calls.size(); - return function_calls[slice_launch_counter].wrap_async( - last_stream_launch_done, std::forward(f), std::forward(ts)...); - } - } - - return function_calls[slice_launch_counter].wrap_async( - last_stream_launch_done, std::forward(f), std::forward(ts)...); - } - - bool slice_available(void) { - std::lock_guard guard(mut); - return !slices_exhausted; - } - - std::optional> request_executor_slice() { - std::lock_guard guard(mut); - if (!slices_exhausted) { - const size_t local_slice_id = ++current_slices; - if (local_slice_id == 1) { - // Cleanup leftovers from last run if any - // TODO still required? Should be clean here already - function_calls.clear(); - overall_launch_counter = 0; - std::lock_guard guard(buffer_mut); -#ifndef NDEBUG - for (const auto &buffer_entry : buffer_allocations) { - const auto &[buffer_pointer_any, buffer_size, - buffer_allocation_counter, valid, location_id, device_id] = - buffer_entry; - assert(!valid); - } -#endif - buffer_allocations.clear(); - buffer_allocations_map.clear(); - buffer_counter = 0; - - assert(executor_slices_alive == false); - assert(buffers_in_use == false); - executor_slices_alive = true; - buffers_in_use = false; - dealloc_counter = 0; - - if (mode == Aggregated_Executor_Modes::STRICT ) { - slices_full_promise = hpx::lcos::local::promise{}; - } - } - - // Create Executor Slice future -- that will be returned later - hpx::lcos::future ret_fut; - if (local_slice_id < max_slices) { - executor_slices.emplace_back(hpx::lcos::local::promise{}); - ret_fut = - executor_slices[local_slice_id - 1].get_future(); - } else { - launched_slices = current_slices; - ret_fut = hpx::make_ready_future(Executor_Slice{*this, - executor_slices.size(), launched_slices}); - } - - // Are we the first slice? If yes, add continuation set the - // Executor_Slice - // futures to ready if the launch conditions are met - if (local_slice_id == 1) { - // Redraw executor - assert(!executor_wrapper); - stream_pool::select_device>(gpu_id); - executor_wrapper.reset( - new stream_interface>(gpu_id)); - // Renew promise that all slices will be ready as the primary launch - // criteria... 
- hpx::lcos::shared_future fut; - if (mode == Aggregated_Executor_Modes::EAGER || - mode == Aggregated_Executor_Modes::ENDLESS) { - // Fallback launch condidtion: Launch as soon as the underlying stream - // is ready - /* auto slices_full_fut = slices_full_promise.get_future(); */ - stream_pool::select_device>(gpu_id); - auto exec_fut = (*executor_wrapper).get_future(); - /* auto fut = hpx::when_any(exec_fut, slices_full_fut); */ - fut = std::move(exec_fut); - } else { - auto slices_full_fut = slices_full_promise.get_shared_future(); - // Just use the slices launch condition - fut = std::move(slices_full_fut); - } - // Launch all executor slices within this continuation - current_continuation = fut.then([this](auto &&fut) { - std::lock_guard guard(mut); - slices_exhausted = true; - launched_slices = current_slices; - size_t id = 0; - for (auto &slice_promise : executor_slices) { - slice_promise.set_value( - Executor_Slice{*this, id, launched_slices}); - id++; - } - executor_slices.clear(); - }); - } - if (local_slice_id >= max_slices && - mode != Aggregated_Executor_Modes::ENDLESS) { - slices_exhausted = true; // prevents any more threads from entering - // before the continuation is launched - /* launched_slices = current_slices; */ - /* size_t id = 0; */ - /* for (auto &slice_promise : executor_slices) { */ - /* slice_promise.set_value( */ - /* Executor_Slice{*this, id, launched_slices}); */ - /* id++; */ - /* } */ - /* executor_slices.clear(); */ - if (mode == Aggregated_Executor_Modes::STRICT ) { - slices_full_promise.set_value(); // Trigger slices launch condition continuation - } - // that continuation will set all executor slices so far handed out to ready - } - return ret_fut; - } else { - // Return empty optional as failure - return std::optional>{}; - } - } - size_t launched_slices; - void reduce_usage_counter(void) { - /* std::lock_guard guard(mut); */ - assert(slices_exhausted == true); - assert(executor_wrapper); - assert(executor_slices_alive == true); - assert(launched_slices >= 1); - assert(current_slices >= 0 && current_slices <= launched_slices); - const size_t local_slice_id = --current_slices; - // Last slice goes out scope? 
- if (local_slice_id == 0) { - // Mark executor fit for reusage - std::lock_guard guard(mut); - executor_slices_alive = false; - if (!executor_slices_alive && !buffers_in_use) { - // Release executor - slices_exhausted = false; - executor_wrapper.reset(nullptr); - } - } - } - ~Aggregated_Executor(void) { - - assert(current_slices == 0); - assert(executor_slices_alive == false); - assert(buffers_in_use == false); - - if (mode != Aggregated_Executor_Modes::STRICT ) { - slices_full_promise.set_value(); // Trigger slices launch condition continuation - } - - // Cleanup leftovers from last run if any - function_calls.clear(); - overall_launch_counter = 0; -#ifndef NDEBUG - for (const auto &buffer_entry : buffer_allocations) { - const auto &[buffer_pointer_any, buffer_size, buffer_allocation_counter, - valid, location_id, device_id] = buffer_entry; - assert(!valid); - } -#endif - buffer_allocations.clear(); - buffer_allocations_map.clear(); - buffer_counter = 0; - - assert(buffer_allocations.empty()); - assert(buffer_allocations_map.empty()); - } - - Aggregated_Executor(const size_t number_slices, - Aggregated_Executor_Modes mode, const size_t gpu_id = 0) - : max_slices(number_slices), current_slices(0), slices_exhausted(false), - dealloc_counter(0), mode(mode), executor_slices_alive(false), - buffers_in_use(false), gpu_id(gpu_id), - executor_wrapper(nullptr), - current_continuation(hpx::make_ready_future()), - last_stream_launch_done(hpx::make_ready_future()) {} - // Not meant to be copied or moved - Aggregated_Executor(const Aggregated_Executor &other) = delete; - Aggregated_Executor &operator=(const Aggregated_Executor &other) = delete; - Aggregated_Executor(Aggregated_Executor &&other) = delete; - Aggregated_Executor &operator=(Aggregated_Executor &&other) = delete; -}; +using Aggregated_Executor_Modes + [[deprecated("Use cppuddle::kernel_aggregation::aggregated_executor_modes " + "from kernel_aggregation_interface.hpp instead")]] = + cppuddle::kernel_aggregation::aggregated_executor_modes; template -class Allocator_Slice { -private: - typename Aggregated_Executor::Executor_Slice &executor_reference; - Aggregated_Executor &executor_parent; - -public: - using value_type = T; - Allocator_Slice( - typename Aggregated_Executor::Executor_Slice &executor) - : executor_reference(executor), executor_parent(executor.parent) {} - template - explicit Allocator_Slice( - Allocator_Slice const &) noexcept {} - T *allocate(std::size_t n) { - T *data = executor_reference.template get(n); - return data; - } - void deallocate(T *p, std::size_t n) { - /* executor_reference.template mark_unused(p, n); */ - executor_parent.template mark_unused(p, n); - } - template - inline void construct(T *p, Args... args) noexcept { - // Do nothing here - we reuse the content of the last owner - } - void destroy(T *p) { - // Do nothing here - Contents will be destroyed when the buffer manager is - // destroyed, not before - } -}; -template -constexpr bool -operator==(Allocator_Slice const &, - Allocator_Slice const &) noexcept { - return false; -} -template -constexpr bool -operator!=(Allocator_Slice const &, - Allocator_Slice const &) noexcept { - return true; -} - -namespace hpx { namespace parallel { namespace execution { - // TODO Unfortunately does not work that way! 
Create trait that works for Executor Slices with - // compatible unlying executor types - /* template */ - /* struct is_one_way_executor::Executor_Slice> */ - /* : std::true_type */ - /* {}; */ - /* template */ - /* struct is_two_way_executor::Executor_Slice> */ - /* : std::true_type */ - /* {}; */ - -#if defined(HPX_HAVE_CUDA) || defined(HPX_HAVE_HIP) - // Workaround for the meantime: Manually create traits for compatible types: - template<> - struct is_one_way_executor::Executor_Slice> - : std::true_type - {}; - template<> - struct is_two_way_executor::Executor_Slice> - : std::true_type - {}; -#endif -}}} +using Allocator_Slice + [[deprecated("Use cppuddle::kernel_aggregation::allocator_slice " + "from kernel_aggregation_interface.hpp instead")]] = + cppuddle::kernel_aggregation::allocator_slice; -//=============================================================================== -//=============================================================================== -// Pool Strategy: +template +using Aggregated_Executor + [[deprecated("Use cppuddle::kernel_aggregation::aggregated_executor " + "from kernel_aggregation_interface.hpp instead")]] = + cppuddle::kernel_aggregation::aggregated_executor; template -class aggregation_pool { -public: - /// interface - template - static void init(size_t number_of_executors, size_t slices_per_executor, - Aggregated_Executor_Modes mode, size_t num_devices = 1) { - if (is_initialized) { - throw std::runtime_error( - std::string("Trying to initialize cppuddle aggregation pool twice") + - " Agg pool name: " + std::string(kernelname)); - } - if (num_devices > recycler::max_number_gpus) { - throw std::runtime_error( - std::string( - "Trying to initialize aggregation with more devices than the " - "maximum number of GPUs given at compiletime") + - " Agg pool name: " + std::string(kernelname)); - } - number_devices = num_devices; - for (size_t gpu_id = 0; gpu_id < number_devices; gpu_id++) { - - std::lock_guard guard(instance()[gpu_id].pool_mutex); - assert(instance()[gpu_id].aggregation_executor_pool.empty()); - for (int i = 0; i < number_of_executors; i++) { - instance()[gpu_id].aggregation_executor_pool.emplace_back(slices_per_executor, - mode, gpu_id); - } - instance()[gpu_id].slices_per_executor = slices_per_executor; - instance()[gpu_id].mode = mode; - } - is_initialized = true; - } - - /// Will always return a valid executor slice - static decltype(auto) request_executor_slice(void) { - if (!is_initialized) { - throw std::runtime_error( - std::string("Trying to use cppuddle aggregation pool without first calling init") + - " Agg poolname: " + std::string(kernelname)); - } - const size_t gpu_id = recycler::get_device_id(number_devices); - /* const size_t gpu_id = 1; */ - std::lock_guard guard(instance()[gpu_id].pool_mutex); - assert(!instance()[gpu_id].aggregation_executor_pool.empty()); - std::optional::Executor_Slice>> - ret; - size_t local_id = (instance()[gpu_id].current_interface) % - instance()[gpu_id].aggregation_executor_pool.size(); - ret = instance()[gpu_id].aggregation_executor_pool[local_id].request_executor_slice(); - // Expected case: current aggregation executor is free - if (ret.has_value()) { - return ret; - } - // current interface is bad -> find free one - size_t abort_counter = 0; - const size_t abort_number = instance()[gpu_id].aggregation_executor_pool.size() + 1; - do { - local_id = (++(instance()[gpu_id].current_interface)) % // increment interface - instance()[gpu_id].aggregation_executor_pool.size(); - ret = - 
instance()[gpu_id].aggregation_executor_pool[local_id].request_executor_slice(); - if (ret.has_value()) { - return ret; - } - abort_counter++; - } while (abort_counter <= abort_number); - // Everything's busy -> create new aggregation executor (growing pool) OR - // return empty optional - if (instance()[gpu_id].growing_pool) { - instance()[gpu_id].aggregation_executor_pool.emplace_back( - instance()[gpu_id].slices_per_executor, instance()[gpu_id].mode, gpu_id); - instance()[gpu_id].current_interface = - instance()[gpu_id].aggregation_executor_pool.size() - 1; - assert(instance()[gpu_id].aggregation_executor_pool.size() < 20480); - ret = instance()[gpu_id] - .aggregation_executor_pool[instance()[gpu_id].current_interface] - .request_executor_slice(); - assert(ret.has_value()); // fresh executor -- should always have slices - // available - } - return ret; - } - -private: - std::deque> aggregation_executor_pool; - std::atomic current_interface{0}; - size_t slices_per_executor; - Aggregated_Executor_Modes mode; - bool growing_pool{true}; - -private: - /// Required for dealing with adding elements to the deque of - /// aggregated_executors - recycler::aggregation_mutex_t pool_mutex; - /// Global access instance - static std::unique_ptr& instance(void) { - static std::unique_ptr pool_instances{ - new aggregation_pool[recycler::max_number_gpus]}; - return pool_instances; - } - static inline size_t number_devices = 1; - static inline bool is_initialized = false; - aggregation_pool() = default; - -public: - ~aggregation_pool() = default; - // Bunch of constructors we don't need - aggregation_pool(aggregation_pool const &other) = delete; - aggregation_pool &operator=(aggregation_pool const &other) = delete; - aggregation_pool(aggregation_pool &&other) = delete; - aggregation_pool &operator=(aggregation_pool &&other) = delete; -}; +using aggregation_pool + [[deprecated("Use cppuddle::kernel_aggregation::aggregation_pool " + "from kernel_aggregation_interface.hpp instead")]] = + cppuddle::kernel_aggregation::aggregation_pool; #endif diff --git a/include/aligned_buffer_util.hpp b/include/aligned_buffer_util.hpp index d36a994a..64497a9d 100644 --- a/include/aligned_buffer_util.hpp +++ b/include/aligned_buffer_util.hpp @@ -1,31 +1,31 @@ -// Copyright (c) 2020-2021 Gregor Daiß +// Copyright (c) 2024 Gregor Daiß // // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +// DEPRECATED: Do not use this file +// Only intended to make the old interface work a bit longer. 
+// See deprecation warnings for the new location of the functionality + #ifndef ALIGNED_BUFFER_UTIL_HPP #define ALIGNED_BUFFER_UTIL_HPP -#include "buffer_manager.hpp" -#include +#include "cppuddle/memory_recycling/aligned_recycling_allocators.hpp" namespace recycler { -namespace device_selection { -template -struct select_device_functor< - T, boost::alignment::aligned_allocator> { - void operator()(const size_t device_id) {} -}; -} // namespace device_selection template ::value, int> = 0> -using recycle_aligned = detail::recycle_allocator< - T, boost::alignment::aligned_allocator>; +using recycle_aligned + [[deprecated("Use from header aligned_recycling_allocators.hpp instead")]] = + cppuddle::memory_recycling::recycle_aligned; + template ::value, int> = 0> -using aggressive_recycle_aligned = detail::aggressive_recycle_allocator< - T, boost::alignment::aligned_allocator>; +using aggressive_recycle_aligned + [[deprecated("Use from header aligned_recycling_allocators.hpp instead")]] = + cppuddle::memory_recycling::aggressive_recycle_aligned; + } // namespace recycler #endif diff --git a/include/buffer_manager.hpp b/include/buffer_manager.hpp index 92a5f46b..69020e5b 100644 --- a/include/buffer_manager.hpp +++ b/include/buffer_manager.hpp @@ -1,938 +1,60 @@ -// Copyright (c) 2020-2023 Gregor Daiß +// Copyright (c) 2024 Gregor Daiß // // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +// DEPRECATED: Do not use this file +// Only intended to make the old interface work a bit longer. +// See deprecation warnings for the new location of the functionality + #ifndef BUFFER_MANAGER_HPP #define BUFFER_MANAGER_HPP -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -// Warn about suboptimal performance without correct HPX-aware allocators -#ifdef CPPUDDLE_HAVE_HPX -#ifndef CPPUDDLE_HAVE_HPX_AWARE_ALLOCATORS -#pragma message \ -"Warning: CPPuddle build with HPX support but without HPX-aware allocators enabled. \ -For better performance configure CPPuddle with CPPUDDLE_WITH_HPX_AWARE_ALLOCATORS=ON!" -#else -// include runtime to get HPX thread IDs required for the HPX-aware allocators -#include -#endif -#endif - -#if defined(CPPUDDLE_HAVE_HPX) && defined(CPPUDDLE_HAVE_HPX_MUTEX) -// For builds with The HPX mutex -#include -#endif - -#ifdef CPPUDDLE_HAVE_COUNTERS -#include -#if defined(CPPUDDLE_HAVE_HPX) -#include -#endif -#endif - -#include "../include/detail/config.hpp" +#include "cppuddle/common/config.hpp" +#include "cppuddle/memory_recycling/buffer_management_interface.hpp" +#include "cppuddle/memory_recycling/detail/buffer_management.hpp" +#include "cppuddle/memory_recycling/std_recycling_allocators.hpp" namespace recycler { -namespace device_selection { -template struct select_device_functor { - void operator()(const size_t device_id) { - if constexpr (max_number_gpus > 1) - throw std::runtime_error( - "Allocators used in Multi-GPU builds need explicit Multi-GPU support " - "(by having a select_device_functor overload"); - } -}; -template struct select_device_functor> { - void operator()(const size_t device_id) {} -}; -} // namespace device_selection - namespace detail { - - -class buffer_recycler { -public: -#if defined(CPPUDDLE_DEACTIVATE_BUFFER_RECYCLING) - -// Warn about suboptimal performance without recycling -#pragma message \ -"Warning: Building without buffer recycling! Use only for performance testing! 
\ -For better performance configure CPPuddle with CPPUDDLE_WITH_BUFFER_RECYCLING=ON!" - - template - static T *get(size_t number_elements, bool manage_content_lifetime = false, - std::optional location_hint = std::nullopt, - std::optional device_id = std::nullopt) { - - return Host_Allocator{}.allocate(number_elements); - } - /// Marks an buffer as unused and fit for reusage - template - static void mark_unused(T *p, size_t number_elements, - std::optional location_hint = std::nullopt, - std::optional device_id = std::nullopt) { - return Host_Allocator{}.deallocate(p, number_elements); - } -#else - /// Returns and allocated buffer of the requested size - this may be a reused - /// buffer - template - static T *get(size_t number_elements, bool manage_content_lifetime = false, - std::optional location_hint = std::nullopt, - std::optional device_id = std::nullopt) { - try { - return buffer_manager::get( - number_elements, manage_content_lifetime, location_hint, device_id); - } catch (const std::exception &exc) { - std::cerr << "ERROR: Encountered unhandled exception in cppuddle get: " << exc.what() << std::endl; - std::cerr << "Rethrowing exception... " << std::endl;; - throw; - } - } - /// Marks an buffer as unused and fit for reusage - template - static void mark_unused(T *p, size_t number_elements, - std::optional location_hint = std::nullopt, - std::optional device_id = std::nullopt) { - try { - return buffer_manager::mark_unused(p, number_elements, - location_hint, device_id); - } catch (const std::exception &exc) { - std::cerr << "ERROR: Encountered unhandled exception in cppuddle mark_unused: " << exc.what() << std::endl; - std::cerr << "Rethrowing exception... " << std::endl;; - throw; - } - } -#endif - template - static void register_allocator_counters_with_hpx(void) { -#ifdef CPPUDDLE_HAVE_COUNTERS - buffer_manager::register_counters_with_hpx(); -#else - std::cerr << "Warning: Trying to register allocator performance counters " - "with HPX but CPPuddle was built " - "without CPPUDDLE_WITH_COUNTERS -- operation will be ignored!" - << std::endl; -#endif - } - - /// Deallocate all buffers, no matter whether they are marked as used or not - static void clean_all() { - std::lock_guard guard(instance().callback_protection_mut); - for (const auto &clean_function : - instance().total_cleanup_callbacks) { - clean_function(); - } - } - /// Deallocated all currently unused buffer - static void clean_unused_buffers() { - std::lock_guard guard(instance().callback_protection_mut); - for (const auto &clean_function : - instance().partial_cleanup_callbacks) { - clean_function(); - } - } - /// Deallocate all buffers, no matter whether they are marked as used or not - static void finalize() { - std::lock_guard guard(instance().callback_protection_mut); - for (const auto &finalize_function : - instance().finalize_callbacks) { - finalize_function(); - } - } - - static void print_performance_counters() { -#ifdef CPPUDDLE_HAVE_COUNTERS - std::lock_guard guard(instance().callback_protection_mut); - for (const auto &print_function : - instance().print_callbacks) { - print_function(); - } -#else - std::cerr << "Warning: Trying to print allocator performance counters but CPPuddle was built " - "without CPPUDDLE_WITH_COUNTERS -- operation will be ignored!" 
- << std::endl; -#endif - } - - // Member variables and methods -private: - - /// Singleton instance access - static buffer_recycler& instance() { - static buffer_recycler singleton{}; - return singleton; - } - /// Callbacks for printing the performance counter data - std::list> print_callbacks; - /// Callbacks for buffer_manager finalize - each callback completely destroys - /// one buffer_manager - std::list> finalize_callbacks; - /// Callbacks for buffer_manager cleanups - each callback destroys all buffers within - /// one buffer_manager, both used and unsued - std::list> total_cleanup_callbacks; - /// Callbacks for partial buffer_manager cleanups - each callback deallocates - /// all unused buffers of a manager - std::list> partial_cleanup_callbacks; - /// default, private constructor - not automatically constructed due to the - /// deleted constructors - buffer_recycler() = default; - - mutex_t callback_protection_mut; - /// Add a callback function that gets executed upon cleanup and destruction - static void add_total_cleanup_callback(const std::function &func) { - std::lock_guard guard(instance().callback_protection_mut); - instance().total_cleanup_callbacks.push_back(func); - } - /// Add a callback function that gets executed upon partial (unused memory) - /// cleanup - static void add_partial_cleanup_callback(const std::function &func) { - std::lock_guard guard(instance().callback_protection_mut); - instance().partial_cleanup_callbacks.push_back(func); - } - /// Add a callback function that gets executed upon partial (unused memory) - /// cleanup - static void add_finalize_callback(const std::function &func) { - std::lock_guard guard(instance().callback_protection_mut); - instance().finalize_callbacks.push_back(func); - } - /// Add a callback function that gets executed upon partial (unused memory) - /// cleanup - static void add_print_callback(const std::function &func) { - std::lock_guard guard(instance().callback_protection_mut); - instance().print_callbacks.push_back(func); - } - -public: - ~buffer_recycler() = default; - - // Subclasses -private: - /// Memory Manager subclass to handle buffers a specific type - template class buffer_manager { - private: - // Tuple content: Pointer to buffer, buffer_size, location ID, Flag - // The flag at the end controls whether to buffer content is to be reused as - // well - using buffer_entry_type = std::tuple; - - - public: - /// Cleanup and delete this singleton - static void clean() { - assert(instance() && !is_finalized); - for (auto i = 0; i < number_instances * max_number_gpus; i++) { - std::lock_guard guard(instance()[i].mut); - instance()[i].clean_all_buffers(); - } - } - static void print_performance_counters() { - assert(instance() && !is_finalized); - for (auto i = 0; i < number_instances * max_number_gpus; i++) { - std::lock_guard guard(instance()[i].mut); - instance()[i].print_counters(); - } - } - static void finalize() { - assert(instance() && !is_finalized); - is_finalized = true; - for (auto i = 0; i < number_instances * max_number_gpus; i++) { - std::lock_guard guard(instance()[i].mut); - instance()[i].clean_all_buffers(); - } - instance().reset(); - } - /// Cleanup all buffers not currently in use - static void clean_unused_buffers_only() { - assert(instance() && !is_finalized); - for (auto i = 0; i < number_instances * max_number_gpus; i++) { - std::lock_guard guard(instance()[i].mut); - for (auto &buffer_tuple : instance()[i].unused_buffer_list) { - Host_Allocator alloc; - if (std::get<3>(buffer_tuple)) { - 
std::destroy_n(std::get<0>(buffer_tuple), std::get<1>(buffer_tuple)); - } - alloc.deallocate(std::get<0>(buffer_tuple), std::get<1>(buffer_tuple)); - } - instance()[i].unused_buffer_list.clear(); - } - } -#if defined(CPPUDDLE_HAVE_COUNTERS) && defined(CPPUDDLE_HAVE_HPX) - static size_t get_sum_number_recycling(bool reset) { - if (reset) - sum_number_recycling = 0; - return sum_number_recycling; - } - static size_t get_sum_number_allocation(bool reset) { - if (reset) - sum_number_allocation = 0; - return sum_number_allocation; - } - static size_t get_sum_number_creation(bool reset) { - if (reset) - sum_number_creation = 0; - return sum_number_creation; - } - static size_t get_sum_number_deallocation(bool reset) { - if (reset) - sum_number_deallocation = 0; - return sum_number_deallocation; - } - static size_t get_sum_number_wrong_hints(bool reset) { - if (reset) - sum_number_wrong_hints = 0; - return sum_number_wrong_hints; - } - static size_t get_sum_number_wrong_device_hints(bool reset) { - if (reset) - sum_number_wrong_hints = 0; - return sum_number_wrong_device_hints; - } - static size_t get_sum_number_bad_allocs(bool reset) { - if (reset) - sum_number_bad_allocs = 0; - return sum_number_bad_allocs; - } - - static void register_counters_with_hpx(void) { - std::string alloc_name = - boost::core::demangle(typeid(Host_Allocator).name()) + - std::string("_") + boost::core::demangle(typeid(T).name()); - hpx::performance_counters::install_counter_type( - std::string("/cppuddle/allocators/") + alloc_name + std::string("/number_recycling/"), - &get_sum_number_recycling, - "Number of allocations using a recycled buffer with this " - "allocator"); - hpx::performance_counters::install_counter_type( - std::string("/cppuddle/allocators/") + alloc_name + std::string("/number_allocations/"), - &get_sum_number_allocation, - "Number of allocations with this allocator"); - hpx::performance_counters::install_counter_type( - std::string("/cppuddle/allocators/") + alloc_name + std::string("/number_creations/"), - &get_sum_number_creation, - "Number of allocations not using a recycled buffer with this " - "allocator"); - hpx::performance_counters::install_counter_type( - std::string("/cppuddle/allocators/") + alloc_name + std::string("/number_deallocations/"), - &get_sum_number_deallocation, - "Number of deallocations yielding buffers to be recycled with this " - "allocator"); - hpx::performance_counters::install_counter_type( - std::string("/cppuddle/allocators/") + alloc_name + std::string("/number_wrong_hints/"), - &get_sum_number_wrong_hints, - "Number of wrong hints supplied to the dealloc method with this allocator"); - hpx::performance_counters::install_counter_type( - std::string("/cppuddle/allocators/") + alloc_name + std::string("/number_wrong_device_hints/"), - &get_sum_number_wrong_device_hints, - "Number of wrong device hints supplied to the dealloc method with this allocator"); - hpx::performance_counters::install_counter_type( - std::string("/cppuddle/allocators/") + alloc_name + std::string("/number_bad_allocs/"), - &get_sum_number_bad_allocs, - "Number of wrong bad allocs which triggered a cleanup of unused buffers"); - } -#endif - - /// Tries to recycle or create a buffer of type T and size number_elements. 
- static T *get(size_t number_of_elements, bool manage_content_lifetime, - std::optional location_hint = std::nullopt, - std::optional gpu_device_id = std::nullopt) { - init_callbacks_once(); - if (is_finalized) { - throw std::runtime_error("Tried allocation after finalization"); - } - assert(instance() && !is_finalized); - - size_t location_id = 0; - if (location_hint) { - location_id = *location_hint; - } - if (location_id >= number_instances) { - throw std::runtime_error("Tried to create buffer with invalid location_id [get]"); - } - size_t device_id = 0; - if (gpu_device_id) { - device_id = *gpu_device_id; - } - if (device_id >= max_number_gpus) { - throw std::runtime_error("Tried to create buffer with invalid device id [get]! " - "Is multigpu support enabled with the correct number " - "of GPUs?"); - } - - location_id = location_id + device_id * number_instances; - std::lock_guard guard(instance()[location_id].mut); - - -#ifdef CPPUDDLE_HAVE_COUNTERS - instance()[location_id].number_allocation++; - sum_number_allocation++; -#endif - // Check for unused buffers we can recycle: - for (auto iter = instance()[location_id].unused_buffer_list.begin(); - iter != instance()[location_id].unused_buffer_list.end(); iter++) { - auto tuple = *iter; - if (std::get<1>(tuple) == number_of_elements) { - instance()[location_id].unused_buffer_list.erase(iter); - - // handle the switch from aggressive to non aggressive reusage (or - // vice-versa) - if (manage_content_lifetime && !std::get<3>(tuple)) { - std::uninitialized_value_construct_n(std::get<0>(tuple), - number_of_elements); - std::get<3>(tuple) = true; - } else if (!manage_content_lifetime && std::get<3>(tuple)) { - std::destroy_n(std::get<0>(tuple), std::get<1>(tuple)); - std::get<3>(tuple) = false; - } - instance()[location_id].buffer_map.insert({std::get<0>(tuple), tuple}); -#ifdef CPPUDDLE_HAVE_COUNTERS - instance()[location_id].number_recycling++; - sum_number_recycling++; -#endif - return std::get<0>(tuple); - } - } - - // No unused buffer found -> Create new one and return it - try { - recycler::device_selection::select_device_functor{}( - device_id); - Host_Allocator alloc; - T *buffer = alloc.allocate(number_of_elements); - instance()[location_id].buffer_map.insert( - {buffer, std::make_tuple(buffer, number_of_elements, 1, - manage_content_lifetime)}); -#ifdef CPPUDDLE_HAVE_COUNTERS - instance()[location_id].number_creation++; - sum_number_creation++; -#endif - if (manage_content_lifetime) { - std::uninitialized_value_construct_n(buffer, number_of_elements); - } - return buffer; - } catch (std::bad_alloc &e) { - // not enough memory left! Cleanup and attempt again: - std::cerr - << "Not enough memory left. Cleaning up unused buffers now..." - << std::endl; - buffer_recycler::clean_unused_buffers(); - std::cerr << "Buffers cleaned! Try allocation again..." << std::endl; - - // If there still isn't enough memory left, the caller has to handle it - // We've done all we can in here - Host_Allocator alloc; - recycler::device_selection::select_device_functor{}( - device_id); - T *buffer = alloc.allocate(number_of_elements); - instance()[location_id].buffer_map.insert( - {buffer, std::make_tuple(buffer, number_of_elements, 1, - manage_content_lifetime)}); -#ifdef CPPUDDLE_HAVE_COUNTERS - instance()[location_id].number_creation++; - sum_number_creation++; - instance()[location_id].number_bad_alloc++; - sum_number_bad_allocs++; -#endif - std::cerr << "Second attempt allocation successful!" 
<< std::endl; - if (manage_content_lifetime) { - std::uninitialized_value_construct_n(buffer, number_of_elements); - } - return buffer; - } - } - - static void mark_unused(T *memory_location, size_t number_of_elements, - std::optional location_hint = std::nullopt, - std::optional device_hint = std::nullopt) { - if (is_finalized) - return; - assert(instance() && !is_finalized); - - size_t location_id = 0; - if (location_hint) { - location_id = *location_hint; - if (location_id >= number_instances) { - throw std::runtime_error( - "Buffer recylcer received invalid location hint [mark_unused]"); - } - } - size_t device_id = 0; - if (device_hint) { - device_id = *device_hint; - if (device_id >= max_number_gpus) { - throw std::runtime_error( - "Buffer recylcer received invalid devce hint [mark_unused]"); - } - } - - // Attempt 1 to find the correct bucket/location: Look at provided hint: - if (location_hint) { - size_t location_id = location_hint.value() + device_id * number_instances; - std::lock_guard guard(instance()[location_id].mut); - if (instance()[location_id].buffer_map.find(memory_location) != - instance()[location_id].buffer_map.end()) { -#ifdef CPPUDDLE_HAVE_COUNTERS - instance()[location_id].number_deallocation++; - sum_number_deallocation++; -#endif - auto it = instance()[location_id].buffer_map.find(memory_location); - assert(it != instance()[location_id].buffer_map.end()); - auto &tuple = it->second; - // sanity checks: - assert(std::get<1>(tuple) == number_of_elements); - // move to the unused_buffer list - instance()[location_id].unused_buffer_list.push_front(tuple); - instance()[location_id].buffer_map.erase(memory_location); - return; // Success - } - // hint was wrong -#ifdef CPPUDDLE_HAVE_COUNTERS - instance()[location_id].number_wrong_hints++; - sum_number_wrong_hints++; -#endif - } - // Failed to find buffer in the specified localtion/device! - // Attempt 2 - Look for buffer other locations on the same device... - for (size_t location_id = device_id * number_instances; - location_id < (device_id + 1) * number_instances; location_id++) { - if (location_hint) { - if (*location_hint + device_id * max_number_gpus == location_id) { - continue; // already tried this -> skip - } - } - std::lock_guard guard(instance()[location_id].mut); - if (instance()[location_id].buffer_map.find(memory_location) != - instance()[location_id].buffer_map.end()) { -#ifdef CPPUDDLE_HAVE_COUNTERS - instance()[location_id].number_deallocation++; - sum_number_deallocation++; -#endif - auto it = instance()[location_id].buffer_map.find(memory_location); - assert(it != instance()[location_id].buffer_map.end()); - auto &tuple = it->second; - // sanity checks: - assert(std::get<1>(tuple) == number_of_elements); - // move to the unused_buffer list - instance()[location_id].unused_buffer_list.push_front(tuple); - instance()[location_id].buffer_map.erase(memory_location); - return; // Success - } - } - // device hint was wrong -#ifdef CPPUDDLE_HAVE_COUNTERS - if (device_hint) { - sum_number_wrong_device_hints++; - } -#endif - // Failed to find buffer on the specified device! - // Attempt 3 - Look for buffer on other devices... 
- for (size_t local_device_id = 0; local_device_id < max_number_gpus; - local_device_id++) { - if (local_device_id == device_id) - continue; // aldready tried this device - - // Try hint localtion first yet again (though on different device) - if (location_hint) { - size_t location_id = location_hint.value() + local_device_id * number_instances; - std::lock_guard guard(instance()[location_id].mut); - if (instance()[location_id].buffer_map.find(memory_location) != - instance()[location_id].buffer_map.end()) { -#ifdef CPPUDDLE_HAVE_COUNTERS - instance()[location_id].number_deallocation++; - sum_number_deallocation++; -#endif - auto it = instance()[location_id].buffer_map.find(memory_location); - assert(it != instance()[location_id].buffer_map.end()); - auto &tuple = it->second; - // sanity checks: - assert(std::get<1>(tuple) == number_of_elements); - // move to the unused_buffer list - instance()[location_id].unused_buffer_list.push_front(tuple); - instance()[location_id].buffer_map.erase(memory_location); - return; // Success - } - } - // Failed - check all other localtions on device - for (size_t location_id = local_device_id * number_instances; - location_id < (local_device_id + 1) * number_instances; location_id++) { - if (location_hint) { - if (*location_hint + local_device_id * max_number_gpus == location_id) { - continue; // already tried this -> skip - } - } - std::lock_guard guard(instance()[location_id].mut); - if (instance()[location_id].buffer_map.find(memory_location) != - instance()[location_id].buffer_map.end()) { -#ifdef CPPUDDLE_HAVE_COUNTERS - instance()[location_id].number_deallocation++; - sum_number_deallocation++; -#endif - auto it = instance()[location_id].buffer_map.find(memory_location); - assert(it != instance()[location_id].buffer_map.end()); - auto &tuple = it->second; - // sanity checks: - assert(std::get<1>(tuple) == number_of_elements); - // move to the unused_buffer list - instance()[location_id].unused_buffer_list.push_front(tuple); - instance()[location_id].buffer_map.erase(memory_location); - return; // Success - } - } - } - // Buffer that is to be deleted is nowhere to be found - we looked everywhere! - // => - // Failure! Handle here... - - // TODO Throw exception instead in the futures, as soon as the recycler finalize is - // in all user codes - /* throw std::runtime_error("Tried to delete non-existing buffer"); */ - - // This is odd: Print warning -- however, might also happen with static - // buffers using these allocators IF the new finalize was not called. For - // now, print warning until all user-code is upgraded to the finalize method. - // This allows using current versions of cppuddle with older application code - std::cerr - << "Warning! Tried to delete non-existing buffer within CPPuddle!" - << std::endl; - std::cerr << "Did you forget to call recycler::finalize?" 
<< std::endl; - } - - private: - /// List with all buffers still in usage - std::unordered_map buffer_map{}; - /// List with all buffers currently not used - std::list unused_buffer_list{}; - /// Access control - mutex_t mut; -#ifdef CPPUDDLE_HAVE_COUNTERS - /// Performance counters - size_t number_allocation{0}, number_deallocation{0}, number_wrong_hints{0}, - number_recycling{0}, number_creation{0}, number_bad_alloc{0}; - - static inline std::atomic sum_number_allocation{0}, - sum_number_deallocation{0}, sum_number_wrong_hints{0}, - sum_number_wrong_device_hints{0}, sum_number_recycling{0}, - sum_number_creation{0}, sum_number_bad_allocs{0}; -#endif - /// default, private constructor - not automatically constructed due to - /// the deleted constructors - buffer_manager() = default; - buffer_manager& - operator=(buffer_manager const &other) = default; - buffer_manager& - operator=(buffer_manager &&other) = delete; - static std::unique_ptr& instance(void) { - static std::unique_ptr instances{ - new buffer_manager[number_instances * max_number_gpus]}; - return instances; - } - static void init_callbacks_once(void) { - assert(instance()); -#if defined(CPPUDDLE_HAVE_HPX) && defined(CPPUDDLE_HAVE_HPX_MUTEX) - static hpx::once_flag flag; - hpx::call_once(flag, []() { -#else - static std::once_flag flag; - std::call_once(flag, []() { -#endif - is_finalized = false; - buffer_recycler::add_total_cleanup_callback(clean); - buffer_recycler::add_partial_cleanup_callback( - clean_unused_buffers_only); - buffer_recycler::add_finalize_callback( - finalize); -#ifdef CPPUDDLE_HAVE_COUNTERS - buffer_recycler::add_print_callback( - print_performance_counters); -#endif - }); - } - static inline std::atomic is_finalized; - -#ifdef CPPUDDLE_HAVE_COUNTERS - void print_counters(void) { - if (number_allocation == 0) - return; - // Print performance counters - size_t number_cleaned = unused_buffer_list.size() + buffer_map.size(); - std::cout << "\nBuffer manager destructor for (Alloc: " - << boost::core::demangle(typeid(Host_Allocator).name()) << ", Type: " - << boost::core::demangle(typeid(T).name()) - << "):" << std::endl - << "--------------------------------------------------------------------" - << std::endl - << "--> Number of bad_allocs that triggered garbage " - "collection: " - << number_bad_alloc << std::endl - << "--> Number of buffers that got requested from this " - "manager: " - << number_allocation << std::endl - << "--> Number of times an unused buffer got recycled for a " - "request: " - << number_recycling << std::endl - << "--> Number of times a new buffer had to be created for a " - "request: " - << number_creation << std::endl - << "--> Number cleaned up buffers: " - " " - << number_cleaned << std::endl - << "--> Number wrong deallocation hints: " - " " - << number_wrong_hints << std::endl - << "--> Number of buffers that were marked as used upon " - "cleanup: " - << buffer_map.size() << std::endl - << "==> Recycle rate: " - " " - << static_cast(number_recycling) / number_allocation * - 100.0f - << "%" << std::endl; - } -#endif - - void clean_all_buffers(void) { -#ifdef CPPUDDLE_HAVE_COUNTERS - if (number_allocation == 0 && number_recycling == 0 && - number_bad_alloc == 0 && number_creation == 0 && - unused_buffer_list.empty() && buffer_map.empty()) { - return; - } -#endif - for (auto &buffer_tuple : unused_buffer_list) { - Host_Allocator alloc; - if (std::get<3>(buffer_tuple)) { - std::destroy_n(std::get<0>(buffer_tuple), std::get<1>(buffer_tuple)); - } - 
alloc.deallocate(std::get<0>(buffer_tuple), std::get<1>(buffer_tuple)); - } - for (auto &map_tuple : buffer_map) { - auto buffer_tuple = map_tuple.second; - Host_Allocator alloc; - if (std::get<3>(buffer_tuple)) { - std::destroy_n(std::get<0>(buffer_tuple), std::get<1>(buffer_tuple)); - } - alloc.deallocate(std::get<0>(buffer_tuple), std::get<1>(buffer_tuple)); - } - unused_buffer_list.clear(); - buffer_map.clear(); -#ifdef CPPUDDLE_HAVE_COUNTERS - number_allocation = 0; - number_recycling = 0; - number_bad_alloc = 0; - number_creation = 0; - number_wrong_hints = 0; -#endif - } - public: - ~buffer_manager() { - clean_all_buffers(); - } - - public: // Putting deleted constructors in public gives more useful error - // messages - // Bunch of constructors we don't need - buffer_manager( - buffer_manager const &other) = delete; - buffer_manager( - buffer_manager &&other) = delete; - }; - -public: - // Putting deleted constructors in public gives more useful error messages - // Bunch of constructors we don't need - buffer_recycler(buffer_recycler const &other) = delete; - buffer_recycler& operator=(buffer_recycler const &other) = delete; - buffer_recycler(buffer_recycler &&other) = delete; - buffer_recycler& operator=(buffer_recycler &&other) = delete; -}; - -template struct recycle_allocator { - using value_type = T; - using underlying_allocator_type = Host_Allocator; - static_assert(std::is_same_v); - const std::optional dealloc_hint; - const std::optional device_id; - -#ifndef CPPUDDLE_HAVE_HPX_AWARE_ALLOCATORS - recycle_allocator() noexcept - : dealloc_hint(std::nullopt), device_id(std::nullopt) {} - explicit recycle_allocator(size_t hint) noexcept - : dealloc_hint(std::nullopt), device_id(std::nullopt) {} - explicit recycle_allocator( - recycle_allocator const &other) noexcept - : dealloc_hint(std::nullopt), device_id(std::nullopt) {} - T *allocate(std::size_t n) { - T *data = buffer_recycler::get(n); - return data; - } - void deallocate(T *p, std::size_t n) { - buffer_recycler::mark_unused(p, n); - } -#else - recycle_allocator() noexcept - : dealloc_hint(hpx::get_worker_thread_num() % number_instances), device_id(0) {} - explicit recycle_allocator(const size_t device_id) noexcept - : dealloc_hint(hpx::get_worker_thread_num() % number_instances), device_id(device_id) {} - explicit recycle_allocator(const size_t device_i, const size_t location_id) noexcept - : dealloc_hint(location_id), device_id(device_id) {} - explicit recycle_allocator( - recycle_allocator const &other) noexcept - : dealloc_hint(other.dealloc_hint), device_id(other.device_id) {} - T *allocate(std::size_t n) { - T *data = buffer_recycler::get( - n, false, hpx::get_worker_thread_num() % number_instances, device_id); - return data; - } - void deallocate(T *p, std::size_t n) { - buffer_recycler::mark_unused(p, n, dealloc_hint, - device_id); - } -#endif - - template - inline void construct(T *p, Args... 
args) noexcept { - ::new (static_cast(p)) T(std::forward(args)...); - } - void destroy(T *p) { p->~T(); } -}; -template -constexpr bool -operator==(recycle_allocator const &, - recycle_allocator const &) noexcept { - if constexpr (std::is_same_v) - return true; - else - return false; -} -template -constexpr bool -operator!=(recycle_allocator const &, - recycle_allocator const &) noexcept { - if constexpr (std::is_same_v) - return false; - else - return true; -} - -/// Recycles not only allocations but also the contents of a buffer -template -struct aggressive_recycle_allocator { - using value_type = T; - using underlying_allocator_type = Host_Allocator; - static_assert(std::is_same_v); - const std::optional dealloc_hint; - const std::optional device_id; - -#ifndef CPPUDDLE_HAVE_HPX_AWARE_ALLOCATORS - aggressive_recycle_allocator() noexcept - : dealloc_hint(std::nullopt), device_id(std::nullopt) {} - explicit aggressive_recycle_allocator(size_t hint) noexcept - : dealloc_hint(std::nullopt), device_id(std::nullopt) {} - explicit aggressive_recycle_allocator( - aggressive_recycle_allocator const &) noexcept - : dealloc_hint(std::nullopt), device_id(std::nullopt) {} - T *allocate(std::size_t n) { - T *data = buffer_recycler::get( - n, true); // also initializes the buffer if it isn't reused - return data; - } - void deallocate(T *p, std::size_t n) { - buffer_recycler::mark_unused(p, n); - } -#else - aggressive_recycle_allocator() noexcept - : dealloc_hint(hpx::get_worker_thread_num() % number_instances), device_id(0) {} - explicit aggressive_recycle_allocator(const size_t device_id) noexcept - : dealloc_hint(hpx::get_worker_thread_num() % number_instances), device_id(device_id) {} - explicit aggressive_recycle_allocator(const size_t device_id, const size_t location_id) noexcept - : dealloc_hint(location_id), device_id(device_id) {} - explicit aggressive_recycle_allocator( - recycle_allocator const &other) noexcept - : dealloc_hint(other.dealloc_hint), device_id(other.device_id) {} - T *allocate(std::size_t n) { - T *data = buffer_recycler::get( - n, true, dealloc_hint, device_id); // also initializes the buffer - // if it isn't reused - return data; - } - void deallocate(T *p, std::size_t n) { - buffer_recycler::mark_unused(p, n, dealloc_hint, - device_id); - } -#endif - -#ifndef CPPUDDLE_DEACTIVATE_AGGRESSIVE_ALLOCATORS - template - inline void construct(T *p, Args... args) noexcept { - // Do nothing here - we reuse the content of the last owner - } - void destroy(T *p) { - // Do nothing here - Contents will be destroyed when the buffer manager is - // destroyed, not before - } -#else -// Warn about suboptimal performance without recycling -#pragma message \ -"Warning: Building without content reusage for aggressive allocators! \ -For better performance configure with CPPUDDLE_WITH_AGGRESSIVE_CONTENT_RECYCLING=ON !" - template - inline void construct(T *p, Args... 
args) noexcept { - ::new (static_cast(p)) T(std::forward(args)...); - } - void destroy(T *p) { p->~T(); } -#endif -}; - -template -constexpr bool -operator==(aggressive_recycle_allocator const &, - aggressive_recycle_allocator const &) noexcept { - if constexpr (std::is_same_v) - return true; - else - return false; +using buffer_recycler [[deprecated( + "Use buffer_interface from header " + "cppuddle/memory_recycling/detail/buffer_management.hpp instead")]] = + cppuddle::memory_recycling::detail::buffer_interface; } -template -constexpr bool -operator!=(aggressive_recycle_allocator const &, - aggressive_recycle_allocator const &) noexcept { - if constexpr (std::is_same_v) - return false; - else - return true; -} - -} // namespace detail template ::value, int> = 0> -using recycle_std = detail::recycle_allocator>; +using recycle_std + [[deprecated("Use from header std_recycling_allocators.hpp instead")]] = + cppuddle::memory_recycling::recycle_std; + template ::value, int> = 0> -using aggressive_recycle_std = - detail::aggressive_recycle_allocator>; +using aggressive_recycle_std + [[deprecated("Use from header std_recycling_allocators.hpp instead")]] = + cppuddle::memory_recycling::aggressive_recycle_std; -inline void print_performance_counters() { detail::buffer_recycler::print_performance_counters(); } +[[deprecated("Use cppuddle::memory_recycling::print_buffer_counters() instead")]] +inline void print_performance_counters() { + cppuddle::memory_recycling::print_buffer_counters(); +} /// Deletes all buffers (even ones still marked as used), delete the buffer /// managers and the recycler itself -inline void force_cleanup() { detail::buffer_recycler::clean_all(); } +[[deprecated("Use cppuddle::memory_recycling::force_buffer_cleanup() instead")]] +inline void force_cleanup() { cppuddle::memory_recycling::force_buffer_cleanup(); } /// Deletes all buffers currently marked as unused -inline void cleanup() { detail::buffer_recycler::clean_unused_buffers(); } +[[deprecated("Use cppuddle::memory_recycling::unused_buffer_cleanup() instead")]] +inline void cleanup() { cppuddle::memory_recycling::unused_buffer_cleanup(); } /// Deletes all buffers (even ones still marked as used), delete the buffer /// managers and the recycler itself. Disallows further usage. -inline void finalize() { detail::buffer_recycler::finalize(); } +[[deprecated("Use cppuddle::memory_recycling::finalize() instead")]] +inline void finalize() { cppuddle::memory_recycling::finalize(); } + +[[deprecated("Use cppuddle::max_number_gpus instead")]] constexpr auto max_number_gpus = + cppuddle::max_number_gpus; +[[deprecated("Use cppuddle::number_instances instead")]] constexpr auto number_instances = + cppuddle::number_instances; -} // end namespace recycler +} // namespace recycler #endif diff --git a/include/detail/config.hpp b/include/cppuddle/common/config.hpp similarity index 95% rename from include/detail/config.hpp rename to include/cppuddle/common/config.hpp index 2a06b1af..c9a5f736 100644 --- a/include/detail/config.hpp +++ b/include/cppuddle/common/config.hpp @@ -1,4 +1,4 @@ -// Copyright (c) 2023-2023 Gregor Daiß +// Copyright (c) 2023-2024 Gregor Daiß // // Distributed under the Boost Software License, Version 1.0. 
(See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) @@ -28,7 +28,7 @@ For better performance configure CPPuddle with CPPUDDLE_WITH_HPX_AWARE_ALLOCATOR #endif #endif -namespace recycler { +namespace cppuddle { #if defined(CPPUDDLE_HAVE_HPX) && defined(CPPUDDLE_HAVE_HPX_MUTEX) using mutex_t = hpx::spinlock_no_backoff; @@ -67,6 +67,6 @@ inline size_t get_device_id(const size_t number_gpus) { #endif } -} // end namespace recycler +} // end namespace cppuddle #endif diff --git a/include/cppuddle/executor_recycling/detail/executor_pools_management.hpp b/include/cppuddle/executor_recycling/detail/executor_pools_management.hpp new file mode 100644 index 00000000..6a89025b --- /dev/null +++ b/include/cppuddle/executor_recycling/detail/executor_pools_management.hpp @@ -0,0 +1,421 @@ +// Copyright (c) 2020-2024 Gregor Daiß +// +// Distributed under the Boost Software License, Version 1.0. (See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#ifndef EXECUTOR_POOLS_MANAGEMENT_HPP +#define EXECUTOR_POOLS_MANAGEMENT_HPP + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cppuddle/common/config.hpp" + +// Need to cuda/hip definitions for default params when NOT +// drawing from an executor pool +#if defined(CPPUDDLE_DEACTIVATE_EXECUTOR_RECYCLING) +#include +#if defined(HPX_HAVE_CUDA) || defined(HPX_HAVE_HIP) +#include +#endif +#endif + +// Redefintion required for non-recycling executors +// Without it, default constructing the executors (independent) would not work +#if defined(CPPUDDLE_DEACTIVATE_EXECUTOR_RECYCLING) +// Do only define if Kokkos is not found +#ifndef KOKKOS_ENABLE_SERIAL +namespace hpx { namespace kokkos { +enum class execution_space_mode { global, independent }; +}} +#endif +#endif + +namespace cppuddle { +namespace executor_recycling { +namespace detail { + +/// Turns a std::array_mutex into an scoped lock +template +auto make_scoped_lock_from_array(mutex_array_t& mutexes) +{ + return std::apply([](auto&... mutexes) { return std::scoped_lock{mutexes...}; }, + mutexes); +} + +template class round_robin_pool_impl { +private: + std::deque pool{}; + std::vector ref_counters{}; + size_t current_interface{0}; + +public: + template + round_robin_pool_impl(size_t number_of_executors, Ts... 
executor_args) { + ref_counters.reserve(number_of_executors); + for (int i = 0; i < number_of_executors; i++) { + pool.emplace_back(executor_args...); + ref_counters.emplace_back(0); + } + } + // return a tuple with the interface and its index (to release it later) + std::tuple get_interface() { + assert(!(pool.empty())); + size_t last_interface = current_interface; + current_interface = (current_interface + 1) % pool.size(); + ref_counters[last_interface]++; + std::tuple ret(pool[last_interface], last_interface); + return ret; + } + void release_interface(size_t index) { ref_counters[index]--; } + bool interface_available(size_t load_limit) { + return *(std::min_element(std::begin(ref_counters), + std::end(ref_counters))) < load_limit; + } + size_t get_current_load() { + return *( + std::min_element(std::begin(ref_counters), std::end(ref_counters))); + } + // TODO Remove + /* size_t get_next_device_id() { */ + /* return 0; // single gpu pool */ + /* } */ +}; + +template class priority_pool_impl { +private: + std::deque pool{}; + std::vector ref_counters{}; // Ref counters + std::vector priorities{}; // Ref counters +public: + template + priority_pool_impl(size_t number_of_executors, Ts... executor_args) { + ref_counters.reserve(number_of_executors); + priorities.reserve(number_of_executors); + for (auto i = 0; i < number_of_executors; i++) { + pool.emplace_back(executor_args...); + ref_counters.emplace_back(0); + priorities.emplace_back(i); + } + } + // return a tuple with the interface and its index (to release it later) + std::tuple get_interface() { + auto &interface = pool[priorities[0]]; + ref_counters[priorities[0]]++; + std::tuple ret(interface, priorities[0]); + std::make_heap(std::begin(priorities), std::end(priorities), + [this](const size_t &first, const size_t &second) -> bool { + return ref_counters[first] > ref_counters[second]; + }); + return ret; + } + void release_interface(size_t index) { + ref_counters[index]--; + std::make_heap(std::begin(priorities), std::end(priorities), + [this](const size_t &first, const size_t &second) -> bool { + return ref_counters[first] > ref_counters[second]; + }); + } + bool interface_available(size_t load_limit) { + return ref_counters[priorities[0]] < load_limit; + } + size_t get_current_load() { return ref_counters[priorities[0]]; } + // TODO remove + /* size_t get_next_device_id() { */ + /* return 0; // single gpu pool */ + /* } */ +}; + +/// Access/Concurrency Control for executor pool implementation +class executor_pool { +public: + template + static void init(size_t number_of_executors, Ts ... executor_args) { + executor_pool_implementation::init(number_of_executors, + executor_args...); + } + template + static void init_all_executor_pools(size_t number_of_executors, Ts ... executor_args) { + executor_pool_implementation::init_all_executor_pools(number_of_executors, + executor_args...); + } + template + static void init_executor_pool(size_t pool_id, size_t number_of_executors, Ts ... 
executor_args) { + executor_pool_implementation::init_executor_pool(pool_id, number_of_executors, + executor_args...); + } + template static void cleanup() { + executor_pool_implementation::cleanup(); + } + template + static std::tuple get_interface(const size_t gpu_id) { + return executor_pool_implementation::get_interface(gpu_id); + } + template + static void release_interface(size_t index, const size_t gpu_id) noexcept { + executor_pool_implementation::release_interface(index, + gpu_id); + } + template + static bool interface_available(size_t load_limit, const size_t gpu_id) noexcept { + return executor_pool_implementation::interface_available( + load_limit, gpu_id); + } + template + static size_t get_current_load(const size_t gpu_id = 0) noexcept { + return executor_pool_implementation::get_current_load( + gpu_id); + } + template + static size_t get_next_device_id(const size_t number_gpus) noexcept { + // TODO add round robin and min strategy + return cppuddle::get_device_id(number_gpus); + } + + template + static void set_device_selector(std::function select_gpu_function) { + executor_pool_implementation::set_device_selector(select_gpu_function); + } + + template + static void select_device(size_t gpu_id) { + executor_pool_implementation::select_device(gpu_id); + } + +private: + executor_pool() = default; + +private: + template class executor_pool_implementation { + public: + /// Deprecated! Use init_on_all_gpu or init_on_gpu + template + static void init(size_t number_of_executors, Ts ... executor_args) { + /* static_assert(sizeof...(Ts) == sizeof...(Ts) && cppuddle::max_number_gpus == 1, */ + /* "deprecated executor_pool::init does not support multigpu"); */ + auto guard = make_scoped_lock_from_array(instance().gpu_mutexes); + instance().executorpools.emplace_back(number_of_executors, executor_args...); + assert(instance().executorpools.size() <= cppuddle::max_number_gpus); + } + + /// Multi-GPU init where executors / interfaces on all GPUs are initialized with the same arguments + template + static void init_all_executor_pools(size_t number_of_executors, Ts ... executor_args) { + auto guard = make_scoped_lock_from_array(instance().gpu_mutexes); + if (number_of_executors > 0) { + for (size_t gpu_id = 0; gpu_id < cppuddle::max_number_gpus; gpu_id++) { + instance().select_gpu_function(gpu_id); + instance().executorpools.emplace_back(number_of_executors, + executor_args...); + } + } + assert(instance().executorpools.size() <= cppuddle::max_number_gpus); + } + + /// Per-GPU init allowing for different init parameters depending on the GPU + /// (useful for executor that expect an GPU-id during construction) + template + static void init_executor_pool(size_t gpu_id, size_t number_of_executors, Ts ... executor_args) { + auto guard = make_scoped_lock_from_array(instance().gpu_mutexes); + if (number_of_executors > 0) { + instance().select_gpu_function(gpu_id); + instance().executorpools.emplace_back(number_of_executors, + executor_args...); + } + assert(instance().executorpools.size() <= cppuddle::max_number_gpus); + } + + // TODO add/rename into finalize? 
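+    // Hypothetical usage sketch (not part of this header; names such as
+    // my_executor_t and my_kernel are assumptions): user code would typically
+    // initialize the pools once at startup and then draw executors through the
+    // RAII executor_interface defined further below, e.g.
+    //
+    //   using pool_t = round_robin_pool_impl<my_executor_t>;
+    //   executor_pool::init_all_executor_pools<my_executor_t, pool_t>(
+    //       32 /* executors per GPU */);
+    //   {
+    //     executor_interface<my_executor_t, pool_t> exec(0 /* gpu_id */);
+    //     exec.async_execute(my_kernel, my_args).get();
+    //   } // executor is released back to its pool here
+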
+ static void cleanup() { + auto guard = make_scoped_lock_from_array(instance().gpu_mutexes); + assert(instance().executorpools.size() == cppuddle::max_number_gpus); + instance().executorpools.clear(); + } + + static std::tuple get_interface(const size_t gpu_id = 0) { + std::lock_guard guard(instance().gpu_mutexes[gpu_id]); + assert(gpu_id < instance().executorpools.size()); + return instance().executorpools[gpu_id].get_interface(); + } + static void release_interface(size_t index, const size_t gpu_id = 0) { + std::lock_guard guard(instance().gpu_mutexes[gpu_id]); + assert(gpu_id < instance().executorpools.size()); + instance().executorpools[gpu_id].release_interface(index); + } + static bool interface_available(size_t load_limit, const size_t gpu_id = 0) { + std::lock_guard guard(instance().gpu_mutexes[gpu_id]); + assert(gpu_id < instance().executorpools.size()); + return instance().executorpools[gpu_id].interface_available(load_limit); + } + static size_t get_current_load(const size_t gpu_id = 0) { + std::lock_guard guard(instance().gpu_mutexes[gpu_id]); + assert(gpu_id < instance().executorpools.size()); + return instance().executorpools[gpu_id].get_current_load(); + } + // TODO deprecated! Remove... + /* static size_t get_next_device_id(const size_t gpu_id = 0) { */ + /* std::lock_guard guard(instance().gpu_mutexes[gpu_id]); */ + /* assert(instance().executorpools.size() == cppuddle::max_number_gpus); */ + /* return instance().executorpools[gpu_id].get_next_device_id(); */ + /* } */ + + static void set_device_selector(std::function select_gpu_function) { + auto guard = make_scoped_lock_from_array(instance().gpu_mutexes); + instance().select_gpu_function = select_gpu_function; + } + + static void select_device(size_t gpu_id) { + instance().select_gpu_function(gpu_id); + } + + private: + executor_pool_implementation() = default; + cppuddle::mutex_t pool_mut{}; + std::function select_gpu_function = [](size_t gpu_id) { + // By default no multi gpu support + assert(cppuddle::max_number_gpus == 1 || instance().executorpools.size() == 1); + assert(gpu_id == 0); + }; + + std::deque executorpools{}; + std::array gpu_mutexes; + + static executor_pool_implementation& instance(void) { + static executor_pool_implementation pool_instance{}; + return pool_instance; + } + + public: + ~executor_pool_implementation() = default; + // Bunch of constructors we don't need + executor_pool_implementation(executor_pool_implementation const &other) = + delete; + executor_pool_implementation & + operator=(executor_pool_implementation const &other) = delete; + executor_pool_implementation(executor_pool_implementation &&other) = delete; + executor_pool_implementation & + operator=(executor_pool_implementation &&other) = delete; + }; + +public: + ~executor_pool() = default; + // Bunch of constructors we don't need + executor_pool(executor_pool const &other) = delete; + executor_pool &operator=(executor_pool const &other) = delete; + executor_pool(executor_pool &&other) = delete; + executor_pool &operator=(executor_pool &&other) = delete; +}; + +#if defined(CPPUDDLE_DEACTIVATE_EXECUTOR_RECYCLING) + +// Warn about suboptimal performance without recycling +#pragma message \ +"Warning: Building without executor recycling! Use only for performance testing! \ +For better performance configure CPPuddle with CPPUDDLE_WITH_EXECUTOR_RECYCLING=ON!" + +/// Slow version of the executor_interface that does not draw its +/// executors (Interface) from the pool but creates them instead. 
+/// Only meant for performance comparisons and only works with cuda/kokkos executors +template class executor_interface { +public: + + template + explicit executor_interface(size_t gpu_id, + std::enable_if_t::value, size_t> = 0) + : gpu_id(gpu_id), interface(gpu_id) {} + template + explicit executor_interface(std::enable_if_t::value, size_t> = 0) + : gpu_id(gpu_id), interface(hpx::kokkos::execution_space_mode::independent) {} + + executor_interface(const executor_interface &other) = delete; + executor_interface &operator=(const executor_interface &other) = delete; + executor_interface(executor_interface &&other) = delete; + executor_interface &operator=(executor_interface &&other) = delete; + ~executor_interface() { + } + + template + inline decltype(auto) post(F &&f, Ts &&... ts) { + return interface.post(std::forward(f), std::forward(ts)...); + } + + template + inline decltype(auto) async_execute(F &&f, Ts &&... ts) { + return interface.async_execute(std::forward(f), std::forward(ts)...); + } + + inline decltype(auto) get_future() { + return interface.get_future(); + } + + // allow implict conversion + operator Interface &() { // NOLINT + return interface; + } + +private: + size_t gpu_id; + +public: + Interface interface; +}; +#else +/// Stream interface for RAII purposes +/// Draws executor from the executor pool and releases it upon +/// destruction +template class executor_interface { +public: + explicit executor_interface(size_t gpu_id) + : t(executor_pool::get_interface(gpu_id)), + interface(std::get<0>(t)), interface_index(std::get<1>(t)), gpu_id(gpu_id) {} + + executor_interface(const executor_interface &other) = delete; + executor_interface &operator=(const executor_interface &other) = delete; + executor_interface(executor_interface &&other) = delete; + executor_interface &operator=(executor_interface &&other) = delete; + ~executor_interface() { + executor_pool::release_interface(interface_index, gpu_id); + } + + template + inline decltype(auto) post(F &&f, Ts &&... ts) { + return interface.post(std::forward(f), std::forward(ts)...); + } + + template + inline decltype(auto) async_execute(F &&f, Ts &&... ts) { + return interface.async_execute(std::forward(f), std::forward(ts)...); + } + + inline decltype(auto) get_future() { + return interface.get_future(); + } + + // allow implict conversion + operator Interface &() { // NOLINT + return interface; + } + +private: + std::tuple t; + size_t interface_index; + size_t gpu_id; + +public: + Interface &interface; +}; +#endif + +} // namespace detail +} // namespace executor_recycling +} // namespace cppuddle + +#endif diff --git a/include/cppuddle/executor_recycling/executor_pools_interface.hpp b/include/cppuddle/executor_recycling/executor_pools_interface.hpp new file mode 100644 index 00000000..49a6d42d --- /dev/null +++ b/include/cppuddle/executor_recycling/executor_pools_interface.hpp @@ -0,0 +1,32 @@ +// Copyright (c) 2024 Gregor Daiß +// +// Distributed under the Boost Software License, Version 1.0. 
(See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#ifndef EXECUTOR_POOLS_INTERFACE_HPP +#define EXECUTOR_POOLS_INTERFACE_HPP + +#include "cppuddle/executor_recycling/detail/executor_pools_management.hpp" + +namespace cppuddle { +namespace executor_recycling { + +template +using round_robin_pool_impl = + detail::round_robin_pool_impl; + +template +using priority_pool_impl = + detail::priority_pool_impl; + +using executor_pool = + detail::executor_pool; + +template +using executor_interface = + detail::executor_interface; + +} // end namespace executor_recycling +} // end namespace cppuddle + +#endif diff --git a/include/cppuddle/kernel_aggregation/detail/aggregation_executor_pools.hpp b/include/cppuddle/kernel_aggregation/detail/aggregation_executor_pools.hpp new file mode 100644 index 00000000..b9d456cc --- /dev/null +++ b/include/cppuddle/kernel_aggregation/detail/aggregation_executor_pools.hpp @@ -0,0 +1,134 @@ +// Copyright (c) 2022-2024 Gregor Daiß +// +// Distributed under the Boost Software License, Version 1.0. (See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include "cppuddle/kernel_aggregation/detail/aggregation_executors_and_allocators.hpp" + +#ifndef AGGREGATION_EXECUTOR_POOL_HPP +#define AGGREGATION_EXECUTOR_POOL_HPP + +namespace cppuddle { +namespace kernel_aggregation { +namespace detail { + +template +class aggregation_pool { +public: + /// interface + template + static void init(size_t number_of_executors, size_t slices_per_executor, + aggregated_executor_modes mode, size_t num_devices = 1) { + if (is_initialized) { + throw std::runtime_error( + std::string("Trying to initialize cppuddle aggregation pool twice") + + " Agg pool name: " + std::string(kernelname)); + } + if (num_devices > cppuddle::max_number_gpus) { + throw std::runtime_error( + std::string( + "Trying to initialize aggregation with more devices than the " + "maximum number of GPUs given at compiletime") + + " Agg pool name: " + std::string(kernelname)); + } + number_devices = num_devices; + for (size_t gpu_id = 0; gpu_id < number_devices; gpu_id++) { + + std::lock_guard guard(instance()[gpu_id].pool_mutex); + assert(instance()[gpu_id].aggregation_executor_pool.empty()); + for (int i = 0; i < number_of_executors; i++) { + instance()[gpu_id].aggregation_executor_pool.emplace_back(slices_per_executor, + mode, gpu_id); + } + instance()[gpu_id].slices_per_executor = slices_per_executor; + instance()[gpu_id].mode = mode; + } + is_initialized = true; + } + + /// Will always return a valid executor slice + static decltype(auto) request_executor_slice(void) { + if (!is_initialized) { + throw std::runtime_error( + std::string("Trying to use cppuddle aggregation pool without first calling init") + + " Agg poolname: " + std::string(kernelname)); + } + const size_t gpu_id = cppuddle::get_device_id(number_devices); + /* const size_t gpu_id = 1; */ + std::lock_guard guard(instance()[gpu_id].pool_mutex); + assert(!instance()[gpu_id].aggregation_executor_pool.empty()); + std::optional::executor_slice>> + ret; + size_t local_id = (instance()[gpu_id].current_interface) % + instance()[gpu_id].aggregation_executor_pool.size(); + ret = instance()[gpu_id].aggregation_executor_pool[local_id].request_executor_slice(); + // Expected case: current aggregation executor is free + if (ret.has_value()) { + return ret; + } + // current interface is bad -> find free one + size_t abort_counter = 0; + const size_t abort_number = 
instance()[gpu_id].aggregation_executor_pool.size() + 1; + do { + local_id = (++(instance()[gpu_id].current_interface)) % // increment interface + instance()[gpu_id].aggregation_executor_pool.size(); + ret = + instance()[gpu_id].aggregation_executor_pool[local_id].request_executor_slice(); + if (ret.has_value()) { + return ret; + } + abort_counter++; + } while (abort_counter <= abort_number); + // Everything's busy -> create new aggregation executor (growing pool) OR + // return empty optional + if (instance()[gpu_id].growing_pool) { + instance()[gpu_id].aggregation_executor_pool.emplace_back( + instance()[gpu_id].slices_per_executor, instance()[gpu_id].mode, gpu_id); + instance()[gpu_id].current_interface = + instance()[gpu_id].aggregation_executor_pool.size() - 1; + assert(instance()[gpu_id].aggregation_executor_pool.size() < 20480); + ret = instance()[gpu_id] + .aggregation_executor_pool[instance()[gpu_id].current_interface] + .request_executor_slice(); + assert(ret.has_value()); // fresh executor -- should always have slices + // available + } + return ret; + } + +private: + std::deque> aggregation_executor_pool; + std::atomic current_interface{0}; + size_t slices_per_executor; + aggregated_executor_modes mode; + bool growing_pool{true}; + +private: + /// Required for dealing with adding elements to the deque of + /// aggregated_executors + aggregation_mutex_t pool_mutex; + /// Global access instance + static std::unique_ptr& instance(void) { + static std::unique_ptr pool_instances{ + new aggregation_pool[cppuddle::max_number_gpus]}; + return pool_instances; + } + static inline size_t number_devices = 1; + static inline bool is_initialized = false; + aggregation_pool() = default; + +public: + ~aggregation_pool() = default; + // Bunch of constructors we don't need + aggregation_pool(aggregation_pool const &other) = delete; + aggregation_pool &operator=(aggregation_pool const &other) = delete; + aggregation_pool(aggregation_pool &&other) = delete; + aggregation_pool &operator=(aggregation_pool &&other) = delete; +}; + +} // namespace detail +} // namespace kernel_aggregation +} // namespace cppuddle + +#endif diff --git a/include/cppuddle/kernel_aggregation/detail/aggregation_executors_and_allocators.hpp b/include/cppuddle/kernel_aggregation/detail/aggregation_executors_and_allocators.hpp new file mode 100644 index 00000000..dfc76622 --- /dev/null +++ b/include/cppuddle/kernel_aggregation/detail/aggregation_executors_and_allocators.hpp @@ -0,0 +1,1061 @@ +// Copyright (c) 2022-2024 Gregor Daiß +// +// Distributed under the Boost Software License, Version 1.0. (See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#ifndef AGGREGATION_EXECUTOR_AND_ALLOCATOR_HPP +#define AGGREGATION_EXECUTOR_AND_ALLOCATOR_HPP + +#ifndef CPPUDDLE_HAVE_HPX +#error "Work aggregation allocators/executors require CPPUDDLE_WITH_HPX=ON" +#endif + +#include +// When defined, CPPuddle will run more checks +// about the order of aggregated method calls. 
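+// (When the checks are active, aggregation slices that pass mismatching
+// argument types or values to the same aggregated call are reported at
+// runtime.)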
+// Best defined before including this header when needed +// (hence commented out here) +//#define DEBUG_AGGREGATION_CALLS 1 + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#if defined(HPX_HAVE_CUDA) || defined(HPX_HAVE_HIP) +// required for defining type traits using cuda executor as underlying +// aggregation executors +#include +#endif + +#include +#include + +#include "cppuddle/common/config.hpp" +// get direct access to the buffer manangment +#include "cppuddle/memory_recycling/detail/buffer_management.hpp" +// get normal access to the executor pools +#include "cppuddle/executor_recycling/executor_pools_interface.hpp"" + +#ifndef CPPUDDLE_HAVE_HPX_MUTEX +#pragma message \ + "Work aggregation will use hpx::mutex internally, despite CPPUDDLE_WITH_HPX_MUTEX=OFF" +#pragma message \ + "Consider using CPPUDDLE_WITH_HPX_MUTEX=ON, to make the rest of CPPuddle also use hpx::mutex" +#endif +namespace cppuddle { +namespace kernel_aggregation { +namespace detail { + using aggregation_mutex_t = hpx::mutex; + +//=============================================================================== +//=============================================================================== +// Helper functions/classes + +/// Constructs a tuple with copies (to store temporaries in aggregated function +/// calls) yet also supporting references (on the users own risk...) +template +std::tuple make_tuple_supporting_references(Ts &&...ts) { + return std::tuple{std::forward(ts)...}; +} + +/// Print some specific values that we can, but don't bother for most types +/// (such as vector) +template std::string print_if_possible(T val) { + if constexpr (std::is_convertible_v) { + return val; + } else if constexpr (std::is_integral_v || std::is_floating_point_v) { + return std::to_string(val); + } else if constexpr (std::is_pointer_v) { + // Pretty printing pointer sort of only works well with %p + // TODO Try using std::format as soon as we can move to C++20 + std::unique_ptr debug_string(new char[128]()); + snprintf(debug_string.get(), 128, "%p", val); + return std::string(debug_string.get()); + } else { + return std::string("cannot print value"); + } +} + +/// Helper class for the helper class that prints tuples -- do not use this +/// directly +template +void print_tuple(const TupType &_tup, std::index_sequence) { + (..., (hpx::cout << (I == 0 ? "" : ", ") + << print_if_possible(std::get(_tup)))); +} + +/// Helper class for printing tuples (first component should be a function +/// pointer, remaining components the function arguments) +template void print_tuple(const std::tuple &_tup) { + // Use pointer and sprintf as boost::format refused to NOT cast the pointer + // address to 1... 
+ // TODO Try using std::format as soon as we can move to C++20 + std::unique_ptr debug_string(new char[128]()); + snprintf(debug_string.get(), 128, "Function address: %p -- Arguments: (", + std::get<0>(_tup)); + hpx::cout << debug_string.get(); + print_tuple(_tup, std::make_index_sequence()); + hpx::cout << ")"; +} + +//=============================================================================== +//=============================================================================== +template +void exec_post_wrapper(Executor & exec, F &&f, Ts &&...ts) { + hpx::apply(exec, std::forward(f), std::forward(ts)...); +} + +template +hpx::lcos::future exec_async_wrapper(Executor & exec, F &&f, Ts &&...ts) { + return hpx::async(exec, std::forward(f), std::forward(ts)...); +} + +/// Manages the launch conditions for aggregated function calls +/// type/value-errors +/** Launch conditions: All slice executors must have called the same function + * (tracked by future all_slices_ready) + * AND + * Previous aggregated_function_call on the same Executor must have been + * launched (tracked by future stream_future) + * All function calls received from the slice executors are checked if they + * match the first one in both types and values (throws exception otherwise) + */ + +template class aggregated_function_call { +private: + std::atomic slice_counter = 0; + + /// Promise to be set when all slices have visited this function call + /* hpx::lcos::local::promise slices_ready_promise; */ + /// Tracks if all slices have visited this function call + /* hpx::lcos::future all_slices_ready = slices_ready_promise.get_future(); */ + /// How many slices can we expect? + const size_t number_slices; + const bool async_mode; + + Executor &underlying_executor; + +#if !(defined(NDEBUG)) && defined(DEBUG_AGGREGATION_CALLS) +#pragma message \ + "Building slow work aggegator build with additional runtime checks! Build with NDEBUG defined for fast build..." 
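+  // Note: this block is only compiled when NDEBUG is not set and
+  // DEBUG_AGGREGATION_CALLS is defined before this header is included,
+  // e.g. (hypothetical user code):
+  //
+  //   #define DEBUG_AGGREGATION_CALLS 1
+  //   #include "cppuddle/kernel_aggregation/detail/aggregation_executors_and_allocators.hpp"
+  //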
+ /// Stores the function call of the first slice as reference for error + /// checking + std::any function_tuple; + /// Stores the string of the first function call for debug output + std::string debug_type_information; + aggregation_mutex_t debug_mut; +#endif + + std::vector> potential_async_promises{}; + +public: + aggregated_function_call(const size_t number_slices, bool async_mode, Executor &exec) + : number_slices(number_slices), async_mode(async_mode), underlying_executor(exec) { + if (async_mode) + potential_async_promises.resize(number_slices); + } + ~aggregated_function_call(void) { + // All slices should have done this call + assert(slice_counter == number_slices); + // assert(!all_slices_ready.valid()); + } + /// Returns true if all required slices have visited this point + bool sync_aggregation_slices(hpx::lcos::future &stream_future) { + assert(!async_mode); + assert(potential_async_promises.empty()); + const size_t local_counter = slice_counter++; + if (local_counter == number_slices - 1) { + return true; + } + else return false; + } + template + void post_when(hpx::lcos::future &stream_future, F &&f, Ts &&...ts) { +#if !(defined(NDEBUG)) && defined(DEBUG_AGGREGATION_CALLS) + // needed for concurrent access to function_tuple and debug_type_information + // Not required for normal use + std::lock_guard guard(debug_mut); +#endif + assert(!async_mode); + assert(potential_async_promises.empty()); + const size_t local_counter = slice_counter++; + + if (local_counter == 0) { +#if !(defined(NDEBUG)) && defined(DEBUG_AGGREGATION_CALLS) + auto tmp_tuple = + make_tuple_supporting_references(f, std::forward(ts)...); + function_tuple = tmp_tuple; + debug_type_information = typeid(decltype(tmp_tuple)).name(); +#endif + + } else { + // + // This scope checks if both the type and the values of the current call + // match the original call To be used in debug build... 
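+      // (Hypothetical example: if slice 0 posted my_kernel(buf, 1.0) and a
+      // later slice posts my_kernel(buf, 2.0), the value comparison below
+      // throws and the mismatch is printed.)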
+ // +#if !(defined(NDEBUG)) && defined(DEBUG_AGGREGATION_CALLS) + auto comparison_tuple = + make_tuple_supporting_references(f, std::forward(ts)...); + try { + auto orig_call_tuple = + std::any_cast(function_tuple); + if (comparison_tuple != orig_call_tuple) { + throw std::runtime_error( + "Values of post function arguments (or function " + "itself) do not match "); + } + } catch (const std::bad_any_cast &e) { + hpx::cout + << "\nMismatched types error in aggregated post call of executor " + << ": " << e.what() << "\n"; + hpx::cout << "Expected types:\t\t " + << boost::core::demangle(debug_type_information.c_str()); + hpx::cout << "\nGot types:\t\t " + << boost::core::demangle( + typeid(decltype(comparison_tuple)).name()) + << "\n" + << std::endl; + // throw; + } catch (const std::runtime_error &e) { + hpx::cout + << "\nMismatched values error in aggregated post call of executor " + << ": " << e.what() << std::endl; + hpx::cout << "Types (matched):\t " + << boost::core::demangle(debug_type_information.c_str()); + auto orig_call_tuple = + std::any_cast(function_tuple); + hpx::cout << "\nExpected values:\t "; + print_tuple(orig_call_tuple); + hpx::cout << "\nGot values:\t\t "; + print_tuple(comparison_tuple); + hpx::cout << std::endl << std::endl; + // throw; + } +#endif + } + assert(local_counter < number_slices); + assert(slice_counter < number_slices + 1); + // Check exit criteria: Launch function call continuation by setting the + // slices promise + if (local_counter == number_slices - 1) { + exec_post_wrapper(underlying_executor, std::forward(f), std::forward(ts)...); + //slices_ready_promise.set_value(); + } + } + template + hpx::lcos::future async_when(hpx::lcos::future &stream_future, + F &&f, Ts &&...ts) { +#if !(defined(NDEBUG)) && defined(DEBUG_AGGREGATION_CALLS) + // needed for concurrent access to function_tuple and debug_type_information + // Not required for normal use + std::lock_guard guard(debug_mut); +#endif + assert(async_mode); + assert(!potential_async_promises.empty()); + const size_t local_counter = slice_counter++; + if (local_counter == 0) { +#if !(defined(NDEBUG)) && defined(DEBUG_AGGREGATION_CALLS) + auto tmp_tuple = + make_tuple_supporting_references(f, std::forward(ts)...); + function_tuple = tmp_tuple; + debug_type_information = typeid(decltype(tmp_tuple)).name(); +#endif + } else { + // + // This scope checks if both the type and the values of the current call + // match the original call To be used in debug build... 
+ // +#if !(defined(NDEBUG)) && defined(DEBUG_AGGREGATION_CALLS) + auto comparison_tuple = + make_tuple_supporting_references(f, std::forward(ts)...); + try { + auto orig_call_tuple = + std::any_cast(function_tuple); + if (comparison_tuple != orig_call_tuple) { + throw std::runtime_error( + "Values of async function arguments (or function " + "itself) do not match "); + } + } catch (const std::bad_any_cast &e) { + hpx::cout + << "\nMismatched types error in aggregated async call of executor " + << ": " << e.what() << "\n"; + hpx::cout << "Expected types:\t\t " + << boost::core::demangle(debug_type_information.c_str()); + hpx::cout << "\nGot types:\t\t " + << boost::core::demangle( + typeid(decltype(comparison_tuple)).name()) + << "\n" + << std::endl; + // throw; + } catch (const std::runtime_error &e) { + hpx::cout + << "\nMismatched values error in aggregated async call of executor " + << ": " << e.what() << std::endl; + hpx::cout << "Types (matched):\t " + << boost::core::demangle(debug_type_information.c_str()); + auto orig_call_tuple = + std::any_cast(function_tuple); + hpx::cout << "\nExpected values:\t "; + print_tuple(orig_call_tuple); + hpx::cout << "\nGot values:\t\t "; + print_tuple(comparison_tuple); + hpx::cout << std::endl << std::endl; + // throw; + } +#endif + } + assert(local_counter < number_slices); + assert(slice_counter < number_slices + 1); + assert(potential_async_promises.size() == number_slices); + hpx::lcos::future ret_fut = + potential_async_promises[local_counter].get_future(); + if (local_counter == number_slices - 1) { + /* slices_ready_promise.set_value(); */ + auto fut = exec_async_wrapper( + underlying_executor, std::forward(f), std::forward(ts)...); + fut.then([this](auto &&fut) { + for (auto &promise : potential_async_promises) { + promise.set_value(); + } + }); + } + // Check exit criteria: Launch function call continuation by setting the + // slices promise + return ret_fut; + } + template + hpx::lcos::shared_future wrap_async(hpx::lcos::future &stream_future, + F &&f, Ts &&...ts) { + assert(async_mode); + assert(!potential_async_promises.empty()); + const size_t local_counter = slice_counter++; + assert(local_counter < number_slices); + assert(slice_counter < number_slices + 1); + assert(potential_async_promises.size() == number_slices); + hpx::lcos::shared_future ret_fut = + potential_async_promises[local_counter].get_shared_future(); + if (local_counter == number_slices - 1) { + auto fut = f(std::forward(ts)...); + fut.then([this](auto &&fut) { + // TODO just use one promise + for (auto &promise : potential_async_promises) { + promise.set_value(); + } + }); + } + return ret_fut; + } + // We need to be able to copy or no-except move for std::vector.. + aggregated_function_call(const aggregated_function_call &other) = default; + aggregated_function_call & + operator=(const aggregated_function_call &other) = default; + aggregated_function_call(aggregated_function_call &&other) = default; + aggregated_function_call & + operator=(aggregated_function_call &&other) = default; +}; + +//=============================================================================== +//=============================================================================== + +enum class aggregated_executor_modes { EAGER = 1, STRICT, ENDLESS }; +/// Declaration since the actual allocator is only defined after the Executors +template +class allocator_slice; + +/// Executor Class that aggregates function calls for specific kernels +/** Executor is not meant to be used directly. 
Instead it yields multiple + * executor_slice objects. These serve as interfaces. Slices from the same + * aggregated_executor are meant to execute the same function calls but on + * different data (i.e. different tasks) + */ +template class aggregated_executor { +private: + //=============================================================================== + // Misc private avariables: + // + std::atomic slices_exhausted; + + std::atomic executor_slices_alive; + std::atomic buffers_in_use; + std::atomic dealloc_counter; + + const aggregated_executor_modes mode; + const size_t max_slices; + std::atomic current_slices; + /// Wrapper to the executor interface from the stream pool + /// Automatically hooks into the stream_pools reference counting + /// for cpu/gpu load balancing + std::unique_ptr>> + executor_wrapper; + +public: + size_t gpu_id; + // Subclasses + + /// Slice class - meant as a scope interface to the aggregated executor + class executor_slice { + public: + aggregated_executor &parent; + private: + /// Executor is a slice of this aggregated_executor + /// How many functions have been called - required to enforce sequential + /// behaviour of kernel launches + size_t launch_counter{0}; + size_t buffer_counter{0}; + bool notify_parent_about_destruction{true}; + + public: + /// How many slices are there overall - required to check the launch + /// criteria + const size_t number_slices; + const size_t id; + using executor_t = Executor; + executor_slice(aggregated_executor &parent, const size_t slice_id, + const size_t number_slices) + : parent(parent), notify_parent_about_destruction(true), + number_slices(number_slices), id(slice_id) { + } + ~executor_slice(void) { + // Don't notify parent if we moved away from this executor_slice + if (notify_parent_about_destruction) { + // Executor should be done by the time of destruction + // -> check here before notifying parent + + // parent still in execution mode? + assert(parent.slices_exhausted == true); + // all kernel launches done? 
+ assert(launch_counter == parent.function_calls.size()); + // Notifiy parent that this aggregation slice is one + parent.reduce_usage_counter(); + } + } + executor_slice(const executor_slice &other) = delete; + executor_slice &operator=(const executor_slice &other) = delete; + executor_slice(executor_slice &&other) + : parent(other.parent), launch_counter(std::move(other.launch_counter)), + buffer_counter(std::move(other.buffer_counter)), + number_slices(std::move(other.number_slices)), + id(std::move(other.id)) { + other.notify_parent_about_destruction = false; + } + executor_slice &operator=(executor_slice &&other) { + parent = other.parent; + launch_counter = std::move(other.launch_counter); + buffer_counter = std::move(other.buffer_counter); + number_slices = std::move(other.number_slices); + id = std::move(other.id); + other.notify_parent_about_destruction = false; + } + template + allocator_slice make_allocator() { + return allocator_slice(*this); + } + bool sync_aggregation_slices() { + assert(parent.slices_exhausted == true); + auto ret = parent.sync_aggregation_slices(launch_counter); + launch_counter++; + return ret; + } + template void post(F &&f, Ts &&...ts) { + // we should only execute function calls once all slices + // have been given away (-> Executor Slices start) + assert(parent.slices_exhausted == true); + parent.post(launch_counter, std::forward(f), std::forward(ts)...); + launch_counter++; + } + template + hpx::lcos::future async(F &&f, Ts &&...ts) { + // we should only execute function calls once all slices + // have been given away (-> Executor Slices start) + assert(parent.slices_exhausted == true); + hpx::lcos::future ret_fut = parent.async( + launch_counter, std::forward(f), std::forward(ts)...); + launch_counter++; + return ret_fut; + } + + // OneWay Execution + template + friend decltype(auto) tag_invoke(hpx::parallel::execution::post_t, + executor_slice& exec, F&& f, Ts&&... ts) + { + return exec.post(std::forward(f), std::forward(ts)...); + } + + // TwoWay Execution + template + friend decltype(auto) tag_invoke( + hpx::parallel::execution::async_execute_t, executor_slice& exec, + F&& f, Ts&&... ts) + { + return exec.async( + std::forward(f), std::forward(ts)...); + } + + template + hpx::lcos::shared_future wrap_async(F &&f, Ts &&...ts) { + // we should only execute function calls once all slices + // have been given away (-> Executor Slices start) + assert(parent.slices_exhausted == true); + hpx::lcos::shared_future ret_fut = parent.wrap_async( + launch_counter, std::forward(f), std::forward(ts)...); + launch_counter++; + return ret_fut; + } + + /// Get new aggregated buffer (might have already been allocated been + /// allocated by different slice) + template T *get(const size_t size) { + assert(parent.slices_exhausted == true); + T *aggregated_buffer = + parent.get(size, buffer_counter); + buffer_counter++; + assert(buffer_counter > 0); + return aggregated_buffer; + } + + Executor& get_underlying_executor(void) { + assert(parent.executor_wrapper); + return *(parent.executor_wrapper); + } + }; + + // deprecated name... 
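A minimal usage sketch of the slice interface defined above. The helper name process_with_slice, the placeholder function do_work and the choice of std::allocator as the underlying host allocator are assumptions for illustration only; note that every slice has to issue the aggregated call with identical arguments, which is what the DEBUG_AGGREGATION_CALLS check further up enforces in debug builds.

#include <algorithm>
#include <cstddef>
#include <memory>
#include <vector>

// Hypothetical work function; anything the underlying executor can invoke.
inline void do_work(float *data, std::size_t n) {
  for (std::size_t i = 0; i < n; i++) data[i] *= 2.0f;
}

template <typename Slice>
void process_with_slice(Slice &slice, const std::size_t elements_per_task) {
  // Shared, recycled buffer: the first slice reaching this allocation
  // creates it, all other slices of the region receive the same pointer.
  auto alloc = slice.template make_allocator<float, std::allocator<float>>();
  std::vector<float, decltype(alloc)> buf(
      slice.number_slices * elements_per_task, alloc);
  // Each slice fills only its own sub-range, identified by slice.id ...
  std::fill(buf.begin() + slice.id * elements_per_task,
            buf.begin() + (slice.id + 1) * elements_per_task, 1.0f);
  // ... but all slices issue the same aggregated call; it reaches the
  // underlying executor exactly once, after the last slice has issued it.
  auto fut = slice.async(do_work, buf.data(),
                         slice.number_slices * elements_per_task);
  fut.get();
}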
+ using Executor_Slice [[deprectated("Renamed: Use executor_slice instead")]] = executor_slice; + + //=============================================================================== + + hpx::lcos::local::promise slices_full_promise; + /// Promises with the slice executors -- to be set when the starting criteria + /// is met + std::vector> executor_slices; + /// List of aggregated function calls - function will be launched when all + /// slices have called it + std::deque> function_calls; + /// For synchronizing the access to the function calls list + aggregation_mutex_t mut; + + /// Data entry for a buffer allocation: void* pointer, size_t for + /// buffer-size, atomic for the slice counter, location_id, gpu_id + using buffer_entry_t = + std::tuple, bool, const size_t, size_t>; + /// Keeps track of the aggregated buffer allocations done in all the slices + std::deque buffer_allocations; + /// Map pointer to deque index for fast access in the deallocations + std::unordered_map buffer_allocations_map; + /// For synchronizing the access to the buffer_allocations + aggregation_mutex_t buffer_mut; + std::atomic buffer_counter = 0; + + /// Get new buffer OR get buffer already allocated by different slice + template + T *get(const size_t size, const size_t slice_alloc_counter) { + assert(slices_exhausted == true); + assert(executor_wrapper); + assert(executor_slices_alive == true); + // Add aggreated buffer entry in case it hasn't happened yet for this call + // First: Check if it already has happened + if (buffer_counter <= slice_alloc_counter) { + // we might be the first! Lock... + std::lock_guard guard(buffer_mut); + // ... and recheck + if (buffer_counter <= slice_alloc_counter) { + constexpr bool manage_content_lifetime = false; + buffers_in_use = true; + + // Default location -- useful for GPU builds as we otherwise create way too + // many different buffers for different aggregation sizes on different GPUs + /* size_t location_id = gpu_id * instances_per_gpu; */ + // Use integer conversion to only use 0 16 32 ... as buckets + size_t location_id = ((hpx::get_worker_thread_num() % cppuddle::number_instances) / 16) * 16; +#ifdef CPPUDDLE_HAVE_HPX_AWARE_ALLOCATORS + if (max_slices == 1) { + // get prefered location: aka the current hpx threads location + // Usually handy for CPU builds where we want to use the buffers + // close to the current CPU core + /* location_id = (hpx::get_worker_thread_num() / instances_per_gpu) * instances_per_gpu; */ + /* location_id = (gpu_id) * instances_per_gpu; */ + // division makes sure that we always use the same instance to store our gpu buffers. + } +#endif + // Get shiny and new buffer that will be shared between all slices + // Buffer might be recycled from previous allocations by the + // buffer_interface... 
+ T *aggregated_buffer = + cppuddle::memory_recycling::detail::buffer_interface::get< + T, Host_Allocator>(size, manage_content_lifetime, location_id, + gpu_id); + // Create buffer entry for this buffer + buffer_allocations.emplace_back(static_cast(aggregated_buffer), + size, 1, true, location_id, gpu_id); + +#ifndef NDEBUG + // if previousely used the buffer should not be in usage anymore + const auto exists = buffer_allocations_map.count( + static_cast(aggregated_buffer)); + if (exists > 0) { + const auto previous_usage_id = + buffer_allocations_map[static_cast(aggregated_buffer)]; + const auto &valid = + std::get<3>(buffer_allocations[previous_usage_id]); + assert(!valid); + } +#endif + buffer_allocations_map.insert_or_assign(static_cast(aggregated_buffer), + buffer_counter); + + assert (buffer_counter == slice_alloc_counter); + buffer_counter = buffer_allocations.size(); + + // Return buffer + return aggregated_buffer; + } + } + assert(buffers_in_use == true); + assert(std::get<3>(buffer_allocations[slice_alloc_counter])); // valid + assert(std::get<2>(buffer_allocations[slice_alloc_counter]) >= 1); + + // Buffer entry should already exist: + T *aggregated_buffer = static_cast( + std::get<0>(buffer_allocations[slice_alloc_counter])); + // Error handling: Size is wrong? + assert(size == std::get<1>(buffer_allocations[slice_alloc_counter])); + // Notify that one more slice has visited this buffer allocation + std::get<2>(buffer_allocations[slice_alloc_counter])++; + return aggregated_buffer; + } + + /// Notify buffer list that one slice is done with the buffer + template + void mark_unused(T *p, const size_t size) { + assert(slices_exhausted == true); + assert(executor_wrapper); + + void *ptr_key = static_cast(p); + size_t slice_alloc_counter = buffer_allocations_map[p]; + + assert(slice_alloc_counter < buffer_allocations.size()); + /*auto &[buffer_pointer_any, buffer_size, buffer_allocation_counter, valid] = + buffer_allocations[slice_alloc_counter];*/ + auto buffer_pointer_void = std::get<0>(buffer_allocations[slice_alloc_counter]); + const auto buffer_size = std::get<1>(buffer_allocations[slice_alloc_counter]); + auto &buffer_allocation_counter = std::get<2>(buffer_allocations[slice_alloc_counter]); + auto &valid = std::get<3>(buffer_allocations[slice_alloc_counter]); + const auto &location_id = std::get<4>(buffer_allocations[slice_alloc_counter]); + const auto &gpu_id = std::get<5>(buffer_allocations[slice_alloc_counter]); + assert(valid); + T *buffer_pointer = static_cast(buffer_pointer_void); + + assert(buffer_size == size); + assert(p == buffer_pointer); + // assert(buffer_pointer == p || buffer_pointer == nullptr); + // Slice is done with this buffer + buffer_allocation_counter--; + // Check if all slices are done with this buffer? + if (buffer_allocation_counter == 0) { + // Yes! 
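For reference, the location_id bucketing in get() above maps blocks of 16 HPX worker threads onto one buffer-manager instance, so aggregated buffers do not fan out into one bucket per worker. A standalone check of that arithmetic, with number_instances assumed to be 128 purely for illustration:

#include <cstddef>

constexpr std::size_t bucket_of(std::size_t worker_thread,
                                std::size_t number_instances) {
  // mirrors ((hpx::get_worker_thread_num() % number_instances) / 16) * 16
  return ((worker_thread % number_instances) / 16) * 16;
}
static_assert(bucket_of(3, 128) == 0);    // workers 0..15  share bucket 0
static_assert(bucket_of(37, 128) == 32);  // workers 32..47 share bucket 32
static_assert(bucket_of(130, 128) == 0);  // wraps around number_instances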
"Deallocate" by telling the recylcer the buffer is fit for reusage + std::lock_guard guard(buffer_mut); + // Only mark unused if another buffer has not done so already (and marked + // it as invalid) + if (valid) { + assert(buffers_in_use == true); + cppuddle::memory_recycling::detail::buffer_interface::mark_unused< + T, Host_Allocator>(buffer_pointer, buffer_size, location_id, + gpu_id); + // mark buffer as invalid to prevent any other slice from marking the + // buffer as unused + valid = false; + + const size_t current_deallocs = ++dealloc_counter; + if (current_deallocs == buffer_counter) { + std::lock_guard guard(mut); + buffers_in_use = false; + if (!executor_slices_alive && !buffers_in_use) { + slices_exhausted = false; + // Release executor + executor_wrapper.reset(nullptr); + } + } + } + } + } + + //=============================================================================== + // Public Interface +public: + hpx::lcos::future current_continuation; + hpx::lcos::future last_stream_launch_done; + std::atomic overall_launch_counter = 0; + + /// Only meant to be accessed by the slice executors + bool sync_aggregation_slices(const size_t slice_launch_counter) { + std::lock_guard guard(mut); + assert(slices_exhausted == true); + assert(executor_wrapper); + // Add function call object in case it hasn't happened for this launch yet + if (overall_launch_counter <= slice_launch_counter) { + /* std::lock_guard guard(mut); */ + if (overall_launch_counter <= slice_launch_counter) { + function_calls.emplace_back(current_slices, false, *executor_wrapper); + overall_launch_counter = function_calls.size(); + return function_calls[slice_launch_counter].sync_aggregation_slices( + last_stream_launch_done); + } + } + + return function_calls[slice_launch_counter].sync_aggregation_slices( + last_stream_launch_done); + } + + /// Only meant to be accessed by the slice executors + template + void post(const size_t slice_launch_counter, F &&f, Ts &&...ts) { + std::lock_guard guard(mut); + assert(slices_exhausted == true); + assert(executor_wrapper); + // Add function call object in case it hasn't happened for this launch yet + if (overall_launch_counter <= slice_launch_counter) { + /* std::lock_guard guard(mut); */ + if (overall_launch_counter <= slice_launch_counter) { + function_calls.emplace_back(current_slices, false, *executor_wrapper); + overall_launch_counter = function_calls.size(); + function_calls[slice_launch_counter].post_when( + last_stream_launch_done, std::forward(f), std::forward(ts)...); + return; + } + } + + function_calls[slice_launch_counter].post_when( + last_stream_launch_done, std::forward(f), std::forward(ts)...); + return; + } + + /// Only meant to be accessed by the slice executors + template + hpx::lcos::future async(const size_t slice_launch_counter, F &&f, + Ts &&...ts) { + std::lock_guard guard(mut); + assert(slices_exhausted == true); + assert(executor_wrapper); + // Add function call object in case it hasn't happened for this launch yet + if (overall_launch_counter <= slice_launch_counter) { + /* std::lock_guard guard(mut); */ + if (overall_launch_counter <= slice_launch_counter) { + function_calls.emplace_back(current_slices, true, *executor_wrapper); + overall_launch_counter = function_calls.size(); + return function_calls[slice_launch_counter].async_when( + last_stream_launch_done, std::forward(f), std::forward(ts)...); + } + } + + return function_calls[slice_launch_counter].async_when( + last_stream_launch_done, std::forward(f), std::forward(ts)...); + } + /// Only meant 
to be accessed by the slice executors + template + hpx::lcos::shared_future wrap_async(const size_t slice_launch_counter, F &&f, + Ts &&...ts) { + std::lock_guard guard(mut); + assert(slices_exhausted == true); + assert(executor_wrapper); + // Add function call object in case it hasn't happened for this launch yet + if (overall_launch_counter <= slice_launch_counter) { + /* std::lock_guard guard(mut); */ + if (overall_launch_counter <= slice_launch_counter) { + function_calls.emplace_back(current_slices, true, *executor_wrapper); + overall_launch_counter = function_calls.size(); + return function_calls[slice_launch_counter].wrap_async( + last_stream_launch_done, std::forward(f), std::forward(ts)...); + } + } + + return function_calls[slice_launch_counter].wrap_async( + last_stream_launch_done, std::forward(f), std::forward(ts)...); + } + + bool slice_available(void) { + std::lock_guard guard(mut); + return !slices_exhausted; + } + + std::optional> request_executor_slice() { + std::lock_guard guard(mut); + if (!slices_exhausted) { + const size_t local_slice_id = ++current_slices; + if (local_slice_id == 1) { + // Cleanup leftovers from last run if any + // TODO still required? Should be clean here already + function_calls.clear(); + overall_launch_counter = 0; + std::lock_guard guard(buffer_mut); +#ifndef NDEBUG + for (const auto &buffer_entry : buffer_allocations) { + const auto &[buffer_pointer_any, buffer_size, + buffer_allocation_counter, valid, location_id, device_id] = + buffer_entry; + assert(!valid); + } +#endif + buffer_allocations.clear(); + buffer_allocations_map.clear(); + buffer_counter = 0; + + assert(executor_slices_alive == false); + assert(buffers_in_use == false); + executor_slices_alive = true; + buffers_in_use = false; + dealloc_counter = 0; + + if (mode == aggregated_executor_modes::STRICT ) { + slices_full_promise = hpx::lcos::local::promise{}; + } + } + + // Create Executor Slice future -- that will be returned later + hpx::lcos::future ret_fut; + if (local_slice_id < max_slices) { + executor_slices.emplace_back(hpx::lcos::local::promise{}); + ret_fut = + executor_slices[local_slice_id - 1].get_future(); + } else { + launched_slices = current_slices; + ret_fut = hpx::make_ready_future(executor_slice{*this, + executor_slices.size(), launched_slices}); + } + + // Are we the first slice? If yes, add continuation set the + // executor_slice + // futures to ready if the launch conditions are met + if (local_slice_id == 1) { + // Redraw executor + assert(!executor_wrapper); + cppuddle::executor_recycling::executor_pool::select_device< + Executor, cppuddle::executor_recycling::round_robin_pool_impl>( + gpu_id); + executor_wrapper.reset( + new cppuddle::executor_recycling::executor_interface< + Executor, + cppuddle::executor_recycling::round_robin_pool_impl>( + gpu_id)); + // Renew promise that all slices will be ready as the primary launch + // criteria... 
+ hpx::lcos::shared_future fut; + if (mode == aggregated_executor_modes::EAGER || + mode == aggregated_executor_modes::ENDLESS) { + // Fallback launch condidtion: Launch as soon as the underlying stream + // is ready + /* auto slices_full_fut = slices_full_promise.get_future(); */ + cppuddle::executor_recycling::executor_pool::select_device< + Executor, + cppuddle::executor_recycling::round_robin_pool_impl>(gpu_id); + auto exec_fut = (*executor_wrapper).get_future(); + /* auto fut = hpx::when_any(exec_fut, slices_full_fut); */ + fut = std::move(exec_fut); + } else { + auto slices_full_fut = slices_full_promise.get_shared_future(); + // Just use the slices launch condition + fut = std::move(slices_full_fut); + } + // Launch all executor slices within this continuation + current_continuation = fut.then([this](auto &&fut) { + std::lock_guard guard(mut); + slices_exhausted = true; + launched_slices = current_slices; + size_t id = 0; + for (auto &slice_promise : executor_slices) { + slice_promise.set_value( + executor_slice{*this, id, launched_slices}); + id++; + } + executor_slices.clear(); + }); + } + if (local_slice_id >= max_slices && + mode != aggregated_executor_modes::ENDLESS) { + slices_exhausted = true; // prevents any more threads from entering + // before the continuation is launched + /* launched_slices = current_slices; */ + /* size_t id = 0; */ + /* for (auto &slice_promise : executor_slices) { */ + /* slice_promise.set_value( */ + /* executor_slice{*this, id, launched_slices}); */ + /* id++; */ + /* } */ + /* executor_slices.clear(); */ + if (mode == aggregated_executor_modes::STRICT ) { + slices_full_promise.set_value(); // Trigger slices launch condition continuation + } + // that continuation will set all executor slices so far handed out to ready + } + return ret_fut; + } else { + // Return empty optional as failure + return std::optional>{}; + } + } + size_t launched_slices; + void reduce_usage_counter(void) { + /* std::lock_guard guard(mut); */ + assert(slices_exhausted == true); + assert(executor_wrapper); + assert(executor_slices_alive == true); + assert(launched_slices >= 1); + assert(current_slices >= 0 && current_slices <= launched_slices); + const size_t local_slice_id = --current_slices; + // Last slice goes out scope? 
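Client code normally obtains slices through request_executor_slice rather than constructing them directly; the returned future only becomes ready once the launch condition set up above is met (all slices handed out in STRICT mode, or additionally the underlying stream future in EAGER/ENDLESS mode). A sketch of that flow, reusing the hypothetical process_with_slice helper from the earlier sketch:

template <typename AggregatedExecutor>
void run_aggregated_region(AggregatedExecutor &agg_exec) {
  // Empty optional: all slices of the current region were already handed out
  auto slice_fut = agg_exec.request_executor_slice();
  if (slice_fut) {
    auto work_done = slice_fut->then([](auto &&fut) {
      auto slice = fut.get();
      process_with_slice(slice, 512); // see the sketch further above
    });
    work_done.get();
  }
  // else: retry on another aggregated executor or launch without aggregation
}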
+ if (local_slice_id == 0) { + // Mark executor fit for reusage + std::lock_guard guard(mut); + executor_slices_alive = false; + if (!executor_slices_alive && !buffers_in_use) { + // Release executor + slices_exhausted = false; + executor_wrapper.reset(nullptr); + } + } + } + ~aggregated_executor(void) { + + assert(current_slices == 0); + assert(executor_slices_alive == false); + assert(buffers_in_use == false); + + if (mode != aggregated_executor_modes::STRICT ) { + slices_full_promise.set_value(); // Trigger slices launch condition continuation + } + + // Cleanup leftovers from last run if any + function_calls.clear(); + overall_launch_counter = 0; +#ifndef NDEBUG + for (const auto &buffer_entry : buffer_allocations) { + const auto &[buffer_pointer_any, buffer_size, buffer_allocation_counter, + valid, location_id, device_id] = buffer_entry; + assert(!valid); + } +#endif + buffer_allocations.clear(); + buffer_allocations_map.clear(); + buffer_counter = 0; + + assert(buffer_allocations.empty()); + assert(buffer_allocations_map.empty()); + } + + aggregated_executor(const size_t number_slices, + aggregated_executor_modes mode, const size_t gpu_id = 0) + : max_slices(number_slices), current_slices(0), slices_exhausted(false), + dealloc_counter(0), mode(mode), executor_slices_alive(false), + buffers_in_use(false), gpu_id(gpu_id), + executor_wrapper(nullptr), + current_continuation(hpx::make_ready_future()), + last_stream_launch_done(hpx::make_ready_future()) {} + // Not meant to be copied or moved + aggregated_executor(const aggregated_executor &other) = delete; + aggregated_executor &operator=(const aggregated_executor &other) = delete; + aggregated_executor(aggregated_executor &&other) = delete; + aggregated_executor &operator=(aggregated_executor &&other) = delete; +}; + +template +class allocator_slice { +private: + typename aggregated_executor::executor_slice &executor_reference; + aggregated_executor &executor_parent; + +public: + using value_type = T; + allocator_slice( + typename aggregated_executor::executor_slice &executor) + : executor_reference(executor), executor_parent(executor.parent) {} + template + explicit allocator_slice( + allocator_slice const &) noexcept {} + T *allocate(std::size_t n) { + T *data = executor_reference.template get(n); + return data; + } + void deallocate(T *p, std::size_t n) { + /* executor_reference.template mark_unused(p, n); */ + executor_parent.template mark_unused(p, n); + } + template + inline void construct(T *p, Args... args) noexcept { + // Do nothing here - we reuse the content of the last owner + } + void destroy(T *p) { + // Do nothing here - Contents will be destroyed when the buffer manager is + // destroyed, not before + } +}; +template +constexpr bool +operator==(allocator_slice const &, + allocator_slice const &) noexcept { + return false; +} +template +constexpr bool +operator!=(allocator_slice const &, + allocator_slice const &) noexcept { + return true; +} + +} // namespace detail +} // namespace kernel_aggregation +} // namespace cppuddle + + + +namespace hpx { namespace parallel { namespace execution { + // TODO Unfortunately does not work that way! 
Create trait that works for Executor Slices with + // compatible unlying executor types + /* template */ + /* struct is_one_way_executor::executor_slice> */ + /* : std::true_type */ + /* {}; */ + /* template */ + /* struct is_two_way_executor::executor_slice> */ + /* : std::true_type */ + /* {}; */ + +#if defined(HPX_HAVE_CUDA) || defined(HPX_HAVE_HIP) + // Workaround for the meantime: Manually create traits for compatible types: +template <> +struct is_one_way_executor< + typename cppuddle::kernel_aggregation::detail::aggregated_executor< + hpx::cuda::experimental::cuda_executor>::executor_slice> + : std::true_type {}; +template <> +struct is_two_way_executor< + typename cppuddle::kernel_aggregation::detail::aggregated_executor< + hpx::cuda::experimental::cuda_executor>::executor_slice> + : std::true_type {}; +#endif +}}} + +#endif diff --git a/include/cppuddle/kernel_aggregation/kernel_aggregation_interface.hpp b/include/cppuddle/kernel_aggregation/kernel_aggregation_interface.hpp new file mode 100644 index 00000000..c7a3b633 --- /dev/null +++ b/include/cppuddle/kernel_aggregation/kernel_aggregation_interface.hpp @@ -0,0 +1,34 @@ +// Copyright (c) 2024 Gregor Daiß +// +// Distributed under the Boost Software License, Version 1.0. (See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#ifndef KERNEL_AGGREGATION_INTERFACE_HPP +#define KERNEL_AGGREGATION_INTERFACE_HPP + +#include "cppuddle/kernel_aggregation/detail/aggregation_executors_and_allocators.hpp" +#include "cppuddle/kernel_aggregation/detail/aggregation_executor_pools.hpp" + +namespace cppuddle { +namespace kernel_aggregation { + +using aggregated_executor_modes = + cppuddle::kernel_aggregation::detail::aggregated_executor_modes; + +template +using allocator_slice = + cppuddle::kernel_aggregation::detail::allocator_slice; + +template +using aggregated_executor = + cppuddle::kernel_aggregation::detail::aggregated_executor; + +template +using aggregation_pool = + cppuddle::kernel_aggregation::detail::aggregation_pool; + +} // namespace kernel_aggregation +} // namespace cppuddle + +#endif diff --git a/include/cppuddle/memory_recycling/aligned_recycling_allocators.hpp b/include/cppuddle/memory_recycling/aligned_recycling_allocators.hpp new file mode 100644 index 00000000..a824e7e0 --- /dev/null +++ b/include/cppuddle/memory_recycling/aligned_recycling_allocators.hpp @@ -0,0 +1,40 @@ +// Copyright (c) 2020-2024 Gregor Daiß +// +// Distributed under the Boost Software License, Version 1.0. (See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#ifndef ALIGNED_RECYCLING_ALLOCATORS_HPP +#define ALIGNED_RECYCLING_ALLOCATORS_HPP + +#include +#include "buffer_management_interface.hpp" + +namespace cppuddle { +namespace memory_recycling { + +namespace device_selection { +template +/// Dummy GPU selector. 
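The new kernel_aggregation_interface.hpp above is the intended public entry point after this refactoring. A construction sketch using its aliases, assuming a CUDA-enabled HPX build in which the CPPuddle executor pool has already been initialized for cuda_executor; the slice count of 8 and the EAGER mode are illustrative choices:

#include "cppuddle/kernel_aggregation/kernel_aggregation_interface.hpp"
#include <hpx/async_cuda/cuda_executor.hpp>

using executor_t = hpx::cuda::experimental::cuda_executor;
namespace agg = cppuddle::kernel_aggregation;

void aggregation_example() {
  // One aggregation region with up to 8 slices on GPU 0
  agg::aggregated_executor<executor_t> agg_exec(
      8, agg::aggregated_executor_modes::EAGER, 0);
  run_aggregated_region(agg_exec); // see the sketch further above
}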
Needs to be defined for MultiGPU builds as the default / +/// select_device_functor does not compile for > 1 GPU (to make sure all / +/// relevant allocators support multigpu) +struct select_device_functor< + T, boost::alignment::aligned_allocator> { + void operator()(const size_t device_id) {} +}; +} // namespace device_selection + +/// Recycling allocator for boost aligned memory +template ::value, int> = 0> +using recycle_aligned = detail::recycle_allocator< + T, boost::alignment::aligned_allocator>; +/// Recycling allocator for boost aligned memory (reusing previous content as well) +template ::value, int> = 0> +using aggressive_recycle_aligned = detail::aggressive_recycle_allocator< + T, boost::alignment::aligned_allocator>; + +} // namespace memory_recycling +} // namespace cppuddle + +#endif diff --git a/include/cppuddle/memory_recycling/buffer_management_interface.hpp b/include/cppuddle/memory_recycling/buffer_management_interface.hpp new file mode 100644 index 00000000..c5fa44cd --- /dev/null +++ b/include/cppuddle/memory_recycling/buffer_management_interface.hpp @@ -0,0 +1,33 @@ +// Copyright (c) 2024 Gregor Daiß +// +// Distributed under the Boost Software License, Version 1.0. (See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#ifndef BUFFER_MANAGEMENT_INTERFACE_HPP +#define BUFFER_MANAGEMENT_INTERFACE_HPP + +#include "detail/buffer_management.hpp" + +namespace cppuddle { +namespace memory_recycling { + +/// Print performance counters of all buffer managers to stdout +inline void print_buffer_counters() { + detail::buffer_interface::print_performance_counters(); +} +/// Deletes all buffers (even ones still marked as used), delete the buffer +/// managers and the recycler itself +inline void force_buffer_cleanup() { detail::buffer_interface::clean_all(); } + +/// Deletes all buffers currently marked as unused +inline void unused_buffer_cleanup() { + detail::buffer_interface::clean_unused_buffers(); +} +/// Deletes all buffers (even ones still marked as used), delete the buffer +/// managers and the recycler itself. Disallows further usage. +inline void finalize() { detail::buffer_interface::finalize(); } + +} // namespace memory_recycling +} // end namespace cppuddle + +#endif diff --git a/include/cppuddle/memory_recycling/cuda_recycling_allocators.hpp b/include/cppuddle/memory_recycling/cuda_recycling_allocators.hpp new file mode 100644 index 00000000..b47a4fe2 --- /dev/null +++ b/include/cppuddle/memory_recycling/cuda_recycling_allocators.hpp @@ -0,0 +1,41 @@ +// Copyright (c) 2020-2024 Gregor Daiß +// +// Distributed under the Boost Software License, Version 1.0. 
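Together, the aligned recycling allocators and the buffer management interface above cover the usual host-side lifecycle: allocate through a recycling allocator, optionally print the counters, and call finalize exactly once at shutdown. A sketch assuming a single-GPU (or CPU-only) build; the 32-byte alignment and the buffer size are illustrative, and the aggressive variant would additionally hand recycled content back as-is without re-running constructors:

#include <vector>
#include "cppuddle/memory_recycling/aligned_recycling_allocators.hpp"
#include "cppuddle/memory_recycling/buffer_management_interface.hpp"

void host_buffer_example() {
  using alloc_t = cppuddle::memory_recycling::recycle_aligned<double, 32>;
  {
    // First use allocates; later uses of the same size reuse the buffer
    std::vector<double, alloc_t> data(1 << 20, 0.0);
    // ... work on data ...
  } // destruction marks the buffer as unused (kept for recycling)

  // Optional statistics (requires CPPUDDLE_WITH_COUNTERS=ON)
  cppuddle::memory_recycling::print_buffer_counters();
  // At application shutdown: release everything, disallow further use
  cppuddle::memory_recycling::finalize();
}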
(See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#ifndef CUDA_RECYCLING_ALLOCATORS_HPP +#define CUDA_RECYCLING_ALLOCATORS_HPP + +#include "buffer_management_interface.hpp" +// import cuda_pinned_allocator and cuda_device_allocator +#include "detail/cuda_underlying_allocators.hpp" + +namespace cppuddle { +namespace memory_recycling { + +// Tell cppuddle how to select the device for the cuda allocators +namespace device_selection { +/// GPU device selector using the CUDA API for pinned host allocations +template +struct select_device_functor> { + void operator()(const size_t device_id) { cudaSetDevice(device_id); } +}; +/// GPU selector using the CUDA API for pinned host allocations +template +struct select_device_functor> { + void operator()(const size_t device_id) { cudaSetDevice(device_id); } +}; +} // namespace device_selection + +/// Recycling allocator for CUDA pinned host memory +template ::value, int> = 0> +using recycle_allocator_cuda_host = + detail::aggressive_recycle_allocator>; +/// Recycling allocator for CUDA device memory +template ::value, int> = 0> +using recycle_allocator_cuda_device = + detail::recycle_allocator>; + +} // namespace memory_recycling +} // end namespace cppuddle +#endif diff --git a/include/cppuddle/memory_recycling/detail/buffer_management.hpp b/include/cppuddle/memory_recycling/detail/buffer_management.hpp new file mode 100644 index 00000000..7c30c781 --- /dev/null +++ b/include/cppuddle/memory_recycling/detail/buffer_management.hpp @@ -0,0 +1,929 @@ +// Copyright (c) 2020-2024 Gregor Daiß +// +// Distributed under the Boost Software License, Version 1.0. (See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#ifndef BUFFER_MANAGEMENT_HPP +#define BUFFER_MANAGEMENT_HPP + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// Warn about suboptimal performance without correct HPX-aware allocators +#ifdef CPPUDDLE_HAVE_HPX +#ifndef CPPUDDLE_HAVE_HPX_AWARE_ALLOCATORS +#pragma message \ +"Warning: CPPuddle build with HPX support but without HPX-aware allocators enabled. \ +For better performance configure CPPuddle with CPPUDDLE_WITH_HPX_AWARE_ALLOCATORS=ON!" +#else +// include runtime to get HPX thread IDs required for the HPX-aware allocators +#include +#endif +#endif + +#if defined(CPPUDDLE_HAVE_HPX) && defined(CPPUDDLE_HAVE_HPX_MUTEX) +// For builds with The HPX mutex +#include +#endif + +#ifdef CPPUDDLE_HAVE_COUNTERS +#include +#if defined(CPPUDDLE_HAVE_HPX) +#include +#endif +#endif + +#include "cppuddle/common/config.hpp" + +namespace cppuddle { +namespace memory_recycling { + +namespace device_selection { +/// Default device selector - No MultGPU support +/** Throws a runtime error if max_number_gpus > 1 (defined by cmake variable + * CPPUDDLE_WITH_MAX_NUMBER_GPUS). 
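To complement the CUDA allocator definitions above, a sketch of the common staging pattern: a recycled pinned host buffer plus a recycled device buffer. A single-GPU build is assumed; since device memory must not be touched by host-side constructors, the device allocator is used directly instead of through a std::vector:

#include <cstddef>
#include <vector>
#include "cppuddle/memory_recycling/cuda_recycling_allocators.hpp"

void cuda_staging_example(const std::size_t n) {
  using cppuddle::memory_recycling::recycle_allocator_cuda_host;
  using cppuddle::memory_recycling::recycle_allocator_cuda_device;

  // Pinned host buffer, recycled across calls (content reused as well,
  // since the host variant is an aggressive_recycle_allocator)
  std::vector<double, recycle_allocator_cuda_host<double>> host_buf(n);

  // Device buffer: allocate/deallocate directly, no element construction
  recycle_allocator_cuda_device<double> device_alloc(/*device_id=*/0);
  double *device_buf = device_alloc.allocate(n);
  // ... cudaMemcpyAsync / kernel launches using host_buf.data(), device_buf ...
  device_alloc.deallocate(device_buf, n); // hands the buffer back for reuse
}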
Needs to be specialized for an allocator to + * provide MultiGPU support (see CPPuddle CUDA/HIP allocators for examples) **/ +template struct select_device_functor { + void operator()(const size_t device_id) { + if constexpr (max_number_gpus > 1) + throw std::runtime_error( + "Allocators used in Multi-GPU builds need explicit Multi-GPU support " + "(by having a select_device_functor overload"); + } +}; +} // namespace device_selection + +namespace detail { + +/// Singleton interface to all buffer_managers +class buffer_interface { +public: +#if defined(CPPUDDLE_DEACTIVATE_BUFFER_RECYCLING) + +// Warn about suboptimal performance without recycling +#pragma message \ +"Warning: Building without buffer recycling! Use only for performance testing! \ +For better performance configure CPPuddle with CPPUDDLE_WITH_BUFFER_RECYCLING=ON!" + + template + static T *get(size_t number_elements, bool manage_content_lifetime = false, + std::optional location_hint = std::nullopt, + std::optional device_id = std::nullopt) { + + return Host_Allocator{}.allocate(number_elements); + } + /// Marks an buffer as unused and fit for reusage + template + static void mark_unused(T *p, size_t number_elements, + std::optional location_hint = std::nullopt, + std::optional device_id = std::nullopt) { + return Host_Allocator{}.deallocate(p, number_elements); + } +#else + /// Primary method to allocate a buffer with CPPuddle: Returns and allocated / + /// buffer of the requested size - this may be a reused buffer. The method + /// figures out the correct buffer_manager and gets such a buffer from it. + /// Should be called from an allocator implementation, not directly + template + static T *get(size_t number_elements, bool manage_content_lifetime = false, + std::optional location_hint = std::nullopt, + std::optional device_id = std::nullopt) { + try { + return buffer_manager::get( + number_elements, manage_content_lifetime, location_hint, device_id); + } catch (const std::exception &exc) { + std::cerr << "ERROR: Encountered unhandled exception in cppuddle get: " << exc.what() << std::endl; + std::cerr << "Rethrowing exception... " << std::endl;; + throw; + } + } + /// Primary method to deallocate a buffer with CPPuddle:Marks an buffer as / + /// unused and fit for reusage. The method figures out the correct buffer + /// manager and marks the buffer there. Should be called from an allocator + /// implementation, not directly + template + static void mark_unused(T *p, size_t number_elements, + std::optional location_hint = std::nullopt, + std::optional device_id = std::nullopt) { + try { + return buffer_manager::mark_unused(p, number_elements, + location_hint, device_id); + } catch (const std::exception &exc) { + std::cerr << "ERROR: Encountered unhandled exception in cppuddle mark_unused: " << exc.what() << std::endl; + std::cerr << "Rethrowing exception... " << std::endl;; + throw; + } + } +#endif + /// Register all CPPuddle counters as HPX performance counters + template + static void register_allocator_counters_with_hpx(void) { +#ifdef CPPUDDLE_HAVE_COUNTERS + buffer_manager::register_counters_with_hpx(); +#else + std::cerr << "Warning: Trying to register allocator performance counters " + "with HPX but CPPuddle was built " + "without CPPUDDLE_WITH_COUNTERS -- operation will be ignored!" 
+ << std::endl; +#endif + } + + /// Deallocate all buffers, no matter whether they are marked as used or not + static void clean_all() { + std::lock_guard guard(instance().callback_protection_mut); + for (const auto &clean_function : + instance().total_cleanup_callbacks) { + clean_function(); + } + } + /// Deallocated all currently unused buffer + static void clean_unused_buffers() { + std::lock_guard guard(instance().callback_protection_mut); + for (const auto &clean_function : + instance().partial_cleanup_callbacks) { + clean_function(); + } + } + /// Deallocate all buffers, no matter whether they are marked as used or not + static void finalize() { + std::lock_guard guard(instance().callback_protection_mut); + for (const auto &finalize_function : + instance().finalize_callbacks) { + finalize_function(); + } + } + + static void print_performance_counters() { +#ifdef CPPUDDLE_HAVE_COUNTERS + std::lock_guard guard(instance().callback_protection_mut); + for (const auto &print_function : + instance().print_callbacks) { + print_function(); + } +#else + std::cerr << "Warning: Trying to print allocator performance counters but CPPuddle was built " + "without CPPUDDLE_WITH_COUNTERS -- operation will be ignored!" + << std::endl; +#endif + } + + // Member variables and methods +private: + + /// Singleton instance access + static buffer_interface& instance() { + static buffer_interface singleton{}; + return singleton; + } + /// Callbacks for printing the performance counter data + std::list> print_callbacks; + /// Callbacks for buffer_manager finalize - each callback completely destroys + /// one buffer_manager + std::list> finalize_callbacks; + /// Callbacks for buffer_manager cleanups - each callback destroys all buffers within + /// one buffer_manager, both used and unsued + std::list> total_cleanup_callbacks; + /// Callbacks for partial buffer_manager cleanups - each callback deallocates + /// all unused buffers of a manager + std::list> partial_cleanup_callbacks; + /// default, private constructor - not automatically constructed due to the + /// deleted constructors + buffer_interface() = default; + + mutex_t callback_protection_mut; + /// Add a callback function that gets executed upon cleanup and destruction + static void add_total_cleanup_callback(const std::function &func) { + std::lock_guard guard(instance().callback_protection_mut); + instance().total_cleanup_callbacks.push_back(func); + } + /// Add a callback function that gets executed upon partial (unused memory) + /// cleanup + static void add_partial_cleanup_callback(const std::function &func) { + std::lock_guard guard(instance().callback_protection_mut); + instance().partial_cleanup_callbacks.push_back(func); + } + /// Add a callback function that gets executed upon partial (unused memory) + /// cleanup + static void add_finalize_callback(const std::function &func) { + std::lock_guard guard(instance().callback_protection_mut); + instance().finalize_callbacks.push_back(func); + } + /// Add a callback function that gets executed upon partial (unused memory) + /// cleanup + static void add_print_callback(const std::function &func) { + std::lock_guard guard(instance().callback_protection_mut); + instance().print_callbacks.push_back(func); + } + +public: + ~buffer_interface() = default; + + // Subclasses +private: + /// Memory Manager subclass to handle buffers a specific type + template class buffer_manager { + private: + // Tuple content: Pointer to buffer, buffer_size, location ID, Flag + // The flag at the end controls whether to 
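If HPX performance counters are wanted in addition to the printed statistics, registration goes through the static interface shown above, once per combination of value type and underlying allocator. The double/std::allocator pair below is an illustrative choice, and a public convenience wrapper, if one exists, is not part of this hunk:

#include <memory>
#include "cppuddle/memory_recycling/detail/buffer_management.hpp"

void register_cppuddle_counters() {
  // Requires CPPUDDLE_WITH_COUNTERS=ON and a running HPX runtime
  // (e.g. call this from hpx_main)
  cppuddle::memory_recycling::detail::buffer_interface::
      register_allocator_counters_with_hpx<double, std::allocator<double>>();
}

Once registered, the counters are installed under /cppuddle/allocators/<allocator>_<type>/ (see the install_counter_type calls further below) and can be read with the usual HPX tooling, for example via --hpx:print-counter.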
buffer content is to be reused as + // well + using buffer_entry_type = std::tuple; + + + public: + /// Cleanup and delete this singleton + static void clean() { + assert(instance() && !is_finalized); + for (auto i = 0; i < number_instances * max_number_gpus; i++) { + std::lock_guard guard(instance()[i].mut); + instance()[i].clean_all_buffers(); + } + } + static void print_performance_counters() { + assert(instance() && !is_finalized); + for (auto i = 0; i < number_instances * max_number_gpus; i++) { + std::lock_guard guard(instance()[i].mut); + instance()[i].print_counters(); + } + } + static void finalize() { + assert(instance() && !is_finalized); + is_finalized = true; + for (auto i = 0; i < number_instances * max_number_gpus; i++) { + std::lock_guard guard(instance()[i].mut); + instance()[i].clean_all_buffers(); + } + instance().reset(); + } + /// Cleanup all buffers not currently in use + static void clean_unused_buffers_only() { + assert(instance() && !is_finalized); + for (auto i = 0; i < number_instances * max_number_gpus; i++) { + std::lock_guard guard(instance()[i].mut); + for (auto &buffer_tuple : instance()[i].unused_buffer_list) { + Host_Allocator alloc; + if (std::get<3>(buffer_tuple)) { + std::destroy_n(std::get<0>(buffer_tuple), std::get<1>(buffer_tuple)); + } + alloc.deallocate(std::get<0>(buffer_tuple), std::get<1>(buffer_tuple)); + } + instance()[i].unused_buffer_list.clear(); + } + } +#if defined(CPPUDDLE_HAVE_COUNTERS) && defined(CPPUDDLE_HAVE_HPX) + static size_t get_sum_number_recycling(bool reset) { + if (reset) + sum_number_recycling = 0; + return sum_number_recycling; + } + static size_t get_sum_number_allocation(bool reset) { + if (reset) + sum_number_allocation = 0; + return sum_number_allocation; + } + static size_t get_sum_number_creation(bool reset) { + if (reset) + sum_number_creation = 0; + return sum_number_creation; + } + static size_t get_sum_number_deallocation(bool reset) { + if (reset) + sum_number_deallocation = 0; + return sum_number_deallocation; + } + static size_t get_sum_number_wrong_hints(bool reset) { + if (reset) + sum_number_wrong_hints = 0; + return sum_number_wrong_hints; + } + static size_t get_sum_number_wrong_device_hints(bool reset) { + if (reset) + sum_number_wrong_hints = 0; + return sum_number_wrong_device_hints; + } + static size_t get_sum_number_bad_allocs(bool reset) { + if (reset) + sum_number_bad_allocs = 0; + return sum_number_bad_allocs; + } + + static void register_counters_with_hpx(void) { + std::string alloc_name = + boost::core::demangle(typeid(Host_Allocator).name()) + + std::string("_") + boost::core::demangle(typeid(T).name()); + hpx::performance_counters::install_counter_type( + std::string("/cppuddle/allocators/") + alloc_name + std::string("/number_recycling/"), + &get_sum_number_recycling, + "Number of allocations using a recycled buffer with this " + "allocator"); + hpx::performance_counters::install_counter_type( + std::string("/cppuddle/allocators/") + alloc_name + std::string("/number_allocations/"), + &get_sum_number_allocation, + "Number of allocations with this allocator"); + hpx::performance_counters::install_counter_type( + std::string("/cppuddle/allocators/") + alloc_name + std::string("/number_creations/"), + &get_sum_number_creation, + "Number of allocations not using a recycled buffer with this " + "allocator"); + hpx::performance_counters::install_counter_type( + std::string("/cppuddle/allocators/") + alloc_name + std::string("/number_deallocations/"), + &get_sum_number_deallocation, + "Number of 
deallocations yielding buffers to be recycled with this " + "allocator"); + hpx::performance_counters::install_counter_type( + std::string("/cppuddle/allocators/") + alloc_name + std::string("/number_wrong_hints/"), + &get_sum_number_wrong_hints, + "Number of wrong hints supplied to the dealloc method with this allocator"); + hpx::performance_counters::install_counter_type( + std::string("/cppuddle/allocators/") + alloc_name + std::string("/number_wrong_device_hints/"), + &get_sum_number_wrong_device_hints, + "Number of wrong device hints supplied to the dealloc method with this allocator"); + hpx::performance_counters::install_counter_type( + std::string("/cppuddle/allocators/") + alloc_name + std::string("/number_bad_allocs/"), + &get_sum_number_bad_allocs, + "Number of wrong bad allocs which triggered a cleanup of unused buffers"); + } +#endif + + /// Tries to recycle or create a buffer of type T and size number_elements. + static T *get(size_t number_of_elements, bool manage_content_lifetime, + std::optional location_hint = std::nullopt, + std::optional gpu_device_id = std::nullopt) { + init_callbacks_once(); + if (is_finalized) { + throw std::runtime_error("Tried allocation after finalization"); + } + assert(instance() && !is_finalized); + + size_t location_id = 0; + if (location_hint) { + location_id = *location_hint; + } + if (location_id >= number_instances) { + throw std::runtime_error("Tried to create buffer with invalid location_id [get]"); + } + size_t device_id = 0; + if (gpu_device_id) { + device_id = *gpu_device_id; + } + if (device_id >= max_number_gpus) { + throw std::runtime_error("Tried to create buffer with invalid device id [get]! " + "Is multigpu support enabled with the correct number " + "of GPUs?"); + } + + location_id = location_id + device_id * number_instances; + std::lock_guard guard(instance()[location_id].mut); + + +#ifdef CPPUDDLE_HAVE_COUNTERS + instance()[location_id].number_allocation++; + sum_number_allocation++; +#endif + // Check for unused buffers we can recycle: + for (auto iter = instance()[location_id].unused_buffer_list.begin(); + iter != instance()[location_id].unused_buffer_list.end(); iter++) { + auto tuple = *iter; + if (std::get<1>(tuple) == number_of_elements) { + instance()[location_id].unused_buffer_list.erase(iter); + + // handle the switch from aggressive to non aggressive reusage (or + // vice-versa) + if (manage_content_lifetime && !std::get<3>(tuple)) { + std::uninitialized_value_construct_n(std::get<0>(tuple), + number_of_elements); + std::get<3>(tuple) = true; + } else if (!manage_content_lifetime && std::get<3>(tuple)) { + std::destroy_n(std::get<0>(tuple), std::get<1>(tuple)); + std::get<3>(tuple) = false; + } + instance()[location_id].buffer_map.insert({std::get<0>(tuple), tuple}); +#ifdef CPPUDDLE_HAVE_COUNTERS + instance()[location_id].number_recycling++; + sum_number_recycling++; +#endif + return std::get<0>(tuple); + } + } + + // No unused buffer found -> Create new one and return it + try { + cppuddle::memory_recycling::device_selection::select_device_functor< + T, Host_Allocator>{}(device_id); + Host_Allocator alloc; + T *buffer = alloc.allocate(number_of_elements); + instance()[location_id].buffer_map.insert( + {buffer, std::make_tuple(buffer, number_of_elements, 1, + manage_content_lifetime)}); +#ifdef CPPUDDLE_HAVE_COUNTERS + instance()[location_id].number_creation++; + sum_number_creation++; +#endif + if (manage_content_lifetime) { + std::uninitialized_value_construct_n(buffer, number_of_elements); + } + return buffer; 
+ } catch (std::bad_alloc &e) { + // not enough memory left! Cleanup and attempt again: + std::cerr + << "Not enough memory left. Cleaning up unused buffers now..." + << std::endl; + buffer_interface::clean_unused_buffers(); + std::cerr << "Buffers cleaned! Try allocation again..." << std::endl; + + // If there still isn't enough memory left, the caller has to handle it + // We've done all we can in here + Host_Allocator alloc; + cppuddle::memory_recycling::device_selection::select_device_functor< + T, Host_Allocator>{}(device_id); + T *buffer = alloc.allocate(number_of_elements); + instance()[location_id].buffer_map.insert( + {buffer, std::make_tuple(buffer, number_of_elements, 1, + manage_content_lifetime)}); +#ifdef CPPUDDLE_HAVE_COUNTERS + instance()[location_id].number_creation++; + sum_number_creation++; + instance()[location_id].number_bad_alloc++; + sum_number_bad_allocs++; +#endif + std::cerr << "Second attempt allocation successful!" << std::endl; + if (manage_content_lifetime) { + std::uninitialized_value_construct_n(buffer, number_of_elements); + } + return buffer; + } + } + + static void mark_unused(T *memory_location, size_t number_of_elements, + std::optional location_hint = std::nullopt, + std::optional device_hint = std::nullopt) { + if (is_finalized) + return; + assert(instance() && !is_finalized); + + size_t location_id = 0; + if (location_hint) { + location_id = *location_hint; + if (location_id >= number_instances) { + throw std::runtime_error( + "Buffer recylcer received invalid location hint [mark_unused]"); + } + } + size_t device_id = 0; + if (device_hint) { + device_id = *device_hint; + if (device_id >= max_number_gpus) { + throw std::runtime_error( + "Buffer recylcer received invalid devce hint [mark_unused]"); + } + } + + // Attempt 1 to find the correct bucket/location: Look at provided hint: + if (location_hint) { + size_t location_id = location_hint.value() + device_id * number_instances; + std::lock_guard guard(instance()[location_id].mut); + if (instance()[location_id].buffer_map.find(memory_location) != + instance()[location_id].buffer_map.end()) { +#ifdef CPPUDDLE_HAVE_COUNTERS + instance()[location_id].number_deallocation++; + sum_number_deallocation++; +#endif + auto it = instance()[location_id].buffer_map.find(memory_location); + assert(it != instance()[location_id].buffer_map.end()); + auto &tuple = it->second; + // sanity checks: + assert(std::get<1>(tuple) == number_of_elements); + // move to the unused_buffer list + instance()[location_id].unused_buffer_list.push_front(tuple); + instance()[location_id].buffer_map.erase(memory_location); + return; // Success + } + // hint was wrong +#ifdef CPPUDDLE_HAVE_COUNTERS + instance()[location_id].number_wrong_hints++; + sum_number_wrong_hints++; +#endif + } + // Failed to find buffer in the specified localtion/device! + // Attempt 2 - Look for buffer other locations on the same device... 
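// (Why the fallback search is needed: the hint passed in here is the
// bucket recorded when the recycling allocator was created, while the
// buffer was filed under the bucket of the worker thread that actually
// ran the allocation; with HPX task stealing the two can differ. The
// lookup above then misses, number_wrong_hints is incremented, and the
// loops below scan the remaining locations of this device and, failing
// that, the other devices.)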
+ for (size_t location_id = device_id * number_instances; + location_id < (device_id + 1) * number_instances; location_id++) { + if (location_hint) { + if (*location_hint + device_id * max_number_gpus == location_id) { + continue; // already tried this -> skip + } + } + std::lock_guard guard(instance()[location_id].mut); + if (instance()[location_id].buffer_map.find(memory_location) != + instance()[location_id].buffer_map.end()) { +#ifdef CPPUDDLE_HAVE_COUNTERS + instance()[location_id].number_deallocation++; + sum_number_deallocation++; +#endif + auto it = instance()[location_id].buffer_map.find(memory_location); + assert(it != instance()[location_id].buffer_map.end()); + auto &tuple = it->second; + // sanity checks: + assert(std::get<1>(tuple) == number_of_elements); + // move to the unused_buffer list + instance()[location_id].unused_buffer_list.push_front(tuple); + instance()[location_id].buffer_map.erase(memory_location); + return; // Success + } + } + // device hint was wrong +#ifdef CPPUDDLE_HAVE_COUNTERS + if (device_hint) { + sum_number_wrong_device_hints++; + } +#endif + // Failed to find buffer on the specified device! + // Attempt 3 - Look for buffer on other devices... + for (size_t local_device_id = 0; local_device_id < max_number_gpus; + local_device_id++) { + if (local_device_id == device_id) + continue; // aldready tried this device + + // Try hint localtion first yet again (though on different device) + if (location_hint) { + size_t location_id = location_hint.value() + local_device_id * number_instances; + std::lock_guard guard(instance()[location_id].mut); + if (instance()[location_id].buffer_map.find(memory_location) != + instance()[location_id].buffer_map.end()) { +#ifdef CPPUDDLE_HAVE_COUNTERS + instance()[location_id].number_deallocation++; + sum_number_deallocation++; +#endif + auto it = instance()[location_id].buffer_map.find(memory_location); + assert(it != instance()[location_id].buffer_map.end()); + auto &tuple = it->second; + // sanity checks: + assert(std::get<1>(tuple) == number_of_elements); + // move to the unused_buffer list + instance()[location_id].unused_buffer_list.push_front(tuple); + instance()[location_id].buffer_map.erase(memory_location); + return; // Success + } + } + // Failed - check all other localtions on device + for (size_t location_id = local_device_id * number_instances; + location_id < (local_device_id + 1) * number_instances; location_id++) { + if (location_hint) { + if (*location_hint + local_device_id * max_number_gpus == location_id) { + continue; // already tried this -> skip + } + } + std::lock_guard guard(instance()[location_id].mut); + if (instance()[location_id].buffer_map.find(memory_location) != + instance()[location_id].buffer_map.end()) { +#ifdef CPPUDDLE_HAVE_COUNTERS + instance()[location_id].number_deallocation++; + sum_number_deallocation++; +#endif + auto it = instance()[location_id].buffer_map.find(memory_location); + assert(it != instance()[location_id].buffer_map.end()); + auto &tuple = it->second; + // sanity checks: + assert(std::get<1>(tuple) == number_of_elements); + // move to the unused_buffer list + instance()[location_id].unused_buffer_list.push_front(tuple); + instance()[location_id].buffer_map.erase(memory_location); + return; // Success + } + } + } + // Buffer that is to be deleted is nowhere to be found - we looked everywhere! + // => + // Failure! Handle here... 
+ + // TODO Throw exception instead in the futures, as soon as the recycler finalize is + // in all user codes + /* throw std::runtime_error("Tried to delete non-existing buffer"); */ + + // This is odd: Print warning -- however, might also happen with static + // buffers using these allocators IF the new finalize was not called. For + // now, print warning until all user-code is upgraded to the finalize method. + // This allows using current versions of cppuddle with older application code + std::cerr + << "Warning! Tried to delete non-existing buffer within CPPuddle!" + << std::endl; + std::cerr << "Did you forget to call recycler::finalize?" << std::endl; + } + + private: + /// List with all buffers still in usage + std::unordered_map buffer_map{}; + /// List with all buffers currently not used + std::list unused_buffer_list{}; + /// Access control + mutex_t mut; +#ifdef CPPUDDLE_HAVE_COUNTERS + /// Performance counters + size_t number_allocation{0}, number_deallocation{0}, number_wrong_hints{0}, + number_recycling{0}, number_creation{0}, number_bad_alloc{0}; + + static inline std::atomic sum_number_allocation{0}, + sum_number_deallocation{0}, sum_number_wrong_hints{0}, + sum_number_wrong_device_hints{0}, sum_number_recycling{0}, + sum_number_creation{0}, sum_number_bad_allocs{0}; +#endif + /// default, private constructor - not automatically constructed due to + /// the deleted constructors + buffer_manager() = default; + buffer_manager& + operator=(buffer_manager const &other) = default; + buffer_manager& + operator=(buffer_manager &&other) = delete; + static std::unique_ptr& instance(void) { + static std::unique_ptr instances{ + new buffer_manager[number_instances * max_number_gpus]}; + return instances; + } + static void init_callbacks_once(void) { + assert(instance()); +#if defined(CPPUDDLE_HAVE_HPX) && defined(CPPUDDLE_HAVE_HPX_MUTEX) + static hpx::once_flag flag; + hpx::call_once(flag, []() { +#else + static std::once_flag flag; + std::call_once(flag, []() { +#endif + is_finalized = false; + buffer_interface::add_total_cleanup_callback(clean); + buffer_interface::add_partial_cleanup_callback( + clean_unused_buffers_only); + buffer_interface::add_finalize_callback( + finalize); +#ifdef CPPUDDLE_HAVE_COUNTERS + buffer_interface::add_print_callback( + print_performance_counters); +#endif + }); + } + static inline std::atomic is_finalized; + +#ifdef CPPUDDLE_HAVE_COUNTERS + void print_counters(void) { + if (number_allocation == 0) + return; + // Print performance counters + size_t number_cleaned = unused_buffer_list.size() + buffer_map.size(); + std::cout << "\nBuffer manager destructor for (Alloc: " + << boost::core::demangle(typeid(Host_Allocator).name()) << ", Type: " + << boost::core::demangle(typeid(T).name()) + << "):" << std::endl + << "--------------------------------------------------------------------" + << std::endl + << "--> Number of bad_allocs that triggered garbage " + "collection: " + << number_bad_alloc << std::endl + << "--> Number of buffers that got requested from this " + "manager: " + << number_allocation << std::endl + << "--> Number of times an unused buffer got recycled for a " + "request: " + << number_recycling << std::endl + << "--> Number of times a new buffer had to be created for a " + "request: " + << number_creation << std::endl + << "--> Number cleaned up buffers: " + " " + << number_cleaned << std::endl + << "--> Number wrong deallocation hints: " + " " + << number_wrong_hints << std::endl + << "--> Number of buffers that were marked as used upon " 
+ "cleanup: " + << buffer_map.size() << std::endl + << "==> Recycle rate: " + " " + << static_cast(number_recycling) / number_allocation * + 100.0f + << "%" << std::endl; + } +#endif + + void clean_all_buffers(void) { +#ifdef CPPUDDLE_HAVE_COUNTERS + if (number_allocation == 0 && number_recycling == 0 && + number_bad_alloc == 0 && number_creation == 0 && + unused_buffer_list.empty() && buffer_map.empty()) { + return; + } +#endif + for (auto &buffer_tuple : unused_buffer_list) { + Host_Allocator alloc; + if (std::get<3>(buffer_tuple)) { + std::destroy_n(std::get<0>(buffer_tuple), std::get<1>(buffer_tuple)); + } + alloc.deallocate(std::get<0>(buffer_tuple), std::get<1>(buffer_tuple)); + } + for (auto &map_tuple : buffer_map) { + auto buffer_tuple = map_tuple.second; + Host_Allocator alloc; + if (std::get<3>(buffer_tuple)) { + std::destroy_n(std::get<0>(buffer_tuple), std::get<1>(buffer_tuple)); + } + alloc.deallocate(std::get<0>(buffer_tuple), std::get<1>(buffer_tuple)); + } + unused_buffer_list.clear(); + buffer_map.clear(); +#ifdef CPPUDDLE_HAVE_COUNTERS + number_allocation = 0; + number_recycling = 0; + number_bad_alloc = 0; + number_creation = 0; + number_wrong_hints = 0; +#endif + } + public: + ~buffer_manager() { + clean_all_buffers(); + } + + public: // Putting deleted constructors in public gives more useful error + // messages + // Bunch of constructors we don't need + buffer_manager( + buffer_manager const &other) = delete; + buffer_manager( + buffer_manager &&other) = delete; + }; + +public: + // Putting deleted constructors in public gives more useful error messages + // Bunch of constructors we don't need + buffer_interface(buffer_interface const &other) = delete; + buffer_interface& operator=(buffer_interface const &other) = delete; + buffer_interface(buffer_interface &&other) = delete; + buffer_interface& operator=(buffer_interface &&other) = delete; +}; + +template struct recycle_allocator { + using value_type = T; + using underlying_allocator_type = Host_Allocator; + static_assert(std::is_same_v); + const std::optional dealloc_hint; + const std::optional device_id; + +#ifndef CPPUDDLE_HAVE_HPX_AWARE_ALLOCATORS + recycle_allocator() noexcept + : dealloc_hint(std::nullopt), device_id(std::nullopt) {} + explicit recycle_allocator(size_t hint) noexcept + : dealloc_hint(std::nullopt), device_id(std::nullopt) {} + explicit recycle_allocator( + recycle_allocator const &other) noexcept + : dealloc_hint(std::nullopt), device_id(std::nullopt) {} + T *allocate(std::size_t n) { + T *data = buffer_interface::get(n); + return data; + } + void deallocate(T *p, std::size_t n) { + buffer_interface::mark_unused(p, n); + } +#else + recycle_allocator() noexcept + : dealloc_hint(hpx::get_worker_thread_num() % number_instances), device_id(0) {} + explicit recycle_allocator(const size_t device_id) noexcept + : dealloc_hint(hpx::get_worker_thread_num() % number_instances), device_id(device_id) {} + explicit recycle_allocator(const size_t device_i, const size_t location_id) noexcept + : dealloc_hint(location_id), device_id(device_id) {} + explicit recycle_allocator( + recycle_allocator const &other) noexcept + : dealloc_hint(other.dealloc_hint), device_id(other.device_id) {} + T *allocate(std::size_t n) { + T *data = buffer_interface::get( + n, false, hpx::get_worker_thread_num() % number_instances, device_id); + return data; + } + void deallocate(T *p, std::size_t n) { + buffer_interface::mark_unused(p, n, dealloc_hint, + device_id); + } +#endif + + template + inline void construct(T *p, Args... 
args) noexcept { + ::new (static_cast(p)) T(std::forward(args)...); + } + void destroy(T *p) { p->~T(); } +}; +template +constexpr bool +operator==(recycle_allocator const &, + recycle_allocator const &) noexcept { + if constexpr (std::is_same_v) + return true; + else + return false; +} +template +constexpr bool +operator!=(recycle_allocator const &, + recycle_allocator const &) noexcept { + if constexpr (std::is_same_v) + return false; + else + return true; +} + +/// Recycles not only allocations but also the contents of a buffer +template +struct aggressive_recycle_allocator { + using value_type = T; + using underlying_allocator_type = Host_Allocator; + static_assert(std::is_same_v); + const std::optional dealloc_hint; + const std::optional device_id; + +#ifndef CPPUDDLE_HAVE_HPX_AWARE_ALLOCATORS + aggressive_recycle_allocator() noexcept + : dealloc_hint(std::nullopt), device_id(std::nullopt) {} + explicit aggressive_recycle_allocator(size_t hint) noexcept + : dealloc_hint(std::nullopt), device_id(std::nullopt) {} + explicit aggressive_recycle_allocator( + aggressive_recycle_allocator const &) noexcept + : dealloc_hint(std::nullopt), device_id(std::nullopt) {} + T *allocate(std::size_t n) { + T *data = buffer_interface::get( + n, true); // also initializes the buffer if it isn't reused + return data; + } + void deallocate(T *p, std::size_t n) { + buffer_interface::mark_unused(p, n); + } +#else + aggressive_recycle_allocator() noexcept + : dealloc_hint(hpx::get_worker_thread_num() % number_instances), device_id(0) {} + explicit aggressive_recycle_allocator(const size_t device_id) noexcept + : dealloc_hint(hpx::get_worker_thread_num() % number_instances), device_id(device_id) {} + explicit aggressive_recycle_allocator(const size_t device_id, const size_t location_id) noexcept + : dealloc_hint(location_id), device_id(device_id) {} + explicit aggressive_recycle_allocator( + recycle_allocator const &other) noexcept + : dealloc_hint(other.dealloc_hint), device_id(other.device_id) {} + T *allocate(std::size_t n) { + T *data = buffer_interface::get( + n, true, dealloc_hint, device_id); // also initializes the buffer + // if it isn't reused + return data; + } + void deallocate(T *p, std::size_t n) { + buffer_interface::mark_unused(p, n, dealloc_hint, + device_id); + } +#endif + +#ifndef CPPUDDLE_DEACTIVATE_AGGRESSIVE_ALLOCATORS + template + inline void construct(T *p, Args... args) noexcept { + // Do nothing here - we reuse the content of the last owner + } + void destroy(T *p) { + // Do nothing here - Contents will be destroyed when the buffer manager is + // destroyed, not before + } +#else +// Warn about suboptimal performance without recycling +#pragma message \ +"Warning: Building without content reusage for aggressive allocators! \ +For better performance configure with CPPUDDLE_WITH_AGGRESSIVE_CONTENT_RECYCLING=ON !" + template + inline void construct(T *p, Args... 
args) noexcept { + ::new (static_cast(p)) T(std::forward(args)...); + } + void destroy(T *p) { p->~T(); } +#endif +}; + +template +constexpr bool +operator==(aggressive_recycle_allocator const &, + aggressive_recycle_allocator const &) noexcept { + if constexpr (std::is_same_v) + return true; + else + return false; +} +template +constexpr bool +operator!=(aggressive_recycle_allocator const &, + aggressive_recycle_allocator const &) noexcept { + if constexpr (std::is_same_v) + return false; + else + return true; +} +} // namespace detail +} // namespace memory_recycling +} // end namespace cppuddle + +#endif diff --git a/include/cppuddle/memory_recycling/detail/cuda_underlying_allocators.hpp b/include/cppuddle/memory_recycling/detail/cuda_underlying_allocators.hpp new file mode 100644 index 00000000..ab1f8681 --- /dev/null +++ b/include/cppuddle/memory_recycling/detail/cuda_underlying_allocators.hpp @@ -0,0 +1,101 @@ +// Copyright (c) 2020-2024 Gregor Daiß +// +// Distributed under the Boost Software License, Version 1.0. (See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#ifndef CUDA_UNDERLYING_ALLOCATORS_HPP +#define CUDA_UNDERLYING_ALLOCATORS_HPP + +#include +#include +#include + +namespace cppuddle { +namespace memory_recycling { +namespace detail { +/// Underlying host allocator for CUDA pinned memory +template struct cuda_pinned_allocator { + using value_type = T; + cuda_pinned_allocator() noexcept = default; + template + explicit cuda_pinned_allocator(cuda_pinned_allocator const &) noexcept {} + T *allocate(std::size_t n) { + T *data; + cudaError_t error = + cudaMallocHost(reinterpret_cast(&data), n * sizeof(T)); + if (error != cudaSuccess) { + std::string msg = + std::string( + "cuda_pinned_allocator failed due to cudaMallocHost failure : ") + + std::string(cudaGetErrorString(error)); + throw std::runtime_error(msg); + } + return data; + } + void deallocate(T *p, std::size_t n) { + cudaError_t error = cudaFreeHost(p); + if (error != cudaSuccess) { + std::string msg = + std::string( + "cuda_pinned_allocator failed due to cudaFreeHost failure : ") + + std::string(cudaGetErrorString(error)); + throw std::runtime_error(msg); + } + } +}; + +template +constexpr bool operator==(cuda_pinned_allocator const &, + cuda_pinned_allocator const &) noexcept { + return true; +} +template +constexpr bool operator!=(cuda_pinned_allocator const &, + cuda_pinned_allocator const &) noexcept { + return false; +} + +/// Underlying allocator for CUDA device memory +template struct cuda_device_allocator { + using value_type = T; + cuda_device_allocator() noexcept = default; + template + explicit cuda_device_allocator(cuda_device_allocator const &) noexcept {} + T *allocate(std::size_t n) { + T *data; + cudaError_t error = cudaMalloc(&data, n * sizeof(T)); + if (error != cudaSuccess) { + std::string msg = + std::string( + "cuda_device_allocator failed due to cudaMalloc failure : ") + + std::string(cudaGetErrorString(error)); + throw std::runtime_error(msg); + } + return data; + } + void deallocate(T *p, std::size_t n) { + cudaError_t error = cudaFree(p); + if (error != cudaSuccess) { + std::string msg = + std::string( + "cuda_device_allocator failed due to cudaFree failure : ") + + std::string(cudaGetErrorString(error)); + throw std::runtime_error(msg); + } + } +}; +template +constexpr bool operator==(cuda_device_allocator const &, + cuda_device_allocator const &) noexcept { + return true; +} +template +constexpr bool operator!=(cuda_device_allocator const &, + 
cuda_device_allocator const &) noexcept { + return false; +} +} // end namespace detail +} // namespace memory_recycling +} // end namespace cppuddle + +#endif diff --git a/include/cppuddle/memory_recycling/detail/hip_underlying_allocators.hpp b/include/cppuddle/memory_recycling/detail/hip_underlying_allocators.hpp new file mode 100644 index 00000000..bfd7c2e1 --- /dev/null +++ b/include/cppuddle/memory_recycling/detail/hip_underlying_allocators.hpp @@ -0,0 +1,107 @@ +// Copyright (c) 2021-2024 Gregor Daiß +// +// Distributed under the Boost Software License, Version 1.0. (See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#ifndef HIP_UNDERLYING_ALLOCATORS_HPP +#define HIP_UNDERLYING_ALLOCATORS_HPP + +#include +#include +#include + +namespace cppuddle { +namespace memory_recycling { +namespace detail { +/// Underlying host allocator for HIP pinned memory +template struct hip_pinned_allocator { + using value_type = T; + hip_pinned_allocator() noexcept = default; + template + explicit hip_pinned_allocator(hip_pinned_allocator const &) noexcept {} + T *allocate(std::size_t n) { + T *data; + // hipError_t error = + // hipMallocHost(reinterpret_cast(&data), n * sizeof(T)); + + // Even though marked as deprecated, the HIP docs recommend using hipHostMalloc + // (not hipMallocHost) for async memcpys + // https://rocmdocs.amd.com/en/latest/ROCm_API_References/HIP_API/Memory-Management.html#hipmemcpyasync + hipError_t error = + hipHostMalloc(reinterpret_cast(&data), n * sizeof(T)); + if (error != hipSuccess) { + std::string msg = + std::string( + "hip_pinned_allocator failed due to hipMallocHost failure : ") + + std::string(hipGetErrorString(error)); + throw std::runtime_error(msg); + } + return data; + } + void deallocate(T *p, std::size_t n) { + hipError_t error = hipHostFree(p); + if (error != hipSuccess) { + std::string msg = + std::string( + "hip_pinned_allocator failed due to hipFreeHost failure : ") + + std::string(hipGetErrorString(error)); + throw std::runtime_error(msg); + } + } +}; +template +constexpr bool operator==(hip_pinned_allocator const &, + hip_pinned_allocator const &) noexcept { + return true; +} +template +constexpr bool operator!=(hip_pinned_allocator const &, + hip_pinned_allocator const &) noexcept { + return false; +} + +/// Underlying allocator for HIP device memory +template struct hip_device_allocator { + using value_type = T; + hip_device_allocator() noexcept = default; + template + explicit hip_device_allocator(hip_device_allocator const &) noexcept {} + T *allocate(std::size_t n) { + T *data; + hipError_t error = hipMalloc(&data, n * sizeof(T)); + if (error != hipSuccess) { + std::string msg = + std::string( + "hip_device_allocator failed due to hipMalloc failure : ") + + std::string(hipGetErrorString(error)); + throw std::runtime_error(msg); + } + return data; + } + void deallocate(T *p, std::size_t n) { + hipError_t error = hipFree(p); + if (error != hipSuccess) { + std::string msg = + std::string( + "hip_device_allocator failed due to hipFree failure : ") + + std::string(hipGetErrorString(error)); + throw std::runtime_error(msg); + } + } +}; +template +constexpr bool operator==(hip_device_allocator const &, + hip_device_allocator const &) noexcept { + return true; +} +template +constexpr bool operator!=(hip_device_allocator const &, + hip_device_allocator const &) noexcept { + return false; +} + +} // end namespace detail +} // namespace memory_recycling +} // end namespace cppuddle + +#endif diff --git 
a/include/cppuddle/memory_recycling/detail/sycl_underlying_allocators.hpp b/include/cppuddle/memory_recycling/detail/sycl_underlying_allocators.hpp new file mode 100644 index 00000000..3e3c9173 --- /dev/null +++ b/include/cppuddle/memory_recycling/detail/sycl_underlying_allocators.hpp @@ -0,0 +1,74 @@ +// Copyright (c) 2023-2024 Gregor Daiß +// +// Distributed under the Boost Software License, Version 1.0. (See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#ifndef SYCL_UNDERLYING_ALLOCATORS_HPP +#define SYCL_UNDERLYING_ALLOCATORS_HPP + +#include +#include +#include + +namespace cppuddle { +namespace memory_recycling { +namespace detail { +/// Underlying host allocator for SYCL pinned memory (using the sycl::default_selector{}) +template struct sycl_host_default_allocator { + using value_type = T; + sycl_host_default_allocator() noexcept = default; + template + explicit sycl_host_default_allocator(sycl_host_default_allocator const &) noexcept {} + T *allocate(std::size_t n) { + static cl::sycl::queue default_queue(cl::sycl::default_selector{}); + T *data = cl::sycl::malloc_host(n, default_queue); + return data; + } + void deallocate(T *p, std::size_t n) { + static cl::sycl::queue default_queue(cl::sycl::default_selector{}); + cl::sycl::free(p, default_queue); + } +}; +template +constexpr bool operator==(sycl_host_default_allocator const &, + sycl_host_default_allocator const &) noexcept { + return true; +} +template +constexpr bool operator!=(sycl_host_default_allocator const &, + sycl_host_default_allocator const &) noexcept { + return false; +} + +/// Underlying allocator for SYCL device memory (using the sycl::default_selector{}) +template struct sycl_device_default_allocator { + using value_type = T; + sycl_device_default_allocator() noexcept = default; + template + explicit sycl_device_default_allocator(sycl_device_default_allocator const &) noexcept {} + T *allocate(std::size_t n) { + static cl::sycl::queue default_queue(cl::sycl::default_selector{}); + T *data = cl::sycl::malloc_device(n, default_queue); + return data; + } + void deallocate(T *p, std::size_t n) { + static cl::sycl::queue default_queue(cl::sycl::default_selector{}); + cl::sycl::free(p, default_queue); + } +}; +template +constexpr bool operator==(sycl_device_default_allocator const &, + sycl_device_default_allocator const &) noexcept { + return true; +} +template +constexpr bool operator!=(sycl_device_default_allocator const &, + sycl_device_default_allocator const &) noexcept { + return false; +} + +} // end namespace detail +} // namespace memory_recycling +} // end namespace cppuddle + +#endif diff --git a/include/cppuddle/memory_recycling/hip_recycling_allocators.hpp b/include/cppuddle/memory_recycling/hip_recycling_allocators.hpp new file mode 100644 index 00000000..13b5241b --- /dev/null +++ b/include/cppuddle/memory_recycling/hip_recycling_allocators.hpp @@ -0,0 +1,41 @@ +// Copyright (c) 2020-2024 Gregor Daiß +// +// Distributed under the Boost Software License, Version 1.0. 
(See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#ifndef HIP_RECYCLING_ALLOCATORS_HPP +#define HIP_RECYCLING_ALLOCATORS_HPP + +#include "buffer_management_interface.hpp" +// import hip_pinned_allocator and hip_device_allocator +#include "detail/hip_underlying_allocators.hpp" + +namespace cppuddle { +namespace memory_recycling { + +// Tell cppuddle how to select the device for the hip allocators +namespace device_selection { +/// GPU device selector using the HIP API for pinned host allocations +template +struct select_device_functor> { + void operator()(const size_t device_id) { hipSetDevice(device_id); } +}; +/// GPU selector using the HIP API for pinned host allocations +template +struct select_device_functor> { + void operator()(const size_t device_id) { hipSetDevice(device_id); } +}; +} // namespace device_selection + +/// Recycling allocator for HIP pinned host memory +template ::value, int> = 0> +using recycle_allocator_hip_host = + detail::aggressive_recycle_allocator>; +/// Recycling allocator for HIP device memory +template ::value, int> = 0> +using recycle_allocator_hip_device = + detail::recycle_allocator>; + +} // namespace memory_recycling +} // end namespace cppuddle +#endif diff --git a/include/cppuddle/memory_recycling/std_recycling_allocators.hpp b/include/cppuddle/memory_recycling/std_recycling_allocators.hpp new file mode 100644 index 00000000..21fd5c2c --- /dev/null +++ b/include/cppuddle/memory_recycling/std_recycling_allocators.hpp @@ -0,0 +1,35 @@ +// Copyright (c) 2024 Gregor Daiß +// +// Distributed under the Boost Software License, Version 1.0. (See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#ifndef STD_RECYCLING_ALLOCATORS_HPP +#define STD_RECYCLING_ALLOCATORS_HPP + +#include "buffer_management_interface.hpp" + +namespace cppuddle { +namespace memory_recycling { + +namespace device_selection { +/// Dummy GPU selector. Needs to be defined for MultiGPU builds as the default / +/// select_device_functor does not compile for > 1 GPU (to make sure all / +/// relevant allocators support multigpu) +template struct select_device_functor> { + void operator()(const size_t device_id) {} +}; +} // namespace device_selection + + +/// Recycling allocator for std memory +template ::value, int> = 0> +using recycle_std = detail::recycle_allocator>; +/// Recycling allocator for boost aligned memory (reusing previous content as well) +template ::value, int> = 0> +using aggressive_recycle_std = + detail::aggressive_recycle_allocator>; + +} // namespace memory_recycling +} // namespace cppuddle + +#endif diff --git a/include/cppuddle/memory_recycling/sycl_recycling_allocators.hpp b/include/cppuddle/memory_recycling/sycl_recycling_allocators.hpp new file mode 100644 index 00000000..fd494bca --- /dev/null +++ b/include/cppuddle/memory_recycling/sycl_recycling_allocators.hpp @@ -0,0 +1,31 @@ +// Copyright (c) 2020-2024 Gregor Daiß +// +// Distributed under the Boost Software License, Version 1.0. 
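// --------------------------------------------------------------------------
// Illustrative usage sketch (annotation, not part of the patch): how the
// recycle_std / aggressive_recycle_std allocators introduced above in
// std_recycling_allocators.hpp are meant to be used. The free-standing
// function and the buffer size are assumptions for illustration; the allocator
// and counter/cleanup names are taken from the new interface in this changeset.
#include <cstddef>
#include <vector>
#include "cppuddle/memory_recycling/std_recycling_allocators.hpp"

void recycling_example(std::size_t n) {
  {
    // First pass: a fresh buffer is created and handed to the vector.
    std::vector<double, cppuddle::memory_recycling::aggressive_recycle_std<double>>
        pass1(n, double{});
  } // Buffer is not freed here, only marked as unused inside the manager.
  {
    // Second pass with the same element type and size: the buffer (including
    // its previous contents, since this is the aggressive variant) is reused.
    std::vector<double, cppuddle::memory_recycling::aggressive_recycle_std<double>>
        pass2(n, double{});
  }
  cppuddle::memory_recycling::print_buffer_counters(); // allocation/recycle stats
  cppuddle::memory_recycling::force_buffer_cleanup();  // release all cached buffers
}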
(See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#ifndef SYCL_RECYCLING_ALLOCATORS_HPP +#define SYCL_RECYCLING_ALLOCATORS_HPP + +#include "buffer_management_interface.hpp" +#include "detail/sycl_underlying_allocators.hpp" + +namespace cppuddle { +namespace memory_recycling { + +namespace device_selection { +// No MutliGPU support yet, hence no select_device_function required +static_assert(max_number_gpus <= 1, "CPPuddle currently does not support MultiGPU SYCL builds!"); +} // namespace device_selection + +/// Recycling allocator for SYCL pinned host memory (default device) +template ::value, int> = 0> +using recycle_allocator_sycl_host = + detail::aggressive_recycle_allocator>; +/// Recycling allocator for SYCL device memory (default device) +template ::value, int> = 0> +using recycle_allocator_sycl_device = + detail::recycle_allocator>; + +} // namespace memory_recycling +} // end namespace cppuddle +#endif diff --git a/include/cppuddle/memory_recycling/util/cuda_recycling_device_buffer.hpp b/include/cppuddle/memory_recycling/util/cuda_recycling_device_buffer.hpp new file mode 100644 index 00000000..dbd7e4c8 --- /dev/null +++ b/include/cppuddle/memory_recycling/util/cuda_recycling_device_buffer.hpp @@ -0,0 +1,66 @@ +// Copyright (c) 2020-2024 Gregor Daiß +// +// Distributed under the Boost Software License, Version 1.0. (See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#ifndef CUDA_RECYCLING_BUFFER_HPP +#define CUDA_RECYCLING_BUFFER_HPP + +// import recycle_allocator_cuda_device +#include "cppuddle/memory_recycling/cuda_recycling_allocators.hpp"" + +namespace cppuddle { +namespace memory_recycling { + + +/// RAII wrapper for CUDA device memory +template ::value, int> = 0> +struct cuda_device_buffer { + recycle_allocator_cuda_device allocator; + T *device_side_buffer; + size_t number_of_elements; + + cuda_device_buffer(const size_t number_of_elements, const size_t device_id = 0) + : allocator{device_id}, number_of_elements(number_of_elements) { + assert(device_id < max_number_gpus); + device_side_buffer = + allocator.allocate(number_of_elements); + } + ~cuda_device_buffer() { + allocator.deallocate(device_side_buffer, number_of_elements); + } + // not yet implemented + cuda_device_buffer(cuda_device_buffer const &other) = delete; + cuda_device_buffer operator=(cuda_device_buffer const &other) = delete; + cuda_device_buffer(cuda_device_buffer const &&other) = delete; + cuda_device_buffer operator=(cuda_device_buffer const &&other) = delete; + +}; + +/// RAII wrapper for CUDA device memory using a passed aggregated allocator +template ::value, int> = 0> +struct cuda_aggregated_device_buffer { + T *device_side_buffer; + size_t number_of_elements; + cuda_aggregated_device_buffer(size_t number_of_elements, Host_Allocator &alloc) + : number_of_elements(number_of_elements), alloc(alloc) { + device_side_buffer = + alloc.allocate(number_of_elements); + } + ~cuda_aggregated_device_buffer() { + alloc.deallocate(device_side_buffer, number_of_elements); + } + // not yet implemented + cuda_aggregated_device_buffer(cuda_aggregated_device_buffer const &other) = delete; + cuda_aggregated_device_buffer operator=(cuda_aggregated_device_buffer const &other) = delete; + cuda_aggregated_device_buffer(cuda_aggregated_device_buffer const &&other) = delete; + cuda_aggregated_device_buffer operator=(cuda_aggregated_device_buffer const &&other) = delete; + +private: + Host_Allocator &alloc; // will stay valid for the entire 
aggregation region and hence + // for the entire lifetime of this buffer +}; + +} // namespace memory_recycling +} // end namespace cppuddle +#endif diff --git a/include/cppuddle/memory_recycling/util/hip_recycling_device_buffer.hpp b/include/cppuddle/memory_recycling/util/hip_recycling_device_buffer.hpp new file mode 100644 index 00000000..7f04e3f7 --- /dev/null +++ b/include/cppuddle/memory_recycling/util/hip_recycling_device_buffer.hpp @@ -0,0 +1,65 @@ +// Copyright (c) 2020-2024 Gregor Daiß +// +// Distributed under the Boost Software License, Version 1.0. (See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#ifndef HIP_RECYCLING_BUFFER_HPP +#define HIP_RECYCLING_BUFFER_HPP + +// import recycle_allocator_hip_device +#include "cppuddle/memory_recycling/hip_recycling_allocators.hpp" + +namespace cppuddle { +namespace memory_recycling { + +/// RAII wrapper for HIP device memory +template ::value, int> = 0> +struct hip_device_buffer { + recycle_allocator_hip_device allocator; + T *device_side_buffer; + size_t number_of_elements; + + hip_device_buffer(size_t number_of_elements, size_t device_id) + : allocator{device_id}, number_of_elements(number_of_elements) { + assert(device_id < max_number_gpus); + device_side_buffer = + allocator.allocate(number_of_elements); + } + ~hip_device_buffer() { + allocator.deallocate(device_side_buffer, number_of_elements); + } + // not yet implemented + hip_device_buffer(hip_device_buffer const &other) = delete; + hip_device_buffer operator=(hip_device_buffer const &other) = delete; + hip_device_buffer(hip_device_buffer const &&other) = delete; + hip_device_buffer operator=(hip_device_buffer const &&other) = delete; + +}; + +/// RAII wrapper for CUDA device memory using a passed aggregated allocator +template ::value, int> = 0> +struct hip_aggregated_device_buffer { + T *device_side_buffer; + size_t number_of_elements; + hip_aggregated_device_buffer(size_t number_of_elements, Host_Allocator &alloc) + : number_of_elements(number_of_elements), alloc(alloc) { + device_side_buffer = + alloc.allocate(number_of_elements); + } + ~hip_aggregated_device_buffer() { + alloc.deallocate(device_side_buffer, number_of_elements); + } + // not yet implemented + hip_aggregated_device_buffer(hip_aggregated_device_buffer const &other) = delete; + hip_aggregated_device_buffer operator=(hip_aggregated_device_buffer const &other) = delete; + hip_aggregated_device_buffer(hip_aggregated_device_buffer const &&other) = delete; + hip_aggregated_device_buffer operator=(hip_aggregated_device_buffer const &&other) = delete; + +private: + Host_Allocator &alloc; // will stay valid for the entire aggregation region and hence + // for the entire lifetime of this buffer +}; + +} // namespace memory_recycling +} // end namespace cppuddle +#endif diff --git a/include/cppuddle/memory_recycling/util/recycling_kokkos_view.hpp b/include/cppuddle/memory_recycling/util/recycling_kokkos_view.hpp new file mode 100644 index 00000000..b8ca526c --- /dev/null +++ b/include/cppuddle/memory_recycling/util/recycling_kokkos_view.hpp @@ -0,0 +1,171 @@ +// Copyright (c) 2020-2024 Gregor Daiß +// +// Distributed under the Boost Software License, Version 1.0. 
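// --------------------------------------------------------------------------
// Illustrative usage sketch (annotation, not part of the patch): combining the
// RAII device-buffer wrapper with the pinned-host recycling allocator, assuming
// a CUDA build. The HIP wrappers above follow the same pattern. The function
// name and buffer size are placeholders for illustration.
#include <cstddef>
#include <vector>
#include <cuda_runtime.h>
#include "cppuddle/memory_recycling/cuda_recycling_allocators.hpp"
#include "cppuddle/memory_recycling/util/cuda_recycling_device_buffer.hpp"

void upload_example(std::size_t n, std::size_t device_id) {
  using cppuddle::memory_recycling::recycle_allocator_cuda_host;
  using cppuddle::memory_recycling::cuda_device_buffer;

  // Pinned host staging buffer drawn from the recycling pool for device_id.
  std::vector<float, recycle_allocator_cuda_host<float>> host_buf(
      n, float{}, recycle_allocator_cuda_host<float>{device_id});
  // Device allocation; recycled (not freed) when this object goes out of scope.
  cuda_device_buffer<float> device_buf(n, device_id);

  cudaMemcpy(device_buf.device_side_buffer, host_buf.data(), n * sizeof(float),
             cudaMemcpyHostToDevice);
  // ... launch kernels using device_buf.device_side_buffer ...
}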
(See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#ifndef RECYCLING_KOKKOS_VIEW_HPP +#define RECYCLING_KOKKOS_VIEW_HPP +#include +#include +#include + +#include "cppuddle/memory_recycling/buffer_management_interface.hpp" + + +namespace cppuddle { +namespace memory_recycling { + + +template +struct view_deleter { + alloc_type allocator; + size_t total_elements; + view_deleter(alloc_type alloc, size_t total_elements) : allocator(alloc), + total_elements(total_elements) {} + void operator()(element_type* p) { + allocator.deallocate(p, total_elements); + } +}; + +template +class aggregated_recycling_view : public kokkos_type { +private: + alloc_type allocator; + size_t total_elements{0}; + std::shared_ptr data_ref_counter; + static_assert(std::is_same_v); + +public: + using view_type = kokkos_type; + template + explicit aggregated_recycling_view(alloc_type &alloc, Args... args) + : kokkos_type( + alloc.allocate(kokkos_type::required_allocation_size(args...) / + sizeof(element_type)), + args...), + total_elements(kokkos_type::required_allocation_size(args...) / + sizeof(element_type)), + allocator(alloc), + data_ref_counter(this->data(), view_deleter( + alloc, total_elements)) {} + + aggregated_recycling_view( + const aggregated_recycling_view &other) + : kokkos_type(other), allocator(other.allocator) { + data_ref_counter = other.data_ref_counter; + total_elements = other.total_elements; + } + + aggregated_recycling_view & + operator=(const aggregated_recycling_view &other) { + data_ref_counter = other.data_ref_counter; + allocator = other.allocator; + kokkos_type::operator=(other); + total_elements = other.total_elements; + return *this; + } + + aggregated_recycling_view( + aggregated_recycling_view &&other) noexcept + : kokkos_type(other), allocator(other.allocator) { + data_ref_counter = other.data_ref_counter; + total_elements = other.total_elements; + } + + aggregated_recycling_view &operator=( + aggregated_recycling_view &&other) noexcept { + data_ref_counter = other.data_ref_counter; + allocator = other.allocator; + kokkos_type::operator=(other); + total_elements = other.total_elements; + return *this; + } + + ~aggregated_recycling_view() {} +}; + + +template +class recycling_view : public kokkos_type { +private: + size_t total_elements{0}; + std::shared_ptr data_ref_counter; + +public: + using view_type = kokkos_type; + static_assert(std::is_same_v); + template = true> + recycling_view(Args... args) + : kokkos_type( + alloc_type{}.allocate(kokkos_type::required_allocation_size(args...) / + sizeof(element_type)), + args...), + total_elements(kokkos_type::required_allocation_size(args...) / + sizeof(element_type)), + data_ref_counter(this->data(), view_deleter( + alloc_type{}, total_elements)) {} + + template = true> + recycling_view(const size_t device_id, Args... args) + : kokkos_type( + alloc_type{device_id}.allocate(kokkos_type::required_allocation_size(args...) / + sizeof(element_type)), + args...), + total_elements(kokkos_type::required_allocation_size(args...) 
/ + sizeof(element_type)), + data_ref_counter(this->data(), view_deleter( + alloc_type{device_id}, total_elements)) {} + + template < + typename layout_t, + std::enable_if_t::value, bool> = true> + recycling_view(std::size_t device_id, layout_t layout) + : kokkos_type( + alloc_type{device_id}.allocate(kokkos_type::required_allocation_size(layout) / + sizeof(element_type)), + layout), + total_elements(kokkos_type::required_allocation_size(layout) / + sizeof(element_type)), + data_ref_counter(this->data(), view_deleter( + alloc_type{device_id}, total_elements)) {} + + recycling_view( + const recycling_view &other) + : kokkos_type(other) { + total_elements = other.total_elements; + data_ref_counter = other.data_ref_counter; + + } + + recycling_view & + operator=(const recycling_view &other) { + data_ref_counter = other.data_ref_counter; + kokkos_type::operator=(other); + total_elements = other.total_elements; + return *this; + } + + recycling_view( + recycling_view &&other) noexcept + : kokkos_type(other) { + data_ref_counter = other.data_ref_counter; + total_elements = other.total_elements; + } + + recycling_view &operator=( + recycling_view &&other) noexcept { + data_ref_counter = other.data_ref_counter; + kokkos_type::operator=(other); + total_elements = other.total_elements; + return *this; + } + + ~recycling_view() { } +}; + +} // namespace memory_recycling +} // end namespace cppuddle + +#endif diff --git a/include/cuda_buffer_util.hpp b/include/cuda_buffer_util.hpp index 55d3397a..7fbd07be 100644 --- a/include/cuda_buffer_util.hpp +++ b/include/cuda_buffer_util.hpp @@ -1,170 +1,54 @@ -// Copyright (c) 2020-2023 Gregor Daiß +// Copyright (c) 2024 Gregor Daiß // // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +// DEPRECATED: Do not use this file +// Only intended to make the old interface work a bit longer. 
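// --------------------------------------------------------------------------
// Illustrative usage sketch (annotation, not part of the patch): instantiating
// the recycling_view wrapper from recycling_kokkos_view.hpp above. It derives
// from an unmanaged Kokkos view and feeds it memory from a recycling
// allocator; the shared view_deleter returns that memory to the pool once the
// last copy is gone. The concrete 1D host-space float view chosen here is an
// assumption for illustration only.
#include <cstddef>
#include <Kokkos_Core.hpp>
#include "cppuddle/memory_recycling/std_recycling_allocators.hpp"
#include "cppuddle/memory_recycling/util/recycling_kokkos_view.hpp"

using host_um_view =
    Kokkos::View<float *, Kokkos::HostSpace, Kokkos::MemoryUnmanaged>;
using recycling_host_view = cppuddle::memory_recycling::recycling_view<
    host_um_view, cppuddle::memory_recycling::recycle_std<float>, float>;

void view_example(std::size_t n) {
  recycling_host_view a(n); // backing buffer comes from the recycling pool
  recycling_host_view b(a); // copies share the buffer via data_ref_counter
} // buffer is marked unused here (recycled), not deallocated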
+// See deprecation warnings for the new location of the functionality + #ifndef CUDA_BUFFER_UTIL_HPP #define CUDA_BUFFER_UTIL_HPP #include "buffer_manager.hpp" -#include "detail/config.hpp" - -#include -#include -#include +#include "cppuddle/memory_recycling/cuda_recycling_allocators.hpp" +#include "cppuddle/memory_recycling/util/cuda_recycling_device_buffer.hpp" namespace recycler { - namespace detail { +template +using cuda_pinned_allocator + [[deprecated("Use from header cuda_recycling_allocators.hpp instead")]] = + cppuddle::memory_recycling::detail::cuda_pinned_allocator; - -template struct cuda_pinned_allocator { - using value_type = T; - cuda_pinned_allocator() noexcept = default; - template - explicit cuda_pinned_allocator(cuda_pinned_allocator const &) noexcept {} - T *allocate(std::size_t n) { - T *data; - cudaError_t error = - cudaMallocHost(reinterpret_cast(&data), n * sizeof(T)); - if (error != cudaSuccess) { - std::string msg = - std::string( - "cuda_pinned_allocator failed due to cudaMallocHost failure : ") + - std::string(cudaGetErrorString(error)); - throw std::runtime_error(msg); - } - return data; - } - void deallocate(T *p, std::size_t n) { - cudaError_t error = cudaFreeHost(p); - if (error != cudaSuccess) { - std::string msg = - std::string( - "cuda_pinned_allocator failed due to cudaFreeHost failure : ") + - std::string(cudaGetErrorString(error)); - throw std::runtime_error(msg); - } - } -}; - -template -constexpr bool operator==(cuda_pinned_allocator const &, - cuda_pinned_allocator const &) noexcept { - return true; -} -template -constexpr bool operator!=(cuda_pinned_allocator const &, - cuda_pinned_allocator const &) noexcept { - return false; -} - -template struct cuda_device_allocator { - using value_type = T; - cuda_device_allocator() noexcept = default; - template - explicit cuda_device_allocator(cuda_device_allocator const &) noexcept {} - T *allocate(std::size_t n) { - T *data; - cudaError_t error = cudaMalloc(&data, n * sizeof(T)); - if (error != cudaSuccess) { - std::string msg = - std::string( - "cuda_device_allocator failed due to cudaMalloc failure : ") + - std::string(cudaGetErrorString(error)); - throw std::runtime_error(msg); - } - return data; - } - void deallocate(T *p, std::size_t n) { - cudaError_t error = cudaFree(p); - if (error != cudaSuccess) { - std::string msg = - std::string( - "cuda_device_allocator failed due to cudaFree failure : ") + - std::string(cudaGetErrorString(error)); - throw std::runtime_error(msg); - } - } -}; -template -constexpr bool operator==(cuda_device_allocator const &, - cuda_device_allocator const &) noexcept { - return true; -} -template -constexpr bool operator!=(cuda_device_allocator const &, - cuda_device_allocator const &) noexcept { - return false; -} - +template +using cuda_device_allocator + [[deprecated("Use from header cuda_recycling_allocators.hpp instead")]] = + cppuddle::memory_recycling::detail::cuda_device_allocator; } // end namespace detail template ::value, int> = 0> -using recycle_allocator_cuda_host = - detail::aggressive_recycle_allocator>; -template ::value, int> = 0> -using recycle_allocator_cuda_device = - detail::recycle_allocator>; +using recycle_allocator_cuda_host + [[deprecated("Use from header cuda_recycling_allocators.hpp instead")]] = + cppuddle::memory_recycling::recycle_allocator_cuda_host; template ::value, int> = 0> -struct cuda_device_buffer { - recycle_allocator_cuda_device allocator; - T *device_side_buffer; - size_t number_of_elements; - - cuda_device_buffer(const size_t 
number_of_elements, const size_t device_id = 0) - : allocator{device_id}, number_of_elements(number_of_elements) { - assert(device_id < max_number_gpus); - device_side_buffer = - allocator.allocate(number_of_elements); - } - ~cuda_device_buffer() { - allocator.deallocate(device_side_buffer, number_of_elements); - } - // not yet implemented - cuda_device_buffer(cuda_device_buffer const &other) = delete; - cuda_device_buffer operator=(cuda_device_buffer const &other) = delete; - cuda_device_buffer(cuda_device_buffer const &&other) = delete; - cuda_device_buffer operator=(cuda_device_buffer const &&other) = delete; - -}; +using recycle_allocator_cuda_device + [[deprecated("Use from header cuda_recycling_allocators.hpp instead")]] = + cppuddle::memory_recycling::recycle_allocator_cuda_device; -template ::value, int> = 0> -struct cuda_aggregated_device_buffer { - T *device_side_buffer; - size_t number_of_elements; - cuda_aggregated_device_buffer(size_t number_of_elements, Host_Allocator &alloc) - : number_of_elements(number_of_elements), alloc(alloc) { - device_side_buffer = - alloc.allocate(number_of_elements); - } - ~cuda_aggregated_device_buffer() { - alloc.deallocate(device_side_buffer, number_of_elements); - } - // not yet implemented - cuda_aggregated_device_buffer(cuda_aggregated_device_buffer const &other) = delete; - cuda_aggregated_device_buffer operator=(cuda_aggregated_device_buffer const &other) = delete; - cuda_aggregated_device_buffer(cuda_aggregated_device_buffer const &&other) = delete; - cuda_aggregated_device_buffer operator=(cuda_aggregated_device_buffer const &&other) = delete; - -private: - Host_Allocator &alloc; // will stay valid for the entire aggregation region and hence - // for the entire lifetime of this buffer -}; - -namespace device_selection { -template -struct select_device_functor> { - void operator()(const size_t device_id) { cudaSetDevice(device_id); } -}; -template -struct select_device_functor> { - void operator()(const size_t device_id) { cudaSetDevice(device_id); } -}; -} // namespace device_selection +template ::value, int> = 0> +using cuda_device_buffer + [[deprecated("Use from header cuda_recycling_allocators.hpp instead")]] = + cppuddle::memory_recycling::cuda_device_buffer; + +template ::value, int> = 0> +using cuda_aggregated_device_buffer + [[deprecated("Use from header cuda_recycling_allocators.hpp instead")]] = + cppuddle::memory_recycling::cuda_aggregated_device_buffer; } // end namespace recycler #endif diff --git a/include/hip_buffer_util.hpp b/include/hip_buffer_util.hpp index e2364095..720baf70 100644 --- a/include/hip_buffer_util.hpp +++ b/include/hip_buffer_util.hpp @@ -1,172 +1,53 @@ -// Copyright (c: 2020-2021 Gregor Daiß +// Copyright (c) 2024 Gregor Daiß // // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +// DEPRECATED: Do not use this file +// Only intended to make the old interface work a bit longer. 
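// --------------------------------------------------------------------------
// Illustrative migration sketch (annotation, not part of the patch): the
// compatibility aliases in cuda_buffer_util.hpp above keep old call sites
// compiling while emitting deprecation warnings. A typical update looks like
// this (variable names are placeholders):
#include <cstddef>
#include "cuda_buffer_util.hpp" // legacy header; still works, but warns on use

void migrate_cuda_example(std::size_t n, std::size_t device_id) {
  // Old spelling -- resolves through the deprecated alias above and triggers
  // e.g. -Wdeprecated-declarations:
  //   recycler::cuda_device_buffer<float> buf(n, device_id);
  // New spelling -- same underlying type, no warning:
  cppuddle::memory_recycling::cuda_device_buffer<float> buf(n, device_id);
}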
+// See deprecation warnings for the new location of the functionality + #ifndef HIP_BUFFER_UTIL_HPP #define HIP_BUFFER_UTIL_HPP -#include "buffer_manager.hpp" - -#include -#include -#include +#include "cppuddle/memory_recycling/hip_recycling_allocators.hpp" +#include "cppuddle/memory_recycling/util/hip_recycling_device_buffer.hpp" namespace recycler { namespace detail { -template struct hip_pinned_allocator { - using value_type = T; - hip_pinned_allocator() noexcept = default; - template - explicit hip_pinned_allocator(hip_pinned_allocator const &) noexcept {} - T *allocate(std::size_t n) { - T *data; - // hipError_t error = - // hipMallocHost(reinterpret_cast(&data), n * sizeof(T)); - - // Even though marked as deprecated, the HIP docs recommend using hipHostMalloc - // (not hipMallocHost) for async memcpys - // https://rocmdocs.amd.com/en/latest/ROCm_API_References/HIP_API/Memory-Management.html#hipmemcpyasync - hipError_t error = - hipHostMalloc(reinterpret_cast(&data), n * sizeof(T)); - if (error != hipSuccess) { - std::string msg = - std::string( - "hip_pinned_allocator failed due to hipMallocHost failure : ") + - std::string(hipGetErrorString(error)); - throw std::runtime_error(msg); - } - return data; - } - void deallocate(T *p, std::size_t n) { - hipError_t error = hipHostFree(p); - if (error != hipSuccess) { - std::string msg = - std::string( - "hip_pinned_allocator failed due to hipFreeHost failure : ") + - std::string(hipGetErrorString(error)); - throw std::runtime_error(msg); - } - } -}; -template -constexpr bool operator==(hip_pinned_allocator const &, - hip_pinned_allocator const &) noexcept { - return true; -} -template -constexpr bool operator!=(hip_pinned_allocator const &, - hip_pinned_allocator const &) noexcept { - return false; -} - -template struct hip_device_allocator { - using value_type = T; - hip_device_allocator() noexcept = default; - template - explicit hip_device_allocator(hip_device_allocator const &) noexcept {} - T *allocate(std::size_t n) { - T *data; - hipError_t error = hipMalloc(&data, n * sizeof(T)); - if (error != hipSuccess) { - std::string msg = - std::string( - "hip_device_allocator failed due to hipMalloc failure : ") + - std::string(hipGetErrorString(error)); - throw std::runtime_error(msg); - } - return data; - } - void deallocate(T *p, std::size_t n) { - hipError_t error = hipFree(p); - if (error != hipSuccess) { - std::string msg = - std::string( - "hip_device_allocator failed due to hipFree failure : ") + - std::string(hipGetErrorString(error)); - throw std::runtime_error(msg); - } - } -}; -template -constexpr bool operator==(hip_device_allocator const &, - hip_device_allocator const &) noexcept { - return true; -} -template -constexpr bool operator!=(hip_device_allocator const &, - hip_device_allocator const &) noexcept { - return false; -} +template +using hip_pinned_allocator + [[deprecated("Use from header hip_recycling_allocators.hpp instead")]] = + cppuddle::memory_recycling::detail::hip_pinned_allocator; +template +using hip_device_allocator + [[deprecated("Use from header hip_recycling_allocators.hpp instead")]] = + cppuddle::memory_recycling::detail::hip_device_allocator; } // end namespace detail template ::value, int> = 0> -using recycle_allocator_hip_host = - detail::aggressive_recycle_allocator>; -template ::value, int> = 0> -using recycle_allocator_hip_device = - detail::recycle_allocator>; +using recycle_allocator_hip_host + [[deprecated("Use from header hip_recycling_allocators.hpp instead")]] = + 
cppuddle::memory_recycling::recycle_allocator_hip_host; -// TODO Is this even required? (cuda version should work fine...) template ::value, int> = 0> -struct hip_device_buffer { - recycle_allocator_hip_device allocator; - T *device_side_buffer; - size_t number_of_elements; - - hip_device_buffer(size_t number_of_elements, size_t device_id) - : allocator{device_id}, number_of_elements(number_of_elements) { - assert(device_id < max_number_gpus); - device_side_buffer = - allocator.allocate(number_of_elements); - } - ~hip_device_buffer() { - allocator.deallocate(device_side_buffer, number_of_elements); - } - // not yet implemented - hip_device_buffer(hip_device_buffer const &other) = delete; - hip_device_buffer operator=(hip_device_buffer const &other) = delete; - hip_device_buffer(hip_device_buffer const &&other) = delete; - hip_device_buffer operator=(hip_device_buffer const &&other) = delete; +using recycle_allocator_hip_device + [[deprecated("Use from header hip_recycling_allocators.hpp instead")]] = + cppuddle::memory_recycling::recycle_allocator_hip_device; -}; - -template ::value, int> = 0> -struct hip_aggregated_device_buffer { - T *device_side_buffer; - size_t number_of_elements; - hip_aggregated_device_buffer(size_t number_of_elements, Host_Allocator &alloc) - : number_of_elements(number_of_elements), alloc(alloc) { - device_side_buffer = - alloc.allocate(number_of_elements); - } - ~hip_aggregated_device_buffer() { - alloc.deallocate(device_side_buffer, number_of_elements); - } - // not yet implemented - hip_aggregated_device_buffer(hip_aggregated_device_buffer const &other) = delete; - hip_aggregated_device_buffer operator=(hip_aggregated_device_buffer const &other) = delete; - hip_aggregated_device_buffer(hip_aggregated_device_buffer const &&other) = delete; - hip_aggregated_device_buffer operator=(hip_aggregated_device_buffer const &&other) = delete; - -private: - Host_Allocator &alloc; // will stay valid for the entire aggregation region and hence - // for the entire lifetime of this buffer -}; - -namespace device_selection { -template -struct select_device_functor> { - void operator()(const size_t device_id) { hipSetDevice(device_id); } -}; -template -struct select_device_functor> { - void operator()(const size_t device_id) { hipSetDevice(device_id); } -}; -} // namespace device_selection +template ::value, int> = 0> +using hip_device_buffer + [[deprecated("Use from header hip_recycling_allocators.hpp instead")]] = + cppuddle::memory_recycling::hip_device_buffer; + +template ::value, int> = 0> +using hip_aggregated_device_buffer + [[deprecated("Use from header hip_recycling_allocators.hpp instead")]] = + cppuddle::memory_recycling::hip_aggregated_device_buffer; } // end namespace recycler #endif diff --git a/include/kokkos_buffer_util.hpp b/include/kokkos_buffer_util.hpp index 2945b422..716229a0 100644 --- a/include/kokkos_buffer_util.hpp +++ b/include/kokkos_buffer_util.hpp @@ -1,166 +1,28 @@ -// Copyright (c) 2020-2021 Gregor Daiß +// Copyright (c) 2024 Gregor Daiß // // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +// DEPRECATED: Do not use this file +// Only intended to make the old interface work a bit longer. 
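// --------------------------------------------------------------------------
// Illustrative sketch of the deprecation technique used by all of these
// compatibility headers (annotation, not part of the patch): each legacy name
// becomes an alias carrying a [[deprecated]] attribute, so existing code keeps
// compiling but every use points the developer at the new header. Stand-alone
// hypothetical example:
namespace newlib {
template <typename T> struct widget {};
} // namespace newlib

namespace oldlib {
template <typename T>
using widget [[deprecated("Use newlib::widget from new_header.hpp instead")]] =
    newlib::widget<T>;
} // namespace oldlib

// oldlib::widget<int> w{}; // still compiles, but warns (-Wdeprecated-declarations)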
+// See deprecation warnings for the new location of the functionality + #ifndef KOKKOS_BUFFER_UTIL_HPP #define KOKKOS_BUFFER_UTIL_HPP -#include -#include -#include -#include - -namespace recycler { +#include "cppuddle/memory_recycling/util/recycling_kokkos_view.hpp" -template -struct view_deleter { - alloc_type allocator; - size_t total_elements; - view_deleter(alloc_type alloc, size_t total_elements) : allocator(alloc), - total_elements(total_elements) {} - void operator()(element_type* p) { - allocator.deallocate(p, total_elements); - } -}; +namespace recycler { template -class aggregated_recycled_view : public kokkos_type { -private: - alloc_type allocator; - size_t total_elements{0}; - std::shared_ptr data_ref_counter; - static_assert(std::is_same_v); - -public: - using view_type = kokkos_type; - template - explicit aggregated_recycled_view(alloc_type &alloc, Args... args) - : kokkos_type( - alloc.allocate(kokkos_type::required_allocation_size(args...) / - sizeof(element_type)), - args...), - total_elements(kokkos_type::required_allocation_size(args...) / - sizeof(element_type)), - allocator(alloc), - data_ref_counter(this->data(), view_deleter( - alloc, total_elements)) {} - - aggregated_recycled_view( - const aggregated_recycled_view &other) - : kokkos_type(other), allocator(other.allocator) { - data_ref_counter = other.data_ref_counter; - total_elements = other.total_elements; - } - - aggregated_recycled_view & - operator=(const aggregated_recycled_view &other) { - data_ref_counter = other.data_ref_counter; - allocator = other.allocator; - kokkos_type::operator=(other); - total_elements = other.total_elements; - return *this; - } - - aggregated_recycled_view( - aggregated_recycled_view &&other) noexcept - : kokkos_type(other), allocator(other.allocator) { - data_ref_counter = other.data_ref_counter; - total_elements = other.total_elements; - } - - aggregated_recycled_view &operator=( - aggregated_recycled_view &&other) noexcept { - data_ref_counter = other.data_ref_counter; - allocator = other.allocator; - kokkos_type::operator=(other); - total_elements = other.total_elements; - return *this; - } - - ~aggregated_recycled_view() {} -}; - +using aggregated_recycled_view [[deprecated( + "Use aggregated_recycle_view from header recycling_kokkos_view.hpp " + "instead")]] = + cppuddle::memory_recycling::aggregated_recycling_view; template -class recycled_view : public kokkos_type { -private: - size_t total_elements{0}; - std::shared_ptr data_ref_counter; - -public: - using view_type = kokkos_type; - static_assert(std::is_same_v); - template = true> - recycled_view(Args... args) - : kokkos_type( - alloc_type{}.allocate(kokkos_type::required_allocation_size(args...) / - sizeof(element_type)), - args...), - total_elements(kokkos_type::required_allocation_size(args...) / - sizeof(element_type)), - data_ref_counter(this->data(), view_deleter( - alloc_type{}, total_elements)) {} - - template = true> - recycled_view(const size_t device_id, Args... args) - : kokkos_type( - alloc_type{device_id}.allocate(kokkos_type::required_allocation_size(args...) / - sizeof(element_type)), - args...), - total_elements(kokkos_type::required_allocation_size(args...) 
/ - sizeof(element_type)), - data_ref_counter(this->data(), view_deleter( - alloc_type{device_id}, total_elements)) {} - - template < - typename layout_t, - std::enable_if_t::value, bool> = true> - recycled_view(std::size_t device_id, layout_t layout) - : kokkos_type( - alloc_type{device_id}.allocate(kokkos_type::required_allocation_size(layout) / - sizeof(element_type)), - layout), - total_elements(kokkos_type::required_allocation_size(layout) / - sizeof(element_type)), - data_ref_counter(this->data(), view_deleter( - alloc_type{device_id}, total_elements)) {} - - recycled_view( - const recycled_view &other) - : kokkos_type(other) { - total_elements = other.total_elements; - data_ref_counter = other.data_ref_counter; - - } - - recycled_view & - operator=(const recycled_view &other) { - data_ref_counter = other.data_ref_counter; - kokkos_type::operator=(other); - total_elements = other.total_elements; - return *this; - } - - recycled_view( - recycled_view &&other) noexcept - : kokkos_type(other) { - data_ref_counter = other.data_ref_counter; - total_elements = other.total_elements; - } - - recycled_view &operator=( - recycled_view &&other) noexcept { - data_ref_counter = other.data_ref_counter; - kokkos_type::operator=(other); - total_elements = other.total_elements; - return *this; - } - - ~recycled_view() { } -}; - +using recycled_view [[deprecated( + "Use recycle_view from header recycling_kokkos_view.hpp instead")]] = + cppuddle::memory_recycling::recycling_view; } // end namespace recycler diff --git a/include/stream_manager.hpp b/include/stream_manager.hpp index 5b0e3898..1e781442 100644 --- a/include/stream_manager.hpp +++ b/include/stream_manager.hpp @@ -1,413 +1,38 @@ -// Copyright (c) 2020-2023 Gregor Daiß +// Copyright (c) 2024 Gregor Daiß // // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +// DEPRECATED: Do not use this file +// Only intended to make the old interface work a bit longer. +// See deprecation warnings for the new location of the functionality + #ifndef STREAM_MANAGER_HPP #define STREAM_MANAGER_HPP -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "../include/detail/config.hpp" - -// Need to cuda/hip definitions for default params when NOT -// drawing from an executor pool -#if defined(CPPUDDLE_DEACTIVATE_EXECUTOR_RECYCLING) -#include -#if defined(HPX_HAVE_CUDA) || defined(HPX_HAVE_HIP) -#include -#endif -#endif - -// Redefintion required for non-recycling executors -// Without it, default constructing the executors (independent) would not work -#if defined(CPPUDDLE_DEACTIVATE_EXECUTOR_RECYCLING) -// Do only define if Kokkos is not found -#ifndef KOKKOS_ENABLE_SERIAL -namespace hpx { namespace kokkos { -enum class execution_space_mode { global, independent }; -}} -#endif -#endif - -/// Turns a std::array_mutex into an scoped lock -template -auto make_scoped_lock_from_array(mutex_array_t& mutexes) -{ - return std::apply([](auto&... mutexes) { return std::scoped_lock{mutexes...}; }, - mutexes); -} - -template class round_robin_pool { -private: - std::deque pool{}; - std::vector ref_counters{}; - size_t current_interface{0}; - -public: - template - round_robin_pool(size_t number_of_streams, Ts... 
executor_args) { - ref_counters.reserve(number_of_streams); - for (int i = 0; i < number_of_streams; i++) { - pool.emplace_back(executor_args...); - ref_counters.emplace_back(0); - } - } - // return a tuple with the interface and its index (to release it later) - std::tuple get_interface() { - assert(!(pool.empty())); - size_t last_interface = current_interface; - current_interface = (current_interface + 1) % pool.size(); - ref_counters[last_interface]++; - std::tuple ret(pool[last_interface], last_interface); - return ret; - } - void release_interface(size_t index) { ref_counters[index]--; } - bool interface_available(size_t load_limit) { - return *(std::min_element(std::begin(ref_counters), - std::end(ref_counters))) < load_limit; - } - size_t get_current_load() { - return *( - std::min_element(std::begin(ref_counters), std::end(ref_counters))); - } - // TODO Remove - /* size_t get_next_device_id() { */ - /* return 0; // single gpu pool */ - /* } */ -}; - -template class priority_pool { -private: - std::deque pool{}; - std::vector ref_counters{}; // Ref counters - std::vector priorities{}; // Ref counters -public: - template - priority_pool(size_t number_of_streams, Ts... executor_args) { - ref_counters.reserve(number_of_streams); - priorities.reserve(number_of_streams); - for (auto i = 0; i < number_of_streams; i++) { - pool.emplace_back(executor_args...); - ref_counters.emplace_back(0); - priorities.emplace_back(i); - } - } - // return a tuple with the interface and its index (to release it later) - std::tuple get_interface() { - auto &interface = pool[priorities[0]]; - ref_counters[priorities[0]]++; - std::tuple ret(interface, priorities[0]); - std::make_heap(std::begin(priorities), std::end(priorities), - [this](const size_t &first, const size_t &second) -> bool { - return ref_counters[first] > ref_counters[second]; - }); - return ret; - } - void release_interface(size_t index) { - ref_counters[index]--; - std::make_heap(std::begin(priorities), std::end(priorities), - [this](const size_t &first, const size_t &second) -> bool { - return ref_counters[first] > ref_counters[second]; - }); - } - bool interface_available(size_t load_limit) { - return ref_counters[priorities[0]] < load_limit; - } - size_t get_current_load() { return ref_counters[priorities[0]]; } - // TODO remove - /* size_t get_next_device_id() { */ - /* return 0; // single gpu pool */ - /* } */ -}; - -/// Access/Concurrency Control for stream pool implementation -class stream_pool { -public: - template - static void init(size_t number_of_streams, Ts ... executor_args) { - stream_pool_implementation::init(number_of_streams, - executor_args...); - } - template - static void init_all_executor_pools(size_t number_of_streams, Ts ... executor_args) { - stream_pool_implementation::init_all_executor_pools(number_of_streams, - executor_args...); - } - template - static void init_executor_pool(size_t pool_id, size_t number_of_streams, Ts ... 
executor_args) { - stream_pool_implementation::init_executor_pool(pool_id, number_of_streams, - executor_args...); - } - template static void cleanup() { - stream_pool_implementation::cleanup(); - } - template - static std::tuple get_interface(const size_t gpu_id) { - return stream_pool_implementation::get_interface(gpu_id); - } - template - static void release_interface(size_t index, const size_t gpu_id) noexcept { - stream_pool_implementation::release_interface(index, - gpu_id); - } - template - static bool interface_available(size_t load_limit, const size_t gpu_id) noexcept { - return stream_pool_implementation::interface_available( - load_limit, gpu_id); - } - template - static size_t get_current_load(const size_t gpu_id = 0) noexcept { - return stream_pool_implementation::get_current_load( - gpu_id); - } - template - static size_t get_next_device_id(const size_t number_gpus) noexcept { - // TODO add round robin and min strategy - return recycler::get_device_id(number_gpus); - } - - template - static void set_device_selector(std::function select_gpu_function) { - stream_pool_implementation::set_device_selector(select_gpu_function); - } - - template - static void select_device(size_t gpu_id) { - stream_pool_implementation::select_device(gpu_id); - } - -private: - stream_pool() = default; - -private: - template class stream_pool_implementation { - public: - /// Deprecated! Use init_on_all_gpu or init_on_gpu - template - static void init(size_t number_of_streams, Ts ... executor_args) { - /* static_assert(sizeof...(Ts) == sizeof...(Ts) && recycler::max_number_gpus == 1, */ - /* "deprecated stream_pool::init does not support multigpu"); */ - auto guard = make_scoped_lock_from_array(instance().gpu_mutexes); - instance().streampools.emplace_back(number_of_streams, executor_args...); - assert(instance().streampools.size() <= recycler::max_number_gpus); - } - - /// Multi-GPU init where executors / interfaces on all GPUs are initialized with the same arguments - template - static void init_all_executor_pools(size_t number_of_streams, Ts ... executor_args) { - auto guard = make_scoped_lock_from_array(instance().gpu_mutexes); - if (number_of_streams > 0) { - for (size_t gpu_id = 0; gpu_id < recycler::max_number_gpus; gpu_id++) { - instance().select_gpu_function(gpu_id); - instance().streampools.emplace_back(number_of_streams, - executor_args...); - } - } - assert(instance().streampools.size() <= recycler::max_number_gpus); - } - - /// Per-GPU init allowing for different init parameters depending on the GPU - /// (useful for executor that expect an GPU-id during construction) - template - static void init_executor_pool(size_t gpu_id, size_t number_of_streams, Ts ... executor_args) { - auto guard = make_scoped_lock_from_array(instance().gpu_mutexes); - if (number_of_streams > 0) { - instance().select_gpu_function(gpu_id); - instance().streampools.emplace_back(number_of_streams, - executor_args...); - } - assert(instance().streampools.size() <= recycler::max_number_gpus); - } - - // TODO add/rename into finalize? 
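// --------------------------------------------------------------------------
// Illustrative usage sketch (annotation, not part of the patch) for the pool
// interface removed here and re-exported further below under
// cppuddle::executor_recycling. my_executor_t is a hypothetical placeholder
// for a real executor type (e.g. an HPX CUDA executor); its constructor
// arguments are omitted for brevity.
#include <cstddef>
#include "stream_manager.hpp" // or the new executor_pools_interface.hpp

struct my_executor_t { /* hypothetical executor */ };

void pool_example(std::size_t gpu_id) {
  using pool_t = round_robin_pool<my_executor_t>;
  // One-time setup: eight executors per GPU, constructor args forwarded.
  stream_pool::init_all_executor_pools<my_executor_t, pool_t>(8);
  {
    // RAII handle: draws an executor from the pool, releases it on scope exit.
    stream_interface<my_executor_t, pool_t> exec(gpu_id);
    // exec.async_execute(...); // forwards to the underlying executor
  }
}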
- static void cleanup() { - auto guard = make_scoped_lock_from_array(instance().gpu_mutexes); - assert(instance().streampools.size() == recycler::max_number_gpus); - instance().streampools.clear(); - } - - static std::tuple get_interface(const size_t gpu_id = 0) { - std::lock_guard guard(instance().gpu_mutexes[gpu_id]); - assert(gpu_id < instance().streampools.size()); - return instance().streampools[gpu_id].get_interface(); - } - static void release_interface(size_t index, const size_t gpu_id = 0) { - std::lock_guard guard(instance().gpu_mutexes[gpu_id]); - assert(gpu_id < instance().streampools.size()); - instance().streampools[gpu_id].release_interface(index); - } - static bool interface_available(size_t load_limit, const size_t gpu_id = 0) { - std::lock_guard guard(instance().gpu_mutexes[gpu_id]); - assert(gpu_id < instance().streampools.size()); - return instance().streampools[gpu_id].interface_available(load_limit); - } - static size_t get_current_load(const size_t gpu_id = 0) { - std::lock_guard guard(instance().gpu_mutexes[gpu_id]); - assert(gpu_id < instance().streampools.size()); - return instance().streampools[gpu_id].get_current_load(); - } - // TODO deprecated! Remove... - /* static size_t get_next_device_id(const size_t gpu_id = 0) { */ - /* std::lock_guard guard(instance().gpu_mutexes[gpu_id]); */ - /* assert(instance().streampools.size() == recycler::max_number_gpus); */ - /* return instance().streampools[gpu_id].get_next_device_id(); */ - /* } */ - - static void set_device_selector(std::function select_gpu_function) { - auto guard = make_scoped_lock_from_array(instance().gpu_mutexes); - instance().select_gpu_function = select_gpu_function; - } - - static void select_device(size_t gpu_id) { - instance().select_gpu_function(gpu_id); - } - - private: - stream_pool_implementation() = default; - recycler::mutex_t pool_mut{}; - std::function select_gpu_function = [](size_t gpu_id) { - // By default no multi gpu support - assert(recycler::max_number_gpus == 1 || instance().streampools.size() == 1); - assert(gpu_id == 0); - }; - - std::deque streampools{}; - std::array gpu_mutexes; - - static stream_pool_implementation& instance(void) { - static stream_pool_implementation pool_instance{}; - return pool_instance; - } - - public: - ~stream_pool_implementation() = default; - // Bunch of constructors we don't need - stream_pool_implementation(stream_pool_implementation const &other) = - delete; - stream_pool_implementation & - operator=(stream_pool_implementation const &other) = delete; - stream_pool_implementation(stream_pool_implementation &&other) = delete; - stream_pool_implementation & - operator=(stream_pool_implementation &&other) = delete; - }; - -public: - ~stream_pool() = default; - // Bunch of constructors we don't need - stream_pool(stream_pool const &other) = delete; - stream_pool &operator=(stream_pool const &other) = delete; - stream_pool(stream_pool &&other) = delete; - stream_pool &operator=(stream_pool &&other) = delete; -}; - -#if defined(CPPUDDLE_DEACTIVATE_EXECUTOR_RECYCLING) - -// Warn about suboptimal performance without recycling -#pragma message \ -"Warning: Building without executor recycling! Use only for performance testing! \ -For better performance configure CPPuddle with CPPUDDLE_WITH_EXECUTOR_RECYCLING=ON!" - -/// Slow version of the stream_interface that does not draw its -/// executors (Interface) from the pool but creates them instead. 
-/// Only meant for performance comparisons and only works with cuda/kokkos executors -template class stream_interface { -public: - - template - explicit stream_interface(size_t gpu_id, - std::enable_if_t::value, size_t> = 0) - : gpu_id(gpu_id), interface(gpu_id) {} - template - explicit stream_interface(std::enable_if_t::value, size_t> = 0) - : gpu_id(gpu_id), interface(hpx::kokkos::execution_space_mode::independent) {} - - stream_interface(const stream_interface &other) = delete; - stream_interface &operator=(const stream_interface &other) = delete; - stream_interface(stream_interface &&other) = delete; - stream_interface &operator=(stream_interface &&other) = delete; - ~stream_interface() { - } - - template - inline decltype(auto) post(F &&f, Ts &&... ts) { - return interface.post(std::forward(f), std::forward(ts)...); - } - - template - inline decltype(auto) async_execute(F &&f, Ts &&... ts) { - return interface.async_execute(std::forward(f), std::forward(ts)...); - } - - inline decltype(auto) get_future() { - return interface.get_future(); - } - - // allow implict conversion - operator Interface &() { // NOLINT - return interface; - } - -private: - size_t gpu_id; - -public: - Interface interface; -}; -#else -/// Stream interface for RAII purposes -/// Draws executor from the stream pool and releases it upon -/// destruction -template class stream_interface { -public: - explicit stream_interface(size_t gpu_id) - : t(stream_pool::get_interface(gpu_id)), - interface(std::get<0>(t)), interface_index(std::get<1>(t)), gpu_id(gpu_id) {} - - stream_interface(const stream_interface &other) = delete; - stream_interface &operator=(const stream_interface &other) = delete; - stream_interface(stream_interface &&other) = delete; - stream_interface &operator=(stream_interface &&other) = delete; - ~stream_interface() { - stream_pool::release_interface(interface_index, gpu_id); - } - - template - inline decltype(auto) post(F &&f, Ts &&... ts) { - return interface.post(std::forward(f), std::forward(ts)...); - } - - template - inline decltype(auto) async_execute(F &&f, Ts &&... 
ts) { - return interface.async_execute(std::forward(f), std::forward(ts)...); - } - - inline decltype(auto) get_future() { - return interface.get_future(); - } - - // allow implict conversion - operator Interface &() { // NOLINT - return interface; - } - -private: - std::tuple t; - size_t interface_index; - size_t gpu_id; - -public: - Interface &interface; -}; -#endif +#include "cppuddle/executor_recycling/executor_pools_interface.hpp" + +template +using round_robin_pool + [[deprecated("Use cppuddle::executor_recycling::round_robin_pool_impl from " + "header executor_pools_management.hpp instead")]] = + cppuddle::executor_recycling::round_robin_pool_impl; + +template +using priority_pool + [[deprecated("Use cppuddle::executor_recycling::priority_pool_impl from " + "header executor_pools_management.hpp instead")]] = + cppuddle::executor_recycling::priority_pool_impl; + +using stream_pool + [[deprecated("Use cppuddle::executor_recycling::executor_pool from " + "header executor_pools_management.hpp instead")]] = + cppuddle::executor_recycling::executor_pool; + +template +using stream_interface + [[deprecated("Use cppuddle::executor_recycling::executor_interface from " + "header executor_pools_management.hpp instead")]] = + cppuddle::executor_recycling::executor_interface; #endif diff --git a/include/sycl_buffer_util.hpp b/include/sycl_buffer_util.hpp index 61d22f8f..4bf45b3f 100644 --- a/include/sycl_buffer_util.hpp +++ b/include/sycl_buffer_util.hpp @@ -1,83 +1,42 @@ -// Copyright (c: 2020-2021 Gregor Daiß +// Copyright (c) 2024 Gregor Daiß // // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +// DEPRECATED: Do not use this file +// Only intended to make the old interface work a bit longer. 
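// --------------------------------------------------------------------------
// Illustrative migration sketch (annotation, not part of the patch): new code
// should spell the cppuddle::executor_recycling names directly instead of the
// deprecated aliases above. This assumes the new classes keep the member
// interface of the old ones they replace (as the aliases suggest);
// my_executor_t is again a hypothetical placeholder executor type.
#include "cppuddle/executor_recycling/executor_pools_interface.hpp"

struct my_executor_t { /* hypothetical executor */ };

void migrated_pool_example() {
  using pool_t =
      cppuddle::executor_recycling::round_robin_pool_impl<my_executor_t>;
  // Previously: stream_pool::init_all_executor_pools<my_executor_t, ...>(8);
  cppuddle::executor_recycling::executor_pool::init_all_executor_pools<
      my_executor_t, pool_t>(8);
  // Previously: stream_interface<my_executor_t, ...> exec(0);
  cppuddle::executor_recycling::executor_interface<my_executor_t, pool_t>
      exec(0);
}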
+// See deprecation warnings for the new location of the functionality + #ifndef SYCL_BUFFER_UTIL_HPP #define SYCL_BUFFER_UTIL_HPP -#include "buffer_manager.hpp" - -#include -#include -#include +#include "cppuddle/memory_recycling/sycl_recycling_allocators.hpp" namespace recycler { namespace detail { -static_assert(max_number_gpus == 1, "CPPuddle currently does not support MultiGPU SYCL builds!"); +template +using sycl_host_default_allocator + [[deprecated("Use from header sycl_recycling_allocators.hpp instead")]] = + cppuddle::memory_recycling::detail::sycl_host_default_allocator; -template struct sycl_host_default_allocator { - using value_type = T; - sycl_host_default_allocator() noexcept = default; - template - explicit sycl_host_default_allocator(sycl_host_default_allocator const &) noexcept {} - T *allocate(std::size_t n) { - static cl::sycl::queue default_queue(cl::sycl::default_selector{}); - T *data = cl::sycl::malloc_host(n, default_queue); - return data; - } - void deallocate(T *p, std::size_t n) { - static cl::sycl::queue default_queue(cl::sycl::default_selector{}); - cl::sycl::free(p, default_queue); - } -}; -template -constexpr bool operator==(sycl_host_default_allocator const &, - sycl_host_default_allocator const &) noexcept { - return true; -} -template -constexpr bool operator!=(sycl_host_default_allocator const &, - sycl_host_default_allocator const &) noexcept { - return false; -} - -template struct sycl_device_default_allocator { - using value_type = T; - sycl_device_default_allocator() noexcept = default; - template - explicit sycl_device_default_allocator(sycl_device_default_allocator const &) noexcept {} - T *allocate(std::size_t n) { - static cl::sycl::queue default_queue(cl::sycl::default_selector{}); - T *data = cl::sycl::malloc_device(n, default_queue); - return data; - } - void deallocate(T *p, std::size_t n) { - static cl::sycl::queue default_queue(cl::sycl::default_selector{}); - cl::sycl::free(p, default_queue); - } -}; -template -constexpr bool operator==(sycl_device_default_allocator const &, - sycl_device_default_allocator const &) noexcept { - return true; -} -template -constexpr bool operator!=(sycl_device_default_allocator const &, - sycl_device_default_allocator const &) noexcept { - return false; -} +template +using sycl_device_default_allocator + [[deprecated("Use from header sycl_recycling_allocators.hpp instead")]] = + cppuddle::memory_recycling::detail::sycl_device_default_allocator; } // end namespace detail template ::value, int> = 0> -using recycle_allocator_sycl_host = - detail::aggressive_recycle_allocator>; +using recycle_allocator_sycl_host + [[deprecated("Use from header sycl_recycling_allocators.hpp instead")]] = + cppuddle::memory_recycling::recycle_allocator_sycl_host; + template ::value, int> = 0> -using recycle_allocator_sycl_device = - detail::recycle_allocator>; +using recycle_allocator_sycl_device + [[deprecated("Use from header sycl_recycling_allocators.hpp instead")]] = + cppuddle::memory_recycling::recycle_allocator_sycl_device; } // end namespace recycler #endif diff --git a/tests/allocator_aligned_test.cpp b/tests/allocator_aligned_test.cpp index c3c09217..ea9ce9a4 100644 --- a/tests/allocator_aligned_test.cpp +++ b/tests/allocator_aligned_test.cpp @@ -3,8 +3,6 @@ // Distributed under the Boost Software License, Version 1.0. 
(See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) -#include "../include/buffer_manager.hpp" -#include "../include/aligned_buffer_util.hpp" #ifdef CPPUDDLE_HAVE_HPX #include #endif @@ -18,6 +16,8 @@ #include #include +#include "cppuddle/memory_recycling/aligned_recycling_allocators.hpp" + #ifdef CPPUDDLE_HAVE_HPX int hpx_main(int argc, char *argv[]) { #else @@ -79,7 +79,9 @@ int main(int argc, char *argv[]) { << std::endl; for (size_t pass = 0; pass < passes; pass++) { auto begin = std::chrono::high_resolution_clock::now(); - std::vector> + std::vector< + double, + cppuddle::memory_recycling::aggressive_recycle_aligned> test1(array_size, double{}); auto end = std::chrono::high_resolution_clock::now(); aggressive_duration += @@ -92,8 +94,8 @@ int main(int argc, char *argv[]) { std::cout << "\n==> Aggressive recycle allocation test took " << aggressive_duration << "ms" << std::endl; } - recycler::print_performance_counters(); - recycler::force_cleanup(); // Cleanup all buffers and the managers for better + cppuddle::memory_recycling::print_buffer_counters(); + cppuddle::memory_recycling::force_buffer_cleanup(); // Cleanup all buffers and the managers for better // comparison // Recycle Test: @@ -101,8 +103,9 @@ int main(int argc, char *argv[]) { std::cout << "\nStarting run with recycle allocator: " << std::endl; for (size_t pass = 0; pass < passes; pass++) { auto begin = std::chrono::high_resolution_clock::now(); - std::vector> test1( - array_size, double{}); + std::vector> + test1(array_size, double{}); auto end = std::chrono::high_resolution_clock::now(); recycle_duration += std::chrono::duration_cast(end - begin) @@ -114,8 +117,8 @@ int main(int argc, char *argv[]) { std::cout << "\n\n==> Recycle allocation test took " << recycle_duration << "ms" << std::endl; } - recycler::print_performance_counters(); - recycler::force_cleanup(); // Cleanup all buffers and the managers for better + cppuddle::memory_recycling::print_buffer_counters(); + cppuddle::memory_recycling::force_buffer_cleanup(); // Cleanup all buffers and the managers for better // comparison // Same test using std::allocator: @@ -124,7 +127,7 @@ int main(int argc, char *argv[]) { for (size_t pass = 0; pass < passes; pass++) { auto begin = std::chrono::high_resolution_clock::now(); std::vector> - test2(array_size, double{}); + test2(array_size, double{}); auto end = std::chrono::high_resolution_clock::now(); default_duration += std::chrono::duration_cast(end - begin) @@ -146,7 +149,7 @@ int main(int argc, char *argv[]) { std::cout << "Test information: Aggressive recycler was faster than default allocator!" 
<< std::endl; } - recycler::print_performance_counters(); + cppuddle::memory_recycling::print_buffer_counters(); #ifdef CPPUDDLE_HAVE_HPX return hpx::finalize(); #else diff --git a/tests/allocator_hpx_test.cpp b/tests/allocator_hpx_test.cpp index 9d8cc44b..21c4baed 100644 --- a/tests/allocator_hpx_test.cpp +++ b/tests/allocator_hpx_test.cpp @@ -15,7 +15,7 @@ #include -#include "../include/buffer_manager.hpp" +#include "cppuddle/memory_recycling/std_recycling_allocators.hpp" int hpx_main(int argc, char *argv[]) { @@ -112,8 +112,8 @@ int hpx_main(int argc, char *argv[]) { for (size_t pass = 0; pass < passes; pass++) { for (size_t i = 0; i < number_futures; i++) { futs[i] = futs[i].then([&](hpx::shared_future &&predecessor) { - std::vector> test6(array_size, - double{}); + std::vector> + test6(array_size, double{}); }); } } @@ -126,20 +126,20 @@ int hpx_main(int argc, char *argv[]) { std::cout << "\n==> Recycle allocation test took " << recycle_duration << "ms" << std::endl; } - recycler::print_performance_counters(); - recycler::force_cleanup(); // Cleanup all buffers and the managers for better + cppuddle::memory_recycling::print_buffer_counters(); + cppuddle::memory_recycling::force_buffer_cleanup(); // Cleanup all buffers and the managers for better // comparison // ensure that at least 4 buffers have to created for unit testing { - std::vector> buffer1( + std::vector> buffer1( array_size, double{}); - std::vector> buffer2( + std::vector> buffer2( array_size, double{}); - std::vector> buffer3( + std::vector> buffer3( array_size, double{}); - std::vector> buffer4( + std::vector> buffer4( array_size, double{}); } @@ -153,8 +153,10 @@ int hpx_main(int argc, char *argv[]) { for (size_t pass = 0; pass < passes; pass++) { for (size_t i = 0; i < number_futures; i++) { futs[i] = futs[i].then([&](hpx::shared_future &&predecessor) { - std::vector> test6( - array_size, double{}); + std::vector< + double, + cppuddle::memory_recycling::aggressive_recycle_std> + test6(array_size, double{}); }); } } @@ -167,8 +169,8 @@ int hpx_main(int argc, char *argv[]) { std::cout << "\n==> Aggressive recycle allocation test took " << aggressive_duration << "ms" << std::endl; } - recycler::print_performance_counters(); - recycler::force_cleanup(); // Cleanup all buffers and the managers for better + cppuddle::memory_recycling::print_buffer_counters(); + cppuddle::memory_recycling::force_buffer_cleanup(); // Cleanup all buffers and the managers for better // comparison diff --git a/tests/allocator_kokkos_executor_for_loop_test.cpp b/tests/allocator_kokkos_executor_for_loop_test.cpp index 7708fe56..ad184ff5 100644 --- a/tests/allocator_kokkos_executor_for_loop_test.cpp +++ b/tests/allocator_kokkos_executor_for_loop_test.cpp @@ -18,12 +18,13 @@ #include #include -#include "../include/buffer_manager.hpp" -#include "../include/cuda_buffer_util.hpp" -#include "../include/kokkos_buffer_util.hpp" #include #include +#include "cppuddle/memory_recycling/std_recycling_allocators.hpp" +#include "cppuddle/memory_recycling/cuda_recycling_allocators.hpp" +#include "cppuddle/memory_recycling/util/recycling_kokkos_view.hpp" + // Assert during Release builds as well for this file: #undef NDEBUG #include // reinclude the header to update the definition of assert() @@ -36,18 +37,17 @@ template using kokkos_um_array = Kokkos::View; template -using recycled_host_view = - recycler::recycled_view, recycler::recycle_std, T>; - +using recycle_host_view = cppuddle::memory_recycling::recycling_view< + kokkos_um_array, 
cppuddle::memory_recycling::recycle_std, T>; // Device views using recycle allocators template using kokkos_um_device_array = Kokkos::View; template -using recycled_device_view = - recycler::recycled_view, - recycler::recycle_allocator_cuda_device, T>; +using recycle_device_view = cppuddle::memory_recycling::recycling_view< + kokkos_um_device_array, + cppuddle::memory_recycling::recycle_allocator_cuda_device, T>; // Host views using pinned memory recycle allocators template @@ -55,9 +55,9 @@ using kokkos_um_pinned_array = Kokkos::View::array_layout, Kokkos::CudaHostPinnedSpace, Kokkos::MemoryUnmanaged>; template -using recycled_pinned_view = - recycler::recycled_view, - recycler::recycle_allocator_cuda_host, T>; +using recycle_pinned_view = cppuddle::memory_recycling::recycling_view< + kokkos_um_pinned_array, + cppuddle::memory_recycling::recycle_allocator_cuda_host, T>; template auto get_iteration_policy(const Executor &&executor, @@ -81,7 +81,7 @@ int main(int argc, char *argv[]) { // Host run for (size_t pass = 0; pass < passes; pass++) { // Create view - recycled_host_view hostView(view_size_0, view_size_1); + recycle_host_view hostView(view_size_0, view_size_1); // Create executor hpx::kokkos::serial_executor executor; @@ -109,7 +109,7 @@ int main(int argc, char *argv[]) { // Device run for (size_t pass = 0; pass < passes; pass++) { // Create and init host view - recycled_pinned_view hostView(view_size_0, view_size_1); + recycle_pinned_view hostView(view_size_0, view_size_1); for(size_t i = 0; i < view_size_0; i++) { for(size_t j = 0; j < view_size_1; j++) { hostView(i, j) = 1.0; @@ -120,7 +120,7 @@ int main(int argc, char *argv[]) { hpx::kokkos::cuda_executor executor(hpx::kokkos::execution_space_mode::independent); // Use executor to move the host data to the device - recycled_device_view deviceView(view_size_0, view_size_1); + recycle_device_view deviceView(view_size_0, view_size_1); Kokkos::deep_copy(executor.instance(), deviceView, hostView); auto policy_1 = Kokkos::Experimental::require( @@ -143,11 +143,11 @@ int main(int argc, char *argv[]) { // otherwise the HPX cuda polling futures won't work hpx::cuda::experimental::detail::unregister_polling(hpx::resource::get_thread_pool(0)); - recycler::print_performance_counters(); + cppuddle::memory_recycling::print_buffer_counters(); // Cleanup all cuda views // (otherwise the cuda driver might shut down before this gets done automatically at // the end of the programm) - recycler::force_cleanup(); + cppuddle::memory_recycling::force_buffer_cleanup(); return hpx::finalize(); } diff --git a/tests/allocator_kokkos_test.cpp b/tests/allocator_kokkos_test.cpp index e2770458..e231b557 100644 --- a/tests/allocator_kokkos_test.cpp +++ b/tests/allocator_kokkos_test.cpp @@ -13,9 +13,6 @@ #include #include -#include "../include/buffer_manager.hpp" -#include "../include/cuda_buffer_util.hpp" -#include "../include/kokkos_buffer_util.hpp" #ifdef CPPUDDLE_HAVE_HPX #include #endif @@ -24,6 +21,10 @@ #include #include +#include "cppuddle/memory_recycling/std_recycling_allocators.hpp" +#include "cppuddle/memory_recycling/cuda_recycling_allocators.hpp" +#include "cppuddle/memory_recycling/util/recycling_kokkos_view.hpp" + using kokkos_array = Kokkos::View; @@ -32,8 +33,8 @@ template using kokkos_um_array = Kokkos::View; template -using recycled_host_view = - recycler::recycled_view, recycler::recycle_std, T>; +using recycle_host_view = cppuddle::memory_recycling::recycling_view< + kokkos_um_array, cppuddle::memory_recycling::recycle_std, T>; #ifdef 
CPPUDDLE_HAVE_HPX int hpx_main(int argc, char *argv[]) { @@ -74,8 +75,8 @@ int main(int argc, char *argv[]) { hpx::kokkos::ScopeGuard scopeGuard(argc, argv); Kokkos::print_configuration(std::cout); - using test_view = recycled_host_view; - using test_double_view = recycled_host_view; + using test_view = recycle_host_view; + using test_double_view = recycle_host_view; constexpr size_t passes = 100; for (size_t pass = 0; pass < passes; pass++) { @@ -91,7 +92,7 @@ int main(int argc, char *argv[]) { }); Kokkos::fence(); } - recycler::print_performance_counters(); + cppuddle::memory_recycling::print_buffer_counters(); #ifdef CPPUDDLE_HAVE_HPX return hpx::finalize(); #else diff --git a/tests/allocator_test.cpp b/tests/allocator_test.cpp index 004368a4..9a44664f 100644 --- a/tests/allocator_test.cpp +++ b/tests/allocator_test.cpp @@ -3,7 +3,6 @@ // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) -#include "../include/buffer_manager.hpp" #ifdef CPPUDDLE_HAVE_HPX #include #endif @@ -17,6 +16,8 @@ #include #include +#include "cppuddle/memory_recycling/std_recycling_allocators.hpp" + #ifdef CPPUDDLE_HAVE_HPX int hpx_main(int argc, char *argv[]) { #else @@ -76,8 +77,9 @@ int main(int argc, char *argv[]) { std::cout << "\nStarting run with aggressive recycle allocator: " << std::endl; for (size_t pass = 0; pass < passes; pass++) { auto begin = std::chrono::high_resolution_clock::now(); - std::vector> test1( - array_size, double{}); + std::vector> + test1(array_size, double{}); auto end = std::chrono::high_resolution_clock::now(); aggressive_duration += std::chrono::duration_cast(end - begin) @@ -88,16 +90,17 @@ int main(int argc, char *argv[]) { std::cout << "\n\n==> Aggressive recycle allocation test took " << aggressive_duration << "ms" << std::endl; } - recycler::print_performance_counters(); - recycler::force_cleanup(); // Cleanup all buffers and the managers for better - // comparison + cppuddle::memory_recycling::print_buffer_counters(); + cppuddle::memory_recycling::force_buffer_cleanup(); // Cleanup all buffers and the managers for + // better comparison // Recycle Test: { std::cout << "\nStarting run with recycle allocator: " << std::endl; for (size_t pass = 0; pass < passes; pass++) { auto begin = std::chrono::high_resolution_clock::now(); - std::vector> test1(array_size, double{}); + std::vector> + test1(array_size, double{}); auto end = std::chrono::high_resolution_clock::now(); recycle_duration += std::chrono::duration_cast(end - begin) @@ -108,9 +111,9 @@ int main(int argc, char *argv[]) { std::cout << "\n\n==> Recycle allocation test took " << recycle_duration << "ms" << std::endl; } - recycler::print_performance_counters(); - recycler::force_cleanup(); // Cleanup all buffers and the managers for better - // comparison + cppuddle::memory_recycling::print_buffer_counters(); + cppuddle::memory_recycling::force_buffer_cleanup(); // Cleanup all buffers and the managers for + // better comparison // Same test using std::allocator: { @@ -138,7 +141,7 @@ int main(int argc, char *argv[]) { std::cout << "Test information: Aggressive recycler was faster than default allocator!" 
<< std::endl; } - recycler::print_performance_counters(); + cppuddle::memory_recycling::print_buffer_counters(); #ifdef CPPUDDLE_HAVE_HPX return hpx::finalize(); #else diff --git a/tests/stream_test.cpp b/tests/stream_test.cpp index 96599759..2e3ebf4c 100644 --- a/tests/stream_test.cpp +++ b/tests/stream_test.cpp @@ -1,10 +1,9 @@ -// Copyright (c) 2020-2021 Gregor Daiß +// Copyright (c) 2020-2024 Gregor Daiß // // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) #define USE_HPX_MAIN -#include "../include/stream_manager.hpp" #include #ifdef USE_HPX_MAIN #include @@ -26,46 +25,49 @@ int main(int argc, char *argv[]) { #endif std::cout << "Starting ref counting tests ..." << std::endl; test_pool_ref_counting>( - 2, 0, false); - test_pool_ref_counting< - hpx::cuda::experimental::cuda_executor, - round_robin_pool>(2, 0, false); + cppuddle::executor_recycling::priority_pool_impl< + hpx::cuda::experimental::cuda_executor>>(2, 0, + false); + test_pool_ref_counting>(2, 0, + false); std::cout << "Finished ref counting tests!" << std::endl; - std::cout << "Starting wrapper objects tests ..." << std::endl; test_pool_wrappers>( - 2, 0, false); + cppuddle::executor_recycling::priority_pool_impl< + hpx::cuda::experimental::cuda_executor>>(2, 0, false); test_pool_wrappers>( - 2, 0, false); + cppuddle::executor_recycling::round_robin_pool_impl< + hpx::cuda::experimental::cuda_executor>>(2, 0, false); std::cout << "Finished wrapper objects tests!" << std::endl; std::cout << "Starting memcpy tests... " << std::endl; test_pool_memcpy>( - 2, 0, false); + cppuddle::executor_recycling::round_robin_pool_impl< + hpx::cuda::experimental::cuda_executor>>(2, 0, false); test_pool_memcpy>( - 2, 0, false); + cppuddle::executor_recycling::priority_pool_impl< + hpx::cuda::experimental::cuda_executor>>(2, 0, false); std::cout << "Finished memcpy tests! " << std::endl; std::cout << "Starting memcpy polling tests... " << std::endl; { // hpx::cuda::experimental::enable_user_polling polling_scope; - hpx::cuda::experimental::detail::register_polling(hpx::resource::get_thread_pool(0)); + hpx::cuda::experimental::detail::register_polling( + hpx::resource::get_thread_pool(0)); test_pool_memcpy>( - 2, 0, true); + cppuddle::executor_recycling::round_robin_pool_impl< + hpx::cuda::experimental::cuda_executor>>(2, 0, true); test_pool_memcpy>( - 2, 0, true); - hpx::cuda::experimental::detail::unregister_polling(hpx::resource::get_thread_pool(0)); + cppuddle::executor_recycling::priority_pool_impl< + hpx::cuda::experimental::cuda_executor>>(2, 0, true); + hpx::cuda::experimental::detail::unregister_polling( + hpx::resource::get_thread_pool(0)); } - recycler::force_cleanup(); + cppuddle::memory_recycling::force_buffer_cleanup(); std::cout << "Finished memcpy tests! " << std::endl; return hpx::finalize(); } diff --git a/tests/stream_test.hpp b/tests/stream_test.hpp index 07de4c44..63f25b27 100644 --- a/tests/stream_test.hpp +++ b/tests/stream_test.hpp @@ -1,4 +1,4 @@ -// Copyright (c) 2020-2021 Gregor Daiß +// Copyright (c) 2020-2024 Gregor Daiß // // Distributed under the Boost Software License, Version 1.0. 
(See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) @@ -9,123 +9,182 @@ #include #include #include -#include "../include/buffer_manager.hpp" -#include "../include/cuda_buffer_util.hpp" +#include "cppuddle/memory_recycling/cuda_recycling_allocators.hpp" +#include "cppuddle/memory_recycling/util/cuda_recycling_device_buffer.hpp" +#include "cppuddle/executor_recycling/executor_pools_interface.hpp" template -void test_pool_memcpy(const size_t stream_parameter, Ts &&... ts) { - std::vector> hostbuffer( - 512); - recycler::cuda_device_buffer devicebuffer(512); - stream_pool::init(stream_parameter, std::forward(ts)...); +void test_pool_memcpy(const size_t executor_parameter, Ts &&...ts) { + std::vector> + hostbuffer(512); + cppuddle::memory_recycling::cuda_device_buffer devicebuffer(512); + cppuddle::executor_recycling::executor_pool::init( + executor_parameter, std::forward(ts)...); // without interface wrapper { - auto test1 = stream_pool::get_interface(0); + auto test1 = + cppuddle::executor_recycling::executor_pool::get_interface(0); Interface test1_interface = std::get<0>(test1); size_t interface_id = std::get<1>(test1); - hpx::apply(test1_interface, cudaMemcpyAsync, devicebuffer.device_side_buffer, - hostbuffer.data(), 512 * sizeof(double), - cudaMemcpyHostToDevice); - auto fut1 = hpx::async(test1_interface, - cudaMemcpyAsync, hostbuffer.data(), devicebuffer.device_side_buffer, - 512 * sizeof(double), cudaMemcpyDeviceToHost); + hpx::apply(test1_interface, cudaMemcpyAsync, + devicebuffer.device_side_buffer, hostbuffer.data(), + 512 * sizeof(double), cudaMemcpyHostToDevice); + auto fut1 = hpx::async(test1_interface, cudaMemcpyAsync, hostbuffer.data(), + devicebuffer.device_side_buffer, + 512 * sizeof(double), cudaMemcpyDeviceToHost); fut1.get(); - stream_pool::release_interface(interface_id, 0); + cppuddle::executor_recycling::executor_pool::release_interface( + interface_id, 0); } // with interface wrapper { - stream_interface test1_interface{0}; + cppuddle::executor_recycling::executor_interface + test1_interface{0}; // hpx::cuda::cuda_executor test1_interface(0, false); - hpx::apply(test1_interface.interface, cudaMemcpyAsync, devicebuffer.device_side_buffer, - hostbuffer.data(), 512 * sizeof(double), - cudaMemcpyHostToDevice); - auto fut1 = hpx::async(test1_interface.interface, - cudaMemcpyAsync, hostbuffer.data(), devicebuffer.device_side_buffer, - 512 * sizeof(double), cudaMemcpyDeviceToHost); + hpx::apply(test1_interface.interface, cudaMemcpyAsync, + devicebuffer.device_side_buffer, hostbuffer.data(), + 512 * sizeof(double), cudaMemcpyHostToDevice); + auto fut1 = hpx::async(test1_interface.interface, cudaMemcpyAsync, + hostbuffer.data(), devicebuffer.device_side_buffer, + 512 * sizeof(double), cudaMemcpyDeviceToHost); fut1.get(); } - stream_pool::cleanup(); + cppuddle::executor_recycling::executor_pool::cleanup(); } template -void test_pool_ref_counting(const size_t stream_parameter, Ts &&... 
ts) { +void test_pool_ref_counting(const size_t executor_parameter, Ts &&...ts) { // init ppol - stream_pool::init(stream_parameter, std::forward(ts)...); + cppuddle::executor_recycling::executor_pool::init( + executor_parameter, std::forward(ts)...); { // Allocating - auto test1 = stream_pool::get_interface(0); - auto load1 = stream_pool::get_current_load(0); + auto test1 = + cppuddle::executor_recycling::executor_pool::get_interface(0); + auto load1 = + cppuddle::executor_recycling::executor_pool::get_current_load(0); assert(load1 == 0); Interface test1_interface = std::get<0>(test1); size_t test1_index = std::get<1>(test1); - auto test2 = stream_pool::get_interface(0); - auto load2 = stream_pool::get_current_load(0); + auto test2 = + cppuddle::executor_recycling::executor_pool::get_interface(0); + auto load2 = + cppuddle::executor_recycling::executor_pool::get_current_load(0); assert(load2 == 1); Interface test2_interface = std::get<0>(test2); // auto fut = test2_interface.get_future(); size_t test2_index = std::get<1>(test2); - auto test3 = stream_pool::get_interface(0); - auto load3 = stream_pool::get_current_load(0); + auto test3 = + cppuddle::executor_recycling::executor_pool::get_interface(0); + auto load3 = + cppuddle::executor_recycling::executor_pool::get_current_load(0); assert(load3 == 1); Interface test3_interface = std::get<0>(test3); size_t test3_index = std::get<1>(test3); - auto test4 = stream_pool::get_interface(0); - auto load4 = stream_pool::get_current_load(0); + auto test4 = + cppuddle::executor_recycling::executor_pool::get_interface(0); + auto load4 = + cppuddle::executor_recycling::executor_pool::get_current_load(0); Interface test4_interface = std::get<0>(test4); size_t test4_index = std::get<1>(test4); assert(load4 == 2); // Releasing - stream_pool::release_interface(test4_index, 0); - load4 = stream_pool::get_current_load(0); + cppuddle::executor_recycling::executor_pool::release_interface( + test4_index, 0); + load4 = + cppuddle::executor_recycling::executor_pool::get_current_load(0); assert(load4 == 1); - stream_pool::release_interface(test3_index, 0); - load3 = stream_pool::get_current_load(0); + cppuddle::executor_recycling::executor_pool::release_interface( + test3_index, 0); + load3 = + cppuddle::executor_recycling::executor_pool::get_current_load(0); assert(load3 == 1); - stream_pool::release_interface(test2_index, 0); - load2 = stream_pool::get_current_load(0); + cppuddle::executor_recycling::executor_pool::release_interface( + test2_index, 0); + load2 = + cppuddle::executor_recycling::executor_pool::get_current_load(0); assert(load2 == 0); - stream_pool::release_interface(test1_index, 0); - load1 = stream_pool::get_current_load(0); + cppuddle::executor_recycling::executor_pool::release_interface( + test1_index, 0); + load1 = + cppuddle::executor_recycling::executor_pool::get_current_load(0); assert(load1 == 0); } // Clear - auto load0 = stream_pool::get_current_load(0); + auto load0 = + cppuddle::executor_recycling::executor_pool::get_current_load(0); assert(load0 == 0); - stream_pool::cleanup(); + cppuddle::executor_recycling::executor_pool::cleanup(); } template -void test_pool_wrappers(const size_t stream_parameter, Ts &&... 
ts) { - using wrapper_type = stream_interface; +void test_pool_wrappers(const size_t executor_parameter, Ts &&...ts) { + using wrapper_type = + cppuddle::executor_recycling::executor_interface; // init ppol - stream_pool::init(stream_parameter, std::forward(ts)...); + cppuddle::executor_recycling::executor_pool::init( + executor_parameter, std::forward(ts)...); { wrapper_type test1{0}; - auto load = stream_pool::get_current_load(0); + auto load = + cppuddle::executor_recycling::executor_pool::get_current_load(0); assert(load == 0); wrapper_type test2{0}; - load = stream_pool::get_current_load(0); + load = + cppuddle::executor_recycling::executor_pool::get_current_load(0); // auto fut = test2.get_future(); assert(load == 1); wrapper_type test3{0}; - load = stream_pool::get_current_load(0); + load = + cppuddle::executor_recycling::executor_pool::get_current_load(0); assert(load == 1); wrapper_type test4{0}; - load = stream_pool::get_current_load(0); + load = + cppuddle::executor_recycling::executor_pool::get_current_load(0); assert(load == 2); // Check availability method: - bool avail = stream_pool::interface_available(1, 0); + bool avail = + cppuddle::executor_recycling::executor_pool::interface_available< + Interface, Pool>(1, 0); assert(avail == false); // NOLINT - avail = stream_pool::interface_available(2, 0); + avail = cppuddle::executor_recycling::executor_pool::interface_available< + Interface, Pool>(2, 0); assert(avail == false); // NOLINT - avail = stream_pool::interface_available(3, 0); + avail = cppuddle::executor_recycling::executor_pool::interface_available< + Interface, Pool>(3, 0); assert(avail == true); // NOLINT } - auto load0 = stream_pool::get_current_load(0); + auto load0 = + cppuddle::executor_recycling::executor_pool::get_current_load(0); assert(load0 == 0); - stream_pool::cleanup(); + cppuddle::executor_recycling::executor_pool::cleanup(); } #endif diff --git a/tests/work_aggregation_cpu_triad.cpp b/tests/work_aggregation_cpu_triad.cpp index fed34626..d65c9668 100644 --- a/tests/work_aggregation_cpu_triad.cpp +++ b/tests/work_aggregation_cpu_triad.cpp @@ -5,9 +5,8 @@ #include #undef NDEBUG - -#include "../include/aggregation_manager.hpp" -#include "../include/cuda_buffer_util.hpp" +#include "cppuddle/memory_recycling/std_recycling_allocators.hpp" +#include "cppuddle/kernel_aggregation/kernel_aggregation_interface.hpp" #include @@ -101,7 +100,8 @@ int hpx_main(int argc, char *argv[]) { size_t number_underlying_executors{0}; bool print_launch_counter{false}; std::string executor_type_string{}; - Aggregated_Executor_Modes executor_mode{Aggregated_Executor_Modes::EAGER}; + cppuddle::kernel_aggregation::aggregated_executor_modes executor_mode{ + cppuddle::kernel_aggregation::aggregated_executor_modes::EAGER}; std::string filename{}; { try { @@ -161,11 +161,11 @@ int hpx_main(int argc, char *argv[]) { return hpx::finalize(); } if (executor_type_string == "EAGER") { - executor_mode = Aggregated_Executor_Modes::EAGER; + executor_mode = cppuddle::kernel_aggregation::aggregated_executor_modes::EAGER; } else if (executor_type_string == "STRICT") { - executor_mode = Aggregated_Executor_Modes::STRICT; + executor_mode = cppuddle::kernel_aggregation::aggregated_executor_modes::STRICT; } else if (executor_type_string == "ENDLESS") { - executor_mode = Aggregated_Executor_Modes::ENDLESS; + executor_mode = cppuddle::kernel_aggregation::aggregated_executor_modes::ENDLESS; } else { std::cerr << "ERROR: Unknown executor mode " << executor_type_string << "\n Valid choices are: 
EAGER,STRICT,ENDLESS" << std::endl; @@ -180,11 +180,14 @@ int hpx_main(int argc, char *argv[]) { } } - stream_pool::init>( + cppuddle::executor_recycling::executor_pool::init< + Dummy_Executor, + cppuddle::executor_recycling::round_robin_pool_impl>( number_underlying_executors); static const char kernelname[] = "cpu_triad"; - using executor_pool = aggregation_pool>; + using executor_pool = cppuddle::kernel_aggregation::aggregation_pool< + kernelname, Dummy_Executor, + cppuddle::executor_recycling::round_robin_pool_impl>; executor_pool::init(number_aggregation_executors, max_slices, executor_mode); using float_t = float; @@ -290,7 +293,7 @@ int hpx_main(int argc, char *argv[]) { std::flush(hpx::cout); sleep(1); - recycler::force_cleanup(); // Cleanup all buffers and the managers + cppuddle::memory_recycling::force_buffer_cleanup(); // Cleanup all buffers and the managers return hpx::finalize(); } diff --git a/tests/work_aggregation_cuda_triad.cpp b/tests/work_aggregation_cuda_triad.cpp index f3f6ec92..75f7ad14 100644 --- a/tests/work_aggregation_cuda_triad.cpp +++ b/tests/work_aggregation_cuda_triad.cpp @@ -7,24 +7,25 @@ //#undef NDEBUG #include -#include "../include/aggregation_manager.hpp" -#include "../include/cuda_buffer_util.hpp" - #include - +#include "cppuddle/memory_recycling/cuda_recycling_allocators.hpp" +#include "cppuddle/memory_recycling/util/cuda_recycling_device_buffer.hpp" +#include "cppuddle/kernel_aggregation/kernel_aggregation_interface.hpp" //=============================================================================== //=============================================================================== // Stream benchmark template -__global__ void __launch_bounds__(1024, 2) triad_kernel(float_t *A, const float_t *B, const float_t *C, const float_t scalar, const size_t start_id, const size_t kernel_size, const size_t problem_size) { +__global__ void __launch_bounds__(1024, 2) + triad_kernel(float_t *A, const float_t *B, const float_t *C, + const float_t scalar, const size_t start_id, + const size_t kernel_size, const size_t problem_size) { const size_t i = start_id + blockIdx.x * blockDim.x + threadIdx.x; A[i] = B[i] + scalar * C[i]; } - //=============================================================================== //=============================================================================== int hpx_main(int argc, char *argv[]) { @@ -37,7 +38,8 @@ int hpx_main(int argc, char *argv[]) { size_t number_underlying_executors{0}; bool print_launch_counter{false}; std::string executor_type_string{}; - Aggregated_Executor_Modes executor_mode{Aggregated_Executor_Modes::EAGER}; + cppuddle::kernel_aggregation::aggregated_executor_modes executor_mode{ + cppuddle::kernel_aggregation::aggregated_executor_modes::EAGER}; std::string filename{}; { try { @@ -97,11 +99,11 @@ int hpx_main(int argc, char *argv[]) { return hpx::finalize(); } if (executor_type_string == "EAGER") { - executor_mode = Aggregated_Executor_Modes::EAGER; + executor_mode = cppuddle::kernel_aggregation::aggregated_executor_modes::EAGER; } else if (executor_type_string == "STRICT") { - executor_mode = Aggregated_Executor_Modes::STRICT; + executor_mode = cppuddle::kernel_aggregation::aggregated_executor_modes::STRICT; } else if (executor_type_string == "ENDLESS") { - executor_mode = Aggregated_Executor_Modes::ENDLESS; + executor_mode = cppuddle::kernel_aggregation::aggregated_executor_modes::ENDLESS; } else { std::cerr << "ERROR: Unknown executor mode " << executor_type_string << "\n Valid choices are: 
EAGER,STRICT,ENDLESS" << std::endl; @@ -119,11 +121,14 @@ int hpx_main(int argc, char *argv[]) { hpx::cuda::experimental::detail::register_polling(hpx::resource::get_thread_pool(0)); using executor_t = hpx::cuda::experimental::cuda_executor; - stream_pool::init>( + cppuddle::executor_recycling::executor_pool::init< + executor_t, + cppuddle::executor_recycling::round_robin_pool_impl>( number_underlying_executors, 0, true); static const char kernelname2[] = "cuda_triad"; - using executor_pool = aggregation_pool>; + using executor_pool = cppuddle::kernel_aggregation::aggregation_pool< + kernelname2, executor_t, + cppuddle::executor_recycling::round_robin_pool_impl>; executor_pool::init(number_aggregation_executors, max_slices, executor_mode); using float_t = float; @@ -147,9 +152,9 @@ int hpx_main(int argc, char *argv[]) { std::vector A(problem_size, 0.0); std::vector B(problem_size, 2.0); std::vector C(problem_size, 1.0); - recycler::cuda_device_buffer device_A(problem_size, 0); - recycler::cuda_device_buffer device_B(problem_size, 0); - recycler::cuda_device_buffer device_C(problem_size, 0); + cppuddle::memory_recycling::cuda_device_buffer device_A(problem_size, 0); + cppuddle::memory_recycling::cuda_device_buffer device_B(problem_size, 0); + cppuddle::memory_recycling::cuda_device_buffer device_C(problem_size, 0); cudaMemcpy(device_A.device_side_buffer, A.data(), problem_size * sizeof(float_t), cudaMemcpyHostToDevice); cudaMemcpy(device_B.device_side_buffer, B.data(), @@ -196,17 +201,16 @@ int hpx_main(int argc, char *argv[]) { auto slice_exec = fut.get(); auto alloc_host = slice_exec.template make_allocator< - float_t, recycler::detail::cuda_pinned_allocator>(); + float_t, cppuddle::memory_recycling::detail::cuda_pinned_allocator>(); auto alloc_device = slice_exec.template make_allocator< - float_t, recycler::detail::cuda_device_allocator>(); + float_t, cppuddle::memory_recycling::detail::cuda_device_allocator>(); // Start the actual task - // todo -- one slice gets a buffer that's not vaild anymore std::vector local_A( slice_exec.number_slices * kernel_size, float_t{}, alloc_host); - recycler::cuda_aggregated_device_buffer device_A(slice_exec.number_slices * kernel_size, alloc_device); @@ -214,7 +218,7 @@ int hpx_main(int argc, char *argv[]) { std::vector local_B( slice_exec.number_slices * kernel_size, float_t{}, alloc_host); - recycler::cuda_aggregated_device_buffer device_B(slice_exec.number_slices * kernel_size, alloc_device); @@ -222,7 +226,7 @@ int hpx_main(int argc, char *argv[]) { std::vector local_C( slice_exec.number_slices * kernel_size, float_t{}, alloc_host); - recycler::cuda_aggregated_device_buffer device_C(slice_exec.number_slices * kernel_size, alloc_device); @@ -317,9 +321,9 @@ int hpx_main(int argc, char *argv[]) { std::vector A(problem_size, 0.0); std::vector B(problem_size, 2.0); std::vector C(problem_size, 1.0); - recycler::cuda_device_buffer device_A(problem_size, 0); - recycler::cuda_device_buffer device_B(problem_size, 0); - recycler::cuda_device_buffer device_C(problem_size, 0); + cppuddle::memory_recycling::cuda_device_buffer device_A(problem_size, 0); + cppuddle::memory_recycling::cuda_device_buffer device_B(problem_size, 0); + cppuddle::memory_recycling::cuda_device_buffer device_C(problem_size, 0); cudaMemcpy(device_A.device_side_buffer, A.data(), problem_size * sizeof(float_t), cudaMemcpyHostToDevice); cudaMemcpy(device_B.device_side_buffer, B.data(), @@ -417,7 +421,7 @@ int hpx_main(int argc, char *argv[]) { /* sleep(1); */ 
hpx::cuda::experimental::detail::unregister_polling(hpx::resource::get_thread_pool(0)); - recycler::force_cleanup(); // Cleanup all buffers and the managers + cppuddle::memory_recycling::force_buffer_cleanup(); // Cleanup all buffers and the managers return hpx::finalize(); } diff --git a/tests/work_aggregation_test.cpp b/tests/work_aggregation_test.cpp index 25455633..abe827f4 100644 --- a/tests/work_aggregation_test.cpp +++ b/tests/work_aggregation_test.cpp @@ -1,4 +1,4 @@ -// Copyright (c) 2022-2022 Gregor Daiß +// Copyright (c) 2022-2024 Gregor Daiß // // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) @@ -9,11 +9,16 @@ #include #include #include -#include "../include/aggregation_manager.hpp" -#include "../include/cuda_buffer_util.hpp" #include +#include "cppuddle/memory_recycling/cuda_recycling_allocators.hpp" +#include "cppuddle/memory_recycling/util/cuda_recycling_device_buffer.hpp" +#include "cppuddle/executor_recycling/executor_pools_interface.hpp" +#define DEBUG_AGGREGATION_CALLS 1 // enables checks if aggregated function calls are + // compatible across all participating tasks + // Must be defined before including the aggregation: +#include "cppuddle/kernel_aggregation/kernel_aggregation_interface.hpp" //=============================================================================== //=============================================================================== @@ -114,9 +119,11 @@ namespace hpx { namespace parallel { namespace execution { void sequential_test(void) { static const char kernelname[] = "kernel1"; - using kernel_pool1 = aggregation_pool>; - kernel_pool1::init(8, 2, Aggregated_Executor_Modes::STRICT); + using kernel_pool1 = cppuddle::kernel_aggregation::aggregation_pool< + kernelname, Dummy_Executor, + cppuddle::executor_recycling::round_robin_pool_impl>; + kernel_pool1::init( + 8, 2, cppuddle::kernel_aggregation::aggregated_executor_modes::STRICT); // Sequential test hpx::cout << "Sequential test with all executor slices" << std::endl; hpx::cout << "----------------------------------------" << std::endl; @@ -260,8 +267,8 @@ void interruption_test(void) { hpx::cout << "Sequential test with interruption:" << std::endl; hpx::cout << "----------------------------------" << std::endl; { - Aggregated_Executor agg_exec{ - 4, Aggregated_Executor_Modes::EAGER}; + cppuddle::kernel_aggregation::aggregated_executor agg_exec{ + 4, cppuddle::kernel_aggregation::aggregated_executor_modes::EAGER}; std::vector> slices_done_futs; auto slice_fut1 = agg_exec.request_executor_slice(); @@ -326,8 +333,8 @@ void failure_test(bool type_error) { hpx::cout << "------------------------------------------------------" << std::endl; { - Aggregated_Executor agg_exec{ - 4, Aggregated_Executor_Modes::STRICT}; + cppuddle::kernel_aggregation::aggregated_executor agg_exec{ + 4, cppuddle::kernel_aggregation::aggregated_executor_modes::STRICT}; auto slice_fut1 = agg_exec.request_executor_slice(); @@ -405,9 +412,11 @@ void pointer_add_test(void) { hpx::cout << "--------------------------------------------------------" << std::endl; static const char kernelname2[] = "kernel2"; - using kernel_pool2 = aggregation_pool>; - kernel_pool2::init(8, 2, Aggregated_Executor_Modes::STRICT); + using kernel_pool2 = cppuddle::kernel_aggregation::aggregation_pool< + kernelname2, Dummy_Executor, + cppuddle::executor_recycling::round_robin_pool_impl>; + kernel_pool2::init( + 8, 2, 
cppuddle::kernel_aggregation::aggregated_executor_modes::STRICT); { std::vector erg(512); std::vector> slices_done_futs; @@ -602,10 +611,12 @@ void references_add_test(void) { { /*Aggregated_Executor agg_exec{ 4, Aggregated_Executor_Modes::STRICT};*/ - auto &agg_exec = - std::get<0>(stream_pool::get_interface< - Aggregated_Executor, - round_robin_pool>>(0)); + auto &agg_exec = std::get<0>( + cppuddle::executor_recycling::executor_pool::get_interface< + cppuddle::kernel_aggregation::aggregated_executor, + cppuddle::executor_recycling::round_robin_pool_impl< + cppuddle::kernel_aggregation::aggregated_executor< + Dummy_Executor>>>(0)); std::vector erg(512); std::vector> slices_done_futs; @@ -826,14 +837,19 @@ int hpx_main(int argc, char *argv[]) { return hpx::finalize(); } - stream_pool::init>( - 8, 0, false); - stream_pool::init>(8); - - stream_pool::init, - round_robin_pool>>( - 8, 4, Aggregated_Executor_Modes::STRICT); + cppuddle::executor_recycling::executor_pool::init< + hpx::cuda::experimental::cuda_executor, + cppuddle::executor_recycling::round_robin_pool_impl< + hpx::cuda::experimental::cuda_executor>>(8, 0, false); + cppuddle::executor_recycling::executor_pool::init< + Dummy_Executor, + cppuddle::executor_recycling::round_robin_pool_impl>(8); + + cppuddle::executor_recycling::executor_pool::init< + cppuddle::kernel_aggregation::aggregated_executor, + cppuddle::executor_recycling::round_robin_pool_impl< + cppuddle::kernel_aggregation::aggregated_executor>>( + 8, 4, cppuddle::kernel_aggregation::aggregated_executor_modes::STRICT); /*hpx::cuda::experimental::cuda_executor executor1 = std::get<0>(stream_pool::get_interface< hpx::cuda::experimental::cuda_executor, @@ -863,8 +879,8 @@ int hpx_main(int argc, char *argv[]) { std::flush(hpx::cout); sleep(1); - recycler::print_performance_counters(); - recycler::force_cleanup(); // Cleanup all buffers and the managers + cppuddle::memory_recycling::print_buffer_counters(); + cppuddle::memory_recycling::force_buffer_cleanup(); // Cleanup all buffers and the managers return hpx::finalize(); }
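Example usage of the renamed allocator interface (a minimal sketch only: the header path, recycle_std allocator, and the print_buffer_counters/force_buffer_cleanup calls are taken from the test changes above, while the standalone main() scaffolding assumes a non-HPX build of CPPuddle and is purely illustrative):

// Minimal check of the new cppuddle::memory_recycling names used by the
// updated tests; not part of the patch itself.
#include <vector>

#include "cppuddle/memory_recycling/std_recycling_allocators.hpp"

int main() {
  {
    // Buffers allocated through the recycling allocator are handed back to the
    // internal pool on destruction and reused by subsequent allocations.
    std::vector<double, cppuddle::memory_recycling::recycle_std<double>>
        buffer(512, 0.0);
  }
  // Replaces the former recycler::print_performance_counters() and
  // recycler::force_cleanup() calls from the deprecated interface.
  cppuddle::memory_recycling::print_buffer_counters();
  cppuddle::memory_recycling::force_buffer_cleanup();
  return 0;
}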