diff --git a/.gitignore b/.gitignore index 2b25a731..bdc68a8d 100644 --- a/.gitignore +++ b/.gitignore @@ -7,3 +7,5 @@ spack.lock .clangd docs compile_commands.json +spack-build* +spack-configure-args.txt diff --git a/CMakeLists.txt b/CMakeLists.txt index 4a31995f..2212d40b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -21,7 +21,6 @@ set(CPPUDDLE_VERSION_STRING "${CPPUDDLE_VERSION_MAJOR}.${CPPUDDLE_VERSION_MINOR} # GPU-related options option(CPPUDDLE_WITH_CUDA "Enable CUDA tests/examples" OFF) -option(CPPUDDLE_WITH_MULTIGPU_SUPPORT "Enables experimental MultiGPU support" OFF) option(CPPUDDLE_WITH_KOKKOS "Enable KOKKOS tests/examples" OFF) set(CPPUDDLE_WITH_MAX_NUMBER_GPUS "1" CACHE STRING "Number of GPUs that will be used. Should match the number of GPUs used when using the maximum number of HPX worker threads. Should be 1 for non-HPX builds.") # HPX-related options @@ -151,6 +150,9 @@ endif() # Define library targets and installation # (also includes various warnings for non-optimal build configurations) +# TODO Cleanup targets: +# this is leftover from the days where cppuddle was not header-only + ## Interface targets add_library(buffer_manager INTERFACE) if (CPPUDDLE_WITH_HPX) @@ -319,9 +321,6 @@ if (CPPUDDLE_WITH_TESTS) COMPONENT_DEPENDENCIES iostreams SOURCES tests/work_aggregation_test.cpp - include/aggregation_manager.hpp - include/buffer_manager.hpp - include/stream_manager.hpp ) add_hpx_executable( @@ -331,9 +330,6 @@ if (CPPUDDLE_WITH_TESTS) COMPONENT_DEPENDENCIES iostreams SOURCES tests/work_aggregation_cpu_triad.cpp - include/aggregation_manager.hpp - include/buffer_manager.hpp - include/stream_manager.hpp ) add_hpx_executable( @@ -343,9 +339,6 @@ if (CPPUDDLE_WITH_TESTS) COMPONENT_DEPENDENCIES iostreams SOURCES tests/work_aggregation_cuda_triad.cpp - include/aggregation_manager.hpp - include/buffer_manager.hpp - include/stream_manager.hpp ) target_compile_definitions(work_aggregation_test PRIVATE CPPUDDLE_HAVE_CUDA) endif() # end WITH KOKKOS @@ -359,11 +352,6 @@ if (CPPUDDLE_WITH_TESTS) add_compile_definitions(CPPUDDLE_WITH_HPX) endif() - if (CPPUDDLE_WITH_MULTIGPU_SUPPORT) - add_compile_definitions(CPPUDDLE_HAVE_MULTIGPU) - message(WARNING, " Multi-GPU Support not yet properly tested!") - endif() - #------------------------------------------------------------------------------------------------------------ # Define actual tests (usually running the binary and checking its output for certain patterns via regex) diff --git a/include/aggregation_manager.hpp b/include/aggregation_manager.hpp index 2aa92063..9b546cab 100644 --- a/include/aggregation_manager.hpp +++ b/include/aggregation_manager.hpp @@ -1,1151 +1,39 @@ -// Copyright (c) 2022-2023 Gregor Daiß +// Copyright (c) 2024 Gregor Daiß // // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) -#ifndef WORK_AGGREGATION_MANAGER -#define WORK_AGGREGATION_MANAGER +// DEPRECATED: Do not use this file +// Only intended to make the old interface work a bit longer. 
+// See deprecation warnings for the new location of the functionality -#ifndef CPPUDDLE_HAVE_HPX -#error "Work aggregation allocators/executors require CPPUDDLE_WITH_HPX=ON" -#endif - -#include -//#define DEBUG_AGGREGATION_CALLS 1 - -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include - -#if defined(HPX_HAVE_CUDA) || defined(HPX_HAVE_HIP) -// required for defining type traits using cuda executor as underlying -// aggregation executors -#include -#endif - -#include -#include - -#include "../include/buffer_manager.hpp" -#include "../include/stream_manager.hpp" -#include "../include/detail/config.hpp" - -#ifndef CPPUDDLE_HAVE_HPX_MUTEX -#pragma message \ - "Work aggregation will use hpx::mutex internally, despite CPPUDDLE_WITH_HPX_MUTEX=OFF" -#pragma message \ - "Consider using CPPUDDLE_WITH_HPX_MUTEX=ON, to make the rest of CPPuddle also use hpx::mutex" -#endif -namespace recycler { - using aggregation_mutex_t = hpx::mutex; -} - -//=============================================================================== -//=============================================================================== -// Helper functions/classes - -/// Constructs a tuple with copies (to store temporaries in aggregated function -/// calls) yet also supporting references (on the users own risk...) -template -std::tuple make_tuple_supporting_references(Ts &&...ts) { - return std::tuple{std::forward(ts)...}; -} - -/// Print some specific values that we can, but don't bother for most types -/// (such as vector) -template std::string print_if_possible(T val) { - if constexpr (std::is_convertible_v) { - return val; - } else if constexpr (std::is_integral_v || std::is_floating_point_v) { - return std::to_string(val); - } else if constexpr (std::is_pointer_v) { - // Pretty printing pointer sort of only works well with %p - // TODO Try using std::format as soon as we can move to C++20 - std::unique_ptr debug_string(new char[128]()); - snprintf(debug_string.get(), 128, "%p", val); - return std::string(debug_string.get()); - } else { - return std::string("cannot print value"); - } -} - -/// Helper class for the helper class that prints tuples -- do not use this -/// directly -template -void print_tuple(const TupType &_tup, std::index_sequence) { - (..., (hpx::cout << (I == 0 ? "" : ", ") - << print_if_possible(std::get(_tup)))); -} - -/// Helper class for printing tuples (first component should be a function -/// pointer, remaining components the function arguments) -template void print_tuple(const std::tuple &_tup) { - // Use pointer and sprintf as boost::format refused to NOT cast the pointer - // address to 1... 
- // TODO Try using std::format as soon as we can move to C++20 - std::unique_ptr debug_string(new char[128]()); - snprintf(debug_string.get(), 128, "Function address: %p -- Arguments: (", - std::get<0>(_tup)); - hpx::cout << debug_string.get(); - print_tuple(_tup, std::make_index_sequence()); - hpx::cout << ")"; -} - -//=============================================================================== -//=============================================================================== -template -void exec_post_wrapper(Executor & exec, F &&f, Ts &&...ts) { - hpx::apply(exec, std::forward(f), std::forward(ts)...); -} - -template -hpx::lcos::future exec_async_wrapper(Executor & exec, F &&f, Ts &&...ts) { - return hpx::async(exec, std::forward(f), std::forward(ts)...); -} - -/// Manages the launch conditions for aggregated function calls -/// type/value-errors -/** Launch conditions: All slice executors must have called the same function - * (tracked by future all_slices_ready) - * AND - * Previous aggregated_function_call on the same Executor must have been - * launched (tracked by future stream_future) - * All function calls received from the slice executors are checked if they - * match the first one in both types and values (throws exception otherwise) - */ - -template class aggregated_function_call { -private: - std::atomic slice_counter = 0; - - /// Promise to be set when all slices have visited this function call - /* hpx::lcos::local::promise slices_ready_promise; */ - /// Tracks if all slices have visited this function call - /* hpx::lcos::future all_slices_ready = slices_ready_promise.get_future(); */ - /// How many slices can we expect? - const size_t number_slices; - const bool async_mode; - - Executor &underlying_executor; - -#if !(defined(NDEBUG)) && defined(DEBUG_AGGREGATION_CALLS) -#pragma message \ - "Building slow work aggegator build with additional runtime checks! Build with NDEBUG defined for fast build..." 
- /// Stores the function call of the first slice as reference for error - /// checking - std::any function_tuple; - /// Stores the string of the first function call for debug output - std::string debug_type_information; - recycler::aggregation_mutex_t debug_mut; -#endif - - std::vector> potential_async_promises{}; - -public: - aggregated_function_call(const size_t number_slices, bool async_mode, Executor &exec) - : number_slices(number_slices), async_mode(async_mode), underlying_executor(exec) { - if (async_mode) - potential_async_promises.resize(number_slices); - } - ~aggregated_function_call(void) { - // All slices should have done this call - assert(slice_counter == number_slices); - // assert(!all_slices_ready.valid()); - } - /// Returns true if all required slices have visited this point - bool sync_aggregation_slices(hpx::lcos::future &stream_future) { - assert(!async_mode); - assert(potential_async_promises.empty()); - const size_t local_counter = slice_counter++; - if (local_counter == number_slices - 1) { - return true; - } - else return false; - } - template - void post_when(hpx::lcos::future &stream_future, F &&f, Ts &&...ts) { -#if !(defined(NDEBUG)) && defined(DEBUG_AGGREGATION_CALLS) - // needed for concurrent access to function_tuple and debug_type_information - // Not required for normal use - std::lock_guard guard(debug_mut); -#endif - assert(!async_mode); - assert(potential_async_promises.empty()); - const size_t local_counter = slice_counter++; - - if (local_counter == 0) { -#if !(defined(NDEBUG)) && defined(DEBUG_AGGREGATION_CALLS) - auto tmp_tuple = - make_tuple_supporting_references(f, std::forward(ts)...); - function_tuple = tmp_tuple; - debug_type_information = typeid(decltype(tmp_tuple)).name(); -#endif - - } else { - // - // This scope checks if both the type and the values of the current call - // match the original call To be used in debug build... 
- // -#if !(defined(NDEBUG)) && defined(DEBUG_AGGREGATION_CALLS) - auto comparison_tuple = - make_tuple_supporting_references(f, std::forward(ts)...); - try { - auto orig_call_tuple = - std::any_cast(function_tuple); - if (comparison_tuple != orig_call_tuple) { - throw std::runtime_error( - "Values of post function arguments (or function " - "itself) do not match "); - } - } catch (const std::bad_any_cast &e) { - hpx::cout - << "\nMismatched types error in aggregated post call of executor " - << ": " << e.what() << "\n"; - hpx::cout << "Expected types:\t\t " - << boost::core::demangle(debug_type_information.c_str()); - hpx::cout << "\nGot types:\t\t " - << boost::core::demangle( - typeid(decltype(comparison_tuple)).name()) - << "\n" - << std::endl; - // throw; - } catch (const std::runtime_error &e) { - hpx::cout - << "\nMismatched values error in aggregated post call of executor " - << ": " << e.what() << std::endl; - hpx::cout << "Types (matched):\t " - << boost::core::demangle(debug_type_information.c_str()); - auto orig_call_tuple = - std::any_cast(function_tuple); - hpx::cout << "\nExpected values:\t "; - print_tuple(orig_call_tuple); - hpx::cout << "\nGot values:\t\t "; - print_tuple(comparison_tuple); - hpx::cout << std::endl << std::endl; - // throw; - } -#endif - } - assert(local_counter < number_slices); - assert(slice_counter < number_slices + 1); - // Check exit criteria: Launch function call continuation by setting the - // slices promise - if (local_counter == number_slices - 1) { - exec_post_wrapper(underlying_executor, std::forward(f), std::forward(ts)...); - //slices_ready_promise.set_value(); - } - } - template - hpx::lcos::future async_when(hpx::lcos::future &stream_future, - F &&f, Ts &&...ts) { -#if !(defined(NDEBUG)) && defined(DEBUG_AGGREGATION_CALLS) - // needed for concurrent access to function_tuple and debug_type_information - // Not required for normal use - std::lock_guard guard(debug_mut); -#endif - assert(async_mode); - assert(!potential_async_promises.empty()); - const size_t local_counter = slice_counter++; - if (local_counter == 0) { -#if !(defined(NDEBUG)) && defined(DEBUG_AGGREGATION_CALLS) - auto tmp_tuple = - make_tuple_supporting_references(f, std::forward(ts)...); - function_tuple = tmp_tuple; - debug_type_information = typeid(decltype(tmp_tuple)).name(); -#endif - } else { - // - // This scope checks if both the type and the values of the current call - // match the original call To be used in debug build... 
- // -#if !(defined(NDEBUG)) && defined(DEBUG_AGGREGATION_CALLS) - auto comparison_tuple = - make_tuple_supporting_references(f, std::forward(ts)...); - try { - auto orig_call_tuple = - std::any_cast(function_tuple); - if (comparison_tuple != orig_call_tuple) { - throw std::runtime_error( - "Values of async function arguments (or function " - "itself) do not match "); - } - } catch (const std::bad_any_cast &e) { - hpx::cout - << "\nMismatched types error in aggregated async call of executor " - << ": " << e.what() << "\n"; - hpx::cout << "Expected types:\t\t " - << boost::core::demangle(debug_type_information.c_str()); - hpx::cout << "\nGot types:\t\t " - << boost::core::demangle( - typeid(decltype(comparison_tuple)).name()) - << "\n" - << std::endl; - // throw; - } catch (const std::runtime_error &e) { - hpx::cout - << "\nMismatched values error in aggregated async call of executor " - << ": " << e.what() << std::endl; - hpx::cout << "Types (matched):\t " - << boost::core::demangle(debug_type_information.c_str()); - auto orig_call_tuple = - std::any_cast(function_tuple); - hpx::cout << "\nExpected values:\t "; - print_tuple(orig_call_tuple); - hpx::cout << "\nGot values:\t\t "; - print_tuple(comparison_tuple); - hpx::cout << std::endl << std::endl; - // throw; - } -#endif - } - assert(local_counter < number_slices); - assert(slice_counter < number_slices + 1); - assert(potential_async_promises.size() == number_slices); - hpx::lcos::future ret_fut = - potential_async_promises[local_counter].get_future(); - if (local_counter == number_slices - 1) { - /* slices_ready_promise.set_value(); */ - auto fut = exec_async_wrapper( - underlying_executor, std::forward(f), std::forward(ts)...); - fut.then([this](auto &&fut) { - for (auto &promise : potential_async_promises) { - promise.set_value(); - } - }); - } - // Check exit criteria: Launch function call continuation by setting the - // slices promise - return ret_fut; - } - template - hpx::lcos::shared_future wrap_async(hpx::lcos::future &stream_future, - F &&f, Ts &&...ts) { - assert(async_mode); - assert(!potential_async_promises.empty()); - const size_t local_counter = slice_counter++; - assert(local_counter < number_slices); - assert(slice_counter < number_slices + 1); - assert(potential_async_promises.size() == number_slices); - hpx::lcos::shared_future ret_fut = - potential_async_promises[local_counter].get_shared_future(); - if (local_counter == number_slices - 1) { - auto fut = f(std::forward(ts)...); - fut.then([this](auto &&fut) { - // TODO just use one promise - for (auto &promise : potential_async_promises) { - promise.set_value(); - } - }); - } - return ret_fut; - } - // We need to be able to copy or no-except move for std::vector.. - aggregated_function_call(const aggregated_function_call &other) = default; - aggregated_function_call & - operator=(const aggregated_function_call &other) = default; - aggregated_function_call(aggregated_function_call &&other) = default; - aggregated_function_call & - operator=(aggregated_function_call &&other) = default; -}; - -//=============================================================================== -//=============================================================================== - -enum class Aggregated_Executor_Modes { EAGER = 1, STRICT, ENDLESS }; -/// Declaration since the actual allocator is only defined after the Executors -template -class Allocator_Slice; - -/// Executor Class that aggregates function calls for specific kernels -/** Executor is not meant to be used directly. 
Instead it yields multiple - * Executor_Slice objects. These serve as interfaces. Slices from the same - * Aggregated_Executor are meant to execute the same function calls but on - * different data (i.e. different tasks) - */ -template class Aggregated_Executor { -private: - //=============================================================================== - // Misc private avariables: - // - std::atomic slices_exhausted; - - std::atomic executor_slices_alive; - std::atomic buffers_in_use; - std::atomic dealloc_counter; - - const Aggregated_Executor_Modes mode; - const size_t max_slices; - std::atomic current_slices; - /// Wrapper to the executor interface from the stream pool - /// Automatically hooks into the stream_pools reference counting - /// for cpu/gpu load balancing - std::unique_ptr>> executor_wrapper; - -public: - size_t gpu_id; - // Subclasses - - /// Slice class - meant as a scope interface to the aggregated executor - class Executor_Slice { - public: - Aggregated_Executor &parent; - private: - /// Executor is a slice of this aggregated_executor - /// How many functions have been called - required to enforce sequential - /// behaviour of kernel launches - size_t launch_counter{0}; - size_t buffer_counter{0}; - bool notify_parent_about_destruction{true}; - - public: - /// How many slices are there overall - required to check the launch - /// criteria - const size_t number_slices; - const size_t id; - using executor_t = Executor; - Executor_Slice(Aggregated_Executor &parent, const size_t slice_id, - const size_t number_slices) - : parent(parent), notify_parent_about_destruction(true), - number_slices(number_slices), id(slice_id) { - } - ~Executor_Slice(void) { - // Don't notify parent if we moved away from this executor_slice - if (notify_parent_about_destruction) { - // Executor should be done by the time of destruction - // -> check here before notifying parent - - // parent still in execution mode? - assert(parent.slices_exhausted == true); - // all kernel launches done? 
- assert(launch_counter == parent.function_calls.size()); - // Notifiy parent that this aggregation slice is one - parent.reduce_usage_counter(); - } - } - Executor_Slice(const Executor_Slice &other) = delete; - Executor_Slice &operator=(const Executor_Slice &other) = delete; - Executor_Slice(Executor_Slice &&other) - : parent(other.parent), launch_counter(std::move(other.launch_counter)), - buffer_counter(std::move(other.buffer_counter)), - number_slices(std::move(other.number_slices)), - id(std::move(other.id)) { - other.notify_parent_about_destruction = false; - } - Executor_Slice &operator=(Executor_Slice &&other) { - parent = other.parent; - launch_counter = std::move(other.launch_counter); - buffer_counter = std::move(other.buffer_counter); - number_slices = std::move(other.number_slices); - id = std::move(other.id); - other.notify_parent_about_destruction = false; - } - template - Allocator_Slice make_allocator() { - return Allocator_Slice(*this); - } - bool sync_aggregation_slices() { - assert(parent.slices_exhausted == true); - auto ret = parent.sync_aggregation_slices(launch_counter); - launch_counter++; - return ret; - } - template void post(F &&f, Ts &&...ts) { - // we should only execute function calls once all slices - // have been given away (-> Executor Slices start) - assert(parent.slices_exhausted == true); - parent.post(launch_counter, std::forward(f), std::forward(ts)...); - launch_counter++; - } - template - hpx::lcos::future async(F &&f, Ts &&...ts) { - // we should only execute function calls once all slices - // have been given away (-> Executor Slices start) - assert(parent.slices_exhausted == true); - hpx::lcos::future ret_fut = parent.async( - launch_counter, std::forward(f), std::forward(ts)...); - launch_counter++; - return ret_fut; - } - - // OneWay Execution - template - friend decltype(auto) tag_invoke(hpx::parallel::execution::post_t, - Executor_Slice& exec, F&& f, Ts&&... ts) - { - return exec.post(std::forward(f), std::forward(ts)...); - } - - // TwoWay Execution - template - friend decltype(auto) tag_invoke( - hpx::parallel::execution::async_execute_t, Executor_Slice& exec, - F&& f, Ts&&... 
ts) - { - return exec.async( - std::forward(f), std::forward(ts)...); - } +#ifndef AGGREGATION_MANAGER_HPP +#define AGGREGATION_MANAGER_HPP - template - hpx::lcos::shared_future wrap_async(F &&f, Ts &&...ts) { - // we should only execute function calls once all slices - // have been given away (-> Executor Slices start) - assert(parent.slices_exhausted == true); - hpx::lcos::shared_future ret_fut = parent.wrap_async( - launch_counter, std::forward(f), std::forward(ts)...); - launch_counter++; - return ret_fut; - } +#include "cppuddle/kernel_aggregation/kernel_aggregation_interface.hpp" - /// Get new aggregated buffer (might have already been allocated been - /// allocated by different slice) - template T *get(const size_t size) { - assert(parent.slices_exhausted == true); - T *aggregated_buffer = - parent.get(size, buffer_counter); - buffer_counter++; - assert(buffer_counter > 0); - return aggregated_buffer; - } - - Executor& get_underlying_executor(void) { - assert(parent.executor_wrapper); - return *(parent.executor_wrapper); - } - }; - - //=============================================================================== - - hpx::lcos::local::promise slices_full_promise; - /// Promises with the slice executors -- to be set when the starting criteria - /// is met - std::vector> executor_slices; - /// List of aggregated function calls - function will be launched when all - /// slices have called it - std::deque> function_calls; - /// For synchronizing the access to the function calls list - recycler::aggregation_mutex_t mut; - - /// Data entry for a buffer allocation: void* pointer, size_t for - /// buffer-size, atomic for the slice counter, location_id, gpu_id - using buffer_entry_t = - std::tuple, bool, const size_t, size_t>; - /// Keeps track of the aggregated buffer allocations done in all the slices - std::deque buffer_allocations; - /// Map pointer to deque index for fast access in the deallocations - std::unordered_map buffer_allocations_map; - /// For synchronizing the access to the buffer_allocations - recycler::aggregation_mutex_t buffer_mut; - std::atomic buffer_counter = 0; - - /// Get new buffer OR get buffer already allocated by different slice - template - T *get(const size_t size, const size_t slice_alloc_counter) { - assert(slices_exhausted == true); - assert(executor_wrapper); - assert(executor_slices_alive == true); - // Add aggreated buffer entry in case it hasn't happened yet for this call - // First: Check if it already has happened - if (buffer_counter <= slice_alloc_counter) { - // we might be the first! Lock... - std::lock_guard guard(buffer_mut); - // ... and recheck - if (buffer_counter <= slice_alloc_counter) { - constexpr bool manage_content_lifetime = false; - buffers_in_use = true; - - // Default location -- useful for GPU builds as we otherwise create way too - // many different buffers for different aggregation sizes on different GPUs - /* size_t location_id = gpu_id * instances_per_gpu; */ - // Use integer conversion to only use 0 16 32 ... 
as buckets - size_t location_id = ((hpx::get_worker_thread_num() % recycler::number_instances) / 16) * 16; -#ifdef CPPUDDLE_HAVE_HPX_AWARE_ALLOCATORS - if (max_slices == 1) { - // get prefered location: aka the current hpx threads location - // Usually handy for CPU builds where we want to use the buffers - // close to the current CPU core - /* location_id = (hpx::get_worker_thread_num() / instances_per_gpu) * instances_per_gpu; */ - /* location_id = (gpu_id) * instances_per_gpu; */ - // division makes sure that we always use the same instance to store our gpu buffers. - } -#endif - // Get shiny and new buffer that will be shared between all slices - // Buffer might be recycled from previous allocations by the - // buffer_recycler... - T *aggregated_buffer = - recycler::detail::buffer_recycler::get( - size, manage_content_lifetime, location_id, gpu_id); - // Create buffer entry for this buffer - buffer_allocations.emplace_back(static_cast(aggregated_buffer), - size, 1, true, location_id, gpu_id); - -#ifndef NDEBUG - // if previousely used the buffer should not be in usage anymore - const auto exists = buffer_allocations_map.count( - static_cast(aggregated_buffer)); - if (exists > 0) { - const auto previous_usage_id = - buffer_allocations_map[static_cast(aggregated_buffer)]; - const auto &valid = - std::get<3>(buffer_allocations[previous_usage_id]); - assert(!valid); - } -#endif - buffer_allocations_map.insert_or_assign(static_cast(aggregated_buffer), - buffer_counter); - - assert (buffer_counter == slice_alloc_counter); - buffer_counter = buffer_allocations.size(); - - // Return buffer - return aggregated_buffer; - } - } - assert(buffers_in_use == true); - assert(std::get<3>(buffer_allocations[slice_alloc_counter])); // valid - assert(std::get<2>(buffer_allocations[slice_alloc_counter]) >= 1); - - // Buffer entry should already exist: - T *aggregated_buffer = static_cast( - std::get<0>(buffer_allocations[slice_alloc_counter])); - // Error handling: Size is wrong? - assert(size == std::get<1>(buffer_allocations[slice_alloc_counter])); - // Notify that one more slice has visited this buffer allocation - std::get<2>(buffer_allocations[slice_alloc_counter])++; - return aggregated_buffer; - } - - /// Notify buffer list that one slice is done with the buffer - template - void mark_unused(T *p, const size_t size) { - assert(slices_exhausted == true); - assert(executor_wrapper); - - void *ptr_key = static_cast(p); - size_t slice_alloc_counter = buffer_allocations_map[p]; - - assert(slice_alloc_counter < buffer_allocations.size()); - /*auto &[buffer_pointer_any, buffer_size, buffer_allocation_counter, valid] = - buffer_allocations[slice_alloc_counter];*/ - auto buffer_pointer_void = std::get<0>(buffer_allocations[slice_alloc_counter]); - const auto buffer_size = std::get<1>(buffer_allocations[slice_alloc_counter]); - auto &buffer_allocation_counter = std::get<2>(buffer_allocations[slice_alloc_counter]); - auto &valid = std::get<3>(buffer_allocations[slice_alloc_counter]); - const auto &location_id = std::get<4>(buffer_allocations[slice_alloc_counter]); - const auto &gpu_id = std::get<5>(buffer_allocations[slice_alloc_counter]); - assert(valid); - T *buffer_pointer = static_cast(buffer_pointer_void); - - assert(buffer_size == size); - assert(p == buffer_pointer); - // assert(buffer_pointer == p || buffer_pointer == nullptr); - // Slice is done with this buffer - buffer_allocation_counter--; - // Check if all slices are done with this buffer? - if (buffer_allocation_counter == 0) { - // Yes! 
"Deallocate" by telling the recylcer the buffer is fit for reusage - std::lock_guard guard(buffer_mut); - // Only mark unused if another buffer has not done so already (and marked - // it as invalid) - if (valid) { - assert(buffers_in_use == true); - recycler::detail::buffer_recycler::mark_unused( - buffer_pointer, buffer_size, location_id, gpu_id); - // mark buffer as invalid to prevent any other slice from marking the - // buffer as unused - valid = false; - - const size_t current_deallocs = ++dealloc_counter; - if (current_deallocs == buffer_counter) { - std::lock_guard guard(mut); - buffers_in_use = false; - if (!executor_slices_alive && !buffers_in_use) { - slices_exhausted = false; - // Release executor - executor_wrapper.reset(nullptr); - } - } - } - } - } - - //=============================================================================== - // Public Interface -public: - hpx::lcos::future current_continuation; - hpx::lcos::future last_stream_launch_done; - std::atomic overall_launch_counter = 0; - - /// Only meant to be accessed by the slice executors - bool sync_aggregation_slices(const size_t slice_launch_counter) { - std::lock_guard guard(mut); - assert(slices_exhausted == true); - assert(executor_wrapper); - // Add function call object in case it hasn't happened for this launch yet - if (overall_launch_counter <= slice_launch_counter) { - /* std::lock_guard guard(mut); */ - if (overall_launch_counter <= slice_launch_counter) { - function_calls.emplace_back(current_slices, false, *executor_wrapper); - overall_launch_counter = function_calls.size(); - return function_calls[slice_launch_counter].sync_aggregation_slices( - last_stream_launch_done); - } - } - - return function_calls[slice_launch_counter].sync_aggregation_slices( - last_stream_launch_done); - } - - /// Only meant to be accessed by the slice executors - template - void post(const size_t slice_launch_counter, F &&f, Ts &&...ts) { - std::lock_guard guard(mut); - assert(slices_exhausted == true); - assert(executor_wrapper); - // Add function call object in case it hasn't happened for this launch yet - if (overall_launch_counter <= slice_launch_counter) { - /* std::lock_guard guard(mut); */ - if (overall_launch_counter <= slice_launch_counter) { - function_calls.emplace_back(current_slices, false, *executor_wrapper); - overall_launch_counter = function_calls.size(); - function_calls[slice_launch_counter].post_when( - last_stream_launch_done, std::forward(f), std::forward(ts)...); - return; - } - } - - function_calls[slice_launch_counter].post_when( - last_stream_launch_done, std::forward(f), std::forward(ts)...); - return; - } - - /// Only meant to be accessed by the slice executors - template - hpx::lcos::future async(const size_t slice_launch_counter, F &&f, - Ts &&...ts) { - std::lock_guard guard(mut); - assert(slices_exhausted == true); - assert(executor_wrapper); - // Add function call object in case it hasn't happened for this launch yet - if (overall_launch_counter <= slice_launch_counter) { - /* std::lock_guard guard(mut); */ - if (overall_launch_counter <= slice_launch_counter) { - function_calls.emplace_back(current_slices, true, *executor_wrapper); - overall_launch_counter = function_calls.size(); - return function_calls[slice_launch_counter].async_when( - last_stream_launch_done, std::forward(f), std::forward(ts)...); - } - } - - return function_calls[slice_launch_counter].async_when( - last_stream_launch_done, std::forward(f), std::forward(ts)...); - } - /// Only meant to be accessed by the slice executors - 
template - hpx::lcos::shared_future wrap_async(const size_t slice_launch_counter, F &&f, - Ts &&...ts) { - std::lock_guard guard(mut); - assert(slices_exhausted == true); - assert(executor_wrapper); - // Add function call object in case it hasn't happened for this launch yet - if (overall_launch_counter <= slice_launch_counter) { - /* std::lock_guard guard(mut); */ - if (overall_launch_counter <= slice_launch_counter) { - function_calls.emplace_back(current_slices, true, *executor_wrapper); - overall_launch_counter = function_calls.size(); - return function_calls[slice_launch_counter].wrap_async( - last_stream_launch_done, std::forward(f), std::forward(ts)...); - } - } - - return function_calls[slice_launch_counter].wrap_async( - last_stream_launch_done, std::forward(f), std::forward(ts)...); - } - - bool slice_available(void) { - std::lock_guard guard(mut); - return !slices_exhausted; - } - - std::optional> request_executor_slice() { - std::lock_guard guard(mut); - if (!slices_exhausted) { - const size_t local_slice_id = ++current_slices; - if (local_slice_id == 1) { - // Cleanup leftovers from last run if any - // TODO still required? Should be clean here already - function_calls.clear(); - overall_launch_counter = 0; - std::lock_guard guard(buffer_mut); -#ifndef NDEBUG - for (const auto &buffer_entry : buffer_allocations) { - const auto &[buffer_pointer_any, buffer_size, - buffer_allocation_counter, valid, location_id, device_id] = - buffer_entry; - assert(!valid); - } -#endif - buffer_allocations.clear(); - buffer_allocations_map.clear(); - buffer_counter = 0; - - assert(executor_slices_alive == false); - assert(buffers_in_use == false); - executor_slices_alive = true; - buffers_in_use = false; - dealloc_counter = 0; - - if (mode == Aggregated_Executor_Modes::STRICT ) { - slices_full_promise = hpx::lcos::local::promise{}; - } - } - - // Create Executor Slice future -- that will be returned later - hpx::lcos::future ret_fut; - if (local_slice_id < max_slices) { - executor_slices.emplace_back(hpx::lcos::local::promise{}); - ret_fut = - executor_slices[local_slice_id - 1].get_future(); - } else { - launched_slices = current_slices; - ret_fut = hpx::make_ready_future(Executor_Slice{*this, - executor_slices.size(), launched_slices}); - } - - // Are we the first slice? If yes, add continuation set the - // Executor_Slice - // futures to ready if the launch conditions are met - if (local_slice_id == 1) { - // Redraw executor - assert(!executor_wrapper); - stream_pool::select_device>(gpu_id); - executor_wrapper.reset( - new stream_interface>(gpu_id)); - // Renew promise that all slices will be ready as the primary launch - // criteria... 
- hpx::lcos::shared_future fut; - if (mode == Aggregated_Executor_Modes::EAGER || - mode == Aggregated_Executor_Modes::ENDLESS) { - // Fallback launch condidtion: Launch as soon as the underlying stream - // is ready - /* auto slices_full_fut = slices_full_promise.get_future(); */ - stream_pool::select_device>(gpu_id); - auto exec_fut = (*executor_wrapper).get_future(); - /* auto fut = hpx::when_any(exec_fut, slices_full_fut); */ - fut = std::move(exec_fut); - } else { - auto slices_full_fut = slices_full_promise.get_shared_future(); - // Just use the slices launch condition - fut = std::move(slices_full_fut); - } - // Launch all executor slices within this continuation - current_continuation = fut.then([this](auto &&fut) { - std::lock_guard guard(mut); - slices_exhausted = true; - launched_slices = current_slices; - size_t id = 0; - for (auto &slice_promise : executor_slices) { - slice_promise.set_value( - Executor_Slice{*this, id, launched_slices}); - id++; - } - executor_slices.clear(); - }); - } - if (local_slice_id >= max_slices && - mode != Aggregated_Executor_Modes::ENDLESS) { - slices_exhausted = true; // prevents any more threads from entering - // before the continuation is launched - /* launched_slices = current_slices; */ - /* size_t id = 0; */ - /* for (auto &slice_promise : executor_slices) { */ - /* slice_promise.set_value( */ - /* Executor_Slice{*this, id, launched_slices}); */ - /* id++; */ - /* } */ - /* executor_slices.clear(); */ - if (mode == Aggregated_Executor_Modes::STRICT ) { - slices_full_promise.set_value(); // Trigger slices launch condition continuation - } - // that continuation will set all executor slices so far handed out to ready - } - return ret_fut; - } else { - // Return empty optional as failure - return std::optional>{}; - } - } - size_t launched_slices; - void reduce_usage_counter(void) { - /* std::lock_guard guard(mut); */ - assert(slices_exhausted == true); - assert(executor_wrapper); - assert(executor_slices_alive == true); - assert(launched_slices >= 1); - assert(current_slices >= 0 && current_slices <= launched_slices); - const size_t local_slice_id = --current_slices; - // Last slice goes out scope? 
- if (local_slice_id == 0) { - // Mark executor fit for reusage - std::lock_guard guard(mut); - executor_slices_alive = false; - if (!executor_slices_alive && !buffers_in_use) { - // Release executor - slices_exhausted = false; - executor_wrapper.reset(nullptr); - } - } - } - ~Aggregated_Executor(void) { - - assert(current_slices == 0); - assert(executor_slices_alive == false); - assert(buffers_in_use == false); - - if (mode != Aggregated_Executor_Modes::STRICT ) { - slices_full_promise.set_value(); // Trigger slices launch condition continuation - } - - // Cleanup leftovers from last run if any - function_calls.clear(); - overall_launch_counter = 0; -#ifndef NDEBUG - for (const auto &buffer_entry : buffer_allocations) { - const auto &[buffer_pointer_any, buffer_size, buffer_allocation_counter, - valid, location_id, device_id] = buffer_entry; - assert(!valid); - } -#endif - buffer_allocations.clear(); - buffer_allocations_map.clear(); - buffer_counter = 0; - - assert(buffer_allocations.empty()); - assert(buffer_allocations_map.empty()); - } - - Aggregated_Executor(const size_t number_slices, - Aggregated_Executor_Modes mode, const size_t gpu_id = 0) - : max_slices(number_slices), current_slices(0), slices_exhausted(false), - dealloc_counter(0), mode(mode), executor_slices_alive(false), - buffers_in_use(false), gpu_id(gpu_id), - executor_wrapper(nullptr), - current_continuation(hpx::make_ready_future()), - last_stream_launch_done(hpx::make_ready_future()) {} - // Not meant to be copied or moved - Aggregated_Executor(const Aggregated_Executor &other) = delete; - Aggregated_Executor &operator=(const Aggregated_Executor &other) = delete; - Aggregated_Executor(Aggregated_Executor &&other) = delete; - Aggregated_Executor &operator=(Aggregated_Executor &&other) = delete; -}; +using Aggregated_Executor_Modes + [[deprecated("Use cppuddle::kernel_aggregation::aggregated_executor_modes " + "from kernel_aggregation_interface.hpp instead")]] = + cppuddle::kernel_aggregation::aggregated_executor_modes; template -class Allocator_Slice { -private: - typename Aggregated_Executor::Executor_Slice &executor_reference; - Aggregated_Executor &executor_parent; - -public: - using value_type = T; - Allocator_Slice( - typename Aggregated_Executor::Executor_Slice &executor) - : executor_reference(executor), executor_parent(executor.parent) {} - template - explicit Allocator_Slice( - Allocator_Slice const &) noexcept {} - T *allocate(std::size_t n) { - T *data = executor_reference.template get(n); - return data; - } - void deallocate(T *p, std::size_t n) { - /* executor_reference.template mark_unused(p, n); */ - executor_parent.template mark_unused(p, n); - } - template - inline void construct(T *p, Args... args) noexcept { - // Do nothing here - we reuse the content of the last owner - } - void destroy(T *p) { - // Do nothing here - Contents will be destroyed when the buffer manager is - // destroyed, not before - } -}; -template -constexpr bool -operator==(Allocator_Slice const &, - Allocator_Slice const &) noexcept { - return false; -} -template -constexpr bool -operator!=(Allocator_Slice const &, - Allocator_Slice const &) noexcept { - return true; -} - -namespace hpx { namespace parallel { namespace execution { - // TODO Unfortunately does not work that way! 
Create trait that works for Executor Slices with - // compatible unlying executor types - /* template */ - /* struct is_one_way_executor::Executor_Slice> */ - /* : std::true_type */ - /* {}; */ - /* template */ - /* struct is_two_way_executor::Executor_Slice> */ - /* : std::true_type */ - /* {}; */ - -#if defined(HPX_HAVE_CUDA) || defined(HPX_HAVE_HIP) - // Workaround for the meantime: Manually create traits for compatible types: - template<> - struct is_one_way_executor::Executor_Slice> - : std::true_type - {}; - template<> - struct is_two_way_executor::Executor_Slice> - : std::true_type - {}; -#endif -}}} +using Allocator_Slice + [[deprecated("Use cppuddle::kernel_aggregation::allocator_slice " + "from kernel_aggregation_interface.hpp instead")]] = + cppuddle::kernel_aggregation::allocator_slice; -//=============================================================================== -//=============================================================================== -// Pool Strategy: +template +using Aggregated_Executor + [[deprecated("Use cppuddle::kernel_aggregation::aggregated_executor " + "from kernel_aggregation_interface.hpp instead")]] = + cppuddle::kernel_aggregation::aggregated_executor; template -class aggregation_pool { -public: - /// interface - template - static void init(size_t number_of_executors, size_t slices_per_executor, - Aggregated_Executor_Modes mode, size_t num_devices = 1) { - if (is_initialized) { - throw std::runtime_error( - std::string("Trying to initialize cppuddle aggregation pool twice") + - " Agg pool name: " + std::string(kernelname)); - } - if (num_devices > recycler::max_number_gpus) { - throw std::runtime_error( - std::string( - "Trying to initialize aggregation with more devices than the " - "maximum number of GPUs given at compiletime") + - " Agg pool name: " + std::string(kernelname)); - } - number_devices = num_devices; - for (size_t gpu_id = 0; gpu_id < number_devices; gpu_id++) { - - std::lock_guard guard(instance()[gpu_id].pool_mutex); - assert(instance()[gpu_id].aggregation_executor_pool.empty()); - for (int i = 0; i < number_of_executors; i++) { - instance()[gpu_id].aggregation_executor_pool.emplace_back(slices_per_executor, - mode, gpu_id); - } - instance()[gpu_id].slices_per_executor = slices_per_executor; - instance()[gpu_id].mode = mode; - } - is_initialized = true; - } - - /// Will always return a valid executor slice - static decltype(auto) request_executor_slice(void) { - if (!is_initialized) { - throw std::runtime_error( - std::string("Trying to use cppuddle aggregation pool without first calling init") + - " Agg poolname: " + std::string(kernelname)); - } - const size_t gpu_id = recycler::get_device_id(number_devices); - /* const size_t gpu_id = 1; */ - std::lock_guard guard(instance()[gpu_id].pool_mutex); - assert(!instance()[gpu_id].aggregation_executor_pool.empty()); - std::optional::Executor_Slice>> - ret; - size_t local_id = (instance()[gpu_id].current_interface) % - instance()[gpu_id].aggregation_executor_pool.size(); - ret = instance()[gpu_id].aggregation_executor_pool[local_id].request_executor_slice(); - // Expected case: current aggregation executor is free - if (ret.has_value()) { - return ret; - } - // current interface is bad -> find free one - size_t abort_counter = 0; - const size_t abort_number = instance()[gpu_id].aggregation_executor_pool.size() + 1; - do { - local_id = (++(instance()[gpu_id].current_interface)) % // increment interface - instance()[gpu_id].aggregation_executor_pool.size(); - ret = - 
instance()[gpu_id].aggregation_executor_pool[local_id].request_executor_slice(); - if (ret.has_value()) { - return ret; - } - abort_counter++; - } while (abort_counter <= abort_number); - // Everything's busy -> create new aggregation executor (growing pool) OR - // return empty optional - if (instance()[gpu_id].growing_pool) { - instance()[gpu_id].aggregation_executor_pool.emplace_back( - instance()[gpu_id].slices_per_executor, instance()[gpu_id].mode, gpu_id); - instance()[gpu_id].current_interface = - instance()[gpu_id].aggregation_executor_pool.size() - 1; - assert(instance()[gpu_id].aggregation_executor_pool.size() < 20480); - ret = instance()[gpu_id] - .aggregation_executor_pool[instance()[gpu_id].current_interface] - .request_executor_slice(); - assert(ret.has_value()); // fresh executor -- should always have slices - // available - } - return ret; - } - -private: - std::deque> aggregation_executor_pool; - std::atomic current_interface{0}; - size_t slices_per_executor; - Aggregated_Executor_Modes mode; - bool growing_pool{true}; - -private: - /// Required for dealing with adding elements to the deque of - /// aggregated_executors - recycler::aggregation_mutex_t pool_mutex; - /// Global access instance - static std::unique_ptr& instance(void) { - static std::unique_ptr pool_instances{ - new aggregation_pool[recycler::max_number_gpus]}; - return pool_instances; - } - static inline size_t number_devices = 1; - static inline bool is_initialized = false; - aggregation_pool() = default; - -public: - ~aggregation_pool() = default; - // Bunch of constructors we don't need - aggregation_pool(aggregation_pool const &other) = delete; - aggregation_pool &operator=(aggregation_pool const &other) = delete; - aggregation_pool(aggregation_pool &&other) = delete; - aggregation_pool &operator=(aggregation_pool &&other) = delete; -}; +using aggregation_pool + [[deprecated("Use cppuddle::kernel_aggregation::aggregation_pool " + "from kernel_aggregation_interface.hpp instead")]] = + cppuddle::kernel_aggregation::aggregation_pool; #endif diff --git a/include/aligned_buffer_util.hpp b/include/aligned_buffer_util.hpp index d36a994a..64497a9d 100644 --- a/include/aligned_buffer_util.hpp +++ b/include/aligned_buffer_util.hpp @@ -1,31 +1,31 @@ -// Copyright (c) 2020-2021 Gregor Daiß +// Copyright (c) 2024 Gregor Daiß // // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +// DEPRECATED: Do not use this file +// Only intended to make the old interface work a bit longer. 
+// See deprecation warnings for the new location of the functionality + #ifndef ALIGNED_BUFFER_UTIL_HPP #define ALIGNED_BUFFER_UTIL_HPP -#include "buffer_manager.hpp" -#include +#include "cppuddle/memory_recycling/aligned_recycling_allocators.hpp" namespace recycler { -namespace device_selection { -template -struct select_device_functor< - T, boost::alignment::aligned_allocator> { - void operator()(const size_t device_id) {} -}; -} // namespace device_selection template ::value, int> = 0> -using recycle_aligned = detail::recycle_allocator< - T, boost::alignment::aligned_allocator>; +using recycle_aligned + [[deprecated("Use from header aligned_recycling_allocators.hpp instead")]] = + cppuddle::memory_recycling::recycle_aligned; + template ::value, int> = 0> -using aggressive_recycle_aligned = detail::aggressive_recycle_allocator< - T, boost::alignment::aligned_allocator>; +using aggressive_recycle_aligned + [[deprecated("Use from header aligned_recycling_allocators.hpp instead")]] = + cppuddle::memory_recycling::aggressive_recycle_aligned; + } // namespace recycler #endif diff --git a/include/buffer_manager.hpp b/include/buffer_manager.hpp index 92a5f46b..69020e5b 100644 --- a/include/buffer_manager.hpp +++ b/include/buffer_manager.hpp @@ -1,938 +1,60 @@ -// Copyright (c) 2020-2023 Gregor Daiß +// Copyright (c) 2024 Gregor Daiß // // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +// DEPRECATED: Do not use this file +// Only intended to make the old interface work a bit longer. +// See deprecation warnings for the new location of the functionality + #ifndef BUFFER_MANAGER_HPP #define BUFFER_MANAGER_HPP -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -// Warn about suboptimal performance without correct HPX-aware allocators -#ifdef CPPUDDLE_HAVE_HPX -#ifndef CPPUDDLE_HAVE_HPX_AWARE_ALLOCATORS -#pragma message \ -"Warning: CPPuddle build with HPX support but without HPX-aware allocators enabled. \ -For better performance configure CPPuddle with CPPUDDLE_WITH_HPX_AWARE_ALLOCATORS=ON!" -#else -// include runtime to get HPX thread IDs required for the HPX-aware allocators -#include -#endif -#endif - -#if defined(CPPUDDLE_HAVE_HPX) && defined(CPPUDDLE_HAVE_HPX_MUTEX) -// For builds with The HPX mutex -#include -#endif - -#ifdef CPPUDDLE_HAVE_COUNTERS -#include -#if defined(CPPUDDLE_HAVE_HPX) -#include -#endif -#endif - -#include "../include/detail/config.hpp" +#include "cppuddle/common/config.hpp" +#include "cppuddle/memory_recycling/buffer_management_interface.hpp" +#include "cppuddle/memory_recycling/detail/buffer_management.hpp" +#include "cppuddle/memory_recycling/std_recycling_allocators.hpp" namespace recycler { -namespace device_selection { -template struct select_device_functor { - void operator()(const size_t device_id) { - if constexpr (max_number_gpus > 1) - throw std::runtime_error( - "Allocators used in Multi-GPU builds need explicit Multi-GPU support " - "(by having a select_device_functor overload"); - } -}; -template struct select_device_functor> { - void operator()(const size_t device_id) {} -}; -} // namespace device_selection - namespace detail { - - -class buffer_recycler { -public: -#if defined(CPPUDDLE_DEACTIVATE_BUFFER_RECYCLING) - -// Warn about suboptimal performance without recycling -#pragma message \ -"Warning: Building without buffer recycling! Use only for performance testing! 
\ -For better performance configure CPPuddle with CPPUDDLE_WITH_BUFFER_RECYCLING=ON!" - - template - static T *get(size_t number_elements, bool manage_content_lifetime = false, - std::optional location_hint = std::nullopt, - std::optional device_id = std::nullopt) { - - return Host_Allocator{}.allocate(number_elements); - } - /// Marks an buffer as unused and fit for reusage - template - static void mark_unused(T *p, size_t number_elements, - std::optional location_hint = std::nullopt, - std::optional device_id = std::nullopt) { - return Host_Allocator{}.deallocate(p, number_elements); - } -#else - /// Returns and allocated buffer of the requested size - this may be a reused - /// buffer - template - static T *get(size_t number_elements, bool manage_content_lifetime = false, - std::optional location_hint = std::nullopt, - std::optional device_id = std::nullopt) { - try { - return buffer_manager::get( - number_elements, manage_content_lifetime, location_hint, device_id); - } catch (const std::exception &exc) { - std::cerr << "ERROR: Encountered unhandled exception in cppuddle get: " << exc.what() << std::endl; - std::cerr << "Rethrowing exception... " << std::endl;; - throw; - } - } - /// Marks an buffer as unused and fit for reusage - template - static void mark_unused(T *p, size_t number_elements, - std::optional location_hint = std::nullopt, - std::optional device_id = std::nullopt) { - try { - return buffer_manager::mark_unused(p, number_elements, - location_hint, device_id); - } catch (const std::exception &exc) { - std::cerr << "ERROR: Encountered unhandled exception in cppuddle mark_unused: " << exc.what() << std::endl; - std::cerr << "Rethrowing exception... " << std::endl;; - throw; - } - } -#endif - template - static void register_allocator_counters_with_hpx(void) { -#ifdef CPPUDDLE_HAVE_COUNTERS - buffer_manager::register_counters_with_hpx(); -#else - std::cerr << "Warning: Trying to register allocator performance counters " - "with HPX but CPPuddle was built " - "without CPPUDDLE_WITH_COUNTERS -- operation will be ignored!" - << std::endl; -#endif - } - - /// Deallocate all buffers, no matter whether they are marked as used or not - static void clean_all() { - std::lock_guard guard(instance().callback_protection_mut); - for (const auto &clean_function : - instance().total_cleanup_callbacks) { - clean_function(); - } - } - /// Deallocated all currently unused buffer - static void clean_unused_buffers() { - std::lock_guard guard(instance().callback_protection_mut); - for (const auto &clean_function : - instance().partial_cleanup_callbacks) { - clean_function(); - } - } - /// Deallocate all buffers, no matter whether they are marked as used or not - static void finalize() { - std::lock_guard guard(instance().callback_protection_mut); - for (const auto &finalize_function : - instance().finalize_callbacks) { - finalize_function(); - } - } - - static void print_performance_counters() { -#ifdef CPPUDDLE_HAVE_COUNTERS - std::lock_guard guard(instance().callback_protection_mut); - for (const auto &print_function : - instance().print_callbacks) { - print_function(); - } -#else - std::cerr << "Warning: Trying to print allocator performance counters but CPPuddle was built " - "without CPPUDDLE_WITH_COUNTERS -- operation will be ignored!" 
- << std::endl; -#endif - } - - // Member variables and methods -private: - - /// Singleton instance access - static buffer_recycler& instance() { - static buffer_recycler singleton{}; - return singleton; - } - /// Callbacks for printing the performance counter data - std::list> print_callbacks; - /// Callbacks for buffer_manager finalize - each callback completely destroys - /// one buffer_manager - std::list> finalize_callbacks; - /// Callbacks for buffer_manager cleanups - each callback destroys all buffers within - /// one buffer_manager, both used and unsued - std::list> total_cleanup_callbacks; - /// Callbacks for partial buffer_manager cleanups - each callback deallocates - /// all unused buffers of a manager - std::list> partial_cleanup_callbacks; - /// default, private constructor - not automatically constructed due to the - /// deleted constructors - buffer_recycler() = default; - - mutex_t callback_protection_mut; - /// Add a callback function that gets executed upon cleanup and destruction - static void add_total_cleanup_callback(const std::function &func) { - std::lock_guard guard(instance().callback_protection_mut); - instance().total_cleanup_callbacks.push_back(func); - } - /// Add a callback function that gets executed upon partial (unused memory) - /// cleanup - static void add_partial_cleanup_callback(const std::function &func) { - std::lock_guard guard(instance().callback_protection_mut); - instance().partial_cleanup_callbacks.push_back(func); - } - /// Add a callback function that gets executed upon partial (unused memory) - /// cleanup - static void add_finalize_callback(const std::function &func) { - std::lock_guard guard(instance().callback_protection_mut); - instance().finalize_callbacks.push_back(func); - } - /// Add a callback function that gets executed upon partial (unused memory) - /// cleanup - static void add_print_callback(const std::function &func) { - std::lock_guard guard(instance().callback_protection_mut); - instance().print_callbacks.push_back(func); - } - -public: - ~buffer_recycler() = default; - - // Subclasses -private: - /// Memory Manager subclass to handle buffers a specific type - template class buffer_manager { - private: - // Tuple content: Pointer to buffer, buffer_size, location ID, Flag - // The flag at the end controls whether to buffer content is to be reused as - // well - using buffer_entry_type = std::tuple; - - - public: - /// Cleanup and delete this singleton - static void clean() { - assert(instance() && !is_finalized); - for (auto i = 0; i < number_instances * max_number_gpus; i++) { - std::lock_guard guard(instance()[i].mut); - instance()[i].clean_all_buffers(); - } - } - static void print_performance_counters() { - assert(instance() && !is_finalized); - for (auto i = 0; i < number_instances * max_number_gpus; i++) { - std::lock_guard guard(instance()[i].mut); - instance()[i].print_counters(); - } - } - static void finalize() { - assert(instance() && !is_finalized); - is_finalized = true; - for (auto i = 0; i < number_instances * max_number_gpus; i++) { - std::lock_guard guard(instance()[i].mut); - instance()[i].clean_all_buffers(); - } - instance().reset(); - } - /// Cleanup all buffers not currently in use - static void clean_unused_buffers_only() { - assert(instance() && !is_finalized); - for (auto i = 0; i < number_instances * max_number_gpus; i++) { - std::lock_guard guard(instance()[i].mut); - for (auto &buffer_tuple : instance()[i].unused_buffer_list) { - Host_Allocator alloc; - if (std::get<3>(buffer_tuple)) { - 
std::destroy_n(std::get<0>(buffer_tuple), std::get<1>(buffer_tuple)); - } - alloc.deallocate(std::get<0>(buffer_tuple), std::get<1>(buffer_tuple)); - } - instance()[i].unused_buffer_list.clear(); - } - } -#if defined(CPPUDDLE_HAVE_COUNTERS) && defined(CPPUDDLE_HAVE_HPX) - static size_t get_sum_number_recycling(bool reset) { - if (reset) - sum_number_recycling = 0; - return sum_number_recycling; - } - static size_t get_sum_number_allocation(bool reset) { - if (reset) - sum_number_allocation = 0; - return sum_number_allocation; - } - static size_t get_sum_number_creation(bool reset) { - if (reset) - sum_number_creation = 0; - return sum_number_creation; - } - static size_t get_sum_number_deallocation(bool reset) { - if (reset) - sum_number_deallocation = 0; - return sum_number_deallocation; - } - static size_t get_sum_number_wrong_hints(bool reset) { - if (reset) - sum_number_wrong_hints = 0; - return sum_number_wrong_hints; - } - static size_t get_sum_number_wrong_device_hints(bool reset) { - if (reset) - sum_number_wrong_hints = 0; - return sum_number_wrong_device_hints; - } - static size_t get_sum_number_bad_allocs(bool reset) { - if (reset) - sum_number_bad_allocs = 0; - return sum_number_bad_allocs; - } - - static void register_counters_with_hpx(void) { - std::string alloc_name = - boost::core::demangle(typeid(Host_Allocator).name()) + - std::string("_") + boost::core::demangle(typeid(T).name()); - hpx::performance_counters::install_counter_type( - std::string("/cppuddle/allocators/") + alloc_name + std::string("/number_recycling/"), - &get_sum_number_recycling, - "Number of allocations using a recycled buffer with this " - "allocator"); - hpx::performance_counters::install_counter_type( - std::string("/cppuddle/allocators/") + alloc_name + std::string("/number_allocations/"), - &get_sum_number_allocation, - "Number of allocations with this allocator"); - hpx::performance_counters::install_counter_type( - std::string("/cppuddle/allocators/") + alloc_name + std::string("/number_creations/"), - &get_sum_number_creation, - "Number of allocations not using a recycled buffer with this " - "allocator"); - hpx::performance_counters::install_counter_type( - std::string("/cppuddle/allocators/") + alloc_name + std::string("/number_deallocations/"), - &get_sum_number_deallocation, - "Number of deallocations yielding buffers to be recycled with this " - "allocator"); - hpx::performance_counters::install_counter_type( - std::string("/cppuddle/allocators/") + alloc_name + std::string("/number_wrong_hints/"), - &get_sum_number_wrong_hints, - "Number of wrong hints supplied to the dealloc method with this allocator"); - hpx::performance_counters::install_counter_type( - std::string("/cppuddle/allocators/") + alloc_name + std::string("/number_wrong_device_hints/"), - &get_sum_number_wrong_device_hints, - "Number of wrong device hints supplied to the dealloc method with this allocator"); - hpx::performance_counters::install_counter_type( - std::string("/cppuddle/allocators/") + alloc_name + std::string("/number_bad_allocs/"), - &get_sum_number_bad_allocs, - "Number of wrong bad allocs which triggered a cleanup of unused buffers"); - } -#endif - - /// Tries to recycle or create a buffer of type T and size number_elements. 
- static T *get(size_t number_of_elements, bool manage_content_lifetime, - std::optional location_hint = std::nullopt, - std::optional gpu_device_id = std::nullopt) { - init_callbacks_once(); - if (is_finalized) { - throw std::runtime_error("Tried allocation after finalization"); - } - assert(instance() && !is_finalized); - - size_t location_id = 0; - if (location_hint) { - location_id = *location_hint; - } - if (location_id >= number_instances) { - throw std::runtime_error("Tried to create buffer with invalid location_id [get]"); - } - size_t device_id = 0; - if (gpu_device_id) { - device_id = *gpu_device_id; - } - if (device_id >= max_number_gpus) { - throw std::runtime_error("Tried to create buffer with invalid device id [get]! " - "Is multigpu support enabled with the correct number " - "of GPUs?"); - } - - location_id = location_id + device_id * number_instances; - std::lock_guard guard(instance()[location_id].mut); - - -#ifdef CPPUDDLE_HAVE_COUNTERS - instance()[location_id].number_allocation++; - sum_number_allocation++; -#endif - // Check for unused buffers we can recycle: - for (auto iter = instance()[location_id].unused_buffer_list.begin(); - iter != instance()[location_id].unused_buffer_list.end(); iter++) { - auto tuple = *iter; - if (std::get<1>(tuple) == number_of_elements) { - instance()[location_id].unused_buffer_list.erase(iter); - - // handle the switch from aggressive to non aggressive reusage (or - // vice-versa) - if (manage_content_lifetime && !std::get<3>(tuple)) { - std::uninitialized_value_construct_n(std::get<0>(tuple), - number_of_elements); - std::get<3>(tuple) = true; - } else if (!manage_content_lifetime && std::get<3>(tuple)) { - std::destroy_n(std::get<0>(tuple), std::get<1>(tuple)); - std::get<3>(tuple) = false; - } - instance()[location_id].buffer_map.insert({std::get<0>(tuple), tuple}); -#ifdef CPPUDDLE_HAVE_COUNTERS - instance()[location_id].number_recycling++; - sum_number_recycling++; -#endif - return std::get<0>(tuple); - } - } - - // No unused buffer found -> Create new one and return it - try { - recycler::device_selection::select_device_functor{}( - device_id); - Host_Allocator alloc; - T *buffer = alloc.allocate(number_of_elements); - instance()[location_id].buffer_map.insert( - {buffer, std::make_tuple(buffer, number_of_elements, 1, - manage_content_lifetime)}); -#ifdef CPPUDDLE_HAVE_COUNTERS - instance()[location_id].number_creation++; - sum_number_creation++; -#endif - if (manage_content_lifetime) { - std::uninitialized_value_construct_n(buffer, number_of_elements); - } - return buffer; - } catch (std::bad_alloc &e) { - // not enough memory left! Cleanup and attempt again: - std::cerr - << "Not enough memory left. Cleaning up unused buffers now..." - << std::endl; - buffer_recycler::clean_unused_buffers(); - std::cerr << "Buffers cleaned! Try allocation again..." << std::endl; - - // If there still isn't enough memory left, the caller has to handle it - // We've done all we can in here - Host_Allocator alloc; - recycler::device_selection::select_device_functor{}( - device_id); - T *buffer = alloc.allocate(number_of_elements); - instance()[location_id].buffer_map.insert( - {buffer, std::make_tuple(buffer, number_of_elements, 1, - manage_content_lifetime)}); -#ifdef CPPUDDLE_HAVE_COUNTERS - instance()[location_id].number_creation++; - sum_number_creation++; - instance()[location_id].number_bad_alloc++; - sum_number_bad_allocs++; -#endif - std::cerr << "Second attempt allocation successful!" 
<< std::endl; - if (manage_content_lifetime) { - std::uninitialized_value_construct_n(buffer, number_of_elements); - } - return buffer; - } - } - - static void mark_unused(T *memory_location, size_t number_of_elements, - std::optional location_hint = std::nullopt, - std::optional device_hint = std::nullopt) { - if (is_finalized) - return; - assert(instance() && !is_finalized); - - size_t location_id = 0; - if (location_hint) { - location_id = *location_hint; - if (location_id >= number_instances) { - throw std::runtime_error( - "Buffer recylcer received invalid location hint [mark_unused]"); - } - } - size_t device_id = 0; - if (device_hint) { - device_id = *device_hint; - if (device_id >= max_number_gpus) { - throw std::runtime_error( - "Buffer recylcer received invalid devce hint [mark_unused]"); - } - } - - // Attempt 1 to find the correct bucket/location: Look at provided hint: - if (location_hint) { - size_t location_id = location_hint.value() + device_id * number_instances; - std::lock_guard guard(instance()[location_id].mut); - if (instance()[location_id].buffer_map.find(memory_location) != - instance()[location_id].buffer_map.end()) { -#ifdef CPPUDDLE_HAVE_COUNTERS - instance()[location_id].number_deallocation++; - sum_number_deallocation++; -#endif - auto it = instance()[location_id].buffer_map.find(memory_location); - assert(it != instance()[location_id].buffer_map.end()); - auto &tuple = it->second; - // sanity checks: - assert(std::get<1>(tuple) == number_of_elements); - // move to the unused_buffer list - instance()[location_id].unused_buffer_list.push_front(tuple); - instance()[location_id].buffer_map.erase(memory_location); - return; // Success - } - // hint was wrong -#ifdef CPPUDDLE_HAVE_COUNTERS - instance()[location_id].number_wrong_hints++; - sum_number_wrong_hints++; -#endif - } - // Failed to find buffer in the specified localtion/device! - // Attempt 2 - Look for buffer other locations on the same device... - for (size_t location_id = device_id * number_instances; - location_id < (device_id + 1) * number_instances; location_id++) { - if (location_hint) { - if (*location_hint + device_id * max_number_gpus == location_id) { - continue; // already tried this -> skip - } - } - std::lock_guard guard(instance()[location_id].mut); - if (instance()[location_id].buffer_map.find(memory_location) != - instance()[location_id].buffer_map.end()) { -#ifdef CPPUDDLE_HAVE_COUNTERS - instance()[location_id].number_deallocation++; - sum_number_deallocation++; -#endif - auto it = instance()[location_id].buffer_map.find(memory_location); - assert(it != instance()[location_id].buffer_map.end()); - auto &tuple = it->second; - // sanity checks: - assert(std::get<1>(tuple) == number_of_elements); - // move to the unused_buffer list - instance()[location_id].unused_buffer_list.push_front(tuple); - instance()[location_id].buffer_map.erase(memory_location); - return; // Success - } - } - // device hint was wrong -#ifdef CPPUDDLE_HAVE_COUNTERS - if (device_hint) { - sum_number_wrong_device_hints++; - } -#endif - // Failed to find buffer on the specified device! - // Attempt 3 - Look for buffer on other devices... 
- for (size_t local_device_id = 0; local_device_id < max_number_gpus; - local_device_id++) { - if (local_device_id == device_id) - continue; // aldready tried this device - - // Try hint localtion first yet again (though on different device) - if (location_hint) { - size_t location_id = location_hint.value() + local_device_id * number_instances; - std::lock_guard guard(instance()[location_id].mut); - if (instance()[location_id].buffer_map.find(memory_location) != - instance()[location_id].buffer_map.end()) { -#ifdef CPPUDDLE_HAVE_COUNTERS - instance()[location_id].number_deallocation++; - sum_number_deallocation++; -#endif - auto it = instance()[location_id].buffer_map.find(memory_location); - assert(it != instance()[location_id].buffer_map.end()); - auto &tuple = it->second; - // sanity checks: - assert(std::get<1>(tuple) == number_of_elements); - // move to the unused_buffer list - instance()[location_id].unused_buffer_list.push_front(tuple); - instance()[location_id].buffer_map.erase(memory_location); - return; // Success - } - } - // Failed - check all other localtions on device - for (size_t location_id = local_device_id * number_instances; - location_id < (local_device_id + 1) * number_instances; location_id++) { - if (location_hint) { - if (*location_hint + local_device_id * max_number_gpus == location_id) { - continue; // already tried this -> skip - } - } - std::lock_guard guard(instance()[location_id].mut); - if (instance()[location_id].buffer_map.find(memory_location) != - instance()[location_id].buffer_map.end()) { -#ifdef CPPUDDLE_HAVE_COUNTERS - instance()[location_id].number_deallocation++; - sum_number_deallocation++; -#endif - auto it = instance()[location_id].buffer_map.find(memory_location); - assert(it != instance()[location_id].buffer_map.end()); - auto &tuple = it->second; - // sanity checks: - assert(std::get<1>(tuple) == number_of_elements); - // move to the unused_buffer list - instance()[location_id].unused_buffer_list.push_front(tuple); - instance()[location_id].buffer_map.erase(memory_location); - return; // Success - } - } - } - // Buffer that is to be deleted is nowhere to be found - we looked everywhere! - // => - // Failure! Handle here... - - // TODO Throw exception instead in the futures, as soon as the recycler finalize is - // in all user codes - /* throw std::runtime_error("Tried to delete non-existing buffer"); */ - - // This is odd: Print warning -- however, might also happen with static - // buffers using these allocators IF the new finalize was not called. For - // now, print warning until all user-code is upgraded to the finalize method. - // This allows using current versions of cppuddle with older application code - std::cerr - << "Warning! Tried to delete non-existing buffer within CPPuddle!" - << std::endl; - std::cerr << "Did you forget to call recycler::finalize?" 
<< std::endl; - } - - private: - /// List with all buffers still in usage - std::unordered_map buffer_map{}; - /// List with all buffers currently not used - std::list unused_buffer_list{}; - /// Access control - mutex_t mut; -#ifdef CPPUDDLE_HAVE_COUNTERS - /// Performance counters - size_t number_allocation{0}, number_deallocation{0}, number_wrong_hints{0}, - number_recycling{0}, number_creation{0}, number_bad_alloc{0}; - - static inline std::atomic sum_number_allocation{0}, - sum_number_deallocation{0}, sum_number_wrong_hints{0}, - sum_number_wrong_device_hints{0}, sum_number_recycling{0}, - sum_number_creation{0}, sum_number_bad_allocs{0}; -#endif - /// default, private constructor - not automatically constructed due to - /// the deleted constructors - buffer_manager() = default; - buffer_manager& - operator=(buffer_manager const &other) = default; - buffer_manager& - operator=(buffer_manager &&other) = delete; - static std::unique_ptr& instance(void) { - static std::unique_ptr instances{ - new buffer_manager[number_instances * max_number_gpus]}; - return instances; - } - static void init_callbacks_once(void) { - assert(instance()); -#if defined(CPPUDDLE_HAVE_HPX) && defined(CPPUDDLE_HAVE_HPX_MUTEX) - static hpx::once_flag flag; - hpx::call_once(flag, []() { -#else - static std::once_flag flag; - std::call_once(flag, []() { -#endif - is_finalized = false; - buffer_recycler::add_total_cleanup_callback(clean); - buffer_recycler::add_partial_cleanup_callback( - clean_unused_buffers_only); - buffer_recycler::add_finalize_callback( - finalize); -#ifdef CPPUDDLE_HAVE_COUNTERS - buffer_recycler::add_print_callback( - print_performance_counters); -#endif - }); - } - static inline std::atomic is_finalized; - -#ifdef CPPUDDLE_HAVE_COUNTERS - void print_counters(void) { - if (number_allocation == 0) - return; - // Print performance counters - size_t number_cleaned = unused_buffer_list.size() + buffer_map.size(); - std::cout << "\nBuffer manager destructor for (Alloc: " - << boost::core::demangle(typeid(Host_Allocator).name()) << ", Type: " - << boost::core::demangle(typeid(T).name()) - << "):" << std::endl - << "--------------------------------------------------------------------" - << std::endl - << "--> Number of bad_allocs that triggered garbage " - "collection: " - << number_bad_alloc << std::endl - << "--> Number of buffers that got requested from this " - "manager: " - << number_allocation << std::endl - << "--> Number of times an unused buffer got recycled for a " - "request: " - << number_recycling << std::endl - << "--> Number of times a new buffer had to be created for a " - "request: " - << number_creation << std::endl - << "--> Number cleaned up buffers: " - " " - << number_cleaned << std::endl - << "--> Number wrong deallocation hints: " - " " - << number_wrong_hints << std::endl - << "--> Number of buffers that were marked as used upon " - "cleanup: " - << buffer_map.size() << std::endl - << "==> Recycle rate: " - " " - << static_cast(number_recycling) / number_allocation * - 100.0f - << "%" << std::endl; - } -#endif - - void clean_all_buffers(void) { -#ifdef CPPUDDLE_HAVE_COUNTERS - if (number_allocation == 0 && number_recycling == 0 && - number_bad_alloc == 0 && number_creation == 0 && - unused_buffer_list.empty() && buffer_map.empty()) { - return; - } -#endif - for (auto &buffer_tuple : unused_buffer_list) { - Host_Allocator alloc; - if (std::get<3>(buffer_tuple)) { - std::destroy_n(std::get<0>(buffer_tuple), std::get<1>(buffer_tuple)); - } - 
alloc.deallocate(std::get<0>(buffer_tuple), std::get<1>(buffer_tuple)); - } - for (auto &map_tuple : buffer_map) { - auto buffer_tuple = map_tuple.second; - Host_Allocator alloc; - if (std::get<3>(buffer_tuple)) { - std::destroy_n(std::get<0>(buffer_tuple), std::get<1>(buffer_tuple)); - } - alloc.deallocate(std::get<0>(buffer_tuple), std::get<1>(buffer_tuple)); - } - unused_buffer_list.clear(); - buffer_map.clear(); -#ifdef CPPUDDLE_HAVE_COUNTERS - number_allocation = 0; - number_recycling = 0; - number_bad_alloc = 0; - number_creation = 0; - number_wrong_hints = 0; -#endif - } - public: - ~buffer_manager() { - clean_all_buffers(); - } - - public: // Putting deleted constructors in public gives more useful error - // messages - // Bunch of constructors we don't need - buffer_manager( - buffer_manager const &other) = delete; - buffer_manager( - buffer_manager &&other) = delete; - }; - -public: - // Putting deleted constructors in public gives more useful error messages - // Bunch of constructors we don't need - buffer_recycler(buffer_recycler const &other) = delete; - buffer_recycler& operator=(buffer_recycler const &other) = delete; - buffer_recycler(buffer_recycler &&other) = delete; - buffer_recycler& operator=(buffer_recycler &&other) = delete; -}; - -template struct recycle_allocator { - using value_type = T; - using underlying_allocator_type = Host_Allocator; - static_assert(std::is_same_v); - const std::optional dealloc_hint; - const std::optional device_id; - -#ifndef CPPUDDLE_HAVE_HPX_AWARE_ALLOCATORS - recycle_allocator() noexcept - : dealloc_hint(std::nullopt), device_id(std::nullopt) {} - explicit recycle_allocator(size_t hint) noexcept - : dealloc_hint(std::nullopt), device_id(std::nullopt) {} - explicit recycle_allocator( - recycle_allocator const &other) noexcept - : dealloc_hint(std::nullopt), device_id(std::nullopt) {} - T *allocate(std::size_t n) { - T *data = buffer_recycler::get(n); - return data; - } - void deallocate(T *p, std::size_t n) { - buffer_recycler::mark_unused(p, n); - } -#else - recycle_allocator() noexcept - : dealloc_hint(hpx::get_worker_thread_num() % number_instances), device_id(0) {} - explicit recycle_allocator(const size_t device_id) noexcept - : dealloc_hint(hpx::get_worker_thread_num() % number_instances), device_id(device_id) {} - explicit recycle_allocator(const size_t device_i, const size_t location_id) noexcept - : dealloc_hint(location_id), device_id(device_id) {} - explicit recycle_allocator( - recycle_allocator const &other) noexcept - : dealloc_hint(other.dealloc_hint), device_id(other.device_id) {} - T *allocate(std::size_t n) { - T *data = buffer_recycler::get( - n, false, hpx::get_worker_thread_num() % number_instances, device_id); - return data; - } - void deallocate(T *p, std::size_t n) { - buffer_recycler::mark_unused(p, n, dealloc_hint, - device_id); - } -#endif - - template - inline void construct(T *p, Args... 
args) noexcept { - ::new (static_cast(p)) T(std::forward(args)...); - } - void destroy(T *p) { p->~T(); } -}; -template -constexpr bool -operator==(recycle_allocator const &, - recycle_allocator const &) noexcept { - if constexpr (std::is_same_v) - return true; - else - return false; -} -template -constexpr bool -operator!=(recycle_allocator const &, - recycle_allocator const &) noexcept { - if constexpr (std::is_same_v) - return false; - else - return true; -} - -/// Recycles not only allocations but also the contents of a buffer -template -struct aggressive_recycle_allocator { - using value_type = T; - using underlying_allocator_type = Host_Allocator; - static_assert(std::is_same_v); - const std::optional dealloc_hint; - const std::optional device_id; - -#ifndef CPPUDDLE_HAVE_HPX_AWARE_ALLOCATORS - aggressive_recycle_allocator() noexcept - : dealloc_hint(std::nullopt), device_id(std::nullopt) {} - explicit aggressive_recycle_allocator(size_t hint) noexcept - : dealloc_hint(std::nullopt), device_id(std::nullopt) {} - explicit aggressive_recycle_allocator( - aggressive_recycle_allocator const &) noexcept - : dealloc_hint(std::nullopt), device_id(std::nullopt) {} - T *allocate(std::size_t n) { - T *data = buffer_recycler::get( - n, true); // also initializes the buffer if it isn't reused - return data; - } - void deallocate(T *p, std::size_t n) { - buffer_recycler::mark_unused(p, n); - } -#else - aggressive_recycle_allocator() noexcept - : dealloc_hint(hpx::get_worker_thread_num() % number_instances), device_id(0) {} - explicit aggressive_recycle_allocator(const size_t device_id) noexcept - : dealloc_hint(hpx::get_worker_thread_num() % number_instances), device_id(device_id) {} - explicit aggressive_recycle_allocator(const size_t device_id, const size_t location_id) noexcept - : dealloc_hint(location_id), device_id(device_id) {} - explicit aggressive_recycle_allocator( - recycle_allocator const &other) noexcept - : dealloc_hint(other.dealloc_hint), device_id(other.device_id) {} - T *allocate(std::size_t n) { - T *data = buffer_recycler::get( - n, true, dealloc_hint, device_id); // also initializes the buffer - // if it isn't reused - return data; - } - void deallocate(T *p, std::size_t n) { - buffer_recycler::mark_unused(p, n, dealloc_hint, - device_id); - } -#endif - -#ifndef CPPUDDLE_DEACTIVATE_AGGRESSIVE_ALLOCATORS - template - inline void construct(T *p, Args... args) noexcept { - // Do nothing here - we reuse the content of the last owner - } - void destroy(T *p) { - // Do nothing here - Contents will be destroyed when the buffer manager is - // destroyed, not before - } -#else -// Warn about suboptimal performance without recycling -#pragma message \ -"Warning: Building without content reusage for aggressive allocators! \ -For better performance configure with CPPUDDLE_WITH_AGGRESSIVE_CONTENT_RECYCLING=ON !" - template - inline void construct(T *p, Args... 
args) noexcept { - ::new (static_cast(p)) T(std::forward(args)...); - } - void destroy(T *p) { p->~T(); } -#endif -}; - -template -constexpr bool -operator==(aggressive_recycle_allocator const &, - aggressive_recycle_allocator const &) noexcept { - if constexpr (std::is_same_v) - return true; - else - return false; +using buffer_recycler [[deprecated( + "Use buffer_interface from header " + "cppuddle/memory_recycling/detail/buffer_management.hpp instead")]] = + cppuddle::memory_recycling::detail::buffer_interface; } -template -constexpr bool -operator!=(aggressive_recycle_allocator const &, - aggressive_recycle_allocator const &) noexcept { - if constexpr (std::is_same_v) - return false; - else - return true; -} - -} // namespace detail template ::value, int> = 0> -using recycle_std = detail::recycle_allocator>; +using recycle_std + [[deprecated("Use from header std_recycling_allocators.hpp instead")]] = + cppuddle::memory_recycling::recycle_std; + template ::value, int> = 0> -using aggressive_recycle_std = - detail::aggressive_recycle_allocator>; +using aggressive_recycle_std + [[deprecated("Use from header std_recycling_allocators.hpp instead")]] = + cppuddle::memory_recycling::aggressive_recycle_std; -inline void print_performance_counters() { detail::buffer_recycler::print_performance_counters(); } +[[deprecated("Use cppuddle::memory_recycling::print_buffer_counters() instead")]] +inline void print_performance_counters() { + cppuddle::memory_recycling::print_buffer_counters(); +} /// Deletes all buffers (even ones still marked as used), delete the buffer /// managers and the recycler itself -inline void force_cleanup() { detail::buffer_recycler::clean_all(); } +[[deprecated("Use cppuddle::memory_recycling::force_buffer_cleanup() instead")]] +inline void force_cleanup() { cppuddle::memory_recycling::force_buffer_cleanup(); } /// Deletes all buffers currently marked as unused -inline void cleanup() { detail::buffer_recycler::clean_unused_buffers(); } +[[deprecated("Use cppuddle::memory_recycling::unused_buffer_cleanup() instead")]] +inline void cleanup() { cppuddle::memory_recycling::unused_buffer_cleanup(); } /// Deletes all buffers (even ones still marked as used), delete the buffer /// managers and the recycler itself. Disallows further usage. -inline void finalize() { detail::buffer_recycler::finalize(); } +[[deprecated("Use cppuddle::memory_recycling::finalize() instead")]] +inline void finalize() { cppuddle::memory_recycling::finalize(); } + +[[deprecated("Use cppuddle::max_number_gpus instead")]] constexpr auto max_number_gpus = + cppuddle::max_number_gpus; +[[deprecated("Use cppuddle::number_instances instead")]] constexpr auto number_instances = + cppuddle::number_instances; -} // end namespace recycler +} // namespace recycler #endif diff --git a/include/detail/config.hpp b/include/cppuddle/common/config.hpp similarity index 95% rename from include/detail/config.hpp rename to include/cppuddle/common/config.hpp index 2a06b1af..c9a5f736 100644 --- a/include/detail/config.hpp +++ b/include/cppuddle/common/config.hpp @@ -1,4 +1,4 @@ -// Copyright (c) 2023-2023 Gregor Daiß +// Copyright (c) 2023-2024 Gregor Daiß // // Distributed under the Boost Software License, Version 1.0. 
(See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) @@ -28,7 +28,7 @@ For better performance configure CPPuddle with CPPUDDLE_WITH_HPX_AWARE_ALLOCATOR #endif #endif -namespace recycler { +namespace cppuddle { #if defined(CPPUDDLE_HAVE_HPX) && defined(CPPUDDLE_HAVE_HPX_MUTEX) using mutex_t = hpx::spinlock_no_backoff; @@ -67,6 +67,6 @@ inline size_t get_device_id(const size_t number_gpus) { #endif } -} // end namespace recycler +} // end namespace cppuddle #endif diff --git a/include/cppuddle/executor_recycling/detail/executor_pools_management.hpp b/include/cppuddle/executor_recycling/detail/executor_pools_management.hpp new file mode 100644 index 00000000..6a89025b --- /dev/null +++ b/include/cppuddle/executor_recycling/detail/executor_pools_management.hpp @@ -0,0 +1,421 @@ +// Copyright (c) 2020-2024 Gregor Daiß +// +// Distributed under the Boost Software License, Version 1.0. (See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#ifndef EXECUTOR_POOLS_MANAGEMENT_HPP +#define EXECUTOR_POOLS_MANAGEMENT_HPP + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cppuddle/common/config.hpp" + +// Need to cuda/hip definitions for default params when NOT +// drawing from an executor pool +#if defined(CPPUDDLE_DEACTIVATE_EXECUTOR_RECYCLING) +#include +#if defined(HPX_HAVE_CUDA) || defined(HPX_HAVE_HIP) +#include +#endif +#endif + +// Redefintion required for non-recycling executors +// Without it, default constructing the executors (independent) would not work +#if defined(CPPUDDLE_DEACTIVATE_EXECUTOR_RECYCLING) +// Do only define if Kokkos is not found +#ifndef KOKKOS_ENABLE_SERIAL +namespace hpx { namespace kokkos { +enum class execution_space_mode { global, independent }; +}} +#endif +#endif + +namespace cppuddle { +namespace executor_recycling { +namespace detail { + +/// Turns a std::array_mutex into an scoped lock +template +auto make_scoped_lock_from_array(mutex_array_t& mutexes) +{ + return std::apply([](auto&... mutexes) { return std::scoped_lock{mutexes...}; }, + mutexes); +} + +template class round_robin_pool_impl { +private: + std::deque pool{}; + std::vector ref_counters{}; + size_t current_interface{0}; + +public: + template + round_robin_pool_impl(size_t number_of_executors, Ts... 
executor_args) { + ref_counters.reserve(number_of_executors); + for (int i = 0; i < number_of_executors; i++) { + pool.emplace_back(executor_args...); + ref_counters.emplace_back(0); + } + } + // return a tuple with the interface and its index (to release it later) + std::tuple get_interface() { + assert(!(pool.empty())); + size_t last_interface = current_interface; + current_interface = (current_interface + 1) % pool.size(); + ref_counters[last_interface]++; + std::tuple ret(pool[last_interface], last_interface); + return ret; + } + void release_interface(size_t index) { ref_counters[index]--; } + bool interface_available(size_t load_limit) { + return *(std::min_element(std::begin(ref_counters), + std::end(ref_counters))) < load_limit; + } + size_t get_current_load() { + return *( + std::min_element(std::begin(ref_counters), std::end(ref_counters))); + } + // TODO Remove + /* size_t get_next_device_id() { */ + /* return 0; // single gpu pool */ + /* } */ +}; + +template class priority_pool_impl { +private: + std::deque pool{}; + std::vector ref_counters{}; // Ref counters + std::vector priorities{}; // Ref counters +public: + template + priority_pool_impl(size_t number_of_executors, Ts... executor_args) { + ref_counters.reserve(number_of_executors); + priorities.reserve(number_of_executors); + for (auto i = 0; i < number_of_executors; i++) { + pool.emplace_back(executor_args...); + ref_counters.emplace_back(0); + priorities.emplace_back(i); + } + } + // return a tuple with the interface and its index (to release it later) + std::tuple get_interface() { + auto &interface = pool[priorities[0]]; + ref_counters[priorities[0]]++; + std::tuple ret(interface, priorities[0]); + std::make_heap(std::begin(priorities), std::end(priorities), + [this](const size_t &first, const size_t &second) -> bool { + return ref_counters[first] > ref_counters[second]; + }); + return ret; + } + void release_interface(size_t index) { + ref_counters[index]--; + std::make_heap(std::begin(priorities), std::end(priorities), + [this](const size_t &first, const size_t &second) -> bool { + return ref_counters[first] > ref_counters[second]; + }); + } + bool interface_available(size_t load_limit) { + return ref_counters[priorities[0]] < load_limit; + } + size_t get_current_load() { return ref_counters[priorities[0]]; } + // TODO remove + /* size_t get_next_device_id() { */ + /* return 0; // single gpu pool */ + /* } */ +}; + +/// Access/Concurrency Control for executor pool implementation +class executor_pool { +public: + template + static void init(size_t number_of_executors, Ts ... executor_args) { + executor_pool_implementation::init(number_of_executors, + executor_args...); + } + template + static void init_all_executor_pools(size_t number_of_executors, Ts ... executor_args) { + executor_pool_implementation::init_all_executor_pools(number_of_executors, + executor_args...); + } + template + static void init_executor_pool(size_t pool_id, size_t number_of_executors, Ts ... 
executor_args) { + executor_pool_implementation::init_executor_pool(pool_id, number_of_executors, + executor_args...); + } + template static void cleanup() { + executor_pool_implementation::cleanup(); + } + template + static std::tuple get_interface(const size_t gpu_id) { + return executor_pool_implementation::get_interface(gpu_id); + } + template + static void release_interface(size_t index, const size_t gpu_id) noexcept { + executor_pool_implementation::release_interface(index, + gpu_id); + } + template + static bool interface_available(size_t load_limit, const size_t gpu_id) noexcept { + return executor_pool_implementation::interface_available( + load_limit, gpu_id); + } + template + static size_t get_current_load(const size_t gpu_id = 0) noexcept { + return executor_pool_implementation::get_current_load( + gpu_id); + } + template + static size_t get_next_device_id(const size_t number_gpus) noexcept { + // TODO add round robin and min strategy + return cppuddle::get_device_id(number_gpus); + } + + template + static void set_device_selector(std::function select_gpu_function) { + executor_pool_implementation::set_device_selector(select_gpu_function); + } + + template + static void select_device(size_t gpu_id) { + executor_pool_implementation::select_device(gpu_id); + } + +private: + executor_pool() = default; + +private: + template class executor_pool_implementation { + public: + /// Deprecated! Use init_on_all_gpu or init_on_gpu + template + static void init(size_t number_of_executors, Ts ... executor_args) { + /* static_assert(sizeof...(Ts) == sizeof...(Ts) && cppuddle::max_number_gpus == 1, */ + /* "deprecated executor_pool::init does not support multigpu"); */ + auto guard = make_scoped_lock_from_array(instance().gpu_mutexes); + instance().executorpools.emplace_back(number_of_executors, executor_args...); + assert(instance().executorpools.size() <= cppuddle::max_number_gpus); + } + + /// Multi-GPU init where executors / interfaces on all GPUs are initialized with the same arguments + template + static void init_all_executor_pools(size_t number_of_executors, Ts ... executor_args) { + auto guard = make_scoped_lock_from_array(instance().gpu_mutexes); + if (number_of_executors > 0) { + for (size_t gpu_id = 0; gpu_id < cppuddle::max_number_gpus; gpu_id++) { + instance().select_gpu_function(gpu_id); + instance().executorpools.emplace_back(number_of_executors, + executor_args...); + } + } + assert(instance().executorpools.size() <= cppuddle::max_number_gpus); + } + + /// Per-GPU init allowing for different init parameters depending on the GPU + /// (useful for executor that expect an GPU-id during construction) + template + static void init_executor_pool(size_t gpu_id, size_t number_of_executors, Ts ... executor_args) { + auto guard = make_scoped_lock_from_array(instance().gpu_mutexes); + if (number_of_executors > 0) { + instance().select_gpu_function(gpu_id); + instance().executorpools.emplace_back(number_of_executors, + executor_args...); + } + assert(instance().executorpools.size() <= cppuddle::max_number_gpus); + } + + // TODO add/rename into finalize? 
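+    // Hypothetical usage sketch (not part of this header; names such as
+    // my_executor_t and my_kernel are assumptions): user code would typically
+    // initialize the pools once at startup and then draw executors through the
+    // RAII executor_interface defined further below, e.g.
+    //
+    //   using pool_t = round_robin_pool_impl<my_executor_t>;
+    //   executor_pool::init_all_executor_pools<my_executor_t, pool_t>(
+    //       32 /* executors per GPU */);
+    //   {
+    //     executor_interface<my_executor_t, pool_t> exec(0 /* gpu_id */);
+    //     exec.async_execute(my_kernel, my_args).get();
+    //   } // executor is released back to its pool here
+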
+ static void cleanup() { + auto guard = make_scoped_lock_from_array(instance().gpu_mutexes); + assert(instance().executorpools.size() == cppuddle::max_number_gpus); + instance().executorpools.clear(); + } + + static std::tuple get_interface(const size_t gpu_id = 0) { + std::lock_guard guard(instance().gpu_mutexes[gpu_id]); + assert(gpu_id < instance().executorpools.size()); + return instance().executorpools[gpu_id].get_interface(); + } + static void release_interface(size_t index, const size_t gpu_id = 0) { + std::lock_guard guard(instance().gpu_mutexes[gpu_id]); + assert(gpu_id < instance().executorpools.size()); + instance().executorpools[gpu_id].release_interface(index); + } + static bool interface_available(size_t load_limit, const size_t gpu_id = 0) { + std::lock_guard guard(instance().gpu_mutexes[gpu_id]); + assert(gpu_id < instance().executorpools.size()); + return instance().executorpools[gpu_id].interface_available(load_limit); + } + static size_t get_current_load(const size_t gpu_id = 0) { + std::lock_guard guard(instance().gpu_mutexes[gpu_id]); + assert(gpu_id < instance().executorpools.size()); + return instance().executorpools[gpu_id].get_current_load(); + } + // TODO deprecated! Remove... + /* static size_t get_next_device_id(const size_t gpu_id = 0) { */ + /* std::lock_guard guard(instance().gpu_mutexes[gpu_id]); */ + /* assert(instance().executorpools.size() == cppuddle::max_number_gpus); */ + /* return instance().executorpools[gpu_id].get_next_device_id(); */ + /* } */ + + static void set_device_selector(std::function select_gpu_function) { + auto guard = make_scoped_lock_from_array(instance().gpu_mutexes); + instance().select_gpu_function = select_gpu_function; + } + + static void select_device(size_t gpu_id) { + instance().select_gpu_function(gpu_id); + } + + private: + executor_pool_implementation() = default; + cppuddle::mutex_t pool_mut{}; + std::function select_gpu_function = [](size_t gpu_id) { + // By default no multi gpu support + assert(cppuddle::max_number_gpus == 1 || instance().executorpools.size() == 1); + assert(gpu_id == 0); + }; + + std::deque executorpools{}; + std::array gpu_mutexes; + + static executor_pool_implementation& instance(void) { + static executor_pool_implementation pool_instance{}; + return pool_instance; + } + + public: + ~executor_pool_implementation() = default; + // Bunch of constructors we don't need + executor_pool_implementation(executor_pool_implementation const &other) = + delete; + executor_pool_implementation & + operator=(executor_pool_implementation const &other) = delete; + executor_pool_implementation(executor_pool_implementation &&other) = delete; + executor_pool_implementation & + operator=(executor_pool_implementation &&other) = delete; + }; + +public: + ~executor_pool() = default; + // Bunch of constructors we don't need + executor_pool(executor_pool const &other) = delete; + executor_pool &operator=(executor_pool const &other) = delete; + executor_pool(executor_pool &&other) = delete; + executor_pool &operator=(executor_pool &&other) = delete; +}; + +#if defined(CPPUDDLE_DEACTIVATE_EXECUTOR_RECYCLING) + +// Warn about suboptimal performance without recycling +#pragma message \ +"Warning: Building without executor recycling! Use only for performance testing! \ +For better performance configure CPPuddle with CPPUDDLE_WITH_EXECUTOR_RECYCLING=ON!" + +/// Slow version of the executor_interface that does not draw its +/// executors (Interface) from the pool but creates them instead. 
+/// Only meant for performance comparisons and only works with cuda/kokkos executors +template class executor_interface { +public: + + template + explicit executor_interface(size_t gpu_id, + std::enable_if_t::value, size_t> = 0) + : gpu_id(gpu_id), interface(gpu_id) {} + template + explicit executor_interface(std::enable_if_t::value, size_t> = 0) + : gpu_id(gpu_id), interface(hpx::kokkos::execution_space_mode::independent) {} + + executor_interface(const executor_interface &other) = delete; + executor_interface &operator=(const executor_interface &other) = delete; + executor_interface(executor_interface &&other) = delete; + executor_interface &operator=(executor_interface &&other) = delete; + ~executor_interface() { + } + + template + inline decltype(auto) post(F &&f, Ts &&... ts) { + return interface.post(std::forward(f), std::forward(ts)...); + } + + template + inline decltype(auto) async_execute(F &&f, Ts &&... ts) { + return interface.async_execute(std::forward(f), std::forward(ts)...); + } + + inline decltype(auto) get_future() { + return interface.get_future(); + } + + // allow implict conversion + operator Interface &() { // NOLINT + return interface; + } + +private: + size_t gpu_id; + +public: + Interface interface; +}; +#else +/// Stream interface for RAII purposes +/// Draws executor from the executor pool and releases it upon +/// destruction +template class executor_interface { +public: + explicit executor_interface(size_t gpu_id) + : t(executor_pool::get_interface(gpu_id)), + interface(std::get<0>(t)), interface_index(std::get<1>(t)), gpu_id(gpu_id) {} + + executor_interface(const executor_interface &other) = delete; + executor_interface &operator=(const executor_interface &other) = delete; + executor_interface(executor_interface &&other) = delete; + executor_interface &operator=(executor_interface &&other) = delete; + ~executor_interface() { + executor_pool::release_interface(interface_index, gpu_id); + } + + template + inline decltype(auto) post(F &&f, Ts &&... ts) { + return interface.post(std::forward(f), std::forward(ts)...); + } + + template + inline decltype(auto) async_execute(F &&f, Ts &&... ts) { + return interface.async_execute(std::forward(f), std::forward(ts)...); + } + + inline decltype(auto) get_future() { + return interface.get_future(); + } + + // allow implict conversion + operator Interface &() { // NOLINT + return interface; + } + +private: + std::tuple t; + size_t interface_index; + size_t gpu_id; + +public: + Interface &interface; +}; +#endif + +} // namespace detail +} // namespace executor_recycling +} // namespace cppuddle + +#endif diff --git a/include/cppuddle/executor_recycling/executor_pools_interface.hpp b/include/cppuddle/executor_recycling/executor_pools_interface.hpp new file mode 100644 index 00000000..49a6d42d --- /dev/null +++ b/include/cppuddle/executor_recycling/executor_pools_interface.hpp @@ -0,0 +1,32 @@ +// Copyright (c) 2024 Gregor Daiß +// +// Distributed under the Boost Software License, Version 1.0. 
(See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#ifndef EXECUTOR_POOLS_INTERFACE_HPP +#define EXECUTOR_POOLS_INTERFACE_HPP + +#include "cppuddle/executor_recycling/detail/executor_pools_management.hpp" + +namespace cppuddle { +namespace executor_recycling { + +template +using round_robin_pool_impl = + detail::round_robin_pool_impl; + +template +using priority_pool_impl = + detail::priority_pool_impl; + +using executor_pool = + detail::executor_pool; + +template +using executor_interface = + detail::executor_interface; + +} // end namespace executor_recycling +} // end namespace cppuddle + +#endif diff --git a/include/cppuddle/kernel_aggregation/detail/aggregation_executor_pools.hpp b/include/cppuddle/kernel_aggregation/detail/aggregation_executor_pools.hpp new file mode 100644 index 00000000..b9d456cc --- /dev/null +++ b/include/cppuddle/kernel_aggregation/detail/aggregation_executor_pools.hpp @@ -0,0 +1,134 @@ +// Copyright (c) 2022-2024 Gregor Daiß +// +// Distributed under the Boost Software License, Version 1.0. (See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include "cppuddle/kernel_aggregation/detail/aggregation_executors_and_allocators.hpp" + +#ifndef AGGREGATION_EXECUTOR_POOL_HPP +#define AGGREGATION_EXECUTOR_POOL_HPP + +namespace cppuddle { +namespace kernel_aggregation { +namespace detail { + +template +class aggregation_pool { +public: + /// interface + template + static void init(size_t number_of_executors, size_t slices_per_executor, + aggregated_executor_modes mode, size_t num_devices = 1) { + if (is_initialized) { + throw std::runtime_error( + std::string("Trying to initialize cppuddle aggregation pool twice") + + " Agg pool name: " + std::string(kernelname)); + } + if (num_devices > cppuddle::max_number_gpus) { + throw std::runtime_error( + std::string( + "Trying to initialize aggregation with more devices than the " + "maximum number of GPUs given at compiletime") + + " Agg pool name: " + std::string(kernelname)); + } + number_devices = num_devices; + for (size_t gpu_id = 0; gpu_id < number_devices; gpu_id++) { + + std::lock_guard guard(instance()[gpu_id].pool_mutex); + assert(instance()[gpu_id].aggregation_executor_pool.empty()); + for (int i = 0; i < number_of_executors; i++) { + instance()[gpu_id].aggregation_executor_pool.emplace_back(slices_per_executor, + mode, gpu_id); + } + instance()[gpu_id].slices_per_executor = slices_per_executor; + instance()[gpu_id].mode = mode; + } + is_initialized = true; + } + + /// Will always return a valid executor slice + static decltype(auto) request_executor_slice(void) { + if (!is_initialized) { + throw std::runtime_error( + std::string("Trying to use cppuddle aggregation pool without first calling init") + + " Agg poolname: " + std::string(kernelname)); + } + const size_t gpu_id = cppuddle::get_device_id(number_devices); + /* const size_t gpu_id = 1; */ + std::lock_guard guard(instance()[gpu_id].pool_mutex); + assert(!instance()[gpu_id].aggregation_executor_pool.empty()); + std::optional::executor_slice>> + ret; + size_t local_id = (instance()[gpu_id].current_interface) % + instance()[gpu_id].aggregation_executor_pool.size(); + ret = instance()[gpu_id].aggregation_executor_pool[local_id].request_executor_slice(); + // Expected case: current aggregation executor is free + if (ret.has_value()) { + return ret; + } + // current interface is bad -> find free one + size_t abort_counter = 0; + const size_t abort_number = 
instance()[gpu_id].aggregation_executor_pool.size() + 1; + do { + local_id = (++(instance()[gpu_id].current_interface)) % // increment interface + instance()[gpu_id].aggregation_executor_pool.size(); + ret = + instance()[gpu_id].aggregation_executor_pool[local_id].request_executor_slice(); + if (ret.has_value()) { + return ret; + } + abort_counter++; + } while (abort_counter <= abort_number); + // Everything's busy -> create new aggregation executor (growing pool) OR + // return empty optional + if (instance()[gpu_id].growing_pool) { + instance()[gpu_id].aggregation_executor_pool.emplace_back( + instance()[gpu_id].slices_per_executor, instance()[gpu_id].mode, gpu_id); + instance()[gpu_id].current_interface = + instance()[gpu_id].aggregation_executor_pool.size() - 1; + assert(instance()[gpu_id].aggregation_executor_pool.size() < 20480); + ret = instance()[gpu_id] + .aggregation_executor_pool[instance()[gpu_id].current_interface] + .request_executor_slice(); + assert(ret.has_value()); // fresh executor -- should always have slices + // available + } + return ret; + } + +private: + std::deque> aggregation_executor_pool; + std::atomic current_interface{0}; + size_t slices_per_executor; + aggregated_executor_modes mode; + bool growing_pool{true}; + +private: + /// Required for dealing with adding elements to the deque of + /// aggregated_executors + aggregation_mutex_t pool_mutex; + /// Global access instance + static std::unique_ptr& instance(void) { + static std::unique_ptr pool_instances{ + new aggregation_pool[cppuddle::max_number_gpus]}; + return pool_instances; + } + static inline size_t number_devices = 1; + static inline bool is_initialized = false; + aggregation_pool() = default; + +public: + ~aggregation_pool() = default; + // Bunch of constructors we don't need + aggregation_pool(aggregation_pool const &other) = delete; + aggregation_pool &operator=(aggregation_pool const &other) = delete; + aggregation_pool(aggregation_pool &&other) = delete; + aggregation_pool &operator=(aggregation_pool &&other) = delete; +}; + +} // namespace detail +} // namespace kernel_aggregation +} // namespace cppuddle + +#endif diff --git a/include/cppuddle/kernel_aggregation/detail/aggregation_executors_and_allocators.hpp b/include/cppuddle/kernel_aggregation/detail/aggregation_executors_and_allocators.hpp new file mode 100644 index 00000000..dfc76622 --- /dev/null +++ b/include/cppuddle/kernel_aggregation/detail/aggregation_executors_and_allocators.hpp @@ -0,0 +1,1061 @@ +// Copyright (c) 2022-2024 Gregor Daiß +// +// Distributed under the Boost Software License, Version 1.0. (See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#ifndef AGGREGATION_EXECUTOR_AND_ALLOCATOR_HPP +#define AGGREGATION_EXECUTOR_AND_ALLOCATOR_HPP + +#ifndef CPPUDDLE_HAVE_HPX +#error "Work aggregation allocators/executors require CPPUDDLE_WITH_HPX=ON" +#endif + +#include +// When defined, CPPuddle will run more checks +// about the order of aggregated method calls. 
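+// (When the checks are active, aggregation slices that pass mismatching
+// argument types or values to the same aggregated call are reported at
+// runtime.)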
+// Best defined before including this header when needed +// (hence commented out here) +//#define DEBUG_AGGREGATION_CALLS 1 + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#if defined(HPX_HAVE_CUDA) || defined(HPX_HAVE_HIP) +// required for defining type traits using cuda executor as underlying +// aggregation executors +#include +#endif + +#include +#include + +#include "cppuddle/common/config.hpp" +// get direct access to the buffer manangment +#include "cppuddle/memory_recycling/detail/buffer_management.hpp" +// get normal access to the executor pools +#include "cppuddle/executor_recycling/executor_pools_interface.hpp"" + +#ifndef CPPUDDLE_HAVE_HPX_MUTEX +#pragma message \ + "Work aggregation will use hpx::mutex internally, despite CPPUDDLE_WITH_HPX_MUTEX=OFF" +#pragma message \ + "Consider using CPPUDDLE_WITH_HPX_MUTEX=ON, to make the rest of CPPuddle also use hpx::mutex" +#endif +namespace cppuddle { +namespace kernel_aggregation { +namespace detail { + using aggregation_mutex_t = hpx::mutex; + +//=============================================================================== +//=============================================================================== +// Helper functions/classes + +/// Constructs a tuple with copies (to store temporaries in aggregated function +/// calls) yet also supporting references (on the users own risk...) +template +std::tuple make_tuple_supporting_references(Ts &&...ts) { + return std::tuple{std::forward(ts)...}; +} + +/// Print some specific values that we can, but don't bother for most types +/// (such as vector) +template std::string print_if_possible(T val) { + if constexpr (std::is_convertible_v) { + return val; + } else if constexpr (std::is_integral_v || std::is_floating_point_v) { + return std::to_string(val); + } else if constexpr (std::is_pointer_v) { + // Pretty printing pointer sort of only works well with %p + // TODO Try using std::format as soon as we can move to C++20 + std::unique_ptr debug_string(new char[128]()); + snprintf(debug_string.get(), 128, "%p", val); + return std::string(debug_string.get()); + } else { + return std::string("cannot print value"); + } +} + +/// Helper class for the helper class that prints tuples -- do not use this +/// directly +template +void print_tuple(const TupType &_tup, std::index_sequence) { + (..., (hpx::cout << (I == 0 ? "" : ", ") + << print_if_possible(std::get(_tup)))); +} + +/// Helper class for printing tuples (first component should be a function +/// pointer, remaining components the function arguments) +template void print_tuple(const std::tuple &_tup) { + // Use pointer and sprintf as boost::format refused to NOT cast the pointer + // address to 1... 
+ // TODO Try using std::format as soon as we can move to C++20 + std::unique_ptr debug_string(new char[128]()); + snprintf(debug_string.get(), 128, "Function address: %p -- Arguments: (", + std::get<0>(_tup)); + hpx::cout << debug_string.get(); + print_tuple(_tup, std::make_index_sequence()); + hpx::cout << ")"; +} + +//=============================================================================== +//=============================================================================== +template +void exec_post_wrapper(Executor & exec, F &&f, Ts &&...ts) { + hpx::apply(exec, std::forward(f), std::forward(ts)...); +} + +template +hpx::lcos::future exec_async_wrapper(Executor & exec, F &&f, Ts &&...ts) { + return hpx::async(exec, std::forward(f), std::forward(ts)...); +} + +/// Manages the launch conditions for aggregated function calls +/// type/value-errors +/** Launch conditions: All slice executors must have called the same function + * (tracked by future all_slices_ready) + * AND + * Previous aggregated_function_call on the same Executor must have been + * launched (tracked by future stream_future) + * All function calls received from the slice executors are checked if they + * match the first one in both types and values (throws exception otherwise) + */ + +template class aggregated_function_call { +private: + std::atomic slice_counter = 0; + + /// Promise to be set when all slices have visited this function call + /* hpx::lcos::local::promise slices_ready_promise; */ + /// Tracks if all slices have visited this function call + /* hpx::lcos::future all_slices_ready = slices_ready_promise.get_future(); */ + /// How many slices can we expect? + const size_t number_slices; + const bool async_mode; + + Executor &underlying_executor; + +#if !(defined(NDEBUG)) && defined(DEBUG_AGGREGATION_CALLS) +#pragma message \ + "Building slow work aggegator build with additional runtime checks! Build with NDEBUG defined for fast build..." 
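+  // Note: this block is only compiled when NDEBUG is not set and
+  // DEBUG_AGGREGATION_CALLS is defined before this header is included,
+  // e.g. (hypothetical user code):
+  //
+  //   #define DEBUG_AGGREGATION_CALLS 1
+  //   #include "cppuddle/kernel_aggregation/detail/aggregation_executors_and_allocators.hpp"
+  //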
+ /// Stores the function call of the first slice as reference for error + /// checking + std::any function_tuple; + /// Stores the string of the first function call for debug output + std::string debug_type_information; + aggregation_mutex_t debug_mut; +#endif + + std::vector> potential_async_promises{}; + +public: + aggregated_function_call(const size_t number_slices, bool async_mode, Executor &exec) + : number_slices(number_slices), async_mode(async_mode), underlying_executor(exec) { + if (async_mode) + potential_async_promises.resize(number_slices); + } + ~aggregated_function_call(void) { + // All slices should have done this call + assert(slice_counter == number_slices); + // assert(!all_slices_ready.valid()); + } + /// Returns true if all required slices have visited this point + bool sync_aggregation_slices(hpx::lcos::future &stream_future) { + assert(!async_mode); + assert(potential_async_promises.empty()); + const size_t local_counter = slice_counter++; + if (local_counter == number_slices - 1) { + return true; + } + else return false; + } + template + void post_when(hpx::lcos::future &stream_future, F &&f, Ts &&...ts) { +#if !(defined(NDEBUG)) && defined(DEBUG_AGGREGATION_CALLS) + // needed for concurrent access to function_tuple and debug_type_information + // Not required for normal use + std::lock_guard guard(debug_mut); +#endif + assert(!async_mode); + assert(potential_async_promises.empty()); + const size_t local_counter = slice_counter++; + + if (local_counter == 0) { +#if !(defined(NDEBUG)) && defined(DEBUG_AGGREGATION_CALLS) + auto tmp_tuple = + make_tuple_supporting_references(f, std::forward(ts)...); + function_tuple = tmp_tuple; + debug_type_information = typeid(decltype(tmp_tuple)).name(); +#endif + + } else { + // + // This scope checks if both the type and the values of the current call + // match the original call To be used in debug build... 
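+      // (Hypothetical example: if slice 0 posted my_kernel(buf, 1.0) and a
+      // later slice posts my_kernel(buf, 2.0), the value comparison below
+      // throws and the mismatch is printed.)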
+ // +#if !(defined(NDEBUG)) && defined(DEBUG_AGGREGATION_CALLS) + auto comparison_tuple = + make_tuple_supporting_references(f, std::forward(ts)...); + try { + auto orig_call_tuple = + std::any_cast(function_tuple); + if (comparison_tuple != orig_call_tuple) { + throw std::runtime_error( + "Values of post function arguments (or function " + "itself) do not match "); + } + } catch (const std::bad_any_cast &e) { + hpx::cout + << "\nMismatched types error in aggregated post call of executor " + << ": " << e.what() << "\n"; + hpx::cout << "Expected types:\t\t " + << boost::core::demangle(debug_type_information.c_str()); + hpx::cout << "\nGot types:\t\t " + << boost::core::demangle( + typeid(decltype(comparison_tuple)).name()) + << "\n" + << std::endl; + // throw; + } catch (const std::runtime_error &e) { + hpx::cout + << "\nMismatched values error in aggregated post call of executor " + << ": " << e.what() << std::endl; + hpx::cout << "Types (matched):\t " + << boost::core::demangle(debug_type_information.c_str()); + auto orig_call_tuple = + std::any_cast(function_tuple); + hpx::cout << "\nExpected values:\t "; + print_tuple(orig_call_tuple); + hpx::cout << "\nGot values:\t\t "; + print_tuple(comparison_tuple); + hpx::cout << std::endl << std::endl; + // throw; + } +#endif + } + assert(local_counter < number_slices); + assert(slice_counter < number_slices + 1); + // Check exit criteria: Launch function call continuation by setting the + // slices promise + if (local_counter == number_slices - 1) { + exec_post_wrapper(underlying_executor, std::forward(f), std::forward(ts)...); + //slices_ready_promise.set_value(); + } + } + template + hpx::lcos::future async_when(hpx::lcos::future &stream_future, + F &&f, Ts &&...ts) { +#if !(defined(NDEBUG)) && defined(DEBUG_AGGREGATION_CALLS) + // needed for concurrent access to function_tuple and debug_type_information + // Not required for normal use + std::lock_guard guard(debug_mut); +#endif + assert(async_mode); + assert(!potential_async_promises.empty()); + const size_t local_counter = slice_counter++; + if (local_counter == 0) { +#if !(defined(NDEBUG)) && defined(DEBUG_AGGREGATION_CALLS) + auto tmp_tuple = + make_tuple_supporting_references(f, std::forward(ts)...); + function_tuple = tmp_tuple; + debug_type_information = typeid(decltype(tmp_tuple)).name(); +#endif + } else { + // + // This scope checks if both the type and the values of the current call + // match the original call To be used in debug build... 
+ // +#if !(defined(NDEBUG)) && defined(DEBUG_AGGREGATION_CALLS) + auto comparison_tuple = + make_tuple_supporting_references(f, std::forward(ts)...); + try { + auto orig_call_tuple = + std::any_cast(function_tuple); + if (comparison_tuple != orig_call_tuple) { + throw std::runtime_error( + "Values of async function arguments (or function " + "itself) do not match "); + } + } catch (const std::bad_any_cast &e) { + hpx::cout + << "\nMismatched types error in aggregated async call of executor " + << ": " << e.what() << "\n"; + hpx::cout << "Expected types:\t\t " + << boost::core::demangle(debug_type_information.c_str()); + hpx::cout << "\nGot types:\t\t " + << boost::core::demangle( + typeid(decltype(comparison_tuple)).name()) + << "\n" + << std::endl; + // throw; + } catch (const std::runtime_error &e) { + hpx::cout + << "\nMismatched values error in aggregated async call of executor " + << ": " << e.what() << std::endl; + hpx::cout << "Types (matched):\t " + << boost::core::demangle(debug_type_information.c_str()); + auto orig_call_tuple = + std::any_cast(function_tuple); + hpx::cout << "\nExpected values:\t "; + print_tuple(orig_call_tuple); + hpx::cout << "\nGot values:\t\t "; + print_tuple(comparison_tuple); + hpx::cout << std::endl << std::endl; + // throw; + } +#endif + } + assert(local_counter < number_slices); + assert(slice_counter < number_slices + 1); + assert(potential_async_promises.size() == number_slices); + hpx::lcos::future ret_fut = + potential_async_promises[local_counter].get_future(); + if (local_counter == number_slices - 1) { + /* slices_ready_promise.set_value(); */ + auto fut = exec_async_wrapper( + underlying_executor, std::forward(f), std::forward(ts)...); + fut.then([this](auto &&fut) { + for (auto &promise : potential_async_promises) { + promise.set_value(); + } + }); + } + // Check exit criteria: Launch function call continuation by setting the + // slices promise + return ret_fut; + } + template + hpx::lcos::shared_future wrap_async(hpx::lcos::future &stream_future, + F &&f, Ts &&...ts) { + assert(async_mode); + assert(!potential_async_promises.empty()); + const size_t local_counter = slice_counter++; + assert(local_counter < number_slices); + assert(slice_counter < number_slices + 1); + assert(potential_async_promises.size() == number_slices); + hpx::lcos::shared_future ret_fut = + potential_async_promises[local_counter].get_shared_future(); + if (local_counter == number_slices - 1) { + auto fut = f(std::forward(ts)...); + fut.then([this](auto &&fut) { + // TODO just use one promise + for (auto &promise : potential_async_promises) { + promise.set_value(); + } + }); + } + return ret_fut; + } + // We need to be able to copy or no-except move for std::vector.. + aggregated_function_call(const aggregated_function_call &other) = default; + aggregated_function_call & + operator=(const aggregated_function_call &other) = default; + aggregated_function_call(aggregated_function_call &&other) = default; + aggregated_function_call & + operator=(aggregated_function_call &&other) = default; +}; + +//=============================================================================== +//=============================================================================== + +enum class aggregated_executor_modes { EAGER = 1, STRICT, ENDLESS }; +/// Declaration since the actual allocator is only defined after the Executors +template +class allocator_slice; + +/// Executor Class that aggregates function calls for specific kernels +/** Executor is not meant to be used directly. 
Instead it yields multiple + * executor_slice objects. These serve as interfaces. Slices from the same + * aggregated_executor are meant to execute the same function calls but on + * different data (i.e. different tasks) + */ +template class aggregated_executor { +private: + //=============================================================================== + // Misc private avariables: + // + std::atomic slices_exhausted; + + std::atomic executor_slices_alive; + std::atomic buffers_in_use; + std::atomic dealloc_counter; + + const aggregated_executor_modes mode; + const size_t max_slices; + std::atomic current_slices; + /// Wrapper to the executor interface from the stream pool + /// Automatically hooks into the stream_pools reference counting + /// for cpu/gpu load balancing + std::unique_ptr>> + executor_wrapper; + +public: + size_t gpu_id; + // Subclasses + + /// Slice class - meant as a scope interface to the aggregated executor + class executor_slice { + public: + aggregated_executor &parent; + private: + /// Executor is a slice of this aggregated_executor + /// How many functions have been called - required to enforce sequential + /// behaviour of kernel launches + size_t launch_counter{0}; + size_t buffer_counter{0}; + bool notify_parent_about_destruction{true}; + + public: + /// How many slices are there overall - required to check the launch + /// criteria + const size_t number_slices; + const size_t id; + using executor_t = Executor; + executor_slice(aggregated_executor &parent, const size_t slice_id, + const size_t number_slices) + : parent(parent), notify_parent_about_destruction(true), + number_slices(number_slices), id(slice_id) { + } + ~executor_slice(void) { + // Don't notify parent if we moved away from this executor_slice + if (notify_parent_about_destruction) { + // Executor should be done by the time of destruction + // -> check here before notifying parent + + // parent still in execution mode? + assert(parent.slices_exhausted == true); + // all kernel launches done? 
+ assert(launch_counter == parent.function_calls.size()); + // Notifiy parent that this aggregation slice is one + parent.reduce_usage_counter(); + } + } + executor_slice(const executor_slice &other) = delete; + executor_slice &operator=(const executor_slice &other) = delete; + executor_slice(executor_slice &&other) + : parent(other.parent), launch_counter(std::move(other.launch_counter)), + buffer_counter(std::move(other.buffer_counter)), + number_slices(std::move(other.number_slices)), + id(std::move(other.id)) { + other.notify_parent_about_destruction = false; + } + executor_slice &operator=(executor_slice &&other) { + parent = other.parent; + launch_counter = std::move(other.launch_counter); + buffer_counter = std::move(other.buffer_counter); + number_slices = std::move(other.number_slices); + id = std::move(other.id); + other.notify_parent_about_destruction = false; + } + template + allocator_slice make_allocator() { + return allocator_slice(*this); + } + bool sync_aggregation_slices() { + assert(parent.slices_exhausted == true); + auto ret = parent.sync_aggregation_slices(launch_counter); + launch_counter++; + return ret; + } + template void post(F &&f, Ts &&...ts) { + // we should only execute function calls once all slices + // have been given away (-> Executor Slices start) + assert(parent.slices_exhausted == true); + parent.post(launch_counter, std::forward(f), std::forward(ts)...); + launch_counter++; + } + template + hpx::lcos::future async(F &&f, Ts &&...ts) { + // we should only execute function calls once all slices + // have been given away (-> Executor Slices start) + assert(parent.slices_exhausted == true); + hpx::lcos::future ret_fut = parent.async( + launch_counter, std::forward(f), std::forward(ts)...); + launch_counter++; + return ret_fut; + } + + // OneWay Execution + template + friend decltype(auto) tag_invoke(hpx::parallel::execution::post_t, + executor_slice& exec, F&& f, Ts&&... ts) + { + return exec.post(std::forward(f), std::forward(ts)...); + } + + // TwoWay Execution + template + friend decltype(auto) tag_invoke( + hpx::parallel::execution::async_execute_t, executor_slice& exec, + F&& f, Ts&&... ts) + { + return exec.async( + std::forward(f), std::forward(ts)...); + } + + template + hpx::lcos::shared_future wrap_async(F &&f, Ts &&...ts) { + // we should only execute function calls once all slices + // have been given away (-> Executor Slices start) + assert(parent.slices_exhausted == true); + hpx::lcos::shared_future ret_fut = parent.wrap_async( + launch_counter, std::forward(f), std::forward(ts)...); + launch_counter++; + return ret_fut; + } + + /// Get new aggregated buffer (might have already been allocated been + /// allocated by different slice) + template T *get(const size_t size) { + assert(parent.slices_exhausted == true); + T *aggregated_buffer = + parent.get(size, buffer_counter); + buffer_counter++; + assert(buffer_counter > 0); + return aggregated_buffer; + } + + Executor& get_underlying_executor(void) { + assert(parent.executor_wrapper); + return *(parent.executor_wrapper); + } + }; + + // deprecated name... 
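A minimal usage sketch of the slice interface defined above. The helper name process_with_slice, the placeholder function do_work and the choice of std::allocator as the underlying host allocator are assumptions for illustration only; note that every slice has to issue the aggregated call with identical arguments, which is what the DEBUG_AGGREGATION_CALLS check further up enforces in debug builds.

#include <algorithm>
#include <cstddef>
#include <memory>
#include <vector>

// Hypothetical work function; anything the underlying executor can invoke.
inline void do_work(float *data, std::size_t n) {
  for (std::size_t i = 0; i < n; i++) data[i] *= 2.0f;
}

template <typename Slice>
void process_with_slice(Slice &slice, const std::size_t elements_per_task) {
  // Shared, recycled buffer: the first slice reaching this allocation
  // creates it, all other slices of the region receive the same pointer.
  auto alloc = slice.template make_allocator<float, std::allocator<float>>();
  std::vector<float, decltype(alloc)> buf(
      slice.number_slices * elements_per_task, alloc);
  // Each slice fills only its own sub-range, identified by slice.id ...
  std::fill(buf.begin() + slice.id * elements_per_task,
            buf.begin() + (slice.id + 1) * elements_per_task, 1.0f);
  // ... but all slices issue the same aggregated call; it reaches the
  // underlying executor exactly once, after the last slice has issued it.
  auto fut = slice.async(do_work, buf.data(),
                         slice.number_slices * elements_per_task);
  fut.get();
}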
+ using Executor_Slice [[deprectated("Renamed: Use executor_slice instead")]] = executor_slice; + + //=============================================================================== + + hpx::lcos::local::promise slices_full_promise; + /// Promises with the slice executors -- to be set when the starting criteria + /// is met + std::vector> executor_slices; + /// List of aggregated function calls - function will be launched when all + /// slices have called it + std::deque> function_calls; + /// For synchronizing the access to the function calls list + aggregation_mutex_t mut; + + /// Data entry for a buffer allocation: void* pointer, size_t for + /// buffer-size, atomic for the slice counter, location_id, gpu_id + using buffer_entry_t = + std::tuple, bool, const size_t, size_t>; + /// Keeps track of the aggregated buffer allocations done in all the slices + std::deque buffer_allocations; + /// Map pointer to deque index for fast access in the deallocations + std::unordered_map buffer_allocations_map; + /// For synchronizing the access to the buffer_allocations + aggregation_mutex_t buffer_mut; + std::atomic buffer_counter = 0; + + /// Get new buffer OR get buffer already allocated by different slice + template + T *get(const size_t size, const size_t slice_alloc_counter) { + assert(slices_exhausted == true); + assert(executor_wrapper); + assert(executor_slices_alive == true); + // Add aggreated buffer entry in case it hasn't happened yet for this call + // First: Check if it already has happened + if (buffer_counter <= slice_alloc_counter) { + // we might be the first! Lock... + std::lock_guard guard(buffer_mut); + // ... and recheck + if (buffer_counter <= slice_alloc_counter) { + constexpr bool manage_content_lifetime = false; + buffers_in_use = true; + + // Default location -- useful for GPU builds as we otherwise create way too + // many different buffers for different aggregation sizes on different GPUs + /* size_t location_id = gpu_id * instances_per_gpu; */ + // Use integer conversion to only use 0 16 32 ... as buckets + size_t location_id = ((hpx::get_worker_thread_num() % cppuddle::number_instances) / 16) * 16; +#ifdef CPPUDDLE_HAVE_HPX_AWARE_ALLOCATORS + if (max_slices == 1) { + // get prefered location: aka the current hpx threads location + // Usually handy for CPU builds where we want to use the buffers + // close to the current CPU core + /* location_id = (hpx::get_worker_thread_num() / instances_per_gpu) * instances_per_gpu; */ + /* location_id = (gpu_id) * instances_per_gpu; */ + // division makes sure that we always use the same instance to store our gpu buffers. + } +#endif + // Get shiny and new buffer that will be shared between all slices + // Buffer might be recycled from previous allocations by the + // buffer_interface... 
+ T *aggregated_buffer = + cppuddle::memory_recycling::detail::buffer_interface::get< + T, Host_Allocator>(size, manage_content_lifetime, location_id, + gpu_id); + // Create buffer entry for this buffer + buffer_allocations.emplace_back(static_cast(aggregated_buffer), + size, 1, true, location_id, gpu_id); + +#ifndef NDEBUG + // if previousely used the buffer should not be in usage anymore + const auto exists = buffer_allocations_map.count( + static_cast(aggregated_buffer)); + if (exists > 0) { + const auto previous_usage_id = + buffer_allocations_map[static_cast(aggregated_buffer)]; + const auto &valid = + std::get<3>(buffer_allocations[previous_usage_id]); + assert(!valid); + } +#endif + buffer_allocations_map.insert_or_assign(static_cast(aggregated_buffer), + buffer_counter); + + assert (buffer_counter == slice_alloc_counter); + buffer_counter = buffer_allocations.size(); + + // Return buffer + return aggregated_buffer; + } + } + assert(buffers_in_use == true); + assert(std::get<3>(buffer_allocations[slice_alloc_counter])); // valid + assert(std::get<2>(buffer_allocations[slice_alloc_counter]) >= 1); + + // Buffer entry should already exist: + T *aggregated_buffer = static_cast( + std::get<0>(buffer_allocations[slice_alloc_counter])); + // Error handling: Size is wrong? + assert(size == std::get<1>(buffer_allocations[slice_alloc_counter])); + // Notify that one more slice has visited this buffer allocation + std::get<2>(buffer_allocations[slice_alloc_counter])++; + return aggregated_buffer; + } + + /// Notify buffer list that one slice is done with the buffer + template + void mark_unused(T *p, const size_t size) { + assert(slices_exhausted == true); + assert(executor_wrapper); + + void *ptr_key = static_cast(p); + size_t slice_alloc_counter = buffer_allocations_map[p]; + + assert(slice_alloc_counter < buffer_allocations.size()); + /*auto &[buffer_pointer_any, buffer_size, buffer_allocation_counter, valid] = + buffer_allocations[slice_alloc_counter];*/ + auto buffer_pointer_void = std::get<0>(buffer_allocations[slice_alloc_counter]); + const auto buffer_size = std::get<1>(buffer_allocations[slice_alloc_counter]); + auto &buffer_allocation_counter = std::get<2>(buffer_allocations[slice_alloc_counter]); + auto &valid = std::get<3>(buffer_allocations[slice_alloc_counter]); + const auto &location_id = std::get<4>(buffer_allocations[slice_alloc_counter]); + const auto &gpu_id = std::get<5>(buffer_allocations[slice_alloc_counter]); + assert(valid); + T *buffer_pointer = static_cast(buffer_pointer_void); + + assert(buffer_size == size); + assert(p == buffer_pointer); + // assert(buffer_pointer == p || buffer_pointer == nullptr); + // Slice is done with this buffer + buffer_allocation_counter--; + // Check if all slices are done with this buffer? + if (buffer_allocation_counter == 0) { + // Yes! 
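For reference, the location_id bucketing in get() above maps blocks of 16 HPX worker threads onto one buffer-manager instance, so aggregated buffers do not fan out into one bucket per worker. A standalone check of that arithmetic, with number_instances assumed to be 128 purely for illustration:

#include <cstddef>

constexpr std::size_t bucket_of(std::size_t worker_thread,
                                std::size_t number_instances) {
  // mirrors ((hpx::get_worker_thread_num() % number_instances) / 16) * 16
  return ((worker_thread % number_instances) / 16) * 16;
}
static_assert(bucket_of(3, 128) == 0);    // workers 0..15  share bucket 0
static_assert(bucket_of(37, 128) == 32);  // workers 32..47 share bucket 32
static_assert(bucket_of(130, 128) == 0);  // wraps around number_instances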
"Deallocate" by telling the recylcer the buffer is fit for reusage + std::lock_guard guard(buffer_mut); + // Only mark unused if another buffer has not done so already (and marked + // it as invalid) + if (valid) { + assert(buffers_in_use == true); + cppuddle::memory_recycling::detail::buffer_interface::mark_unused< + T, Host_Allocator>(buffer_pointer, buffer_size, location_id, + gpu_id); + // mark buffer as invalid to prevent any other slice from marking the + // buffer as unused + valid = false; + + const size_t current_deallocs = ++dealloc_counter; + if (current_deallocs == buffer_counter) { + std::lock_guard guard(mut); + buffers_in_use = false; + if (!executor_slices_alive && !buffers_in_use) { + slices_exhausted = false; + // Release executor + executor_wrapper.reset(nullptr); + } + } + } + } + } + + //=============================================================================== + // Public Interface +public: + hpx::lcos::future current_continuation; + hpx::lcos::future last_stream_launch_done; + std::atomic overall_launch_counter = 0; + + /// Only meant to be accessed by the slice executors + bool sync_aggregation_slices(const size_t slice_launch_counter) { + std::lock_guard guard(mut); + assert(slices_exhausted == true); + assert(executor_wrapper); + // Add function call object in case it hasn't happened for this launch yet + if (overall_launch_counter <= slice_launch_counter) { + /* std::lock_guard guard(mut); */ + if (overall_launch_counter <= slice_launch_counter) { + function_calls.emplace_back(current_slices, false, *executor_wrapper); + overall_launch_counter = function_calls.size(); + return function_calls[slice_launch_counter].sync_aggregation_slices( + last_stream_launch_done); + } + } + + return function_calls[slice_launch_counter].sync_aggregation_slices( + last_stream_launch_done); + } + + /// Only meant to be accessed by the slice executors + template + void post(const size_t slice_launch_counter, F &&f, Ts &&...ts) { + std::lock_guard guard(mut); + assert(slices_exhausted == true); + assert(executor_wrapper); + // Add function call object in case it hasn't happened for this launch yet + if (overall_launch_counter <= slice_launch_counter) { + /* std::lock_guard guard(mut); */ + if (overall_launch_counter <= slice_launch_counter) { + function_calls.emplace_back(current_slices, false, *executor_wrapper); + overall_launch_counter = function_calls.size(); + function_calls[slice_launch_counter].post_when( + last_stream_launch_done, std::forward(f), std::forward(ts)...); + return; + } + } + + function_calls[slice_launch_counter].post_when( + last_stream_launch_done, std::forward(f), std::forward(ts)...); + return; + } + + /// Only meant to be accessed by the slice executors + template + hpx::lcos::future async(const size_t slice_launch_counter, F &&f, + Ts &&...ts) { + std::lock_guard guard(mut); + assert(slices_exhausted == true); + assert(executor_wrapper); + // Add function call object in case it hasn't happened for this launch yet + if (overall_launch_counter <= slice_launch_counter) { + /* std::lock_guard guard(mut); */ + if (overall_launch_counter <= slice_launch_counter) { + function_calls.emplace_back(current_slices, true, *executor_wrapper); + overall_launch_counter = function_calls.size(); + return function_calls[slice_launch_counter].async_when( + last_stream_launch_done, std::forward(f), std::forward(ts)...); + } + } + + return function_calls[slice_launch_counter].async_when( + last_stream_launch_done, std::forward(f), std::forward(ts)...); + } + /// Only meant 
to be accessed by the slice executors + template + hpx::lcos::shared_future wrap_async(const size_t slice_launch_counter, F &&f, + Ts &&...ts) { + std::lock_guard guard(mut); + assert(slices_exhausted == true); + assert(executor_wrapper); + // Add function call object in case it hasn't happened for this launch yet + if (overall_launch_counter <= slice_launch_counter) { + /* std::lock_guard guard(mut); */ + if (overall_launch_counter <= slice_launch_counter) { + function_calls.emplace_back(current_slices, true, *executor_wrapper); + overall_launch_counter = function_calls.size(); + return function_calls[slice_launch_counter].wrap_async( + last_stream_launch_done, std::forward(f), std::forward(ts)...); + } + } + + return function_calls[slice_launch_counter].wrap_async( + last_stream_launch_done, std::forward(f), std::forward(ts)...); + } + + bool slice_available(void) { + std::lock_guard guard(mut); + return !slices_exhausted; + } + + std::optional> request_executor_slice() { + std::lock_guard guard(mut); + if (!slices_exhausted) { + const size_t local_slice_id = ++current_slices; + if (local_slice_id == 1) { + // Cleanup leftovers from last run if any + // TODO still required? Should be clean here already + function_calls.clear(); + overall_launch_counter = 0; + std::lock_guard guard(buffer_mut); +#ifndef NDEBUG + for (const auto &buffer_entry : buffer_allocations) { + const auto &[buffer_pointer_any, buffer_size, + buffer_allocation_counter, valid, location_id, device_id] = + buffer_entry; + assert(!valid); + } +#endif + buffer_allocations.clear(); + buffer_allocations_map.clear(); + buffer_counter = 0; + + assert(executor_slices_alive == false); + assert(buffers_in_use == false); + executor_slices_alive = true; + buffers_in_use = false; + dealloc_counter = 0; + + if (mode == aggregated_executor_modes::STRICT ) { + slices_full_promise = hpx::lcos::local::promise{}; + } + } + + // Create Executor Slice future -- that will be returned later + hpx::lcos::future ret_fut; + if (local_slice_id < max_slices) { + executor_slices.emplace_back(hpx::lcos::local::promise{}); + ret_fut = + executor_slices[local_slice_id - 1].get_future(); + } else { + launched_slices = current_slices; + ret_fut = hpx::make_ready_future(executor_slice{*this, + executor_slices.size(), launched_slices}); + } + + // Are we the first slice? If yes, add continuation set the + // executor_slice + // futures to ready if the launch conditions are met + if (local_slice_id == 1) { + // Redraw executor + assert(!executor_wrapper); + cppuddle::executor_recycling::executor_pool::select_device< + Executor, cppuddle::executor_recycling::round_robin_pool_impl>( + gpu_id); + executor_wrapper.reset( + new cppuddle::executor_recycling::executor_interface< + Executor, + cppuddle::executor_recycling::round_robin_pool_impl>( + gpu_id)); + // Renew promise that all slices will be ready as the primary launch + // criteria... 
+ hpx::lcos::shared_future fut; + if (mode == aggregated_executor_modes::EAGER || + mode == aggregated_executor_modes::ENDLESS) { + // Fallback launch condidtion: Launch as soon as the underlying stream + // is ready + /* auto slices_full_fut = slices_full_promise.get_future(); */ + cppuddle::executor_recycling::executor_pool::select_device< + Executor, + cppuddle::executor_recycling::round_robin_pool_impl>(gpu_id); + auto exec_fut = (*executor_wrapper).get_future(); + /* auto fut = hpx::when_any(exec_fut, slices_full_fut); */ + fut = std::move(exec_fut); + } else { + auto slices_full_fut = slices_full_promise.get_shared_future(); + // Just use the slices launch condition + fut = std::move(slices_full_fut); + } + // Launch all executor slices within this continuation + current_continuation = fut.then([this](auto &&fut) { + std::lock_guard guard(mut); + slices_exhausted = true; + launched_slices = current_slices; + size_t id = 0; + for (auto &slice_promise : executor_slices) { + slice_promise.set_value( + executor_slice{*this, id, launched_slices}); + id++; + } + executor_slices.clear(); + }); + } + if (local_slice_id >= max_slices && + mode != aggregated_executor_modes::ENDLESS) { + slices_exhausted = true; // prevents any more threads from entering + // before the continuation is launched + /* launched_slices = current_slices; */ + /* size_t id = 0; */ + /* for (auto &slice_promise : executor_slices) { */ + /* slice_promise.set_value( */ + /* executor_slice{*this, id, launched_slices}); */ + /* id++; */ + /* } */ + /* executor_slices.clear(); */ + if (mode == aggregated_executor_modes::STRICT ) { + slices_full_promise.set_value(); // Trigger slices launch condition continuation + } + // that continuation will set all executor slices so far handed out to ready + } + return ret_fut; + } else { + // Return empty optional as failure + return std::optional>{}; + } + } + size_t launched_slices; + void reduce_usage_counter(void) { + /* std::lock_guard guard(mut); */ + assert(slices_exhausted == true); + assert(executor_wrapper); + assert(executor_slices_alive == true); + assert(launched_slices >= 1); + assert(current_slices >= 0 && current_slices <= launched_slices); + const size_t local_slice_id = --current_slices; + // Last slice goes out scope? 
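Client code normally obtains slices through request_executor_slice rather than constructing them directly; the returned future only becomes ready once the launch condition set up above is met (all slices handed out in STRICT mode, or additionally the underlying stream future in EAGER/ENDLESS mode). A sketch of that flow, reusing the hypothetical process_with_slice helper from the earlier sketch:

template <typename AggregatedExecutor>
void run_aggregated_region(AggregatedExecutor &agg_exec) {
  // Empty optional: all slices of the current region were already handed out
  auto slice_fut = agg_exec.request_executor_slice();
  if (slice_fut) {
    auto work_done = slice_fut->then([](auto &&fut) {
      auto slice = fut.get();
      process_with_slice(slice, 512); // see the sketch further above
    });
    work_done.get();
  }
  // else: retry on another aggregated executor or launch without aggregation
}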
+ if (local_slice_id == 0) { + // Mark executor fit for reusage + std::lock_guard guard(mut); + executor_slices_alive = false; + if (!executor_slices_alive && !buffers_in_use) { + // Release executor + slices_exhausted = false; + executor_wrapper.reset(nullptr); + } + } + } + ~aggregated_executor(void) { + + assert(current_slices == 0); + assert(executor_slices_alive == false); + assert(buffers_in_use == false); + + if (mode != aggregated_executor_modes::STRICT ) { + slices_full_promise.set_value(); // Trigger slices launch condition continuation + } + + // Cleanup leftovers from last run if any + function_calls.clear(); + overall_launch_counter = 0; +#ifndef NDEBUG + for (const auto &buffer_entry : buffer_allocations) { + const auto &[buffer_pointer_any, buffer_size, buffer_allocation_counter, + valid, location_id, device_id] = buffer_entry; + assert(!valid); + } +#endif + buffer_allocations.clear(); + buffer_allocations_map.clear(); + buffer_counter = 0; + + assert(buffer_allocations.empty()); + assert(buffer_allocations_map.empty()); + } + + aggregated_executor(const size_t number_slices, + aggregated_executor_modes mode, const size_t gpu_id = 0) + : max_slices(number_slices), current_slices(0), slices_exhausted(false), + dealloc_counter(0), mode(mode), executor_slices_alive(false), + buffers_in_use(false), gpu_id(gpu_id), + executor_wrapper(nullptr), + current_continuation(hpx::make_ready_future()), + last_stream_launch_done(hpx::make_ready_future()) {} + // Not meant to be copied or moved + aggregated_executor(const aggregated_executor &other) = delete; + aggregated_executor &operator=(const aggregated_executor &other) = delete; + aggregated_executor(aggregated_executor &&other) = delete; + aggregated_executor &operator=(aggregated_executor &&other) = delete; +}; + +template +class allocator_slice { +private: + typename aggregated_executor::executor_slice &executor_reference; + aggregated_executor &executor_parent; + +public: + using value_type = T; + allocator_slice( + typename aggregated_executor::executor_slice &executor) + : executor_reference(executor), executor_parent(executor.parent) {} + template + explicit allocator_slice( + allocator_slice const &) noexcept {} + T *allocate(std::size_t n) { + T *data = executor_reference.template get(n); + return data; + } + void deallocate(T *p, std::size_t n) { + /* executor_reference.template mark_unused(p, n); */ + executor_parent.template mark_unused(p, n); + } + template + inline void construct(T *p, Args... args) noexcept { + // Do nothing here - we reuse the content of the last owner + } + void destroy(T *p) { + // Do nothing here - Contents will be destroyed when the buffer manager is + // destroyed, not before + } +}; +template +constexpr bool +operator==(allocator_slice const &, + allocator_slice const &) noexcept { + return false; +} +template +constexpr bool +operator!=(allocator_slice const &, + allocator_slice const &) noexcept { + return true; +} + +} // namespace detail +} // namespace kernel_aggregation +} // namespace cppuddle + + + +namespace hpx { namespace parallel { namespace execution { + // TODO Unfortunately does not work that way! 
Create trait that works for Executor Slices with + // compatible unlying executor types + /* template */ + /* struct is_one_way_executor::executor_slice> */ + /* : std::true_type */ + /* {}; */ + /* template */ + /* struct is_two_way_executor::executor_slice> */ + /* : std::true_type */ + /* {}; */ + +#if defined(HPX_HAVE_CUDA) || defined(HPX_HAVE_HIP) + // Workaround for the meantime: Manually create traits for compatible types: +template <> +struct is_one_way_executor< + typename cppuddle::kernel_aggregation::detail::aggregated_executor< + hpx::cuda::experimental::cuda_executor>::executor_slice> + : std::true_type {}; +template <> +struct is_two_way_executor< + typename cppuddle::kernel_aggregation::detail::aggregated_executor< + hpx::cuda::experimental::cuda_executor>::executor_slice> + : std::true_type {}; +#endif +}}} + +#endif diff --git a/include/cppuddle/kernel_aggregation/kernel_aggregation_interface.hpp b/include/cppuddle/kernel_aggregation/kernel_aggregation_interface.hpp new file mode 100644 index 00000000..c7a3b633 --- /dev/null +++ b/include/cppuddle/kernel_aggregation/kernel_aggregation_interface.hpp @@ -0,0 +1,34 @@ +// Copyright (c) 2024 Gregor Daiß +// +// Distributed under the Boost Software License, Version 1.0. (See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#ifndef KERNEL_AGGREGATION_INTERFACE_HPP +#define KERNEL_AGGREGATION_INTERFACE_HPP + +#include "cppuddle/kernel_aggregation/detail/aggregation_executors_and_allocators.hpp" +#include "cppuddle/kernel_aggregation/detail/aggregation_executor_pools.hpp" + +namespace cppuddle { +namespace kernel_aggregation { + +using aggregated_executor_modes = + cppuddle::kernel_aggregation::detail::aggregated_executor_modes; + +template +using allocator_slice = + cppuddle::kernel_aggregation::detail::allocator_slice; + +template +using aggregated_executor = + cppuddle::kernel_aggregation::detail::aggregated_executor; + +template +using aggregation_pool = + cppuddle::kernel_aggregation::detail::aggregation_pool; + +} // namespace kernel_aggregation +} // namespace cppuddle + +#endif diff --git a/include/cppuddle/memory_recycling/aligned_recycling_allocators.hpp b/include/cppuddle/memory_recycling/aligned_recycling_allocators.hpp new file mode 100644 index 00000000..a824e7e0 --- /dev/null +++ b/include/cppuddle/memory_recycling/aligned_recycling_allocators.hpp @@ -0,0 +1,40 @@ +// Copyright (c) 2020-2024 Gregor Daiß +// +// Distributed under the Boost Software License, Version 1.0. (See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#ifndef ALIGNED_RECYCLING_ALLOCATORS_HPP +#define ALIGNED_RECYCLING_ALLOCATORS_HPP + +#include +#include "buffer_management_interface.hpp" + +namespace cppuddle { +namespace memory_recycling { + +namespace device_selection { +template +/// Dummy GPU selector. 
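The new kernel_aggregation_interface.hpp above is the intended public entry point after this refactoring. A construction sketch using its aliases, assuming a CUDA-enabled HPX build in which the CPPuddle executor pool has already been initialized for cuda_executor; the slice count of 8 and the EAGER mode are illustrative choices:

#include "cppuddle/kernel_aggregation/kernel_aggregation_interface.hpp"
#include <hpx/async_cuda/cuda_executor.hpp>

using executor_t = hpx::cuda::experimental::cuda_executor;
namespace agg = cppuddle::kernel_aggregation;

void aggregation_example() {
  // One aggregation region with up to 8 slices on GPU 0
  agg::aggregated_executor<executor_t> agg_exec(
      8, agg::aggregated_executor_modes::EAGER, 0);
  run_aggregated_region(agg_exec); // see the sketch further above
}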
Needs to be defined for MultiGPU builds as the default / +/// select_device_functor does not compile for > 1 GPU (to make sure all / +/// relevant allocators support multigpu) +struct select_device_functor< + T, boost::alignment::aligned_allocator> { + void operator()(const size_t device_id) {} +}; +} // namespace device_selection + +/// Recycling allocator for boost aligned memory +template ::value, int> = 0> +using recycle_aligned = detail::recycle_allocator< + T, boost::alignment::aligned_allocator>; +/// Recycling allocator for boost aligned memory (reusing previous content as well) +template ::value, int> = 0> +using aggressive_recycle_aligned = detail::aggressive_recycle_allocator< + T, boost::alignment::aligned_allocator>; + +} // namespace memory_recycling +} // namespace cppuddle + +#endif diff --git a/include/cppuddle/memory_recycling/buffer_management_interface.hpp b/include/cppuddle/memory_recycling/buffer_management_interface.hpp new file mode 100644 index 00000000..c5fa44cd --- /dev/null +++ b/include/cppuddle/memory_recycling/buffer_management_interface.hpp @@ -0,0 +1,33 @@ +// Copyright (c) 2024 Gregor Daiß +// +// Distributed under the Boost Software License, Version 1.0. (See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#ifndef BUFFER_MANAGEMENT_INTERFACE_HPP +#define BUFFER_MANAGEMENT_INTERFACE_HPP + +#include "detail/buffer_management.hpp" + +namespace cppuddle { +namespace memory_recycling { + +/// Print performance counters of all buffer managers to stdout +inline void print_buffer_counters() { + detail::buffer_interface::print_performance_counters(); +} +/// Deletes all buffers (even ones still marked as used), delete the buffer +/// managers and the recycler itself +inline void force_buffer_cleanup() { detail::buffer_interface::clean_all(); } + +/// Deletes all buffers currently marked as unused +inline void unused_buffer_cleanup() { + detail::buffer_interface::clean_unused_buffers(); +} +/// Deletes all buffers (even ones still marked as used), delete the buffer +/// managers and the recycler itself. Disallows further usage. +inline void finalize() { detail::buffer_interface::finalize(); } + +} // namespace memory_recycling +} // end namespace cppuddle + +#endif diff --git a/include/cppuddle/memory_recycling/cuda_recycling_allocators.hpp b/include/cppuddle/memory_recycling/cuda_recycling_allocators.hpp new file mode 100644 index 00000000..b47a4fe2 --- /dev/null +++ b/include/cppuddle/memory_recycling/cuda_recycling_allocators.hpp @@ -0,0 +1,41 @@ +// Copyright (c) 2020-2024 Gregor Daiß +// +// Distributed under the Boost Software License, Version 1.0. 
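Together, the aligned recycling allocators and the buffer management interface above cover the usual host-side lifecycle: allocate through a recycling allocator, optionally print the counters, and call finalize exactly once at shutdown. A sketch assuming a single-GPU (or CPU-only) build; the 32-byte alignment and the buffer size are illustrative, and the aggressive variant would additionally hand recycled content back as-is without re-running constructors:

#include <vector>
#include "cppuddle/memory_recycling/aligned_recycling_allocators.hpp"
#include "cppuddle/memory_recycling/buffer_management_interface.hpp"

void host_buffer_example() {
  using alloc_t = cppuddle::memory_recycling::recycle_aligned<double, 32>;
  {
    // First use allocates; later uses of the same size reuse the buffer
    std::vector<double, alloc_t> data(1 << 20, 0.0);
    // ... work on data ...
  } // destruction marks the buffer as unused (kept for recycling)

  // Optional statistics (requires CPPUDDLE_WITH_COUNTERS=ON)
  cppuddle::memory_recycling::print_buffer_counters();
  // At application shutdown: release everything, disallow further use
  cppuddle::memory_recycling::finalize();
}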
(See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#ifndef CUDA_RECYCLING_ALLOCATORS_HPP +#define CUDA_RECYCLING_ALLOCATORS_HPP + +#include "buffer_management_interface.hpp" +// import cuda_pinned_allocator and cuda_device_allocator +#include "detail/cuda_underlying_allocators.hpp" + +namespace cppuddle { +namespace memory_recycling { + +// Tell cppuddle how to select the device for the cuda allocators +namespace device_selection { +/// GPU device selector using the CUDA API for pinned host allocations +template +struct select_device_functor> { + void operator()(const size_t device_id) { cudaSetDevice(device_id); } +}; +/// GPU selector using the CUDA API for pinned host allocations +template +struct select_device_functor> { + void operator()(const size_t device_id) { cudaSetDevice(device_id); } +}; +} // namespace device_selection + +/// Recycling allocator for CUDA pinned host memory +template ::value, int> = 0> +using recycle_allocator_cuda_host = + detail::aggressive_recycle_allocator>; +/// Recycling allocator for CUDA device memory +template ::value, int> = 0> +using recycle_allocator_cuda_device = + detail::recycle_allocator>; + +} // namespace memory_recycling +} // end namespace cppuddle +#endif diff --git a/include/cppuddle/memory_recycling/detail/buffer_management.hpp b/include/cppuddle/memory_recycling/detail/buffer_management.hpp new file mode 100644 index 00000000..7c30c781 --- /dev/null +++ b/include/cppuddle/memory_recycling/detail/buffer_management.hpp @@ -0,0 +1,929 @@ +// Copyright (c) 2020-2024 Gregor Daiß +// +// Distributed under the Boost Software License, Version 1.0. (See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#ifndef BUFFER_MANAGEMENT_HPP +#define BUFFER_MANAGEMENT_HPP + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// Warn about suboptimal performance without correct HPX-aware allocators +#ifdef CPPUDDLE_HAVE_HPX +#ifndef CPPUDDLE_HAVE_HPX_AWARE_ALLOCATORS +#pragma message \ +"Warning: CPPuddle build with HPX support but without HPX-aware allocators enabled. \ +For better performance configure CPPuddle with CPPUDDLE_WITH_HPX_AWARE_ALLOCATORS=ON!" +#else +// include runtime to get HPX thread IDs required for the HPX-aware allocators +#include +#endif +#endif + +#if defined(CPPUDDLE_HAVE_HPX) && defined(CPPUDDLE_HAVE_HPX_MUTEX) +// For builds with The HPX mutex +#include +#endif + +#ifdef CPPUDDLE_HAVE_COUNTERS +#include +#if defined(CPPUDDLE_HAVE_HPX) +#include +#endif +#endif + +#include "cppuddle/common/config.hpp" + +namespace cppuddle { +namespace memory_recycling { + +namespace device_selection { +/// Default device selector - No MultGPU support +/** Throws a runtime error if max_number_gpus > 1 (defined by cmake variable + * CPPUDDLE_WITH_MAX_NUMBER_GPUS). 
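To complement the CUDA allocator definitions above, a sketch of the common staging pattern: a recycled pinned host buffer plus a recycled device buffer. A single-GPU build is assumed; since device memory must not be touched by host-side constructors, the device allocator is used directly instead of through a std::vector:

#include <cstddef>
#include <vector>
#include "cppuddle/memory_recycling/cuda_recycling_allocators.hpp"

void cuda_staging_example(const std::size_t n) {
  using cppuddle::memory_recycling::recycle_allocator_cuda_host;
  using cppuddle::memory_recycling::recycle_allocator_cuda_device;

  // Pinned host buffer, recycled across calls (content reused as well,
  // since the host variant is an aggressive_recycle_allocator)
  std::vector<double, recycle_allocator_cuda_host<double>> host_buf(n);

  // Device buffer: allocate/deallocate directly, no element construction
  recycle_allocator_cuda_device<double> device_alloc(/*device_id=*/0);
  double *device_buf = device_alloc.allocate(n);
  // ... cudaMemcpyAsync / kernel launches using host_buf.data(), device_buf ...
  device_alloc.deallocate(device_buf, n); // hands the buffer back for reuse
}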
Needs to be specialized for an allocator to + * provide MultiGPU support (see CPPuddle CUDA/HIP allocators for examples) **/ +template struct select_device_functor { + void operator()(const size_t device_id) { + if constexpr (max_number_gpus > 1) + throw std::runtime_error( + "Allocators used in Multi-GPU builds need explicit Multi-GPU support " + "(by having a select_device_functor overload"); + } +}; +} // namespace device_selection + +namespace detail { + +/// Singleton interface to all buffer_managers +class buffer_interface { +public: +#if defined(CPPUDDLE_DEACTIVATE_BUFFER_RECYCLING) + +// Warn about suboptimal performance without recycling +#pragma message \ +"Warning: Building without buffer recycling! Use only for performance testing! \ +For better performance configure CPPuddle with CPPUDDLE_WITH_BUFFER_RECYCLING=ON!" + + template + static T *get(size_t number_elements, bool manage_content_lifetime = false, + std::optional location_hint = std::nullopt, + std::optional device_id = std::nullopt) { + + return Host_Allocator{}.allocate(number_elements); + } + /// Marks an buffer as unused and fit for reusage + template + static void mark_unused(T *p, size_t number_elements, + std::optional location_hint = std::nullopt, + std::optional device_id = std::nullopt) { + return Host_Allocator{}.deallocate(p, number_elements); + } +#else + /// Primary method to allocate a buffer with CPPuddle: Returns and allocated / + /// buffer of the requested size - this may be a reused buffer. The method + /// figures out the correct buffer_manager and gets such a buffer from it. + /// Should be called from an allocator implementation, not directly + template + static T *get(size_t number_elements, bool manage_content_lifetime = false, + std::optional location_hint = std::nullopt, + std::optional device_id = std::nullopt) { + try { + return buffer_manager::get( + number_elements, manage_content_lifetime, location_hint, device_id); + } catch (const std::exception &exc) { + std::cerr << "ERROR: Encountered unhandled exception in cppuddle get: " << exc.what() << std::endl; + std::cerr << "Rethrowing exception... " << std::endl;; + throw; + } + } + /// Primary method to deallocate a buffer with CPPuddle:Marks an buffer as / + /// unused and fit for reusage. The method figures out the correct buffer + /// manager and marks the buffer there. Should be called from an allocator + /// implementation, not directly + template + static void mark_unused(T *p, size_t number_elements, + std::optional location_hint = std::nullopt, + std::optional device_id = std::nullopt) { + try { + return buffer_manager::mark_unused(p, number_elements, + location_hint, device_id); + } catch (const std::exception &exc) { + std::cerr << "ERROR: Encountered unhandled exception in cppuddle mark_unused: " << exc.what() << std::endl; + std::cerr << "Rethrowing exception... " << std::endl;; + throw; + } + } +#endif + /// Register all CPPuddle counters as HPX performance counters + template + static void register_allocator_counters_with_hpx(void) { +#ifdef CPPUDDLE_HAVE_COUNTERS + buffer_manager::register_counters_with_hpx(); +#else + std::cerr << "Warning: Trying to register allocator performance counters " + "with HPX but CPPuddle was built " + "without CPPUDDLE_WITH_COUNTERS -- operation will be ignored!" 
+ << std::endl; +#endif + } + + /// Deallocate all buffers, no matter whether they are marked as used or not + static void clean_all() { + std::lock_guard guard(instance().callback_protection_mut); + for (const auto &clean_function : + instance().total_cleanup_callbacks) { + clean_function(); + } + } + /// Deallocated all currently unused buffer + static void clean_unused_buffers() { + std::lock_guard guard(instance().callback_protection_mut); + for (const auto &clean_function : + instance().partial_cleanup_callbacks) { + clean_function(); + } + } + /// Deallocate all buffers, no matter whether they are marked as used or not + static void finalize() { + std::lock_guard guard(instance().callback_protection_mut); + for (const auto &finalize_function : + instance().finalize_callbacks) { + finalize_function(); + } + } + + static void print_performance_counters() { +#ifdef CPPUDDLE_HAVE_COUNTERS + std::lock_guard guard(instance().callback_protection_mut); + for (const auto &print_function : + instance().print_callbacks) { + print_function(); + } +#else + std::cerr << "Warning: Trying to print allocator performance counters but CPPuddle was built " + "without CPPUDDLE_WITH_COUNTERS -- operation will be ignored!" + << std::endl; +#endif + } + + // Member variables and methods +private: + + /// Singleton instance access + static buffer_interface& instance() { + static buffer_interface singleton{}; + return singleton; + } + /// Callbacks for printing the performance counter data + std::list> print_callbacks; + /// Callbacks for buffer_manager finalize - each callback completely destroys + /// one buffer_manager + std::list> finalize_callbacks; + /// Callbacks for buffer_manager cleanups - each callback destroys all buffers within + /// one buffer_manager, both used and unsued + std::list> total_cleanup_callbacks; + /// Callbacks for partial buffer_manager cleanups - each callback deallocates + /// all unused buffers of a manager + std::list> partial_cleanup_callbacks; + /// default, private constructor - not automatically constructed due to the + /// deleted constructors + buffer_interface() = default; + + mutex_t callback_protection_mut; + /// Add a callback function that gets executed upon cleanup and destruction + static void add_total_cleanup_callback(const std::function &func) { + std::lock_guard guard(instance().callback_protection_mut); + instance().total_cleanup_callbacks.push_back(func); + } + /// Add a callback function that gets executed upon partial (unused memory) + /// cleanup + static void add_partial_cleanup_callback(const std::function &func) { + std::lock_guard guard(instance().callback_protection_mut); + instance().partial_cleanup_callbacks.push_back(func); + } + /// Add a callback function that gets executed upon partial (unused memory) + /// cleanup + static void add_finalize_callback(const std::function &func) { + std::lock_guard guard(instance().callback_protection_mut); + instance().finalize_callbacks.push_back(func); + } + /// Add a callback function that gets executed upon partial (unused memory) + /// cleanup + static void add_print_callback(const std::function &func) { + std::lock_guard guard(instance().callback_protection_mut); + instance().print_callbacks.push_back(func); + } + +public: + ~buffer_interface() = default; + + // Subclasses +private: + /// Memory Manager subclass to handle buffers a specific type + template class buffer_manager { + private: + // Tuple content: Pointer to buffer, buffer_size, location ID, Flag + // The flag at the end controls whether to 
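If HPX performance counters are wanted in addition to the printed statistics, registration goes through the static interface shown above, once per combination of value type and underlying allocator. The double/std::allocator pair below is an illustrative choice, and a public convenience wrapper, if one exists, is not part of this hunk:

#include <memory>
#include "cppuddle/memory_recycling/detail/buffer_management.hpp"

void register_cppuddle_counters() {
  // Requires CPPUDDLE_WITH_COUNTERS=ON and a running HPX runtime
  // (e.g. call this from hpx_main)
  cppuddle::memory_recycling::detail::buffer_interface::
      register_allocator_counters_with_hpx<double, std::allocator<double>>();
}

Once registered, the counters are installed under /cppuddle/allocators/<allocator>_<type>/ (see the install_counter_type calls further below) and can be read with the usual HPX tooling, for example via --hpx:print-counter.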
buffer content is to be reused as + // well + using buffer_entry_type = std::tuple; + + + public: + /// Cleanup and delete this singleton + static void clean() { + assert(instance() && !is_finalized); + for (auto i = 0; i < number_instances * max_number_gpus; i++) { + std::lock_guard guard(instance()[i].mut); + instance()[i].clean_all_buffers(); + } + } + static void print_performance_counters() { + assert(instance() && !is_finalized); + for (auto i = 0; i < number_instances * max_number_gpus; i++) { + std::lock_guard guard(instance()[i].mut); + instance()[i].print_counters(); + } + } + static void finalize() { + assert(instance() && !is_finalized); + is_finalized = true; + for (auto i = 0; i < number_instances * max_number_gpus; i++) { + std::lock_guard guard(instance()[i].mut); + instance()[i].clean_all_buffers(); + } + instance().reset(); + } + /// Cleanup all buffers not currently in use + static void clean_unused_buffers_only() { + assert(instance() && !is_finalized); + for (auto i = 0; i < number_instances * max_number_gpus; i++) { + std::lock_guard guard(instance()[i].mut); + for (auto &buffer_tuple : instance()[i].unused_buffer_list) { + Host_Allocator alloc; + if (std::get<3>(buffer_tuple)) { + std::destroy_n(std::get<0>(buffer_tuple), std::get<1>(buffer_tuple)); + } + alloc.deallocate(std::get<0>(buffer_tuple), std::get<1>(buffer_tuple)); + } + instance()[i].unused_buffer_list.clear(); + } + } +#if defined(CPPUDDLE_HAVE_COUNTERS) && defined(CPPUDDLE_HAVE_HPX) + static size_t get_sum_number_recycling(bool reset) { + if (reset) + sum_number_recycling = 0; + return sum_number_recycling; + } + static size_t get_sum_number_allocation(bool reset) { + if (reset) + sum_number_allocation = 0; + return sum_number_allocation; + } + static size_t get_sum_number_creation(bool reset) { + if (reset) + sum_number_creation = 0; + return sum_number_creation; + } + static size_t get_sum_number_deallocation(bool reset) { + if (reset) + sum_number_deallocation = 0; + return sum_number_deallocation; + } + static size_t get_sum_number_wrong_hints(bool reset) { + if (reset) + sum_number_wrong_hints = 0; + return sum_number_wrong_hints; + } + static size_t get_sum_number_wrong_device_hints(bool reset) { + if (reset) + sum_number_wrong_hints = 0; + return sum_number_wrong_device_hints; + } + static size_t get_sum_number_bad_allocs(bool reset) { + if (reset) + sum_number_bad_allocs = 0; + return sum_number_bad_allocs; + } + + static void register_counters_with_hpx(void) { + std::string alloc_name = + boost::core::demangle(typeid(Host_Allocator).name()) + + std::string("_") + boost::core::demangle(typeid(T).name()); + hpx::performance_counters::install_counter_type( + std::string("/cppuddle/allocators/") + alloc_name + std::string("/number_recycling/"), + &get_sum_number_recycling, + "Number of allocations using a recycled buffer with this " + "allocator"); + hpx::performance_counters::install_counter_type( + std::string("/cppuddle/allocators/") + alloc_name + std::string("/number_allocations/"), + &get_sum_number_allocation, + "Number of allocations with this allocator"); + hpx::performance_counters::install_counter_type( + std::string("/cppuddle/allocators/") + alloc_name + std::string("/number_creations/"), + &get_sum_number_creation, + "Number of allocations not using a recycled buffer with this " + "allocator"); + hpx::performance_counters::install_counter_type( + std::string("/cppuddle/allocators/") + alloc_name + std::string("/number_deallocations/"), + &get_sum_number_deallocation, + "Number of 
deallocations yielding buffers to be recycled with this " + "allocator"); + hpx::performance_counters::install_counter_type( + std::string("/cppuddle/allocators/") + alloc_name + std::string("/number_wrong_hints/"), + &get_sum_number_wrong_hints, + "Number of wrong hints supplied to the dealloc method with this allocator"); + hpx::performance_counters::install_counter_type( + std::string("/cppuddle/allocators/") + alloc_name + std::string("/number_wrong_device_hints/"), + &get_sum_number_wrong_device_hints, + "Number of wrong device hints supplied to the dealloc method with this allocator"); + hpx::performance_counters::install_counter_type( + std::string("/cppuddle/allocators/") + alloc_name + std::string("/number_bad_allocs/"), + &get_sum_number_bad_allocs, + "Number of wrong bad allocs which triggered a cleanup of unused buffers"); + } +#endif + + /// Tries to recycle or create a buffer of type T and size number_elements. + static T *get(size_t number_of_elements, bool manage_content_lifetime, + std::optional location_hint = std::nullopt, + std::optional gpu_device_id = std::nullopt) { + init_callbacks_once(); + if (is_finalized) { + throw std::runtime_error("Tried allocation after finalization"); + } + assert(instance() && !is_finalized); + + size_t location_id = 0; + if (location_hint) { + location_id = *location_hint; + } + if (location_id >= number_instances) { + throw std::runtime_error("Tried to create buffer with invalid location_id [get]"); + } + size_t device_id = 0; + if (gpu_device_id) { + device_id = *gpu_device_id; + } + if (device_id >= max_number_gpus) { + throw std::runtime_error("Tried to create buffer with invalid device id [get]! " + "Is multigpu support enabled with the correct number " + "of GPUs?"); + } + + location_id = location_id + device_id * number_instances; + std::lock_guard guard(instance()[location_id].mut); + + +#ifdef CPPUDDLE_HAVE_COUNTERS + instance()[location_id].number_allocation++; + sum_number_allocation++; +#endif + // Check for unused buffers we can recycle: + for (auto iter = instance()[location_id].unused_buffer_list.begin(); + iter != instance()[location_id].unused_buffer_list.end(); iter++) { + auto tuple = *iter; + if (std::get<1>(tuple) == number_of_elements) { + instance()[location_id].unused_buffer_list.erase(iter); + + // handle the switch from aggressive to non aggressive reusage (or + // vice-versa) + if (manage_content_lifetime && !std::get<3>(tuple)) { + std::uninitialized_value_construct_n(std::get<0>(tuple), + number_of_elements); + std::get<3>(tuple) = true; + } else if (!manage_content_lifetime && std::get<3>(tuple)) { + std::destroy_n(std::get<0>(tuple), std::get<1>(tuple)); + std::get<3>(tuple) = false; + } + instance()[location_id].buffer_map.insert({std::get<0>(tuple), tuple}); +#ifdef CPPUDDLE_HAVE_COUNTERS + instance()[location_id].number_recycling++; + sum_number_recycling++; +#endif + return std::get<0>(tuple); + } + } + + // No unused buffer found -> Create new one and return it + try { + cppuddle::memory_recycling::device_selection::select_device_functor< + T, Host_Allocator>{}(device_id); + Host_Allocator alloc; + T *buffer = alloc.allocate(number_of_elements); + instance()[location_id].buffer_map.insert( + {buffer, std::make_tuple(buffer, number_of_elements, 1, + manage_content_lifetime)}); +#ifdef CPPUDDLE_HAVE_COUNTERS + instance()[location_id].number_creation++; + sum_number_creation++; +#endif + if (manage_content_lifetime) { + std::uninitialized_value_construct_n(buffer, number_of_elements); + } + return buffer; 
+ } catch (std::bad_alloc &e) { + // not enough memory left! Cleanup and attempt again: + std::cerr + << "Not enough memory left. Cleaning up unused buffers now..." + << std::endl; + buffer_interface::clean_unused_buffers(); + std::cerr << "Buffers cleaned! Try allocation again..." << std::endl; + + // If there still isn't enough memory left, the caller has to handle it + // We've done all we can in here + Host_Allocator alloc; + cppuddle::memory_recycling::device_selection::select_device_functor< + T, Host_Allocator>{}(device_id); + T *buffer = alloc.allocate(number_of_elements); + instance()[location_id].buffer_map.insert( + {buffer, std::make_tuple(buffer, number_of_elements, 1, + manage_content_lifetime)}); +#ifdef CPPUDDLE_HAVE_COUNTERS + instance()[location_id].number_creation++; + sum_number_creation++; + instance()[location_id].number_bad_alloc++; + sum_number_bad_allocs++; +#endif + std::cerr << "Second attempt allocation successful!" << std::endl; + if (manage_content_lifetime) { + std::uninitialized_value_construct_n(buffer, number_of_elements); + } + return buffer; + } + } + + static void mark_unused(T *memory_location, size_t number_of_elements, + std::optional location_hint = std::nullopt, + std::optional device_hint = std::nullopt) { + if (is_finalized) + return; + assert(instance() && !is_finalized); + + size_t location_id = 0; + if (location_hint) { + location_id = *location_hint; + if (location_id >= number_instances) { + throw std::runtime_error( + "Buffer recylcer received invalid location hint [mark_unused]"); + } + } + size_t device_id = 0; + if (device_hint) { + device_id = *device_hint; + if (device_id >= max_number_gpus) { + throw std::runtime_error( + "Buffer recylcer received invalid devce hint [mark_unused]"); + } + } + + // Attempt 1 to find the correct bucket/location: Look at provided hint: + if (location_hint) { + size_t location_id = location_hint.value() + device_id * number_instances; + std::lock_guard guard(instance()[location_id].mut); + if (instance()[location_id].buffer_map.find(memory_location) != + instance()[location_id].buffer_map.end()) { +#ifdef CPPUDDLE_HAVE_COUNTERS + instance()[location_id].number_deallocation++; + sum_number_deallocation++; +#endif + auto it = instance()[location_id].buffer_map.find(memory_location); + assert(it != instance()[location_id].buffer_map.end()); + auto &tuple = it->second; + // sanity checks: + assert(std::get<1>(tuple) == number_of_elements); + // move to the unused_buffer list + instance()[location_id].unused_buffer_list.push_front(tuple); + instance()[location_id].buffer_map.erase(memory_location); + return; // Success + } + // hint was wrong +#ifdef CPPUDDLE_HAVE_COUNTERS + instance()[location_id].number_wrong_hints++; + sum_number_wrong_hints++; +#endif + } + // Failed to find buffer in the specified localtion/device! + // Attempt 2 - Look for buffer other locations on the same device... 
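// (Why the fallback search is needed: the hint passed in here is the
// bucket recorded when the recycling allocator was created, while the
// buffer was filed under the bucket of the worker thread that actually
// ran the allocation; with HPX task stealing the two can differ. The
// lookup above then misses, number_wrong_hints is incremented, and the
// loops below scan the remaining locations of this device and, failing
// that, the other devices.)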
+ for (size_t location_id = device_id * number_instances; + location_id < (device_id + 1) * number_instances; location_id++) { + if (location_hint) { + if (*location_hint + device_id * max_number_gpus == location_id) { + continue; // already tried this -> skip + } + } + std::lock_guard guard(instance()[location_id].mut); + if (instance()[location_id].buffer_map.find(memory_location) != + instance()[location_id].buffer_map.end()) { +#ifdef CPPUDDLE_HAVE_COUNTERS + instance()[location_id].number_deallocation++; + sum_number_deallocation++; +#endif + auto it = instance()[location_id].buffer_map.find(memory_location); + assert(it != instance()[location_id].buffer_map.end()); + auto &tuple = it->second; + // sanity checks: + assert(std::get<1>(tuple) == number_of_elements); + // move to the unused_buffer list + instance()[location_id].unused_buffer_list.push_front(tuple); + instance()[location_id].buffer_map.erase(memory_location); + return; // Success + } + } + // device hint was wrong +#ifdef CPPUDDLE_HAVE_COUNTERS + if (device_hint) { + sum_number_wrong_device_hints++; + } +#endif + // Failed to find buffer on the specified device! + // Attempt 3 - Look for buffer on other devices... + for (size_t local_device_id = 0; local_device_id < max_number_gpus; + local_device_id++) { + if (local_device_id == device_id) + continue; // aldready tried this device + + // Try hint localtion first yet again (though on different device) + if (location_hint) { + size_t location_id = location_hint.value() + local_device_id * number_instances; + std::lock_guard guard(instance()[location_id].mut); + if (instance()[location_id].buffer_map.find(memory_location) != + instance()[location_id].buffer_map.end()) { +#ifdef CPPUDDLE_HAVE_COUNTERS + instance()[location_id].number_deallocation++; + sum_number_deallocation++; +#endif + auto it = instance()[location_id].buffer_map.find(memory_location); + assert(it != instance()[location_id].buffer_map.end()); + auto &tuple = it->second; + // sanity checks: + assert(std::get<1>(tuple) == number_of_elements); + // move to the unused_buffer list + instance()[location_id].unused_buffer_list.push_front(tuple); + instance()[location_id].buffer_map.erase(memory_location); + return; // Success + } + } + // Failed - check all other localtions on device + for (size_t location_id = local_device_id * number_instances; + location_id < (local_device_id + 1) * number_instances; location_id++) { + if (location_hint) { + if (*location_hint + local_device_id * max_number_gpus == location_id) { + continue; // already tried this -> skip + } + } + std::lock_guard guard(instance()[location_id].mut); + if (instance()[location_id].buffer_map.find(memory_location) != + instance()[location_id].buffer_map.end()) { +#ifdef CPPUDDLE_HAVE_COUNTERS + instance()[location_id].number_deallocation++; + sum_number_deallocation++; +#endif + auto it = instance()[location_id].buffer_map.find(memory_location); + assert(it != instance()[location_id].buffer_map.end()); + auto &tuple = it->second; + // sanity checks: + assert(std::get<1>(tuple) == number_of_elements); + // move to the unused_buffer list + instance()[location_id].unused_buffer_list.push_front(tuple); + instance()[location_id].buffer_map.erase(memory_location); + return; // Success + } + } + } + // Buffer that is to be deleted is nowhere to be found - we looked everywhere! + // => + // Failure! Handle here... 
+ + // TODO Throw exception instead in the futures, as soon as the recycler finalize is + // in all user codes + /* throw std::runtime_error("Tried to delete non-existing buffer"); */ + + // This is odd: Print warning -- however, might also happen with static + // buffers using these allocators IF the new finalize was not called. For + // now, print warning until all user-code is upgraded to the finalize method. + // This allows using current versions of cppuddle with older application code + std::cerr + << "Warning! Tried to delete non-existing buffer within CPPuddle!" + << std::endl; + std::cerr << "Did you forget to call recycler::finalize?" << std::endl; + } + + private: + /// List with all buffers still in usage + std::unordered_map buffer_map{}; + /// List with all buffers currently not used + std::list unused_buffer_list{}; + /// Access control + mutex_t mut; +#ifdef CPPUDDLE_HAVE_COUNTERS + /// Performance counters + size_t number_allocation{0}, number_deallocation{0}, number_wrong_hints{0}, + number_recycling{0}, number_creation{0}, number_bad_alloc{0}; + + static inline std::atomic sum_number_allocation{0}, + sum_number_deallocation{0}, sum_number_wrong_hints{0}, + sum_number_wrong_device_hints{0}, sum_number_recycling{0}, + sum_number_creation{0}, sum_number_bad_allocs{0}; +#endif + /// default, private constructor - not automatically constructed due to + /// the deleted constructors + buffer_manager() = default; + buffer_manager& + operator=(buffer_manager const &other) = default; + buffer_manager& + operator=(buffer_manager &&other) = delete; + static std::unique_ptr& instance(void) { + static std::unique_ptr instances{ + new buffer_manager[number_instances * max_number_gpus]}; + return instances; + } + static void init_callbacks_once(void) { + assert(instance()); +#if defined(CPPUDDLE_HAVE_HPX) && defined(CPPUDDLE_HAVE_HPX_MUTEX) + static hpx::once_flag flag; + hpx::call_once(flag, []() { +#else + static std::once_flag flag; + std::call_once(flag, []() { +#endif + is_finalized = false; + buffer_interface::add_total_cleanup_callback(clean); + buffer_interface::add_partial_cleanup_callback( + clean_unused_buffers_only); + buffer_interface::add_finalize_callback( + finalize); +#ifdef CPPUDDLE_HAVE_COUNTERS + buffer_interface::add_print_callback( + print_performance_counters); +#endif + }); + } + static inline std::atomic is_finalized; + +#ifdef CPPUDDLE_HAVE_COUNTERS + void print_counters(void) { + if (number_allocation == 0) + return; + // Print performance counters + size_t number_cleaned = unused_buffer_list.size() + buffer_map.size(); + std::cout << "\nBuffer manager destructor for (Alloc: " + << boost::core::demangle(typeid(Host_Allocator).name()) << ", Type: " + << boost::core::demangle(typeid(T).name()) + << "):" << std::endl + << "--------------------------------------------------------------------" + << std::endl + << "--> Number of bad_allocs that triggered garbage " + "collection: " + << number_bad_alloc << std::endl + << "--> Number of buffers that got requested from this " + "manager: " + << number_allocation << std::endl + << "--> Number of times an unused buffer got recycled for a " + "request: " + << number_recycling << std::endl + << "--> Number of times a new buffer had to be created for a " + "request: " + << number_creation << std::endl + << "--> Number cleaned up buffers: " + " " + << number_cleaned << std::endl + << "--> Number wrong deallocation hints: " + " " + << number_wrong_hints << std::endl + << "--> Number of buffers that were marked as used upon " 
+ "cleanup: " + << buffer_map.size() << std::endl + << "==> Recycle rate: " + " " + << static_cast(number_recycling) / number_allocation * + 100.0f + << "%" << std::endl; + } +#endif + + void clean_all_buffers(void) { +#ifdef CPPUDDLE_HAVE_COUNTERS + if (number_allocation == 0 && number_recycling == 0 && + number_bad_alloc == 0 && number_creation == 0 && + unused_buffer_list.empty() && buffer_map.empty()) { + return; + } +#endif + for (auto &buffer_tuple : unused_buffer_list) { + Host_Allocator alloc; + if (std::get<3>(buffer_tuple)) { + std::destroy_n(std::get<0>(buffer_tuple), std::get<1>(buffer_tuple)); + } + alloc.deallocate(std::get<0>(buffer_tuple), std::get<1>(buffer_tuple)); + } + for (auto &map_tuple : buffer_map) { + auto buffer_tuple = map_tuple.second; + Host_Allocator alloc; + if (std::get<3>(buffer_tuple)) { + std::destroy_n(std::get<0>(buffer_tuple), std::get<1>(buffer_tuple)); + } + alloc.deallocate(std::get<0>(buffer_tuple), std::get<1>(buffer_tuple)); + } + unused_buffer_list.clear(); + buffer_map.clear(); +#ifdef CPPUDDLE_HAVE_COUNTERS + number_allocation = 0; + number_recycling = 0; + number_bad_alloc = 0; + number_creation = 0; + number_wrong_hints = 0; +#endif + } + public: + ~buffer_manager() { + clean_all_buffers(); + } + + public: // Putting deleted constructors in public gives more useful error + // messages + // Bunch of constructors we don't need + buffer_manager( + buffer_manager const &other) = delete; + buffer_manager( + buffer_manager &&other) = delete; + }; + +public: + // Putting deleted constructors in public gives more useful error messages + // Bunch of constructors we don't need + buffer_interface(buffer_interface const &other) = delete; + buffer_interface& operator=(buffer_interface const &other) = delete; + buffer_interface(buffer_interface &&other) = delete; + buffer_interface& operator=(buffer_interface &&other) = delete; +}; + +template struct recycle_allocator { + using value_type = T; + using underlying_allocator_type = Host_Allocator; + static_assert(std::is_same_v); + const std::optional dealloc_hint; + const std::optional device_id; + +#ifndef CPPUDDLE_HAVE_HPX_AWARE_ALLOCATORS + recycle_allocator() noexcept + : dealloc_hint(std::nullopt), device_id(std::nullopt) {} + explicit recycle_allocator(size_t hint) noexcept + : dealloc_hint(std::nullopt), device_id(std::nullopt) {} + explicit recycle_allocator( + recycle_allocator const &other) noexcept + : dealloc_hint(std::nullopt), device_id(std::nullopt) {} + T *allocate(std::size_t n) { + T *data = buffer_interface::get(n); + return data; + } + void deallocate(T *p, std::size_t n) { + buffer_interface::mark_unused(p, n); + } +#else + recycle_allocator() noexcept + : dealloc_hint(hpx::get_worker_thread_num() % number_instances), device_id(0) {} + explicit recycle_allocator(const size_t device_id) noexcept + : dealloc_hint(hpx::get_worker_thread_num() % number_instances), device_id(device_id) {} + explicit recycle_allocator(const size_t device_i, const size_t location_id) noexcept + : dealloc_hint(location_id), device_id(device_id) {} + explicit recycle_allocator( + recycle_allocator const &other) noexcept + : dealloc_hint(other.dealloc_hint), device_id(other.device_id) {} + T *allocate(std::size_t n) { + T *data = buffer_interface::get( + n, false, hpx::get_worker_thread_num() % number_instances, device_id); + return data; + } + void deallocate(T *p, std::size_t n) { + buffer_interface::mark_unused(p, n, dealloc_hint, + device_id); + } +#endif + + template + inline void construct(T *p, Args... 
args) noexcept { + ::new (static_cast(p)) T(std::forward(args)...); + } + void destroy(T *p) { p->~T(); } +}; +template +constexpr bool +operator==(recycle_allocator const &, + recycle_allocator const &) noexcept { + if constexpr (std::is_same_v) + return true; + else + return false; +} +template +constexpr bool +operator!=(recycle_allocator const &, + recycle_allocator const &) noexcept { + if constexpr (std::is_same_v) + return false; + else + return true; +} + +/// Recycles not only allocations but also the contents of a buffer +template +struct aggressive_recycle_allocator { + using value_type = T; + using underlying_allocator_type = Host_Allocator; + static_assert(std::is_same_v); + const std::optional dealloc_hint; + const std::optional device_id; + +#ifndef CPPUDDLE_HAVE_HPX_AWARE_ALLOCATORS + aggressive_recycle_allocator() noexcept + : dealloc_hint(std::nullopt), device_id(std::nullopt) {} + explicit aggressive_recycle_allocator(size_t hint) noexcept + : dealloc_hint(std::nullopt), device_id(std::nullopt) {} + explicit aggressive_recycle_allocator( + aggressive_recycle_allocator const &) noexcept + : dealloc_hint(std::nullopt), device_id(std::nullopt) {} + T *allocate(std::size_t n) { + T *data = buffer_interface::get( + n, true); // also initializes the buffer if it isn't reused + return data; + } + void deallocate(T *p, std::size_t n) { + buffer_interface::mark_unused(p, n); + } +#else + aggressive_recycle_allocator() noexcept + : dealloc_hint(hpx::get_worker_thread_num() % number_instances), device_id(0) {} + explicit aggressive_recycle_allocator(const size_t device_id) noexcept + : dealloc_hint(hpx::get_worker_thread_num() % number_instances), device_id(device_id) {} + explicit aggressive_recycle_allocator(const size_t device_id, const size_t location_id) noexcept + : dealloc_hint(location_id), device_id(device_id) {} + explicit aggressive_recycle_allocator( + recycle_allocator const &other) noexcept + : dealloc_hint(other.dealloc_hint), device_id(other.device_id) {} + T *allocate(std::size_t n) { + T *data = buffer_interface::get( + n, true, dealloc_hint, device_id); // also initializes the buffer + // if it isn't reused + return data; + } + void deallocate(T *p, std::size_t n) { + buffer_interface::mark_unused(p, n, dealloc_hint, + device_id); + } +#endif + +#ifndef CPPUDDLE_DEACTIVATE_AGGRESSIVE_ALLOCATORS + template + inline void construct(T *p, Args... args) noexcept { + // Do nothing here - we reuse the content of the last owner + } + void destroy(T *p) { + // Do nothing here - Contents will be destroyed when the buffer manager is + // destroyed, not before + } +#else +// Warn about suboptimal performance without recycling +#pragma message \ +"Warning: Building without content reusage for aggressive allocators! \ +For better performance configure with CPPUDDLE_WITH_AGGRESSIVE_CONTENT_RECYCLING=ON !" + template + inline void construct(T *p, Args... 
args) noexcept { + ::new (static_cast(p)) T(std::forward(args)...); + } + void destroy(T *p) { p->~T(); } +#endif +}; + +template +constexpr bool +operator==(aggressive_recycle_allocator const &, + aggressive_recycle_allocator const &) noexcept { + if constexpr (std::is_same_v) + return true; + else + return false; +} +template +constexpr bool +operator!=(aggressive_recycle_allocator const &, + aggressive_recycle_allocator const &) noexcept { + if constexpr (std::is_same_v) + return false; + else + return true; +} +} // namespace detail +} // namespace memory_recycling +} // end namespace cppuddle + +#endif diff --git a/include/cppuddle/memory_recycling/detail/cuda_underlying_allocators.hpp b/include/cppuddle/memory_recycling/detail/cuda_underlying_allocators.hpp new file mode 100644 index 00000000..ab1f8681 --- /dev/null +++ b/include/cppuddle/memory_recycling/detail/cuda_underlying_allocators.hpp @@ -0,0 +1,101 @@ +// Copyright (c) 2020-2024 Gregor Daiß +// +// Distributed under the Boost Software License, Version 1.0. (See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#ifndef CUDA_UNDERLYING_ALLOCATORS_HPP +#define CUDA_UNDERLYING_ALLOCATORS_HPP + +#include +#include +#include + +namespace cppuddle { +namespace memory_recycling { +namespace detail { +/// Underlying host allocator for CUDA pinned memory +template struct cuda_pinned_allocator { + using value_type = T; + cuda_pinned_allocator() noexcept = default; + template + explicit cuda_pinned_allocator(cuda_pinned_allocator const &) noexcept {} + T *allocate(std::size_t n) { + T *data; + cudaError_t error = + cudaMallocHost(reinterpret_cast(&data), n * sizeof(T)); + if (error != cudaSuccess) { + std::string msg = + std::string( + "cuda_pinned_allocator failed due to cudaMallocHost failure : ") + + std::string(cudaGetErrorString(error)); + throw std::runtime_error(msg); + } + return data; + } + void deallocate(T *p, std::size_t n) { + cudaError_t error = cudaFreeHost(p); + if (error != cudaSuccess) { + std::string msg = + std::string( + "cuda_pinned_allocator failed due to cudaFreeHost failure : ") + + std::string(cudaGetErrorString(error)); + throw std::runtime_error(msg); + } + } +}; + +template +constexpr bool operator==(cuda_pinned_allocator const &, + cuda_pinned_allocator const &) noexcept { + return true; +} +template +constexpr bool operator!=(cuda_pinned_allocator const &, + cuda_pinned_allocator const &) noexcept { + return false; +} + +/// Underlying allocator for CUDA device memory +template struct cuda_device_allocator { + using value_type = T; + cuda_device_allocator() noexcept = default; + template + explicit cuda_device_allocator(cuda_device_allocator const &) noexcept {} + T *allocate(std::size_t n) { + T *data; + cudaError_t error = cudaMalloc(&data, n * sizeof(T)); + if (error != cudaSuccess) { + std::string msg = + std::string( + "cuda_device_allocator failed due to cudaMalloc failure : ") + + std::string(cudaGetErrorString(error)); + throw std::runtime_error(msg); + } + return data; + } + void deallocate(T *p, std::size_t n) { + cudaError_t error = cudaFree(p); + if (error != cudaSuccess) { + std::string msg = + std::string( + "cuda_device_allocator failed due to cudaFree failure : ") + + std::string(cudaGetErrorString(error)); + throw std::runtime_error(msg); + } + } +}; +template +constexpr bool operator==(cuda_device_allocator const &, + cuda_device_allocator const &) noexcept { + return true; +} +template +constexpr bool operator!=(cuda_device_allocator const &, + 
cuda_device_allocator const &) noexcept { + return false; +} +} // end namespace detail +} // namespace memory_recycling +} // end namespace cppuddle + +#endif diff --git a/include/cppuddle/memory_recycling/detail/hip_underlying_allocators.hpp b/include/cppuddle/memory_recycling/detail/hip_underlying_allocators.hpp new file mode 100644 index 00000000..bfd7c2e1 --- /dev/null +++ b/include/cppuddle/memory_recycling/detail/hip_underlying_allocators.hpp @@ -0,0 +1,107 @@ +// Copyright (c) 2021-2024 Gregor Daiß +// +// Distributed under the Boost Software License, Version 1.0. (See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#ifndef HIP_UNDERLYING_ALLOCATORS_HPP +#define HIP_UNDERLYING_ALLOCATORS_HPP + +#include +#include +#include + +namespace cppuddle { +namespace memory_recycling { +namespace detail { +/// Underlying host allocator for HIP pinned memory +template struct hip_pinned_allocator { + using value_type = T; + hip_pinned_allocator() noexcept = default; + template + explicit hip_pinned_allocator(hip_pinned_allocator const &) noexcept {} + T *allocate(std::size_t n) { + T *data; + // hipError_t error = + // hipMallocHost(reinterpret_cast(&data), n * sizeof(T)); + + // Even though marked as deprecated, the HIP docs recommend using hipHostMalloc + // (not hipMallocHost) for async memcpys + // https://rocmdocs.amd.com/en/latest/ROCm_API_References/HIP_API/Memory-Management.html#hipmemcpyasync + hipError_t error = + hipHostMalloc(reinterpret_cast(&data), n * sizeof(T)); + if (error != hipSuccess) { + std::string msg = + std::string( + "hip_pinned_allocator failed due to hipMallocHost failure : ") + + std::string(hipGetErrorString(error)); + throw std::runtime_error(msg); + } + return data; + } + void deallocate(T *p, std::size_t n) { + hipError_t error = hipHostFree(p); + if (error != hipSuccess) { + std::string msg = + std::string( + "hip_pinned_allocator failed due to hipFreeHost failure : ") + + std::string(hipGetErrorString(error)); + throw std::runtime_error(msg); + } + } +}; +template +constexpr bool operator==(hip_pinned_allocator const &, + hip_pinned_allocator const &) noexcept { + return true; +} +template +constexpr bool operator!=(hip_pinned_allocator const &, + hip_pinned_allocator const &) noexcept { + return false; +} + +/// Underlying allocator for HIP device memory +template struct hip_device_allocator { + using value_type = T; + hip_device_allocator() noexcept = default; + template + explicit hip_device_allocator(hip_device_allocator const &) noexcept {} + T *allocate(std::size_t n) { + T *data; + hipError_t error = hipMalloc(&data, n * sizeof(T)); + if (error != hipSuccess) { + std::string msg = + std::string( + "hip_device_allocator failed due to hipMalloc failure : ") + + std::string(hipGetErrorString(error)); + throw std::runtime_error(msg); + } + return data; + } + void deallocate(T *p, std::size_t n) { + hipError_t error = hipFree(p); + if (error != hipSuccess) { + std::string msg = + std::string( + "hip_device_allocator failed due to hipFree failure : ") + + std::string(hipGetErrorString(error)); + throw std::runtime_error(msg); + } + } +}; +template +constexpr bool operator==(hip_device_allocator const &, + hip_device_allocator const &) noexcept { + return true; +} +template +constexpr bool operator!=(hip_device_allocator const &, + hip_device_allocator const &) noexcept { + return false; +} + +} // end namespace detail +} // namespace memory_recycling +} // end namespace cppuddle + +#endif diff --git 
a/include/cppuddle/memory_recycling/detail/sycl_underlying_allocators.hpp b/include/cppuddle/memory_recycling/detail/sycl_underlying_allocators.hpp new file mode 100644 index 00000000..3e3c9173 --- /dev/null +++ b/include/cppuddle/memory_recycling/detail/sycl_underlying_allocators.hpp @@ -0,0 +1,74 @@ +// Copyright (c) 2023-2024 Gregor Daiß +// +// Distributed under the Boost Software License, Version 1.0. (See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#ifndef SYCL_UNDERLYING_ALLOCATORS_HPP +#define SYCL_UNDERLYING_ALLOCATORS_HPP + +#include +#include +#include + +namespace cppuddle { +namespace memory_recycling { +namespace detail { +/// Underlying host allocator for SYCL pinned memory (using the sycl::default_selector{}) +template struct sycl_host_default_allocator { + using value_type = T; + sycl_host_default_allocator() noexcept = default; + template + explicit sycl_host_default_allocator(sycl_host_default_allocator const &) noexcept {} + T *allocate(std::size_t n) { + static cl::sycl::queue default_queue(cl::sycl::default_selector{}); + T *data = cl::sycl::malloc_host(n, default_queue); + return data; + } + void deallocate(T *p, std::size_t n) { + static cl::sycl::queue default_queue(cl::sycl::default_selector{}); + cl::sycl::free(p, default_queue); + } +}; +template +constexpr bool operator==(sycl_host_default_allocator const &, + sycl_host_default_allocator const &) noexcept { + return true; +} +template +constexpr bool operator!=(sycl_host_default_allocator const &, + sycl_host_default_allocator const &) noexcept { + return false; +} + +/// Underlying allocator for SYCL device memory (using the sycl::default_selector{}) +template struct sycl_device_default_allocator { + using value_type = T; + sycl_device_default_allocator() noexcept = default; + template + explicit sycl_device_default_allocator(sycl_device_default_allocator const &) noexcept {} + T *allocate(std::size_t n) { + static cl::sycl::queue default_queue(cl::sycl::default_selector{}); + T *data = cl::sycl::malloc_device(n, default_queue); + return data; + } + void deallocate(T *p, std::size_t n) { + static cl::sycl::queue default_queue(cl::sycl::default_selector{}); + cl::sycl::free(p, default_queue); + } +}; +template +constexpr bool operator==(sycl_device_default_allocator const &, + sycl_device_default_allocator const &) noexcept { + return true; +} +template +constexpr bool operator!=(sycl_device_default_allocator const &, + sycl_device_default_allocator const &) noexcept { + return false; +} + +} // end namespace detail +} // namespace memory_recycling +} // end namespace cppuddle + +#endif diff --git a/include/cppuddle/memory_recycling/hip_recycling_allocators.hpp b/include/cppuddle/memory_recycling/hip_recycling_allocators.hpp new file mode 100644 index 00000000..13b5241b --- /dev/null +++ b/include/cppuddle/memory_recycling/hip_recycling_allocators.hpp @@ -0,0 +1,41 @@ +// Copyright (c) 2020-2024 Gregor Daiß +// +// Distributed under the Boost Software License, Version 1.0. 
(See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#ifndef HIP_RECYCLING_ALLOCATORS_HPP +#define HIP_RECYCLING_ALLOCATORS_HPP + +#include "buffer_management_interface.hpp" +// import hip_pinned_allocator and hip_device_allocator +#include "detail/hip_underlying_allocators.hpp" + +namespace cppuddle { +namespace memory_recycling { + +// Tell cppuddle how to select the device for the hip allocators +namespace device_selection { +/// GPU device selector using the HIP API for pinned host allocations +template +struct select_device_functor> { + void operator()(const size_t device_id) { hipSetDevice(device_id); } +}; +/// GPU selector using the HIP API for pinned host allocations +template +struct select_device_functor> { + void operator()(const size_t device_id) { hipSetDevice(device_id); } +}; +} // namespace device_selection + +/// Recycling allocator for HIP pinned host memory +template ::value, int> = 0> +using recycle_allocator_hip_host = + detail::aggressive_recycle_allocator>; +/// Recycling allocator for HIP device memory +template ::value, int> = 0> +using recycle_allocator_hip_device = + detail::recycle_allocator>; + +} // namespace memory_recycling +} // end namespace cppuddle +#endif diff --git a/include/cppuddle/memory_recycling/std_recycling_allocators.hpp b/include/cppuddle/memory_recycling/std_recycling_allocators.hpp new file mode 100644 index 00000000..21fd5c2c --- /dev/null +++ b/include/cppuddle/memory_recycling/std_recycling_allocators.hpp @@ -0,0 +1,35 @@ +// Copyright (c) 2024 Gregor Daiß +// +// Distributed under the Boost Software License, Version 1.0. (See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#ifndef STD_RECYCLING_ALLOCATORS_HPP +#define STD_RECYCLING_ALLOCATORS_HPP + +#include "buffer_management_interface.hpp" + +namespace cppuddle { +namespace memory_recycling { + +namespace device_selection { +/// Dummy GPU selector. Needs to be defined for MultiGPU builds as the default / +/// select_device_functor does not compile for > 1 GPU (to make sure all / +/// relevant allocators support multigpu) +template struct select_device_functor> { + void operator()(const size_t device_id) {} +}; +} // namespace device_selection + + +/// Recycling allocator for std memory +template ::value, int> = 0> +using recycle_std = detail::recycle_allocator>; +/// Recycling allocator for boost aligned memory (reusing previous content as well) +template ::value, int> = 0> +using aggressive_recycle_std = + detail::aggressive_recycle_allocator>; + +} // namespace memory_recycling +} // namespace cppuddle + +#endif diff --git a/include/cppuddle/memory_recycling/sycl_recycling_allocators.hpp b/include/cppuddle/memory_recycling/sycl_recycling_allocators.hpp new file mode 100644 index 00000000..fd494bca --- /dev/null +++ b/include/cppuddle/memory_recycling/sycl_recycling_allocators.hpp @@ -0,0 +1,31 @@ +// Copyright (c) 2020-2024 Gregor Daiß +// +// Distributed under the Boost Software License, Version 1.0. 
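// --------------------------------------------------------------------------
// Illustrative usage sketch (annotation, not part of the patch): how the
// recycle_std / aggressive_recycle_std allocators introduced above in
// std_recycling_allocators.hpp are meant to be used. The free-standing
// function and the buffer size are assumptions for illustration; the allocator
// and counter/cleanup names are taken from the new interface in this changeset.
#include <cstddef>
#include <vector>
#include "cppuddle/memory_recycling/std_recycling_allocators.hpp"

void recycling_example(std::size_t n) {
  {
    // First pass: a fresh buffer is created and handed to the vector.
    std::vector<double, cppuddle::memory_recycling::aggressive_recycle_std<double>>
        pass1(n, double{});
  } // Buffer is not freed here, only marked as unused inside the manager.
  {
    // Second pass with the same element type and size: the buffer (including
    // its previous contents, since this is the aggressive variant) is reused.
    std::vector<double, cppuddle::memory_recycling::aggressive_recycle_std<double>>
        pass2(n, double{});
  }
  cppuddle::memory_recycling::print_buffer_counters(); // allocation/recycle stats
  cppuddle::memory_recycling::force_buffer_cleanup();  // release all cached buffers
}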
(See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#ifndef SYCL_RECYCLING_ALLOCATORS_HPP +#define SYCL_RECYCLING_ALLOCATORS_HPP + +#include "buffer_management_interface.hpp" +#include "detail/sycl_underlying_allocators.hpp" + +namespace cppuddle { +namespace memory_recycling { + +namespace device_selection { +// No MutliGPU support yet, hence no select_device_function required +static_assert(max_number_gpus <= 1, "CPPuddle currently does not support MultiGPU SYCL builds!"); +} // namespace device_selection + +/// Recycling allocator for SYCL pinned host memory (default device) +template ::value, int> = 0> +using recycle_allocator_sycl_host = + detail::aggressive_recycle_allocator>; +/// Recycling allocator for SYCL device memory (default device) +template ::value, int> = 0> +using recycle_allocator_sycl_device = + detail::recycle_allocator>; + +} // namespace memory_recycling +} // end namespace cppuddle +#endif diff --git a/include/cppuddle/memory_recycling/util/cuda_recycling_device_buffer.hpp b/include/cppuddle/memory_recycling/util/cuda_recycling_device_buffer.hpp new file mode 100644 index 00000000..dbd7e4c8 --- /dev/null +++ b/include/cppuddle/memory_recycling/util/cuda_recycling_device_buffer.hpp @@ -0,0 +1,66 @@ +// Copyright (c) 2020-2024 Gregor Daiß +// +// Distributed under the Boost Software License, Version 1.0. (See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#ifndef CUDA_RECYCLING_BUFFER_HPP +#define CUDA_RECYCLING_BUFFER_HPP + +// import recycle_allocator_cuda_device +#include "cppuddle/memory_recycling/cuda_recycling_allocators.hpp"" + +namespace cppuddle { +namespace memory_recycling { + + +/// RAII wrapper for CUDA device memory +template ::value, int> = 0> +struct cuda_device_buffer { + recycle_allocator_cuda_device allocator; + T *device_side_buffer; + size_t number_of_elements; + + cuda_device_buffer(const size_t number_of_elements, const size_t device_id = 0) + : allocator{device_id}, number_of_elements(number_of_elements) { + assert(device_id < max_number_gpus); + device_side_buffer = + allocator.allocate(number_of_elements); + } + ~cuda_device_buffer() { + allocator.deallocate(device_side_buffer, number_of_elements); + } + // not yet implemented + cuda_device_buffer(cuda_device_buffer const &other) = delete; + cuda_device_buffer operator=(cuda_device_buffer const &other) = delete; + cuda_device_buffer(cuda_device_buffer const &&other) = delete; + cuda_device_buffer operator=(cuda_device_buffer const &&other) = delete; + +}; + +/// RAII wrapper for CUDA device memory using a passed aggregated allocator +template ::value, int> = 0> +struct cuda_aggregated_device_buffer { + T *device_side_buffer; + size_t number_of_elements; + cuda_aggregated_device_buffer(size_t number_of_elements, Host_Allocator &alloc) + : number_of_elements(number_of_elements), alloc(alloc) { + device_side_buffer = + alloc.allocate(number_of_elements); + } + ~cuda_aggregated_device_buffer() { + alloc.deallocate(device_side_buffer, number_of_elements); + } + // not yet implemented + cuda_aggregated_device_buffer(cuda_aggregated_device_buffer const &other) = delete; + cuda_aggregated_device_buffer operator=(cuda_aggregated_device_buffer const &other) = delete; + cuda_aggregated_device_buffer(cuda_aggregated_device_buffer const &&other) = delete; + cuda_aggregated_device_buffer operator=(cuda_aggregated_device_buffer const &&other) = delete; + +private: + Host_Allocator &alloc; // will stay valid for the entire 
aggregation region and hence + // for the entire lifetime of this buffer +}; + +} // namespace memory_recycling +} // end namespace cppuddle +#endif diff --git a/include/cppuddle/memory_recycling/util/hip_recycling_device_buffer.hpp b/include/cppuddle/memory_recycling/util/hip_recycling_device_buffer.hpp new file mode 100644 index 00000000..7f04e3f7 --- /dev/null +++ b/include/cppuddle/memory_recycling/util/hip_recycling_device_buffer.hpp @@ -0,0 +1,65 @@ +// Copyright (c) 2020-2024 Gregor Daiß +// +// Distributed under the Boost Software License, Version 1.0. (See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#ifndef HIP_RECYCLING_BUFFER_HPP +#define HIP_RECYCLING_BUFFER_HPP + +// import recycle_allocator_hip_device +#include "cppuddle/memory_recycling/hip_recycling_allocators.hpp" + +namespace cppuddle { +namespace memory_recycling { + +/// RAII wrapper for HIP device memory +template ::value, int> = 0> +struct hip_device_buffer { + recycle_allocator_hip_device allocator; + T *device_side_buffer; + size_t number_of_elements; + + hip_device_buffer(size_t number_of_elements, size_t device_id) + : allocator{device_id}, number_of_elements(number_of_elements) { + assert(device_id < max_number_gpus); + device_side_buffer = + allocator.allocate(number_of_elements); + } + ~hip_device_buffer() { + allocator.deallocate(device_side_buffer, number_of_elements); + } + // not yet implemented + hip_device_buffer(hip_device_buffer const &other) = delete; + hip_device_buffer operator=(hip_device_buffer const &other) = delete; + hip_device_buffer(hip_device_buffer const &&other) = delete; + hip_device_buffer operator=(hip_device_buffer const &&other) = delete; + +}; + +/// RAII wrapper for CUDA device memory using a passed aggregated allocator +template ::value, int> = 0> +struct hip_aggregated_device_buffer { + T *device_side_buffer; + size_t number_of_elements; + hip_aggregated_device_buffer(size_t number_of_elements, Host_Allocator &alloc) + : number_of_elements(number_of_elements), alloc(alloc) { + device_side_buffer = + alloc.allocate(number_of_elements); + } + ~hip_aggregated_device_buffer() { + alloc.deallocate(device_side_buffer, number_of_elements); + } + // not yet implemented + hip_aggregated_device_buffer(hip_aggregated_device_buffer const &other) = delete; + hip_aggregated_device_buffer operator=(hip_aggregated_device_buffer const &other) = delete; + hip_aggregated_device_buffer(hip_aggregated_device_buffer const &&other) = delete; + hip_aggregated_device_buffer operator=(hip_aggregated_device_buffer const &&other) = delete; + +private: + Host_Allocator &alloc; // will stay valid for the entire aggregation region and hence + // for the entire lifetime of this buffer +}; + +} // namespace memory_recycling +} // end namespace cppuddle +#endif diff --git a/include/cppuddle/memory_recycling/util/recycling_kokkos_view.hpp b/include/cppuddle/memory_recycling/util/recycling_kokkos_view.hpp new file mode 100644 index 00000000..b8ca526c --- /dev/null +++ b/include/cppuddle/memory_recycling/util/recycling_kokkos_view.hpp @@ -0,0 +1,171 @@ +// Copyright (c) 2020-2024 Gregor Daiß +// +// Distributed under the Boost Software License, Version 1.0. 
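// --------------------------------------------------------------------------
// Illustrative usage sketch (annotation, not part of the patch): combining the
// RAII device-buffer wrapper with the pinned-host recycling allocator, assuming
// a CUDA build. The HIP wrappers above follow the same pattern. The function
// name and buffer size are placeholders for illustration.
#include <cstddef>
#include <vector>
#include <cuda_runtime.h>
#include "cppuddle/memory_recycling/cuda_recycling_allocators.hpp"
#include "cppuddle/memory_recycling/util/cuda_recycling_device_buffer.hpp"

void upload_example(std::size_t n, std::size_t device_id) {
  using cppuddle::memory_recycling::recycle_allocator_cuda_host;
  using cppuddle::memory_recycling::cuda_device_buffer;

  // Pinned host staging buffer drawn from the recycling pool for device_id.
  std::vector<float, recycle_allocator_cuda_host<float>> host_buf(
      n, float{}, recycle_allocator_cuda_host<float>{device_id});
  // Device allocation; recycled (not freed) when this object goes out of scope.
  cuda_device_buffer<float> device_buf(n, device_id);

  cudaMemcpy(device_buf.device_side_buffer, host_buf.data(), n * sizeof(float),
             cudaMemcpyHostToDevice);
  // ... launch kernels using device_buf.device_side_buffer ...
}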
(See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#ifndef RECYCLING_KOKKOS_VIEW_HPP +#define RECYCLING_KOKKOS_VIEW_HPP +#include +#include +#include + +#include "cppuddle/memory_recycling/buffer_management_interface.hpp" + + +namespace cppuddle { +namespace memory_recycling { + + +template +struct view_deleter { + alloc_type allocator; + size_t total_elements; + view_deleter(alloc_type alloc, size_t total_elements) : allocator(alloc), + total_elements(total_elements) {} + void operator()(element_type* p) { + allocator.deallocate(p, total_elements); + } +}; + +template +class aggregated_recycling_view : public kokkos_type { +private: + alloc_type allocator; + size_t total_elements{0}; + std::shared_ptr data_ref_counter; + static_assert(std::is_same_v); + +public: + using view_type = kokkos_type; + template + explicit aggregated_recycling_view(alloc_type &alloc, Args... args) + : kokkos_type( + alloc.allocate(kokkos_type::required_allocation_size(args...) / + sizeof(element_type)), + args...), + total_elements(kokkos_type::required_allocation_size(args...) / + sizeof(element_type)), + allocator(alloc), + data_ref_counter(this->data(), view_deleter( + alloc, total_elements)) {} + + aggregated_recycling_view( + const aggregated_recycling_view &other) + : kokkos_type(other), allocator(other.allocator) { + data_ref_counter = other.data_ref_counter; + total_elements = other.total_elements; + } + + aggregated_recycling_view & + operator=(const aggregated_recycling_view &other) { + data_ref_counter = other.data_ref_counter; + allocator = other.allocator; + kokkos_type::operator=(other); + total_elements = other.total_elements; + return *this; + } + + aggregated_recycling_view( + aggregated_recycling_view &&other) noexcept + : kokkos_type(other), allocator(other.allocator) { + data_ref_counter = other.data_ref_counter; + total_elements = other.total_elements; + } + + aggregated_recycling_view &operator=( + aggregated_recycling_view &&other) noexcept { + data_ref_counter = other.data_ref_counter; + allocator = other.allocator; + kokkos_type::operator=(other); + total_elements = other.total_elements; + return *this; + } + + ~aggregated_recycling_view() {} +}; + + +template +class recycling_view : public kokkos_type { +private: + size_t total_elements{0}; + std::shared_ptr data_ref_counter; + +public: + using view_type = kokkos_type; + static_assert(std::is_same_v); + template = true> + recycling_view(Args... args) + : kokkos_type( + alloc_type{}.allocate(kokkos_type::required_allocation_size(args...) / + sizeof(element_type)), + args...), + total_elements(kokkos_type::required_allocation_size(args...) / + sizeof(element_type)), + data_ref_counter(this->data(), view_deleter( + alloc_type{}, total_elements)) {} + + template = true> + recycling_view(const size_t device_id, Args... args) + : kokkos_type( + alloc_type{device_id}.allocate(kokkos_type::required_allocation_size(args...) / + sizeof(element_type)), + args...), + total_elements(kokkos_type::required_allocation_size(args...) 
/ + sizeof(element_type)), + data_ref_counter(this->data(), view_deleter( + alloc_type{device_id}, total_elements)) {} + + template < + typename layout_t, + std::enable_if_t::value, bool> = true> + recycling_view(std::size_t device_id, layout_t layout) + : kokkos_type( + alloc_type{device_id}.allocate(kokkos_type::required_allocation_size(layout) / + sizeof(element_type)), + layout), + total_elements(kokkos_type::required_allocation_size(layout) / + sizeof(element_type)), + data_ref_counter(this->data(), view_deleter( + alloc_type{device_id}, total_elements)) {} + + recycling_view( + const recycling_view &other) + : kokkos_type(other) { + total_elements = other.total_elements; + data_ref_counter = other.data_ref_counter; + + } + + recycling_view & + operator=(const recycling_view &other) { + data_ref_counter = other.data_ref_counter; + kokkos_type::operator=(other); + total_elements = other.total_elements; + return *this; + } + + recycling_view( + recycling_view &&other) noexcept + : kokkos_type(other) { + data_ref_counter = other.data_ref_counter; + total_elements = other.total_elements; + } + + recycling_view &operator=( + recycling_view &&other) noexcept { + data_ref_counter = other.data_ref_counter; + kokkos_type::operator=(other); + total_elements = other.total_elements; + return *this; + } + + ~recycling_view() { } +}; + +} // namespace memory_recycling +} // end namespace cppuddle + +#endif diff --git a/include/cuda_buffer_util.hpp b/include/cuda_buffer_util.hpp index 55d3397a..7fbd07be 100644 --- a/include/cuda_buffer_util.hpp +++ b/include/cuda_buffer_util.hpp @@ -1,170 +1,54 @@ -// Copyright (c) 2020-2023 Gregor Daiß +// Copyright (c) 2024 Gregor Daiß // // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +// DEPRECATED: Do not use this file +// Only intended to make the old interface work a bit longer. 
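// --------------------------------------------------------------------------
// Illustrative usage sketch (annotation, not part of the patch): instantiating
// the recycling_view wrapper from recycling_kokkos_view.hpp above. It derives
// from an unmanaged Kokkos view and feeds it memory from a recycling
// allocator; the shared view_deleter returns that memory to the pool once the
// last copy is gone. The concrete 1D host-space float view chosen here is an
// assumption for illustration only.
#include <cstddef>
#include <Kokkos_Core.hpp>
#include "cppuddle/memory_recycling/std_recycling_allocators.hpp"
#include "cppuddle/memory_recycling/util/recycling_kokkos_view.hpp"

using host_um_view =
    Kokkos::View<float *, Kokkos::HostSpace, Kokkos::MemoryUnmanaged>;
using recycling_host_view = cppuddle::memory_recycling::recycling_view<
    host_um_view, cppuddle::memory_recycling::recycle_std<float>, float>;

void view_example(std::size_t n) {
  recycling_host_view a(n); // backing buffer comes from the recycling pool
  recycling_host_view b(a); // copies share the buffer via data_ref_counter
} // buffer is marked unused here (recycled), not deallocated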
+// See deprecation warnings for the new location of the functionality + #ifndef CUDA_BUFFER_UTIL_HPP #define CUDA_BUFFER_UTIL_HPP #include "buffer_manager.hpp" -#include "detail/config.hpp" - -#include -#include -#include +#include "cppuddle/memory_recycling/cuda_recycling_allocators.hpp" +#include "cppuddle/memory_recycling/util/cuda_recycling_device_buffer.hpp" namespace recycler { - namespace detail { +template +using cuda_pinned_allocator + [[deprecated("Use from header cuda_recycling_allocators.hpp instead")]] = + cppuddle::memory_recycling::detail::cuda_pinned_allocator; - -template struct cuda_pinned_allocator { - using value_type = T; - cuda_pinned_allocator() noexcept = default; - template - explicit cuda_pinned_allocator(cuda_pinned_allocator const &) noexcept {} - T *allocate(std::size_t n) { - T *data; - cudaError_t error = - cudaMallocHost(reinterpret_cast(&data), n * sizeof(T)); - if (error != cudaSuccess) { - std::string msg = - std::string( - "cuda_pinned_allocator failed due to cudaMallocHost failure : ") + - std::string(cudaGetErrorString(error)); - throw std::runtime_error(msg); - } - return data; - } - void deallocate(T *p, std::size_t n) { - cudaError_t error = cudaFreeHost(p); - if (error != cudaSuccess) { - std::string msg = - std::string( - "cuda_pinned_allocator failed due to cudaFreeHost failure : ") + - std::string(cudaGetErrorString(error)); - throw std::runtime_error(msg); - } - } -}; - -template -constexpr bool operator==(cuda_pinned_allocator const &, - cuda_pinned_allocator const &) noexcept { - return true; -} -template -constexpr bool operator!=(cuda_pinned_allocator const &, - cuda_pinned_allocator const &) noexcept { - return false; -} - -template struct cuda_device_allocator { - using value_type = T; - cuda_device_allocator() noexcept = default; - template - explicit cuda_device_allocator(cuda_device_allocator const &) noexcept {} - T *allocate(std::size_t n) { - T *data; - cudaError_t error = cudaMalloc(&data, n * sizeof(T)); - if (error != cudaSuccess) { - std::string msg = - std::string( - "cuda_device_allocator failed due to cudaMalloc failure : ") + - std::string(cudaGetErrorString(error)); - throw std::runtime_error(msg); - } - return data; - } - void deallocate(T *p, std::size_t n) { - cudaError_t error = cudaFree(p); - if (error != cudaSuccess) { - std::string msg = - std::string( - "cuda_device_allocator failed due to cudaFree failure : ") + - std::string(cudaGetErrorString(error)); - throw std::runtime_error(msg); - } - } -}; -template -constexpr bool operator==(cuda_device_allocator const &, - cuda_device_allocator const &) noexcept { - return true; -} -template -constexpr bool operator!=(cuda_device_allocator const &, - cuda_device_allocator const &) noexcept { - return false; -} - +template +using cuda_device_allocator + [[deprecated("Use from header cuda_recycling_allocators.hpp instead")]] = + cppuddle::memory_recycling::detail::cuda_device_allocator; } // end namespace detail template ::value, int> = 0> -using recycle_allocator_cuda_host = - detail::aggressive_recycle_allocator>; -template ::value, int> = 0> -using recycle_allocator_cuda_device = - detail::recycle_allocator>; +using recycle_allocator_cuda_host + [[deprecated("Use from header cuda_recycling_allocators.hpp instead")]] = + cppuddle::memory_recycling::recycle_allocator_cuda_host; template ::value, int> = 0> -struct cuda_device_buffer { - recycle_allocator_cuda_device allocator; - T *device_side_buffer; - size_t number_of_elements; - - cuda_device_buffer(const size_t 
number_of_elements, const size_t device_id = 0) - : allocator{device_id}, number_of_elements(number_of_elements) { - assert(device_id < max_number_gpus); - device_side_buffer = - allocator.allocate(number_of_elements); - } - ~cuda_device_buffer() { - allocator.deallocate(device_side_buffer, number_of_elements); - } - // not yet implemented - cuda_device_buffer(cuda_device_buffer const &other) = delete; - cuda_device_buffer operator=(cuda_device_buffer const &other) = delete; - cuda_device_buffer(cuda_device_buffer const &&other) = delete; - cuda_device_buffer operator=(cuda_device_buffer const &&other) = delete; - -}; +using recycle_allocator_cuda_device + [[deprecated("Use from header cuda_recycling_allocators.hpp instead")]] = + cppuddle::memory_recycling::recycle_allocator_cuda_device; -template ::value, int> = 0> -struct cuda_aggregated_device_buffer { - T *device_side_buffer; - size_t number_of_elements; - cuda_aggregated_device_buffer(size_t number_of_elements, Host_Allocator &alloc) - : number_of_elements(number_of_elements), alloc(alloc) { - device_side_buffer = - alloc.allocate(number_of_elements); - } - ~cuda_aggregated_device_buffer() { - alloc.deallocate(device_side_buffer, number_of_elements); - } - // not yet implemented - cuda_aggregated_device_buffer(cuda_aggregated_device_buffer const &other) = delete; - cuda_aggregated_device_buffer operator=(cuda_aggregated_device_buffer const &other) = delete; - cuda_aggregated_device_buffer(cuda_aggregated_device_buffer const &&other) = delete; - cuda_aggregated_device_buffer operator=(cuda_aggregated_device_buffer const &&other) = delete; - -private: - Host_Allocator &alloc; // will stay valid for the entire aggregation region and hence - // for the entire lifetime of this buffer -}; - -namespace device_selection { -template -struct select_device_functor> { - void operator()(const size_t device_id) { cudaSetDevice(device_id); } -}; -template -struct select_device_functor> { - void operator()(const size_t device_id) { cudaSetDevice(device_id); } -}; -} // namespace device_selection +template ::value, int> = 0> +using cuda_device_buffer + [[deprecated("Use from header cuda_recycling_allocators.hpp instead")]] = + cppuddle::memory_recycling::cuda_device_buffer; + +template ::value, int> = 0> +using cuda_aggregated_device_buffer + [[deprecated("Use from header cuda_recycling_allocators.hpp instead")]] = + cppuddle::memory_recycling::cuda_aggregated_device_buffer; } // end namespace recycler #endif diff --git a/include/hip_buffer_util.hpp b/include/hip_buffer_util.hpp index e2364095..720baf70 100644 --- a/include/hip_buffer_util.hpp +++ b/include/hip_buffer_util.hpp @@ -1,172 +1,53 @@ -// Copyright (c: 2020-2021 Gregor Daiß +// Copyright (c) 2024 Gregor Daiß // // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +// DEPRECATED: Do not use this file +// Only intended to make the old interface work a bit longer. 
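// --------------------------------------------------------------------------
// Illustrative migration sketch (annotation, not part of the patch): the
// compatibility aliases in cuda_buffer_util.hpp above keep old call sites
// compiling while emitting deprecation warnings. A typical update looks like
// this (variable names are placeholders):
#include <cstddef>
#include "cuda_buffer_util.hpp" // legacy header; still works, but warns on use

void migrate_cuda_example(std::size_t n, std::size_t device_id) {
  // Old spelling -- resolves through the deprecated alias above and triggers
  // e.g. -Wdeprecated-declarations:
  //   recycler::cuda_device_buffer<float> buf(n, device_id);
  // New spelling -- same underlying type, no warning:
  cppuddle::memory_recycling::cuda_device_buffer<float> buf(n, device_id);
}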
+// See deprecation warnings for the new location of the functionality + #ifndef HIP_BUFFER_UTIL_HPP #define HIP_BUFFER_UTIL_HPP -#include "buffer_manager.hpp" - -#include -#include -#include +#include "cppuddle/memory_recycling/hip_recycling_allocators.hpp" +#include "cppuddle/memory_recycling/util/hip_recycling_device_buffer.hpp" namespace recycler { namespace detail { -template struct hip_pinned_allocator { - using value_type = T; - hip_pinned_allocator() noexcept = default; - template - explicit hip_pinned_allocator(hip_pinned_allocator const &) noexcept {} - T *allocate(std::size_t n) { - T *data; - // hipError_t error = - // hipMallocHost(reinterpret_cast(&data), n * sizeof(T)); - - // Even though marked as deprecated, the HIP docs recommend using hipHostMalloc - // (not hipMallocHost) for async memcpys - // https://rocmdocs.amd.com/en/latest/ROCm_API_References/HIP_API/Memory-Management.html#hipmemcpyasync - hipError_t error = - hipHostMalloc(reinterpret_cast(&data), n * sizeof(T)); - if (error != hipSuccess) { - std::string msg = - std::string( - "hip_pinned_allocator failed due to hipMallocHost failure : ") + - std::string(hipGetErrorString(error)); - throw std::runtime_error(msg); - } - return data; - } - void deallocate(T *p, std::size_t n) { - hipError_t error = hipHostFree(p); - if (error != hipSuccess) { - std::string msg = - std::string( - "hip_pinned_allocator failed due to hipFreeHost failure : ") + - std::string(hipGetErrorString(error)); - throw std::runtime_error(msg); - } - } -}; -template -constexpr bool operator==(hip_pinned_allocator const &, - hip_pinned_allocator const &) noexcept { - return true; -} -template -constexpr bool operator!=(hip_pinned_allocator const &, - hip_pinned_allocator const &) noexcept { - return false; -} - -template struct hip_device_allocator { - using value_type = T; - hip_device_allocator() noexcept = default; - template - explicit hip_device_allocator(hip_device_allocator const &) noexcept {} - T *allocate(std::size_t n) { - T *data; - hipError_t error = hipMalloc(&data, n * sizeof(T)); - if (error != hipSuccess) { - std::string msg = - std::string( - "hip_device_allocator failed due to hipMalloc failure : ") + - std::string(hipGetErrorString(error)); - throw std::runtime_error(msg); - } - return data; - } - void deallocate(T *p, std::size_t n) { - hipError_t error = hipFree(p); - if (error != hipSuccess) { - std::string msg = - std::string( - "hip_device_allocator failed due to hipFree failure : ") + - std::string(hipGetErrorString(error)); - throw std::runtime_error(msg); - } - } -}; -template -constexpr bool operator==(hip_device_allocator const &, - hip_device_allocator const &) noexcept { - return true; -} -template -constexpr bool operator!=(hip_device_allocator const &, - hip_device_allocator const &) noexcept { - return false; -} +template +using hip_pinned_allocator + [[deprecated("Use from header hip_recycling_allocators.hpp instead")]] = + cppuddle::memory_recycling::detail::hip_pinned_allocator; +template +using hip_device_allocator + [[deprecated("Use from header hip_recycling_allocators.hpp instead")]] = + cppuddle::memory_recycling::detail::hip_device_allocator; } // end namespace detail template ::value, int> = 0> -using recycle_allocator_hip_host = - detail::aggressive_recycle_allocator>; -template ::value, int> = 0> -using recycle_allocator_hip_device = - detail::recycle_allocator>; +using recycle_allocator_hip_host + [[deprecated("Use from header hip_recycling_allocators.hpp instead")]] = + 
cppuddle::memory_recycling::recycle_allocator_hip_host; -// TODO Is this even required? (cuda version should work fine...) template ::value, int> = 0> -struct hip_device_buffer { - recycle_allocator_hip_device allocator; - T *device_side_buffer; - size_t number_of_elements; - - hip_device_buffer(size_t number_of_elements, size_t device_id) - : allocator{device_id}, number_of_elements(number_of_elements) { - assert(device_id < max_number_gpus); - device_side_buffer = - allocator.allocate(number_of_elements); - } - ~hip_device_buffer() { - allocator.deallocate(device_side_buffer, number_of_elements); - } - // not yet implemented - hip_device_buffer(hip_device_buffer const &other) = delete; - hip_device_buffer operator=(hip_device_buffer const &other) = delete; - hip_device_buffer(hip_device_buffer const &&other) = delete; - hip_device_buffer operator=(hip_device_buffer const &&other) = delete; +using recycle_allocator_hip_device + [[deprecated("Use from header hip_recycling_allocators.hpp instead")]] = + cppuddle::memory_recycling::recycle_allocator_hip_device; -}; - -template ::value, int> = 0> -struct hip_aggregated_device_buffer { - T *device_side_buffer; - size_t number_of_elements; - hip_aggregated_device_buffer(size_t number_of_elements, Host_Allocator &alloc) - : number_of_elements(number_of_elements), alloc(alloc) { - device_side_buffer = - alloc.allocate(number_of_elements); - } - ~hip_aggregated_device_buffer() { - alloc.deallocate(device_side_buffer, number_of_elements); - } - // not yet implemented - hip_aggregated_device_buffer(hip_aggregated_device_buffer const &other) = delete; - hip_aggregated_device_buffer operator=(hip_aggregated_device_buffer const &other) = delete; - hip_aggregated_device_buffer(hip_aggregated_device_buffer const &&other) = delete; - hip_aggregated_device_buffer operator=(hip_aggregated_device_buffer const &&other) = delete; - -private: - Host_Allocator &alloc; // will stay valid for the entire aggregation region and hence - // for the entire lifetime of this buffer -}; - -namespace device_selection { -template -struct select_device_functor> { - void operator()(const size_t device_id) { hipSetDevice(device_id); } -}; -template -struct select_device_functor> { - void operator()(const size_t device_id) { hipSetDevice(device_id); } -}; -} // namespace device_selection +template ::value, int> = 0> +using hip_device_buffer + [[deprecated("Use from header hip_recycling_allocators.hpp instead")]] = + cppuddle::memory_recycling::hip_device_buffer; + +template ::value, int> = 0> +using hip_aggregated_device_buffer + [[deprecated("Use from header hip_recycling_allocators.hpp instead")]] = + cppuddle::memory_recycling::hip_aggregated_device_buffer; } // end namespace recycler #endif diff --git a/include/kokkos_buffer_util.hpp b/include/kokkos_buffer_util.hpp index 2945b422..716229a0 100644 --- a/include/kokkos_buffer_util.hpp +++ b/include/kokkos_buffer_util.hpp @@ -1,166 +1,28 @@ -// Copyright (c) 2020-2021 Gregor Daiß +// Copyright (c) 2024 Gregor Daiß // // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +// DEPRECATED: Do not use this file +// Only intended to make the old interface work a bit longer. 
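// --------------------------------------------------------------------------
// Illustrative sketch of the deprecation technique used by all of these
// compatibility headers (annotation, not part of the patch): each legacy name
// becomes an alias carrying a [[deprecated]] attribute, so existing code keeps
// compiling but every use points the developer at the new header. Stand-alone
// hypothetical example:
namespace newlib {
template <typename T> struct widget {};
} // namespace newlib

namespace oldlib {
template <typename T>
using widget [[deprecated("Use newlib::widget from new_header.hpp instead")]] =
    newlib::widget<T>;
} // namespace oldlib

// oldlib::widget<int> w{}; // still compiles, but warns (-Wdeprecated-declarations)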
+// See deprecation warnings for the new location of the functionality + #ifndef KOKKOS_BUFFER_UTIL_HPP #define KOKKOS_BUFFER_UTIL_HPP -#include -#include -#include -#include - -namespace recycler { +#include "cppuddle/memory_recycling/util/recycling_kokkos_view.hpp" -template -struct view_deleter { - alloc_type allocator; - size_t total_elements; - view_deleter(alloc_type alloc, size_t total_elements) : allocator(alloc), - total_elements(total_elements) {} - void operator()(element_type* p) { - allocator.deallocate(p, total_elements); - } -}; +namespace recycler { template -class aggregated_recycled_view : public kokkos_type { -private: - alloc_type allocator; - size_t total_elements{0}; - std::shared_ptr data_ref_counter; - static_assert(std::is_same_v); - -public: - using view_type = kokkos_type; - template - explicit aggregated_recycled_view(alloc_type &alloc, Args... args) - : kokkos_type( - alloc.allocate(kokkos_type::required_allocation_size(args...) / - sizeof(element_type)), - args...), - total_elements(kokkos_type::required_allocation_size(args...) / - sizeof(element_type)), - allocator(alloc), - data_ref_counter(this->data(), view_deleter( - alloc, total_elements)) {} - - aggregated_recycled_view( - const aggregated_recycled_view &other) - : kokkos_type(other), allocator(other.allocator) { - data_ref_counter = other.data_ref_counter; - total_elements = other.total_elements; - } - - aggregated_recycled_view & - operator=(const aggregated_recycled_view &other) { - data_ref_counter = other.data_ref_counter; - allocator = other.allocator; - kokkos_type::operator=(other); - total_elements = other.total_elements; - return *this; - } - - aggregated_recycled_view( - aggregated_recycled_view &&other) noexcept - : kokkos_type(other), allocator(other.allocator) { - data_ref_counter = other.data_ref_counter; - total_elements = other.total_elements; - } - - aggregated_recycled_view &operator=( - aggregated_recycled_view &&other) noexcept { - data_ref_counter = other.data_ref_counter; - allocator = other.allocator; - kokkos_type::operator=(other); - total_elements = other.total_elements; - return *this; - } - - ~aggregated_recycled_view() {} -}; - +using aggregated_recycled_view [[deprecated( + "Use aggregated_recycle_view from header recycling_kokkos_view.hpp " + "instead")]] = + cppuddle::memory_recycling::aggregated_recycling_view; template -class recycled_view : public kokkos_type { -private: - size_t total_elements{0}; - std::shared_ptr data_ref_counter; - -public: - using view_type = kokkos_type; - static_assert(std::is_same_v); - template = true> - recycled_view(Args... args) - : kokkos_type( - alloc_type{}.allocate(kokkos_type::required_allocation_size(args...) / - sizeof(element_type)), - args...), - total_elements(kokkos_type::required_allocation_size(args...) / - sizeof(element_type)), - data_ref_counter(this->data(), view_deleter( - alloc_type{}, total_elements)) {} - - template = true> - recycled_view(const size_t device_id, Args... args) - : kokkos_type( - alloc_type{device_id}.allocate(kokkos_type::required_allocation_size(args...) / - sizeof(element_type)), - args...), - total_elements(kokkos_type::required_allocation_size(args...) 
/ - sizeof(element_type)), - data_ref_counter(this->data(), view_deleter( - alloc_type{device_id}, total_elements)) {} - - template < - typename layout_t, - std::enable_if_t::value, bool> = true> - recycled_view(std::size_t device_id, layout_t layout) - : kokkos_type( - alloc_type{device_id}.allocate(kokkos_type::required_allocation_size(layout) / - sizeof(element_type)), - layout), - total_elements(kokkos_type::required_allocation_size(layout) / - sizeof(element_type)), - data_ref_counter(this->data(), view_deleter( - alloc_type{device_id}, total_elements)) {} - - recycled_view( - const recycled_view &other) - : kokkos_type(other) { - total_elements = other.total_elements; - data_ref_counter = other.data_ref_counter; - - } - - recycled_view & - operator=(const recycled_view &other) { - data_ref_counter = other.data_ref_counter; - kokkos_type::operator=(other); - total_elements = other.total_elements; - return *this; - } - - recycled_view( - recycled_view &&other) noexcept - : kokkos_type(other) { - data_ref_counter = other.data_ref_counter; - total_elements = other.total_elements; - } - - recycled_view &operator=( - recycled_view &&other) noexcept { - data_ref_counter = other.data_ref_counter; - kokkos_type::operator=(other); - total_elements = other.total_elements; - return *this; - } - - ~recycled_view() { } -}; - +using recycled_view [[deprecated( + "Use recycle_view from header recycling_kokkos_view.hpp instead")]] = + cppuddle::memory_recycling::recycling_view; } // end namespace recycler diff --git a/include/stream_manager.hpp b/include/stream_manager.hpp index 5b0e3898..1e781442 100644 --- a/include/stream_manager.hpp +++ b/include/stream_manager.hpp @@ -1,413 +1,38 @@ -// Copyright (c) 2020-2023 Gregor Daiß +// Copyright (c) 2024 Gregor Daiß // // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +// DEPRECATED: Do not use this file +// Only intended to make the old interface work a bit longer. +// See deprecation warnings for the new location of the functionality + #ifndef STREAM_MANAGER_HPP #define STREAM_MANAGER_HPP -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "../include/detail/config.hpp" - -// Need to cuda/hip definitions for default params when NOT -// drawing from an executor pool -#if defined(CPPUDDLE_DEACTIVATE_EXECUTOR_RECYCLING) -#include -#if defined(HPX_HAVE_CUDA) || defined(HPX_HAVE_HIP) -#include -#endif -#endif - -// Redefintion required for non-recycling executors -// Without it, default constructing the executors (independent) would not work -#if defined(CPPUDDLE_DEACTIVATE_EXECUTOR_RECYCLING) -// Do only define if Kokkos is not found -#ifndef KOKKOS_ENABLE_SERIAL -namespace hpx { namespace kokkos { -enum class execution_space_mode { global, independent }; -}} -#endif -#endif - -/// Turns a std::array_mutex into an scoped lock -template -auto make_scoped_lock_from_array(mutex_array_t& mutexes) -{ - return std::apply([](auto&... mutexes) { return std::scoped_lock{mutexes...}; }, - mutexes); -} - -template class round_robin_pool { -private: - std::deque pool{}; - std::vector ref_counters{}; - size_t current_interface{0}; - -public: - template - round_robin_pool(size_t number_of_streams, Ts... 
executor_args) { - ref_counters.reserve(number_of_streams); - for (int i = 0; i < number_of_streams; i++) { - pool.emplace_back(executor_args...); - ref_counters.emplace_back(0); - } - } - // return a tuple with the interface and its index (to release it later) - std::tuple get_interface() { - assert(!(pool.empty())); - size_t last_interface = current_interface; - current_interface = (current_interface + 1) % pool.size(); - ref_counters[last_interface]++; - std::tuple ret(pool[last_interface], last_interface); - return ret; - } - void release_interface(size_t index) { ref_counters[index]--; } - bool interface_available(size_t load_limit) { - return *(std::min_element(std::begin(ref_counters), - std::end(ref_counters))) < load_limit; - } - size_t get_current_load() { - return *( - std::min_element(std::begin(ref_counters), std::end(ref_counters))); - } - // TODO Remove - /* size_t get_next_device_id() { */ - /* return 0; // single gpu pool */ - /* } */ -}; - -template class priority_pool { -private: - std::deque pool{}; - std::vector ref_counters{}; // Ref counters - std::vector priorities{}; // Ref counters -public: - template - priority_pool(size_t number_of_streams, Ts... executor_args) { - ref_counters.reserve(number_of_streams); - priorities.reserve(number_of_streams); - for (auto i = 0; i < number_of_streams; i++) { - pool.emplace_back(executor_args...); - ref_counters.emplace_back(0); - priorities.emplace_back(i); - } - } - // return a tuple with the interface and its index (to release it later) - std::tuple get_interface() { - auto &interface = pool[priorities[0]]; - ref_counters[priorities[0]]++; - std::tuple ret(interface, priorities[0]); - std::make_heap(std::begin(priorities), std::end(priorities), - [this](const size_t &first, const size_t &second) -> bool { - return ref_counters[first] > ref_counters[second]; - }); - return ret; - } - void release_interface(size_t index) { - ref_counters[index]--; - std::make_heap(std::begin(priorities), std::end(priorities), - [this](const size_t &first, const size_t &second) -> bool { - return ref_counters[first] > ref_counters[second]; - }); - } - bool interface_available(size_t load_limit) { - return ref_counters[priorities[0]] < load_limit; - } - size_t get_current_load() { return ref_counters[priorities[0]]; } - // TODO remove - /* size_t get_next_device_id() { */ - /* return 0; // single gpu pool */ - /* } */ -}; - -/// Access/Concurrency Control for stream pool implementation -class stream_pool { -public: - template - static void init(size_t number_of_streams, Ts ... executor_args) { - stream_pool_implementation::init(number_of_streams, - executor_args...); - } - template - static void init_all_executor_pools(size_t number_of_streams, Ts ... executor_args) { - stream_pool_implementation::init_all_executor_pools(number_of_streams, - executor_args...); - } - template - static void init_executor_pool(size_t pool_id, size_t number_of_streams, Ts ... 
executor_args) { - stream_pool_implementation::init_executor_pool(pool_id, number_of_streams, - executor_args...); - } - template static void cleanup() { - stream_pool_implementation::cleanup(); - } - template - static std::tuple get_interface(const size_t gpu_id) { - return stream_pool_implementation::get_interface(gpu_id); - } - template - static void release_interface(size_t index, const size_t gpu_id) noexcept { - stream_pool_implementation::release_interface(index, - gpu_id); - } - template - static bool interface_available(size_t load_limit, const size_t gpu_id) noexcept { - return stream_pool_implementation::interface_available( - load_limit, gpu_id); - } - template - static size_t get_current_load(const size_t gpu_id = 0) noexcept { - return stream_pool_implementation::get_current_load( - gpu_id); - } - template - static size_t get_next_device_id(const size_t number_gpus) noexcept { - // TODO add round robin and min strategy - return recycler::get_device_id(number_gpus); - } - - template - static void set_device_selector(std::function select_gpu_function) { - stream_pool_implementation::set_device_selector(select_gpu_function); - } - - template - static void select_device(size_t gpu_id) { - stream_pool_implementation::select_device(gpu_id); - } - -private: - stream_pool() = default; - -private: - template class stream_pool_implementation { - public: - /// Deprecated! Use init_on_all_gpu or init_on_gpu - template - static void init(size_t number_of_streams, Ts ... executor_args) { - /* static_assert(sizeof...(Ts) == sizeof...(Ts) && recycler::max_number_gpus == 1, */ - /* "deprecated stream_pool::init does not support multigpu"); */ - auto guard = make_scoped_lock_from_array(instance().gpu_mutexes); - instance().streampools.emplace_back(number_of_streams, executor_args...); - assert(instance().streampools.size() <= recycler::max_number_gpus); - } - - /// Multi-GPU init where executors / interfaces on all GPUs are initialized with the same arguments - template - static void init_all_executor_pools(size_t number_of_streams, Ts ... executor_args) { - auto guard = make_scoped_lock_from_array(instance().gpu_mutexes); - if (number_of_streams > 0) { - for (size_t gpu_id = 0; gpu_id < recycler::max_number_gpus; gpu_id++) { - instance().select_gpu_function(gpu_id); - instance().streampools.emplace_back(number_of_streams, - executor_args...); - } - } - assert(instance().streampools.size() <= recycler::max_number_gpus); - } - - /// Per-GPU init allowing for different init parameters depending on the GPU - /// (useful for executor that expect an GPU-id during construction) - template - static void init_executor_pool(size_t gpu_id, size_t number_of_streams, Ts ... executor_args) { - auto guard = make_scoped_lock_from_array(instance().gpu_mutexes); - if (number_of_streams > 0) { - instance().select_gpu_function(gpu_id); - instance().streampools.emplace_back(number_of_streams, - executor_args...); - } - assert(instance().streampools.size() <= recycler::max_number_gpus); - } - - // TODO add/rename into finalize? 
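// --------------------------------------------------------------------------
// Illustrative usage sketch (annotation, not part of the patch) for the pool
// interface removed here and re-exported further below under
// cppuddle::executor_recycling. my_executor_t is a hypothetical placeholder
// for a real executor type (e.g. an HPX CUDA executor); its constructor
// arguments are omitted for brevity.
#include <cstddef>
#include "stream_manager.hpp" // or the new executor_pools_interface.hpp

struct my_executor_t { /* hypothetical executor */ };

void pool_example(std::size_t gpu_id) {
  using pool_t = round_robin_pool<my_executor_t>;
  // One-time setup: eight executors per GPU, constructor args forwarded.
  stream_pool::init_all_executor_pools<my_executor_t, pool_t>(8);
  {
    // RAII handle: draws an executor from the pool, releases it on scope exit.
    stream_interface<my_executor_t, pool_t> exec(gpu_id);
    // exec.async_execute(...); // forwards to the underlying executor
  }
}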
- static void cleanup() { - auto guard = make_scoped_lock_from_array(instance().gpu_mutexes); - assert(instance().streampools.size() == recycler::max_number_gpus); - instance().streampools.clear(); - } - - static std::tuple get_interface(const size_t gpu_id = 0) { - std::lock_guard guard(instance().gpu_mutexes[gpu_id]); - assert(gpu_id < instance().streampools.size()); - return instance().streampools[gpu_id].get_interface(); - } - static void release_interface(size_t index, const size_t gpu_id = 0) { - std::lock_guard guard(instance().gpu_mutexes[gpu_id]); - assert(gpu_id < instance().streampools.size()); - instance().streampools[gpu_id].release_interface(index); - } - static bool interface_available(size_t load_limit, const size_t gpu_id = 0) { - std::lock_guard guard(instance().gpu_mutexes[gpu_id]); - assert(gpu_id < instance().streampools.size()); - return instance().streampools[gpu_id].interface_available(load_limit); - } - static size_t get_current_load(const size_t gpu_id = 0) { - std::lock_guard guard(instance().gpu_mutexes[gpu_id]); - assert(gpu_id < instance().streampools.size()); - return instance().streampools[gpu_id].get_current_load(); - } - // TODO deprecated! Remove... - /* static size_t get_next_device_id(const size_t gpu_id = 0) { */ - /* std::lock_guard guard(instance().gpu_mutexes[gpu_id]); */ - /* assert(instance().streampools.size() == recycler::max_number_gpus); */ - /* return instance().streampools[gpu_id].get_next_device_id(); */ - /* } */ - - static void set_device_selector(std::function select_gpu_function) { - auto guard = make_scoped_lock_from_array(instance().gpu_mutexes); - instance().select_gpu_function = select_gpu_function; - } - - static void select_device(size_t gpu_id) { - instance().select_gpu_function(gpu_id); - } - - private: - stream_pool_implementation() = default; - recycler::mutex_t pool_mut{}; - std::function select_gpu_function = [](size_t gpu_id) { - // By default no multi gpu support - assert(recycler::max_number_gpus == 1 || instance().streampools.size() == 1); - assert(gpu_id == 0); - }; - - std::deque streampools{}; - std::array gpu_mutexes; - - static stream_pool_implementation& instance(void) { - static stream_pool_implementation pool_instance{}; - return pool_instance; - } - - public: - ~stream_pool_implementation() = default; - // Bunch of constructors we don't need - stream_pool_implementation(stream_pool_implementation const &other) = - delete; - stream_pool_implementation & - operator=(stream_pool_implementation const &other) = delete; - stream_pool_implementation(stream_pool_implementation &&other) = delete; - stream_pool_implementation & - operator=(stream_pool_implementation &&other) = delete; - }; - -public: - ~stream_pool() = default; - // Bunch of constructors we don't need - stream_pool(stream_pool const &other) = delete; - stream_pool &operator=(stream_pool const &other) = delete; - stream_pool(stream_pool &&other) = delete; - stream_pool &operator=(stream_pool &&other) = delete; -}; - -#if defined(CPPUDDLE_DEACTIVATE_EXECUTOR_RECYCLING) - -// Warn about suboptimal performance without recycling -#pragma message \ -"Warning: Building without executor recycling! Use only for performance testing! \ -For better performance configure CPPuddle with CPPUDDLE_WITH_EXECUTOR_RECYCLING=ON!" - -/// Slow version of the stream_interface that does not draw its -/// executors (Interface) from the pool but creates them instead. 
-/// Only meant for performance comparisons and only works with cuda/kokkos executors -template class stream_interface { -public: - - template - explicit stream_interface(size_t gpu_id, - std::enable_if_t::value, size_t> = 0) - : gpu_id(gpu_id), interface(gpu_id) {} - template - explicit stream_interface(std::enable_if_t::value, size_t> = 0) - : gpu_id(gpu_id), interface(hpx::kokkos::execution_space_mode::independent) {} - - stream_interface(const stream_interface &other) = delete; - stream_interface &operator=(const stream_interface &other) = delete; - stream_interface(stream_interface &&other) = delete; - stream_interface &operator=(stream_interface &&other) = delete; - ~stream_interface() { - } - - template - inline decltype(auto) post(F &&f, Ts &&... ts) { - return interface.post(std::forward(f), std::forward(ts)...); - } - - template - inline decltype(auto) async_execute(F &&f, Ts &&... ts) { - return interface.async_execute(std::forward(f), std::forward(ts)...); - } - - inline decltype(auto) get_future() { - return interface.get_future(); - } - - // allow implict conversion - operator Interface &() { // NOLINT - return interface; - } - -private: - size_t gpu_id; - -public: - Interface interface; -}; -#else -/// Stream interface for RAII purposes -/// Draws executor from the stream pool and releases it upon -/// destruction -template class stream_interface { -public: - explicit stream_interface(size_t gpu_id) - : t(stream_pool::get_interface(gpu_id)), - interface(std::get<0>(t)), interface_index(std::get<1>(t)), gpu_id(gpu_id) {} - - stream_interface(const stream_interface &other) = delete; - stream_interface &operator=(const stream_interface &other) = delete; - stream_interface(stream_interface &&other) = delete; - stream_interface &operator=(stream_interface &&other) = delete; - ~stream_interface() { - stream_pool::release_interface(interface_index, gpu_id); - } - - template - inline decltype(auto) post(F &&f, Ts &&... ts) { - return interface.post(std::forward(f), std::forward(ts)...); - } - - template - inline decltype(auto) async_execute(F &&f, Ts &&... 
ts) { - return interface.async_execute(std::forward(f), std::forward(ts)...); - } - - inline decltype(auto) get_future() { - return interface.get_future(); - } - - // allow implict conversion - operator Interface &() { // NOLINT - return interface; - } - -private: - std::tuple t; - size_t interface_index; - size_t gpu_id; - -public: - Interface &interface; -}; -#endif +#include "cppuddle/executor_recycling/executor_pools_interface.hpp" + +template +using round_robin_pool + [[deprecated("Use cppuddle::executor_recycling::round_robin_pool_impl from " + "header executor_pools_management.hpp instead")]] = + cppuddle::executor_recycling::round_robin_pool_impl; + +template +using priority_pool + [[deprecated("Use cppuddle::executor_recycling::priority_pool_impl from " + "header executor_pools_management.hpp instead")]] = + cppuddle::executor_recycling::priority_pool_impl; + +using stream_pool + [[deprecated("Use cppuddle::executor_recycling::executor_pool from " + "header executor_pools_management.hpp instead")]] = + cppuddle::executor_recycling::executor_pool; + +template +using stream_interface + [[deprecated("Use cppuddle::executor_recycling::executor_interface from " + "header executor_pools_management.hpp instead")]] = + cppuddle::executor_recycling::executor_interface; #endif diff --git a/include/sycl_buffer_util.hpp b/include/sycl_buffer_util.hpp index 61d22f8f..4bf45b3f 100644 --- a/include/sycl_buffer_util.hpp +++ b/include/sycl_buffer_util.hpp @@ -1,83 +1,42 @@ -// Copyright (c: 2020-2021 Gregor Daiß +// Copyright (c) 2024 Gregor Daiß // // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +// DEPRECATED: Do not use this file +// Only intended to make the old interface work a bit longer. 
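// --------------------------------------------------------------------------
// Illustrative migration sketch (annotation, not part of the patch): new code
// should spell the cppuddle::executor_recycling names directly instead of the
// deprecated aliases above. This assumes the new classes keep the member
// interface of the old ones they replace (as the aliases suggest);
// my_executor_t is again a hypothetical placeholder executor type.
#include "cppuddle/executor_recycling/executor_pools_interface.hpp"

struct my_executor_t { /* hypothetical executor */ };

void migrated_pool_example() {
  using pool_t =
      cppuddle::executor_recycling::round_robin_pool_impl<my_executor_t>;
  // Previously: stream_pool::init_all_executor_pools<my_executor_t, ...>(8);
  cppuddle::executor_recycling::executor_pool::init_all_executor_pools<
      my_executor_t, pool_t>(8);
  // Previously: stream_interface<my_executor_t, ...> exec(0);
  cppuddle::executor_recycling::executor_interface<my_executor_t, pool_t>
      exec(0);
}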
+// See deprecation warnings for the new location of the functionality + #ifndef SYCL_BUFFER_UTIL_HPP #define SYCL_BUFFER_UTIL_HPP -#include "buffer_manager.hpp" - -#include -#include -#include +#include "cppuddle/memory_recycling/sycl_recycling_allocators.hpp" namespace recycler { namespace detail { -static_assert(max_number_gpus == 1, "CPPuddle currently does not support MultiGPU SYCL builds!"); +template +using sycl_host_default_allocator + [[deprecated("Use from header sycl_recycling_allocators.hpp instead")]] = + cppuddle::memory_recycling::detail::sycl_host_default_allocator; -template struct sycl_host_default_allocator { - using value_type = T; - sycl_host_default_allocator() noexcept = default; - template - explicit sycl_host_default_allocator(sycl_host_default_allocator const &) noexcept {} - T *allocate(std::size_t n) { - static cl::sycl::queue default_queue(cl::sycl::default_selector{}); - T *data = cl::sycl::malloc_host(n, default_queue); - return data; - } - void deallocate(T *p, std::size_t n) { - static cl::sycl::queue default_queue(cl::sycl::default_selector{}); - cl::sycl::free(p, default_queue); - } -}; -template -constexpr bool operator==(sycl_host_default_allocator const &, - sycl_host_default_allocator const &) noexcept { - return true; -} -template -constexpr bool operator!=(sycl_host_default_allocator const &, - sycl_host_default_allocator const &) noexcept { - return false; -} - -template struct sycl_device_default_allocator { - using value_type = T; - sycl_device_default_allocator() noexcept = default; - template - explicit sycl_device_default_allocator(sycl_device_default_allocator const &) noexcept {} - T *allocate(std::size_t n) { - static cl::sycl::queue default_queue(cl::sycl::default_selector{}); - T *data = cl::sycl::malloc_device(n, default_queue); - return data; - } - void deallocate(T *p, std::size_t n) { - static cl::sycl::queue default_queue(cl::sycl::default_selector{}); - cl::sycl::free(p, default_queue); - } -}; -template -constexpr bool operator==(sycl_device_default_allocator const &, - sycl_device_default_allocator const &) noexcept { - return true; -} -template -constexpr bool operator!=(sycl_device_default_allocator const &, - sycl_device_default_allocator const &) noexcept { - return false; -} +template +using sycl_device_default_allocator + [[deprecated("Use from header sycl_recycling_allocators.hpp instead")]] = + cppuddle::memory_recycling::detail::sycl_device_default_allocator; } // end namespace detail template ::value, int> = 0> -using recycle_allocator_sycl_host = - detail::aggressive_recycle_allocator>; +using recycle_allocator_sycl_host + [[deprecated("Use from header sycl_recycling_allocators.hpp instead")]] = + cppuddle::memory_recycling::recycle_allocator_sycl_host; + template ::value, int> = 0> -using recycle_allocator_sycl_device = - detail::recycle_allocator>; +using recycle_allocator_sycl_device + [[deprecated("Use from header sycl_recycling_allocators.hpp instead")]] = + cppuddle::memory_recycling::recycle_allocator_sycl_device; } // end namespace recycler #endif diff --git a/tests/allocator_aligned_test.cpp b/tests/allocator_aligned_test.cpp index c3c09217..ea9ce9a4 100644 --- a/tests/allocator_aligned_test.cpp +++ b/tests/allocator_aligned_test.cpp @@ -3,8 +3,6 @@ // Distributed under the Boost Software License, Version 1.0. 
(See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) -#include "../include/buffer_manager.hpp" -#include "../include/aligned_buffer_util.hpp" #ifdef CPPUDDLE_HAVE_HPX #include #endif @@ -18,6 +16,8 @@ #include #include +#include "cppuddle/memory_recycling/aligned_recycling_allocators.hpp" + #ifdef CPPUDDLE_HAVE_HPX int hpx_main(int argc, char *argv[]) { #else @@ -79,7 +79,9 @@ int main(int argc, char *argv[]) { << std::endl; for (size_t pass = 0; pass < passes; pass++) { auto begin = std::chrono::high_resolution_clock::now(); - std::vector> + std::vector< + double, + cppuddle::memory_recycling::aggressive_recycle_aligned> test1(array_size, double{}); auto end = std::chrono::high_resolution_clock::now(); aggressive_duration += @@ -92,8 +94,8 @@ int main(int argc, char *argv[]) { std::cout << "\n==> Aggressive recycle allocation test took " << aggressive_duration << "ms" << std::endl; } - recycler::print_performance_counters(); - recycler::force_cleanup(); // Cleanup all buffers and the managers for better + cppuddle::memory_recycling::print_buffer_counters(); + cppuddle::memory_recycling::force_buffer_cleanup(); // Cleanup all buffers and the managers for better // comparison // Recycle Test: @@ -101,8 +103,9 @@ int main(int argc, char *argv[]) { std::cout << "\nStarting run with recycle allocator: " << std::endl; for (size_t pass = 0; pass < passes; pass++) { auto begin = std::chrono::high_resolution_clock::now(); - std::vector> test1( - array_size, double{}); + std::vector> + test1(array_size, double{}); auto end = std::chrono::high_resolution_clock::now(); recycle_duration += std::chrono::duration_cast(end - begin) @@ -114,8 +117,8 @@ int main(int argc, char *argv[]) { std::cout << "\n\n==> Recycle allocation test took " << recycle_duration << "ms" << std::endl; } - recycler::print_performance_counters(); - recycler::force_cleanup(); // Cleanup all buffers and the managers for better + cppuddle::memory_recycling::print_buffer_counters(); + cppuddle::memory_recycling::force_buffer_cleanup(); // Cleanup all buffers and the managers for better // comparison // Same test using std::allocator: @@ -124,7 +127,7 @@ int main(int argc, char *argv[]) { for (size_t pass = 0; pass < passes; pass++) { auto begin = std::chrono::high_resolution_clock::now(); std::vector> - test2(array_size, double{}); + test2(array_size, double{}); auto end = std::chrono::high_resolution_clock::now(); default_duration += std::chrono::duration_cast(end - begin) @@ -146,7 +149,7 @@ int main(int argc, char *argv[]) { std::cout << "Test information: Aggressive recycler was faster than default allocator!" 
<< std::endl; } - recycler::print_performance_counters(); + cppuddle::memory_recycling::print_buffer_counters(); #ifdef CPPUDDLE_HAVE_HPX return hpx::finalize(); #else diff --git a/tests/allocator_hpx_test.cpp b/tests/allocator_hpx_test.cpp index 9d8cc44b..21c4baed 100644 --- a/tests/allocator_hpx_test.cpp +++ b/tests/allocator_hpx_test.cpp @@ -15,7 +15,7 @@ #include -#include "../include/buffer_manager.hpp" +#include "cppuddle/memory_recycling/std_recycling_allocators.hpp" int hpx_main(int argc, char *argv[]) { @@ -112,8 +112,8 @@ int hpx_main(int argc, char *argv[]) { for (size_t pass = 0; pass < passes; pass++) { for (size_t i = 0; i < number_futures; i++) { futs[i] = futs[i].then([&](hpx::shared_future &&predecessor) { - std::vector> test6(array_size, - double{}); + std::vector> + test6(array_size, double{}); }); } } @@ -126,20 +126,20 @@ int hpx_main(int argc, char *argv[]) { std::cout << "\n==> Recycle allocation test took " << recycle_duration << "ms" << std::endl; } - recycler::print_performance_counters(); - recycler::force_cleanup(); // Cleanup all buffers and the managers for better + cppuddle::memory_recycling::print_buffer_counters(); + cppuddle::memory_recycling::force_buffer_cleanup(); // Cleanup all buffers and the managers for better // comparison // ensure that at least 4 buffers have to created for unit testing { - std::vector> buffer1( + std::vector> buffer1( array_size, double{}); - std::vector> buffer2( + std::vector> buffer2( array_size, double{}); - std::vector> buffer3( + std::vector> buffer3( array_size, double{}); - std::vector> buffer4( + std::vector> buffer4( array_size, double{}); } @@ -153,8 +153,10 @@ int hpx_main(int argc, char *argv[]) { for (size_t pass = 0; pass < passes; pass++) { for (size_t i = 0; i < number_futures; i++) { futs[i] = futs[i].then([&](hpx::shared_future &&predecessor) { - std::vector> test6( - array_size, double{}); + std::vector< + double, + cppuddle::memory_recycling::aggressive_recycle_std> + test6(array_size, double{}); }); } } @@ -167,8 +169,8 @@ int hpx_main(int argc, char *argv[]) { std::cout << "\n==> Aggressive recycle allocation test took " << aggressive_duration << "ms" << std::endl; } - recycler::print_performance_counters(); - recycler::force_cleanup(); // Cleanup all buffers and the managers for better + cppuddle::memory_recycling::print_buffer_counters(); + cppuddle::memory_recycling::force_buffer_cleanup(); // Cleanup all buffers and the managers for better // comparison diff --git a/tests/allocator_kokkos_executor_for_loop_test.cpp b/tests/allocator_kokkos_executor_for_loop_test.cpp index 7708fe56..ad184ff5 100644 --- a/tests/allocator_kokkos_executor_for_loop_test.cpp +++ b/tests/allocator_kokkos_executor_for_loop_test.cpp @@ -18,12 +18,13 @@ #include #include -#include "../include/buffer_manager.hpp" -#include "../include/cuda_buffer_util.hpp" -#include "../include/kokkos_buffer_util.hpp" #include #include +#include "cppuddle/memory_recycling/std_recycling_allocators.hpp" +#include "cppuddle/memory_recycling/cuda_recycling_allocators.hpp" +#include "cppuddle/memory_recycling/util/recycling_kokkos_view.hpp" + // Assert during Release builds as well for this file: #undef NDEBUG #include // reinclude the header to update the definition of assert() @@ -36,18 +37,17 @@ template using kokkos_um_array = Kokkos::View; template -using recycled_host_view = - recycler::recycled_view, recycler::recycle_std, T>; - +using recycle_host_view = cppuddle::memory_recycling::recycling_view< + kokkos_um_array, 
cppuddle::memory_recycling::recycle_std, T>; // Device views using recycle allocators template using kokkos_um_device_array = Kokkos::View; template -using recycled_device_view = - recycler::recycled_view, - recycler::recycle_allocator_cuda_device, T>; +using recycle_device_view = cppuddle::memory_recycling::recycling_view< + kokkos_um_device_array, + cppuddle::memory_recycling::recycle_allocator_cuda_device, T>; // Host views using pinned memory recycle allocators template @@ -55,9 +55,9 @@ using kokkos_um_pinned_array = Kokkos::View::array_layout, Kokkos::CudaHostPinnedSpace, Kokkos::MemoryUnmanaged>; template -using recycled_pinned_view = - recycler::recycled_view, - recycler::recycle_allocator_cuda_host, T>; +using recycle_pinned_view = cppuddle::memory_recycling::recycling_view< + kokkos_um_pinned_array, + cppuddle::memory_recycling::recycle_allocator_cuda_host, T>; template auto get_iteration_policy(const Executor &&executor, @@ -81,7 +81,7 @@ int main(int argc, char *argv[]) { // Host run for (size_t pass = 0; pass < passes; pass++) { // Create view - recycled_host_view hostView(view_size_0, view_size_1); + recycle_host_view hostView(view_size_0, view_size_1); // Create executor hpx::kokkos::serial_executor executor; @@ -109,7 +109,7 @@ int main(int argc, char *argv[]) { // Device run for (size_t pass = 0; pass < passes; pass++) { // Create and init host view - recycled_pinned_view hostView(view_size_0, view_size_1); + recycle_pinned_view hostView(view_size_0, view_size_1); for(size_t i = 0; i < view_size_0; i++) { for(size_t j = 0; j < view_size_1; j++) { hostView(i, j) = 1.0; @@ -120,7 +120,7 @@ int main(int argc, char *argv[]) { hpx::kokkos::cuda_executor executor(hpx::kokkos::execution_space_mode::independent); // Use executor to move the host data to the device - recycled_device_view deviceView(view_size_0, view_size_1); + recycle_device_view deviceView(view_size_0, view_size_1); Kokkos::deep_copy(executor.instance(), deviceView, hostView); auto policy_1 = Kokkos::Experimental::require( @@ -143,11 +143,11 @@ int main(int argc, char *argv[]) { // otherwise the HPX cuda polling futures won't work hpx::cuda::experimental::detail::unregister_polling(hpx::resource::get_thread_pool(0)); - recycler::print_performance_counters(); + cppuddle::memory_recycling::print_buffer_counters(); // Cleanup all cuda views // (otherwise the cuda driver might shut down before this gets done automatically at // the end of the programm) - recycler::force_cleanup(); + cppuddle::memory_recycling::force_buffer_cleanup(); return hpx::finalize(); } diff --git a/tests/allocator_kokkos_test.cpp b/tests/allocator_kokkos_test.cpp index e2770458..e231b557 100644 --- a/tests/allocator_kokkos_test.cpp +++ b/tests/allocator_kokkos_test.cpp @@ -13,9 +13,6 @@ #include #include -#include "../include/buffer_manager.hpp" -#include "../include/cuda_buffer_util.hpp" -#include "../include/kokkos_buffer_util.hpp" #ifdef CPPUDDLE_HAVE_HPX #include #endif @@ -24,6 +21,10 @@ #include #include +#include "cppuddle/memory_recycling/std_recycling_allocators.hpp" +#include "cppuddle/memory_recycling/cuda_recycling_allocators.hpp" +#include "cppuddle/memory_recycling/util/recycling_kokkos_view.hpp" + using kokkos_array = Kokkos::View; @@ -32,8 +33,8 @@ template using kokkos_um_array = Kokkos::View; template -using recycled_host_view = - recycler::recycled_view, recycler::recycle_std, T>; +using recycle_host_view = cppuddle::memory_recycling::recycling_view< + kokkos_um_array, cppuddle::memory_recycling::recycle_std, T>; #ifdef 
CPPUDDLE_HAVE_HPX int hpx_main(int argc, char *argv[]) { @@ -74,8 +75,8 @@ int main(int argc, char *argv[]) { hpx::kokkos::ScopeGuard scopeGuard(argc, argv); Kokkos::print_configuration(std::cout); - using test_view = recycled_host_view; - using test_double_view = recycled_host_view; + using test_view = recycle_host_view; + using test_double_view = recycle_host_view; constexpr size_t passes = 100; for (size_t pass = 0; pass < passes; pass++) { @@ -91,7 +92,7 @@ int main(int argc, char *argv[]) { }); Kokkos::fence(); } - recycler::print_performance_counters(); + cppuddle::memory_recycling::print_buffer_counters(); #ifdef CPPUDDLE_HAVE_HPX return hpx::finalize(); #else diff --git a/tests/allocator_test.cpp b/tests/allocator_test.cpp index 004368a4..9a44664f 100644 --- a/tests/allocator_test.cpp +++ b/tests/allocator_test.cpp @@ -3,7 +3,6 @@ // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) -#include "../include/buffer_manager.hpp" #ifdef CPPUDDLE_HAVE_HPX #include #endif @@ -17,6 +16,8 @@ #include #include +#include "cppuddle/memory_recycling/std_recycling_allocators.hpp" + #ifdef CPPUDDLE_HAVE_HPX int hpx_main(int argc, char *argv[]) { #else @@ -76,8 +77,9 @@ int main(int argc, char *argv[]) { std::cout << "\nStarting run with aggressive recycle allocator: " << std::endl; for (size_t pass = 0; pass < passes; pass++) { auto begin = std::chrono::high_resolution_clock::now(); - std::vector> test1( - array_size, double{}); + std::vector> + test1(array_size, double{}); auto end = std::chrono::high_resolution_clock::now(); aggressive_duration += std::chrono::duration_cast(end - begin) @@ -88,16 +90,17 @@ int main(int argc, char *argv[]) { std::cout << "\n\n==> Aggressive recycle allocation test took " << aggressive_duration << "ms" << std::endl; } - recycler::print_performance_counters(); - recycler::force_cleanup(); // Cleanup all buffers and the managers for better - // comparison + cppuddle::memory_recycling::print_buffer_counters(); + cppuddle::memory_recycling::force_buffer_cleanup(); // Cleanup all buffers and the managers for + // better comparison // Recycle Test: { std::cout << "\nStarting run with recycle allocator: " << std::endl; for (size_t pass = 0; pass < passes; pass++) { auto begin = std::chrono::high_resolution_clock::now(); - std::vector> test1(array_size, double{}); + std::vector> + test1(array_size, double{}); auto end = std::chrono::high_resolution_clock::now(); recycle_duration += std::chrono::duration_cast(end - begin) @@ -108,9 +111,9 @@ int main(int argc, char *argv[]) { std::cout << "\n\n==> Recycle allocation test took " << recycle_duration << "ms" << std::endl; } - recycler::print_performance_counters(); - recycler::force_cleanup(); // Cleanup all buffers and the managers for better - // comparison + cppuddle::memory_recycling::print_buffer_counters(); + cppuddle::memory_recycling::force_buffer_cleanup(); // Cleanup all buffers and the managers for + // better comparison // Same test using std::allocator: { @@ -138,7 +141,7 @@ int main(int argc, char *argv[]) { std::cout << "Test information: Aggressive recycler was faster than default allocator!" 
<< std::endl; } - recycler::print_performance_counters(); + cppuddle::memory_recycling::print_buffer_counters(); #ifdef CPPUDDLE_HAVE_HPX return hpx::finalize(); #else diff --git a/tests/stream_test.cpp b/tests/stream_test.cpp index 96599759..2e3ebf4c 100644 --- a/tests/stream_test.cpp +++ b/tests/stream_test.cpp @@ -1,10 +1,9 @@ -// Copyright (c) 2020-2021 Gregor Daiß +// Copyright (c) 2020-2024 Gregor Daiß // // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) #define USE_HPX_MAIN -#include "../include/stream_manager.hpp" #include #ifdef USE_HPX_MAIN #include @@ -26,46 +25,49 @@ int main(int argc, char *argv[]) { #endif std::cout << "Starting ref counting tests ..." << std::endl; test_pool_ref_counting>( - 2, 0, false); - test_pool_ref_counting< - hpx::cuda::experimental::cuda_executor, - round_robin_pool>(2, 0, false); + cppuddle::executor_recycling::priority_pool_impl< + hpx::cuda::experimental::cuda_executor>>(2, 0, + false); + test_pool_ref_counting>(2, 0, + false); std::cout << "Finished ref counting tests!" << std::endl; - std::cout << "Starting wrapper objects tests ..." << std::endl; test_pool_wrappers>( - 2, 0, false); + cppuddle::executor_recycling::priority_pool_impl< + hpx::cuda::experimental::cuda_executor>>(2, 0, false); test_pool_wrappers>( - 2, 0, false); + cppuddle::executor_recycling::round_robin_pool_impl< + hpx::cuda::experimental::cuda_executor>>(2, 0, false); std::cout << "Finished wrapper objects tests!" << std::endl; std::cout << "Starting memcpy tests... " << std::endl; test_pool_memcpy>( - 2, 0, false); + cppuddle::executor_recycling::round_robin_pool_impl< + hpx::cuda::experimental::cuda_executor>>(2, 0, false); test_pool_memcpy>( - 2, 0, false); + cppuddle::executor_recycling::priority_pool_impl< + hpx::cuda::experimental::cuda_executor>>(2, 0, false); std::cout << "Finished memcpy tests! " << std::endl; std::cout << "Starting memcpy polling tests... " << std::endl; { // hpx::cuda::experimental::enable_user_polling polling_scope; - hpx::cuda::experimental::detail::register_polling(hpx::resource::get_thread_pool(0)); + hpx::cuda::experimental::detail::register_polling( + hpx::resource::get_thread_pool(0)); test_pool_memcpy>( - 2, 0, true); + cppuddle::executor_recycling::round_robin_pool_impl< + hpx::cuda::experimental::cuda_executor>>(2, 0, true); test_pool_memcpy>( - 2, 0, true); - hpx::cuda::experimental::detail::unregister_polling(hpx::resource::get_thread_pool(0)); + cppuddle::executor_recycling::priority_pool_impl< + hpx::cuda::experimental::cuda_executor>>(2, 0, true); + hpx::cuda::experimental::detail::unregister_polling( + hpx::resource::get_thread_pool(0)); } - recycler::force_cleanup(); + cppuddle::memory_recycling::force_buffer_cleanup(); std::cout << "Finished memcpy tests! " << std::endl; return hpx::finalize(); } diff --git a/tests/stream_test.hpp b/tests/stream_test.hpp index 07de4c44..63f25b27 100644 --- a/tests/stream_test.hpp +++ b/tests/stream_test.hpp @@ -1,4 +1,4 @@ -// Copyright (c) 2020-2021 Gregor Daiß +// Copyright (c) 2020-2024 Gregor Daiß // // Distributed under the Boost Software License, Version 1.0. 
(See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) @@ -9,123 +9,182 @@ #include #include #include -#include "../include/buffer_manager.hpp" -#include "../include/cuda_buffer_util.hpp" +#include "cppuddle/memory_recycling/cuda_recycling_allocators.hpp" +#include "cppuddle/memory_recycling/util/cuda_recycling_device_buffer.hpp" +#include "cppuddle/executor_recycling/executor_pools_interface.hpp" template -void test_pool_memcpy(const size_t stream_parameter, Ts &&... ts) { - std::vector> hostbuffer( - 512); - recycler::cuda_device_buffer devicebuffer(512); - stream_pool::init(stream_parameter, std::forward(ts)...); +void test_pool_memcpy(const size_t executor_parameter, Ts &&...ts) { + std::vector> + hostbuffer(512); + cppuddle::memory_recycling::cuda_device_buffer devicebuffer(512); + cppuddle::executor_recycling::executor_pool::init( + executor_parameter, std::forward(ts)...); // without interface wrapper { - auto test1 = stream_pool::get_interface(0); + auto test1 = + cppuddle::executor_recycling::executor_pool::get_interface(0); Interface test1_interface = std::get<0>(test1); size_t interface_id = std::get<1>(test1); - hpx::apply(test1_interface, cudaMemcpyAsync, devicebuffer.device_side_buffer, - hostbuffer.data(), 512 * sizeof(double), - cudaMemcpyHostToDevice); - auto fut1 = hpx::async(test1_interface, - cudaMemcpyAsync, hostbuffer.data(), devicebuffer.device_side_buffer, - 512 * sizeof(double), cudaMemcpyDeviceToHost); + hpx::apply(test1_interface, cudaMemcpyAsync, + devicebuffer.device_side_buffer, hostbuffer.data(), + 512 * sizeof(double), cudaMemcpyHostToDevice); + auto fut1 = hpx::async(test1_interface, cudaMemcpyAsync, hostbuffer.data(), + devicebuffer.device_side_buffer, + 512 * sizeof(double), cudaMemcpyDeviceToHost); fut1.get(); - stream_pool::release_interface(interface_id, 0); + cppuddle::executor_recycling::executor_pool::release_interface( + interface_id, 0); } // with interface wrapper { - stream_interface test1_interface{0}; + cppuddle::executor_recycling::executor_interface + test1_interface{0}; // hpx::cuda::cuda_executor test1_interface(0, false); - hpx::apply(test1_interface.interface, cudaMemcpyAsync, devicebuffer.device_side_buffer, - hostbuffer.data(), 512 * sizeof(double), - cudaMemcpyHostToDevice); - auto fut1 = hpx::async(test1_interface.interface, - cudaMemcpyAsync, hostbuffer.data(), devicebuffer.device_side_buffer, - 512 * sizeof(double), cudaMemcpyDeviceToHost); + hpx::apply(test1_interface.interface, cudaMemcpyAsync, + devicebuffer.device_side_buffer, hostbuffer.data(), + 512 * sizeof(double), cudaMemcpyHostToDevice); + auto fut1 = hpx::async(test1_interface.interface, cudaMemcpyAsync, + hostbuffer.data(), devicebuffer.device_side_buffer, + 512 * sizeof(double), cudaMemcpyDeviceToHost); fut1.get(); } - stream_pool::cleanup(); + cppuddle::executor_recycling::executor_pool::cleanup(); } template -void test_pool_ref_counting(const size_t stream_parameter, Ts &&... 
ts) { +void test_pool_ref_counting(const size_t executor_parameter, Ts &&...ts) { // init ppol - stream_pool::init(stream_parameter, std::forward(ts)...); + cppuddle::executor_recycling::executor_pool::init( + executor_parameter, std::forward(ts)...); { // Allocating - auto test1 = stream_pool::get_interface(0); - auto load1 = stream_pool::get_current_load(0); + auto test1 = + cppuddle::executor_recycling::executor_pool::get_interface(0); + auto load1 = + cppuddle::executor_recycling::executor_pool::get_current_load(0); assert(load1 == 0); Interface test1_interface = std::get<0>(test1); size_t test1_index = std::get<1>(test1); - auto test2 = stream_pool::get_interface(0); - auto load2 = stream_pool::get_current_load(0); + auto test2 = + cppuddle::executor_recycling::executor_pool::get_interface(0); + auto load2 = + cppuddle::executor_recycling::executor_pool::get_current_load(0); assert(load2 == 1); Interface test2_interface = std::get<0>(test2); // auto fut = test2_interface.get_future(); size_t test2_index = std::get<1>(test2); - auto test3 = stream_pool::get_interface(0); - auto load3 = stream_pool::get_current_load(0); + auto test3 = + cppuddle::executor_recycling::executor_pool::get_interface(0); + auto load3 = + cppuddle::executor_recycling::executor_pool::get_current_load(0); assert(load3 == 1); Interface test3_interface = std::get<0>(test3); size_t test3_index = std::get<1>(test3); - auto test4 = stream_pool::get_interface(0); - auto load4 = stream_pool::get_current_load(0); + auto test4 = + cppuddle::executor_recycling::executor_pool::get_interface(0); + auto load4 = + cppuddle::executor_recycling::executor_pool::get_current_load(0); Interface test4_interface = std::get<0>(test4); size_t test4_index = std::get<1>(test4); assert(load4 == 2); // Releasing - stream_pool::release_interface(test4_index, 0); - load4 = stream_pool::get_current_load(0); + cppuddle::executor_recycling::executor_pool::release_interface( + test4_index, 0); + load4 = + cppuddle::executor_recycling::executor_pool::get_current_load(0); assert(load4 == 1); - stream_pool::release_interface(test3_index, 0); - load3 = stream_pool::get_current_load(0); + cppuddle::executor_recycling::executor_pool::release_interface( + test3_index, 0); + load3 = + cppuddle::executor_recycling::executor_pool::get_current_load(0); assert(load3 == 1); - stream_pool::release_interface(test2_index, 0); - load2 = stream_pool::get_current_load(0); + cppuddle::executor_recycling::executor_pool::release_interface( + test2_index, 0); + load2 = + cppuddle::executor_recycling::executor_pool::get_current_load(0); assert(load2 == 0); - stream_pool::release_interface(test1_index, 0); - load1 = stream_pool::get_current_load(0); + cppuddle::executor_recycling::executor_pool::release_interface( + test1_index, 0); + load1 = + cppuddle::executor_recycling::executor_pool::get_current_load(0); assert(load1 == 0); } // Clear - auto load0 = stream_pool::get_current_load(0); + auto load0 = + cppuddle::executor_recycling::executor_pool::get_current_load(0); assert(load0 == 0); - stream_pool::cleanup(); + cppuddle::executor_recycling::executor_pool::cleanup(); } template -void test_pool_wrappers(const size_t stream_parameter, Ts &&... 
ts) { - using wrapper_type = stream_interface; +void test_pool_wrappers(const size_t executor_parameter, Ts &&...ts) { + using wrapper_type = + cppuddle::executor_recycling::executor_interface; // init ppol - stream_pool::init(stream_parameter, std::forward(ts)...); + cppuddle::executor_recycling::executor_pool::init( + executor_parameter, std::forward(ts)...); { wrapper_type test1{0}; - auto load = stream_pool::get_current_load(0); + auto load = + cppuddle::executor_recycling::executor_pool::get_current_load(0); assert(load == 0); wrapper_type test2{0}; - load = stream_pool::get_current_load(0); + load = + cppuddle::executor_recycling::executor_pool::get_current_load(0); // auto fut = test2.get_future(); assert(load == 1); wrapper_type test3{0}; - load = stream_pool::get_current_load(0); + load = + cppuddle::executor_recycling::executor_pool::get_current_load(0); assert(load == 1); wrapper_type test4{0}; - load = stream_pool::get_current_load(0); + load = + cppuddle::executor_recycling::executor_pool::get_current_load(0); assert(load == 2); // Check availability method: - bool avail = stream_pool::interface_available(1, 0); + bool avail = + cppuddle::executor_recycling::executor_pool::interface_available< + Interface, Pool>(1, 0); assert(avail == false); // NOLINT - avail = stream_pool::interface_available(2, 0); + avail = cppuddle::executor_recycling::executor_pool::interface_available< + Interface, Pool>(2, 0); assert(avail == false); // NOLINT - avail = stream_pool::interface_available(3, 0); + avail = cppuddle::executor_recycling::executor_pool::interface_available< + Interface, Pool>(3, 0); assert(avail == true); // NOLINT } - auto load0 = stream_pool::get_current_load(0); + auto load0 = + cppuddle::executor_recycling::executor_pool::get_current_load(0); assert(load0 == 0); - stream_pool::cleanup(); + cppuddle::executor_recycling::executor_pool::cleanup(); } #endif diff --git a/tests/work_aggregation_cpu_triad.cpp b/tests/work_aggregation_cpu_triad.cpp index fed34626..d65c9668 100644 --- a/tests/work_aggregation_cpu_triad.cpp +++ b/tests/work_aggregation_cpu_triad.cpp @@ -5,9 +5,8 @@ #include #undef NDEBUG - -#include "../include/aggregation_manager.hpp" -#include "../include/cuda_buffer_util.hpp" +#include "cppuddle/memory_recycling/std_recycling_allocators.hpp" +#include "cppuddle/kernel_aggregation/kernel_aggregation_interface.hpp" #include @@ -101,7 +100,8 @@ int hpx_main(int argc, char *argv[]) { size_t number_underlying_executors{0}; bool print_launch_counter{false}; std::string executor_type_string{}; - Aggregated_Executor_Modes executor_mode{Aggregated_Executor_Modes::EAGER}; + cppuddle::kernel_aggregation::aggregated_executor_modes executor_mode{ + cppuddle::kernel_aggregation::aggregated_executor_modes::EAGER}; std::string filename{}; { try { @@ -161,11 +161,11 @@ int hpx_main(int argc, char *argv[]) { return hpx::finalize(); } if (executor_type_string == "EAGER") { - executor_mode = Aggregated_Executor_Modes::EAGER; + executor_mode = cppuddle::kernel_aggregation::aggregated_executor_modes::EAGER; } else if (executor_type_string == "STRICT") { - executor_mode = Aggregated_Executor_Modes::STRICT; + executor_mode = cppuddle::kernel_aggregation::aggregated_executor_modes::STRICT; } else if (executor_type_string == "ENDLESS") { - executor_mode = Aggregated_Executor_Modes::ENDLESS; + executor_mode = cppuddle::kernel_aggregation::aggregated_executor_modes::ENDLESS; } else { std::cerr << "ERROR: Unknown executor mode " << executor_type_string << "\n Valid choices are: 
EAGER,STRICT,ENDLESS" << std::endl; @@ -180,11 +180,14 @@ int hpx_main(int argc, char *argv[]) { } } - stream_pool::init>( + cppuddle::executor_recycling::executor_pool::init< + Dummy_Executor, + cppuddle::executor_recycling::round_robin_pool_impl>( number_underlying_executors); static const char kernelname[] = "cpu_triad"; - using executor_pool = aggregation_pool>; + using executor_pool = cppuddle::kernel_aggregation::aggregation_pool< + kernelname, Dummy_Executor, + cppuddle::executor_recycling::round_robin_pool_impl>; executor_pool::init(number_aggregation_executors, max_slices, executor_mode); using float_t = float; @@ -290,7 +293,7 @@ int hpx_main(int argc, char *argv[]) { std::flush(hpx::cout); sleep(1); - recycler::force_cleanup(); // Cleanup all buffers and the managers + cppuddle::memory_recycling::force_buffer_cleanup(); // Cleanup all buffers and the managers return hpx::finalize(); } diff --git a/tests/work_aggregation_cuda_triad.cpp b/tests/work_aggregation_cuda_triad.cpp index f3f6ec92..75f7ad14 100644 --- a/tests/work_aggregation_cuda_triad.cpp +++ b/tests/work_aggregation_cuda_triad.cpp @@ -7,24 +7,25 @@ //#undef NDEBUG #include -#include "../include/aggregation_manager.hpp" -#include "../include/cuda_buffer_util.hpp" - #include - +#include "cppuddle/memory_recycling/cuda_recycling_allocators.hpp" +#include "cppuddle/memory_recycling/util/cuda_recycling_device_buffer.hpp" +#include "cppuddle/kernel_aggregation/kernel_aggregation_interface.hpp" //=============================================================================== //=============================================================================== // Stream benchmark template -__global__ void __launch_bounds__(1024, 2) triad_kernel(float_t *A, const float_t *B, const float_t *C, const float_t scalar, const size_t start_id, const size_t kernel_size, const size_t problem_size) { +__global__ void __launch_bounds__(1024, 2) + triad_kernel(float_t *A, const float_t *B, const float_t *C, + const float_t scalar, const size_t start_id, + const size_t kernel_size, const size_t problem_size) { const size_t i = start_id + blockIdx.x * blockDim.x + threadIdx.x; A[i] = B[i] + scalar * C[i]; } - //=============================================================================== //=============================================================================== int hpx_main(int argc, char *argv[]) { @@ -37,7 +38,8 @@ int hpx_main(int argc, char *argv[]) { size_t number_underlying_executors{0}; bool print_launch_counter{false}; std::string executor_type_string{}; - Aggregated_Executor_Modes executor_mode{Aggregated_Executor_Modes::EAGER}; + cppuddle::kernel_aggregation::aggregated_executor_modes executor_mode{ + cppuddle::kernel_aggregation::aggregated_executor_modes::EAGER}; std::string filename{}; { try { @@ -97,11 +99,11 @@ int hpx_main(int argc, char *argv[]) { return hpx::finalize(); } if (executor_type_string == "EAGER") { - executor_mode = Aggregated_Executor_Modes::EAGER; + executor_mode = cppuddle::kernel_aggregation::aggregated_executor_modes::EAGER; } else if (executor_type_string == "STRICT") { - executor_mode = Aggregated_Executor_Modes::STRICT; + executor_mode = cppuddle::kernel_aggregation::aggregated_executor_modes::STRICT; } else if (executor_type_string == "ENDLESS") { - executor_mode = Aggregated_Executor_Modes::ENDLESS; + executor_mode = cppuddle::kernel_aggregation::aggregated_executor_modes::ENDLESS; } else { std::cerr << "ERROR: Unknown executor mode " << executor_type_string << "\n Valid choices are: 
EAGER,STRICT,ENDLESS" << std::endl; @@ -119,11 +121,14 @@ int hpx_main(int argc, char *argv[]) { hpx::cuda::experimental::detail::register_polling(hpx::resource::get_thread_pool(0)); using executor_t = hpx::cuda::experimental::cuda_executor; - stream_pool::init>( + cppuddle::executor_recycling::executor_pool::init< + executor_t, + cppuddle::executor_recycling::round_robin_pool_impl>( number_underlying_executors, 0, true); static const char kernelname2[] = "cuda_triad"; - using executor_pool = aggregation_pool>; + using executor_pool = cppuddle::kernel_aggregation::aggregation_pool< + kernelname2, executor_t, + cppuddle::executor_recycling::round_robin_pool_impl>; executor_pool::init(number_aggregation_executors, max_slices, executor_mode); using float_t = float; @@ -147,9 +152,9 @@ int hpx_main(int argc, char *argv[]) { std::vector A(problem_size, 0.0); std::vector B(problem_size, 2.0); std::vector C(problem_size, 1.0); - recycler::cuda_device_buffer device_A(problem_size, 0); - recycler::cuda_device_buffer device_B(problem_size, 0); - recycler::cuda_device_buffer device_C(problem_size, 0); + cppuddle::memory_recycling::cuda_device_buffer device_A(problem_size, 0); + cppuddle::memory_recycling::cuda_device_buffer device_B(problem_size, 0); + cppuddle::memory_recycling::cuda_device_buffer device_C(problem_size, 0); cudaMemcpy(device_A.device_side_buffer, A.data(), problem_size * sizeof(float_t), cudaMemcpyHostToDevice); cudaMemcpy(device_B.device_side_buffer, B.data(), @@ -196,17 +201,16 @@ int hpx_main(int argc, char *argv[]) { auto slice_exec = fut.get(); auto alloc_host = slice_exec.template make_allocator< - float_t, recycler::detail::cuda_pinned_allocator>(); + float_t, cppuddle::memory_recycling::detail::cuda_pinned_allocator>(); auto alloc_device = slice_exec.template make_allocator< - float_t, recycler::detail::cuda_device_allocator>(); + float_t, cppuddle::memory_recycling::detail::cuda_device_allocator>(); // Start the actual task - // todo -- one slice gets a buffer that's not vaild anymore std::vector local_A( slice_exec.number_slices * kernel_size, float_t{}, alloc_host); - recycler::cuda_aggregated_device_buffer device_A(slice_exec.number_slices * kernel_size, alloc_device); @@ -214,7 +218,7 @@ int hpx_main(int argc, char *argv[]) { std::vector local_B( slice_exec.number_slices * kernel_size, float_t{}, alloc_host); - recycler::cuda_aggregated_device_buffer device_B(slice_exec.number_slices * kernel_size, alloc_device); @@ -222,7 +226,7 @@ int hpx_main(int argc, char *argv[]) { std::vector local_C( slice_exec.number_slices * kernel_size, float_t{}, alloc_host); - recycler::cuda_aggregated_device_buffer device_C(slice_exec.number_slices * kernel_size, alloc_device); @@ -317,9 +321,9 @@ int hpx_main(int argc, char *argv[]) { std::vector A(problem_size, 0.0); std::vector B(problem_size, 2.0); std::vector C(problem_size, 1.0); - recycler::cuda_device_buffer device_A(problem_size, 0); - recycler::cuda_device_buffer device_B(problem_size, 0); - recycler::cuda_device_buffer device_C(problem_size, 0); + cppuddle::memory_recycling::cuda_device_buffer device_A(problem_size, 0); + cppuddle::memory_recycling::cuda_device_buffer device_B(problem_size, 0); + cppuddle::memory_recycling::cuda_device_buffer device_C(problem_size, 0); cudaMemcpy(device_A.device_side_buffer, A.data(), problem_size * sizeof(float_t), cudaMemcpyHostToDevice); cudaMemcpy(device_B.device_side_buffer, B.data(), @@ -417,7 +421,7 @@ int hpx_main(int argc, char *argv[]) { /* sleep(1); */ 
hpx::cuda::experimental::detail::unregister_polling(hpx::resource::get_thread_pool(0)); - recycler::force_cleanup(); // Cleanup all buffers and the managers + cppuddle::memory_recycling::force_buffer_cleanup(); // Cleanup all buffers and the managers return hpx::finalize(); } diff --git a/tests/work_aggregation_test.cpp b/tests/work_aggregation_test.cpp index 25455633..abe827f4 100644 --- a/tests/work_aggregation_test.cpp +++ b/tests/work_aggregation_test.cpp @@ -1,4 +1,4 @@ -// Copyright (c) 2022-2022 Gregor Daiß +// Copyright (c) 2022-2024 Gregor Daiß // // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) @@ -9,11 +9,16 @@ #include #include #include -#include "../include/aggregation_manager.hpp" -#include "../include/cuda_buffer_util.hpp" #include +#include "cppuddle/memory_recycling/cuda_recycling_allocators.hpp" +#include "cppuddle/memory_recycling/util/cuda_recycling_device_buffer.hpp" +#include "cppuddle/executor_recycling/executor_pools_interface.hpp" +#define DEBUG_AGGREGATION_CALLS 1 // enables checks if aggregated function calls are + // compatible across all participating tasks + // Must be defined before including the aggregation: +#include "cppuddle/kernel_aggregation/kernel_aggregation_interface.hpp" //=============================================================================== //=============================================================================== @@ -114,9 +119,11 @@ namespace hpx { namespace parallel { namespace execution { void sequential_test(void) { static const char kernelname[] = "kernel1"; - using kernel_pool1 = aggregation_pool>; - kernel_pool1::init(8, 2, Aggregated_Executor_Modes::STRICT); + using kernel_pool1 = cppuddle::kernel_aggregation::aggregation_pool< + kernelname, Dummy_Executor, + cppuddle::executor_recycling::round_robin_pool_impl>; + kernel_pool1::init( + 8, 2, cppuddle::kernel_aggregation::aggregated_executor_modes::STRICT); // Sequential test hpx::cout << "Sequential test with all executor slices" << std::endl; hpx::cout << "----------------------------------------" << std::endl; @@ -260,8 +267,8 @@ void interruption_test(void) { hpx::cout << "Sequential test with interruption:" << std::endl; hpx::cout << "----------------------------------" << std::endl; { - Aggregated_Executor agg_exec{ - 4, Aggregated_Executor_Modes::EAGER}; + cppuddle::kernel_aggregation::aggregated_executor agg_exec{ + 4, cppuddle::kernel_aggregation::aggregated_executor_modes::EAGER}; std::vector> slices_done_futs; auto slice_fut1 = agg_exec.request_executor_slice(); @@ -326,8 +333,8 @@ void failure_test(bool type_error) { hpx::cout << "------------------------------------------------------" << std::endl; { - Aggregated_Executor agg_exec{ - 4, Aggregated_Executor_Modes::STRICT}; + cppuddle::kernel_aggregation::aggregated_executor agg_exec{ + 4, cppuddle::kernel_aggregation::aggregated_executor_modes::STRICT}; auto slice_fut1 = agg_exec.request_executor_slice(); @@ -405,9 +412,11 @@ void pointer_add_test(void) { hpx::cout << "--------------------------------------------------------" << std::endl; static const char kernelname2[] = "kernel2"; - using kernel_pool2 = aggregation_pool>; - kernel_pool2::init(8, 2, Aggregated_Executor_Modes::STRICT); + using kernel_pool2 = cppuddle::kernel_aggregation::aggregation_pool< + kernelname2, Dummy_Executor, + cppuddle::executor_recycling::round_robin_pool_impl>; + kernel_pool2::init( + 8, 2, 
cppuddle::kernel_aggregation::aggregated_executor_modes::STRICT); { std::vector erg(512); std::vector> slices_done_futs; @@ -602,10 +611,12 @@ void references_add_test(void) { { /*Aggregated_Executor agg_exec{ 4, Aggregated_Executor_Modes::STRICT};*/ - auto &agg_exec = - std::get<0>(stream_pool::get_interface< - Aggregated_Executor, - round_robin_pool>>(0)); + auto &agg_exec = std::get<0>( + cppuddle::executor_recycling::executor_pool::get_interface< + cppuddle::kernel_aggregation::aggregated_executor, + cppuddle::executor_recycling::round_robin_pool_impl< + cppuddle::kernel_aggregation::aggregated_executor< + Dummy_Executor>>>(0)); std::vector erg(512); std::vector> slices_done_futs; @@ -826,14 +837,19 @@ int hpx_main(int argc, char *argv[]) { return hpx::finalize(); } - stream_pool::init>( - 8, 0, false); - stream_pool::init>(8); - - stream_pool::init, - round_robin_pool>>( - 8, 4, Aggregated_Executor_Modes::STRICT); + cppuddle::executor_recycling::executor_pool::init< + hpx::cuda::experimental::cuda_executor, + cppuddle::executor_recycling::round_robin_pool_impl< + hpx::cuda::experimental::cuda_executor>>(8, 0, false); + cppuddle::executor_recycling::executor_pool::init< + Dummy_Executor, + cppuddle::executor_recycling::round_robin_pool_impl>(8); + + cppuddle::executor_recycling::executor_pool::init< + cppuddle::kernel_aggregation::aggregated_executor, + cppuddle::executor_recycling::round_robin_pool_impl< + cppuddle::kernel_aggregation::aggregated_executor>>( + 8, 4, cppuddle::kernel_aggregation::aggregated_executor_modes::STRICT); /*hpx::cuda::experimental::cuda_executor executor1 = std::get<0>(stream_pool::get_interface< hpx::cuda::experimental::cuda_executor, @@ -863,8 +879,8 @@ int hpx_main(int argc, char *argv[]) { std::flush(hpx::cout); sleep(1); - recycler::print_performance_counters(); - recycler::force_cleanup(); // Cleanup all buffers and the managers + cppuddle::memory_recycling::print_buffer_counters(); + cppuddle::memory_recycling::force_buffer_cleanup(); // Cleanup all buffers and the managers return hpx::finalize(); }
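Example usage of the renamed allocator interface (a minimal sketch only: the header path, recycle_std allocator, and the print_buffer_counters/force_buffer_cleanup calls are taken from the test changes above, while the standalone main() scaffolding assumes a non-HPX build of CPPuddle and is purely illustrative):

// Minimal check of the new cppuddle::memory_recycling names used by the
// updated tests; not part of the patch itself.
#include <vector>

#include "cppuddle/memory_recycling/std_recycling_allocators.hpp"

int main() {
  {
    // Buffers allocated through the recycling allocator are handed back to the
    // internal pool on destruction and reused by subsequent allocations.
    std::vector<double, cppuddle::memory_recycling::recycle_std<double>>
        buffer(512, 0.0);
  }
  // Replaces the former recycler::print_performance_counters() and
  // recycler::force_cleanup() calls from the deprecated interface.
  cppuddle::memory_recycling::print_buffer_counters();
  cppuddle::memory_recycling::force_buffer_cleanup();
  return 0;
}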