From 875d93b2d0a324228f80c2c0b74bf5bddb4e3a0c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20Dai=C3=9F?= Date: Wed, 6 Mar 2024 10:27:16 +0100 Subject: [PATCH 01/19] Rework allocator namespace 1 --- include/aggregation_manager.hpp | 6 +- ...l.hpp => aligned_recycling_allocators.hpp} | 10 +-- ...util.hpp => cuda_recycling_allocators.hpp} | 39 +++++------- .../buffer_recycler.hpp} | 63 +++++++++---------- include/detail/config.hpp | 4 +- ..._util.hpp => hip_recycling_allocators.hpp} | 35 ++++++----- ...fer_util.hpp => recycling_kokkos_view.hpp} | 54 ++++++++-------- ...util.hpp => sycl_recycling_allocators.hpp} | 12 ++-- tests/allocator_aligned_test.cpp | 18 +++--- tests/allocator_hpx_test.cpp | 22 +++---- tests/allocator_test.cpp | 16 ++--- 11 files changed, 137 insertions(+), 142 deletions(-) rename include/{aligned_buffer_util.hpp => aligned_recycling_allocators.hpp} (85%) rename include/{cuda_buffer_util.hpp => cuda_recycling_allocators.hpp} (96%) rename include/{buffer_manager.hpp => detail/buffer_recycler.hpp} (95%) rename include/{hip_buffer_util.hpp => hip_recycling_allocators.hpp} (96%) rename include/{kokkos_buffer_util.hpp => recycling_kokkos_view.hpp} (76%) rename include/{sycl_buffer_util.hpp => sycl_recycling_allocators.hpp} (93%) diff --git a/include/aggregation_manager.hpp b/include/aggregation_manager.hpp index 2aa92063..cd1ca74b 100644 --- a/include/aggregation_manager.hpp +++ b/include/aggregation_manager.hpp @@ -592,9 +592,9 @@ template class Aggregated_Executor { #endif // Get shiny and new buffer that will be shared between all slices // Buffer might be recycled from previous allocations by the - // buffer_recycler... + // buffer_interface... T *aggregated_buffer = - recycler::detail::buffer_recycler::get( + recycler::detail::buffer_interface::get( size, manage_content_lifetime, location_id, gpu_id); // Create buffer entry for this buffer buffer_allocations.emplace_back(static_cast(aggregated_buffer), @@ -670,7 +670,7 @@ template class Aggregated_Executor { // it as invalid) if (valid) { assert(buffers_in_use == true); - recycler::detail::buffer_recycler::mark_unused( + recycler::detail::buffer_interface::mark_unused( buffer_pointer, buffer_size, location_id, gpu_id); // mark buffer as invalid to prevent any other slice from marking the // buffer as unused diff --git a/include/aligned_buffer_util.hpp b/include/aligned_recycling_allocators.hpp similarity index 85% rename from include/aligned_buffer_util.hpp rename to include/aligned_recycling_allocators.hpp index d36a994a..b1ed5dce 100644 --- a/include/aligned_buffer_util.hpp +++ b/include/aligned_recycling_allocators.hpp @@ -3,13 +3,13 @@ // Distributed under the Boost Software License, Version 1.0. 
(See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) -#ifndef ALIGNED_BUFFER_UTIL_HPP -#define ALIGNED_BUFFER_UTIL_HPP +#ifndef ALIGNED_RECYCLING_ALLOCATORS_HPP +#define ALIGNED_RECYCLING_ALLOCATORS_HPP -#include "buffer_manager.hpp" +#include "detail/buffer_recycler.hpp" #include -namespace recycler { +namespace cppuddle { namespace device_selection { template struct select_device_functor< @@ -26,6 +26,6 @@ template ::value, int> = 0> using aggressive_recycle_aligned = detail::aggressive_recycle_allocator< T, boost::alignment::aligned_allocator>; -} // namespace recycler +} // namespace cppuddle #endif diff --git a/include/cuda_buffer_util.hpp b/include/cuda_recycling_allocators.hpp similarity index 96% rename from include/cuda_buffer_util.hpp rename to include/cuda_recycling_allocators.hpp index 55d3397a..2452a563 100644 --- a/include/cuda_buffer_util.hpp +++ b/include/cuda_recycling_allocators.hpp @@ -1,24 +1,21 @@ -// Copyright (c) 2020-2023 Gregor Daiß +// Copyright (c) 2020-2024 Gregor Daiß // // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) -#ifndef CUDA_BUFFER_UTIL_HPP -#define CUDA_BUFFER_UTIL_HPP +#ifndef CUDA_RECYCLING_ALLOCATORS_HPP +#define CUDA_RECYCLING_ALLOCATORS_HPP -#include "buffer_manager.hpp" +#include "detail/buffer_recycler.hpp" #include "detail/config.hpp" #include #include #include -namespace recycler { - +namespace cppuddle { namespace detail { - - template struct cuda_pinned_allocator { using value_type = T; cuda_pinned_allocator() noexcept = default; @@ -98,9 +95,19 @@ constexpr bool operator!=(cuda_device_allocator const &, cuda_device_allocator const &) noexcept { return false; } +} // end namespace detail -} // end namespace detail +namespace device_selection { +template +struct select_device_functor> { + void operator()(const size_t device_id) { cudaSetDevice(device_id); } +}; +template +struct select_device_functor> { + void operator()(const size_t device_id) { cudaSetDevice(device_id); } +}; +} // namespace device_selection template ::value, int> = 0> using recycle_allocator_cuda_host = @@ -154,17 +161,5 @@ struct cuda_aggregated_device_buffer { Host_Allocator &alloc; // will stay valid for the entire aggregation region and hence // for the entire lifetime of this buffer }; - -namespace device_selection { -template -struct select_device_functor> { - void operator()(const size_t device_id) { cudaSetDevice(device_id); } -}; -template -struct select_device_functor> { - void operator()(const size_t device_id) { cudaSetDevice(device_id); } -}; -} // namespace device_selection - -} // end namespace recycler +} // end namespace cppuddle #endif diff --git a/include/buffer_manager.hpp b/include/detail/buffer_recycler.hpp similarity index 95% rename from include/buffer_manager.hpp rename to include/detail/buffer_recycler.hpp index 92a5f46b..3ad739d8 100644 --- a/include/buffer_manager.hpp +++ b/include/detail/buffer_recycler.hpp @@ -42,9 +42,9 @@ For better performance configure CPPuddle with CPPUDDLE_WITH_HPX_AWARE_ALLOCATOR #endif #endif -#include "../include/detail/config.hpp" +#include "config.hpp" -namespace recycler { +namespace cppuddle { namespace device_selection { template struct select_device_functor { @@ -63,7 +63,7 @@ template struct select_device_functor> { namespace detail { -class buffer_recycler { +class buffer_interface { public: #if defined(CPPUDDLE_DEACTIVATE_BUFFER_RECYCLING) @@ -172,8 +172,8 @@ For better 
performance configure CPPuddle with CPPUDDLE_WITH_BUFFER_RECYCLING=ON private: /// Singleton instance access - static buffer_recycler& instance() { - static buffer_recycler singleton{}; + static buffer_interface& instance() { + static buffer_interface singleton{}; return singleton; } /// Callbacks for printing the performance counter data @@ -189,7 +189,7 @@ For better performance configure CPPuddle with CPPUDDLE_WITH_BUFFER_RECYCLING=ON std::list> partial_cleanup_callbacks; /// default, private constructor - not automatically constructed due to the /// deleted constructors - buffer_recycler() = default; + buffer_interface() = default; mutex_t callback_protection_mut; /// Add a callback function that gets executed upon cleanup and destruction @@ -217,7 +217,7 @@ For better performance configure CPPuddle with CPPUDDLE_WITH_BUFFER_RECYCLING=ON } public: - ~buffer_recycler() = default; + ~buffer_interface() = default; // Subclasses private: @@ -408,7 +408,7 @@ For better performance configure CPPuddle with CPPUDDLE_WITH_BUFFER_RECYCLING=ON // No unused buffer found -> Create new one and return it try { - recycler::device_selection::select_device_functor{}( + cppuddle::device_selection::select_device_functor{}( device_id); Host_Allocator alloc; T *buffer = alloc.allocate(number_of_elements); @@ -428,13 +428,13 @@ For better performance configure CPPuddle with CPPUDDLE_WITH_BUFFER_RECYCLING=ON std::cerr << "Not enough memory left. Cleaning up unused buffers now..." << std::endl; - buffer_recycler::clean_unused_buffers(); + buffer_interface::clean_unused_buffers(); std::cerr << "Buffers cleaned! Try allocation again..." << std::endl; // If there still isn't enough memory left, the caller has to handle it // We've done all we can in here Host_Allocator alloc; - recycler::device_selection::select_device_functor{}( + cppuddle::device_selection::select_device_functor{}( device_id); T *buffer = alloc.allocate(number_of_elements); instance()[location_id].buffer_map.insert( @@ -649,13 +649,13 @@ For better performance configure CPPuddle with CPPUDDLE_WITH_BUFFER_RECYCLING=ON std::call_once(flag, []() { #endif is_finalized = false; - buffer_recycler::add_total_cleanup_callback(clean); - buffer_recycler::add_partial_cleanup_callback( + buffer_interface::add_total_cleanup_callback(clean); + buffer_interface::add_partial_cleanup_callback( clean_unused_buffers_only); - buffer_recycler::add_finalize_callback( + buffer_interface::add_finalize_callback( finalize); #ifdef CPPUDDLE_HAVE_COUNTERS - buffer_recycler::add_print_callback( + buffer_interface::add_print_callback( print_performance_counters); #endif }); @@ -753,10 +753,10 @@ For better performance configure CPPuddle with CPPUDDLE_WITH_BUFFER_RECYCLING=ON public: // Putting deleted constructors in public gives more useful error messages // Bunch of constructors we don't need - buffer_recycler(buffer_recycler const &other) = delete; - buffer_recycler& operator=(buffer_recycler const &other) = delete; - buffer_recycler(buffer_recycler &&other) = delete; - buffer_recycler& operator=(buffer_recycler &&other) = delete; + buffer_interface(buffer_interface const &other) = delete; + buffer_interface& operator=(buffer_interface const &other) = delete; + buffer_interface(buffer_interface &&other) = delete; + buffer_interface& operator=(buffer_interface &&other) = delete; }; template struct recycle_allocator { @@ -775,11 +775,11 @@ template struct recycle_allocator { recycle_allocator const &other) noexcept : dealloc_hint(std::nullopt), device_id(std::nullopt) {} T 
*allocate(std::size_t n) { - T *data = buffer_recycler::get(n); + T *data = buffer_interface::get(n); return data; } void deallocate(T *p, std::size_t n) { - buffer_recycler::mark_unused(p, n); + buffer_interface::mark_unused(p, n); } #else recycle_allocator() noexcept @@ -792,12 +792,12 @@ template struct recycle_allocator { recycle_allocator const &other) noexcept : dealloc_hint(other.dealloc_hint), device_id(other.device_id) {} T *allocate(std::size_t n) { - T *data = buffer_recycler::get( + T *data = buffer_interface::get( n, false, hpx::get_worker_thread_num() % number_instances, device_id); return data; } void deallocate(T *p, std::size_t n) { - buffer_recycler::mark_unused(p, n, dealloc_hint, + buffer_interface::mark_unused(p, n, dealloc_hint, device_id); } #endif @@ -845,12 +845,12 @@ struct aggressive_recycle_allocator { aggressive_recycle_allocator const &) noexcept : dealloc_hint(std::nullopt), device_id(std::nullopt) {} T *allocate(std::size_t n) { - T *data = buffer_recycler::get( + T *data = buffer_interface::get( n, true); // also initializes the buffer if it isn't reused return data; } void deallocate(T *p, std::size_t n) { - buffer_recycler::mark_unused(p, n); + buffer_interface::mark_unused(p, n); } #else aggressive_recycle_allocator() noexcept @@ -863,13 +863,13 @@ struct aggressive_recycle_allocator { recycle_allocator const &other) noexcept : dealloc_hint(other.dealloc_hint), device_id(other.device_id) {} T *allocate(std::size_t n) { - T *data = buffer_recycler::get( + T *data = buffer_interface::get( n, true, dealloc_hint, device_id); // also initializes the buffer // if it isn't reused return data; } void deallocate(T *p, std::size_t n) { - buffer_recycler::mark_unused(p, n, dealloc_hint, + buffer_interface::mark_unused(p, n, dealloc_hint, device_id); } #endif @@ -914,7 +914,6 @@ operator!=(aggressive_recycle_allocator const &, else return true; } - } // namespace detail template ::value, int> = 0> @@ -923,16 +922,16 @@ template ::value, int> = 0> using aggressive_recycle_std = detail::aggressive_recycle_allocator>; -inline void print_performance_counters() { detail::buffer_recycler::print_performance_counters(); } +inline void print_performance_counters() { detail::buffer_interface::print_performance_counters(); } /// Deletes all buffers (even ones still marked as used), delete the buffer /// managers and the recycler itself -inline void force_cleanup() { detail::buffer_recycler::clean_all(); } +inline void force_cleanup() { detail::buffer_interface::clean_all(); } /// Deletes all buffers currently marked as unused -inline void cleanup() { detail::buffer_recycler::clean_unused_buffers(); } +inline void cleanup() { detail::buffer_interface::clean_unused_buffers(); } /// Deletes all buffers (even ones still marked as used), delete the buffer /// managers and the recycler itself. Disallows further usage. 
-inline void finalize() { detail::buffer_recycler::finalize(); } +inline void finalize() { detail::buffer_interface::finalize(); } -} // end namespace recycler +} // end namespace cppuddle #endif diff --git a/include/detail/config.hpp b/include/detail/config.hpp index 2a06b1af..7115c790 100644 --- a/include/detail/config.hpp +++ b/include/detail/config.hpp @@ -28,7 +28,7 @@ For better performance configure CPPuddle with CPPUDDLE_WITH_HPX_AWARE_ALLOCATOR #endif #endif -namespace recycler { +namespace cppuddle { #if defined(CPPUDDLE_HAVE_HPX) && defined(CPPUDDLE_HAVE_HPX_MUTEX) using mutex_t = hpx::spinlock_no_backoff; @@ -67,6 +67,6 @@ inline size_t get_device_id(const size_t number_gpus) { #endif } -} // end namespace recycler +} // end namespace cppuddle #endif diff --git a/include/hip_buffer_util.hpp b/include/hip_recycling_allocators.hpp similarity index 96% rename from include/hip_buffer_util.hpp rename to include/hip_recycling_allocators.hpp index e2364095..465bd5fe 100644 --- a/include/hip_buffer_util.hpp +++ b/include/hip_recycling_allocators.hpp @@ -3,16 +3,17 @@ // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) -#ifndef HIP_BUFFER_UTIL_HPP -#define HIP_BUFFER_UTIL_HPP +#ifndef HIP_RECYCLING_ALLOCATORS_HPP +#define HIP_RECYCLING_ALLOCATORS_HPP -#include "buffer_manager.hpp" +#include "detail/buffer_recycler.hpp" +#include "detail/config.hpp" #include #include #include -namespace recycler { +namespace cppuddle { namespace detail { @@ -103,6 +104,18 @@ constexpr bool operator!=(hip_device_allocator const &, } // end namespace detail + +namespace device_selection { +template +struct select_device_functor> { + void operator()(const size_t device_id) { hipSetDevice(device_id); } +}; +template +struct select_device_functor> { + void operator()(const size_t device_id) { hipSetDevice(device_id); } +}; +} // namespace device_selection + template ::value, int> = 0> using recycle_allocator_hip_host = detail::aggressive_recycle_allocator>; @@ -110,7 +123,6 @@ template ::value, int> = 0> using recycle_allocator_hip_device = detail::recycle_allocator>; -// TODO Is this even required? (cuda version should work fine...) 
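// --------------------------------------------------------------------------
// Usage sketch (illustration only, not a hunk of this patch): after the
// namespace rework, host-side call sites use cppuddle:: exactly as the updated
// tests in this patch do. Minimal example; header path as of this patch
// (later patches in the series rename the header and the cleanup functions):

#include "detail/buffer_recycler.hpp"
#include <vector>

int main() {
  {
    // allocation goes through the recycler and may hand back a reused buffer
    std::vector<double, cppuddle::recycle_std<double>> buf(1000, 0.0);
  } // destruction only marks the buffer as unused; it stays cached for reuse
  cppuddle::print_performance_counters(); // renamed to print_buffer_counters() in patch 2
  cppuddle::force_cleanup();              // really frees all cached buffers and managers
  return 0;
}
// --------------------------------------------------------------------------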
template ::value, int> = 0> struct hip_device_buffer { recycle_allocator_hip_device allocator; @@ -157,16 +169,5 @@ struct hip_aggregated_device_buffer { // for the entire lifetime of this buffer }; -namespace device_selection { -template -struct select_device_functor> { - void operator()(const size_t device_id) { hipSetDevice(device_id); } -}; -template -struct select_device_functor> { - void operator()(const size_t device_id) { hipSetDevice(device_id); } -}; -} // namespace device_selection - -} // end namespace recycler +} // end namespace cppuddle #endif diff --git a/include/kokkos_buffer_util.hpp b/include/recycling_kokkos_view.hpp similarity index 76% rename from include/kokkos_buffer_util.hpp rename to include/recycling_kokkos_view.hpp index 2945b422..d89dc0c4 100644 --- a/include/kokkos_buffer_util.hpp +++ b/include/recycling_kokkos_view.hpp @@ -7,10 +7,10 @@ #define KOKKOS_BUFFER_UTIL_HPP #include #include -#include +#include #include -namespace recycler { +namespace cppuddle { template struct view_deleter { @@ -24,7 +24,7 @@ struct view_deleter { }; template -class aggregated_recycled_view : public kokkos_type { +class aggregated_recycle_view : public kokkos_type { private: alloc_type allocator; size_t total_elements{0}; @@ -34,7 +34,7 @@ class aggregated_recycled_view : public kokkos_type { public: using view_type = kokkos_type; template - explicit aggregated_recycled_view(alloc_type &alloc, Args... args) + explicit aggregated_recycle_view(alloc_type &alloc, Args... args) : kokkos_type( alloc.allocate(kokkos_type::required_allocation_size(args...) / sizeof(element_type)), @@ -45,15 +45,15 @@ class aggregated_recycled_view : public kokkos_type { data_ref_counter(this->data(), view_deleter( alloc, total_elements)) {} - aggregated_recycled_view( - const aggregated_recycled_view &other) + aggregated_recycle_view( + const aggregated_recycle_view &other) : kokkos_type(other), allocator(other.allocator) { data_ref_counter = other.data_ref_counter; total_elements = other.total_elements; } - aggregated_recycled_view & - operator=(const aggregated_recycled_view &other) { + aggregated_recycle_view & + operator=(const aggregated_recycle_view &other) { data_ref_counter = other.data_ref_counter; allocator = other.allocator; kokkos_type::operator=(other); @@ -61,15 +61,15 @@ class aggregated_recycled_view : public kokkos_type { return *this; } - aggregated_recycled_view( - aggregated_recycled_view &&other) noexcept + aggregated_recycle_view( + aggregated_recycle_view &&other) noexcept : kokkos_type(other), allocator(other.allocator) { data_ref_counter = other.data_ref_counter; total_elements = other.total_elements; } - aggregated_recycled_view &operator=( - aggregated_recycled_view &&other) noexcept { + aggregated_recycle_view &operator=( + aggregated_recycle_view &&other) noexcept { data_ref_counter = other.data_ref_counter; allocator = other.allocator; kokkos_type::operator=(other); @@ -77,12 +77,12 @@ class aggregated_recycled_view : public kokkos_type { return *this; } - ~aggregated_recycled_view() {} + ~aggregated_recycle_view() {} }; template -class recycled_view : public kokkos_type { +class recycle_view : public kokkos_type { private: size_t total_elements{0}; std::shared_ptr data_ref_counter; @@ -92,7 +92,7 @@ class recycled_view : public kokkos_type { static_assert(std::is_same_v); template = true> - recycled_view(Args... args) + recycle_view(Args... args) : kokkos_type( alloc_type{}.allocate(kokkos_type::required_allocation_size(args...) 
/ sizeof(element_type)), @@ -104,7 +104,7 @@ class recycled_view : public kokkos_type { template = true> - recycled_view(const size_t device_id, Args... args) + recycle_view(const size_t device_id, Args... args) : kokkos_type( alloc_type{device_id}.allocate(kokkos_type::required_allocation_size(args...) / sizeof(element_type)), @@ -117,7 +117,7 @@ class recycled_view : public kokkos_type { template < typename layout_t, std::enable_if_t::value, bool> = true> - recycled_view(std::size_t device_id, layout_t layout) + recycle_view(std::size_t device_id, layout_t layout) : kokkos_type( alloc_type{device_id}.allocate(kokkos_type::required_allocation_size(layout) / sizeof(element_type)), @@ -127,41 +127,41 @@ class recycled_view : public kokkos_type { data_ref_counter(this->data(), view_deleter( alloc_type{device_id}, total_elements)) {} - recycled_view( - const recycled_view &other) + recycle_view( + const recycle_view &other) : kokkos_type(other) { total_elements = other.total_elements; data_ref_counter = other.data_ref_counter; } - recycled_view & - operator=(const recycled_view &other) { + recycle_view & + operator=(const recycle_view &other) { data_ref_counter = other.data_ref_counter; kokkos_type::operator=(other); total_elements = other.total_elements; return *this; } - recycled_view( - recycled_view &&other) noexcept + recycle_view( + recycle_view &&other) noexcept : kokkos_type(other) { data_ref_counter = other.data_ref_counter; total_elements = other.total_elements; } - recycled_view &operator=( - recycled_view &&other) noexcept { + recycle_view &operator=( + recycle_view &&other) noexcept { data_ref_counter = other.data_ref_counter; kokkos_type::operator=(other); total_elements = other.total_elements; return *this; } - ~recycled_view() { } + ~recycle_view() { } }; -} // end namespace recycler +} // end namespace cppuddle #endif diff --git a/include/sycl_buffer_util.hpp b/include/sycl_recycling_allocators.hpp similarity index 93% rename from include/sycl_buffer_util.hpp rename to include/sycl_recycling_allocators.hpp index 61d22f8f..63511de5 100644 --- a/include/sycl_buffer_util.hpp +++ b/include/sycl_recycling_allocators.hpp @@ -3,17 +3,17 @@ // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) -#ifndef SYCL_BUFFER_UTIL_HPP -#define SYCL_BUFFER_UTIL_HPP +#ifndef SYCL_RECYCLING_ALLOCATORS_HPP +#define SYCL_RECYCLING_ALLOCATORS_HPP -#include "buffer_manager.hpp" +#include "detail/buffer_recycler.hpp" +#include "detail/config.hpp" #include #include #include -namespace recycler { - +namespace cppuddle { namespace detail { static_assert(max_number_gpus == 1, "CPPuddle currently does not support MultiGPU SYCL builds!"); @@ -79,5 +79,5 @@ template ::value, int> = 0> using recycle_allocator_sycl_device = detail::recycle_allocator>; -} // end namespace recycler +} // end namespace cppuddle #endif diff --git a/tests/allocator_aligned_test.cpp b/tests/allocator_aligned_test.cpp index c3c09217..1bc8cf53 100644 --- a/tests/allocator_aligned_test.cpp +++ b/tests/allocator_aligned_test.cpp @@ -3,8 +3,8 @@ // Distributed under the Boost Software License, Version 1.0. 
(See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) -#include "../include/buffer_manager.hpp" -#include "../include/aligned_buffer_util.hpp" +#include "../include/detail/buffer_recycler.hpp" +#include "../include/aligned_recycling_allocators.hpp" #ifdef CPPUDDLE_HAVE_HPX #include #endif @@ -79,7 +79,7 @@ int main(int argc, char *argv[]) { << std::endl; for (size_t pass = 0; pass < passes; pass++) { auto begin = std::chrono::high_resolution_clock::now(); - std::vector> + std::vector> test1(array_size, double{}); auto end = std::chrono::high_resolution_clock::now(); aggressive_duration += @@ -92,8 +92,8 @@ int main(int argc, char *argv[]) { std::cout << "\n==> Aggressive recycle allocation test took " << aggressive_duration << "ms" << std::endl; } - recycler::print_performance_counters(); - recycler::force_cleanup(); // Cleanup all buffers and the managers for better + cppuddle::print_performance_counters(); + cppuddle::force_cleanup(); // Cleanup all buffers and the managers for better // comparison // Recycle Test: @@ -101,7 +101,7 @@ int main(int argc, char *argv[]) { std::cout << "\nStarting run with recycle allocator: " << std::endl; for (size_t pass = 0; pass < passes; pass++) { auto begin = std::chrono::high_resolution_clock::now(); - std::vector> test1( + std::vector> test1( array_size, double{}); auto end = std::chrono::high_resolution_clock::now(); recycle_duration += @@ -114,8 +114,8 @@ int main(int argc, char *argv[]) { std::cout << "\n\n==> Recycle allocation test took " << recycle_duration << "ms" << std::endl; } - recycler::print_performance_counters(); - recycler::force_cleanup(); // Cleanup all buffers and the managers for better + cppuddle::print_performance_counters(); + cppuddle::force_cleanup(); // Cleanup all buffers and the managers for better // comparison // Same test using std::allocator: @@ -146,7 +146,7 @@ int main(int argc, char *argv[]) { std::cout << "Test information: Aggressive recycler was faster than default allocator!" 
<< std::endl; } - recycler::print_performance_counters(); + cppuddle::print_performance_counters(); #ifdef CPPUDDLE_HAVE_HPX return hpx::finalize(); #else diff --git a/tests/allocator_hpx_test.cpp b/tests/allocator_hpx_test.cpp index 9d8cc44b..d0e632f0 100644 --- a/tests/allocator_hpx_test.cpp +++ b/tests/allocator_hpx_test.cpp @@ -15,7 +15,7 @@ #include -#include "../include/buffer_manager.hpp" +#include "../include/detail/buffer_recycler.hpp" int hpx_main(int argc, char *argv[]) { @@ -112,7 +112,7 @@ int hpx_main(int argc, char *argv[]) { for (size_t pass = 0; pass < passes; pass++) { for (size_t i = 0; i < number_futures; i++) { futs[i] = futs[i].then([&](hpx::shared_future &&predecessor) { - std::vector> test6(array_size, + std::vector> test6(array_size, double{}); }); } @@ -126,20 +126,20 @@ int hpx_main(int argc, char *argv[]) { std::cout << "\n==> Recycle allocation test took " << recycle_duration << "ms" << std::endl; } - recycler::print_performance_counters(); - recycler::force_cleanup(); // Cleanup all buffers and the managers for better + cppuddle::print_performance_counters(); + cppuddle::force_cleanup(); // Cleanup all buffers and the managers for better // comparison // ensure that at least 4 buffers have to created for unit testing { - std::vector> buffer1( + std::vector> buffer1( array_size, double{}); - std::vector> buffer2( + std::vector> buffer2( array_size, double{}); - std::vector> buffer3( + std::vector> buffer3( array_size, double{}); - std::vector> buffer4( + std::vector> buffer4( array_size, double{}); } @@ -153,7 +153,7 @@ int hpx_main(int argc, char *argv[]) { for (size_t pass = 0; pass < passes; pass++) { for (size_t i = 0; i < number_futures; i++) { futs[i] = futs[i].then([&](hpx::shared_future &&predecessor) { - std::vector> test6( + std::vector> test6( array_size, double{}); }); } @@ -167,8 +167,8 @@ int hpx_main(int argc, char *argv[]) { std::cout << "\n==> Aggressive recycle allocation test took " << aggressive_duration << "ms" << std::endl; } - recycler::print_performance_counters(); - recycler::force_cleanup(); // Cleanup all buffers and the managers for better + cppuddle::print_performance_counters(); + cppuddle::force_cleanup(); // Cleanup all buffers and the managers for better // comparison diff --git a/tests/allocator_test.cpp b/tests/allocator_test.cpp index 004368a4..54fb5dee 100644 --- a/tests/allocator_test.cpp +++ b/tests/allocator_test.cpp @@ -3,7 +3,7 @@ // Distributed under the Boost Software License, Version 1.0. 
(See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) -#include "../include/buffer_manager.hpp" +#include "../include/detail/buffer_recycler.hpp" #ifdef CPPUDDLE_HAVE_HPX #include #endif @@ -76,7 +76,7 @@ int main(int argc, char *argv[]) { std::cout << "\nStarting run with aggressive recycle allocator: " << std::endl; for (size_t pass = 0; pass < passes; pass++) { auto begin = std::chrono::high_resolution_clock::now(); - std::vector> test1( + std::vector> test1( array_size, double{}); auto end = std::chrono::high_resolution_clock::now(); aggressive_duration += @@ -88,8 +88,8 @@ int main(int argc, char *argv[]) { std::cout << "\n\n==> Aggressive recycle allocation test took " << aggressive_duration << "ms" << std::endl; } - recycler::print_performance_counters(); - recycler::force_cleanup(); // Cleanup all buffers and the managers for better + cppuddle::print_performance_counters(); + cppuddle::force_cleanup(); // Cleanup all buffers and the managers for better // comparison // Recycle Test: @@ -97,7 +97,7 @@ int main(int argc, char *argv[]) { std::cout << "\nStarting run with recycle allocator: " << std::endl; for (size_t pass = 0; pass < passes; pass++) { auto begin = std::chrono::high_resolution_clock::now(); - std::vector> test1(array_size, double{}); + std::vector> test1(array_size, double{}); auto end = std::chrono::high_resolution_clock::now(); recycle_duration += std::chrono::duration_cast(end - begin) @@ -108,8 +108,8 @@ int main(int argc, char *argv[]) { std::cout << "\n\n==> Recycle allocation test took " << recycle_duration << "ms" << std::endl; } - recycler::print_performance_counters(); - recycler::force_cleanup(); // Cleanup all buffers and the managers for better + cppuddle::print_performance_counters(); + cppuddle::force_cleanup(); // Cleanup all buffers and the managers for better // comparison // Same test using std::allocator: @@ -138,7 +138,7 @@ int main(int argc, char *argv[]) { std::cout << "Test information: Aggressive recycler was faster than default allocator!" 
<< std::endl; } - recycler::print_performance_counters(); + cppuddle::print_performance_counters(); #ifdef CPPUDDLE_HAVE_HPX return hpx::finalize(); #else From d9ae1f82cb30460aa941f9e2bd98c379a1867487 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20Dai=C3=9F?= Date: Wed, 6 Mar 2024 18:09:50 +0100 Subject: [PATCH 02/19] Rework allocator namespace 2 --- include/aligned_recycling_allocators.hpp | 7 ++- include/cuda_recycling_allocators.hpp | 13 +++-- ...fer_recycler.hpp => buffer_management.hpp} | 50 ++++++++----------- include/hip_recycling_allocators.hpp | 14 ++++-- include/recycling_kokkos_view.hpp | 4 +- include/sycl_recycling_allocators.hpp | 14 ++++-- tests/allocator_aligned_test.cpp | 11 ++-- tests/allocator_hpx_test.cpp | 10 ++-- ...llocator_kokkos_executor_for_loop_test.cpp | 20 ++++---- tests/allocator_kokkos_test.cpp | 10 ++-- tests/allocator_test.cpp | 17 ++++--- 11 files changed, 93 insertions(+), 77 deletions(-) rename include/detail/{buffer_recycler.hpp => buffer_management.hpp} (96%) diff --git a/include/aligned_recycling_allocators.hpp b/include/aligned_recycling_allocators.hpp index b1ed5dce..039a19f2 100644 --- a/include/aligned_recycling_allocators.hpp +++ b/include/aligned_recycling_allocators.hpp @@ -6,22 +6,27 @@ #ifndef ALIGNED_RECYCLING_ALLOCATORS_HPP #define ALIGNED_RECYCLING_ALLOCATORS_HPP -#include "detail/buffer_recycler.hpp" #include +#include "buffer_management_interface.hpp" namespace cppuddle { namespace device_selection { template +/// Dummy GPU selector. Needs to be defined for MultiGPU builds as the default / +/// select_device_functor does not compile for > 1 GPU (to make sure all / +/// relevant allocators support multigpu) struct select_device_functor< T, boost::alignment::aligned_allocator> { void operator()(const size_t device_id) {} }; } // namespace device_selection +/// Recycling allocator for boost aligned memory template ::value, int> = 0> using recycle_aligned = detail::recycle_allocator< T, boost::alignment::aligned_allocator>; +/// Recycling allocator for boost aligned memory (reusing previous content as well) template ::value, int> = 0> using aggressive_recycle_aligned = detail::aggressive_recycle_allocator< diff --git a/include/cuda_recycling_allocators.hpp b/include/cuda_recycling_allocators.hpp index 2452a563..b4cf8efb 100644 --- a/include/cuda_recycling_allocators.hpp +++ b/include/cuda_recycling_allocators.hpp @@ -6,16 +6,16 @@ #ifndef CUDA_RECYCLING_ALLOCATORS_HPP #define CUDA_RECYCLING_ALLOCATORS_HPP -#include "detail/buffer_recycler.hpp" -#include "detail/config.hpp" - #include #include #include +#include "buffer_management_interface.hpp" + namespace cppuddle { namespace detail { +/// Underlying host allocator for CUDA pinned memory template struct cuda_pinned_allocator { using value_type = T; cuda_pinned_allocator() noexcept = default; @@ -57,6 +57,7 @@ constexpr bool operator!=(cuda_pinned_allocator const &, return false; } +/// Underlying allocator for CUDA device memory template struct cuda_device_allocator { using value_type = T; cuda_device_allocator() noexcept = default; @@ -99,23 +100,28 @@ constexpr bool operator!=(cuda_device_allocator const &, namespace device_selection { +/// GPU device selector using the CUDA API for pinned host allocations template struct select_device_functor> { void operator()(const size_t device_id) { cudaSetDevice(device_id); } }; +/// GPU selector using the CUDA API for pinned host allocations template struct select_device_functor> { void operator()(const size_t device_id) { 
cudaSetDevice(device_id); } }; } // namespace device_selection +/// Recycling allocator for CUDA pinned host memory template ::value, int> = 0> using recycle_allocator_cuda_host = detail::aggressive_recycle_allocator>; +/// Recycling allocator for CUDA device memory template ::value, int> = 0> using recycle_allocator_cuda_device = detail::recycle_allocator>; +/// RAII wrapper for CUDA device memory template ::value, int> = 0> struct cuda_device_buffer { recycle_allocator_cuda_device allocator; @@ -139,6 +145,7 @@ struct cuda_device_buffer { }; +/// RAII wrapper for CUDA device memory using a passed aggregated allocator template ::value, int> = 0> struct cuda_aggregated_device_buffer { T *device_side_buffer; diff --git a/include/detail/buffer_recycler.hpp b/include/detail/buffer_management.hpp similarity index 96% rename from include/detail/buffer_recycler.hpp rename to include/detail/buffer_management.hpp index 3ad739d8..5d640983 100644 --- a/include/detail/buffer_recycler.hpp +++ b/include/detail/buffer_management.hpp @@ -3,8 +3,8 @@ // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) -#ifndef BUFFER_MANAGER_HPP -#define BUFFER_MANAGER_HPP +#ifndef BUFFER_MANAGEMENT_HPP +#define BUFFER_MANAGEMENT_HPP #include #include @@ -47,6 +47,10 @@ For better performance configure CPPuddle with CPPUDDLE_WITH_HPX_AWARE_ALLOCATOR namespace cppuddle { namespace device_selection { +/// Default device selector - No MultGPU support +/** Throws a runtime error if max_number_gpus > 1 (defined by cmake variable + * CPPUDDLE_WITH_MAX_NUMBER_GPUS). Needs to be specialized for an allocator to + * provide MultiGPU support (see CPPuddle CUDA/HIP allocators for examples) **/ template struct select_device_functor { void operator()(const size_t device_id) { if constexpr (max_number_gpus > 1) @@ -55,14 +59,11 @@ template struct select_device_functor { "(by having a select_device_functor overload"); } }; -template struct select_device_functor> { - void operator()(const size_t device_id) {} -}; } // namespace device_selection namespace detail { - +/// Singleton interface to all buffer_managers class buffer_interface { public: #if defined(CPPUDDLE_DEACTIVATE_BUFFER_RECYCLING) @@ -87,12 +88,14 @@ For better performance configure CPPuddle with CPPUDDLE_WITH_BUFFER_RECYCLING=ON return Host_Allocator{}.deallocate(p, number_elements); } #else - /// Returns and allocated buffer of the requested size - this may be a reused - /// buffer + /// Primary method to allocate a buffer with CPPuddle: Returns and allocated / + /// buffer of the requested size - this may be a reused buffer. The method + /// figures out the correct buffer_manager and gets such a buffer from it. + /// Should be called from an allocator implementation, not directly template static T *get(size_t number_elements, bool manage_content_lifetime = false, - std::optional location_hint = std::nullopt, - std::optional device_id = std::nullopt) { + std::optional location_hint = std::nullopt, + std::optional device_id = std::nullopt) { try { return buffer_manager::get( number_elements, manage_content_lifetime, location_hint, device_id); @@ -102,11 +105,14 @@ For better performance configure CPPuddle with CPPUDDLE_WITH_BUFFER_RECYCLING=ON throw; } } - /// Marks an buffer as unused and fit for reusage + /// Primary method to deallocate a buffer with CPPuddle:Marks an buffer as / + /// unused and fit for reusage. 
The method figures out the correct buffer + /// manager and marks the buffer there. Should be called from an allocator + /// implementation, not directly template static void mark_unused(T *p, size_t number_elements, - std::optional location_hint = std::nullopt, - std::optional device_id = std::nullopt) { + std::optional location_hint = std::nullopt, + std::optional device_id = std::nullopt) { try { return buffer_manager::mark_unused(p, number_elements, location_hint, device_id); @@ -117,6 +123,7 @@ For better performance configure CPPuddle with CPPUDDLE_WITH_BUFFER_RECYCLING=ON } } #endif + /// Register all CPPuddle counters as HPX performance counters template static void register_allocator_counters_with_hpx(void) { #ifdef CPPUDDLE_HAVE_COUNTERS @@ -915,23 +922,6 @@ operator!=(aggressive_recycle_allocator const &, return true; } } // namespace detail - -template ::value, int> = 0> -using recycle_std = detail::recycle_allocator>; -template ::value, int> = 0> -using aggressive_recycle_std = - detail::aggressive_recycle_allocator>; - -inline void print_performance_counters() { detail::buffer_interface::print_performance_counters(); } -/// Deletes all buffers (even ones still marked as used), delete the buffer -/// managers and the recycler itself -inline void force_cleanup() { detail::buffer_interface::clean_all(); } -/// Deletes all buffers currently marked as unused -inline void cleanup() { detail::buffer_interface::clean_unused_buffers(); } -/// Deletes all buffers (even ones still marked as used), delete the buffer -/// managers and the recycler itself. Disallows further usage. -inline void finalize() { detail::buffer_interface::finalize(); } - } // end namespace cppuddle #endif diff --git a/include/hip_recycling_allocators.hpp b/include/hip_recycling_allocators.hpp index 465bd5fe..f540b544 100644 --- a/include/hip_recycling_allocators.hpp +++ b/include/hip_recycling_allocators.hpp @@ -6,17 +6,16 @@ #ifndef HIP_RECYCLING_ALLOCATORS_HPP #define HIP_RECYCLING_ALLOCATORS_HPP -#include "detail/buffer_recycler.hpp" -#include "detail/config.hpp" - #include #include #include -namespace cppuddle { +#include "buffer_management_interface.hpp" +namespace cppuddle { namespace detail { +/// Underlying host allocator for HIP pinned memory template struct hip_pinned_allocator { using value_type = T; hip_pinned_allocator() noexcept = default; @@ -63,6 +62,7 @@ constexpr bool operator!=(hip_pinned_allocator const &, return false; } +/// Underlying allocator for HIP device memory template struct hip_device_allocator { using value_type = T; hip_device_allocator() noexcept = default; @@ -106,23 +106,28 @@ constexpr bool operator!=(hip_device_allocator const &, namespace device_selection { +/// GPU device selector using the HIP API for pinned host allocations template struct select_device_functor> { void operator()(const size_t device_id) { hipSetDevice(device_id); } }; +/// GPU selector using the HIP API for pinned host allocations template struct select_device_functor> { void operator()(const size_t device_id) { hipSetDevice(device_id); } }; } // namespace device_selection +/// Recycling allocator for HIP pinned host memory template ::value, int> = 0> using recycle_allocator_hip_host = detail::aggressive_recycle_allocator>; +/// Recycling allocator for HIP device memory template ::value, int> = 0> using recycle_allocator_hip_device = detail::recycle_allocator>; +/// RAII wrapper for HIP device memory template ::value, int> = 0> struct hip_device_buffer { recycle_allocator_hip_device allocator; @@ -146,6 
+151,7 @@ struct hip_device_buffer { }; +/// RAII wrapper for CUDA device memory using a passed aggregated allocator template ::value, int> = 0> struct hip_aggregated_device_buffer { T *device_side_buffer; diff --git a/include/recycling_kokkos_view.hpp b/include/recycling_kokkos_view.hpp index d89dc0c4..c55d3738 100644 --- a/include/recycling_kokkos_view.hpp +++ b/include/recycling_kokkos_view.hpp @@ -7,9 +7,11 @@ #define KOKKOS_BUFFER_UTIL_HPP #include #include -#include #include +#include "buffer_management_interface.hpp" + + namespace cppuddle { template diff --git a/include/sycl_recycling_allocators.hpp b/include/sycl_recycling_allocators.hpp index 63511de5..66ba1fb8 100644 --- a/include/sycl_recycling_allocators.hpp +++ b/include/sycl_recycling_allocators.hpp @@ -6,18 +6,21 @@ #ifndef SYCL_RECYCLING_ALLOCATORS_HPP #define SYCL_RECYCLING_ALLOCATORS_HPP -#include "detail/buffer_recycler.hpp" -#include "detail/config.hpp" - #include #include #include +#include "buffer_management_interface.hpp" + namespace cppuddle { -namespace detail { +namespace device_selection { +// No MutliGPU support yet, hence no select_device_function required static_assert(max_number_gpus == 1, "CPPuddle currently does not support MultiGPU SYCL builds!"); +} // namespace device_selection +namespace detail { +/// Underlying host allocator for SYCL pinned memory (using the sycl::default_selector{}) template struct sycl_host_default_allocator { using value_type = T; sycl_host_default_allocator() noexcept = default; @@ -44,6 +47,7 @@ constexpr bool operator!=(sycl_host_default_allocator const &, return false; } +/// Underlying allocator for SYCL device memory (using the sycl::default_selector{}) template struct sycl_device_default_allocator { using value_type = T; sycl_device_default_allocator() noexcept = default; @@ -72,9 +76,11 @@ constexpr bool operator!=(sycl_device_default_allocator const &, } // end namespace detail +/// Recycling allocator for SYCL pinned host memory (default device) template ::value, int> = 0> using recycle_allocator_sycl_host = detail::aggressive_recycle_allocator>; +/// Recycling allocator for SYCL device memory (default device) template ::value, int> = 0> using recycle_allocator_sycl_device = detail::recycle_allocator>; diff --git a/tests/allocator_aligned_test.cpp b/tests/allocator_aligned_test.cpp index 1bc8cf53..65d1df64 100644 --- a/tests/allocator_aligned_test.cpp +++ b/tests/allocator_aligned_test.cpp @@ -3,7 +3,6 @@ // Distributed under the Boost Software License, Version 1.0. 
(See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) -#include "../include/detail/buffer_recycler.hpp" #include "../include/aligned_recycling_allocators.hpp" #ifdef CPPUDDLE_HAVE_HPX #include @@ -92,8 +91,8 @@ int main(int argc, char *argv[]) { std::cout << "\n==> Aggressive recycle allocation test took " << aggressive_duration << "ms" << std::endl; } - cppuddle::print_performance_counters(); - cppuddle::force_cleanup(); // Cleanup all buffers and the managers for better + cppuddle::print_buffer_counters(); + cppuddle::force_buffer_cleanup(); // Cleanup all buffers and the managers for better // comparison // Recycle Test: @@ -114,8 +113,8 @@ int main(int argc, char *argv[]) { std::cout << "\n\n==> Recycle allocation test took " << recycle_duration << "ms" << std::endl; } - cppuddle::print_performance_counters(); - cppuddle::force_cleanup(); // Cleanup all buffers and the managers for better + cppuddle::print_buffer_counters(); + cppuddle::force_buffer_cleanup(); // Cleanup all buffers and the managers for better // comparison // Same test using std::allocator: @@ -146,7 +145,7 @@ int main(int argc, char *argv[]) { std::cout << "Test information: Aggressive recycler was faster than default allocator!" << std::endl; } - cppuddle::print_performance_counters(); + cppuddle::print_buffer_counters(); #ifdef CPPUDDLE_HAVE_HPX return hpx::finalize(); #else diff --git a/tests/allocator_hpx_test.cpp b/tests/allocator_hpx_test.cpp index d0e632f0..4af0878c 100644 --- a/tests/allocator_hpx_test.cpp +++ b/tests/allocator_hpx_test.cpp @@ -15,7 +15,7 @@ #include -#include "../include/detail/buffer_recycler.hpp" +#include "std_recycling_allocators.hpp" int hpx_main(int argc, char *argv[]) { @@ -126,8 +126,8 @@ int hpx_main(int argc, char *argv[]) { std::cout << "\n==> Recycle allocation test took " << recycle_duration << "ms" << std::endl; } - cppuddle::print_performance_counters(); - cppuddle::force_cleanup(); // Cleanup all buffers and the managers for better + cppuddle::print_buffer_counters(); + cppuddle::force_buffer_cleanup(); // Cleanup all buffers and the managers for better // comparison @@ -167,8 +167,8 @@ int hpx_main(int argc, char *argv[]) { std::cout << "\n==> Aggressive recycle allocation test took " << aggressive_duration << "ms" << std::endl; } - cppuddle::print_performance_counters(); - cppuddle::force_cleanup(); // Cleanup all buffers and the managers for better + cppuddle::print_buffer_counters(); + cppuddle::force_buffer_cleanup(); // Cleanup all buffers and the managers for better // comparison diff --git a/tests/allocator_kokkos_executor_for_loop_test.cpp b/tests/allocator_kokkos_executor_for_loop_test.cpp index 7708fe56..47fc83f4 100644 --- a/tests/allocator_kokkos_executor_for_loop_test.cpp +++ b/tests/allocator_kokkos_executor_for_loop_test.cpp @@ -18,12 +18,12 @@ #include #include -#include "../include/buffer_manager.hpp" -#include "../include/cuda_buffer_util.hpp" -#include "../include/kokkos_buffer_util.hpp" #include #include +#include "cuda_recycling_allocators.hpp" +#include "recycling_kokkos_view.hpp" + // Assert during Release builds as well for this file: #undef NDEBUG #include // reinclude the header to update the definition of assert() @@ -37,7 +37,7 @@ using kokkos_um_array = Kokkos::View; template using recycled_host_view = - recycler::recycled_view, recycler::recycle_std, T>; + cppuddle::recycled_view, cppuddle::recycle_std, T>; // Device views using recycle allocators @@ -46,8 +46,8 @@ using kokkos_um_device_array = Kokkos::View; 
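// Sketch (illustration only, not a hunk of this patch): with the view aliases
// defined above and just below, recycled views are constructed like ordinary
// Kokkos views; their memory is drawn from, and returned to, the CPPuddle
// recycler once the last copy of the view goes out of scope, e.g.:
//   recycled_host_view<float>   hostView(512, 512);   // rank-2 view, buffer may be reused
//   recycled_device_view<float> deviceView(512, 512); // device buffer via recycle_allocator_cuda_device
//   Kokkos::deep_copy(deviceView, hostView);          // then use them like any other Kokkos::View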
template using recycled_device_view = - recycler::recycled_view, - recycler::recycle_allocator_cuda_device, T>; + cppuddle::recycled_view, + cppuddle::recycle_allocator_cuda_device, T>; // Host views using pinned memory recycle allocators template @@ -56,8 +56,8 @@ using kokkos_um_pinned_array = Kokkos::CudaHostPinnedSpace, Kokkos::MemoryUnmanaged>; template using recycled_pinned_view = - recycler::recycled_view, - recycler::recycle_allocator_cuda_host, T>; + cppuddle::recycled_view, + cppuddle::recycle_allocator_cuda_host, T>; template auto get_iteration_policy(const Executor &&executor, @@ -143,11 +143,11 @@ int main(int argc, char *argv[]) { // otherwise the HPX cuda polling futures won't work hpx::cuda::experimental::detail::unregister_polling(hpx::resource::get_thread_pool(0)); - recycler::print_performance_counters(); + cppuddle::print_buffer_counters(); // Cleanup all cuda views // (otherwise the cuda driver might shut down before this gets done automatically at // the end of the programm) - recycler::force_cleanup(); + cppuddle::force_buffer_cleanup(); return hpx::finalize(); } diff --git a/tests/allocator_kokkos_test.cpp b/tests/allocator_kokkos_test.cpp index e2770458..c8045d3e 100644 --- a/tests/allocator_kokkos_test.cpp +++ b/tests/allocator_kokkos_test.cpp @@ -13,9 +13,6 @@ #include #include -#include "../include/buffer_manager.hpp" -#include "../include/cuda_buffer_util.hpp" -#include "../include/kokkos_buffer_util.hpp" #ifdef CPPUDDLE_HAVE_HPX #include #endif @@ -24,6 +21,9 @@ #include #include +#include "cuda_recycling_allocators.hpp" +#include "recycling_kokkos_view.hpp" + using kokkos_array = Kokkos::View; @@ -33,7 +33,7 @@ using kokkos_um_array = Kokkos::View; template using recycled_host_view = - recycler::recycled_view, recycler::recycle_std, T>; + cppuddle::recycled_view, cppuddle::recycle_std, T>; #ifdef CPPUDDLE_HAVE_HPX int hpx_main(int argc, char *argv[]) { @@ -91,7 +91,7 @@ int main(int argc, char *argv[]) { }); Kokkos::fence(); } - recycler::print_performance_counters(); + cppuddle::print_buffer_counters(); #ifdef CPPUDDLE_HAVE_HPX return hpx::finalize(); #else diff --git a/tests/allocator_test.cpp b/tests/allocator_test.cpp index 54fb5dee..8fc7c5bb 100644 --- a/tests/allocator_test.cpp +++ b/tests/allocator_test.cpp @@ -3,7 +3,6 @@ // Distributed under the Boost Software License, Version 1.0. 
(See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) -#include "../include/detail/buffer_recycler.hpp" #ifdef CPPUDDLE_HAVE_HPX #include #endif @@ -17,6 +16,8 @@ #include #include +#include "std_recycling_allocators.hpp" + #ifdef CPPUDDLE_HAVE_HPX int hpx_main(int argc, char *argv[]) { #else @@ -88,9 +89,9 @@ int main(int argc, char *argv[]) { std::cout << "\n\n==> Aggressive recycle allocation test took " << aggressive_duration << "ms" << std::endl; } - cppuddle::print_performance_counters(); - cppuddle::force_cleanup(); // Cleanup all buffers and the managers for better - // comparison + cppuddle::print_buffer_counters(); + cppuddle::force_buffer_cleanup(); // Cleanup all buffers and the managers for + // better comparison // Recycle Test: { @@ -108,9 +109,9 @@ int main(int argc, char *argv[]) { std::cout << "\n\n==> Recycle allocation test took " << recycle_duration << "ms" << std::endl; } - cppuddle::print_performance_counters(); - cppuddle::force_cleanup(); // Cleanup all buffers and the managers for better - // comparison + cppuddle::print_buffer_counters(); + cppuddle::force_buffer_cleanup(); // Cleanup all buffers and the managers for + // better comparison // Same test using std::allocator: { @@ -138,7 +139,7 @@ int main(int argc, char *argv[]) { std::cout << "Test information: Aggressive recycler was faster than default allocator!" << std::endl; } - cppuddle::print_performance_counters(); + cppuddle::print_buffer_counters(); #ifdef CPPUDDLE_HAVE_HPX return hpx::finalize(); #else From 0857471e17b5b6ab0cf94c762fa2c6ee690a8dc7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20Dai=C3=9F?= Date: Wed, 6 Mar 2024 18:10:07 +0100 Subject: [PATCH 03/19] Add compatibility with old interface Only comes with deprecation warnings... --- include/aligned_buffer_util.hpp | 24 ++++++++++++++ include/buffer_management_interface.hpp | 21 +++++++++++++ include/buffer_manager.hpp | 24 ++++++++++++++ include/cuda_buffer_util.hpp | 42 +++++++++++++++++++++++++ include/hip_buffer_util.hpp | 40 +++++++++++++++++++++++ include/kokkos_buffer_util.hpp | 21 +++++++++++++ include/std_recycling_allocators.hpp | 32 +++++++++++++++++++ include/sycl_buffer_util.hpp | 33 +++++++++++++++++++ 8 files changed, 237 insertions(+) create mode 100644 include/aligned_buffer_util.hpp create mode 100644 include/buffer_management_interface.hpp create mode 100644 include/buffer_manager.hpp create mode 100644 include/cuda_buffer_util.hpp create mode 100644 include/hip_buffer_util.hpp create mode 100644 include/kokkos_buffer_util.hpp create mode 100644 include/std_recycling_allocators.hpp create mode 100644 include/sycl_buffer_util.hpp diff --git a/include/aligned_buffer_util.hpp b/include/aligned_buffer_util.hpp new file mode 100644 index 00000000..4c409521 --- /dev/null +++ b/include/aligned_buffer_util.hpp @@ -0,0 +1,24 @@ +// Copyright (c) 2020-2021 Gregor Daiß +// +// Distributed under the Boost Software License, Version 1.0. 
(See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#ifndef ALIGNED_BUFFER_UTIL_HPP +#define ALIGNED_BUFFER_UTIL_HPP + +#include "aligned_recycling_allocators.hpp" + +namespace recycler { + +[[deprecated("Use from header aligned_recycling_allocators.hpp instead")]] +template ::value, int> = 0> +using recycle_aligned = cppuddle::recycle_aligned; +[[deprecated("Use from header aligned_recycling_allocators.hpp instead")]] +template ::value, int> = 0> +using aggressive_recycle_aligned = cppuddle::aggressive_recycle_aligned; + +} // namespace recycler + +#endif diff --git a/include/buffer_management_interface.hpp b/include/buffer_management_interface.hpp new file mode 100644 index 00000000..b38fc84c --- /dev/null +++ b/include/buffer_management_interface.hpp @@ -0,0 +1,21 @@ +#ifndef BUFFER_MANAGEMENT_INTERFACE_HPP +#define BUFFER_MANAGEMENT_INTERFACE_HPP + +#include "detail/buffer_management.hpp" + +namespace cppuddle { + +/// Print performance counters of all buffer managers to stdout +inline void print_buffer_counters() { detail::buffer_interface::print_performance_counters(); } +/// Deletes all buffers (even ones still marked as used), delete the buffer +/// managers and the recycler itself +inline void force_buffer_cleanup() { detail::buffer_interface::clean_all(); } +/// Deletes all buffers currently marked as unused +inline void unused_buffer_cleanup() { detail::buffer_interface::clean_unused_buffers(); } +/// Deletes all buffers (even ones still marked as used), delete the buffer +/// managers and the recycler itself. Disallows further usage. +inline void finalize() { detail::buffer_interface::finalize(); } + +} // end namespace cppuddle + +#endif diff --git a/include/buffer_manager.hpp b/include/buffer_manager.hpp new file mode 100644 index 00000000..f84c4259 --- /dev/null +++ b/include/buffer_manager.hpp @@ -0,0 +1,24 @@ +#ifndef BUFFER_MANAGER_INTERFACE_HPP +#define BUFFER_MANAGER_HPP + +#include "buffer_management_interface.hpp" + +namespace recycler { + +[[deprecated("Use cppuddle::print_buffer_counters() instead")]] +inline void print_performance_counters() { cppuddle::print_buffer_counters(); } +/// Deletes all buffers (even ones still marked as used), delete the buffer +/// managers and the recycler itself +[[deprecated("Use cppuddle::force_buffer_cleanup() instead")]] +inline void force_cleanup() { cppuddle::force_buffer_cleanup(); } +/// Deletes all buffers currently marked as unused +[[deprecated("Use cppuddle::unused_buffer_cleanup() instead")]] +inline void cleanup() { cppuddle::unused_buffer_cleanup(); } +/// Deletes all buffers (even ones still marked as used), delete the buffer +/// managers and the recycler itself. Disallows further usage. +[[deprecated("Use cppuddle::finalize() instead")]] +inline void finalize() { detail::buffer_interface::finalize(); } + +} // end namespace cppuddle + +#endif diff --git a/include/cuda_buffer_util.hpp b/include/cuda_buffer_util.hpp new file mode 100644 index 00000000..f0db9a7b --- /dev/null +++ b/include/cuda_buffer_util.hpp @@ -0,0 +1,42 @@ +// Copyright (c) 2020-2023 Gregor Daiß +// +// Distributed under the Boost Software License, Version 1.0. 
(See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#ifndef CUDA_BUFFER_UTIL_HPP +#define CUDA_BUFFER_UTIL_HPP + +#include "cuda_recycling_allocators.hpp" +namespace recycler { + +namespace detail { + +[[deprecated("Use from header cuda_recycling_allocators.hpp instead")]] +template +using cuda_pinned_allocator = cppuddle::detail::cuda_pinned_allocator; + +[[deprecated("Use from header cuda_recycling_allocators.hpp instead")]] +template +using cuda_device_allocator = cppuddle::detail::cuda_device_allocator; + +} // end namespace detail + +[[deprecated("Use from header cuda_recycling_allocators.hpp instead")]] +template ::value, int> = 0> +using recycle_allocator_cuda_host = + cppuddle::recycle_allocator_cuda_host; +[[deprecated("Use from header cuda_recycling_allocators.hpp instead")]] +template ::value, int> = 0> +using recycle_allocator_cuda_device = + cppuddle::recycle_allocator_cuda_device; + +[[deprecated("Use from header cuda_recycling_allocators.hpp instead")]] +template ::value, int> = 0> +using cuda_device_buffer = cppuddle::cuda_device_buffer; + +[[deprecated("Use from header cuda_recycling_allocators.hpp instead")]] +template ::value, int> = 0> +using cuda_aggregated_device_buffer = cppuddle::cuda_aggregated_device_buffer; + +} // end namespace recycler +#endif diff --git a/include/hip_buffer_util.hpp b/include/hip_buffer_util.hpp new file mode 100644 index 00000000..a0b6fc05 --- /dev/null +++ b/include/hip_buffer_util.hpp @@ -0,0 +1,40 @@ +// Copyright (c: 2020-2021 Gregor Daiß +// +// Distributed under the Boost Software License, Version 1.0. (See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#ifndef HIP_BUFFER_UTIL_HPP +#define HIP_BUFFER_UTIL_HPP + +#include "hip_recycling_allocators.hpp" + +namespace recycler { + +namespace detail { + +[[deprecated("Use from header hip_recycling_allocators.hpp instead")]] +template +using hip_pinned_allocator = cppuddle::detail::hip_pinned_allocator; + +[[deprecated("Use from header hip_recycling_allocators.hpp instead")]] +template +using hip_device_allocator = cppuddle::detail::hip_device_allocator; +} // end namespace detail + +[[deprecated("Use from header hip_recycling_allocators.hpp instead")]] +template ::value, int> = 0> +using recycle_allocator_hip_host = cppuddle::recycle_allocator_hip_host; +[[deprecated("Use from header hip_recycling_allocators.hpp instead")]] +template ::value, int> = 0> +using recycle_allocator_hip_device = cppuddle::recycle_allocator_hip_device; + +[[deprecated("Use from header hip_recycling_allocators.hpp instead")]] +template ::value, int> = 0> +using hip_device_buffer = cppuddle::hip_device_buffer; + +[[deprecated("Use from header hip_recycling_allocators.hpp instead")]] +template ::value, int> = 0> +using hip_aggregated_device_buffer = cppuddle::hip_aggregated_device_buffer; + +} // end namespace recycler +#endif diff --git a/include/kokkos_buffer_util.hpp b/include/kokkos_buffer_util.hpp new file mode 100644 index 00000000..22fb4d88 --- /dev/null +++ b/include/kokkos_buffer_util.hpp @@ -0,0 +1,21 @@ +// Copyright (c) 2020-2021 Gregor Daiß +// +// Distributed under the Boost Software License, Version 1.0. 
(See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#ifndef KOKKOS_BUFFER_UTIL_HPP +#define KOKKOS_BUFFER_UTIL_HPP +#include "recycling_kokkos_view.hpp" + +[[deprecated("Use aggregated_recycle_view from header recycling_kokkos_view.hpp instead")]] +namespace recycler { +template +using aggregated_recycled_view = cppuddle::aggregated_recycle_view; + +[[deprecated("Use recycle_view from header recycling_kokkos_view.hpp instead")]] +template +using recycled_view = cppuddle::recycle_view; + +} // end namespace recycler + +#endif diff --git a/include/std_recycling_allocators.hpp b/include/std_recycling_allocators.hpp new file mode 100644 index 00000000..a62390dd --- /dev/null +++ b/include/std_recycling_allocators.hpp @@ -0,0 +1,32 @@ +// Copyright (c) 2024 Gregor Daiß +// +// Distributed under the Boost Software License, Version 1.0. (See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#ifndef STD_RECYCLING_ALLOCATORS_HPP +#define STD_RECYCLING_ALLOCATORS_HPP + +#include "buffer_management_interface.hpp" + +namespace cppuddle { +namespace device_selection { +/// Dummy GPU selector. Needs to be defined for MultiGPU builds as the default / +/// select_device_functor does not compile for > 1 GPU (to make sure all / +/// relevant allocators support multigpu) +template struct select_device_functor> { + void operator()(const size_t device_id) {} +}; +} // namespace device_selection + + +/// Recycling allocator for std memory +template ::value, int> = 0> +using recycle_std = detail::recycle_allocator>; +/// Recycling allocator for boost aligned memory (reusing previous content as well) +template ::value, int> = 0> +using aggressive_recycle_std = + detail::aggressive_recycle_allocator>; + +} // namespace cppuddle + +#endif diff --git a/include/sycl_buffer_util.hpp b/include/sycl_buffer_util.hpp new file mode 100644 index 00000000..f7971e1c --- /dev/null +++ b/include/sycl_buffer_util.hpp @@ -0,0 +1,33 @@ +// Copyright (c: 2020-2021 Gregor Daiß +// +// Distributed under the Boost Software License, Version 1.0. 
(See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#ifndef SYCL_BUFFER_UTIL_HPP +#define SYCL_BUFFER_UTIL_HPP + +#include "sycl_recycling_allocators.hpp" + +namespace recycler { + +namespace detail { + +[[deprecated("Use from header sycl_recycling_allocators.hpp instead")]] +template +using sycl_host_default_allocator = cppuddle::detail::sycl_host_default_allocator; + +[[deprecated("Use from header sycl_recycling_allocators.hpp instead")]] +template +using sycl_device_default_allocator = cppuddle::detail::sycl_device_default_allocator; + +} // end namespace detail + +[[deprecated("Use from header sycl_recycling_allocators.hpp instead")]] +template ::value, int> = 0> +using recycle_allocator_sycl_host = cppuddle::recycle_allocator_sycl_host; +[[deprecated("Use from header sycl_recycling_allocators.hpp instead")]] +template ::value, int> = 0> +using recycle_allocator_sycl_device = cppuddle::recycle_allocator_sycl_device; + +} // end namespace recycler +#endif From e10d02166b3d1347bec4f731471fa23b41ba761b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20Dai=C3=9F?= Date: Thu, 7 Mar 2024 11:47:55 +0100 Subject: [PATCH 04/19] Update gitignore --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index 2b25a731..bdc68a8d 100644 --- a/.gitignore +++ b/.gitignore @@ -7,3 +7,5 @@ spack.lock .clangd docs compile_commands.json +spack-build* +spack-configure-args.txt From 0f06ea8c16b93f35634c05814b10878a791631da Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20Dai=C3=9F?= Date: Thu, 7 Mar 2024 11:48:41 +0100 Subject: [PATCH 05/19] Rework allocator namespace 3 --- include/aggregation_manager.hpp | 64 +++++++++---------- include/aligned_buffer_util.hpp | 11 ++-- include/buffer_manager.hpp | 19 +++++- include/cuda_buffer_util.hpp | 40 +++++++----- include/hip_buffer_util.hpp | 42 +++++++----- include/kokkos_buffer_util.hpp | 12 ++-- include/stream_manager.hpp | 32 +++++----- include/sycl_buffer_util.hpp | 25 +++++--- ...llocator_kokkos_executor_for_loop_test.cpp | 19 +++--- tests/allocator_kokkos_test.cpp | 9 +-- 10 files changed, 157 insertions(+), 116 deletions(-) diff --git a/include/aggregation_manager.hpp b/include/aggregation_manager.hpp index cd1ca74b..99468658 100644 --- a/include/aggregation_manager.hpp +++ b/include/aggregation_manager.hpp @@ -58,7 +58,7 @@ #pragma message \ "Consider using CPPUDDLE_WITH_HPX_MUTEX=ON, to make the rest of CPPuddle also use hpx::mutex" #endif -namespace recycler { +namespace cppuddle { using aggregation_mutex_t = hpx::mutex; } @@ -158,7 +158,7 @@ template class aggregated_function_call { std::any function_tuple; /// Stores the string of the first function call for debug output std::string debug_type_information; - recycler::aggregation_mutex_t debug_mut; + cppuddle::aggregation_mutex_t debug_mut; #endif std::vector> potential_async_promises{}; @@ -189,7 +189,7 @@ template class aggregated_function_call { #if !(defined(NDEBUG)) && defined(DEBUG_AGGREGATION_CALLS) // needed for concurrent access to function_tuple and debug_type_information // Not required for normal use - std::lock_guard guard(debug_mut); + std::lock_guard guard(debug_mut); #endif assert(!async_mode); assert(potential_async_promises.empty()); @@ -263,7 +263,7 @@ template class aggregated_function_call { #if !(defined(NDEBUG)) && defined(DEBUG_AGGREGATION_CALLS) // needed for concurrent access to function_tuple and debug_type_information // Not required for normal use - std::lock_guard guard(debug_mut); + 
std::lock_guard guard(debug_mut); #endif assert(async_mode); assert(!potential_async_promises.empty()); @@ -545,7 +545,7 @@ template class Aggregated_Executor { /// slices have called it std::deque> function_calls; /// For synchronizing the access to the function calls list - recycler::aggregation_mutex_t mut; + cppuddle::aggregation_mutex_t mut; /// Data entry for a buffer allocation: void* pointer, size_t for /// buffer-size, atomic for the slice counter, location_id, gpu_id @@ -556,7 +556,7 @@ template class Aggregated_Executor { /// Map pointer to deque index for fast access in the deallocations std::unordered_map buffer_allocations_map; /// For synchronizing the access to the buffer_allocations - recycler::aggregation_mutex_t buffer_mut; + cppuddle::aggregation_mutex_t buffer_mut; std::atomic buffer_counter = 0; /// Get new buffer OR get buffer already allocated by different slice @@ -569,7 +569,7 @@ template class Aggregated_Executor { // First: Check if it already has happened if (buffer_counter <= slice_alloc_counter) { // we might be the first! Lock... - std::lock_guard guard(buffer_mut); + std::lock_guard guard(buffer_mut); // ... and recheck if (buffer_counter <= slice_alloc_counter) { constexpr bool manage_content_lifetime = false; @@ -579,7 +579,7 @@ template class Aggregated_Executor { // many different buffers for different aggregation sizes on different GPUs /* size_t location_id = gpu_id * instances_per_gpu; */ // Use integer conversion to only use 0 16 32 ... as buckets - size_t location_id = ((hpx::get_worker_thread_num() % recycler::number_instances) / 16) * 16; + size_t location_id = ((hpx::get_worker_thread_num() % cppuddle::number_instances) / 16) * 16; #ifdef CPPUDDLE_HAVE_HPX_AWARE_ALLOCATORS if (max_slices == 1) { // get prefered location: aka the current hpx threads location @@ -594,7 +594,7 @@ template class Aggregated_Executor { // Buffer might be recycled from previous allocations by the // buffer_interface... T *aggregated_buffer = - recycler::detail::buffer_interface::get( + cppuddle::detail::buffer_interface::get( size, manage_content_lifetime, location_id, gpu_id); // Create buffer entry for this buffer buffer_allocations.emplace_back(static_cast(aggregated_buffer), @@ -665,12 +665,12 @@ template class Aggregated_Executor { // Check if all slices are done with this buffer? if (buffer_allocation_counter == 0) { // Yes! 
"Deallocate" by telling the recylcer the buffer is fit for reusage - std::lock_guard guard(buffer_mut); + std::lock_guard guard(buffer_mut); // Only mark unused if another buffer has not done so already (and marked // it as invalid) if (valid) { assert(buffers_in_use == true); - recycler::detail::buffer_interface::mark_unused( + cppuddle::detail::buffer_interface::mark_unused( buffer_pointer, buffer_size, location_id, gpu_id); // mark buffer as invalid to prevent any other slice from marking the // buffer as unused @@ -678,7 +678,7 @@ template class Aggregated_Executor { const size_t current_deallocs = ++dealloc_counter; if (current_deallocs == buffer_counter) { - std::lock_guard guard(mut); + std::lock_guard guard(mut); buffers_in_use = false; if (!executor_slices_alive && !buffers_in_use) { slices_exhausted = false; @@ -699,12 +699,12 @@ template class Aggregated_Executor { /// Only meant to be accessed by the slice executors bool sync_aggregation_slices(const size_t slice_launch_counter) { - std::lock_guard guard(mut); + std::lock_guard guard(mut); assert(slices_exhausted == true); assert(executor_wrapper); // Add function call object in case it hasn't happened for this launch yet if (overall_launch_counter <= slice_launch_counter) { - /* std::lock_guard guard(mut); */ + /* std::lock_guard guard(mut); */ if (overall_launch_counter <= slice_launch_counter) { function_calls.emplace_back(current_slices, false, *executor_wrapper); overall_launch_counter = function_calls.size(); @@ -720,12 +720,12 @@ template class Aggregated_Executor { /// Only meant to be accessed by the slice executors template void post(const size_t slice_launch_counter, F &&f, Ts &&...ts) { - std::lock_guard guard(mut); + std::lock_guard guard(mut); assert(slices_exhausted == true); assert(executor_wrapper); // Add function call object in case it hasn't happened for this launch yet if (overall_launch_counter <= slice_launch_counter) { - /* std::lock_guard guard(mut); */ + /* std::lock_guard guard(mut); */ if (overall_launch_counter <= slice_launch_counter) { function_calls.emplace_back(current_slices, false, *executor_wrapper); overall_launch_counter = function_calls.size(); @@ -744,12 +744,12 @@ template class Aggregated_Executor { template hpx::lcos::future async(const size_t slice_launch_counter, F &&f, Ts &&...ts) { - std::lock_guard guard(mut); + std::lock_guard guard(mut); assert(slices_exhausted == true); assert(executor_wrapper); // Add function call object in case it hasn't happened for this launch yet if (overall_launch_counter <= slice_launch_counter) { - /* std::lock_guard guard(mut); */ + /* std::lock_guard guard(mut); */ if (overall_launch_counter <= slice_launch_counter) { function_calls.emplace_back(current_slices, true, *executor_wrapper); overall_launch_counter = function_calls.size(); @@ -765,12 +765,12 @@ template class Aggregated_Executor { template hpx::lcos::shared_future wrap_async(const size_t slice_launch_counter, F &&f, Ts &&...ts) { - std::lock_guard guard(mut); + std::lock_guard guard(mut); assert(slices_exhausted == true); assert(executor_wrapper); // Add function call object in case it hasn't happened for this launch yet if (overall_launch_counter <= slice_launch_counter) { - /* std::lock_guard guard(mut); */ + /* std::lock_guard guard(mut); */ if (overall_launch_counter <= slice_launch_counter) { function_calls.emplace_back(current_slices, true, *executor_wrapper); overall_launch_counter = function_calls.size(); @@ -784,12 +784,12 @@ template class Aggregated_Executor { } bool 
slice_available(void) { - std::lock_guard guard(mut); + std::lock_guard guard(mut); return !slices_exhausted; } std::optional> request_executor_slice() { - std::lock_guard guard(mut); + std::lock_guard guard(mut); if (!slices_exhausted) { const size_t local_slice_id = ++current_slices; if (local_slice_id == 1) { @@ -797,7 +797,7 @@ template class Aggregated_Executor { // TODO still required? Should be clean here already function_calls.clear(); overall_launch_counter = 0; - std::lock_guard guard(buffer_mut); + std::lock_guard guard(buffer_mut); #ifndef NDEBUG for (const auto &buffer_entry : buffer_allocations) { const auto &[buffer_pointer_any, buffer_size, @@ -861,7 +861,7 @@ template class Aggregated_Executor { } // Launch all executor slices within this continuation current_continuation = fut.then([this](auto &&fut) { - std::lock_guard guard(mut); + std::lock_guard guard(mut); slices_exhausted = true; launched_slices = current_slices; size_t id = 0; @@ -898,7 +898,7 @@ template class Aggregated_Executor { } size_t launched_slices; void reduce_usage_counter(void) { - /* std::lock_guard guard(mut); */ + /* std::lock_guard guard(mut); */ assert(slices_exhausted == true); assert(executor_wrapper); assert(executor_slices_alive == true); @@ -908,7 +908,7 @@ template class Aggregated_Executor { // Last slice goes out scope? if (local_slice_id == 0) { // Mark executor fit for reusage - std::lock_guard guard(mut); + std::lock_guard guard(mut); executor_slices_alive = false; if (!executor_slices_alive && !buffers_in_use) { // Release executor @@ -1045,7 +1045,7 @@ class aggregation_pool { std::string("Trying to initialize cppuddle aggregation pool twice") + " Agg pool name: " + std::string(kernelname)); } - if (num_devices > recycler::max_number_gpus) { + if (num_devices > cppuddle::max_number_gpus) { throw std::runtime_error( std::string( "Trying to initialize aggregation with more devices than the " @@ -1055,7 +1055,7 @@ class aggregation_pool { number_devices = num_devices; for (size_t gpu_id = 0; gpu_id < number_devices; gpu_id++) { - std::lock_guard guard(instance()[gpu_id].pool_mutex); + std::lock_guard guard(instance()[gpu_id].pool_mutex); assert(instance()[gpu_id].aggregation_executor_pool.empty()); for (int i = 0; i < number_of_executors; i++) { instance()[gpu_id].aggregation_executor_pool.emplace_back(slices_per_executor, @@ -1074,9 +1074,9 @@ class aggregation_pool { std::string("Trying to use cppuddle aggregation pool without first calling init") + " Agg poolname: " + std::string(kernelname)); } - const size_t gpu_id = recycler::get_device_id(number_devices); + const size_t gpu_id = cppuddle::get_device_id(number_devices); /* const size_t gpu_id = 1; */ - std::lock_guard guard(instance()[gpu_id].pool_mutex); + std::lock_guard guard(instance()[gpu_id].pool_mutex); assert(!instance()[gpu_id].aggregation_executor_pool.empty()); std::optional::Executor_Slice>> @@ -1128,11 +1128,11 @@ class aggregation_pool { private: /// Required for dealing with adding elements to the deque of /// aggregated_executors - recycler::aggregation_mutex_t pool_mutex; + cppuddle::aggregation_mutex_t pool_mutex; /// Global access instance static std::unique_ptr& instance(void) { static std::unique_ptr pool_instances{ - new aggregation_pool[recycler::max_number_gpus]}; + new aggregation_pool[cppuddle::max_number_gpus]}; return pool_instances; } static inline size_t number_devices = 1; diff --git a/include/aligned_buffer_util.hpp b/include/aligned_buffer_util.hpp index 4c409521..0e5f1fb7 100644 --- 
a/include/aligned_buffer_util.hpp +++ b/include/aligned_buffer_util.hpp @@ -10,14 +10,17 @@ namespace recycler { -[[deprecated("Use from header aligned_recycling_allocators.hpp instead")]] template ::value, int> = 0> -using recycle_aligned = cppuddle::recycle_aligned; -[[deprecated("Use from header aligned_recycling_allocators.hpp instead")]] +using recycle_aligned + [[deprecated("Use from header aligned_recycling_allocators.hpp instead")]] = + cppuddle::recycle_aligned; + template ::value, int> = 0> -using aggressive_recycle_aligned = cppuddle::aggressive_recycle_aligned; +using aggressive_recycle_aligned + [[deprecated("Use from header aligned_recycling_allocators.hpp instead")]] = + cppuddle::aggressive_recycle_aligned; } // namespace recycler diff --git a/include/buffer_manager.hpp b/include/buffer_manager.hpp index f84c4259..d9d74c31 100644 --- a/include/buffer_manager.hpp +++ b/include/buffer_manager.hpp @@ -2,11 +2,24 @@ #define BUFFER_MANAGER_HPP #include "buffer_management_interface.hpp" +#include "std_recycling_allocators.hpp" namespace recycler { -[[deprecated("Use cppuddle::print_buffer_counters() instead")]] -inline void print_performance_counters() { cppuddle::print_buffer_counters(); } +template ::value, int> = 0> +using recycle_std + [[deprecated("Use from header std_recycling_allocators.hpp instead")]] = + cppuddle::recycle_std; + +template ::value, int> = 0> +using aggressive_recycle_aligned + [[deprecated("Use from header std_recycling_allocators.hpp instead")]] = + cppuddle::aggressive_recycle_std; + +[[deprecated("Use cppuddle::print_buffer_counters() instead")]] +inline void print_performance_counters() { + cppuddle::print_buffer_counters(); +} /// Deletes all buffers (even ones still marked as used), delete the buffer /// managers and the recycler itself [[deprecated("Use cppuddle::force_buffer_cleanup() instead")]] @@ -17,7 +30,7 @@ inline void cleanup() { cppuddle::unused_buffer_cleanup(); } /// Deletes all buffers (even ones still marked as used), delete the buffer /// managers and the recycler itself. Disallows further usage. 
[[deprecated("Use cppuddle::finalize() instead")]] -inline void finalize() { detail::buffer_interface::finalize(); } +inline void finalize() { cppuddle::finalize(); } } // end namespace cppuddle diff --git a/include/cuda_buffer_util.hpp b/include/cuda_buffer_util.hpp index f0db9a7b..ffe47f8b 100644 --- a/include/cuda_buffer_util.hpp +++ b/include/cuda_buffer_util.hpp @@ -7,36 +7,42 @@ #define CUDA_BUFFER_UTIL_HPP #include "cuda_recycling_allocators.hpp" -namespace recycler { +namespace recycler { namespace detail { -[[deprecated("Use from header cuda_recycling_allocators.hpp instead")]] template -using cuda_pinned_allocator = cppuddle::detail::cuda_pinned_allocator; +using cuda_pinned_allocator + [[deprecated("Use from header cuda_recycling_allocators.hpp instead")]] = + cppuddle::detail::cuda_pinned_allocator; -[[deprecated("Use from header cuda_recycling_allocators.hpp instead")]] template -using cuda_device_allocator = cppuddle::detail::cuda_device_allocator; +using cuda_device_allocator + [[deprecated("Use from header cuda_recycling_allocators.hpp instead")]] = + cppuddle::detail::cuda_device_allocator; } // end namespace detail -[[deprecated("Use from header cuda_recycling_allocators.hpp instead")]] template ::value, int> = 0> -using recycle_allocator_cuda_host = - cppuddle::recycle_allocator_cuda_host; -[[deprecated("Use from header cuda_recycling_allocators.hpp instead")]] -template ::value, int> = 0> -using recycle_allocator_cuda_device = - cppuddle::recycle_allocator_cuda_device; +using recycle_allocator_cuda_host + [[deprecated("Use from header cuda_recycling_allocators.hpp instead")]] = + cppuddle::recycle_allocator_cuda_host; -[[deprecated("Use from header cuda_recycling_allocators.hpp instead")]] template ::value, int> = 0> -using cuda_device_buffer = cppuddle::cuda_device_buffer; +using recycle_allocator_cuda_device + [[deprecated("Use from header cuda_recycling_allocators.hpp instead")]] = + cppuddle::recycle_allocator_cuda_device; -[[deprecated("Use from header cuda_recycling_allocators.hpp instead")]] -template ::value, int> = 0> -using cuda_aggregated_device_buffer = cppuddle::cuda_aggregated_device_buffer; +template ::value, int> = 0> +using cuda_device_buffer + [[deprecated("Use from header cuda_recycling_allocators.hpp instead")]] = + cppuddle::cuda_device_buffer; + +template ::value, int> = 0> +using cuda_aggregated_device_buffer + [[deprecated("Use from header cuda_recycling_allocators.hpp instead")]] = + cppuddle::cuda_aggregated_device_buffer; } // end namespace recycler #endif diff --git a/include/hip_buffer_util.hpp b/include/hip_buffer_util.hpp index a0b6fc05..a2b5ca0c 100644 --- a/include/hip_buffer_util.hpp +++ b/include/hip_buffer_util.hpp @@ -12,29 +12,37 @@ namespace recycler { namespace detail { -[[deprecated("Use from header hip_recycling_allocators.hpp instead")]] -template -using hip_pinned_allocator = cppuddle::detail::hip_pinned_allocator; - -[[deprecated("Use from header hip_recycling_allocators.hpp instead")]] -template -using hip_device_allocator = cppuddle::detail::hip_device_allocator; +template +using hip_pinned_allocator + [[deprecated("Use from header hip_recycling_allocators.hpp instead")]] = + cppuddle::detail::hip_pinned_allocator; + +template +using hip_device_allocator + [[deprecated("Use from header hip_recycling_allocators.hpp instead")]] = + cppuddle::detail::hip_device_allocator; } // end namespace detail -[[deprecated("Use from header hip_recycling_allocators.hpp instead")]] -template ::value, int> = 0> -using 
recycle_allocator_hip_host = cppuddle::recycle_allocator_hip_host; -[[deprecated("Use from header hip_recycling_allocators.hpp instead")]] template ::value, int> = 0> -using recycle_allocator_hip_device = cppuddle::recycle_allocator_hip_device; +using recycle_allocator_hip_host + [[deprecated("Use from header hip_recycling_allocators.hpp instead")]] = + cppuddle::recycle_allocator_hip_host; -[[deprecated("Use from header hip_recycling_allocators.hpp instead")]] template ::value, int> = 0> -using hip_device_buffer = cppuddle::hip_device_buffer; +using recycle_allocator_hip_device + [[deprecated("Use from header hip_recycling_allocators.hpp instead")]] = + cppuddle::recycle_allocator_hip_device; -[[deprecated("Use from header hip_recycling_allocators.hpp instead")]] -template ::value, int> = 0> -using hip_aggregated_device_buffer = cppuddle::hip_aggregated_device_buffer; +template ::value, int> = 0> +using hip_device_buffer + [[deprecated("Use from header hip_recycling_allocators.hpp instead")]] = + cppuddle::hip_device_buffer; + +template ::value, int> = 0> +using hip_aggregated_device_buffer + [[deprecated("Use from header hip_recycling_allocators.hpp instead")]] = + cppuddle::hip_aggregated_device_buffer; } // end namespace recycler #endif diff --git a/include/kokkos_buffer_util.hpp b/include/kokkos_buffer_util.hpp index 22fb4d88..7b267619 100644 --- a/include/kokkos_buffer_util.hpp +++ b/include/kokkos_buffer_util.hpp @@ -7,14 +7,18 @@ #define KOKKOS_BUFFER_UTIL_HPP #include "recycling_kokkos_view.hpp" -[[deprecated("Use aggregated_recycle_view from header recycling_kokkos_view.hpp instead")]] + namespace recycler { template -using aggregated_recycled_view = cppuddle::aggregated_recycle_view; +using aggregated_recycled_view [[deprecated( + "Use aggregated_recycle_view from header recycling_kokkos_view.hpp " + "instead")]] = + cppuddle::aggregated_recycle_view; -[[deprecated("Use recycle_view from header recycling_kokkos_view.hpp instead")]] template -using recycled_view = cppuddle::recycle_view; +using recycled_view [[deprecated( + "Use recycle_view from header recycling_kokkos_view.hpp instead")]] = + cppuddle::recycle_view; } // end namespace recycler diff --git a/include/stream_manager.hpp b/include/stream_manager.hpp index 5b0e3898..bfaba518 100644 --- a/include/stream_manager.hpp +++ b/include/stream_manager.hpp @@ -173,7 +173,7 @@ class stream_pool { template static size_t get_next_device_id(const size_t number_gpus) noexcept { // TODO add round robin and min strategy - return recycler::get_device_id(number_gpus); + return cppuddle::get_device_id(number_gpus); } template @@ -195,11 +195,11 @@ class stream_pool { /// Deprecated! Use init_on_all_gpu or init_on_gpu template static void init(size_t number_of_streams, Ts ... executor_args) { - /* static_assert(sizeof...(Ts) == sizeof...(Ts) && recycler::max_number_gpus == 1, */ + /* static_assert(sizeof...(Ts) == sizeof...(Ts) && cppuddle::max_number_gpus == 1, */ /* "deprecated stream_pool::init does not support multigpu"); */ auto guard = make_scoped_lock_from_array(instance().gpu_mutexes); instance().streampools.emplace_back(number_of_streams, executor_args...); - assert(instance().streampools.size() <= recycler::max_number_gpus); + assert(instance().streampools.size() <= cppuddle::max_number_gpus); } /// Multi-GPU init where executors / interfaces on all GPUs are initialized with the same arguments @@ -207,13 +207,13 @@ class stream_pool { static void init_all_executor_pools(size_t number_of_streams, Ts ... 
executor_args) { auto guard = make_scoped_lock_from_array(instance().gpu_mutexes); if (number_of_streams > 0) { - for (size_t gpu_id = 0; gpu_id < recycler::max_number_gpus; gpu_id++) { + for (size_t gpu_id = 0; gpu_id < cppuddle::max_number_gpus; gpu_id++) { instance().select_gpu_function(gpu_id); instance().streampools.emplace_back(number_of_streams, executor_args...); } } - assert(instance().streampools.size() <= recycler::max_number_gpus); + assert(instance().streampools.size() <= cppuddle::max_number_gpus); } /// Per-GPU init allowing for different init parameters depending on the GPU @@ -226,40 +226,40 @@ class stream_pool { instance().streampools.emplace_back(number_of_streams, executor_args...); } - assert(instance().streampools.size() <= recycler::max_number_gpus); + assert(instance().streampools.size() <= cppuddle::max_number_gpus); } // TODO add/rename into finalize? static void cleanup() { auto guard = make_scoped_lock_from_array(instance().gpu_mutexes); - assert(instance().streampools.size() == recycler::max_number_gpus); + assert(instance().streampools.size() == cppuddle::max_number_gpus); instance().streampools.clear(); } static std::tuple get_interface(const size_t gpu_id = 0) { - std::lock_guard guard(instance().gpu_mutexes[gpu_id]); + std::lock_guard guard(instance().gpu_mutexes[gpu_id]); assert(gpu_id < instance().streampools.size()); return instance().streampools[gpu_id].get_interface(); } static void release_interface(size_t index, const size_t gpu_id = 0) { - std::lock_guard guard(instance().gpu_mutexes[gpu_id]); + std::lock_guard guard(instance().gpu_mutexes[gpu_id]); assert(gpu_id < instance().streampools.size()); instance().streampools[gpu_id].release_interface(index); } static bool interface_available(size_t load_limit, const size_t gpu_id = 0) { - std::lock_guard guard(instance().gpu_mutexes[gpu_id]); + std::lock_guard guard(instance().gpu_mutexes[gpu_id]); assert(gpu_id < instance().streampools.size()); return instance().streampools[gpu_id].interface_available(load_limit); } static size_t get_current_load(const size_t gpu_id = 0) { - std::lock_guard guard(instance().gpu_mutexes[gpu_id]); + std::lock_guard guard(instance().gpu_mutexes[gpu_id]); assert(gpu_id < instance().streampools.size()); return instance().streampools[gpu_id].get_current_load(); } // TODO deprecated! Remove... 
/* static size_t get_next_device_id(const size_t gpu_id = 0) { */ - /* std::lock_guard guard(instance().gpu_mutexes[gpu_id]); */ - /* assert(instance().streampools.size() == recycler::max_number_gpus); */ + /* std::lock_guard guard(instance().gpu_mutexes[gpu_id]); */ + /* assert(instance().streampools.size() == cppuddle::max_number_gpus); */ /* return instance().streampools[gpu_id].get_next_device_id(); */ /* } */ @@ -274,15 +274,15 @@ class stream_pool { private: stream_pool_implementation() = default; - recycler::mutex_t pool_mut{}; + cppuddle::mutex_t pool_mut{}; std::function select_gpu_function = [](size_t gpu_id) { // By default no multi gpu support - assert(recycler::max_number_gpus == 1 || instance().streampools.size() == 1); + assert(cppuddle::max_number_gpus == 1 || instance().streampools.size() == 1); assert(gpu_id == 0); }; std::deque streampools{}; - std::array gpu_mutexes; + std::array gpu_mutexes; static stream_pool_implementation& instance(void) { static stream_pool_implementation pool_instance{}; diff --git a/include/sycl_buffer_util.hpp b/include/sycl_buffer_util.hpp index f7971e1c..4da36df9 100644 --- a/include/sycl_buffer_util.hpp +++ b/include/sycl_buffer_util.hpp @@ -12,22 +12,27 @@ namespace recycler { namespace detail { -[[deprecated("Use from header sycl_recycling_allocators.hpp instead")]] -template -using sycl_host_default_allocator = cppuddle::detail::sycl_host_default_allocator; +template +using sycl_host_default_allocator + [[deprecated("Use from header sycl_recycling_allocators.hpp instead")]] = + cppuddle::detail::sycl_host_default_allocator; -[[deprecated("Use from header sycl_recycling_allocators.hpp instead")]] -template -using sycl_device_default_allocator = cppuddle::detail::sycl_device_default_allocator; +template +using sycl_device_default_allocator + [[deprecated("Use from header sycl_recycling_allocators.hpp instead")]] = + cppuddle::detail::sycl_device_default_allocator; } // end namespace detail -[[deprecated("Use from header sycl_recycling_allocators.hpp instead")]] template ::value, int> = 0> -using recycle_allocator_sycl_host = cppuddle::recycle_allocator_sycl_host; -[[deprecated("Use from header sycl_recycling_allocators.hpp instead")]] +using recycle_allocator_sycl_host + [[deprecated("Use from header sycl_recycling_allocators.hpp instead")]] = + cppuddle::recycle_allocator_sycl_host; + template ::value, int> = 0> -using recycle_allocator_sycl_device = cppuddle::recycle_allocator_sycl_device; +using recycle_allocator_sycl_device + [[deprecated("Use from header sycl_recycling_allocators.hpp instead")]] = + cppuddle::recycle_allocator_sycl_device; } // end namespace recycler #endif diff --git a/tests/allocator_kokkos_executor_for_loop_test.cpp b/tests/allocator_kokkos_executor_for_loop_test.cpp index 47fc83f4..2ac9ea17 100644 --- a/tests/allocator_kokkos_executor_for_loop_test.cpp +++ b/tests/allocator_kokkos_executor_for_loop_test.cpp @@ -21,6 +21,7 @@ #include #include +#include "std_recycling_allocators.hpp" #include "cuda_recycling_allocators.hpp" #include "recycling_kokkos_view.hpp" @@ -36,8 +37,8 @@ template using kokkos_um_array = Kokkos::View; template -using recycled_host_view = - cppuddle::recycled_view, cppuddle::recycle_std, T>; +using recycle_host_view = + cppuddle::recycle_view, cppuddle::recycle_std, T>; // Device views using recycle allocators @@ -45,8 +46,8 @@ template using kokkos_um_device_array = Kokkos::View; template -using recycled_device_view = - cppuddle::recycled_view, +using recycle_device_view = + 
cppuddle::recycle_view, cppuddle::recycle_allocator_cuda_device, T>; // Host views using pinned memory recycle allocators @@ -55,8 +56,8 @@ using kokkos_um_pinned_array = Kokkos::View::array_layout, Kokkos::CudaHostPinnedSpace, Kokkos::MemoryUnmanaged>; template -using recycled_pinned_view = - cppuddle::recycled_view, +using recycle_pinned_view = + cppuddle::recycle_view, cppuddle::recycle_allocator_cuda_host, T>; template @@ -81,7 +82,7 @@ int main(int argc, char *argv[]) { // Host run for (size_t pass = 0; pass < passes; pass++) { // Create view - recycled_host_view hostView(view_size_0, view_size_1); + recycle_host_view hostView(view_size_0, view_size_1); // Create executor hpx::kokkos::serial_executor executor; @@ -109,7 +110,7 @@ int main(int argc, char *argv[]) { // Device run for (size_t pass = 0; pass < passes; pass++) { // Create and init host view - recycled_pinned_view hostView(view_size_0, view_size_1); + recycle_pinned_view hostView(view_size_0, view_size_1); for(size_t i = 0; i < view_size_0; i++) { for(size_t j = 0; j < view_size_1; j++) { hostView(i, j) = 1.0; @@ -120,7 +121,7 @@ int main(int argc, char *argv[]) { hpx::kokkos::cuda_executor executor(hpx::kokkos::execution_space_mode::independent); // Use executor to move the host data to the device - recycled_device_view deviceView(view_size_0, view_size_1); + recycle_device_view deviceView(view_size_0, view_size_1); Kokkos::deep_copy(executor.instance(), deviceView, hostView); auto policy_1 = Kokkos::Experimental::require( diff --git a/tests/allocator_kokkos_test.cpp b/tests/allocator_kokkos_test.cpp index c8045d3e..4826efec 100644 --- a/tests/allocator_kokkos_test.cpp +++ b/tests/allocator_kokkos_test.cpp @@ -21,6 +21,7 @@ #include #include +#include "std_recycling_allocators.hpp" #include "cuda_recycling_allocators.hpp" #include "recycling_kokkos_view.hpp" @@ -32,8 +33,8 @@ template using kokkos_um_array = Kokkos::View; template -using recycled_host_view = - cppuddle::recycled_view, cppuddle::recycle_std, T>; +using recycle_host_view = + cppuddle::recycle_view, cppuddle::recycle_std, T>; #ifdef CPPUDDLE_HAVE_HPX int hpx_main(int argc, char *argv[]) { @@ -74,8 +75,8 @@ int main(int argc, char *argv[]) { hpx::kokkos::ScopeGuard scopeGuard(argc, argv); Kokkos::print_configuration(std::cout); - using test_view = recycled_host_view; - using test_double_view = recycled_host_view; + using test_view = recycle_host_view; + using test_double_view = recycle_host_view; constexpr size_t passes = 100; for (size_t pass = 0; pass < passes; pass++) { From d5a3f79b9c14b870fce4ba6e4b7634be92f9dc5d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20Dai=C3=9F?= Date: Thu, 7 Mar 2024 17:37:31 +0100 Subject: [PATCH 06/19] Rework allocator namespace 4 --- include/aggregation_manager.hpp | 10 ++-- include/aligned_buffer_util.hpp | 4 +- include/aligned_recycling_allocators.hpp | 3 ++ include/buffer_management_interface.hpp | 15 ++++-- include/buffer_manager.hpp | 22 ++++---- include/cuda_buffer_util.hpp | 12 ++--- include/cuda_recycling_allocators.hpp | 5 +- include/detail/buffer_management.hpp | 10 ++-- include/hip_buffer_util.hpp | 12 ++--- include/hip_recycling_allocators.hpp | 4 +- include/recycling_kokkos_view.hpp | 52 ++++++++++--------- include/std_recycling_allocators.hpp | 2 + include/sycl_recycling_allocators.hpp | 2 + tests/allocator_aligned_test.cpp | 21 ++++---- tests/allocator_hpx_test.cpp | 26 +++++----- ...llocator_kokkos_executor_for_loop_test.cpp | 21 ++++---- tests/allocator_kokkos_test.cpp | 6 +-- tests/allocator_test.cpp | 18 
++++--- 18 files changed, 138 insertions(+), 107 deletions(-) diff --git a/include/aggregation_manager.hpp b/include/aggregation_manager.hpp index 99468658..1cbe09db 100644 --- a/include/aggregation_manager.hpp +++ b/include/aggregation_manager.hpp @@ -594,8 +594,9 @@ template class Aggregated_Executor { // Buffer might be recycled from previous allocations by the // buffer_interface... T *aggregated_buffer = - cppuddle::detail::buffer_interface::get( - size, manage_content_lifetime, location_id, gpu_id); + cppuddle::memory_recycling::detail::buffer_interface::get< + T, Host_Allocator>(size, manage_content_lifetime, location_id, + gpu_id); // Create buffer entry for this buffer buffer_allocations.emplace_back(static_cast(aggregated_buffer), size, 1, true, location_id, gpu_id); @@ -670,8 +671,9 @@ template class Aggregated_Executor { // it as invalid) if (valid) { assert(buffers_in_use == true); - cppuddle::detail::buffer_interface::mark_unused( - buffer_pointer, buffer_size, location_id, gpu_id); + cppuddle::memory_recycling::detail::buffer_interface::mark_unused< + T, Host_Allocator>(buffer_pointer, buffer_size, location_id, + gpu_id); // mark buffer as invalid to prevent any other slice from marking the // buffer as unused valid = false; diff --git a/include/aligned_buffer_util.hpp b/include/aligned_buffer_util.hpp index 0e5f1fb7..e4ef7990 100644 --- a/include/aligned_buffer_util.hpp +++ b/include/aligned_buffer_util.hpp @@ -14,13 +14,13 @@ template ::value, int> = 0> using recycle_aligned [[deprecated("Use from header aligned_recycling_allocators.hpp instead")]] = - cppuddle::recycle_aligned; + cppuddle::memory_recycling::recycle_aligned; template ::value, int> = 0> using aggressive_recycle_aligned [[deprecated("Use from header aligned_recycling_allocators.hpp instead")]] = - cppuddle::aggressive_recycle_aligned; + cppuddle::memory_recycling::aggressive_recycle_aligned; } // namespace recycler diff --git a/include/aligned_recycling_allocators.hpp b/include/aligned_recycling_allocators.hpp index 039a19f2..ee0182bb 100644 --- a/include/aligned_recycling_allocators.hpp +++ b/include/aligned_recycling_allocators.hpp @@ -10,6 +10,7 @@ #include "buffer_management_interface.hpp" namespace cppuddle { +namespace memory_recycling { namespace device_selection { template /// Dummy GPU selector. 
Needs to be defined for MultiGPU builds as the default / @@ -31,6 +32,8 @@ template ::value, int> = 0> using aggressive_recycle_aligned = detail::aggressive_recycle_allocator< T, boost::alignment::aligned_allocator>; + +} // namespace memory_recycling } // namespace cppuddle #endif diff --git a/include/buffer_management_interface.hpp b/include/buffer_management_interface.hpp index b38fc84c..8614568b 100644 --- a/include/buffer_management_interface.hpp +++ b/include/buffer_management_interface.hpp @@ -4,18 +4,25 @@ #include "detail/buffer_management.hpp" namespace cppuddle { +namespace memory_recycling { /// Print performance counters of all buffer managers to stdout -inline void print_buffer_counters() { detail::buffer_interface::print_performance_counters(); } -/// Deletes all buffers (even ones still marked as used), delete the buffer +inline void print_buffer_counters() { + detail::buffer_interface::print_performance_counters(); +} +/// Deletes all buffers (even ones still marked as used), delete the buffer /// managers and the recycler itself inline void force_buffer_cleanup() { detail::buffer_interface::clean_all(); } + /// Deletes all buffers currently marked as unused -inline void unused_buffer_cleanup() { detail::buffer_interface::clean_unused_buffers(); } -/// Deletes all buffers (even ones still marked as used), delete the buffer +inline void unused_buffer_cleanup() { + detail::buffer_interface::clean_unused_buffers(); +} +/// Deletes all buffers (even ones still marked as used), delete the buffer /// managers and the recycler itself. Disallows further usage. inline void finalize() { detail::buffer_interface::finalize(); } +} // namespace memory_recycling } // end namespace cppuddle #endif diff --git a/include/buffer_manager.hpp b/include/buffer_manager.hpp index d9d74c31..25e5ce00 100644 --- a/include/buffer_manager.hpp +++ b/include/buffer_manager.hpp @@ -9,29 +9,29 @@ namespace recycler { template ::value, int> = 0> using recycle_std [[deprecated("Use from header std_recycling_allocators.hpp instead")]] = - cppuddle::recycle_std; + cppuddle::memory_recycling::recycle_std; template ::value, int> = 0> using aggressive_recycle_aligned [[deprecated("Use from header std_recycling_allocators.hpp instead")]] = - cppuddle::aggressive_recycle_std; + cppuddle::memory_recycling::aggressive_recycle_std; -[[deprecated("Use cppuddle::print_buffer_counters() instead")]] +[[deprecated("Use cppuddle::memory_recycling::print_buffer_counters() instead")]] inline void print_performance_counters() { - cppuddle::print_buffer_counters(); + cppuddle::memory_recycling::print_buffer_counters(); } /// Deletes all buffers (even ones still marked as used), delete the buffer /// managers and the recycler itself -[[deprecated("Use cppuddle::force_buffer_cleanup() instead")]] -inline void force_cleanup() { cppuddle::force_buffer_cleanup(); } +[[deprecated("Use cppuddle::memory_recycling::force_buffer_cleanup() instead")]] +inline void force_cleanup() { cppuddle::memory_recycling::force_buffer_cleanup(); } /// Deletes all buffers currently marked as unused -[[deprecated("Use cppuddle::unused_buffer_cleanup() instead")]] -inline void cleanup() { cppuddle::unused_buffer_cleanup(); } +[[deprecated("Use cppuddle::memory_recycling::unused_buffer_cleanup() instead")]] +inline void cleanup() { cppuddle::memory_recycling::unused_buffer_cleanup(); } /// Deletes all buffers (even ones still marked as used), delete the buffer /// managers and the recycler itself. Disallows further usage. 
-[[deprecated("Use cppuddle::finalize() instead")]] -inline void finalize() { cppuddle::finalize(); } +[[deprecated("Use cppuddle::memory_recycling::finalize() instead")]] +inline void finalize() { cppuddle::memory_recycling::finalize(); } -} // end namespace cppuddle +} // namespace recycler #endif diff --git a/include/cuda_buffer_util.hpp b/include/cuda_buffer_util.hpp index ffe47f8b..6334da8a 100644 --- a/include/cuda_buffer_util.hpp +++ b/include/cuda_buffer_util.hpp @@ -14,35 +14,35 @@ namespace detail { template using cuda_pinned_allocator [[deprecated("Use from header cuda_recycling_allocators.hpp instead")]] = - cppuddle::detail::cuda_pinned_allocator; + cppuddle::memory_recycling::detail::cuda_pinned_allocator; template using cuda_device_allocator [[deprecated("Use from header cuda_recycling_allocators.hpp instead")]] = - cppuddle::detail::cuda_device_allocator; + cppuddle::memory_recycling::detail::cuda_device_allocator; } // end namespace detail template ::value, int> = 0> using recycle_allocator_cuda_host [[deprecated("Use from header cuda_recycling_allocators.hpp instead")]] = - cppuddle::recycle_allocator_cuda_host; + cppuddle::memory_recycling::recycle_allocator_cuda_host; template ::value, int> = 0> using recycle_allocator_cuda_device [[deprecated("Use from header cuda_recycling_allocators.hpp instead")]] = - cppuddle::recycle_allocator_cuda_device; + cppuddle::memory_recycling::recycle_allocator_cuda_device; template ::value, int> = 0> using cuda_device_buffer [[deprecated("Use from header cuda_recycling_allocators.hpp instead")]] = - cppuddle::cuda_device_buffer; + cppuddle::memory_recycling::cuda_device_buffer; template ::value, int> = 0> using cuda_aggregated_device_buffer [[deprecated("Use from header cuda_recycling_allocators.hpp instead")]] = - cppuddle::cuda_aggregated_device_buffer; + cppuddle::memory_recycling::cuda_aggregated_device_buffer; } // end namespace recycler #endif diff --git a/include/cuda_recycling_allocators.hpp b/include/cuda_recycling_allocators.hpp index b4cf8efb..911948a3 100644 --- a/include/cuda_recycling_allocators.hpp +++ b/include/cuda_recycling_allocators.hpp @@ -13,8 +13,9 @@ #include "buffer_management_interface.hpp" namespace cppuddle { -namespace detail { +namespace memory_recycling { +namespace detail { /// Underlying host allocator for CUDA pinned memory template struct cuda_pinned_allocator { using value_type = T; @@ -168,5 +169,7 @@ struct cuda_aggregated_device_buffer { Host_Allocator &alloc; // will stay valid for the entire aggregation region and hence // for the entire lifetime of this buffer }; + +} // namespace memory_recycling } // end namespace cppuddle #endif diff --git a/include/detail/buffer_management.hpp b/include/detail/buffer_management.hpp index 5d640983..98504d21 100644 --- a/include/detail/buffer_management.hpp +++ b/include/detail/buffer_management.hpp @@ -45,6 +45,7 @@ For better performance configure CPPuddle with CPPUDDLE_WITH_HPX_AWARE_ALLOCATOR #include "config.hpp" namespace cppuddle { +namespace memory_recycling { namespace device_selection { /// Default device selector - No MultGPU support @@ -415,8 +416,8 @@ For better performance configure CPPuddle with CPPUDDLE_WITH_BUFFER_RECYCLING=ON // No unused buffer found -> Create new one and return it try { - cppuddle::device_selection::select_device_functor{}( - device_id); + cppuddle::memory_recycling::device_selection::select_device_functor< + T, Host_Allocator>{}(device_id); Host_Allocator alloc; T *buffer = alloc.allocate(number_of_elements); 
instance()[location_id].buffer_map.insert( @@ -441,8 +442,8 @@ For better performance configure CPPuddle with CPPUDDLE_WITH_BUFFER_RECYCLING=ON // If there still isn't enough memory left, the caller has to handle it // We've done all we can in here Host_Allocator alloc; - cppuddle::device_selection::select_device_functor{}( - device_id); + cppuddle::memory_recycling::device_selection::select_device_functor< + T, Host_Allocator>{}(device_id); T *buffer = alloc.allocate(number_of_elements); instance()[location_id].buffer_map.insert( {buffer, std::make_tuple(buffer, number_of_elements, 1, @@ -922,6 +923,7 @@ operator!=(aggressive_recycle_allocator const &, return true; } } // namespace detail +} // namespace memory_recycling } // end namespace cppuddle #endif diff --git a/include/hip_buffer_util.hpp b/include/hip_buffer_util.hpp index a2b5ca0c..eadedc07 100644 --- a/include/hip_buffer_util.hpp +++ b/include/hip_buffer_util.hpp @@ -15,34 +15,34 @@ namespace detail { template using hip_pinned_allocator [[deprecated("Use from header hip_recycling_allocators.hpp instead")]] = - cppuddle::detail::hip_pinned_allocator; + cppuddle::memory_recycling::detail::hip_pinned_allocator; template using hip_device_allocator [[deprecated("Use from header hip_recycling_allocators.hpp instead")]] = - cppuddle::detail::hip_device_allocator; + cppuddle::memory_recycling::detail::hip_device_allocator; } // end namespace detail template ::value, int> = 0> using recycle_allocator_hip_host [[deprecated("Use from header hip_recycling_allocators.hpp instead")]] = - cppuddle::recycle_allocator_hip_host; + cppuddle::memory_recycling::recycle_allocator_hip_host; template ::value, int> = 0> using recycle_allocator_hip_device [[deprecated("Use from header hip_recycling_allocators.hpp instead")]] = - cppuddle::recycle_allocator_hip_device; + cppuddle::memory_recycling::recycle_allocator_hip_device; template ::value, int> = 0> using hip_device_buffer [[deprecated("Use from header hip_recycling_allocators.hpp instead")]] = - cppuddle::hip_device_buffer; + cppuddle::memory_recycling::hip_device_buffer; template ::value, int> = 0> using hip_aggregated_device_buffer [[deprecated("Use from header hip_recycling_allocators.hpp instead")]] = - cppuddle::hip_aggregated_device_buffer; + cppuddle::memory_recycling::hip_aggregated_device_buffer; } // end namespace recycler #endif diff --git a/include/hip_recycling_allocators.hpp b/include/hip_recycling_allocators.hpp index f540b544..274fbb68 100644 --- a/include/hip_recycling_allocators.hpp +++ b/include/hip_recycling_allocators.hpp @@ -13,8 +13,9 @@ #include "buffer_management_interface.hpp" namespace cppuddle { -namespace detail { +namespace memory_recycling { +namespace detail { /// Underlying host allocator for HIP pinned memory template struct hip_pinned_allocator { using value_type = T; @@ -175,5 +176,6 @@ struct hip_aggregated_device_buffer { // for the entire lifetime of this buffer }; +} // namespace memory_recycling } // end namespace cppuddle #endif diff --git a/include/recycling_kokkos_view.hpp b/include/recycling_kokkos_view.hpp index c55d3738..86085fc8 100644 --- a/include/recycling_kokkos_view.hpp +++ b/include/recycling_kokkos_view.hpp @@ -13,6 +13,8 @@ namespace cppuddle { +namespace memory_recycling { + template struct view_deleter { @@ -26,7 +28,7 @@ struct view_deleter { }; template -class aggregated_recycle_view : public kokkos_type { +class aggregated_recycling_view : public kokkos_type { private: alloc_type allocator; size_t total_elements{0}; @@ -36,7 +38,7 @@ 
class aggregated_recycle_view : public kokkos_type { public: using view_type = kokkos_type; template - explicit aggregated_recycle_view(alloc_type &alloc, Args... args) + explicit aggregated_recycling_view(alloc_type &alloc, Args... args) : kokkos_type( alloc.allocate(kokkos_type::required_allocation_size(args...) / sizeof(element_type)), @@ -47,15 +49,15 @@ class aggregated_recycle_view : public kokkos_type { data_ref_counter(this->data(), view_deleter( alloc, total_elements)) {} - aggregated_recycle_view( - const aggregated_recycle_view &other) + aggregated_recycling_view( + const aggregated_recycling_view &other) : kokkos_type(other), allocator(other.allocator) { data_ref_counter = other.data_ref_counter; total_elements = other.total_elements; } - aggregated_recycle_view & - operator=(const aggregated_recycle_view &other) { + aggregated_recycling_view & + operator=(const aggregated_recycling_view &other) { data_ref_counter = other.data_ref_counter; allocator = other.allocator; kokkos_type::operator=(other); @@ -63,15 +65,15 @@ class aggregated_recycle_view : public kokkos_type { return *this; } - aggregated_recycle_view( - aggregated_recycle_view &&other) noexcept + aggregated_recycling_view( + aggregated_recycling_view &&other) noexcept : kokkos_type(other), allocator(other.allocator) { data_ref_counter = other.data_ref_counter; total_elements = other.total_elements; } - aggregated_recycle_view &operator=( - aggregated_recycle_view &&other) noexcept { + aggregated_recycling_view &operator=( + aggregated_recycling_view &&other) noexcept { data_ref_counter = other.data_ref_counter; allocator = other.allocator; kokkos_type::operator=(other); @@ -79,12 +81,12 @@ class aggregated_recycle_view : public kokkos_type { return *this; } - ~aggregated_recycle_view() {} + ~aggregated_recycling_view() {} }; template -class recycle_view : public kokkos_type { +class recycling_view : public kokkos_type { private: size_t total_elements{0}; std::shared_ptr data_ref_counter; @@ -94,7 +96,7 @@ class recycle_view : public kokkos_type { static_assert(std::is_same_v); template = true> - recycle_view(Args... args) + recycling_view(Args... args) : kokkos_type( alloc_type{}.allocate(kokkos_type::required_allocation_size(args...) / sizeof(element_type)), @@ -106,7 +108,7 @@ class recycle_view : public kokkos_type { template = true> - recycle_view(const size_t device_id, Args... args) + recycling_view(const size_t device_id, Args... args) : kokkos_type( alloc_type{device_id}.allocate(kokkos_type::required_allocation_size(args...) 
/ sizeof(element_type)), @@ -119,7 +121,7 @@ class recycle_view : public kokkos_type { template < typename layout_t, std::enable_if_t::value, bool> = true> - recycle_view(std::size_t device_id, layout_t layout) + recycling_view(std::size_t device_id, layout_t layout) : kokkos_type( alloc_type{device_id}.allocate(kokkos_type::required_allocation_size(layout) / sizeof(element_type)), @@ -129,41 +131,41 @@ class recycle_view : public kokkos_type { data_ref_counter(this->data(), view_deleter( alloc_type{device_id}, total_elements)) {} - recycle_view( - const recycle_view &other) + recycling_view( + const recycling_view &other) : kokkos_type(other) { total_elements = other.total_elements; data_ref_counter = other.data_ref_counter; } - recycle_view & - operator=(const recycle_view &other) { + recycling_view & + operator=(const recycling_view &other) { data_ref_counter = other.data_ref_counter; kokkos_type::operator=(other); total_elements = other.total_elements; return *this; } - recycle_view( - recycle_view &&other) noexcept + recycling_view( + recycling_view &&other) noexcept : kokkos_type(other) { data_ref_counter = other.data_ref_counter; total_elements = other.total_elements; } - recycle_view &operator=( - recycle_view &&other) noexcept { + recycling_view &operator=( + recycling_view &&other) noexcept { data_ref_counter = other.data_ref_counter; kokkos_type::operator=(other); total_elements = other.total_elements; return *this; } - ~recycle_view() { } + ~recycling_view() { } }; - +} // namespace memory_recycling } // end namespace cppuddle #endif diff --git a/include/std_recycling_allocators.hpp b/include/std_recycling_allocators.hpp index a62390dd..141b0874 100644 --- a/include/std_recycling_allocators.hpp +++ b/include/std_recycling_allocators.hpp @@ -9,6 +9,7 @@ #include "buffer_management_interface.hpp" namespace cppuddle { +namespace memory_recycling { namespace device_selection { /// Dummy GPU selector. 
Needs to be defined for MultiGPU builds as the default / /// select_device_functor does not compile for > 1 GPU (to make sure all / @@ -27,6 +28,7 @@ template ::value, int> = 0> using aggressive_recycle_std = detail::aggressive_recycle_allocator>; +} // namespace memory_recycling } // namespace cppuddle #endif diff --git a/include/sycl_recycling_allocators.hpp b/include/sycl_recycling_allocators.hpp index 66ba1fb8..c4f47c31 100644 --- a/include/sycl_recycling_allocators.hpp +++ b/include/sycl_recycling_allocators.hpp @@ -13,6 +13,7 @@ #include "buffer_management_interface.hpp" namespace cppuddle { +namespace memory_recycling { namespace device_selection { // No MutliGPU support yet, hence no select_device_function required @@ -85,5 +86,6 @@ template ::value, int> = 0> using recycle_allocator_sycl_device = detail::recycle_allocator>; +} // namespace memory_recycling } // end namespace cppuddle #endif diff --git a/tests/allocator_aligned_test.cpp b/tests/allocator_aligned_test.cpp index 65d1df64..510ac9cd 100644 --- a/tests/allocator_aligned_test.cpp +++ b/tests/allocator_aligned_test.cpp @@ -78,7 +78,9 @@ int main(int argc, char *argv[]) { << std::endl; for (size_t pass = 0; pass < passes; pass++) { auto begin = std::chrono::high_resolution_clock::now(); - std::vector> + std::vector< + double, + cppuddle::memory_recycling::aggressive_recycle_aligned> test1(array_size, double{}); auto end = std::chrono::high_resolution_clock::now(); aggressive_duration += @@ -91,8 +93,8 @@ int main(int argc, char *argv[]) { std::cout << "\n==> Aggressive recycle allocation test took " << aggressive_duration << "ms" << std::endl; } - cppuddle::print_buffer_counters(); - cppuddle::force_buffer_cleanup(); // Cleanup all buffers and the managers for better + cppuddle::memory_recycling::print_buffer_counters(); + cppuddle::memory_recycling::force_buffer_cleanup(); // Cleanup all buffers and the managers for better // comparison // Recycle Test: @@ -100,8 +102,9 @@ int main(int argc, char *argv[]) { std::cout << "\nStarting run with recycle allocator: " << std::endl; for (size_t pass = 0; pass < passes; pass++) { auto begin = std::chrono::high_resolution_clock::now(); - std::vector> test1( - array_size, double{}); + std::vector> + test1(array_size, double{}); auto end = std::chrono::high_resolution_clock::now(); recycle_duration += std::chrono::duration_cast(end - begin) @@ -113,8 +116,8 @@ int main(int argc, char *argv[]) { std::cout << "\n\n==> Recycle allocation test took " << recycle_duration << "ms" << std::endl; } - cppuddle::print_buffer_counters(); - cppuddle::force_buffer_cleanup(); // Cleanup all buffers and the managers for better + cppuddle::memory_recycling::print_buffer_counters(); + cppuddle::memory_recycling::force_buffer_cleanup(); // Cleanup all buffers and the managers for better // comparison // Same test using std::allocator: @@ -123,7 +126,7 @@ int main(int argc, char *argv[]) { for (size_t pass = 0; pass < passes; pass++) { auto begin = std::chrono::high_resolution_clock::now(); std::vector> - test2(array_size, double{}); + test2(array_size, double{}); auto end = std::chrono::high_resolution_clock::now(); default_duration += std::chrono::duration_cast(end - begin) @@ -145,7 +148,7 @@ int main(int argc, char *argv[]) { std::cout << "Test information: Aggressive recycler was faster than default allocator!" 
<< std::endl; } - cppuddle::print_buffer_counters(); + cppuddle::memory_recycling::print_buffer_counters(); #ifdef CPPUDDLE_HAVE_HPX return hpx::finalize(); #else diff --git a/tests/allocator_hpx_test.cpp b/tests/allocator_hpx_test.cpp index 4af0878c..b94e305e 100644 --- a/tests/allocator_hpx_test.cpp +++ b/tests/allocator_hpx_test.cpp @@ -112,8 +112,8 @@ int hpx_main(int argc, char *argv[]) { for (size_t pass = 0; pass < passes; pass++) { for (size_t i = 0; i < number_futures; i++) { futs[i] = futs[i].then([&](hpx::shared_future &&predecessor) { - std::vector> test6(array_size, - double{}); + std::vector> + test6(array_size, double{}); }); } } @@ -126,20 +126,20 @@ int hpx_main(int argc, char *argv[]) { std::cout << "\n==> Recycle allocation test took " << recycle_duration << "ms" << std::endl; } - cppuddle::print_buffer_counters(); - cppuddle::force_buffer_cleanup(); // Cleanup all buffers and the managers for better + cppuddle::memory_recycling::print_buffer_counters(); + cppuddle::memory_recycling::force_buffer_cleanup(); // Cleanup all buffers and the managers for better // comparison // ensure that at least 4 buffers have to created for unit testing { - std::vector> buffer1( + std::vector> buffer1( array_size, double{}); - std::vector> buffer2( + std::vector> buffer2( array_size, double{}); - std::vector> buffer3( + std::vector> buffer3( array_size, double{}); - std::vector> buffer4( + std::vector> buffer4( array_size, double{}); } @@ -153,8 +153,10 @@ int hpx_main(int argc, char *argv[]) { for (size_t pass = 0; pass < passes; pass++) { for (size_t i = 0; i < number_futures; i++) { futs[i] = futs[i].then([&](hpx::shared_future &&predecessor) { - std::vector> test6( - array_size, double{}); + std::vector< + double, + cppuddle::memory_recycling::aggressive_recycle_std> + test6(array_size, double{}); }); } } @@ -167,8 +169,8 @@ int hpx_main(int argc, char *argv[]) { std::cout << "\n==> Aggressive recycle allocation test took " << aggressive_duration << "ms" << std::endl; } - cppuddle::print_buffer_counters(); - cppuddle::force_buffer_cleanup(); // Cleanup all buffers and the managers for better + cppuddle::memory_recycling::print_buffer_counters(); + cppuddle::memory_recycling::force_buffer_cleanup(); // Cleanup all buffers and the managers for better // comparison diff --git a/tests/allocator_kokkos_executor_for_loop_test.cpp b/tests/allocator_kokkos_executor_for_loop_test.cpp index 2ac9ea17..439eb374 100644 --- a/tests/allocator_kokkos_executor_for_loop_test.cpp +++ b/tests/allocator_kokkos_executor_for_loop_test.cpp @@ -37,18 +37,17 @@ template using kokkos_um_array = Kokkos::View; template -using recycle_host_view = - cppuddle::recycle_view, cppuddle::recycle_std, T>; - +using recycle_host_view = cppuddle::memory_recycling::recycling_view< + kokkos_um_array, cppuddle::memory_recycling::recycle_std, T>; // Device views using recycle allocators template using kokkos_um_device_array = Kokkos::View; template -using recycle_device_view = - cppuddle::recycle_view, - cppuddle::recycle_allocator_cuda_device, T>; +using recycle_device_view = cppuddle::memory_recycling::recycling_view< + kokkos_um_device_array, + cppuddle::memory_recycling::recycle_allocator_cuda_device, T>; // Host views using pinned memory recycle allocators template @@ -56,9 +55,9 @@ using kokkos_um_pinned_array = Kokkos::View::array_layout, Kokkos::CudaHostPinnedSpace, Kokkos::MemoryUnmanaged>; template -using recycle_pinned_view = - cppuddle::recycle_view, - cppuddle::recycle_allocator_cuda_host, T>; +using 
recycle_pinned_view = cppuddle::memory_recycling::recycling_view< + kokkos_um_pinned_array, + cppuddle::memory_recycling::recycle_allocator_cuda_host, T>; template auto get_iteration_policy(const Executor &&executor, @@ -144,11 +143,11 @@ int main(int argc, char *argv[]) { // otherwise the HPX cuda polling futures won't work hpx::cuda::experimental::detail::unregister_polling(hpx::resource::get_thread_pool(0)); - cppuddle::print_buffer_counters(); + cppuddle::memory_recycling::print_buffer_counters(); // Cleanup all cuda views // (otherwise the cuda driver might shut down before this gets done automatically at // the end of the programm) - cppuddle::force_buffer_cleanup(); + cppuddle::memory_recycling::force_buffer_cleanup(); return hpx::finalize(); } diff --git a/tests/allocator_kokkos_test.cpp b/tests/allocator_kokkos_test.cpp index 4826efec..b513289f 100644 --- a/tests/allocator_kokkos_test.cpp +++ b/tests/allocator_kokkos_test.cpp @@ -33,8 +33,8 @@ template using kokkos_um_array = Kokkos::View; template -using recycle_host_view = - cppuddle::recycle_view, cppuddle::recycle_std, T>; +using recycle_host_view = cppuddle::memory_recycling::recycling_view< + kokkos_um_array, cppuddle::memory_recycling::recycle_std, T>; #ifdef CPPUDDLE_HAVE_HPX int hpx_main(int argc, char *argv[]) { @@ -92,7 +92,7 @@ int main(int argc, char *argv[]) { }); Kokkos::fence(); } - cppuddle::print_buffer_counters(); + cppuddle::memory_recycling::print_buffer_counters(); #ifdef CPPUDDLE_HAVE_HPX return hpx::finalize(); #else diff --git a/tests/allocator_test.cpp b/tests/allocator_test.cpp index 8fc7c5bb..8e13a6df 100644 --- a/tests/allocator_test.cpp +++ b/tests/allocator_test.cpp @@ -77,8 +77,9 @@ int main(int argc, char *argv[]) { std::cout << "\nStarting run with aggressive recycle allocator: " << std::endl; for (size_t pass = 0; pass < passes; pass++) { auto begin = std::chrono::high_resolution_clock::now(); - std::vector> test1( - array_size, double{}); + std::vector> + test1(array_size, double{}); auto end = std::chrono::high_resolution_clock::now(); aggressive_duration += std::chrono::duration_cast(end - begin) @@ -89,8 +90,8 @@ int main(int argc, char *argv[]) { std::cout << "\n\n==> Aggressive recycle allocation test took " << aggressive_duration << "ms" << std::endl; } - cppuddle::print_buffer_counters(); - cppuddle::force_buffer_cleanup(); // Cleanup all buffers and the managers for + cppuddle::memory_recycling::print_buffer_counters(); + cppuddle::memory_recycling::force_buffer_cleanup(); // Cleanup all buffers and the managers for // better comparison // Recycle Test: @@ -98,7 +99,8 @@ int main(int argc, char *argv[]) { std::cout << "\nStarting run with recycle allocator: " << std::endl; for (size_t pass = 0; pass < passes; pass++) { auto begin = std::chrono::high_resolution_clock::now(); - std::vector> test1(array_size, double{}); + std::vector> + test1(array_size, double{}); auto end = std::chrono::high_resolution_clock::now(); recycle_duration += std::chrono::duration_cast(end - begin) @@ -109,8 +111,8 @@ int main(int argc, char *argv[]) { std::cout << "\n\n==> Recycle allocation test took " << recycle_duration << "ms" << std::endl; } - cppuddle::print_buffer_counters(); - cppuddle::force_buffer_cleanup(); // Cleanup all buffers and the managers for + cppuddle::memory_recycling::print_buffer_counters(); + cppuddle::memory_recycling::force_buffer_cleanup(); // Cleanup all buffers and the managers for // better comparison // Same test using std::allocator: @@ -139,7 +141,7 @@ int main(int argc, char 
*argv[]) { std::cout << "Test information: Aggressive recycler was faster than default allocator!" << std::endl; } - cppuddle::print_buffer_counters(); + cppuddle::memory_recycling::print_buffer_counters(); #ifdef CPPUDDLE_HAVE_HPX return hpx::finalize(); #else From 38925759dda5bc75018a4d642042fb19d37115ae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20Dai=C3=9F?= Date: Fri, 8 Mar 2024 01:17:01 +0100 Subject: [PATCH 07/19] Move memory headers into new directory --- include/aggregation_manager.hpp | 2 +- include/aligned_buffer_util.hpp | 2 +- include/buffer_manager.hpp | 6 +++--- .../{detail => cppuddle/common}/config.hpp | 0 .../aligned_recycling_allocators.hpp | 1 + .../buffer_management_interface.hpp | 0 .../cuda_recycling_allocators.hpp | 0 .../detail/buffer_management.hpp | 2 +- .../hip_recycling_allocators.hpp | 0 .../recycling_kokkos_view.hpp | 0 .../std_recycling_allocators.hpp | 1 + .../sycl_recycling_allocators.hpp | 0 include/cuda_buffer_util.hpp | 3 ++- include/hip_buffer_util.hpp | 2 +- include/kokkos_buffer_util.hpp | 2 +- include/stream_manager.hpp | 20 +++++++++++++------ include/sycl_buffer_util.hpp | 2 +- tests/allocator_aligned_test.cpp | 3 ++- tests/allocator_hpx_test.cpp | 2 +- ...llocator_kokkos_executor_for_loop_test.cpp | 6 +++--- tests/allocator_kokkos_test.cpp | 6 +++--- tests/allocator_test.cpp | 2 +- tests/stream_test.hpp | 4 ++-- 23 files changed, 39 insertions(+), 27 deletions(-) rename include/{detail => cppuddle/common}/config.hpp (100%) rename include/{ => cppuddle/memory_recycling}/aligned_recycling_allocators.hpp (99%) rename include/{ => cppuddle/memory_recycling}/buffer_management_interface.hpp (100%) rename include/{ => cppuddle/memory_recycling}/cuda_recycling_allocators.hpp (100%) rename include/{ => cppuddle/memory_recycling}/detail/buffer_management.hpp (99%) rename include/{ => cppuddle/memory_recycling}/hip_recycling_allocators.hpp (100%) rename include/{ => cppuddle/memory_recycling}/recycling_kokkos_view.hpp (100%) rename include/{ => cppuddle/memory_recycling}/std_recycling_allocators.hpp (99%) rename include/{ => cppuddle/memory_recycling}/sycl_recycling_allocators.hpp (100%) diff --git a/include/aggregation_manager.hpp b/include/aggregation_manager.hpp index 1cbe09db..9e40797c 100644 --- a/include/aggregation_manager.hpp +++ b/include/aggregation_manager.hpp @@ -50,7 +50,7 @@ #include "../include/buffer_manager.hpp" #include "../include/stream_manager.hpp" -#include "../include/detail/config.hpp" +#include "cppuddle/common/config.hpp" #ifndef CPPUDDLE_HAVE_HPX_MUTEX #pragma message \ diff --git a/include/aligned_buffer_util.hpp b/include/aligned_buffer_util.hpp index e4ef7990..84b9be19 100644 --- a/include/aligned_buffer_util.hpp +++ b/include/aligned_buffer_util.hpp @@ -6,7 +6,7 @@ #ifndef ALIGNED_BUFFER_UTIL_HPP #define ALIGNED_BUFFER_UTIL_HPP -#include "aligned_recycling_allocators.hpp" +#include "cppuddle/memory_recycling/aligned_recycling_allocators.hpp" namespace recycler { diff --git a/include/buffer_manager.hpp b/include/buffer_manager.hpp index 25e5ce00..186808d2 100644 --- a/include/buffer_manager.hpp +++ b/include/buffer_manager.hpp @@ -1,8 +1,8 @@ -#ifndef BUFFER_MANAGER_INTERFACE_HPP +#ifndef BUFFER_MANAGER_HPP #define BUFFER_MANAGER_HPP -#include "buffer_management_interface.hpp" -#include "std_recycling_allocators.hpp" +#include "cppuddle/memory_recycling/buffer_management_interface.hpp" +#include "cppuddle/memory_recycling/std_recycling_allocators.hpp" namespace recycler { diff --git a/include/detail/config.hpp 
b/include/cppuddle/common/config.hpp similarity index 100% rename from include/detail/config.hpp rename to include/cppuddle/common/config.hpp diff --git a/include/aligned_recycling_allocators.hpp b/include/cppuddle/memory_recycling/aligned_recycling_allocators.hpp similarity index 99% rename from include/aligned_recycling_allocators.hpp rename to include/cppuddle/memory_recycling/aligned_recycling_allocators.hpp index ee0182bb..8a9df8ec 100644 --- a/include/aligned_recycling_allocators.hpp +++ b/include/cppuddle/memory_recycling/aligned_recycling_allocators.hpp @@ -11,6 +11,7 @@ namespace cppuddle { namespace memory_recycling { + namespace device_selection { template /// Dummy GPU selector. Needs to be defined for MultiGPU builds as the default / diff --git a/include/buffer_management_interface.hpp b/include/cppuddle/memory_recycling/buffer_management_interface.hpp similarity index 100% rename from include/buffer_management_interface.hpp rename to include/cppuddle/memory_recycling/buffer_management_interface.hpp diff --git a/include/cuda_recycling_allocators.hpp b/include/cppuddle/memory_recycling/cuda_recycling_allocators.hpp similarity index 100% rename from include/cuda_recycling_allocators.hpp rename to include/cppuddle/memory_recycling/cuda_recycling_allocators.hpp diff --git a/include/detail/buffer_management.hpp b/include/cppuddle/memory_recycling/detail/buffer_management.hpp similarity index 99% rename from include/detail/buffer_management.hpp rename to include/cppuddle/memory_recycling/detail/buffer_management.hpp index 98504d21..6d95ab8f 100644 --- a/include/detail/buffer_management.hpp +++ b/include/cppuddle/memory_recycling/detail/buffer_management.hpp @@ -42,7 +42,7 @@ For better performance configure CPPuddle with CPPUDDLE_WITH_HPX_AWARE_ALLOCATOR #endif #endif -#include "config.hpp" +#include "cppuddle/common/config.hpp" namespace cppuddle { namespace memory_recycling { diff --git a/include/hip_recycling_allocators.hpp b/include/cppuddle/memory_recycling/hip_recycling_allocators.hpp similarity index 100% rename from include/hip_recycling_allocators.hpp rename to include/cppuddle/memory_recycling/hip_recycling_allocators.hpp diff --git a/include/recycling_kokkos_view.hpp b/include/cppuddle/memory_recycling/recycling_kokkos_view.hpp similarity index 100% rename from include/recycling_kokkos_view.hpp rename to include/cppuddle/memory_recycling/recycling_kokkos_view.hpp diff --git a/include/std_recycling_allocators.hpp b/include/cppuddle/memory_recycling/std_recycling_allocators.hpp similarity index 99% rename from include/std_recycling_allocators.hpp rename to include/cppuddle/memory_recycling/std_recycling_allocators.hpp index 141b0874..21fd5c2c 100644 --- a/include/std_recycling_allocators.hpp +++ b/include/cppuddle/memory_recycling/std_recycling_allocators.hpp @@ -10,6 +10,7 @@ namespace cppuddle { namespace memory_recycling { + namespace device_selection { /// Dummy GPU selector. 
Needs to be defined for MultiGPU builds as the default / /// select_device_functor does not compile for > 1 GPU (to make sure all / diff --git a/include/sycl_recycling_allocators.hpp b/include/cppuddle/memory_recycling/sycl_recycling_allocators.hpp similarity index 100% rename from include/sycl_recycling_allocators.hpp rename to include/cppuddle/memory_recycling/sycl_recycling_allocators.hpp diff --git a/include/cuda_buffer_util.hpp b/include/cuda_buffer_util.hpp index 6334da8a..7589993d 100644 --- a/include/cuda_buffer_util.hpp +++ b/include/cuda_buffer_util.hpp @@ -6,7 +6,8 @@ #ifndef CUDA_BUFFER_UTIL_HPP #define CUDA_BUFFER_UTIL_HPP -#include "cuda_recycling_allocators.hpp" +#include "buffer_manager.hpp" +#include "cppuddle/memory_recycling/cuda_recycling_allocators.hpp" namespace recycler { namespace detail { diff --git a/include/hip_buffer_util.hpp b/include/hip_buffer_util.hpp index eadedc07..2912666f 100644 --- a/include/hip_buffer_util.hpp +++ b/include/hip_buffer_util.hpp @@ -6,7 +6,7 @@ #ifndef HIP_BUFFER_UTIL_HPP #define HIP_BUFFER_UTIL_HPP -#include "hip_recycling_allocators.hpp" +#include "cppuddle/memory_recycling/hip_recycling_allocators.hpp" namespace recycler { diff --git a/include/kokkos_buffer_util.hpp b/include/kokkos_buffer_util.hpp index 7b267619..fc66e539 100644 --- a/include/kokkos_buffer_util.hpp +++ b/include/kokkos_buffer_util.hpp @@ -5,7 +5,7 @@ #ifndef KOKKOS_BUFFER_UTIL_HPP #define KOKKOS_BUFFER_UTIL_HPP -#include "recycling_kokkos_view.hpp" +#include "cppuddle/memory_recycling/recycling_kokkos_view.hpp" namespace recycler { diff --git a/include/stream_manager.hpp b/include/stream_manager.hpp index bfaba518..4d681e1a 100644 --- a/include/stream_manager.hpp +++ b/include/stream_manager.hpp @@ -17,7 +17,7 @@ #include #include -#include "../include/detail/config.hpp" +#include "cppuddle/common/config.hpp" // Need to cuda/hip definitions for default params when NOT // drawing from an executor pool @@ -39,6 +39,10 @@ enum class execution_space_mode { global, independent }; #endif #endif +/* namespace cppuddle { */ +/* namespace executor_recycling { */ + +namespace detail { /// Turns a std::array_mutex into an scoped lock template auto make_scoped_lock_from_array(mutex_array_t& mutexes) @@ -46,6 +50,7 @@ auto make_scoped_lock_from_array(mutex_array_t& mutexes) return std::apply([](auto&... mutexes) { return std::scoped_lock{mutexes...}; }, mutexes); } +} // namespace detail template class round_robin_pool { private: @@ -197,7 +202,7 @@ class stream_pool { static void init(size_t number_of_streams, Ts ... executor_args) { /* static_assert(sizeof...(Ts) == sizeof...(Ts) && cppuddle::max_number_gpus == 1, */ /* "deprecated stream_pool::init does not support multigpu"); */ - auto guard = make_scoped_lock_from_array(instance().gpu_mutexes); + auto guard = detail::make_scoped_lock_from_array(instance().gpu_mutexes); instance().streampools.emplace_back(number_of_streams, executor_args...); assert(instance().streampools.size() <= cppuddle::max_number_gpus); } @@ -205,7 +210,7 @@ class stream_pool { /// Multi-GPU init where executors / interfaces on all GPUs are initialized with the same arguments template static void init_all_executor_pools(size_t number_of_streams, Ts ... 
executor_args) { - auto guard = make_scoped_lock_from_array(instance().gpu_mutexes); + auto guard = detail::make_scoped_lock_from_array(instance().gpu_mutexes); if (number_of_streams > 0) { for (size_t gpu_id = 0; gpu_id < cppuddle::max_number_gpus; gpu_id++) { instance().select_gpu_function(gpu_id); @@ -220,7 +225,7 @@ class stream_pool { /// (useful for executor that expect an GPU-id during construction) template static void init_executor_pool(size_t gpu_id, size_t number_of_streams, Ts ... executor_args) { - auto guard = make_scoped_lock_from_array(instance().gpu_mutexes); + auto guard = detail::make_scoped_lock_from_array(instance().gpu_mutexes); if (number_of_streams > 0) { instance().select_gpu_function(gpu_id); instance().streampools.emplace_back(number_of_streams, @@ -231,7 +236,7 @@ class stream_pool { // TODO add/rename into finalize? static void cleanup() { - auto guard = make_scoped_lock_from_array(instance().gpu_mutexes); + auto guard = detail::make_scoped_lock_from_array(instance().gpu_mutexes); assert(instance().streampools.size() == cppuddle::max_number_gpus); instance().streampools.clear(); } @@ -264,7 +269,7 @@ class stream_pool { /* } */ static void set_device_selector(std::function select_gpu_function) { - auto guard = make_scoped_lock_from_array(instance().gpu_mutexes); + auto guard = detail::make_scoped_lock_from_array(instance().gpu_mutexes); instance().select_gpu_function = select_gpu_function; } @@ -410,4 +415,7 @@ template class stream_interface { }; #endif +/* } // namespace executor_recycling */ +/* } // namespace cppuddle */ + #endif diff --git a/include/sycl_buffer_util.hpp b/include/sycl_buffer_util.hpp index 4da36df9..7c88a4df 100644 --- a/include/sycl_buffer_util.hpp +++ b/include/sycl_buffer_util.hpp @@ -6,7 +6,7 @@ #ifndef SYCL_BUFFER_UTIL_HPP #define SYCL_BUFFER_UTIL_HPP -#include "sycl_recycling_allocators.hpp" +#include "cppuddle/memory_recycling/sycl_recycling_allocators.hpp" namespace recycler { diff --git a/tests/allocator_aligned_test.cpp b/tests/allocator_aligned_test.cpp index 510ac9cd..ea9ce9a4 100644 --- a/tests/allocator_aligned_test.cpp +++ b/tests/allocator_aligned_test.cpp @@ -3,7 +3,6 @@ // Distributed under the Boost Software License, Version 1.0. 
(See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) -#include "../include/aligned_recycling_allocators.hpp" #ifdef CPPUDDLE_HAVE_HPX #include #endif @@ -17,6 +16,8 @@ #include #include +#include "cppuddle/memory_recycling/aligned_recycling_allocators.hpp" + #ifdef CPPUDDLE_HAVE_HPX int hpx_main(int argc, char *argv[]) { #else diff --git a/tests/allocator_hpx_test.cpp b/tests/allocator_hpx_test.cpp index b94e305e..21c4baed 100644 --- a/tests/allocator_hpx_test.cpp +++ b/tests/allocator_hpx_test.cpp @@ -15,7 +15,7 @@ #include -#include "std_recycling_allocators.hpp" +#include "cppuddle/memory_recycling/std_recycling_allocators.hpp" int hpx_main(int argc, char *argv[]) { diff --git a/tests/allocator_kokkos_executor_for_loop_test.cpp b/tests/allocator_kokkos_executor_for_loop_test.cpp index 439eb374..c38294d7 100644 --- a/tests/allocator_kokkos_executor_for_loop_test.cpp +++ b/tests/allocator_kokkos_executor_for_loop_test.cpp @@ -21,9 +21,9 @@ #include #include -#include "std_recycling_allocators.hpp" -#include "cuda_recycling_allocators.hpp" -#include "recycling_kokkos_view.hpp" +#include "cppuddle/memory_recycling/std_recycling_allocators.hpp" +#include "cppuddle/memory_recycling/cuda_recycling_allocators.hpp" +#include "cppuddle/memory_recycling/recycling_kokkos_view.hpp" // Assert during Release builds as well for this file: #undef NDEBUG diff --git a/tests/allocator_kokkos_test.cpp b/tests/allocator_kokkos_test.cpp index b513289f..5fb780e5 100644 --- a/tests/allocator_kokkos_test.cpp +++ b/tests/allocator_kokkos_test.cpp @@ -21,9 +21,9 @@ #include #include -#include "std_recycling_allocators.hpp" -#include "cuda_recycling_allocators.hpp" -#include "recycling_kokkos_view.hpp" +#include "cppuddle/memory_recycling/std_recycling_allocators.hpp" +#include "cppuddle/memory_recycling/cuda_recycling_allocators.hpp" +#include "cppuddle/memory_recycling/recycling_kokkos_view.hpp" using kokkos_array = Kokkos::View; diff --git a/tests/allocator_test.cpp b/tests/allocator_test.cpp index 8e13a6df..9a44664f 100644 --- a/tests/allocator_test.cpp +++ b/tests/allocator_test.cpp @@ -16,7 +16,7 @@ #include #include -#include "std_recycling_allocators.hpp" +#include "cppuddle/memory_recycling/std_recycling_allocators.hpp" #ifdef CPPUDDLE_HAVE_HPX int hpx_main(int argc, char *argv[]) { diff --git a/tests/stream_test.hpp b/tests/stream_test.hpp index 07de4c44..2cfc5b07 100644 --- a/tests/stream_test.hpp +++ b/tests/stream_test.hpp @@ -9,8 +9,8 @@ #include #include #include -#include "../include/buffer_manager.hpp" -#include "../include/cuda_buffer_util.hpp" +#include "cppuddle/memory_recycling/cuda_recycling_allocators.hpp" +#include "cuda_buffer_util.hpp" template void test_pool_memcpy(const size_t stream_parameter, Ts &&... 
ts) { From e2da83dd51511a82985659fedc5574df027e027c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20Dai=C3=9F?= Date: Fri, 8 Mar 2024 02:50:12 +0100 Subject: [PATCH 08/19] Executor pool interface refactoring --- include/aggregation_manager.hpp | 2 +- include/aligned_buffer_util.hpp | 2 +- include/buffer_manager.hpp | 5 + include/cppuddle/common/config.hpp | 2 +- .../executor_pools_management.hpp | 421 +++++++++++++++++ .../aligned_recycling_allocators.hpp | 2 +- .../buffer_management_interface.hpp | 5 + .../detail/buffer_management.hpp | 2 +- .../hip_recycling_allocators.hpp | 2 +- .../recycling_kokkos_view.hpp | 2 +- .../sycl_recycling_allocators.hpp | 2 +- include/cuda_buffer_util.hpp | 2 +- include/hip_buffer_util.hpp | 2 +- include/kokkos_buffer_util.hpp | 2 +- include/stream_manager.hpp | 437 +----------------- include/sycl_buffer_util.hpp | 2 +- tests/stream_test.cpp | 48 +- tests/stream_test.hpp | 168 ++++--- 18 files changed, 606 insertions(+), 502 deletions(-) create mode 100644 include/cppuddle/executor_recycling/executor_pools_management.hpp diff --git a/include/aggregation_manager.hpp b/include/aggregation_manager.hpp index 9e40797c..70acfd61 100644 --- a/include/aggregation_manager.hpp +++ b/include/aggregation_manager.hpp @@ -1,4 +1,4 @@ -// Copyright (c) 2022-2023 Gregor Daiß +// Copyright (c) 2022-2024 Gregor Daiß // // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) diff --git a/include/aligned_buffer_util.hpp b/include/aligned_buffer_util.hpp index 84b9be19..02a57104 100644 --- a/include/aligned_buffer_util.hpp +++ b/include/aligned_buffer_util.hpp @@ -1,4 +1,4 @@ -// Copyright (c) 2020-2021 Gregor Daiß +// Copyright (c) 2024 Gregor Daiß // // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) diff --git a/include/buffer_manager.hpp b/include/buffer_manager.hpp index 186808d2..fb253990 100644 --- a/include/buffer_manager.hpp +++ b/include/buffer_manager.hpp @@ -1,3 +1,8 @@ +// Copyright (c) 2024 Gregor Daiß +// +// Distributed under the Boost Software License, Version 1.0. (See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + #ifndef BUFFER_MANAGER_HPP #define BUFFER_MANAGER_HPP diff --git a/include/cppuddle/common/config.hpp b/include/cppuddle/common/config.hpp index 7115c790..c9a5f736 100644 --- a/include/cppuddle/common/config.hpp +++ b/include/cppuddle/common/config.hpp @@ -1,4 +1,4 @@ -// Copyright (c) 2023-2023 Gregor Daiß +// Copyright (c) 2023-2024 Gregor Daiß // // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) diff --git a/include/cppuddle/executor_recycling/executor_pools_management.hpp b/include/cppuddle/executor_recycling/executor_pools_management.hpp new file mode 100644 index 00000000..16776031 --- /dev/null +++ b/include/cppuddle/executor_recycling/executor_pools_management.hpp @@ -0,0 +1,421 @@ +// Copyright (c) 2020-2024 Gregor Daiß +// +// Distributed under the Boost Software License, Version 1.0. 
(See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#ifndef EXECUTOR_POOLS_MANAGEMENT_HPP +#define EXECUTOR_POOLS_MANAGEMENT_HPP + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cppuddle/common/config.hpp" + +// Need to cuda/hip definitions for default params when NOT +// drawing from an executor pool +#if defined(CPPUDDLE_DEACTIVATE_EXECUTOR_RECYCLING) +#include +#if defined(HPX_HAVE_CUDA) || defined(HPX_HAVE_HIP) +#include +#endif +#endif + +// Redefintion required for non-recycling executors +// Without it, default constructing the executors (independent) would not work +#if defined(CPPUDDLE_DEACTIVATE_EXECUTOR_RECYCLING) +// Do only define if Kokkos is not found +#ifndef KOKKOS_ENABLE_SERIAL +namespace hpx { namespace kokkos { +enum class execution_space_mode { global, independent }; +}} +#endif +#endif + +namespace cppuddle { +namespace executor_recycling { + +namespace detail { +/// Turns a std::array_mutex into an scoped lock +template +auto make_scoped_lock_from_array(mutex_array_t& mutexes) +{ + return std::apply([](auto&... mutexes) { return std::scoped_lock{mutexes...}; }, + mutexes); +} +} // namespace detail + +template class round_robin_pool_impl { +private: + std::deque pool{}; + std::vector ref_counters{}; + size_t current_interface{0}; + +public: + template + round_robin_pool_impl(size_t number_of_executors, Ts... executor_args) { + ref_counters.reserve(number_of_executors); + for (int i = 0; i < number_of_executors; i++) { + pool.emplace_back(executor_args...); + ref_counters.emplace_back(0); + } + } + // return a tuple with the interface and its index (to release it later) + std::tuple get_interface() { + assert(!(pool.empty())); + size_t last_interface = current_interface; + current_interface = (current_interface + 1) % pool.size(); + ref_counters[last_interface]++; + std::tuple ret(pool[last_interface], last_interface); + return ret; + } + void release_interface(size_t index) { ref_counters[index]--; } + bool interface_available(size_t load_limit) { + return *(std::min_element(std::begin(ref_counters), + std::end(ref_counters))) < load_limit; + } + size_t get_current_load() { + return *( + std::min_element(std::begin(ref_counters), std::end(ref_counters))); + } + // TODO Remove + /* size_t get_next_device_id() { */ + /* return 0; // single gpu pool */ + /* } */ +}; + +template class priority_pool_impl { +private: + std::deque pool{}; + std::vector ref_counters{}; // Ref counters + std::vector priorities{}; // Ref counters +public: + template + priority_pool_impl(size_t number_of_executors, Ts... 
executor_args) { + ref_counters.reserve(number_of_executors); + priorities.reserve(number_of_executors); + for (auto i = 0; i < number_of_executors; i++) { + pool.emplace_back(executor_args...); + ref_counters.emplace_back(0); + priorities.emplace_back(i); + } + } + // return a tuple with the interface and its index (to release it later) + std::tuple get_interface() { + auto &interface = pool[priorities[0]]; + ref_counters[priorities[0]]++; + std::tuple ret(interface, priorities[0]); + std::make_heap(std::begin(priorities), std::end(priorities), + [this](const size_t &first, const size_t &second) -> bool { + return ref_counters[first] > ref_counters[second]; + }); + return ret; + } + void release_interface(size_t index) { + ref_counters[index]--; + std::make_heap(std::begin(priorities), std::end(priorities), + [this](const size_t &first, const size_t &second) -> bool { + return ref_counters[first] > ref_counters[second]; + }); + } + bool interface_available(size_t load_limit) { + return ref_counters[priorities[0]] < load_limit; + } + size_t get_current_load() { return ref_counters[priorities[0]]; } + // TODO remove + /* size_t get_next_device_id() { */ + /* return 0; // single gpu pool */ + /* } */ +}; + +/// Access/Concurrency Control for executor pool implementation +class executor_pool { +public: + template + static void init(size_t number_of_executors, Ts ... executor_args) { + executor_pool_implementation::init(number_of_executors, + executor_args...); + } + template + static void init_all_executor_pools(size_t number_of_executors, Ts ... executor_args) { + executor_pool_implementation::init_all_executor_pools(number_of_executors, + executor_args...); + } + template + static void init_executor_pool(size_t pool_id, size_t number_of_executors, Ts ... executor_args) { + executor_pool_implementation::init_executor_pool(pool_id, number_of_executors, + executor_args...); + } + template static void cleanup() { + executor_pool_implementation::cleanup(); + } + template + static std::tuple get_interface(const size_t gpu_id) { + return executor_pool_implementation::get_interface(gpu_id); + } + template + static void release_interface(size_t index, const size_t gpu_id) noexcept { + executor_pool_implementation::release_interface(index, + gpu_id); + } + template + static bool interface_available(size_t load_limit, const size_t gpu_id) noexcept { + return executor_pool_implementation::interface_available( + load_limit, gpu_id); + } + template + static size_t get_current_load(const size_t gpu_id = 0) noexcept { + return executor_pool_implementation::get_current_load( + gpu_id); + } + template + static size_t get_next_device_id(const size_t number_gpus) noexcept { + // TODO add round robin and min strategy + return cppuddle::get_device_id(number_gpus); + } + + template + static void set_device_selector(std::function select_gpu_function) { + executor_pool_implementation::set_device_selector(select_gpu_function); + } + + template + static void select_device(size_t gpu_id) { + executor_pool_implementation::select_device(gpu_id); + } + +private: + executor_pool() = default; + +private: + template class executor_pool_implementation { + public: + /// Deprecated! Use init_on_all_gpu or init_on_gpu + template + static void init(size_t number_of_executors, Ts ... 
executor_args) { + /* static_assert(sizeof...(Ts) == sizeof...(Ts) && cppuddle::max_number_gpus == 1, */ + /* "deprecated executor_pool::init does not support multigpu"); */ + auto guard = detail::make_scoped_lock_from_array(instance().gpu_mutexes); + instance().executorpools.emplace_back(number_of_executors, executor_args...); + assert(instance().executorpools.size() <= cppuddle::max_number_gpus); + } + + /// Multi-GPU init where executors / interfaces on all GPUs are initialized with the same arguments + template + static void init_all_executor_pools(size_t number_of_executors, Ts ... executor_args) { + auto guard = detail::make_scoped_lock_from_array(instance().gpu_mutexes); + if (number_of_executors > 0) { + for (size_t gpu_id = 0; gpu_id < cppuddle::max_number_gpus; gpu_id++) { + instance().select_gpu_function(gpu_id); + instance().executorpools.emplace_back(number_of_executors, + executor_args...); + } + } + assert(instance().executorpools.size() <= cppuddle::max_number_gpus); + } + + /// Per-GPU init allowing for different init parameters depending on the GPU + /// (useful for executor that expect an GPU-id during construction) + template + static void init_executor_pool(size_t gpu_id, size_t number_of_executors, Ts ... executor_args) { + auto guard = detail::make_scoped_lock_from_array(instance().gpu_mutexes); + if (number_of_executors > 0) { + instance().select_gpu_function(gpu_id); + instance().executorpools.emplace_back(number_of_executors, + executor_args...); + } + assert(instance().executorpools.size() <= cppuddle::max_number_gpus); + } + + // TODO add/rename into finalize? + static void cleanup() { + auto guard = detail::make_scoped_lock_from_array(instance().gpu_mutexes); + assert(instance().executorpools.size() == cppuddle::max_number_gpus); + instance().executorpools.clear(); + } + + static std::tuple get_interface(const size_t gpu_id = 0) { + std::lock_guard guard(instance().gpu_mutexes[gpu_id]); + assert(gpu_id < instance().executorpools.size()); + return instance().executorpools[gpu_id].get_interface(); + } + static void release_interface(size_t index, const size_t gpu_id = 0) { + std::lock_guard guard(instance().gpu_mutexes[gpu_id]); + assert(gpu_id < instance().executorpools.size()); + instance().executorpools[gpu_id].release_interface(index); + } + static bool interface_available(size_t load_limit, const size_t gpu_id = 0) { + std::lock_guard guard(instance().gpu_mutexes[gpu_id]); + assert(gpu_id < instance().executorpools.size()); + return instance().executorpools[gpu_id].interface_available(load_limit); + } + static size_t get_current_load(const size_t gpu_id = 0) { + std::lock_guard guard(instance().gpu_mutexes[gpu_id]); + assert(gpu_id < instance().executorpools.size()); + return instance().executorpools[gpu_id].get_current_load(); + } + // TODO deprecated! Remove... 
+ /* static size_t get_next_device_id(const size_t gpu_id = 0) { */ + /* std::lock_guard guard(instance().gpu_mutexes[gpu_id]); */ + /* assert(instance().executorpools.size() == cppuddle::max_number_gpus); */ + /* return instance().executorpools[gpu_id].get_next_device_id(); */ + /* } */ + + static void set_device_selector(std::function select_gpu_function) { + auto guard = detail::make_scoped_lock_from_array(instance().gpu_mutexes); + instance().select_gpu_function = select_gpu_function; + } + + static void select_device(size_t gpu_id) { + instance().select_gpu_function(gpu_id); + } + + private: + executor_pool_implementation() = default; + cppuddle::mutex_t pool_mut{}; + std::function select_gpu_function = [](size_t gpu_id) { + // By default no multi gpu support + assert(cppuddle::max_number_gpus == 1 || instance().executorpools.size() == 1); + assert(gpu_id == 0); + }; + + std::deque executorpools{}; + std::array gpu_mutexes; + + static executor_pool_implementation& instance(void) { + static executor_pool_implementation pool_instance{}; + return pool_instance; + } + + public: + ~executor_pool_implementation() = default; + // Bunch of constructors we don't need + executor_pool_implementation(executor_pool_implementation const &other) = + delete; + executor_pool_implementation & + operator=(executor_pool_implementation const &other) = delete; + executor_pool_implementation(executor_pool_implementation &&other) = delete; + executor_pool_implementation & + operator=(executor_pool_implementation &&other) = delete; + }; + +public: + ~executor_pool() = default; + // Bunch of constructors we don't need + executor_pool(executor_pool const &other) = delete; + executor_pool &operator=(executor_pool const &other) = delete; + executor_pool(executor_pool &&other) = delete; + executor_pool &operator=(executor_pool &&other) = delete; +}; + +#if defined(CPPUDDLE_DEACTIVATE_EXECUTOR_RECYCLING) + +// Warn about suboptimal performance without recycling +#pragma message \ +"Warning: Building without executor recycling! Use only for performance testing! \ +For better performance configure CPPuddle with CPPUDDLE_WITH_EXECUTOR_RECYCLING=ON!" + +/// Slow version of the executor_interface that does not draw its +/// executors (Interface) from the pool but creates them instead. +/// Only meant for performance comparisons and only works with cuda/kokkos executors +template class executor_interface { +public: + + template + explicit executor_interface(size_t gpu_id, + std::enable_if_t::value, size_t> = 0) + : gpu_id(gpu_id), interface(gpu_id) {} + template + explicit executor_interface(std::enable_if_t::value, size_t> = 0) + : gpu_id(gpu_id), interface(hpx::kokkos::execution_space_mode::independent) {} + + executor_interface(const executor_interface &other) = delete; + executor_interface &operator=(const executor_interface &other) = delete; + executor_interface(executor_interface &&other) = delete; + executor_interface &operator=(executor_interface &&other) = delete; + ~executor_interface() { + } + + template + inline decltype(auto) post(F &&f, Ts &&... ts) { + return interface.post(std::forward(f), std::forward(ts)...); + } + + template + inline decltype(auto) async_execute(F &&f, Ts &&... 
ts) { + return interface.async_execute(std::forward(f), std::forward(ts)...); + } + + inline decltype(auto) get_future() { + return interface.get_future(); + } + + // allow implict conversion + operator Interface &() { // NOLINT + return interface; + } + +private: + size_t gpu_id; + +public: + Interface interface; +}; +#else +/// Stream interface for RAII purposes +/// Draws executor from the executor pool and releases it upon +/// destruction +template class executor_interface { +public: + explicit executor_interface(size_t gpu_id) + : t(executor_pool::get_interface(gpu_id)), + interface(std::get<0>(t)), interface_index(std::get<1>(t)), gpu_id(gpu_id) {} + + executor_interface(const executor_interface &other) = delete; + executor_interface &operator=(const executor_interface &other) = delete; + executor_interface(executor_interface &&other) = delete; + executor_interface &operator=(executor_interface &&other) = delete; + ~executor_interface() { + executor_pool::release_interface(interface_index, gpu_id); + } + + template + inline decltype(auto) post(F &&f, Ts &&... ts) { + return interface.post(std::forward(f), std::forward(ts)...); + } + + template + inline decltype(auto) async_execute(F &&f, Ts &&... ts) { + return interface.async_execute(std::forward(f), std::forward(ts)...); + } + + inline decltype(auto) get_future() { + return interface.get_future(); + } + + // allow implict conversion + operator Interface &() { // NOLINT + return interface; + } + +private: + std::tuple t; + size_t interface_index; + size_t gpu_id; + +public: + Interface &interface; +}; +#endif + +} // namespace executor_recycling +} // namespace cppuddle + +#endif diff --git a/include/cppuddle/memory_recycling/aligned_recycling_allocators.hpp b/include/cppuddle/memory_recycling/aligned_recycling_allocators.hpp index 8a9df8ec..a824e7e0 100644 --- a/include/cppuddle/memory_recycling/aligned_recycling_allocators.hpp +++ b/include/cppuddle/memory_recycling/aligned_recycling_allocators.hpp @@ -1,4 +1,4 @@ -// Copyright (c) 2020-2021 Gregor Daiß +// Copyright (c) 2020-2024 Gregor Daiß // // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) diff --git a/include/cppuddle/memory_recycling/buffer_management_interface.hpp b/include/cppuddle/memory_recycling/buffer_management_interface.hpp index 8614568b..c5fa44cd 100644 --- a/include/cppuddle/memory_recycling/buffer_management_interface.hpp +++ b/include/cppuddle/memory_recycling/buffer_management_interface.hpp @@ -1,3 +1,8 @@ +// Copyright (c) 2024 Gregor Daiß +// +// Distributed under the Boost Software License, Version 1.0. (See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + #ifndef BUFFER_MANAGEMENT_INTERFACE_HPP #define BUFFER_MANAGEMENT_INTERFACE_HPP diff --git a/include/cppuddle/memory_recycling/detail/buffer_management.hpp b/include/cppuddle/memory_recycling/detail/buffer_management.hpp index 6d95ab8f..7c30c781 100644 --- a/include/cppuddle/memory_recycling/detail/buffer_management.hpp +++ b/include/cppuddle/memory_recycling/detail/buffer_management.hpp @@ -1,4 +1,4 @@ -// Copyright (c) 2020-2023 Gregor Daiß +// Copyright (c) 2020-2024 Gregor Daiß // // Distributed under the Boost Software License, Version 1.0. 
(See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) diff --git a/include/cppuddle/memory_recycling/hip_recycling_allocators.hpp b/include/cppuddle/memory_recycling/hip_recycling_allocators.hpp index 274fbb68..d4b2da3c 100644 --- a/include/cppuddle/memory_recycling/hip_recycling_allocators.hpp +++ b/include/cppuddle/memory_recycling/hip_recycling_allocators.hpp @@ -1,4 +1,4 @@ -// Copyright (c: 2020-2021 Gregor Daiß +// Copyright (c: 2020-2024 Gregor Daiß // // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) diff --git a/include/cppuddle/memory_recycling/recycling_kokkos_view.hpp b/include/cppuddle/memory_recycling/recycling_kokkos_view.hpp index 86085fc8..98ce2799 100644 --- a/include/cppuddle/memory_recycling/recycling_kokkos_view.hpp +++ b/include/cppuddle/memory_recycling/recycling_kokkos_view.hpp @@ -1,4 +1,4 @@ -// Copyright (c) 2020-2021 Gregor Daiß +// Copyright (c) 2020-2024 Gregor Daiß // // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) diff --git a/include/cppuddle/memory_recycling/sycl_recycling_allocators.hpp b/include/cppuddle/memory_recycling/sycl_recycling_allocators.hpp index c4f47c31..233afe71 100644 --- a/include/cppuddle/memory_recycling/sycl_recycling_allocators.hpp +++ b/include/cppuddle/memory_recycling/sycl_recycling_allocators.hpp @@ -1,4 +1,4 @@ -// Copyright (c: 2020-2021 Gregor Daiß +// Copyright (c: 2020-2024 Gregor Daiß // // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) diff --git a/include/cuda_buffer_util.hpp b/include/cuda_buffer_util.hpp index 7589993d..8d004bef 100644 --- a/include/cuda_buffer_util.hpp +++ b/include/cuda_buffer_util.hpp @@ -1,4 +1,4 @@ -// Copyright (c) 2020-2023 Gregor Daiß +// Copyright (c) 2024 Gregor Daiß // // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) diff --git a/include/hip_buffer_util.hpp b/include/hip_buffer_util.hpp index 2912666f..9bc8ccc3 100644 --- a/include/hip_buffer_util.hpp +++ b/include/hip_buffer_util.hpp @@ -1,4 +1,4 @@ -// Copyright (c: 2020-2021 Gregor Daiß +// Copyright (c): 2024 Gregor Daiß // // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) diff --git a/include/kokkos_buffer_util.hpp b/include/kokkos_buffer_util.hpp index fc66e539..54736ebe 100644 --- a/include/kokkos_buffer_util.hpp +++ b/include/kokkos_buffer_util.hpp @@ -1,4 +1,4 @@ -// Copyright (c) 2020-2021 Gregor Daiß +// Copyright (c) 2024 Gregor Daiß // // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) diff --git a/include/stream_manager.hpp b/include/stream_manager.hpp index 4d681e1a..940620d5 100644 --- a/include/stream_manager.hpp +++ b/include/stream_manager.hpp @@ -1,4 +1,4 @@ -// Copyright (c) 2020-2023 Gregor Daiß +// Copyright (c) 2024 Gregor Daiß // // Distributed under the Boost Software License, Version 1.0. 
(See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) @@ -6,416 +6,29 @@ #ifndef STREAM_MANAGER_HPP #define STREAM_MANAGER_HPP -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "cppuddle/common/config.hpp" - -// Need to cuda/hip definitions for default params when NOT -// drawing from an executor pool -#if defined(CPPUDDLE_DEACTIVATE_EXECUTOR_RECYCLING) -#include -#if defined(HPX_HAVE_CUDA) || defined(HPX_HAVE_HIP) -#include -#endif -#endif - -// Redefintion required for non-recycling executors -// Without it, default constructing the executors (independent) would not work -#if defined(CPPUDDLE_DEACTIVATE_EXECUTOR_RECYCLING) -// Do only define if Kokkos is not found -#ifndef KOKKOS_ENABLE_SERIAL -namespace hpx { namespace kokkos { -enum class execution_space_mode { global, independent }; -}} -#endif -#endif - -/* namespace cppuddle { */ -/* namespace executor_recycling { */ - -namespace detail { -/// Turns a std::array_mutex into an scoped lock -template -auto make_scoped_lock_from_array(mutex_array_t& mutexes) -{ - return std::apply([](auto&... mutexes) { return std::scoped_lock{mutexes...}; }, - mutexes); -} -} // namespace detail - -template class round_robin_pool { -private: - std::deque pool{}; - std::vector ref_counters{}; - size_t current_interface{0}; - -public: - template - round_robin_pool(size_t number_of_streams, Ts... executor_args) { - ref_counters.reserve(number_of_streams); - for (int i = 0; i < number_of_streams; i++) { - pool.emplace_back(executor_args...); - ref_counters.emplace_back(0); - } - } - // return a tuple with the interface and its index (to release it later) - std::tuple get_interface() { - assert(!(pool.empty())); - size_t last_interface = current_interface; - current_interface = (current_interface + 1) % pool.size(); - ref_counters[last_interface]++; - std::tuple ret(pool[last_interface], last_interface); - return ret; - } - void release_interface(size_t index) { ref_counters[index]--; } - bool interface_available(size_t load_limit) { - return *(std::min_element(std::begin(ref_counters), - std::end(ref_counters))) < load_limit; - } - size_t get_current_load() { - return *( - std::min_element(std::begin(ref_counters), std::end(ref_counters))); - } - // TODO Remove - /* size_t get_next_device_id() { */ - /* return 0; // single gpu pool */ - /* } */ -}; - -template class priority_pool { -private: - std::deque pool{}; - std::vector ref_counters{}; // Ref counters - std::vector priorities{}; // Ref counters -public: - template - priority_pool(size_t number_of_streams, Ts... 
executor_args) { - ref_counters.reserve(number_of_streams); - priorities.reserve(number_of_streams); - for (auto i = 0; i < number_of_streams; i++) { - pool.emplace_back(executor_args...); - ref_counters.emplace_back(0); - priorities.emplace_back(i); - } - } - // return a tuple with the interface and its index (to release it later) - std::tuple get_interface() { - auto &interface = pool[priorities[0]]; - ref_counters[priorities[0]]++; - std::tuple ret(interface, priorities[0]); - std::make_heap(std::begin(priorities), std::end(priorities), - [this](const size_t &first, const size_t &second) -> bool { - return ref_counters[first] > ref_counters[second]; - }); - return ret; - } - void release_interface(size_t index) { - ref_counters[index]--; - std::make_heap(std::begin(priorities), std::end(priorities), - [this](const size_t &first, const size_t &second) -> bool { - return ref_counters[first] > ref_counters[second]; - }); - } - bool interface_available(size_t load_limit) { - return ref_counters[priorities[0]] < load_limit; - } - size_t get_current_load() { return ref_counters[priorities[0]]; } - // TODO remove - /* size_t get_next_device_id() { */ - /* return 0; // single gpu pool */ - /* } */ -}; - -/// Access/Concurrency Control for stream pool implementation -class stream_pool { -public: - template - static void init(size_t number_of_streams, Ts ... executor_args) { - stream_pool_implementation::init(number_of_streams, - executor_args...); - } - template - static void init_all_executor_pools(size_t number_of_streams, Ts ... executor_args) { - stream_pool_implementation::init_all_executor_pools(number_of_streams, - executor_args...); - } - template - static void init_executor_pool(size_t pool_id, size_t number_of_streams, Ts ... executor_args) { - stream_pool_implementation::init_executor_pool(pool_id, number_of_streams, - executor_args...); - } - template static void cleanup() { - stream_pool_implementation::cleanup(); - } - template - static std::tuple get_interface(const size_t gpu_id) { - return stream_pool_implementation::get_interface(gpu_id); - } - template - static void release_interface(size_t index, const size_t gpu_id) noexcept { - stream_pool_implementation::release_interface(index, - gpu_id); - } - template - static bool interface_available(size_t load_limit, const size_t gpu_id) noexcept { - return stream_pool_implementation::interface_available( - load_limit, gpu_id); - } - template - static size_t get_current_load(const size_t gpu_id = 0) noexcept { - return stream_pool_implementation::get_current_load( - gpu_id); - } - template - static size_t get_next_device_id(const size_t number_gpus) noexcept { - // TODO add round robin and min strategy - return cppuddle::get_device_id(number_gpus); - } - - template - static void set_device_selector(std::function select_gpu_function) { - stream_pool_implementation::set_device_selector(select_gpu_function); - } - - template - static void select_device(size_t gpu_id) { - stream_pool_implementation::select_device(gpu_id); - } - -private: - stream_pool() = default; - -private: - template class stream_pool_implementation { - public: - /// Deprecated! Use init_on_all_gpu or init_on_gpu - template - static void init(size_t number_of_streams, Ts ... 
executor_args) { - /* static_assert(sizeof...(Ts) == sizeof...(Ts) && cppuddle::max_number_gpus == 1, */ - /* "deprecated stream_pool::init does not support multigpu"); */ - auto guard = detail::make_scoped_lock_from_array(instance().gpu_mutexes); - instance().streampools.emplace_back(number_of_streams, executor_args...); - assert(instance().streampools.size() <= cppuddle::max_number_gpus); - } - - /// Multi-GPU init where executors / interfaces on all GPUs are initialized with the same arguments - template - static void init_all_executor_pools(size_t number_of_streams, Ts ... executor_args) { - auto guard = detail::make_scoped_lock_from_array(instance().gpu_mutexes); - if (number_of_streams > 0) { - for (size_t gpu_id = 0; gpu_id < cppuddle::max_number_gpus; gpu_id++) { - instance().select_gpu_function(gpu_id); - instance().streampools.emplace_back(number_of_streams, - executor_args...); - } - } - assert(instance().streampools.size() <= cppuddle::max_number_gpus); - } - - /// Per-GPU init allowing for different init parameters depending on the GPU - /// (useful for executor that expect an GPU-id during construction) - template - static void init_executor_pool(size_t gpu_id, size_t number_of_streams, Ts ... executor_args) { - auto guard = detail::make_scoped_lock_from_array(instance().gpu_mutexes); - if (number_of_streams > 0) { - instance().select_gpu_function(gpu_id); - instance().streampools.emplace_back(number_of_streams, - executor_args...); - } - assert(instance().streampools.size() <= cppuddle::max_number_gpus); - } - - // TODO add/rename into finalize? - static void cleanup() { - auto guard = detail::make_scoped_lock_from_array(instance().gpu_mutexes); - assert(instance().streampools.size() == cppuddle::max_number_gpus); - instance().streampools.clear(); - } - - static std::tuple get_interface(const size_t gpu_id = 0) { - std::lock_guard guard(instance().gpu_mutexes[gpu_id]); - assert(gpu_id < instance().streampools.size()); - return instance().streampools[gpu_id].get_interface(); - } - static void release_interface(size_t index, const size_t gpu_id = 0) { - std::lock_guard guard(instance().gpu_mutexes[gpu_id]); - assert(gpu_id < instance().streampools.size()); - instance().streampools[gpu_id].release_interface(index); - } - static bool interface_available(size_t load_limit, const size_t gpu_id = 0) { - std::lock_guard guard(instance().gpu_mutexes[gpu_id]); - assert(gpu_id < instance().streampools.size()); - return instance().streampools[gpu_id].interface_available(load_limit); - } - static size_t get_current_load(const size_t gpu_id = 0) { - std::lock_guard guard(instance().gpu_mutexes[gpu_id]); - assert(gpu_id < instance().streampools.size()); - return instance().streampools[gpu_id].get_current_load(); - } - // TODO deprecated! Remove... 
- /* static size_t get_next_device_id(const size_t gpu_id = 0) { */ - /* std::lock_guard guard(instance().gpu_mutexes[gpu_id]); */ - /* assert(instance().streampools.size() == cppuddle::max_number_gpus); */ - /* return instance().streampools[gpu_id].get_next_device_id(); */ - /* } */ - - static void set_device_selector(std::function select_gpu_function) { - auto guard = detail::make_scoped_lock_from_array(instance().gpu_mutexes); - instance().select_gpu_function = select_gpu_function; - } - - static void select_device(size_t gpu_id) { - instance().select_gpu_function(gpu_id); - } - - private: - stream_pool_implementation() = default; - cppuddle::mutex_t pool_mut{}; - std::function select_gpu_function = [](size_t gpu_id) { - // By default no multi gpu support - assert(cppuddle::max_number_gpus == 1 || instance().streampools.size() == 1); - assert(gpu_id == 0); - }; - - std::deque streampools{}; - std::array gpu_mutexes; - - static stream_pool_implementation& instance(void) { - static stream_pool_implementation pool_instance{}; - return pool_instance; - } - - public: - ~stream_pool_implementation() = default; - // Bunch of constructors we don't need - stream_pool_implementation(stream_pool_implementation const &other) = - delete; - stream_pool_implementation & - operator=(stream_pool_implementation const &other) = delete; - stream_pool_implementation(stream_pool_implementation &&other) = delete; - stream_pool_implementation & - operator=(stream_pool_implementation &&other) = delete; - }; - -public: - ~stream_pool() = default; - // Bunch of constructors we don't need - stream_pool(stream_pool const &other) = delete; - stream_pool &operator=(stream_pool const &other) = delete; - stream_pool(stream_pool &&other) = delete; - stream_pool &operator=(stream_pool &&other) = delete; -}; - -#if defined(CPPUDDLE_DEACTIVATE_EXECUTOR_RECYCLING) - -// Warn about suboptimal performance without recycling -#pragma message \ -"Warning: Building without executor recycling! Use only for performance testing! \ -For better performance configure CPPuddle with CPPUDDLE_WITH_EXECUTOR_RECYCLING=ON!" - -/// Slow version of the stream_interface that does not draw its -/// executors (Interface) from the pool but creates them instead. -/// Only meant for performance comparisons and only works with cuda/kokkos executors -template class stream_interface { -public: - - template - explicit stream_interface(size_t gpu_id, - std::enable_if_t::value, size_t> = 0) - : gpu_id(gpu_id), interface(gpu_id) {} - template - explicit stream_interface(std::enable_if_t::value, size_t> = 0) - : gpu_id(gpu_id), interface(hpx::kokkos::execution_space_mode::independent) {} - - stream_interface(const stream_interface &other) = delete; - stream_interface &operator=(const stream_interface &other) = delete; - stream_interface(stream_interface &&other) = delete; - stream_interface &operator=(stream_interface &&other) = delete; - ~stream_interface() { - } - - template - inline decltype(auto) post(F &&f, Ts &&... ts) { - return interface.post(std::forward(f), std::forward(ts)...); - } - - template - inline decltype(auto) async_execute(F &&f, Ts &&... 
ts) { - return interface.async_execute(std::forward(f), std::forward(ts)...); - } - - inline decltype(auto) get_future() { - return interface.get_future(); - } - - // allow implict conversion - operator Interface &() { // NOLINT - return interface; - } - -private: - size_t gpu_id; - -public: - Interface interface; -}; -#else -/// Stream interface for RAII purposes -/// Draws executor from the stream pool and releases it upon -/// destruction -template class stream_interface { -public: - explicit stream_interface(size_t gpu_id) - : t(stream_pool::get_interface(gpu_id)), - interface(std::get<0>(t)), interface_index(std::get<1>(t)), gpu_id(gpu_id) {} - - stream_interface(const stream_interface &other) = delete; - stream_interface &operator=(const stream_interface &other) = delete; - stream_interface(stream_interface &&other) = delete; - stream_interface &operator=(stream_interface &&other) = delete; - ~stream_interface() { - stream_pool::release_interface(interface_index, gpu_id); - } - - template - inline decltype(auto) post(F &&f, Ts &&... ts) { - return interface.post(std::forward(f), std::forward(ts)...); - } - - template - inline decltype(auto) async_execute(F &&f, Ts &&... ts) { - return interface.async_execute(std::forward(f), std::forward(ts)...); - } - - inline decltype(auto) get_future() { - return interface.get_future(); - } - - // allow implict conversion - operator Interface &() { // NOLINT - return interface; - } - -private: - std::tuple t; - size_t interface_index; - size_t gpu_id; - -public: - Interface &interface; -}; -#endif - -/* } // namespace executor_recycling */ -/* } // namespace cppuddle */ +#include "cppuddle/executor_recycling/executor_pools_management.hpp" + +template +using round_robin_pool + [[deprecated("Use cppuddle::executor_recycling::round_robin_pool_impl from " + "header executor_pools_management.hpp instead")]] = + cppuddle::executor_recycling::round_robin_pool_impl; + +template +using priority_pool + [[deprecated("Use cppuddle::executor_recycling::priority_pool_impl from " + "header executor_pools_management.hpp instead")]] = + cppuddle::executor_recycling::priority_pool_impl; + +using stream_pool + [[deprecated("Use cppuddle::executor_recycling::executor_pool from " + "header executor_pools_management.hpp instead")]] = + cppuddle::executor_recycling::executor_pool; + +template +using stream_interface + [[deprecated("Use cppuddle::executor_recycling::executor_interface from " + "header executor_pools_management.hpp instead")]] = + cppuddle::executor_recycling::executor_interface; #endif diff --git a/include/sycl_buffer_util.hpp b/include/sycl_buffer_util.hpp index 7c88a4df..ad64a9dc 100644 --- a/include/sycl_buffer_util.hpp +++ b/include/sycl_buffer_util.hpp @@ -1,4 +1,4 @@ -// Copyright (c: 2020-2021 Gregor Daiß +// Copyright (c: 2024 Gregor Daiß // // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) diff --git a/tests/stream_test.cpp b/tests/stream_test.cpp index 96599759..2e3ebf4c 100644 --- a/tests/stream_test.cpp +++ b/tests/stream_test.cpp @@ -1,10 +1,9 @@ -// Copyright (c) 2020-2021 Gregor Daiß +// Copyright (c) 2020-2024 Gregor Daiß // // Distributed under the Boost Software License, Version 1.0. 
(See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) #define USE_HPX_MAIN -#include "../include/stream_manager.hpp" #include #ifdef USE_HPX_MAIN #include @@ -26,46 +25,49 @@ int main(int argc, char *argv[]) { #endif std::cout << "Starting ref counting tests ..." << std::endl; test_pool_ref_counting>( - 2, 0, false); - test_pool_ref_counting< - hpx::cuda::experimental::cuda_executor, - round_robin_pool>(2, 0, false); + cppuddle::executor_recycling::priority_pool_impl< + hpx::cuda::experimental::cuda_executor>>(2, 0, + false); + test_pool_ref_counting>(2, 0, + false); std::cout << "Finished ref counting tests!" << std::endl; - std::cout << "Starting wrapper objects tests ..." << std::endl; test_pool_wrappers>( - 2, 0, false); + cppuddle::executor_recycling::priority_pool_impl< + hpx::cuda::experimental::cuda_executor>>(2, 0, false); test_pool_wrappers>( - 2, 0, false); + cppuddle::executor_recycling::round_robin_pool_impl< + hpx::cuda::experimental::cuda_executor>>(2, 0, false); std::cout << "Finished wrapper objects tests!" << std::endl; std::cout << "Starting memcpy tests... " << std::endl; test_pool_memcpy>( - 2, 0, false); + cppuddle::executor_recycling::round_robin_pool_impl< + hpx::cuda::experimental::cuda_executor>>(2, 0, false); test_pool_memcpy>( - 2, 0, false); + cppuddle::executor_recycling::priority_pool_impl< + hpx::cuda::experimental::cuda_executor>>(2, 0, false); std::cout << "Finished memcpy tests! " << std::endl; std::cout << "Starting memcpy polling tests... " << std::endl; { // hpx::cuda::experimental::enable_user_polling polling_scope; - hpx::cuda::experimental::detail::register_polling(hpx::resource::get_thread_pool(0)); + hpx::cuda::experimental::detail::register_polling( + hpx::resource::get_thread_pool(0)); test_pool_memcpy>( - 2, 0, true); + cppuddle::executor_recycling::round_robin_pool_impl< + hpx::cuda::experimental::cuda_executor>>(2, 0, true); test_pool_memcpy>( - 2, 0, true); - hpx::cuda::experimental::detail::unregister_polling(hpx::resource::get_thread_pool(0)); + cppuddle::executor_recycling::priority_pool_impl< + hpx::cuda::experimental::cuda_executor>>(2, 0, true); + hpx::cuda::experimental::detail::unregister_polling( + hpx::resource::get_thread_pool(0)); } - recycler::force_cleanup(); + cppuddle::memory_recycling::force_buffer_cleanup(); std::cout << "Finished memcpy tests! " << std::endl; return hpx::finalize(); } diff --git a/tests/stream_test.hpp b/tests/stream_test.hpp index 2cfc5b07..1dfa60db 100644 --- a/tests/stream_test.hpp +++ b/tests/stream_test.hpp @@ -1,4 +1,4 @@ -// Copyright (c) 2020-2021 Gregor Daiß +// Copyright (c) 2020-2024 Gregor Daiß // // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) @@ -10,122 +10,180 @@ #include #include #include "cppuddle/memory_recycling/cuda_recycling_allocators.hpp" -#include "cuda_buffer_util.hpp" +#include "cppuddle/executor_recycling/executor_pools_management.hpp" template -void test_pool_memcpy(const size_t stream_parameter, Ts &&... 
ts) { - std::vector> hostbuffer( - 512); - recycler::cuda_device_buffer devicebuffer(512); - stream_pool::init(stream_parameter, std::forward(ts)...); +void test_pool_memcpy(const size_t executor_parameter, Ts &&...ts) { + std::vector> + hostbuffer(512); + cppuddle::memory_recycling::cuda_device_buffer devicebuffer(512); + cppuddle::executor_recycling::executor_pool::init( + executor_parameter, std::forward(ts)...); // without interface wrapper { - auto test1 = stream_pool::get_interface(0); + auto test1 = + cppuddle::executor_recycling::executor_pool::get_interface(0); Interface test1_interface = std::get<0>(test1); size_t interface_id = std::get<1>(test1); - hpx::apply(test1_interface, cudaMemcpyAsync, devicebuffer.device_side_buffer, - hostbuffer.data(), 512 * sizeof(double), - cudaMemcpyHostToDevice); - auto fut1 = hpx::async(test1_interface, - cudaMemcpyAsync, hostbuffer.data(), devicebuffer.device_side_buffer, - 512 * sizeof(double), cudaMemcpyDeviceToHost); + hpx::apply(test1_interface, cudaMemcpyAsync, + devicebuffer.device_side_buffer, hostbuffer.data(), + 512 * sizeof(double), cudaMemcpyHostToDevice); + auto fut1 = hpx::async(test1_interface, cudaMemcpyAsync, hostbuffer.data(), + devicebuffer.device_side_buffer, + 512 * sizeof(double), cudaMemcpyDeviceToHost); fut1.get(); - stream_pool::release_interface(interface_id, 0); + cppuddle::executor_recycling::executor_pool::release_interface( + interface_id, 0); } // with interface wrapper { - stream_interface test1_interface{0}; + cppuddle::executor_recycling::executor_interface + test1_interface{0}; // hpx::cuda::cuda_executor test1_interface(0, false); - hpx::apply(test1_interface.interface, cudaMemcpyAsync, devicebuffer.device_side_buffer, - hostbuffer.data(), 512 * sizeof(double), - cudaMemcpyHostToDevice); - auto fut1 = hpx::async(test1_interface.interface, - cudaMemcpyAsync, hostbuffer.data(), devicebuffer.device_side_buffer, - 512 * sizeof(double), cudaMemcpyDeviceToHost); + hpx::apply(test1_interface.interface, cudaMemcpyAsync, + devicebuffer.device_side_buffer, hostbuffer.data(), + 512 * sizeof(double), cudaMemcpyHostToDevice); + auto fut1 = hpx::async(test1_interface.interface, cudaMemcpyAsync, + hostbuffer.data(), devicebuffer.device_side_buffer, + 512 * sizeof(double), cudaMemcpyDeviceToHost); fut1.get(); } - stream_pool::cleanup(); + cppuddle::executor_recycling::executor_pool::cleanup(); } template -void test_pool_ref_counting(const size_t stream_parameter, Ts &&... 
ts) { +void test_pool_ref_counting(const size_t executor_parameter, Ts &&...ts) { // init ppol - stream_pool::init(stream_parameter, std::forward(ts)...); + cppuddle::executor_recycling::executor_pool::init( + executor_parameter, std::forward(ts)...); { // Allocating - auto test1 = stream_pool::get_interface(0); - auto load1 = stream_pool::get_current_load(0); + auto test1 = + cppuddle::executor_recycling::executor_pool::get_interface(0); + auto load1 = + cppuddle::executor_recycling::executor_pool::get_current_load(0); assert(load1 == 0); Interface test1_interface = std::get<0>(test1); size_t test1_index = std::get<1>(test1); - auto test2 = stream_pool::get_interface(0); - auto load2 = stream_pool::get_current_load(0); + auto test2 = + cppuddle::executor_recycling::executor_pool::get_interface(0); + auto load2 = + cppuddle::executor_recycling::executor_pool::get_current_load(0); assert(load2 == 1); Interface test2_interface = std::get<0>(test2); // auto fut = test2_interface.get_future(); size_t test2_index = std::get<1>(test2); - auto test3 = stream_pool::get_interface(0); - auto load3 = stream_pool::get_current_load(0); + auto test3 = + cppuddle::executor_recycling::executor_pool::get_interface(0); + auto load3 = + cppuddle::executor_recycling::executor_pool::get_current_load(0); assert(load3 == 1); Interface test3_interface = std::get<0>(test3); size_t test3_index = std::get<1>(test3); - auto test4 = stream_pool::get_interface(0); - auto load4 = stream_pool::get_current_load(0); + auto test4 = + cppuddle::executor_recycling::executor_pool::get_interface(0); + auto load4 = + cppuddle::executor_recycling::executor_pool::get_current_load(0); Interface test4_interface = std::get<0>(test4); size_t test4_index = std::get<1>(test4); assert(load4 == 2); // Releasing - stream_pool::release_interface(test4_index, 0); - load4 = stream_pool::get_current_load(0); + cppuddle::executor_recycling::executor_pool::release_interface( + test4_index, 0); + load4 = + cppuddle::executor_recycling::executor_pool::get_current_load(0); assert(load4 == 1); - stream_pool::release_interface(test3_index, 0); - load3 = stream_pool::get_current_load(0); + cppuddle::executor_recycling::executor_pool::release_interface( + test3_index, 0); + load3 = + cppuddle::executor_recycling::executor_pool::get_current_load(0); assert(load3 == 1); - stream_pool::release_interface(test2_index, 0); - load2 = stream_pool::get_current_load(0); + cppuddle::executor_recycling::executor_pool::release_interface( + test2_index, 0); + load2 = + cppuddle::executor_recycling::executor_pool::get_current_load(0); assert(load2 == 0); - stream_pool::release_interface(test1_index, 0); - load1 = stream_pool::get_current_load(0); + cppuddle::executor_recycling::executor_pool::release_interface( + test1_index, 0); + load1 = + cppuddle::executor_recycling::executor_pool::get_current_load(0); assert(load1 == 0); } // Clear - auto load0 = stream_pool::get_current_load(0); + auto load0 = + cppuddle::executor_recycling::executor_pool::get_current_load(0); assert(load0 == 0); - stream_pool::cleanup(); + cppuddle::executor_recycling::executor_pool::cleanup(); } template -void test_pool_wrappers(const size_t stream_parameter, Ts &&... 
ts) { - using wrapper_type = stream_interface; +void test_pool_wrappers(const size_t executor_parameter, Ts &&...ts) { + using wrapper_type = + cppuddle::executor_recycling::executor_interface; // init ppol - stream_pool::init(stream_parameter, std::forward(ts)...); + cppuddle::executor_recycling::executor_pool::init( + executor_parameter, std::forward(ts)...); { wrapper_type test1{0}; - auto load = stream_pool::get_current_load(0); + auto load = + cppuddle::executor_recycling::executor_pool::get_current_load(0); assert(load == 0); wrapper_type test2{0}; - load = stream_pool::get_current_load(0); + load = + cppuddle::executor_recycling::executor_pool::get_current_load(0); // auto fut = test2.get_future(); assert(load == 1); wrapper_type test3{0}; - load = stream_pool::get_current_load(0); + load = + cppuddle::executor_recycling::executor_pool::get_current_load(0); assert(load == 1); wrapper_type test4{0}; - load = stream_pool::get_current_load(0); + load = + cppuddle::executor_recycling::executor_pool::get_current_load(0); assert(load == 2); // Check availability method: - bool avail = stream_pool::interface_available(1, 0); + bool avail = + cppuddle::executor_recycling::executor_pool::interface_available< + Interface, Pool>(1, 0); assert(avail == false); // NOLINT - avail = stream_pool::interface_available(2, 0); + avail = cppuddle::executor_recycling::executor_pool::interface_available< + Interface, Pool>(2, 0); assert(avail == false); // NOLINT - avail = stream_pool::interface_available(3, 0); + avail = cppuddle::executor_recycling::executor_pool::interface_available< + Interface, Pool>(3, 0); assert(avail == true); // NOLINT } - auto load0 = stream_pool::get_current_load(0); + auto load0 = + cppuddle::executor_recycling::executor_pool::get_current_load(0); assert(load0 == 0); - stream_pool::cleanup(); + cppuddle::executor_recycling::executor_pool::cleanup(); } #endif From 3f88250b69a1c979312d1bc11e7252438718ee14 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20Dai=C3=9F?= Date: Fri, 8 Mar 2024 12:01:16 +0100 Subject: [PATCH 09/19] Move pool implementation into details --- .../executor_pools_management.hpp | 14 ++++---- .../executor_pools_interface.hpp | 32 +++++++++++++++++++ .../hip_recycling_allocators.hpp | 2 +- .../sycl_recycling_allocators.hpp | 2 +- include/hip_buffer_util.hpp | 2 +- include/stream_manager.hpp | 2 +- include/sycl_buffer_util.hpp | 2 +- tests/stream_test.hpp | 2 +- 8 files changed, 45 insertions(+), 13 deletions(-) rename include/cppuddle/executor_recycling/{ => detail}/executor_pools_management.hpp (97%) create mode 100644 include/cppuddle/executor_recycling/executor_pools_interface.hpp diff --git a/include/cppuddle/executor_recycling/executor_pools_management.hpp b/include/cppuddle/executor_recycling/detail/executor_pools_management.hpp similarity index 97% rename from include/cppuddle/executor_recycling/executor_pools_management.hpp rename to include/cppuddle/executor_recycling/detail/executor_pools_management.hpp index 16776031..6a89025b 100644 --- a/include/cppuddle/executor_recycling/executor_pools_management.hpp +++ b/include/cppuddle/executor_recycling/detail/executor_pools_management.hpp @@ -41,8 +41,8 @@ enum class execution_space_mode { global, independent }; namespace cppuddle { namespace executor_recycling { - namespace detail { + /// Turns a std::array_mutex into an scoped lock template auto make_scoped_lock_from_array(mutex_array_t& mutexes) @@ -50,7 +50,6 @@ auto make_scoped_lock_from_array(mutex_array_t& mutexes) return std::apply([](auto&... 
mutexes) { return std::scoped_lock{mutexes...}; }, mutexes); } -} // namespace detail template class round_robin_pool_impl { private: @@ -202,7 +201,7 @@ class executor_pool { static void init(size_t number_of_executors, Ts ... executor_args) { /* static_assert(sizeof...(Ts) == sizeof...(Ts) && cppuddle::max_number_gpus == 1, */ /* "deprecated executor_pool::init does not support multigpu"); */ - auto guard = detail::make_scoped_lock_from_array(instance().gpu_mutexes); + auto guard = make_scoped_lock_from_array(instance().gpu_mutexes); instance().executorpools.emplace_back(number_of_executors, executor_args...); assert(instance().executorpools.size() <= cppuddle::max_number_gpus); } @@ -210,7 +209,7 @@ class executor_pool { /// Multi-GPU init where executors / interfaces on all GPUs are initialized with the same arguments template static void init_all_executor_pools(size_t number_of_executors, Ts ... executor_args) { - auto guard = detail::make_scoped_lock_from_array(instance().gpu_mutexes); + auto guard = make_scoped_lock_from_array(instance().gpu_mutexes); if (number_of_executors > 0) { for (size_t gpu_id = 0; gpu_id < cppuddle::max_number_gpus; gpu_id++) { instance().select_gpu_function(gpu_id); @@ -225,7 +224,7 @@ class executor_pool { /// (useful for executor that expect an GPU-id during construction) template static void init_executor_pool(size_t gpu_id, size_t number_of_executors, Ts ... executor_args) { - auto guard = detail::make_scoped_lock_from_array(instance().gpu_mutexes); + auto guard = make_scoped_lock_from_array(instance().gpu_mutexes); if (number_of_executors > 0) { instance().select_gpu_function(gpu_id); instance().executorpools.emplace_back(number_of_executors, @@ -236,7 +235,7 @@ class executor_pool { // TODO add/rename into finalize? static void cleanup() { - auto guard = detail::make_scoped_lock_from_array(instance().gpu_mutexes); + auto guard = make_scoped_lock_from_array(instance().gpu_mutexes); assert(instance().executorpools.size() == cppuddle::max_number_gpus); instance().executorpools.clear(); } @@ -269,7 +268,7 @@ class executor_pool { /* } */ static void set_device_selector(std::function select_gpu_function) { - auto guard = detail::make_scoped_lock_from_array(instance().gpu_mutexes); + auto guard = make_scoped_lock_from_array(instance().gpu_mutexes); instance().select_gpu_function = select_gpu_function; } @@ -415,6 +414,7 @@ template class executor_interface { }; #endif +} // namespace detail } // namespace executor_recycling } // namespace cppuddle diff --git a/include/cppuddle/executor_recycling/executor_pools_interface.hpp b/include/cppuddle/executor_recycling/executor_pools_interface.hpp new file mode 100644 index 00000000..dac9f170 --- /dev/null +++ b/include/cppuddle/executor_recycling/executor_pools_interface.hpp @@ -0,0 +1,32 @@ +// Copyright (c) 2024 Gregor Daiß +// +// Distributed under the Boost Software License, Version 1.0. 
(See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#ifndef EXECUTOR_POOLS_INTERFACE_HPP +#define EXECUTOR_POOLS_INTERFACE_HPP + +#include "cppuddle/executor_recycling/detail/executor_pools_management.hpp" + +namespace cppuddle { +namespace executor_recycling { + +template +using round_robin_pool_impl = + detail::round_robin_pool_impl; + +template +using priority_pool_impl = + detail::priority_pool_impl; + +using executor_pool = + detail::executor_pool; + +template +using executor_interface = + detail::executor_interface; + +} +} + +#endif diff --git a/include/cppuddle/memory_recycling/hip_recycling_allocators.hpp b/include/cppuddle/memory_recycling/hip_recycling_allocators.hpp index d4b2da3c..36432820 100644 --- a/include/cppuddle/memory_recycling/hip_recycling_allocators.hpp +++ b/include/cppuddle/memory_recycling/hip_recycling_allocators.hpp @@ -1,4 +1,4 @@ -// Copyright (c: 2020-2024 Gregor Daiß +// Copyright (c) 2020-2024 Gregor Daiß // // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) diff --git a/include/cppuddle/memory_recycling/sycl_recycling_allocators.hpp b/include/cppuddle/memory_recycling/sycl_recycling_allocators.hpp index 233afe71..7ea9999c 100644 --- a/include/cppuddle/memory_recycling/sycl_recycling_allocators.hpp +++ b/include/cppuddle/memory_recycling/sycl_recycling_allocators.hpp @@ -1,4 +1,4 @@ -// Copyright (c: 2020-2024 Gregor Daiß +// Copyright (c) 2020-2024 Gregor Daiß // // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) diff --git a/include/hip_buffer_util.hpp b/include/hip_buffer_util.hpp index 9bc8ccc3..3f0b3034 100644 --- a/include/hip_buffer_util.hpp +++ b/include/hip_buffer_util.hpp @@ -1,4 +1,4 @@ -// Copyright (c): 2024 Gregor Daiß +// Copyright (c) 2024 Gregor Daiß // // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) diff --git a/include/stream_manager.hpp b/include/stream_manager.hpp index 940620d5..25c4a080 100644 --- a/include/stream_manager.hpp +++ b/include/stream_manager.hpp @@ -6,7 +6,7 @@ #ifndef STREAM_MANAGER_HPP #define STREAM_MANAGER_HPP -#include "cppuddle/executor_recycling/executor_pools_management.hpp" +#include "cppuddle/executor_recycling/executor_pools_interface.hpp" template using round_robin_pool diff --git a/include/sycl_buffer_util.hpp b/include/sycl_buffer_util.hpp index ad64a9dc..7ce66d93 100644 --- a/include/sycl_buffer_util.hpp +++ b/include/sycl_buffer_util.hpp @@ -1,4 +1,4 @@ -// Copyright (c: 2024 Gregor Daiß +// Copyright (c) 2024 Gregor Daiß // // Distributed under the Boost Software License, Version 1.0. 
(See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) diff --git a/tests/stream_test.hpp b/tests/stream_test.hpp index 1dfa60db..b793fe9c 100644 --- a/tests/stream_test.hpp +++ b/tests/stream_test.hpp @@ -10,7 +10,7 @@ #include #include #include "cppuddle/memory_recycling/cuda_recycling_allocators.hpp" -#include "cppuddle/executor_recycling/executor_pools_management.hpp"" +#include "cppuddle/executor_recycling/executor_pools_interface.hpp"" template void test_pool_memcpy(const size_t executor_parameter, Ts &&...ts) { From 5740925f7d9b785ba9a500b0f08f58abd0fb8557 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20Dai=C3=9F?= Date: Fri, 8 Mar 2024 15:55:08 +0100 Subject: [PATCH 10/19] Move aggregation functionality into namespace --- include/aggregation_manager.hpp | 1154 +--------------- .../executor_pools_interface.hpp | 4 +- .../kernel_aggregation_management.hpp | 1161 +++++++++++++++++ 3 files changed, 1177 insertions(+), 1142 deletions(-) create mode 100644 include/cppuddle/kernel_aggregation/kernel_aggregation_management.hpp diff --git a/include/aggregation_manager.hpp b/include/aggregation_manager.hpp index 70acfd61..030150f9 100644 --- a/include/aggregation_manager.hpp +++ b/include/aggregation_manager.hpp @@ -1,1153 +1,27 @@ -// Copyright (c) 2022-2024 Gregor Daiß +// Copyright (c) 2024 Gregor Daiß // // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) -#ifndef WORK_AGGREGATION_MANAGER -#define WORK_AGGREGATION_MANAGER +#ifndef AGGREGATION_MANAGER_HPP +#define AGGREGATION_MANAGER_HPP -#ifndef CPPUDDLE_HAVE_HPX -#error "Work aggregation allocators/executors require CPPUDDLE_WITH_HPX=ON" -#endif - -#include -//#define DEBUG_AGGREGATION_CALLS 1 - -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include - -#if defined(HPX_HAVE_CUDA) || defined(HPX_HAVE_HIP) -// required for defining type traits using cuda executor as underlying -// aggregation executors -#include -#endif - -#include -#include - -#include "../include/buffer_manager.hpp" -#include "../include/stream_manager.hpp" -#include "cppuddle/common/config.hpp" - -#ifndef CPPUDDLE_HAVE_HPX_MUTEX -#pragma message \ - "Work aggregation will use hpx::mutex internally, despite CPPUDDLE_WITH_HPX_MUTEX=OFF" -#pragma message \ - "Consider using CPPUDDLE_WITH_HPX_MUTEX=ON, to make the rest of CPPuddle also use hpx::mutex" -#endif -namespace cppuddle { - using aggregation_mutex_t = hpx::mutex; -} - -//=============================================================================== -//=============================================================================== -// Helper functions/classes - -/// Constructs a tuple with copies (to store temporaries in aggregated function -/// calls) yet also supporting references (on the users own risk...) 
-template -std::tuple make_tuple_supporting_references(Ts &&...ts) { - return std::tuple{std::forward(ts)...}; -} - -/// Print some specific values that we can, but don't bother for most types -/// (such as vector) -template std::string print_if_possible(T val) { - if constexpr (std::is_convertible_v) { - return val; - } else if constexpr (std::is_integral_v || std::is_floating_point_v) { - return std::to_string(val); - } else if constexpr (std::is_pointer_v) { - // Pretty printing pointer sort of only works well with %p - // TODO Try using std::format as soon as we can move to C++20 - std::unique_ptr debug_string(new char[128]()); - snprintf(debug_string.get(), 128, "%p", val); - return std::string(debug_string.get()); - } else { - return std::string("cannot print value"); - } -} - -/// Helper class for the helper class that prints tuples -- do not use this -/// directly -template -void print_tuple(const TupType &_tup, std::index_sequence) { - (..., (hpx::cout << (I == 0 ? "" : ", ") - << print_if_possible(std::get(_tup)))); -} - -/// Helper class for printing tuples (first component should be a function -/// pointer, remaining components the function arguments) -template void print_tuple(const std::tuple &_tup) { - // Use pointer and sprintf as boost::format refused to NOT cast the pointer - // address to 1... - // TODO Try using std::format as soon as we can move to C++20 - std::unique_ptr debug_string(new char[128]()); - snprintf(debug_string.get(), 128, "Function address: %p -- Arguments: (", - std::get<0>(_tup)); - hpx::cout << debug_string.get(); - print_tuple(_tup, std::make_index_sequence()); - hpx::cout << ")"; -} - -//=============================================================================== -//=============================================================================== -template -void exec_post_wrapper(Executor & exec, F &&f, Ts &&...ts) { - hpx::apply(exec, std::forward(f), std::forward(ts)...); -} - -template -hpx::lcos::future exec_async_wrapper(Executor & exec, F &&f, Ts &&...ts) { - return hpx::async(exec, std::forward(f), std::forward(ts)...); -} - -/// Manages the launch conditions for aggregated function calls -/// type/value-errors -/** Launch conditions: All slice executors must have called the same function - * (tracked by future all_slices_ready) - * AND - * Previous aggregated_function_call on the same Executor must have been - * launched (tracked by future stream_future) - * All function calls received from the slice executors are checked if they - * match the first one in both types and values (throws exception otherwise) - */ - -template class aggregated_function_call { -private: - std::atomic slice_counter = 0; - - /// Promise to be set when all slices have visited this function call - /* hpx::lcos::local::promise slices_ready_promise; */ - /// Tracks if all slices have visited this function call - /* hpx::lcos::future all_slices_ready = slices_ready_promise.get_future(); */ - /// How many slices can we expect? - const size_t number_slices; - const bool async_mode; - - Executor &underlying_executor; - -#if !(defined(NDEBUG)) && defined(DEBUG_AGGREGATION_CALLS) -#pragma message \ - "Building slow work aggegator build with additional runtime checks! Build with NDEBUG defined for fast build..." 
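// Note on when the checks in this block are active (not taken from this patch):
// the mismatch detection is only compiled if the translation unit defines
// DEBUG_AGGREGATION_CALLS (see the commented-out define near the top of this
// header) and NDEBUG is not set; optimized release builds skip it entirely.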
- /// Stores the function call of the first slice as reference for error - /// checking - std::any function_tuple; - /// Stores the string of the first function call for debug output - std::string debug_type_information; - cppuddle::aggregation_mutex_t debug_mut; -#endif +#include "cppuddle/kernel_aggregation/kernel_aggregation_management.hpp" - std::vector> potential_async_promises{}; +using Aggregated_Executor_Modes = + cppuddle::kernel_aggregation::aggregated_executor_modes; -public: - aggregated_function_call(const size_t number_slices, bool async_mode, Executor &exec) - : number_slices(number_slices), async_mode(async_mode), underlying_executor(exec) { - if (async_mode) - potential_async_promises.resize(number_slices); - } - ~aggregated_function_call(void) { - // All slices should have done this call - assert(slice_counter == number_slices); - // assert(!all_slices_ready.valid()); - } - /// Returns true if all required slices have visited this point - bool sync_aggregation_slices(hpx::lcos::future &stream_future) { - assert(!async_mode); - assert(potential_async_promises.empty()); - const size_t local_counter = slice_counter++; - if (local_counter == number_slices - 1) { - return true; - } - else return false; - } - template - void post_when(hpx::lcos::future &stream_future, F &&f, Ts &&...ts) { -#if !(defined(NDEBUG)) && defined(DEBUG_AGGREGATION_CALLS) - // needed for concurrent access to function_tuple and debug_type_information - // Not required for normal use - std::lock_guard guard(debug_mut); -#endif - assert(!async_mode); - assert(potential_async_promises.empty()); - const size_t local_counter = slice_counter++; - - if (local_counter == 0) { -#if !(defined(NDEBUG)) && defined(DEBUG_AGGREGATION_CALLS) - auto tmp_tuple = - make_tuple_supporting_references(f, std::forward(ts)...); - function_tuple = tmp_tuple; - debug_type_information = typeid(decltype(tmp_tuple)).name(); -#endif - - } else { - // - // This scope checks if both the type and the values of the current call - // match the original call To be used in debug build... 
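// Illustrative failure case, not taken from this patch (kernel name and sizes
// are placeholders): if one slice posts my_kernel with a length of 512 while
// another slice of the same aggregated call posts it with 1024, the value
// comparison below fails and a "Mismatched values error" is reported; if the
// argument types differ instead, the std::any_cast fails first and a
// "Mismatched types error" is reported.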
- // -#if !(defined(NDEBUG)) && defined(DEBUG_AGGREGATION_CALLS) - auto comparison_tuple = - make_tuple_supporting_references(f, std::forward(ts)...); - try { - auto orig_call_tuple = - std::any_cast(function_tuple); - if (comparison_tuple != orig_call_tuple) { - throw std::runtime_error( - "Values of post function arguments (or function " - "itself) do not match "); - } - } catch (const std::bad_any_cast &e) { - hpx::cout - << "\nMismatched types error in aggregated post call of executor " - << ": " << e.what() << "\n"; - hpx::cout << "Expected types:\t\t " - << boost::core::demangle(debug_type_information.c_str()); - hpx::cout << "\nGot types:\t\t " - << boost::core::demangle( - typeid(decltype(comparison_tuple)).name()) - << "\n" - << std::endl; - // throw; - } catch (const std::runtime_error &e) { - hpx::cout - << "\nMismatched values error in aggregated post call of executor " - << ": " << e.what() << std::endl; - hpx::cout << "Types (matched):\t " - << boost::core::demangle(debug_type_information.c_str()); - auto orig_call_tuple = - std::any_cast(function_tuple); - hpx::cout << "\nExpected values:\t "; - print_tuple(orig_call_tuple); - hpx::cout << "\nGot values:\t\t "; - print_tuple(comparison_tuple); - hpx::cout << std::endl << std::endl; - // throw; - } -#endif - } - assert(local_counter < number_slices); - assert(slice_counter < number_slices + 1); - // Check exit criteria: Launch function call continuation by setting the - // slices promise - if (local_counter == number_slices - 1) { - exec_post_wrapper(underlying_executor, std::forward(f), std::forward(ts)...); - //slices_ready_promise.set_value(); - } - } - template - hpx::lcos::future async_when(hpx::lcos::future &stream_future, - F &&f, Ts &&...ts) { -#if !(defined(NDEBUG)) && defined(DEBUG_AGGREGATION_CALLS) - // needed for concurrent access to function_tuple and debug_type_information - // Not required for normal use - std::lock_guard guard(debug_mut); -#endif - assert(async_mode); - assert(!potential_async_promises.empty()); - const size_t local_counter = slice_counter++; - if (local_counter == 0) { -#if !(defined(NDEBUG)) && defined(DEBUG_AGGREGATION_CALLS) - auto tmp_tuple = - make_tuple_supporting_references(f, std::forward(ts)...); - function_tuple = tmp_tuple; - debug_type_information = typeid(decltype(tmp_tuple)).name(); -#endif - } else { - // - // This scope checks if both the type and the values of the current call - // match the original call To be used in debug build... 
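// Note, not taken from this patch: unlike post_when() above, every slice that
// calls async_when() receives its own future from potential_async_promises;
// all of those futures become ready together once the single launch issued by
// the last arriving slice has completed.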
- // -#if !(defined(NDEBUG)) && defined(DEBUG_AGGREGATION_CALLS) - auto comparison_tuple = - make_tuple_supporting_references(f, std::forward(ts)...); - try { - auto orig_call_tuple = - std::any_cast(function_tuple); - if (comparison_tuple != orig_call_tuple) { - throw std::runtime_error( - "Values of async function arguments (or function " - "itself) do not match "); - } - } catch (const std::bad_any_cast &e) { - hpx::cout - << "\nMismatched types error in aggregated async call of executor " - << ": " << e.what() << "\n"; - hpx::cout << "Expected types:\t\t " - << boost::core::demangle(debug_type_information.c_str()); - hpx::cout << "\nGot types:\t\t " - << boost::core::demangle( - typeid(decltype(comparison_tuple)).name()) - << "\n" - << std::endl; - // throw; - } catch (const std::runtime_error &e) { - hpx::cout - << "\nMismatched values error in aggregated async call of executor " - << ": " << e.what() << std::endl; - hpx::cout << "Types (matched):\t " - << boost::core::demangle(debug_type_information.c_str()); - auto orig_call_tuple = - std::any_cast(function_tuple); - hpx::cout << "\nExpected values:\t "; - print_tuple(orig_call_tuple); - hpx::cout << "\nGot values:\t\t "; - print_tuple(comparison_tuple); - hpx::cout << std::endl << std::endl; - // throw; - } -#endif - } - assert(local_counter < number_slices); - assert(slice_counter < number_slices + 1); - assert(potential_async_promises.size() == number_slices); - hpx::lcos::future ret_fut = - potential_async_promises[local_counter].get_future(); - if (local_counter == number_slices - 1) { - /* slices_ready_promise.set_value(); */ - auto fut = exec_async_wrapper( - underlying_executor, std::forward(f), std::forward(ts)...); - fut.then([this](auto &&fut) { - for (auto &promise : potential_async_promises) { - promise.set_value(); - } - }); - } - // Check exit criteria: Launch function call continuation by setting the - // slices promise - return ret_fut; - } - template - hpx::lcos::shared_future wrap_async(hpx::lcos::future &stream_future, - F &&f, Ts &&...ts) { - assert(async_mode); - assert(!potential_async_promises.empty()); - const size_t local_counter = slice_counter++; - assert(local_counter < number_slices); - assert(slice_counter < number_slices + 1); - assert(potential_async_promises.size() == number_slices); - hpx::lcos::shared_future ret_fut = - potential_async_promises[local_counter].get_shared_future(); - if (local_counter == number_slices - 1) { - auto fut = f(std::forward(ts)...); - fut.then([this](auto &&fut) { - // TODO just use one promise - for (auto &promise : potential_async_promises) { - promise.set_value(); - } - }); - } - return ret_fut; - } - // We need to be able to copy or no-except move for std::vector.. - aggregated_function_call(const aggregated_function_call &other) = default; - aggregated_function_call & - operator=(const aggregated_function_call &other) = default; - aggregated_function_call(aggregated_function_call &&other) = default; - aggregated_function_call & - operator=(aggregated_function_call &&other) = default; -}; - -//=============================================================================== -//=============================================================================== - -enum class Aggregated_Executor_Modes { EAGER = 1, STRICT, ENDLESS }; -/// Declaration since the actual allocator is only defined after the Executors template -class Allocator_Slice; - -/// Executor Class that aggregates function calls for specific kernels -/** Executor is not meant to be used directly. 
Instead it yields multiple - * Executor_Slice objects. These serve as interfaces. Slices from the same - * Aggregated_Executor are meant to execute the same function calls but on - * different data (i.e. different tasks) - */ -template class Aggregated_Executor { -private: - //=============================================================================== - // Misc private avariables: - // - std::atomic slices_exhausted; - - std::atomic executor_slices_alive; - std::atomic buffers_in_use; - std::atomic dealloc_counter; - - const Aggregated_Executor_Modes mode; - const size_t max_slices; - std::atomic current_slices; - /// Wrapper to the executor interface from the stream pool - /// Automatically hooks into the stream_pools reference counting - /// for cpu/gpu load balancing - std::unique_ptr>> executor_wrapper; - -public: - size_t gpu_id; - // Subclasses - - /// Slice class - meant as a scope interface to the aggregated executor - class Executor_Slice { - public: - Aggregated_Executor &parent; - private: - /// Executor is a slice of this aggregated_executor - /// How many functions have been called - required to enforce sequential - /// behaviour of kernel launches - size_t launch_counter{0}; - size_t buffer_counter{0}; - bool notify_parent_about_destruction{true}; - - public: - /// How many slices are there overall - required to check the launch - /// criteria - const size_t number_slices; - const size_t id; - using executor_t = Executor; - Executor_Slice(Aggregated_Executor &parent, const size_t slice_id, - const size_t number_slices) - : parent(parent), notify_parent_about_destruction(true), - number_slices(number_slices), id(slice_id) { - } - ~Executor_Slice(void) { - // Don't notify parent if we moved away from this executor_slice - if (notify_parent_about_destruction) { - // Executor should be done by the time of destruction - // -> check here before notifying parent - - // parent still in execution mode? - assert(parent.slices_exhausted == true); - // all kernel launches done? 
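// ---------------------------------------------------------------------------
// Illustrative usage sketch, not taken from this patch: how calling code is
// meant to drive the slice interface described above. The CUDA executor, the
// std::allocator used as host allocator, and the buffer size are placeholder
// choices; request_executor_slice(), make_allocator(), async() and the mode
// enum come from this file, and the executor pool is assumed to have been
// initialized beforehand as in stream_test.hpp further up. Production code
// would usually obtain a slice via aggregation_pool<...>::request_executor_slice()
// instead of holding an Aggregated_Executor directly. Error handling omitted.
using executor_t = hpx::cuda::experimental::cuda_executor;
Aggregated_Executor<executor_t> agg_exec{
    /*number_slices=*/4, Aggregated_Executor_Modes::EAGER, /*gpu_id=*/0};
auto slice_fut = agg_exec.request_executor_slice(); // optional<future<Executor_Slice>>
if (slice_fut) {
  auto slice = slice_fut->get(); // ready once the launch condition is met
  auto alloc = slice.make_allocator<double, std::allocator<double>>();
  std::vector<double, decltype(alloc)> host_buf(512, 0.0, alloc);
  cppuddle::memory_recycling::cuda_device_buffer<double> device_buf(512);
  // Every slice issues the same call; only the last arriving slice triggers
  // the actual cudaMemcpyAsync on the shared underlying executor.
  auto done = slice.async(cudaMemcpyAsync, device_buf.device_side_buffer,
                          host_buf.data(), 512 * sizeof(double),
                          cudaMemcpyHostToDevice);
  done.get();
}
// ---------------------------------------------------------------------------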
- assert(launch_counter == parent.function_calls.size()); - // Notifiy parent that this aggregation slice is one - parent.reduce_usage_counter(); - } - } - Executor_Slice(const Executor_Slice &other) = delete; - Executor_Slice &operator=(const Executor_Slice &other) = delete; - Executor_Slice(Executor_Slice &&other) - : parent(other.parent), launch_counter(std::move(other.launch_counter)), - buffer_counter(std::move(other.buffer_counter)), - number_slices(std::move(other.number_slices)), - id(std::move(other.id)) { - other.notify_parent_about_destruction = false; - } - Executor_Slice &operator=(Executor_Slice &&other) { - parent = other.parent; - launch_counter = std::move(other.launch_counter); - buffer_counter = std::move(other.buffer_counter); - number_slices = std::move(other.number_slices); - id = std::move(other.id); - other.notify_parent_about_destruction = false; - } - template - Allocator_Slice make_allocator() { - return Allocator_Slice(*this); - } - bool sync_aggregation_slices() { - assert(parent.slices_exhausted == true); - auto ret = parent.sync_aggregation_slices(launch_counter); - launch_counter++; - return ret; - } - template void post(F &&f, Ts &&...ts) { - // we should only execute function calls once all slices - // have been given away (-> Executor Slices start) - assert(parent.slices_exhausted == true); - parent.post(launch_counter, std::forward(f), std::forward(ts)...); - launch_counter++; - } - template - hpx::lcos::future async(F &&f, Ts &&...ts) { - // we should only execute function calls once all slices - // have been given away (-> Executor Slices start) - assert(parent.slices_exhausted == true); - hpx::lcos::future ret_fut = parent.async( - launch_counter, std::forward(f), std::forward(ts)...); - launch_counter++; - return ret_fut; - } - - // OneWay Execution - template - friend decltype(auto) tag_invoke(hpx::parallel::execution::post_t, - Executor_Slice& exec, F&& f, Ts&&... ts) - { - return exec.post(std::forward(f), std::forward(ts)...); - } - - // TwoWay Execution - template - friend decltype(auto) tag_invoke( - hpx::parallel::execution::async_execute_t, Executor_Slice& exec, - F&& f, Ts&&... 
ts) - { - return exec.async( - std::forward(f), std::forward(ts)...); - } - - template - hpx::lcos::shared_future wrap_async(F &&f, Ts &&...ts) { - // we should only execute function calls once all slices - // have been given away (-> Executor Slices start) - assert(parent.slices_exhausted == true); - hpx::lcos::shared_future ret_fut = parent.wrap_async( - launch_counter, std::forward(f), std::forward(ts)...); - launch_counter++; - return ret_fut; - } - - /// Get new aggregated buffer (might have already been allocated been - /// allocated by different slice) - template T *get(const size_t size) { - assert(parent.slices_exhausted == true); - T *aggregated_buffer = - parent.get(size, buffer_counter); - buffer_counter++; - assert(buffer_counter > 0); - return aggregated_buffer; - } - - Executor& get_underlying_executor(void) { - assert(parent.executor_wrapper); - return *(parent.executor_wrapper); - } - }; - - //=============================================================================== +using Allocator_Slice = + cppuddle::kernel_aggregation::allocator_slice; - hpx::lcos::local::promise slices_full_promise; - /// Promises with the slice executors -- to be set when the starting criteria - /// is met - std::vector> executor_slices; - /// List of aggregated function calls - function will be launched when all - /// slices have called it - std::deque> function_calls; - /// For synchronizing the access to the function calls list - cppuddle::aggregation_mutex_t mut; - - /// Data entry for a buffer allocation: void* pointer, size_t for - /// buffer-size, atomic for the slice counter, location_id, gpu_id - using buffer_entry_t = - std::tuple, bool, const size_t, size_t>; - /// Keeps track of the aggregated buffer allocations done in all the slices - std::deque buffer_allocations; - /// Map pointer to deque index for fast access in the deallocations - std::unordered_map buffer_allocations_map; - /// For synchronizing the access to the buffer_allocations - cppuddle::aggregation_mutex_t buffer_mut; - std::atomic buffer_counter = 0; - - /// Get new buffer OR get buffer already allocated by different slice - template - T *get(const size_t size, const size_t slice_alloc_counter) { - assert(slices_exhausted == true); - assert(executor_wrapper); - assert(executor_slices_alive == true); - // Add aggreated buffer entry in case it hasn't happened yet for this call - // First: Check if it already has happened - if (buffer_counter <= slice_alloc_counter) { - // we might be the first! Lock... - std::lock_guard guard(buffer_mut); - // ... and recheck - if (buffer_counter <= slice_alloc_counter) { - constexpr bool manage_content_lifetime = false; - buffers_in_use = true; - - // Default location -- useful for GPU builds as we otherwise create way too - // many different buffers for different aggregation sizes on different GPUs - /* size_t location_id = gpu_id * instances_per_gpu; */ - // Use integer conversion to only use 0 16 32 ... as buckets - size_t location_id = ((hpx::get_worker_thread_num() % cppuddle::number_instances) / 16) * 16; -#ifdef CPPUDDLE_HAVE_HPX_AWARE_ALLOCATORS - if (max_slices == 1) { - // get prefered location: aka the current hpx threads location - // Usually handy for CPU builds where we want to use the buffers - // close to the current CPU core - /* location_id = (hpx::get_worker_thread_num() / instances_per_gpu) * instances_per_gpu; */ - /* location_id = (gpu_id) * instances_per_gpu; */ - // division makes sure that we always use the same instance to store our gpu buffers. 
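// Worked example for the bucketing above (numbers are illustrative): with
// cppuddle::number_instances == 128, worker threads 0..15 all yield
// location_id 0, threads 16..31 yield 16, and thread 37 yields
// ((37 % 128) / 16) * 16 == 32. Each group of 16 HPX worker threads therefore
// shares one buffer bucket, which keeps the number of distinct aggregated
// buffer pools small.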
- } -#endif - // Get shiny and new buffer that will be shared between all slices - // Buffer might be recycled from previous allocations by the - // buffer_interface... - T *aggregated_buffer = - cppuddle::memory_recycling::detail::buffer_interface::get< - T, Host_Allocator>(size, manage_content_lifetime, location_id, - gpu_id); - // Create buffer entry for this buffer - buffer_allocations.emplace_back(static_cast(aggregated_buffer), - size, 1, true, location_id, gpu_id); - -#ifndef NDEBUG - // if previousely used the buffer should not be in usage anymore - const auto exists = buffer_allocations_map.count( - static_cast(aggregated_buffer)); - if (exists > 0) { - const auto previous_usage_id = - buffer_allocations_map[static_cast(aggregated_buffer)]; - const auto &valid = - std::get<3>(buffer_allocations[previous_usage_id]); - assert(!valid); - } -#endif - buffer_allocations_map.insert_or_assign(static_cast(aggregated_buffer), - buffer_counter); - - assert (buffer_counter == slice_alloc_counter); - buffer_counter = buffer_allocations.size(); - - // Return buffer - return aggregated_buffer; - } - } - assert(buffers_in_use == true); - assert(std::get<3>(buffer_allocations[slice_alloc_counter])); // valid - assert(std::get<2>(buffer_allocations[slice_alloc_counter]) >= 1); - - // Buffer entry should already exist: - T *aggregated_buffer = static_cast( - std::get<0>(buffer_allocations[slice_alloc_counter])); - // Error handling: Size is wrong? - assert(size == std::get<1>(buffer_allocations[slice_alloc_counter])); - // Notify that one more slice has visited this buffer allocation - std::get<2>(buffer_allocations[slice_alloc_counter])++; - return aggregated_buffer; - } - - /// Notify buffer list that one slice is done with the buffer - template - void mark_unused(T *p, const size_t size) { - assert(slices_exhausted == true); - assert(executor_wrapper); - - void *ptr_key = static_cast(p); - size_t slice_alloc_counter = buffer_allocations_map[p]; - - assert(slice_alloc_counter < buffer_allocations.size()); - /*auto &[buffer_pointer_any, buffer_size, buffer_allocation_counter, valid] = - buffer_allocations[slice_alloc_counter];*/ - auto buffer_pointer_void = std::get<0>(buffer_allocations[slice_alloc_counter]); - const auto buffer_size = std::get<1>(buffer_allocations[slice_alloc_counter]); - auto &buffer_allocation_counter = std::get<2>(buffer_allocations[slice_alloc_counter]); - auto &valid = std::get<3>(buffer_allocations[slice_alloc_counter]); - const auto &location_id = std::get<4>(buffer_allocations[slice_alloc_counter]); - const auto &gpu_id = std::get<5>(buffer_allocations[slice_alloc_counter]); - assert(valid); - T *buffer_pointer = static_cast(buffer_pointer_void); - - assert(buffer_size == size); - assert(p == buffer_pointer); - // assert(buffer_pointer == p || buffer_pointer == nullptr); - // Slice is done with this buffer - buffer_allocation_counter--; - // Check if all slices are done with this buffer? - if (buffer_allocation_counter == 0) { - // Yes! 
"Deallocate" by telling the recylcer the buffer is fit for reusage - std::lock_guard guard(buffer_mut); - // Only mark unused if another buffer has not done so already (and marked - // it as invalid) - if (valid) { - assert(buffers_in_use == true); - cppuddle::memory_recycling::detail::buffer_interface::mark_unused< - T, Host_Allocator>(buffer_pointer, buffer_size, location_id, - gpu_id); - // mark buffer as invalid to prevent any other slice from marking the - // buffer as unused - valid = false; - - const size_t current_deallocs = ++dealloc_counter; - if (current_deallocs == buffer_counter) { - std::lock_guard guard(mut); - buffers_in_use = false; - if (!executor_slices_alive && !buffers_in_use) { - slices_exhausted = false; - // Release executor - executor_wrapper.reset(nullptr); - } - } - } - } - } - - //=============================================================================== - // Public Interface -public: - hpx::lcos::future current_continuation; - hpx::lcos::future last_stream_launch_done; - std::atomic overall_launch_counter = 0; - - /// Only meant to be accessed by the slice executors - bool sync_aggregation_slices(const size_t slice_launch_counter) { - std::lock_guard guard(mut); - assert(slices_exhausted == true); - assert(executor_wrapper); - // Add function call object in case it hasn't happened for this launch yet - if (overall_launch_counter <= slice_launch_counter) { - /* std::lock_guard guard(mut); */ - if (overall_launch_counter <= slice_launch_counter) { - function_calls.emplace_back(current_slices, false, *executor_wrapper); - overall_launch_counter = function_calls.size(); - return function_calls[slice_launch_counter].sync_aggregation_slices( - last_stream_launch_done); - } - } - - return function_calls[slice_launch_counter].sync_aggregation_slices( - last_stream_launch_done); - } - - /// Only meant to be accessed by the slice executors - template - void post(const size_t slice_launch_counter, F &&f, Ts &&...ts) { - std::lock_guard guard(mut); - assert(slices_exhausted == true); - assert(executor_wrapper); - // Add function call object in case it hasn't happened for this launch yet - if (overall_launch_counter <= slice_launch_counter) { - /* std::lock_guard guard(mut); */ - if (overall_launch_counter <= slice_launch_counter) { - function_calls.emplace_back(current_slices, false, *executor_wrapper); - overall_launch_counter = function_calls.size(); - function_calls[slice_launch_counter].post_when( - last_stream_launch_done, std::forward(f), std::forward(ts)...); - return; - } - } - - function_calls[slice_launch_counter].post_when( - last_stream_launch_done, std::forward(f), std::forward(ts)...); - return; - } - - /// Only meant to be accessed by the slice executors - template - hpx::lcos::future async(const size_t slice_launch_counter, F &&f, - Ts &&...ts) { - std::lock_guard guard(mut); - assert(slices_exhausted == true); - assert(executor_wrapper); - // Add function call object in case it hasn't happened for this launch yet - if (overall_launch_counter <= slice_launch_counter) { - /* std::lock_guard guard(mut); */ - if (overall_launch_counter <= slice_launch_counter) { - function_calls.emplace_back(current_slices, true, *executor_wrapper); - overall_launch_counter = function_calls.size(); - return function_calls[slice_launch_counter].async_when( - last_stream_launch_done, std::forward(f), std::forward(ts)...); - } - } - - return function_calls[slice_launch_counter].async_when( - last_stream_launch_done, std::forward(f), std::forward(ts)...); - } - /// Only meant 
to be accessed by the slice executors - template - hpx::lcos::shared_future wrap_async(const size_t slice_launch_counter, F &&f, - Ts &&...ts) { - std::lock_guard guard(mut); - assert(slices_exhausted == true); - assert(executor_wrapper); - // Add function call object in case it hasn't happened for this launch yet - if (overall_launch_counter <= slice_launch_counter) { - /* std::lock_guard guard(mut); */ - if (overall_launch_counter <= slice_launch_counter) { - function_calls.emplace_back(current_slices, true, *executor_wrapper); - overall_launch_counter = function_calls.size(); - return function_calls[slice_launch_counter].wrap_async( - last_stream_launch_done, std::forward(f), std::forward(ts)...); - } - } - - return function_calls[slice_launch_counter].wrap_async( - last_stream_launch_done, std::forward(f), std::forward(ts)...); - } - - bool slice_available(void) { - std::lock_guard guard(mut); - return !slices_exhausted; - } - - std::optional> request_executor_slice() { - std::lock_guard guard(mut); - if (!slices_exhausted) { - const size_t local_slice_id = ++current_slices; - if (local_slice_id == 1) { - // Cleanup leftovers from last run if any - // TODO still required? Should be clean here already - function_calls.clear(); - overall_launch_counter = 0; - std::lock_guard guard(buffer_mut); -#ifndef NDEBUG - for (const auto &buffer_entry : buffer_allocations) { - const auto &[buffer_pointer_any, buffer_size, - buffer_allocation_counter, valid, location_id, device_id] = - buffer_entry; - assert(!valid); - } -#endif - buffer_allocations.clear(); - buffer_allocations_map.clear(); - buffer_counter = 0; - - assert(executor_slices_alive == false); - assert(buffers_in_use == false); - executor_slices_alive = true; - buffers_in_use = false; - dealloc_counter = 0; - - if (mode == Aggregated_Executor_Modes::STRICT ) { - slices_full_promise = hpx::lcos::local::promise{}; - } - } - - // Create Executor Slice future -- that will be returned later - hpx::lcos::future ret_fut; - if (local_slice_id < max_slices) { - executor_slices.emplace_back(hpx::lcos::local::promise{}); - ret_fut = - executor_slices[local_slice_id - 1].get_future(); - } else { - launched_slices = current_slices; - ret_fut = hpx::make_ready_future(Executor_Slice{*this, - executor_slices.size(), launched_slices}); - } - - // Are we the first slice? If yes, add continuation set the - // Executor_Slice - // futures to ready if the launch conditions are met - if (local_slice_id == 1) { - // Redraw executor - assert(!executor_wrapper); - stream_pool::select_device>(gpu_id); - executor_wrapper.reset( - new stream_interface>(gpu_id)); - // Renew promise that all slices will be ready as the primary launch - // criteria... 
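// The two launch conditions handled below, summarized (not taken from this
// patch): in EAGER and ENDLESS mode the slices are released as soon as the
// underlying executor future is ready, even if fewer than max_slices callers
// joined; in STRICT mode they are only released once slices_full_promise is
// set, i.e. after all max_slices slices have been requested.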
- hpx::lcos::shared_future fut; - if (mode == Aggregated_Executor_Modes::EAGER || - mode == Aggregated_Executor_Modes::ENDLESS) { - // Fallback launch condidtion: Launch as soon as the underlying stream - // is ready - /* auto slices_full_fut = slices_full_promise.get_future(); */ - stream_pool::select_device>(gpu_id); - auto exec_fut = (*executor_wrapper).get_future(); - /* auto fut = hpx::when_any(exec_fut, slices_full_fut); */ - fut = std::move(exec_fut); - } else { - auto slices_full_fut = slices_full_promise.get_shared_future(); - // Just use the slices launch condition - fut = std::move(slices_full_fut); - } - // Launch all executor slices within this continuation - current_continuation = fut.then([this](auto &&fut) { - std::lock_guard guard(mut); - slices_exhausted = true; - launched_slices = current_slices; - size_t id = 0; - for (auto &slice_promise : executor_slices) { - slice_promise.set_value( - Executor_Slice{*this, id, launched_slices}); - id++; - } - executor_slices.clear(); - }); - } - if (local_slice_id >= max_slices && - mode != Aggregated_Executor_Modes::ENDLESS) { - slices_exhausted = true; // prevents any more threads from entering - // before the continuation is launched - /* launched_slices = current_slices; */ - /* size_t id = 0; */ - /* for (auto &slice_promise : executor_slices) { */ - /* slice_promise.set_value( */ - /* Executor_Slice{*this, id, launched_slices}); */ - /* id++; */ - /* } */ - /* executor_slices.clear(); */ - if (mode == Aggregated_Executor_Modes::STRICT ) { - slices_full_promise.set_value(); // Trigger slices launch condition continuation - } - // that continuation will set all executor slices so far handed out to ready - } - return ret_fut; - } else { - // Return empty optional as failure - return std::optional>{}; - } - } - size_t launched_slices; - void reduce_usage_counter(void) { - /* std::lock_guard guard(mut); */ - assert(slices_exhausted == true); - assert(executor_wrapper); - assert(executor_slices_alive == true); - assert(launched_slices >= 1); - assert(current_slices >= 0 && current_slices <= launched_slices); - const size_t local_slice_id = --current_slices; - // Last slice goes out scope? 
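// Clarifying note, not taken from this patch: the underlying executor (and its
// slot in the executor pool) is only released once both conditions hold, i.e.
// the last Executor_Slice has gone out of scope here AND every aggregated
// buffer has been handed back via mark_unused(); whichever of the two happens
// last resets executor_wrapper and clears slices_exhausted so the aggregated
// executor can be reused.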
- if (local_slice_id == 0) { - // Mark executor fit for reusage - std::lock_guard guard(mut); - executor_slices_alive = false; - if (!executor_slices_alive && !buffers_in_use) { - // Release executor - slices_exhausted = false; - executor_wrapper.reset(nullptr); - } - } - } - ~Aggregated_Executor(void) { - - assert(current_slices == 0); - assert(executor_slices_alive == false); - assert(buffers_in_use == false); - - if (mode != Aggregated_Executor_Modes::STRICT ) { - slices_full_promise.set_value(); // Trigger slices launch condition continuation - } - - // Cleanup leftovers from last run if any - function_calls.clear(); - overall_launch_counter = 0; -#ifndef NDEBUG - for (const auto &buffer_entry : buffer_allocations) { - const auto &[buffer_pointer_any, buffer_size, buffer_allocation_counter, - valid, location_id, device_id] = buffer_entry; - assert(!valid); - } -#endif - buffer_allocations.clear(); - buffer_allocations_map.clear(); - buffer_counter = 0; - - assert(buffer_allocations.empty()); - assert(buffer_allocations_map.empty()); - } - - Aggregated_Executor(const size_t number_slices, - Aggregated_Executor_Modes mode, const size_t gpu_id = 0) - : max_slices(number_slices), current_slices(0), slices_exhausted(false), - dealloc_counter(0), mode(mode), executor_slices_alive(false), - buffers_in_use(false), gpu_id(gpu_id), - executor_wrapper(nullptr), - current_continuation(hpx::make_ready_future()), - last_stream_launch_done(hpx::make_ready_future()) {} - // Not meant to be copied or moved - Aggregated_Executor(const Aggregated_Executor &other) = delete; - Aggregated_Executor &operator=(const Aggregated_Executor &other) = delete; - Aggregated_Executor(Aggregated_Executor &&other) = delete; - Aggregated_Executor &operator=(Aggregated_Executor &&other) = delete; -}; - -template -class Allocator_Slice { -private: - typename Aggregated_Executor::Executor_Slice &executor_reference; - Aggregated_Executor &executor_parent; - -public: - using value_type = T; - Allocator_Slice( - typename Aggregated_Executor::Executor_Slice &executor) - : executor_reference(executor), executor_parent(executor.parent) {} - template - explicit Allocator_Slice( - Allocator_Slice const &) noexcept {} - T *allocate(std::size_t n) { - T *data = executor_reference.template get(n); - return data; - } - void deallocate(T *p, std::size_t n) { - /* executor_reference.template mark_unused(p, n); */ - executor_parent.template mark_unused(p, n); - } - template - inline void construct(T *p, Args... args) noexcept { - // Do nothing here - we reuse the content of the last owner - } - void destroy(T *p) { - // Do nothing here - Contents will be destroyed when the buffer manager is - // destroyed, not before - } -}; -template -constexpr bool -operator==(Allocator_Slice const &, - Allocator_Slice const &) noexcept { - return false; -} -template -constexpr bool -operator!=(Allocator_Slice const &, - Allocator_Slice const &) noexcept { - return true; -} - -namespace hpx { namespace parallel { namespace execution { - // TODO Unfortunately does not work that way! 
Create trait that works for Executor Slices with - // compatible unlying executor types - /* template */ - /* struct is_one_way_executor::Executor_Slice> */ - /* : std::true_type */ - /* {}; */ - /* template */ - /* struct is_two_way_executor::Executor_Slice> */ - /* : std::true_type */ - /* {}; */ - -#if defined(HPX_HAVE_CUDA) || defined(HPX_HAVE_HIP) - // Workaround for the meantime: Manually create traits for compatible types: - template<> - struct is_one_way_executor::Executor_Slice> - : std::true_type - {}; - template<> - struct is_two_way_executor::Executor_Slice> - : std::true_type - {}; -#endif -}}} - -//=============================================================================== -//=============================================================================== -// Pool Strategy: +template +using Aggregated_Executor = + cppuddle::kernel_aggregation::Aggregated_Executor; template -class aggregation_pool { -public: - /// interface - template - static void init(size_t number_of_executors, size_t slices_per_executor, - Aggregated_Executor_Modes mode, size_t num_devices = 1) { - if (is_initialized) { - throw std::runtime_error( - std::string("Trying to initialize cppuddle aggregation pool twice") + - " Agg pool name: " + std::string(kernelname)); - } - if (num_devices > cppuddle::max_number_gpus) { - throw std::runtime_error( - std::string( - "Trying to initialize aggregation with more devices than the " - "maximum number of GPUs given at compiletime") + - " Agg pool name: " + std::string(kernelname)); - } - number_devices = num_devices; - for (size_t gpu_id = 0; gpu_id < number_devices; gpu_id++) { - - std::lock_guard guard(instance()[gpu_id].pool_mutex); - assert(instance()[gpu_id].aggregation_executor_pool.empty()); - for (int i = 0; i < number_of_executors; i++) { - instance()[gpu_id].aggregation_executor_pool.emplace_back(slices_per_executor, - mode, gpu_id); - } - instance()[gpu_id].slices_per_executor = slices_per_executor; - instance()[gpu_id].mode = mode; - } - is_initialized = true; - } - - /// Will always return a valid executor slice - static decltype(auto) request_executor_slice(void) { - if (!is_initialized) { - throw std::runtime_error( - std::string("Trying to use cppuddle aggregation pool without first calling init") + - " Agg poolname: " + std::string(kernelname)); - } - const size_t gpu_id = cppuddle::get_device_id(number_devices); - /* const size_t gpu_id = 1; */ - std::lock_guard guard(instance()[gpu_id].pool_mutex); - assert(!instance()[gpu_id].aggregation_executor_pool.empty()); - std::optional::Executor_Slice>> - ret; - size_t local_id = (instance()[gpu_id].current_interface) % - instance()[gpu_id].aggregation_executor_pool.size(); - ret = instance()[gpu_id].aggregation_executor_pool[local_id].request_executor_slice(); - // Expected case: current aggregation executor is free - if (ret.has_value()) { - return ret; - } - // current interface is bad -> find free one - size_t abort_counter = 0; - const size_t abort_number = instance()[gpu_id].aggregation_executor_pool.size() + 1; - do { - local_id = (++(instance()[gpu_id].current_interface)) % // increment interface - instance()[gpu_id].aggregation_executor_pool.size(); - ret = - instance()[gpu_id].aggregation_executor_pool[local_id].request_executor_slice(); - if (ret.has_value()) { - return ret; - } - abort_counter++; - } while (abort_counter <= abort_number); - // Everything's busy -> create new aggregation executor (growing pool) OR - // return empty optional - if (instance()[gpu_id].growing_pool) { - 
instance()[gpu_id].aggregation_executor_pool.emplace_back( - instance()[gpu_id].slices_per_executor, instance()[gpu_id].mode, gpu_id); - instance()[gpu_id].current_interface = - instance()[gpu_id].aggregation_executor_pool.size() - 1; - assert(instance()[gpu_id].aggregation_executor_pool.size() < 20480); - ret = instance()[gpu_id] - .aggregation_executor_pool[instance()[gpu_id].current_interface] - .request_executor_slice(); - assert(ret.has_value()); // fresh executor -- should always have slices - // available - } - return ret; - } - -private: - std::deque> aggregation_executor_pool; - std::atomic current_interface{0}; - size_t slices_per_executor; - Aggregated_Executor_Modes mode; - bool growing_pool{true}; - -private: - /// Required for dealing with adding elements to the deque of - /// aggregated_executors - cppuddle::aggregation_mutex_t pool_mutex; - /// Global access instance - static std::unique_ptr& instance(void) { - static std::unique_ptr pool_instances{ - new aggregation_pool[cppuddle::max_number_gpus]}; - return pool_instances; - } - static inline size_t number_devices = 1; - static inline bool is_initialized = false; - aggregation_pool() = default; - -public: - ~aggregation_pool() = default; - // Bunch of constructors we don't need - aggregation_pool(aggregation_pool const &other) = delete; - aggregation_pool &operator=(aggregation_pool const &other) = delete; - aggregation_pool(aggregation_pool &&other) = delete; - aggregation_pool &operator=(aggregation_pool &&other) = delete; -}; +using aggregation_pool = + cppuddle::kernel_aggregation::aggregation_pool; #endif diff --git a/include/cppuddle/executor_recycling/executor_pools_interface.hpp b/include/cppuddle/executor_recycling/executor_pools_interface.hpp index dac9f170..49a6d42d 100644 --- a/include/cppuddle/executor_recycling/executor_pools_interface.hpp +++ b/include/cppuddle/executor_recycling/executor_pools_interface.hpp @@ -26,7 +26,7 @@ template using executor_interface = detail::executor_interface; -} -} +} // end namespace executor_recycling +} // end namespace cppuddle #endif diff --git a/include/cppuddle/kernel_aggregation/kernel_aggregation_management.hpp b/include/cppuddle/kernel_aggregation/kernel_aggregation_management.hpp new file mode 100644 index 00000000..fd5a8e77 --- /dev/null +++ b/include/cppuddle/kernel_aggregation/kernel_aggregation_management.hpp @@ -0,0 +1,1161 @@ +// Copyright (c) 2022-2024 Gregor Daiß +// +// Distributed under the Boost Software License, Version 1.0. 
(See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#ifndef KERNEL_AGGREGATION_MANAGEMENT_HPP +#define KERNEL_AGGREGATION_MANAGEMENT_HPP + +#ifndef CPPUDDLE_HAVE_HPX +#error "Work aggregation allocators/executors require CPPUDDLE_WITH_HPX=ON" +#endif + +#include +//#define DEBUG_AGGREGATION_CALLS 1 + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#if defined(HPX_HAVE_CUDA) || defined(HPX_HAVE_HIP) +// required for defining type traits using cuda executor as underlying +// aggregation executors +#include +#endif + +#include +#include + +#include "../include/buffer_manager.hpp" +#include "../include/stream_manager.hpp" +#include "cppuddle/common/config.hpp" + +#ifndef CPPUDDLE_HAVE_HPX_MUTEX +#pragma message \ + "Work aggregation will use hpx::mutex internally, despite CPPUDDLE_WITH_HPX_MUTEX=OFF" +#pragma message \ + "Consider using CPPUDDLE_WITH_HPX_MUTEX=ON, to make the rest of CPPuddle also use hpx::mutex" +#endif +namespace cppuddle { +namespace kernel_aggregation { + using aggregation_mutex_t = hpx::mutex; + +//=============================================================================== +//=============================================================================== +// Helper functions/classes + +/// Constructs a tuple with copies (to store temporaries in aggregated function +/// calls) yet also supporting references (on the users own risk...) +template +std::tuple make_tuple_supporting_references(Ts &&...ts) { + return std::tuple{std::forward(ts)...}; +} + +/// Print some specific values that we can, but don't bother for most types +/// (such as vector) +template std::string print_if_possible(T val) { + if constexpr (std::is_convertible_v) { + return val; + } else if constexpr (std::is_integral_v || std::is_floating_point_v) { + return std::to_string(val); + } else if constexpr (std::is_pointer_v) { + // Pretty printing pointer sort of only works well with %p + // TODO Try using std::format as soon as we can move to C++20 + std::unique_ptr debug_string(new char[128]()); + snprintf(debug_string.get(), 128, "%p", val); + return std::string(debug_string.get()); + } else { + return std::string("cannot print value"); + } +} + +/// Helper class for the helper class that prints tuples -- do not use this +/// directly +template +void print_tuple(const TupType &_tup, std::index_sequence) { + (..., (hpx::cout << (I == 0 ? "" : ", ") + << print_if_possible(std::get(_tup)))); +} + +/// Helper class for printing tuples (first component should be a function +/// pointer, remaining components the function arguments) +template void print_tuple(const std::tuple &_tup) { + // Use pointer and sprintf as boost::format refused to NOT cast the pointer + // address to 1... 
+ // TODO Try using std::format as soon as we can move to C++20 + std::unique_ptr debug_string(new char[128]()); + snprintf(debug_string.get(), 128, "Function address: %p -- Arguments: (", + std::get<0>(_tup)); + hpx::cout << debug_string.get(); + print_tuple(_tup, std::make_index_sequence()); + hpx::cout << ")"; +} + +//=============================================================================== +//=============================================================================== +template +void exec_post_wrapper(Executor & exec, F &&f, Ts &&...ts) { + hpx::apply(exec, std::forward(f), std::forward(ts)...); +} + +template +hpx::lcos::future exec_async_wrapper(Executor & exec, F &&f, Ts &&...ts) { + return hpx::async(exec, std::forward(f), std::forward(ts)...); +} + +/// Manages the launch conditions for aggregated function calls +/// type/value-errors +/** Launch conditions: All slice executors must have called the same function + * (tracked by future all_slices_ready) + * AND + * Previous aggregated_function_call on the same Executor must have been + * launched (tracked by future stream_future) + * All function calls received from the slice executors are checked if they + * match the first one in both types and values (throws exception otherwise) + */ + +template class aggregated_function_call { +private: + std::atomic slice_counter = 0; + + /// Promise to be set when all slices have visited this function call + /* hpx::lcos::local::promise slices_ready_promise; */ + /// Tracks if all slices have visited this function call + /* hpx::lcos::future all_slices_ready = slices_ready_promise.get_future(); */ + /// How many slices can we expect? + const size_t number_slices; + const bool async_mode; + + Executor &underlying_executor; + +#if !(defined(NDEBUG)) && defined(DEBUG_AGGREGATION_CALLS) +#pragma message \ + "Building slow work aggegator build with additional runtime checks! Build with NDEBUG defined for fast build..." 
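// Note on the debug-only members below: they implement the argument checking
// described in the class comment above. The first slice's call is captured
// type-erased in function_tuple (std::any) together with its typeid name in
// debug_type_information; every later slice's post_when/async_when call is
// compared against that reference, and any mismatch in types or values is
// reported via hpx::cout (see the std::bad_any_cast / std::runtime_error
// handlers further down).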
+ /// Stores the function call of the first slice as reference for error + /// checking + std::any function_tuple; + /// Stores the string of the first function call for debug output + std::string debug_type_information; + aggregation_mutex_t debug_mut; +#endif + + std::vector> potential_async_promises{}; + +public: + aggregated_function_call(const size_t number_slices, bool async_mode, Executor &exec) + : number_slices(number_slices), async_mode(async_mode), underlying_executor(exec) { + if (async_mode) + potential_async_promises.resize(number_slices); + } + ~aggregated_function_call(void) { + // All slices should have done this call + assert(slice_counter == number_slices); + // assert(!all_slices_ready.valid()); + } + /// Returns true if all required slices have visited this point + bool sync_aggregation_slices(hpx::lcos::future &stream_future) { + assert(!async_mode); + assert(potential_async_promises.empty()); + const size_t local_counter = slice_counter++; + if (local_counter == number_slices - 1) { + return true; + } + else return false; + } + template + void post_when(hpx::lcos::future &stream_future, F &&f, Ts &&...ts) { +#if !(defined(NDEBUG)) && defined(DEBUG_AGGREGATION_CALLS) + // needed for concurrent access to function_tuple and debug_type_information + // Not required for normal use + std::lock_guard guard(debug_mut); +#endif + assert(!async_mode); + assert(potential_async_promises.empty()); + const size_t local_counter = slice_counter++; + + if (local_counter == 0) { +#if !(defined(NDEBUG)) && defined(DEBUG_AGGREGATION_CALLS) + auto tmp_tuple = + make_tuple_supporting_references(f, std::forward(ts)...); + function_tuple = tmp_tuple; + debug_type_information = typeid(decltype(tmp_tuple)).name(); +#endif + + } else { + // + // This scope checks if both the type and the values of the current call + // match the original call To be used in debug build... 
+ // +#if !(defined(NDEBUG)) && defined(DEBUG_AGGREGATION_CALLS) + auto comparison_tuple = + make_tuple_supporting_references(f, std::forward(ts)...); + try { + auto orig_call_tuple = + std::any_cast(function_tuple); + if (comparison_tuple != orig_call_tuple) { + throw std::runtime_error( + "Values of post function arguments (or function " + "itself) do not match "); + } + } catch (const std::bad_any_cast &e) { + hpx::cout + << "\nMismatched types error in aggregated post call of executor " + << ": " << e.what() << "\n"; + hpx::cout << "Expected types:\t\t " + << boost::core::demangle(debug_type_information.c_str()); + hpx::cout << "\nGot types:\t\t " + << boost::core::demangle( + typeid(decltype(comparison_tuple)).name()) + << "\n" + << std::endl; + // throw; + } catch (const std::runtime_error &e) { + hpx::cout + << "\nMismatched values error in aggregated post call of executor " + << ": " << e.what() << std::endl; + hpx::cout << "Types (matched):\t " + << boost::core::demangle(debug_type_information.c_str()); + auto orig_call_tuple = + std::any_cast(function_tuple); + hpx::cout << "\nExpected values:\t "; + print_tuple(orig_call_tuple); + hpx::cout << "\nGot values:\t\t "; + print_tuple(comparison_tuple); + hpx::cout << std::endl << std::endl; + // throw; + } +#endif + } + assert(local_counter < number_slices); + assert(slice_counter < number_slices + 1); + // Check exit criteria: Launch function call continuation by setting the + // slices promise + if (local_counter == number_slices - 1) { + exec_post_wrapper(underlying_executor, std::forward(f), std::forward(ts)...); + //slices_ready_promise.set_value(); + } + } + template + hpx::lcos::future async_when(hpx::lcos::future &stream_future, + F &&f, Ts &&...ts) { +#if !(defined(NDEBUG)) && defined(DEBUG_AGGREGATION_CALLS) + // needed for concurrent access to function_tuple and debug_type_information + // Not required for normal use + std::lock_guard guard(debug_mut); +#endif + assert(async_mode); + assert(!potential_async_promises.empty()); + const size_t local_counter = slice_counter++; + if (local_counter == 0) { +#if !(defined(NDEBUG)) && defined(DEBUG_AGGREGATION_CALLS) + auto tmp_tuple = + make_tuple_supporting_references(f, std::forward(ts)...); + function_tuple = tmp_tuple; + debug_type_information = typeid(decltype(tmp_tuple)).name(); +#endif + } else { + // + // This scope checks if both the type and the values of the current call + // match the original call To be used in debug build... 
+ // +#if !(defined(NDEBUG)) && defined(DEBUG_AGGREGATION_CALLS) + auto comparison_tuple = + make_tuple_supporting_references(f, std::forward(ts)...); + try { + auto orig_call_tuple = + std::any_cast(function_tuple); + if (comparison_tuple != orig_call_tuple) { + throw std::runtime_error( + "Values of async function arguments (or function " + "itself) do not match "); + } + } catch (const std::bad_any_cast &e) { + hpx::cout + << "\nMismatched types error in aggregated async call of executor " + << ": " << e.what() << "\n"; + hpx::cout << "Expected types:\t\t " + << boost::core::demangle(debug_type_information.c_str()); + hpx::cout << "\nGot types:\t\t " + << boost::core::demangle( + typeid(decltype(comparison_tuple)).name()) + << "\n" + << std::endl; + // throw; + } catch (const std::runtime_error &e) { + hpx::cout + << "\nMismatched values error in aggregated async call of executor " + << ": " << e.what() << std::endl; + hpx::cout << "Types (matched):\t " + << boost::core::demangle(debug_type_information.c_str()); + auto orig_call_tuple = + std::any_cast(function_tuple); + hpx::cout << "\nExpected values:\t "; + print_tuple(orig_call_tuple); + hpx::cout << "\nGot values:\t\t "; + print_tuple(comparison_tuple); + hpx::cout << std::endl << std::endl; + // throw; + } +#endif + } + assert(local_counter < number_slices); + assert(slice_counter < number_slices + 1); + assert(potential_async_promises.size() == number_slices); + hpx::lcos::future ret_fut = + potential_async_promises[local_counter].get_future(); + if (local_counter == number_slices - 1) { + /* slices_ready_promise.set_value(); */ + auto fut = exec_async_wrapper( + underlying_executor, std::forward(f), std::forward(ts)...); + fut.then([this](auto &&fut) { + for (auto &promise : potential_async_promises) { + promise.set_value(); + } + }); + } + // Check exit criteria: Launch function call continuation by setting the + // slices promise + return ret_fut; + } + template + hpx::lcos::shared_future wrap_async(hpx::lcos::future &stream_future, + F &&f, Ts &&...ts) { + assert(async_mode); + assert(!potential_async_promises.empty()); + const size_t local_counter = slice_counter++; + assert(local_counter < number_slices); + assert(slice_counter < number_slices + 1); + assert(potential_async_promises.size() == number_slices); + hpx::lcos::shared_future ret_fut = + potential_async_promises[local_counter].get_shared_future(); + if (local_counter == number_slices - 1) { + auto fut = f(std::forward(ts)...); + fut.then([this](auto &&fut) { + // TODO just use one promise + for (auto &promise : potential_async_promises) { + promise.set_value(); + } + }); + } + return ret_fut; + } + // We need to be able to copy or no-except move for std::vector.. + aggregated_function_call(const aggregated_function_call &other) = default; + aggregated_function_call & + operator=(const aggregated_function_call &other) = default; + aggregated_function_call(aggregated_function_call &&other) = default; + aggregated_function_call & + operator=(aggregated_function_call &&other) = default; +}; + +//=============================================================================== +//=============================================================================== + +enum class aggregated_executor_modes { EAGER = 1, STRICT, ENDLESS }; +/// Declaration since the actual allocator is only defined after the Executors +template +class allocator_slice; + +/// Executor Class that aggregates function calls for specific kernels +/** Executor is not meant to be used directly. 
Instead it yields multiple + * executor_slice objects. These serve as interfaces. Slices from the same + * Aggregated_Executor are meant to execute the same function calls but on + * different data (i.e. different tasks) + */ +template class Aggregated_Executor { +private: + //=============================================================================== + // Misc private avariables: + // + std::atomic slices_exhausted; + + std::atomic executor_slices_alive; + std::atomic buffers_in_use; + std::atomic dealloc_counter; + + const aggregated_executor_modes mode; + const size_t max_slices; + std::atomic current_slices; + /// Wrapper to the executor interface from the stream pool + /// Automatically hooks into the stream_pools reference counting + /// for cpu/gpu load balancing + std::unique_ptr>> executor_wrapper; + +public: + size_t gpu_id; + // Subclasses + + /// Slice class - meant as a scope interface to the aggregated executor + class executor_slice { + public: + Aggregated_Executor &parent; + private: + /// Executor is a slice of this aggregated_executor + /// How many functions have been called - required to enforce sequential + /// behaviour of kernel launches + size_t launch_counter{0}; + size_t buffer_counter{0}; + bool notify_parent_about_destruction{true}; + + public: + /// How many slices are there overall - required to check the launch + /// criteria + const size_t number_slices; + const size_t id; + using executor_t = Executor; + executor_slice(Aggregated_Executor &parent, const size_t slice_id, + const size_t number_slices) + : parent(parent), notify_parent_about_destruction(true), + number_slices(number_slices), id(slice_id) { + } + ~executor_slice(void) { + // Don't notify parent if we moved away from this executor_slice + if (notify_parent_about_destruction) { + // Executor should be done by the time of destruction + // -> check here before notifying parent + + // parent still in execution mode? + assert(parent.slices_exhausted == true); + // all kernel launches done? 
+ assert(launch_counter == parent.function_calls.size()); + // Notifiy parent that this aggregation slice is one + parent.reduce_usage_counter(); + } + } + executor_slice(const executor_slice &other) = delete; + executor_slice &operator=(const executor_slice &other) = delete; + executor_slice(executor_slice &&other) + : parent(other.parent), launch_counter(std::move(other.launch_counter)), + buffer_counter(std::move(other.buffer_counter)), + number_slices(std::move(other.number_slices)), + id(std::move(other.id)) { + other.notify_parent_about_destruction = false; + } + executor_slice &operator=(executor_slice &&other) { + parent = other.parent; + launch_counter = std::move(other.launch_counter); + buffer_counter = std::move(other.buffer_counter); + number_slices = std::move(other.number_slices); + id = std::move(other.id); + other.notify_parent_about_destruction = false; + } + template + allocator_slice make_allocator() { + return allocator_slice(*this); + } + bool sync_aggregation_slices() { + assert(parent.slices_exhausted == true); + auto ret = parent.sync_aggregation_slices(launch_counter); + launch_counter++; + return ret; + } + template void post(F &&f, Ts &&...ts) { + // we should only execute function calls once all slices + // have been given away (-> Executor Slices start) + assert(parent.slices_exhausted == true); + parent.post(launch_counter, std::forward(f), std::forward(ts)...); + launch_counter++; + } + template + hpx::lcos::future async(F &&f, Ts &&...ts) { + // we should only execute function calls once all slices + // have been given away (-> Executor Slices start) + assert(parent.slices_exhausted == true); + hpx::lcos::future ret_fut = parent.async( + launch_counter, std::forward(f), std::forward(ts)...); + launch_counter++; + return ret_fut; + } + + // OneWay Execution + template + friend decltype(auto) tag_invoke(hpx::parallel::execution::post_t, + executor_slice& exec, F&& f, Ts&&... ts) + { + return exec.post(std::forward(f), std::forward(ts)...); + } + + // TwoWay Execution + template + friend decltype(auto) tag_invoke( + hpx::parallel::execution::async_execute_t, executor_slice& exec, + F&& f, Ts&&... ts) + { + return exec.async( + std::forward(f), std::forward(ts)...); + } + + template + hpx::lcos::shared_future wrap_async(F &&f, Ts &&...ts) { + // we should only execute function calls once all slices + // have been given away (-> Executor Slices start) + assert(parent.slices_exhausted == true); + hpx::lcos::shared_future ret_fut = parent.wrap_async( + launch_counter, std::forward(f), std::forward(ts)...); + launch_counter++; + return ret_fut; + } + + /// Get new aggregated buffer (might have already been allocated been + /// allocated by different slice) + template T *get(const size_t size) { + assert(parent.slices_exhausted == true); + T *aggregated_buffer = + parent.get(size, buffer_counter); + buffer_counter++; + assert(buffer_counter > 0); + return aggregated_buffer; + } + + Executor& get_underlying_executor(void) { + assert(parent.executor_wrapper); + return *(parent.executor_wrapper); + } + }; + + // deprecated name... 
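// Illustrative usage sketch for an executor_slice (agg_exec stands for an
// aggregated executor instance; the element type, host allocator, kernel
// function and n are placeholders). An aggregated call is only forwarded to
// the underlying executor once every participating slice has issued it:
//
//   auto maybe_slice = agg_exec.request_executor_slice();
//   if (maybe_slice) {                    // std::optional of hpx::future<executor_slice>
//     auto slice = maybe_slice->get();    // becomes ready once the launch criteria are met
//     auto alloc = slice.make_allocator<float, std::allocator<float>>();
//     std::vector<float, decltype(alloc)> buf(n, 0.0f, alloc); // aggregated buffer, shared across slices
//     slice.post(example_kernel, buf.data(), n);               // fire-and-forget, aggregated
//     auto fut = slice.async(example_kernel, buf.data(), n);   // two-way, aggregated
//     fut.get();
//   }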
+ /* using Executor_Slice = executor_slice; */ + + //=============================================================================== + + hpx::lcos::local::promise slices_full_promise; + /// Promises with the slice executors -- to be set when the starting criteria + /// is met + std::vector> executor_slices; + /// List of aggregated function calls - function will be launched when all + /// slices have called it + std::deque> function_calls; + /// For synchronizing the access to the function calls list + aggregation_mutex_t mut; + + /// Data entry for a buffer allocation: void* pointer, size_t for + /// buffer-size, atomic for the slice counter, location_id, gpu_id + using buffer_entry_t = + std::tuple, bool, const size_t, size_t>; + /// Keeps track of the aggregated buffer allocations done in all the slices + std::deque buffer_allocations; + /// Map pointer to deque index for fast access in the deallocations + std::unordered_map buffer_allocations_map; + /// For synchronizing the access to the buffer_allocations + aggregation_mutex_t buffer_mut; + std::atomic buffer_counter = 0; + + /// Get new buffer OR get buffer already allocated by different slice + template + T *get(const size_t size, const size_t slice_alloc_counter) { + assert(slices_exhausted == true); + assert(executor_wrapper); + assert(executor_slices_alive == true); + // Add aggreated buffer entry in case it hasn't happened yet for this call + // First: Check if it already has happened + if (buffer_counter <= slice_alloc_counter) { + // we might be the first! Lock... + std::lock_guard guard(buffer_mut); + // ... and recheck + if (buffer_counter <= slice_alloc_counter) { + constexpr bool manage_content_lifetime = false; + buffers_in_use = true; + + // Default location -- useful for GPU builds as we otherwise create way too + // many different buffers for different aggregation sizes on different GPUs + /* size_t location_id = gpu_id * instances_per_gpu; */ + // Use integer conversion to only use 0 16 32 ... as buckets + size_t location_id = ((hpx::get_worker_thread_num() % cppuddle::number_instances) / 16) * 16; +#ifdef CPPUDDLE_HAVE_HPX_AWARE_ALLOCATORS + if (max_slices == 1) { + // get prefered location: aka the current hpx threads location + // Usually handy for CPU builds where we want to use the buffers + // close to the current CPU core + /* location_id = (hpx::get_worker_thread_num() / instances_per_gpu) * instances_per_gpu; */ + /* location_id = (gpu_id) * instances_per_gpu; */ + // division makes sure that we always use the same instance to store our gpu buffers. + } +#endif + // Get shiny and new buffer that will be shared between all slices + // Buffer might be recycled from previous allocations by the + // buffer_interface... 
+ T *aggregated_buffer = + cppuddle::memory_recycling::detail::buffer_interface::get< + T, Host_Allocator>(size, manage_content_lifetime, location_id, + gpu_id); + // Create buffer entry for this buffer + buffer_allocations.emplace_back(static_cast(aggregated_buffer), + size, 1, true, location_id, gpu_id); + +#ifndef NDEBUG + // if previousely used the buffer should not be in usage anymore + const auto exists = buffer_allocations_map.count( + static_cast(aggregated_buffer)); + if (exists > 0) { + const auto previous_usage_id = + buffer_allocations_map[static_cast(aggregated_buffer)]; + const auto &valid = + std::get<3>(buffer_allocations[previous_usage_id]); + assert(!valid); + } +#endif + buffer_allocations_map.insert_or_assign(static_cast(aggregated_buffer), + buffer_counter); + + assert (buffer_counter == slice_alloc_counter); + buffer_counter = buffer_allocations.size(); + + // Return buffer + return aggregated_buffer; + } + } + assert(buffers_in_use == true); + assert(std::get<3>(buffer_allocations[slice_alloc_counter])); // valid + assert(std::get<2>(buffer_allocations[slice_alloc_counter]) >= 1); + + // Buffer entry should already exist: + T *aggregated_buffer = static_cast( + std::get<0>(buffer_allocations[slice_alloc_counter])); + // Error handling: Size is wrong? + assert(size == std::get<1>(buffer_allocations[slice_alloc_counter])); + // Notify that one more slice has visited this buffer allocation + std::get<2>(buffer_allocations[slice_alloc_counter])++; + return aggregated_buffer; + } + + /// Notify buffer list that one slice is done with the buffer + template + void mark_unused(T *p, const size_t size) { + assert(slices_exhausted == true); + assert(executor_wrapper); + + void *ptr_key = static_cast(p); + size_t slice_alloc_counter = buffer_allocations_map[p]; + + assert(slice_alloc_counter < buffer_allocations.size()); + /*auto &[buffer_pointer_any, buffer_size, buffer_allocation_counter, valid] = + buffer_allocations[slice_alloc_counter];*/ + auto buffer_pointer_void = std::get<0>(buffer_allocations[slice_alloc_counter]); + const auto buffer_size = std::get<1>(buffer_allocations[slice_alloc_counter]); + auto &buffer_allocation_counter = std::get<2>(buffer_allocations[slice_alloc_counter]); + auto &valid = std::get<3>(buffer_allocations[slice_alloc_counter]); + const auto &location_id = std::get<4>(buffer_allocations[slice_alloc_counter]); + const auto &gpu_id = std::get<5>(buffer_allocations[slice_alloc_counter]); + assert(valid); + T *buffer_pointer = static_cast(buffer_pointer_void); + + assert(buffer_size == size); + assert(p == buffer_pointer); + // assert(buffer_pointer == p || buffer_pointer == nullptr); + // Slice is done with this buffer + buffer_allocation_counter--; + // Check if all slices are done with this buffer? + if (buffer_allocation_counter == 0) { + // Yes! 
"Deallocate" by telling the recylcer the buffer is fit for reusage + std::lock_guard guard(buffer_mut); + // Only mark unused if another buffer has not done so already (and marked + // it as invalid) + if (valid) { + assert(buffers_in_use == true); + cppuddle::memory_recycling::detail::buffer_interface::mark_unused< + T, Host_Allocator>(buffer_pointer, buffer_size, location_id, + gpu_id); + // mark buffer as invalid to prevent any other slice from marking the + // buffer as unused + valid = false; + + const size_t current_deallocs = ++dealloc_counter; + if (current_deallocs == buffer_counter) { + std::lock_guard guard(mut); + buffers_in_use = false; + if (!executor_slices_alive && !buffers_in_use) { + slices_exhausted = false; + // Release executor + executor_wrapper.reset(nullptr); + } + } + } + } + } + + //=============================================================================== + // Public Interface +public: + hpx::lcos::future current_continuation; + hpx::lcos::future last_stream_launch_done; + std::atomic overall_launch_counter = 0; + + /// Only meant to be accessed by the slice executors + bool sync_aggregation_slices(const size_t slice_launch_counter) { + std::lock_guard guard(mut); + assert(slices_exhausted == true); + assert(executor_wrapper); + // Add function call object in case it hasn't happened for this launch yet + if (overall_launch_counter <= slice_launch_counter) { + /* std::lock_guard guard(mut); */ + if (overall_launch_counter <= slice_launch_counter) { + function_calls.emplace_back(current_slices, false, *executor_wrapper); + overall_launch_counter = function_calls.size(); + return function_calls[slice_launch_counter].sync_aggregation_slices( + last_stream_launch_done); + } + } + + return function_calls[slice_launch_counter].sync_aggregation_slices( + last_stream_launch_done); + } + + /// Only meant to be accessed by the slice executors + template + void post(const size_t slice_launch_counter, F &&f, Ts &&...ts) { + std::lock_guard guard(mut); + assert(slices_exhausted == true); + assert(executor_wrapper); + // Add function call object in case it hasn't happened for this launch yet + if (overall_launch_counter <= slice_launch_counter) { + /* std::lock_guard guard(mut); */ + if (overall_launch_counter <= slice_launch_counter) { + function_calls.emplace_back(current_slices, false, *executor_wrapper); + overall_launch_counter = function_calls.size(); + function_calls[slice_launch_counter].post_when( + last_stream_launch_done, std::forward(f), std::forward(ts)...); + return; + } + } + + function_calls[slice_launch_counter].post_when( + last_stream_launch_done, std::forward(f), std::forward(ts)...); + return; + } + + /// Only meant to be accessed by the slice executors + template + hpx::lcos::future async(const size_t slice_launch_counter, F &&f, + Ts &&...ts) { + std::lock_guard guard(mut); + assert(slices_exhausted == true); + assert(executor_wrapper); + // Add function call object in case it hasn't happened for this launch yet + if (overall_launch_counter <= slice_launch_counter) { + /* std::lock_guard guard(mut); */ + if (overall_launch_counter <= slice_launch_counter) { + function_calls.emplace_back(current_slices, true, *executor_wrapper); + overall_launch_counter = function_calls.size(); + return function_calls[slice_launch_counter].async_when( + last_stream_launch_done, std::forward(f), std::forward(ts)...); + } + } + + return function_calls[slice_launch_counter].async_when( + last_stream_launch_done, std::forward(f), std::forward(ts)...); + } + /// Only meant 
to be accessed by the slice executors + template + hpx::lcos::shared_future wrap_async(const size_t slice_launch_counter, F &&f, + Ts &&...ts) { + std::lock_guard guard(mut); + assert(slices_exhausted == true); + assert(executor_wrapper); + // Add function call object in case it hasn't happened for this launch yet + if (overall_launch_counter <= slice_launch_counter) { + /* std::lock_guard guard(mut); */ + if (overall_launch_counter <= slice_launch_counter) { + function_calls.emplace_back(current_slices, true, *executor_wrapper); + overall_launch_counter = function_calls.size(); + return function_calls[slice_launch_counter].wrap_async( + last_stream_launch_done, std::forward(f), std::forward(ts)...); + } + } + + return function_calls[slice_launch_counter].wrap_async( + last_stream_launch_done, std::forward(f), std::forward(ts)...); + } + + bool slice_available(void) { + std::lock_guard guard(mut); + return !slices_exhausted; + } + + std::optional> request_executor_slice() { + std::lock_guard guard(mut); + if (!slices_exhausted) { + const size_t local_slice_id = ++current_slices; + if (local_slice_id == 1) { + // Cleanup leftovers from last run if any + // TODO still required? Should be clean here already + function_calls.clear(); + overall_launch_counter = 0; + std::lock_guard guard(buffer_mut); +#ifndef NDEBUG + for (const auto &buffer_entry : buffer_allocations) { + const auto &[buffer_pointer_any, buffer_size, + buffer_allocation_counter, valid, location_id, device_id] = + buffer_entry; + assert(!valid); + } +#endif + buffer_allocations.clear(); + buffer_allocations_map.clear(); + buffer_counter = 0; + + assert(executor_slices_alive == false); + assert(buffers_in_use == false); + executor_slices_alive = true; + buffers_in_use = false; + dealloc_counter = 0; + + if (mode == aggregated_executor_modes::STRICT ) { + slices_full_promise = hpx::lcos::local::promise{}; + } + } + + // Create Executor Slice future -- that will be returned later + hpx::lcos::future ret_fut; + if (local_slice_id < max_slices) { + executor_slices.emplace_back(hpx::lcos::local::promise{}); + ret_fut = + executor_slices[local_slice_id - 1].get_future(); + } else { + launched_slices = current_slices; + ret_fut = hpx::make_ready_future(executor_slice{*this, + executor_slices.size(), launched_slices}); + } + + // Are we the first slice? If yes, add continuation set the + // executor_slice + // futures to ready if the launch conditions are met + if (local_slice_id == 1) { + // Redraw executor + assert(!executor_wrapper); + stream_pool::select_device>(gpu_id); + executor_wrapper.reset( + new stream_interface>(gpu_id)); + // Renew promise that all slices will be ready as the primary launch + // criteria... 
+ hpx::lcos::shared_future fut; + if (mode == aggregated_executor_modes::EAGER || + mode == aggregated_executor_modes::ENDLESS) { + // Fallback launch condidtion: Launch as soon as the underlying stream + // is ready + /* auto slices_full_fut = slices_full_promise.get_future(); */ + stream_pool::select_device>(gpu_id); + auto exec_fut = (*executor_wrapper).get_future(); + /* auto fut = hpx::when_any(exec_fut, slices_full_fut); */ + fut = std::move(exec_fut); + } else { + auto slices_full_fut = slices_full_promise.get_shared_future(); + // Just use the slices launch condition + fut = std::move(slices_full_fut); + } + // Launch all executor slices within this continuation + current_continuation = fut.then([this](auto &&fut) { + std::lock_guard guard(mut); + slices_exhausted = true; + launched_slices = current_slices; + size_t id = 0; + for (auto &slice_promise : executor_slices) { + slice_promise.set_value( + executor_slice{*this, id, launched_slices}); + id++; + } + executor_slices.clear(); + }); + } + if (local_slice_id >= max_slices && + mode != aggregated_executor_modes::ENDLESS) { + slices_exhausted = true; // prevents any more threads from entering + // before the continuation is launched + /* launched_slices = current_slices; */ + /* size_t id = 0; */ + /* for (auto &slice_promise : executor_slices) { */ + /* slice_promise.set_value( */ + /* executor_slice{*this, id, launched_slices}); */ + /* id++; */ + /* } */ + /* executor_slices.clear(); */ + if (mode == aggregated_executor_modes::STRICT ) { + slices_full_promise.set_value(); // Trigger slices launch condition continuation + } + // that continuation will set all executor slices so far handed out to ready + } + return ret_fut; + } else { + // Return empty optional as failure + return std::optional>{}; + } + } + size_t launched_slices; + void reduce_usage_counter(void) { + /* std::lock_guard guard(mut); */ + assert(slices_exhausted == true); + assert(executor_wrapper); + assert(executor_slices_alive == true); + assert(launched_slices >= 1); + assert(current_slices >= 0 && current_slices <= launched_slices); + const size_t local_slice_id = --current_slices; + // Last slice goes out scope? 
+ if (local_slice_id == 0) { + // Mark executor fit for reusage + std::lock_guard guard(mut); + executor_slices_alive = false; + if (!executor_slices_alive && !buffers_in_use) { + // Release executor + slices_exhausted = false; + executor_wrapper.reset(nullptr); + } + } + } + ~Aggregated_Executor(void) { + + assert(current_slices == 0); + assert(executor_slices_alive == false); + assert(buffers_in_use == false); + + if (mode != aggregated_executor_modes::STRICT ) { + slices_full_promise.set_value(); // Trigger slices launch condition continuation + } + + // Cleanup leftovers from last run if any + function_calls.clear(); + overall_launch_counter = 0; +#ifndef NDEBUG + for (const auto &buffer_entry : buffer_allocations) { + const auto &[buffer_pointer_any, buffer_size, buffer_allocation_counter, + valid, location_id, device_id] = buffer_entry; + assert(!valid); + } +#endif + buffer_allocations.clear(); + buffer_allocations_map.clear(); + buffer_counter = 0; + + assert(buffer_allocations.empty()); + assert(buffer_allocations_map.empty()); + } + + Aggregated_Executor(const size_t number_slices, + aggregated_executor_modes mode, const size_t gpu_id = 0) + : max_slices(number_slices), current_slices(0), slices_exhausted(false), + dealloc_counter(0), mode(mode), executor_slices_alive(false), + buffers_in_use(false), gpu_id(gpu_id), + executor_wrapper(nullptr), + current_continuation(hpx::make_ready_future()), + last_stream_launch_done(hpx::make_ready_future()) {} + // Not meant to be copied or moved + Aggregated_Executor(const Aggregated_Executor &other) = delete; + Aggregated_Executor &operator=(const Aggregated_Executor &other) = delete; + Aggregated_Executor(Aggregated_Executor &&other) = delete; + Aggregated_Executor &operator=(Aggregated_Executor &&other) = delete; +}; + +template +class allocator_slice { +private: + typename Aggregated_Executor::executor_slice &executor_reference; + Aggregated_Executor &executor_parent; + +public: + using value_type = T; + allocator_slice( + typename Aggregated_Executor::executor_slice &executor) + : executor_reference(executor), executor_parent(executor.parent) {} + template + explicit allocator_slice( + allocator_slice const &) noexcept {} + T *allocate(std::size_t n) { + T *data = executor_reference.template get(n); + return data; + } + void deallocate(T *p, std::size_t n) { + /* executor_reference.template mark_unused(p, n); */ + executor_parent.template mark_unused(p, n); + } + template + inline void construct(T *p, Args... 
args) noexcept { + // Do nothing here - we reuse the content of the last owner + } + void destroy(T *p) { + // Do nothing here - Contents will be destroyed when the buffer manager is + // destroyed, not before + } +}; +template +constexpr bool +operator==(allocator_slice const &, + allocator_slice const &) noexcept { + return false; +} +template +constexpr bool +operator!=(allocator_slice const &, + allocator_slice const &) noexcept { + return true; +} + +//=============================================================================== +//=============================================================================== +// Pool Strategy: + +template +class aggregation_pool { +public: + /// interface + template + static void init(size_t number_of_executors, size_t slices_per_executor, + aggregated_executor_modes mode, size_t num_devices = 1) { + if (is_initialized) { + throw std::runtime_error( + std::string("Trying to initialize cppuddle aggregation pool twice") + + " Agg pool name: " + std::string(kernelname)); + } + if (num_devices > cppuddle::max_number_gpus) { + throw std::runtime_error( + std::string( + "Trying to initialize aggregation with more devices than the " + "maximum number of GPUs given at compiletime") + + " Agg pool name: " + std::string(kernelname)); + } + number_devices = num_devices; + for (size_t gpu_id = 0; gpu_id < number_devices; gpu_id++) { + + std::lock_guard guard(instance()[gpu_id].pool_mutex); + assert(instance()[gpu_id].aggregation_executor_pool.empty()); + for (int i = 0; i < number_of_executors; i++) { + instance()[gpu_id].aggregation_executor_pool.emplace_back(slices_per_executor, + mode, gpu_id); + } + instance()[gpu_id].slices_per_executor = slices_per_executor; + instance()[gpu_id].mode = mode; + } + is_initialized = true; + } + + /// Will always return a valid executor slice + static decltype(auto) request_executor_slice(void) { + if (!is_initialized) { + throw std::runtime_error( + std::string("Trying to use cppuddle aggregation pool without first calling init") + + " Agg poolname: " + std::string(kernelname)); + } + const size_t gpu_id = cppuddle::get_device_id(number_devices); + /* const size_t gpu_id = 1; */ + std::lock_guard guard(instance()[gpu_id].pool_mutex); + assert(!instance()[gpu_id].aggregation_executor_pool.empty()); + std::optional::executor_slice>> + ret; + size_t local_id = (instance()[gpu_id].current_interface) % + instance()[gpu_id].aggregation_executor_pool.size(); + ret = instance()[gpu_id].aggregation_executor_pool[local_id].request_executor_slice(); + // Expected case: current aggregation executor is free + if (ret.has_value()) { + return ret; + } + // current interface is bad -> find free one + size_t abort_counter = 0; + const size_t abort_number = instance()[gpu_id].aggregation_executor_pool.size() + 1; + do { + local_id = (++(instance()[gpu_id].current_interface)) % // increment interface + instance()[gpu_id].aggregation_executor_pool.size(); + ret = + instance()[gpu_id].aggregation_executor_pool[local_id].request_executor_slice(); + if (ret.has_value()) { + return ret; + } + abort_counter++; + } while (abort_counter <= abort_number); + // Everything's busy -> create new aggregation executor (growing pool) OR + // return empty optional + if (instance()[gpu_id].growing_pool) { + instance()[gpu_id].aggregation_executor_pool.emplace_back( + instance()[gpu_id].slices_per_executor, instance()[gpu_id].mode, gpu_id); + instance()[gpu_id].current_interface = + instance()[gpu_id].aggregation_executor_pool.size() - 1; + 
assert(instance()[gpu_id].aggregation_executor_pool.size() < 20480); + ret = instance()[gpu_id] + .aggregation_executor_pool[instance()[gpu_id].current_interface] + .request_executor_slice(); + assert(ret.has_value()); // fresh executor -- should always have slices + // available + } + return ret; + } + +private: + std::deque> aggregation_executor_pool; + std::atomic current_interface{0}; + size_t slices_per_executor; + aggregated_executor_modes mode; + bool growing_pool{true}; + +private: + /// Required for dealing with adding elements to the deque of + /// aggregated_executors + aggregation_mutex_t pool_mutex; + /// Global access instance + static std::unique_ptr& instance(void) { + static std::unique_ptr pool_instances{ + new aggregation_pool[cppuddle::max_number_gpus]}; + return pool_instances; + } + static inline size_t number_devices = 1; + static inline bool is_initialized = false; + aggregation_pool() = default; + +public: + ~aggregation_pool() = default; + // Bunch of constructors we don't need + aggregation_pool(aggregation_pool const &other) = delete; + aggregation_pool &operator=(aggregation_pool const &other) = delete; + aggregation_pool(aggregation_pool &&other) = delete; + aggregation_pool &operator=(aggregation_pool &&other) = delete; +}; + +} // namespace kernel_aggregation +} // namespace cppuddle + +namespace hpx { namespace parallel { namespace execution { + // TODO Unfortunately does not work that way! Create trait that works for Executor Slices with + // compatible unlying executor types + /* template */ + /* struct is_one_way_executor::executor_slice> */ + /* : std::true_type */ + /* {}; */ + /* template */ + /* struct is_two_way_executor::executor_slice> */ + /* : std::true_type */ + /* {}; */ + +#if defined(HPX_HAVE_CUDA) || defined(HPX_HAVE_HIP) + // Workaround for the meantime: Manually create traits for compatible types: +template <> +struct is_one_way_executor< + typename cppuddle::kernel_aggregation::Aggregated_Executor< + hpx::cuda::experimental::cuda_executor>::executor_slice> + : std::true_type {}; +template <> +struct is_two_way_executor< + typename cppuddle::kernel_aggregation::Aggregated_Executor< + hpx::cuda::experimental::cuda_executor>::executor_slice> + : std::true_type {}; +#endif +}}} + +#endif From 54373c3e7d56366080850cad0e1f23d3dae4ec07 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20Dai=C3=9F?= Date: Fri, 8 Mar 2024 22:45:02 +0100 Subject: [PATCH 11/19] Work aggregation interface refactoring --- include/aggregation_manager.hpp | 22 ++- .../detail/aggregation_executor_pools.hpp | 134 ++++++++++++++ .../aggregation_executors_and_allocators.hpp} | 163 +++--------------- .../kernel_aggregation_interface.hpp | 34 ++++ tests/work_aggregation_cpu_triad.cpp | 16 +- 5 files changed, 215 insertions(+), 154 deletions(-) create mode 100644 include/cppuddle/kernel_aggregation/detail/aggregation_executor_pools.hpp rename include/cppuddle/kernel_aggregation/{kernel_aggregation_management.hpp => detail/aggregation_executors_and_allocators.hpp} (87%) create mode 100644 include/cppuddle/kernel_aggregation/kernel_aggregation_interface.hpp diff --git a/include/aggregation_manager.hpp b/include/aggregation_manager.hpp index 030150f9..bb0fd83f 100644 --- a/include/aggregation_manager.hpp +++ b/include/aggregation_manager.hpp @@ -6,21 +6,29 @@ #ifndef AGGREGATION_MANAGER_HPP #define AGGREGATION_MANAGER_HPP -#include "cppuddle/kernel_aggregation/kernel_aggregation_management.hpp" +#include "cppuddle/kernel_aggregation/kernel_aggregation_interface.hpp" -using 
Aggregated_Executor_Modes = - cppuddle::kernel_aggregation::aggregated_executor_modes; +using Aggregated_Executor_Modes + [[deprecated("Use cppuddle::kernel_aggregation::aggregated_executor_modes " + "from kernel_aggregation_interface.hpp instead")]] = + cppuddle::kernel_aggregation::aggregated_executor_modes; template -using Allocator_Slice = +using Allocator_Slice + [[deprecated("Use cppuddle::kernel_aggregation::allocator_slice " + "from kernel_aggregation_interface.hpp instead")]] = cppuddle::kernel_aggregation::allocator_slice; template -using Aggregated_Executor = - cppuddle::kernel_aggregation::Aggregated_Executor; +using Aggregated_Executor + [[deprecated("Use cppuddle::kernel_aggregation::aggregated_executor " + "from kernel_aggregation_interface.hpp instead")]] = + cppuddle::kernel_aggregation::aggregated_executor; template -using aggregation_pool = +using aggregation_pool + [[deprecated("Use cppuddle::kernel_aggregation::aggregation_pool " + "from kernel_aggregation_interface.hpp instead")]] = cppuddle::kernel_aggregation::aggregation_pool; diff --git a/include/cppuddle/kernel_aggregation/detail/aggregation_executor_pools.hpp b/include/cppuddle/kernel_aggregation/detail/aggregation_executor_pools.hpp new file mode 100644 index 00000000..b9d456cc --- /dev/null +++ b/include/cppuddle/kernel_aggregation/detail/aggregation_executor_pools.hpp @@ -0,0 +1,134 @@ +// Copyright (c) 2022-2024 Gregor Daiß +// +// Distributed under the Boost Software License, Version 1.0. (See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include "cppuddle/kernel_aggregation/detail/aggregation_executors_and_allocators.hpp" + +#ifndef AGGREGATION_EXECUTOR_POOL_HPP +#define AGGREGATION_EXECUTOR_POOL_HPP + +namespace cppuddle { +namespace kernel_aggregation { +namespace detail { + +template +class aggregation_pool { +public: + /// interface + template + static void init(size_t number_of_executors, size_t slices_per_executor, + aggregated_executor_modes mode, size_t num_devices = 1) { + if (is_initialized) { + throw std::runtime_error( + std::string("Trying to initialize cppuddle aggregation pool twice") + + " Agg pool name: " + std::string(kernelname)); + } + if (num_devices > cppuddle::max_number_gpus) { + throw std::runtime_error( + std::string( + "Trying to initialize aggregation with more devices than the " + "maximum number of GPUs given at compiletime") + + " Agg pool name: " + std::string(kernelname)); + } + number_devices = num_devices; + for (size_t gpu_id = 0; gpu_id < number_devices; gpu_id++) { + + std::lock_guard guard(instance()[gpu_id].pool_mutex); + assert(instance()[gpu_id].aggregation_executor_pool.empty()); + for (int i = 0; i < number_of_executors; i++) { + instance()[gpu_id].aggregation_executor_pool.emplace_back(slices_per_executor, + mode, gpu_id); + } + instance()[gpu_id].slices_per_executor = slices_per_executor; + instance()[gpu_id].mode = mode; + } + is_initialized = true; + } + + /// Will always return a valid executor slice + static decltype(auto) request_executor_slice(void) { + if (!is_initialized) { + throw std::runtime_error( + std::string("Trying to use cppuddle aggregation pool without first calling init") + + " Agg poolname: " + std::string(kernelname)); + } + const size_t gpu_id = cppuddle::get_device_id(number_devices); + /* const size_t gpu_id = 1; */ + std::lock_guard guard(instance()[gpu_id].pool_mutex); + assert(!instance()[gpu_id].aggregation_executor_pool.empty()); + std::optional::executor_slice>> + ret; + size_t local_id 
= (instance()[gpu_id].current_interface) % + instance()[gpu_id].aggregation_executor_pool.size(); + ret = instance()[gpu_id].aggregation_executor_pool[local_id].request_executor_slice(); + // Expected case: current aggregation executor is free + if (ret.has_value()) { + return ret; + } + // current interface is bad -> find free one + size_t abort_counter = 0; + const size_t abort_number = instance()[gpu_id].aggregation_executor_pool.size() + 1; + do { + local_id = (++(instance()[gpu_id].current_interface)) % // increment interface + instance()[gpu_id].aggregation_executor_pool.size(); + ret = + instance()[gpu_id].aggregation_executor_pool[local_id].request_executor_slice(); + if (ret.has_value()) { + return ret; + } + abort_counter++; + } while (abort_counter <= abort_number); + // Everything's busy -> create new aggregation executor (growing pool) OR + // return empty optional + if (instance()[gpu_id].growing_pool) { + instance()[gpu_id].aggregation_executor_pool.emplace_back( + instance()[gpu_id].slices_per_executor, instance()[gpu_id].mode, gpu_id); + instance()[gpu_id].current_interface = + instance()[gpu_id].aggregation_executor_pool.size() - 1; + assert(instance()[gpu_id].aggregation_executor_pool.size() < 20480); + ret = instance()[gpu_id] + .aggregation_executor_pool[instance()[gpu_id].current_interface] + .request_executor_slice(); + assert(ret.has_value()); // fresh executor -- should always have slices + // available + } + return ret; + } + +private: + std::deque> aggregation_executor_pool; + std::atomic current_interface{0}; + size_t slices_per_executor; + aggregated_executor_modes mode; + bool growing_pool{true}; + +private: + /// Required for dealing with adding elements to the deque of + /// aggregated_executors + aggregation_mutex_t pool_mutex; + /// Global access instance + static std::unique_ptr& instance(void) { + static std::unique_ptr pool_instances{ + new aggregation_pool[cppuddle::max_number_gpus]}; + return pool_instances; + } + static inline size_t number_devices = 1; + static inline bool is_initialized = false; + aggregation_pool() = default; + +public: + ~aggregation_pool() = default; + // Bunch of constructors we don't need + aggregation_pool(aggregation_pool const &other) = delete; + aggregation_pool &operator=(aggregation_pool const &other) = delete; + aggregation_pool(aggregation_pool &&other) = delete; + aggregation_pool &operator=(aggregation_pool &&other) = delete; +}; + +} // namespace detail +} // namespace kernel_aggregation +} // namespace cppuddle + +#endif diff --git a/include/cppuddle/kernel_aggregation/kernel_aggregation_management.hpp b/include/cppuddle/kernel_aggregation/detail/aggregation_executors_and_allocators.hpp similarity index 87% rename from include/cppuddle/kernel_aggregation/kernel_aggregation_management.hpp rename to include/cppuddle/kernel_aggregation/detail/aggregation_executors_and_allocators.hpp index fd5a8e77..5826c2c3 100644 --- a/include/cppuddle/kernel_aggregation/kernel_aggregation_management.hpp +++ b/include/cppuddle/kernel_aggregation/detail/aggregation_executors_and_allocators.hpp @@ -3,8 +3,8 @@ // Distributed under the Boost Software License, Version 1.0. 
(See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) -#ifndef KERNEL_AGGREGATION_MANAGEMENT_HPP -#define KERNEL_AGGREGATION_MANAGEMENT_HPP +#ifndef AGGREGATION_EXECUTOR_AND_ALLOCATOR_HPP +#define AGGREGATION_EXECUTOR_AND_ALLOCATOR_HPP #ifndef CPPUDDLE_HAVE_HPX #error "Work aggregation allocators/executors require CPPUDDLE_WITH_HPX=ON" @@ -60,6 +60,7 @@ #endif namespace cppuddle { namespace kernel_aggregation { +namespace detail { using aggregation_mutex_t = hpx::mutex; //=============================================================================== @@ -381,10 +382,10 @@ class allocator_slice; /// Executor Class that aggregates function calls for specific kernels /** Executor is not meant to be used directly. Instead it yields multiple * executor_slice objects. These serve as interfaces. Slices from the same - * Aggregated_Executor are meant to execute the same function calls but on + * aggregated_executor are meant to execute the same function calls but on * different data (i.e. different tasks) */ -template class Aggregated_Executor { +template class aggregated_executor { private: //=============================================================================== // Misc private avariables: @@ -410,7 +411,7 @@ template class Aggregated_Executor { /// Slice class - meant as a scope interface to the aggregated executor class executor_slice { public: - Aggregated_Executor &parent; + aggregated_executor &parent; private: /// Executor is a slice of this aggregated_executor /// How many functions have been called - required to enforce sequential @@ -425,7 +426,7 @@ template class Aggregated_Executor { const size_t number_slices; const size_t id; using executor_t = Executor; - executor_slice(Aggregated_Executor &parent, const size_t slice_id, + executor_slice(aggregated_executor &parent, const size_t slice_id, const size_t number_slices) : parent(parent), notify_parent_about_destruction(true), number_slices(number_slices), id(slice_id) { @@ -536,7 +537,7 @@ template class Aggregated_Executor { }; // deprecated name... 
- /* using Executor_Slice = executor_slice; */ + using Executor_Slice [[deprectated("Renamed: Use executor_slice instead")]] = executor_slice; //=============================================================================== @@ -922,7 +923,7 @@ template class Aggregated_Executor { } } } - ~Aggregated_Executor(void) { + ~aggregated_executor(void) { assert(current_slices == 0); assert(executor_slices_alive == false); @@ -950,7 +951,7 @@ template class Aggregated_Executor { assert(buffer_allocations_map.empty()); } - Aggregated_Executor(const size_t number_slices, + aggregated_executor(const size_t number_slices, aggregated_executor_modes mode, const size_t gpu_id = 0) : max_slices(number_slices), current_slices(0), slices_exhausted(false), dealloc_counter(0), mode(mode), executor_slices_alive(false), @@ -959,22 +960,22 @@ template class Aggregated_Executor { current_continuation(hpx::make_ready_future()), last_stream_launch_done(hpx::make_ready_future()) {} // Not meant to be copied or moved - Aggregated_Executor(const Aggregated_Executor &other) = delete; - Aggregated_Executor &operator=(const Aggregated_Executor &other) = delete; - Aggregated_Executor(Aggregated_Executor &&other) = delete; - Aggregated_Executor &operator=(Aggregated_Executor &&other) = delete; + aggregated_executor(const aggregated_executor &other) = delete; + aggregated_executor &operator=(const aggregated_executor &other) = delete; + aggregated_executor(aggregated_executor &&other) = delete; + aggregated_executor &operator=(aggregated_executor &&other) = delete; }; template class allocator_slice { private: - typename Aggregated_Executor::executor_slice &executor_reference; - Aggregated_Executor &executor_parent; + typename aggregated_executor::executor_slice &executor_reference; + aggregated_executor &executor_parent; public: using value_type = T; allocator_slice( - typename Aggregated_Executor::executor_slice &executor) + typename aggregated_executor::executor_slice &executor) : executor_reference(executor), executor_parent(executor.parent) {} template explicit allocator_slice( @@ -1009,137 +1010,21 @@ operator!=(allocator_slice const &, return true; } -//=============================================================================== -//=============================================================================== -// Pool Strategy: - -template -class aggregation_pool { -public: - /// interface - template - static void init(size_t number_of_executors, size_t slices_per_executor, - aggregated_executor_modes mode, size_t num_devices = 1) { - if (is_initialized) { - throw std::runtime_error( - std::string("Trying to initialize cppuddle aggregation pool twice") + - " Agg pool name: " + std::string(kernelname)); - } - if (num_devices > cppuddle::max_number_gpus) { - throw std::runtime_error( - std::string( - "Trying to initialize aggregation with more devices than the " - "maximum number of GPUs given at compiletime") + - " Agg pool name: " + std::string(kernelname)); - } - number_devices = num_devices; - for (size_t gpu_id = 0; gpu_id < number_devices; gpu_id++) { - - std::lock_guard guard(instance()[gpu_id].pool_mutex); - assert(instance()[gpu_id].aggregation_executor_pool.empty()); - for (int i = 0; i < number_of_executors; i++) { - instance()[gpu_id].aggregation_executor_pool.emplace_back(slices_per_executor, - mode, gpu_id); - } - instance()[gpu_id].slices_per_executor = slices_per_executor; - instance()[gpu_id].mode = mode; - } - is_initialized = true; - } - - /// Will always return a valid executor slice - static 
decltype(auto) request_executor_slice(void) { - if (!is_initialized) { - throw std::runtime_error( - std::string("Trying to use cppuddle aggregation pool without first calling init") + - " Agg poolname: " + std::string(kernelname)); - } - const size_t gpu_id = cppuddle::get_device_id(number_devices); - /* const size_t gpu_id = 1; */ - std::lock_guard guard(instance()[gpu_id].pool_mutex); - assert(!instance()[gpu_id].aggregation_executor_pool.empty()); - std::optional::executor_slice>> - ret; - size_t local_id = (instance()[gpu_id].current_interface) % - instance()[gpu_id].aggregation_executor_pool.size(); - ret = instance()[gpu_id].aggregation_executor_pool[local_id].request_executor_slice(); - // Expected case: current aggregation executor is free - if (ret.has_value()) { - return ret; - } - // current interface is bad -> find free one - size_t abort_counter = 0; - const size_t abort_number = instance()[gpu_id].aggregation_executor_pool.size() + 1; - do { - local_id = (++(instance()[gpu_id].current_interface)) % // increment interface - instance()[gpu_id].aggregation_executor_pool.size(); - ret = - instance()[gpu_id].aggregation_executor_pool[local_id].request_executor_slice(); - if (ret.has_value()) { - return ret; - } - abort_counter++; - } while (abort_counter <= abort_number); - // Everything's busy -> create new aggregation executor (growing pool) OR - // return empty optional - if (instance()[gpu_id].growing_pool) { - instance()[gpu_id].aggregation_executor_pool.emplace_back( - instance()[gpu_id].slices_per_executor, instance()[gpu_id].mode, gpu_id); - instance()[gpu_id].current_interface = - instance()[gpu_id].aggregation_executor_pool.size() - 1; - assert(instance()[gpu_id].aggregation_executor_pool.size() < 20480); - ret = instance()[gpu_id] - .aggregation_executor_pool[instance()[gpu_id].current_interface] - .request_executor_slice(); - assert(ret.has_value()); // fresh executor -- should always have slices - // available - } - return ret; - } - -private: - std::deque> aggregation_executor_pool; - std::atomic current_interface{0}; - size_t slices_per_executor; - aggregated_executor_modes mode; - bool growing_pool{true}; - -private: - /// Required for dealing with adding elements to the deque of - /// aggregated_executors - aggregation_mutex_t pool_mutex; - /// Global access instance - static std::unique_ptr& instance(void) { - static std::unique_ptr pool_instances{ - new aggregation_pool[cppuddle::max_number_gpus]}; - return pool_instances; - } - static inline size_t number_devices = 1; - static inline bool is_initialized = false; - aggregation_pool() = default; - -public: - ~aggregation_pool() = default; - // Bunch of constructors we don't need - aggregation_pool(aggregation_pool const &other) = delete; - aggregation_pool &operator=(aggregation_pool const &other) = delete; - aggregation_pool(aggregation_pool &&other) = delete; - aggregation_pool &operator=(aggregation_pool &&other) = delete; -}; - +} // namespace detail } // namespace kernel_aggregation } // namespace cppuddle + + namespace hpx { namespace parallel { namespace execution { // TODO Unfortunately does not work that way! 
Create trait that works for Executor Slices with // compatible unlying executor types /* template */ - /* struct is_one_way_executor::executor_slice> */ + /* struct is_one_way_executor::executor_slice> */ /* : std::true_type */ /* {}; */ /* template */ - /* struct is_two_way_executor::executor_slice> */ + /* struct is_two_way_executor::executor_slice> */ /* : std::true_type */ /* {}; */ @@ -1147,12 +1032,12 @@ namespace hpx { namespace parallel { namespace execution { // Workaround for the meantime: Manually create traits for compatible types: template <> struct is_one_way_executor< - typename cppuddle::kernel_aggregation::Aggregated_Executor< + typename cppuddle::kernel_aggregation::detail::aggregated_executor< hpx::cuda::experimental::cuda_executor>::executor_slice> : std::true_type {}; template <> struct is_two_way_executor< - typename cppuddle::kernel_aggregation::Aggregated_Executor< + typename cppuddle::kernel_aggregation::detail::aggregated_executor< hpx::cuda::experimental::cuda_executor>::executor_slice> : std::true_type {}; #endif diff --git a/include/cppuddle/kernel_aggregation/kernel_aggregation_interface.hpp b/include/cppuddle/kernel_aggregation/kernel_aggregation_interface.hpp new file mode 100644 index 00000000..c7a3b633 --- /dev/null +++ b/include/cppuddle/kernel_aggregation/kernel_aggregation_interface.hpp @@ -0,0 +1,34 @@ +// Copyright (c) 2024 Gregor Daiß +// +// Distributed under the Boost Software License, Version 1.0. (See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#ifndef KERNEL_AGGREGATION_INTERFACE_HPP +#define KERNEL_AGGREGATION_INTERFACE_HPP + +#include "cppuddle/kernel_aggregation/detail/aggregation_executors_and_allocators.hpp" +#include "cppuddle/kernel_aggregation/detail/aggregation_executor_pools.hpp" + +namespace cppuddle { +namespace kernel_aggregation { + +using aggregated_executor_modes = + cppuddle::kernel_aggregation::detail::aggregated_executor_modes; + +template +using allocator_slice = + cppuddle::kernel_aggregation::detail::allocator_slice; + +template +using aggregated_executor = + cppuddle::kernel_aggregation::detail::aggregated_executor; + +template +using aggregation_pool = + cppuddle::kernel_aggregation::detail::aggregation_pool; + +} // namespace kernel_aggregation +} // namespace cppuddle + +#endif diff --git a/tests/work_aggregation_cpu_triad.cpp b/tests/work_aggregation_cpu_triad.cpp index fed34626..7bb455b0 100644 --- a/tests/work_aggregation_cpu_triad.cpp +++ b/tests/work_aggregation_cpu_triad.cpp @@ -5,9 +5,8 @@ #include #undef NDEBUG - -#include "../include/aggregation_manager.hpp" -#include "../include/cuda_buffer_util.hpp" +#include "cppuddle/memory_recycling/cuda_recycling_allocators.hpp" +#include "cppuddle/kernel_aggregation/kernel_aggregation_interface.hpp" #include @@ -101,7 +100,8 @@ int hpx_main(int argc, char *argv[]) { size_t number_underlying_executors{0}; bool print_launch_counter{false}; std::string executor_type_string{}; - Aggregated_Executor_Modes executor_mode{Aggregated_Executor_Modes::EAGER}; + cppuddle::kernel_aggregation::aggregated_executor_modes executor_mode{ + cppuddle::kernel_aggregation::aggregated_executor_modes::EAGER}; std::string filename{}; { try { @@ -161,11 +161,11 @@ int hpx_main(int argc, char *argv[]) { return hpx::finalize(); } if (executor_type_string == "EAGER") { - executor_mode = Aggregated_Executor_Modes::EAGER; + executor_mode = cppuddle::kernel_aggregation::aggregated_executor_modes::EAGER; } else if (executor_type_string == "STRICT") { - 
executor_mode = Aggregated_Executor_Modes::STRICT; + executor_mode = cppuddle::kernel_aggregation::aggregated_executor_modes::STRICT; } else if (executor_type_string == "ENDLESS") { - executor_mode = Aggregated_Executor_Modes::ENDLESS; + executor_mode = cppuddle::kernel_aggregation::aggregated_executor_modes::ENDLESS; } else { std::cerr << "ERROR: Unknown executor mode " << executor_type_string << "\n Valid choices are: EAGER,STRICT,ENDLESS" << std::endl; @@ -183,7 +183,7 @@ int hpx_main(int argc, char *argv[]) { stream_pool::init>( number_underlying_executors); static const char kernelname[] = "cpu_triad"; - using executor_pool = aggregation_pool>; executor_pool::init(number_aggregation_executors, max_slices, executor_mode); From e086ab666921e558169229dc1527f1409a258490 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20Dai=C3=9F?= Date: Sat, 9 Mar 2024 01:21:01 +0100 Subject: [PATCH 12/19] Separate headers for underlying allocators --- .../cuda_recycling_allocators.hpp | 92 +-------------- .../detail/cuda_underlying_allocators.hpp | 101 +++++++++++++++++ .../detail/hip_underlying_allocators.hpp | 107 ++++++++++++++++++ .../detail/sycl_underlying_allocators.hpp | 74 ++++++++++++ .../hip_recycling_allocators.hpp | 98 +--------------- .../sycl_recycling_allocators.hpp | 64 +---------- 6 files changed, 290 insertions(+), 246 deletions(-) create mode 100644 include/cppuddle/memory_recycling/detail/cuda_underlying_allocators.hpp create mode 100644 include/cppuddle/memory_recycling/detail/hip_underlying_allocators.hpp create mode 100644 include/cppuddle/memory_recycling/detail/sycl_underlying_allocators.hpp diff --git a/include/cppuddle/memory_recycling/cuda_recycling_allocators.hpp b/include/cppuddle/memory_recycling/cuda_recycling_allocators.hpp index 911948a3..7297955f 100644 --- a/include/cppuddle/memory_recycling/cuda_recycling_allocators.hpp +++ b/include/cppuddle/memory_recycling/cuda_recycling_allocators.hpp @@ -6,100 +6,14 @@ #ifndef CUDA_RECYCLING_ALLOCATORS_HPP #define CUDA_RECYCLING_ALLOCATORS_HPP -#include -#include -#include - #include "buffer_management_interface.hpp" +// import cuda_pinned_allocator and cuda_device_allocator +#include "detail/cuda_underlying_allocators.hpp" namespace cppuddle { namespace memory_recycling { -namespace detail { -/// Underlying host allocator for CUDA pinned memory -template struct cuda_pinned_allocator { - using value_type = T; - cuda_pinned_allocator() noexcept = default; - template - explicit cuda_pinned_allocator(cuda_pinned_allocator const &) noexcept {} - T *allocate(std::size_t n) { - T *data; - cudaError_t error = - cudaMallocHost(reinterpret_cast(&data), n * sizeof(T)); - if (error != cudaSuccess) { - std::string msg = - std::string( - "cuda_pinned_allocator failed due to cudaMallocHost failure : ") + - std::string(cudaGetErrorString(error)); - throw std::runtime_error(msg); - } - return data; - } - void deallocate(T *p, std::size_t n) { - cudaError_t error = cudaFreeHost(p); - if (error != cudaSuccess) { - std::string msg = - std::string( - "cuda_pinned_allocator failed due to cudaFreeHost failure : ") + - std::string(cudaGetErrorString(error)); - throw std::runtime_error(msg); - } - } -}; - -template -constexpr bool operator==(cuda_pinned_allocator const &, - cuda_pinned_allocator const &) noexcept { - return true; -} -template -constexpr bool operator!=(cuda_pinned_allocator const &, - cuda_pinned_allocator const &) noexcept { - return false; -} - -/// Underlying allocator for CUDA device memory -template struct cuda_device_allocator { - 
using value_type = T; - cuda_device_allocator() noexcept = default; - template - explicit cuda_device_allocator(cuda_device_allocator const &) noexcept {} - T *allocate(std::size_t n) { - T *data; - cudaError_t error = cudaMalloc(&data, n * sizeof(T)); - if (error != cudaSuccess) { - std::string msg = - std::string( - "cuda_device_allocator failed due to cudaMalloc failure : ") + - std::string(cudaGetErrorString(error)); - throw std::runtime_error(msg); - } - return data; - } - void deallocate(T *p, std::size_t n) { - cudaError_t error = cudaFree(p); - if (error != cudaSuccess) { - std::string msg = - std::string( - "cuda_device_allocator failed due to cudaFree failure : ") + - std::string(cudaGetErrorString(error)); - throw std::runtime_error(msg); - } - } -}; -template -constexpr bool operator==(cuda_device_allocator const &, - cuda_device_allocator const &) noexcept { - return true; -} -template -constexpr bool operator!=(cuda_device_allocator const &, - cuda_device_allocator const &) noexcept { - return false; -} -} // end namespace detail - - +// Tell cppuddle how to select the device for the cuda allocators namespace device_selection { /// GPU device selector using the CUDA API for pinned host allocations template diff --git a/include/cppuddle/memory_recycling/detail/cuda_underlying_allocators.hpp b/include/cppuddle/memory_recycling/detail/cuda_underlying_allocators.hpp new file mode 100644 index 00000000..ab1f8681 --- /dev/null +++ b/include/cppuddle/memory_recycling/detail/cuda_underlying_allocators.hpp @@ -0,0 +1,101 @@ +// Copyright (c) 2020-2024 Gregor Daiß +// +// Distributed under the Boost Software License, Version 1.0. (See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#ifndef CUDA_UNDERLYING_ALLOCATORS_HPP +#define CUDA_UNDERLYING_ALLOCATORS_HPP + +#include +#include +#include + +namespace cppuddle { +namespace memory_recycling { +namespace detail { +/// Underlying host allocator for CUDA pinned memory +template struct cuda_pinned_allocator { + using value_type = T; + cuda_pinned_allocator() noexcept = default; + template + explicit cuda_pinned_allocator(cuda_pinned_allocator const &) noexcept {} + T *allocate(std::size_t n) { + T *data; + cudaError_t error = + cudaMallocHost(reinterpret_cast(&data), n * sizeof(T)); + if (error != cudaSuccess) { + std::string msg = + std::string( + "cuda_pinned_allocator failed due to cudaMallocHost failure : ") + + std::string(cudaGetErrorString(error)); + throw std::runtime_error(msg); + } + return data; + } + void deallocate(T *p, std::size_t n) { + cudaError_t error = cudaFreeHost(p); + if (error != cudaSuccess) { + std::string msg = + std::string( + "cuda_pinned_allocator failed due to cudaFreeHost failure : ") + + std::string(cudaGetErrorString(error)); + throw std::runtime_error(msg); + } + } +}; + +template +constexpr bool operator==(cuda_pinned_allocator const &, + cuda_pinned_allocator const &) noexcept { + return true; +} +template +constexpr bool operator!=(cuda_pinned_allocator const &, + cuda_pinned_allocator const &) noexcept { + return false; +} + +/// Underlying allocator for CUDA device memory +template struct cuda_device_allocator { + using value_type = T; + cuda_device_allocator() noexcept = default; + template + explicit cuda_device_allocator(cuda_device_allocator const &) noexcept {} + T *allocate(std::size_t n) { + T *data; + cudaError_t error = cudaMalloc(&data, n * sizeof(T)); + if (error != cudaSuccess) { + std::string msg = + std::string( + "cuda_device_allocator failed 
due to cudaMalloc failure : ") + + std::string(cudaGetErrorString(error)); + throw std::runtime_error(msg); + } + return data; + } + void deallocate(T *p, std::size_t n) { + cudaError_t error = cudaFree(p); + if (error != cudaSuccess) { + std::string msg = + std::string( + "cuda_device_allocator failed due to cudaFree failure : ") + + std::string(cudaGetErrorString(error)); + throw std::runtime_error(msg); + } + } +}; +template +constexpr bool operator==(cuda_device_allocator const &, + cuda_device_allocator const &) noexcept { + return true; +} +template +constexpr bool operator!=(cuda_device_allocator const &, + cuda_device_allocator const &) noexcept { + return false; +} +} // end namespace detail +} // namespace memory_recycling +} // end namespace cppuddle + +#endif diff --git a/include/cppuddle/memory_recycling/detail/hip_underlying_allocators.hpp b/include/cppuddle/memory_recycling/detail/hip_underlying_allocators.hpp new file mode 100644 index 00000000..6668feaf --- /dev/null +++ b/include/cppuddle/memory_recycling/detail/hip_underlying_allocators.hpp @@ -0,0 +1,107 @@ +// Copyright (c) 2020-2024 Gregor Daiß +// +// Distributed under the Boost Software License, Version 1.0. (See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#ifndef HIP_UNDERLYING_ALLOCATORS_HPP +#define HIP_UNDERLYING_ALLOCATORS_HPP + +#include +#include +#include + +namespace cppuddle { +namespace memory_recycling { +namespace detail { +/// Underlying host allocator for HIP pinned memory +template struct hip_pinned_allocator { + using value_type = T; + hip_pinned_allocator() noexcept = default; + template + explicit hip_pinned_allocator(hip_pinned_allocator const &) noexcept {} + T *allocate(std::size_t n) { + T *data; + // hipError_t error = + // hipMallocHost(reinterpret_cast(&data), n * sizeof(T)); + + // Even though marked as deprecated, the HIP docs recommend using hipHostMalloc + // (not hipMallocHost) for async memcpys + // https://rocmdocs.amd.com/en/latest/ROCm_API_References/HIP_API/Memory-Management.html#hipmemcpyasync + hipError_t error = + hipHostMalloc(reinterpret_cast(&data), n * sizeof(T)); + if (error != hipSuccess) { + std::string msg = + std::string( + "hip_pinned_allocator failed due to hipMallocHost failure : ") + + std::string(hipGetErrorString(error)); + throw std::runtime_error(msg); + } + return data; + } + void deallocate(T *p, std::size_t n) { + hipError_t error = hipHostFree(p); + if (error != hipSuccess) { + std::string msg = + std::string( + "hip_pinned_allocator failed due to hipFreeHost failure : ") + + std::string(hipGetErrorString(error)); + throw std::runtime_error(msg); + } + } +}; +template +constexpr bool operator==(hip_pinned_allocator const &, + hip_pinned_allocator const &) noexcept { + return true; +} +template +constexpr bool operator!=(hip_pinned_allocator const &, + hip_pinned_allocator const &) noexcept { + return false; +} + +/// Underlying allocator for HIP device memory +template struct hip_device_allocator { + using value_type = T; + hip_device_allocator() noexcept = default; + template + explicit hip_device_allocator(hip_device_allocator const &) noexcept {} + T *allocate(std::size_t n) { + T *data; + hipError_t error = hipMalloc(&data, n * sizeof(T)); + if (error != hipSuccess) { + std::string msg = + std::string( + "hip_device_allocator failed due to hipMalloc failure : ") + + std::string(hipGetErrorString(error)); + throw std::runtime_error(msg); + } + return data; + } + void deallocate(T *p, std::size_t n) { + 
hipError_t error = hipFree(p); + if (error != hipSuccess) { + std::string msg = + std::string( + "hip_device_allocator failed due to hipFree failure : ") + + std::string(hipGetErrorString(error)); + throw std::runtime_error(msg); + } + } +}; +template +constexpr bool operator==(hip_device_allocator const &, + hip_device_allocator const &) noexcept { + return true; +} +template +constexpr bool operator!=(hip_device_allocator const &, + hip_device_allocator const &) noexcept { + return false; +} + +} // end namespace detail +} // namespace memory_recycling +} // end namespace cppuddle + +#endif diff --git a/include/cppuddle/memory_recycling/detail/sycl_underlying_allocators.hpp b/include/cppuddle/memory_recycling/detail/sycl_underlying_allocators.hpp new file mode 100644 index 00000000..1597eee7 --- /dev/null +++ b/include/cppuddle/memory_recycling/detail/sycl_underlying_allocators.hpp @@ -0,0 +1,74 @@ +// Copyright (c) 2020-2024 Gregor Daiß +// +// Distributed under the Boost Software License, Version 1.0. (See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#ifndef SYCL_UNDERLYING_ALLOCATORS_HPP +#define SYCL_UNDERLYING_ALLOCATORS_HPP + +#include +#include +#include + +namespace cppuddle { +namespace memory_recycling { +namespace detail { +/// Underlying host allocator for SYCL pinned memory (using the sycl::default_selector{}) +template struct sycl_host_default_allocator { + using value_type = T; + sycl_host_default_allocator() noexcept = default; + template + explicit sycl_host_default_allocator(sycl_host_default_allocator const &) noexcept {} + T *allocate(std::size_t n) { + static cl::sycl::queue default_queue(cl::sycl::default_selector{}); + T *data = cl::sycl::malloc_host(n, default_queue); + return data; + } + void deallocate(T *p, std::size_t n) { + static cl::sycl::queue default_queue(cl::sycl::default_selector{}); + cl::sycl::free(p, default_queue); + } +}; +template +constexpr bool operator==(sycl_host_default_allocator const &, + sycl_host_default_allocator const &) noexcept { + return true; +} +template +constexpr bool operator!=(sycl_host_default_allocator const &, + sycl_host_default_allocator const &) noexcept { + return false; +} + +/// Underlying allocator for SYCL device memory (using the sycl::default_selector{}) +template struct sycl_device_default_allocator { + using value_type = T; + sycl_device_default_allocator() noexcept = default; + template + explicit sycl_device_default_allocator(sycl_device_default_allocator const &) noexcept {} + T *allocate(std::size_t n) { + static cl::sycl::queue default_queue(cl::sycl::default_selector{}); + T *data = cl::sycl::malloc_device(n, default_queue); + return data; + } + void deallocate(T *p, std::size_t n) { + static cl::sycl::queue default_queue(cl::sycl::default_selector{}); + cl::sycl::free(p, default_queue); + } +}; +template +constexpr bool operator==(sycl_device_default_allocator const &, + sycl_device_default_allocator const &) noexcept { + return true; +} +template +constexpr bool operator!=(sycl_device_default_allocator const &, + sycl_device_default_allocator const &) noexcept { + return false; +} + +} // end namespace detail +} // namespace memory_recycling +} // end namespace cppuddle + +#endif diff --git a/include/cppuddle/memory_recycling/hip_recycling_allocators.hpp b/include/cppuddle/memory_recycling/hip_recycling_allocators.hpp index 36432820..e506ee2c 100644 --- a/include/cppuddle/memory_recycling/hip_recycling_allocators.hpp +++ 
b/include/cppuddle/memory_recycling/hip_recycling_allocators.hpp @@ -6,106 +6,14 @@ #ifndef HIP_RECYCLING_ALLOCATORS_HPP #define HIP_RECYCLING_ALLOCATORS_HPP -#include -#include -#include - #include "buffer_management_interface.hpp" +// import hip_pinned_allocator and hip_device_allocator +#include "detail/hip_underlying_allocators.hpp" namespace cppuddle { namespace memory_recycling { -namespace detail { -/// Underlying host allocator for HIP pinned memory -template struct hip_pinned_allocator { - using value_type = T; - hip_pinned_allocator() noexcept = default; - template - explicit hip_pinned_allocator(hip_pinned_allocator const &) noexcept {} - T *allocate(std::size_t n) { - T *data; - // hipError_t error = - // hipMallocHost(reinterpret_cast(&data), n * sizeof(T)); - - // Even though marked as deprecated, the HIP docs recommend using hipHostMalloc - // (not hipMallocHost) for async memcpys - // https://rocmdocs.amd.com/en/latest/ROCm_API_References/HIP_API/Memory-Management.html#hipmemcpyasync - hipError_t error = - hipHostMalloc(reinterpret_cast(&data), n * sizeof(T)); - if (error != hipSuccess) { - std::string msg = - std::string( - "hip_pinned_allocator failed due to hipMallocHost failure : ") + - std::string(hipGetErrorString(error)); - throw std::runtime_error(msg); - } - return data; - } - void deallocate(T *p, std::size_t n) { - hipError_t error = hipHostFree(p); - if (error != hipSuccess) { - std::string msg = - std::string( - "hip_pinned_allocator failed due to hipFreeHost failure : ") + - std::string(hipGetErrorString(error)); - throw std::runtime_error(msg); - } - } -}; -template -constexpr bool operator==(hip_pinned_allocator const &, - hip_pinned_allocator const &) noexcept { - return true; -} -template -constexpr bool operator!=(hip_pinned_allocator const &, - hip_pinned_allocator const &) noexcept { - return false; -} - -/// Underlying allocator for HIP device memory -template struct hip_device_allocator { - using value_type = T; - hip_device_allocator() noexcept = default; - template - explicit hip_device_allocator(hip_device_allocator const &) noexcept {} - T *allocate(std::size_t n) { - T *data; - hipError_t error = hipMalloc(&data, n * sizeof(T)); - if (error != hipSuccess) { - std::string msg = - std::string( - "hip_device_allocator failed due to hipMalloc failure : ") + - std::string(hipGetErrorString(error)); - throw std::runtime_error(msg); - } - return data; - } - void deallocate(T *p, std::size_t n) { - hipError_t error = hipFree(p); - if (error != hipSuccess) { - std::string msg = - std::string( - "hip_device_allocator failed due to hipFree failure : ") + - std::string(hipGetErrorString(error)); - throw std::runtime_error(msg); - } - } -}; -template -constexpr bool operator==(hip_device_allocator const &, - hip_device_allocator const &) noexcept { - return true; -} -template -constexpr bool operator!=(hip_device_allocator const &, - hip_device_allocator const &) noexcept { - return false; -} - -} // end namespace detail - - +// Tell cppuddle how to select the device for the hip allocators namespace device_selection { /// GPU device selector using the HIP API for pinned host allocations template diff --git a/include/cppuddle/memory_recycling/sycl_recycling_allocators.hpp b/include/cppuddle/memory_recycling/sycl_recycling_allocators.hpp index 7ea9999c..fd494bca 100644 --- a/include/cppuddle/memory_recycling/sycl_recycling_allocators.hpp +++ b/include/cppuddle/memory_recycling/sycl_recycling_allocators.hpp @@ -6,77 +6,17 @@ #ifndef SYCL_RECYCLING_ALLOCATORS_HPP 
#define SYCL_RECYCLING_ALLOCATORS_HPP -#include -#include -#include - #include "buffer_management_interface.hpp" +#include "detail/sycl_underlying_allocators.hpp" namespace cppuddle { namespace memory_recycling { namespace device_selection { // No MutliGPU support yet, hence no select_device_function required -static_assert(max_number_gpus == 1, "CPPuddle currently does not support MultiGPU SYCL builds!"); +static_assert(max_number_gpus <= 1, "CPPuddle currently does not support MultiGPU SYCL builds!"); } // namespace device_selection -namespace detail { -/// Underlying host allocator for SYCL pinned memory (using the sycl::default_selector{}) -template struct sycl_host_default_allocator { - using value_type = T; - sycl_host_default_allocator() noexcept = default; - template - explicit sycl_host_default_allocator(sycl_host_default_allocator const &) noexcept {} - T *allocate(std::size_t n) { - static cl::sycl::queue default_queue(cl::sycl::default_selector{}); - T *data = cl::sycl::malloc_host(n, default_queue); - return data; - } - void deallocate(T *p, std::size_t n) { - static cl::sycl::queue default_queue(cl::sycl::default_selector{}); - cl::sycl::free(p, default_queue); - } -}; -template -constexpr bool operator==(sycl_host_default_allocator const &, - sycl_host_default_allocator const &) noexcept { - return true; -} -template -constexpr bool operator!=(sycl_host_default_allocator const &, - sycl_host_default_allocator const &) noexcept { - return false; -} - -/// Underlying allocator for SYCL device memory (using the sycl::default_selector{}) -template struct sycl_device_default_allocator { - using value_type = T; - sycl_device_default_allocator() noexcept = default; - template - explicit sycl_device_default_allocator(sycl_device_default_allocator const &) noexcept {} - T *allocate(std::size_t n) { - static cl::sycl::queue default_queue(cl::sycl::default_selector{}); - T *data = cl::sycl::malloc_device(n, default_queue); - return data; - } - void deallocate(T *p, std::size_t n) { - static cl::sycl::queue default_queue(cl::sycl::default_selector{}); - cl::sycl::free(p, default_queue); - } -}; -template -constexpr bool operator==(sycl_device_default_allocator const &, - sycl_device_default_allocator const &) noexcept { - return true; -} -template -constexpr bool operator!=(sycl_device_default_allocator const &, - sycl_device_default_allocator const &) noexcept { - return false; -} - -} // end namespace detail - /// Recycling allocator for SYCL pinned host memory (default device) template ::value, int> = 0> using recycle_allocator_sycl_host = From a3fdeeddfdffde7e58137177dd6022f39df0b824 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20Dai=C3=9F?= Date: Sat, 9 Mar 2024 01:21:21 +0100 Subject: [PATCH 13/19] Adapt aggregation tests to interface changes --- tests/work_aggregation_cpu_triad.cpp | 1 - tests/work_aggregation_cuda_triad.cpp | 46 ++++++++++++++------------- 2 files changed, 24 insertions(+), 23 deletions(-) diff --git a/tests/work_aggregation_cpu_triad.cpp b/tests/work_aggregation_cpu_triad.cpp index 7bb455b0..98159ea0 100644 --- a/tests/work_aggregation_cpu_triad.cpp +++ b/tests/work_aggregation_cpu_triad.cpp @@ -5,7 +5,6 @@ #include #undef NDEBUG -#include "cppuddle/memory_recycling/cuda_recycling_allocators.hpp" #include "cppuddle/kernel_aggregation/kernel_aggregation_interface.hpp" #include diff --git a/tests/work_aggregation_cuda_triad.cpp b/tests/work_aggregation_cuda_triad.cpp index f3f6ec92..63596423 100644 --- a/tests/work_aggregation_cuda_triad.cpp +++ 
b/tests/work_aggregation_cuda_triad.cpp @@ -7,11 +7,11 @@ //#undef NDEBUG #include -#include "../include/aggregation_manager.hpp" -#include "../include/cuda_buffer_util.hpp" - #include +#include "cppuddle/memory_recycling/cuda_recycling_allocators.hpp" +#include "cppuddle/kernel_aggregation/kernel_aggregation_interface.hpp" + //=============================================================================== @@ -19,12 +19,14 @@ // Stream benchmark template -__global__ void __launch_bounds__(1024, 2) triad_kernel(float_t *A, const float_t *B, const float_t *C, const float_t scalar, const size_t start_id, const size_t kernel_size, const size_t problem_size) { +__global__ void __launch_bounds__(1024, 2) + triad_kernel(float_t *A, const float_t *B, const float_t *C, + const float_t scalar, const size_t start_id, + const size_t kernel_size, const size_t problem_size) { const size_t i = start_id + blockIdx.x * blockDim.x + threadIdx.x; A[i] = B[i] + scalar * C[i]; } - //=============================================================================== //=============================================================================== int hpx_main(int argc, char *argv[]) { @@ -37,7 +39,8 @@ int hpx_main(int argc, char *argv[]) { size_t number_underlying_executors{0}; bool print_launch_counter{false}; std::string executor_type_string{}; - Aggregated_Executor_Modes executor_mode{Aggregated_Executor_Modes::EAGER}; + cppuddle::kernel_aggregation::aggregated_executor_modes executor_mode{ + cppuddle::kernel_aggregation::aggregated_executor_modes::EAGER}; std::string filename{}; { try { @@ -97,11 +100,11 @@ int hpx_main(int argc, char *argv[]) { return hpx::finalize(); } if (executor_type_string == "EAGER") { - executor_mode = Aggregated_Executor_Modes::EAGER; + executor_mode = cppuddle::kernel_aggregation::aggregated_executor_modes::EAGER; } else if (executor_type_string == "STRICT") { - executor_mode = Aggregated_Executor_Modes::STRICT; + executor_mode = cppuddle::kernel_aggregation::aggregated_executor_modes::STRICT; } else if (executor_type_string == "ENDLESS") { - executor_mode = Aggregated_Executor_Modes::ENDLESS; + executor_mode = cppuddle::kernel_aggregation::aggregated_executor_modes::ENDLESS; } else { std::cerr << "ERROR: Unknown executor mode " << executor_type_string << "\n Valid choices are: EAGER,STRICT,ENDLESS" << std::endl; @@ -122,7 +125,7 @@ int hpx_main(int argc, char *argv[]) { stream_pool::init>( number_underlying_executors, 0, true); static const char kernelname2[] = "cuda_triad"; - using executor_pool = aggregation_pool>; executor_pool::init(number_aggregation_executors, max_slices, executor_mode); @@ -147,9 +150,9 @@ int hpx_main(int argc, char *argv[]) { std::vector A(problem_size, 0.0); std::vector B(problem_size, 2.0); std::vector C(problem_size, 1.0); - recycler::cuda_device_buffer device_A(problem_size, 0); - recycler::cuda_device_buffer device_B(problem_size, 0); - recycler::cuda_device_buffer device_C(problem_size, 0); + cppuddle::memory_recycling::cuda_device_buffer device_A(problem_size, 0); + cppuddle::memory_recycling::cuda_device_buffer device_B(problem_size, 0); + cppuddle::memory_recycling::cuda_device_buffer device_C(problem_size, 0); cudaMemcpy(device_A.device_side_buffer, A.data(), problem_size * sizeof(float_t), cudaMemcpyHostToDevice); cudaMemcpy(device_B.device_side_buffer, B.data(), @@ -196,17 +199,16 @@ int hpx_main(int argc, char *argv[]) { auto slice_exec = fut.get(); auto alloc_host = slice_exec.template make_allocator< - float_t, 
recycler::detail::cuda_pinned_allocator>(); + float_t, cppuddle::memory_recycling::detail::cuda_pinned_allocator>(); auto alloc_device = slice_exec.template make_allocator< - float_t, recycler::detail::cuda_device_allocator>(); + float_t, cppuddle::memory_recycling::detail::cuda_device_allocator>(); // Start the actual task - // todo -- one slice gets a buffer that's not vaild anymore std::vector local_A( slice_exec.number_slices * kernel_size, float_t{}, alloc_host); - recycler::cuda_aggregated_device_buffer device_A(slice_exec.number_slices * kernel_size, alloc_device); @@ -214,7 +216,7 @@ int hpx_main(int argc, char *argv[]) { std::vector local_B( slice_exec.number_slices * kernel_size, float_t{}, alloc_host); - recycler::cuda_aggregated_device_buffer device_B(slice_exec.number_slices * kernel_size, alloc_device); @@ -222,7 +224,7 @@ int hpx_main(int argc, char *argv[]) { std::vector local_C( slice_exec.number_slices * kernel_size, float_t{}, alloc_host); - recycler::cuda_aggregated_device_buffer device_C(slice_exec.number_slices * kernel_size, alloc_device); @@ -317,9 +319,9 @@ int hpx_main(int argc, char *argv[]) { std::vector A(problem_size, 0.0); std::vector B(problem_size, 2.0); std::vector C(problem_size, 1.0); - recycler::cuda_device_buffer device_A(problem_size, 0); - recycler::cuda_device_buffer device_B(problem_size, 0); - recycler::cuda_device_buffer device_C(problem_size, 0); + cppuddle::memory_recycling::cuda_device_buffer device_A(problem_size, 0); + cppuddle::memory_recycling::cuda_device_buffer device_B(problem_size, 0); + cppuddle::memory_recycling::cuda_device_buffer device_C(problem_size, 0); cudaMemcpy(device_A.device_side_buffer, A.data(), problem_size * sizeof(float_t), cudaMemcpyHostToDevice); cudaMemcpy(device_B.device_side_buffer, B.data(), From 7d8e428a59ec8545f451b19d072cf1739c64ca45 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20Dai=C3=9F?= Date: Sat, 9 Mar 2024 02:19:47 +0100 Subject: [PATCH 14/19] Separate hip/cuda buffer into their own headers Also contains a fix for the aggregation failure test (just re-enabling the test by defining DEBUG_AGGREGATION_CALLS within the test itself) --- .../aggregation_executors_and_allocators.hpp | 4 ++ .../cuda_recycling_allocators.hpp | 48 -------------- .../hip_recycling_allocators.hpp | 48 -------------- .../util/cuda_recycling_device_buffer.hpp | 66 +++++++++++++++++++ .../util/hip_recycling_device_buffer.hpp | 65 ++++++++++++++++++ .../{ => util}/recycling_kokkos_view.hpp | 2 +- include/cuda_buffer_util.hpp | 1 + include/hip_buffer_util.hpp | 3 +- include/kokkos_buffer_util.hpp | 2 +- ...llocator_kokkos_executor_for_loop_test.cpp | 2 +- tests/allocator_kokkos_test.cpp | 2 +- tests/stream_test.hpp | 1 + tests/work_aggregation_cuda_triad.cpp | 3 +- tests/work_aggregation_test.cpp | 46 +++++++------ 14 files changed, 171 insertions(+), 122 deletions(-) create mode 100644 include/cppuddle/memory_recycling/util/cuda_recycling_device_buffer.hpp create mode 100644 include/cppuddle/memory_recycling/util/hip_recycling_device_buffer.hpp rename include/cppuddle/memory_recycling/{ => util}/recycling_kokkos_view.hpp (98%) diff --git a/include/cppuddle/kernel_aggregation/detail/aggregation_executors_and_allocators.hpp b/include/cppuddle/kernel_aggregation/detail/aggregation_executors_and_allocators.hpp index 5826c2c3..43f3c681 100644 --- a/include/cppuddle/kernel_aggregation/detail/aggregation_executors_and_allocators.hpp +++ b/include/cppuddle/kernel_aggregation/detail/aggregation_executors_and_allocators.hpp @@ -11,6 +11,10 @@ 
#endif #include +// When defined, CPPuddle will run more checks +// about the order of aggregated method calls. +// Best defined before including this header when needed +// (hence commented out here) //#define DEBUG_AGGREGATION_CALLS 1 #include diff --git a/include/cppuddle/memory_recycling/cuda_recycling_allocators.hpp b/include/cppuddle/memory_recycling/cuda_recycling_allocators.hpp index 7297955f..b47a4fe2 100644 --- a/include/cppuddle/memory_recycling/cuda_recycling_allocators.hpp +++ b/include/cppuddle/memory_recycling/cuda_recycling_allocators.hpp @@ -36,54 +36,6 @@ template ::value, int> = 0> using recycle_allocator_cuda_device = detail::recycle_allocator>; -/// RAII wrapper for CUDA device memory -template ::value, int> = 0> -struct cuda_device_buffer { - recycle_allocator_cuda_device allocator; - T *device_side_buffer; - size_t number_of_elements; - - cuda_device_buffer(const size_t number_of_elements, const size_t device_id = 0) - : allocator{device_id}, number_of_elements(number_of_elements) { - assert(device_id < max_number_gpus); - device_side_buffer = - allocator.allocate(number_of_elements); - } - ~cuda_device_buffer() { - allocator.deallocate(device_side_buffer, number_of_elements); - } - // not yet implemented - cuda_device_buffer(cuda_device_buffer const &other) = delete; - cuda_device_buffer operator=(cuda_device_buffer const &other) = delete; - cuda_device_buffer(cuda_device_buffer const &&other) = delete; - cuda_device_buffer operator=(cuda_device_buffer const &&other) = delete; - -}; - -/// RAII wrapper for CUDA device memory using a passed aggregated allocator -template ::value, int> = 0> -struct cuda_aggregated_device_buffer { - T *device_side_buffer; - size_t number_of_elements; - cuda_aggregated_device_buffer(size_t number_of_elements, Host_Allocator &alloc) - : number_of_elements(number_of_elements), alloc(alloc) { - device_side_buffer = - alloc.allocate(number_of_elements); - } - ~cuda_aggregated_device_buffer() { - alloc.deallocate(device_side_buffer, number_of_elements); - } - // not yet implemented - cuda_aggregated_device_buffer(cuda_aggregated_device_buffer const &other) = delete; - cuda_aggregated_device_buffer operator=(cuda_aggregated_device_buffer const &other) = delete; - cuda_aggregated_device_buffer(cuda_aggregated_device_buffer const &&other) = delete; - cuda_aggregated_device_buffer operator=(cuda_aggregated_device_buffer const &&other) = delete; - -private: - Host_Allocator &alloc; // will stay valid for the entire aggregation region and hence - // for the entire lifetime of this buffer -}; - } // namespace memory_recycling } // end namespace cppuddle #endif diff --git a/include/cppuddle/memory_recycling/hip_recycling_allocators.hpp b/include/cppuddle/memory_recycling/hip_recycling_allocators.hpp index e506ee2c..13b5241b 100644 --- a/include/cppuddle/memory_recycling/hip_recycling_allocators.hpp +++ b/include/cppuddle/memory_recycling/hip_recycling_allocators.hpp @@ -36,54 +36,6 @@ template ::value, int> = 0> using recycle_allocator_hip_device = detail::recycle_allocator>; -/// RAII wrapper for HIP device memory -template ::value, int> = 0> -struct hip_device_buffer { - recycle_allocator_hip_device allocator; - T *device_side_buffer; - size_t number_of_elements; - - hip_device_buffer(size_t number_of_elements, size_t device_id) - : allocator{device_id}, number_of_elements(number_of_elements) { - assert(device_id < max_number_gpus); - device_side_buffer = - allocator.allocate(number_of_elements); - } - ~hip_device_buffer() { - 
allocator.deallocate(device_side_buffer, number_of_elements); - } - // not yet implemented - hip_device_buffer(hip_device_buffer const &other) = delete; - hip_device_buffer operator=(hip_device_buffer const &other) = delete; - hip_device_buffer(hip_device_buffer const &&other) = delete; - hip_device_buffer operator=(hip_device_buffer const &&other) = delete; - -}; - -/// RAII wrapper for CUDA device memory using a passed aggregated allocator -template ::value, int> = 0> -struct hip_aggregated_device_buffer { - T *device_side_buffer; - size_t number_of_elements; - hip_aggregated_device_buffer(size_t number_of_elements, Host_Allocator &alloc) - : number_of_elements(number_of_elements), alloc(alloc) { - device_side_buffer = - alloc.allocate(number_of_elements); - } - ~hip_aggregated_device_buffer() { - alloc.deallocate(device_side_buffer, number_of_elements); - } - // not yet implemented - hip_aggregated_device_buffer(hip_aggregated_device_buffer const &other) = delete; - hip_aggregated_device_buffer operator=(hip_aggregated_device_buffer const &other) = delete; - hip_aggregated_device_buffer(hip_aggregated_device_buffer const &&other) = delete; - hip_aggregated_device_buffer operator=(hip_aggregated_device_buffer const &&other) = delete; - -private: - Host_Allocator &alloc; // will stay valid for the entire aggregation region and hence - // for the entire lifetime of this buffer -}; - } // namespace memory_recycling } // end namespace cppuddle #endif diff --git a/include/cppuddle/memory_recycling/util/cuda_recycling_device_buffer.hpp b/include/cppuddle/memory_recycling/util/cuda_recycling_device_buffer.hpp new file mode 100644 index 00000000..dbd7e4c8 --- /dev/null +++ b/include/cppuddle/memory_recycling/util/cuda_recycling_device_buffer.hpp @@ -0,0 +1,66 @@ +// Copyright (c) 2020-2024 Gregor Daiß +// +// Distributed under the Boost Software License, Version 1.0. 
(See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#ifndef CUDA_RECYCLING_BUFFER_HPP +#define CUDA_RECYCLING_BUFFER_HPP + +// import recycle_allocator_cuda_device +#include "cppuddle/memory_recycling/cuda_recycling_allocators.hpp" + +namespace cppuddle { +namespace memory_recycling { + + +/// RAII wrapper for CUDA device memory +template ::value, int> = 0> +struct cuda_device_buffer { + recycle_allocator_cuda_device allocator; + T *device_side_buffer; + size_t number_of_elements; + + cuda_device_buffer(const size_t number_of_elements, const size_t device_id = 0) + : allocator{device_id}, number_of_elements(number_of_elements) { + assert(device_id < max_number_gpus); + device_side_buffer = + allocator.allocate(number_of_elements); + } + ~cuda_device_buffer() { + allocator.deallocate(device_side_buffer, number_of_elements); + } + // not yet implemented + cuda_device_buffer(cuda_device_buffer const &other) = delete; + cuda_device_buffer operator=(cuda_device_buffer const &other) = delete; + cuda_device_buffer(cuda_device_buffer const &&other) = delete; + cuda_device_buffer operator=(cuda_device_buffer const &&other) = delete; + +}; + +/// RAII wrapper for CUDA device memory using a passed aggregated allocator +template ::value, int> = 0> +struct cuda_aggregated_device_buffer { + T *device_side_buffer; + size_t number_of_elements; + cuda_aggregated_device_buffer(size_t number_of_elements, Host_Allocator &alloc) + : number_of_elements(number_of_elements), alloc(alloc) { + device_side_buffer = + alloc.allocate(number_of_elements); + } + ~cuda_aggregated_device_buffer() { + alloc.deallocate(device_side_buffer, number_of_elements); + } + // not yet implemented + cuda_aggregated_device_buffer(cuda_aggregated_device_buffer const &other) = delete; + cuda_aggregated_device_buffer operator=(cuda_aggregated_device_buffer const &other) = delete; + cuda_aggregated_device_buffer(cuda_aggregated_device_buffer const &&other) = delete; + cuda_aggregated_device_buffer operator=(cuda_aggregated_device_buffer const &&other) = delete; + +private: + Host_Allocator &alloc; // will stay valid for the entire aggregation region and hence + // for the entire lifetime of this buffer +}; + +} // namespace memory_recycling +} // end namespace cppuddle +#endif diff --git a/include/cppuddle/memory_recycling/util/hip_recycling_device_buffer.hpp b/include/cppuddle/memory_recycling/util/hip_recycling_device_buffer.hpp new file mode 100644 index 00000000..7f04e3f7 --- /dev/null +++ b/include/cppuddle/memory_recycling/util/hip_recycling_device_buffer.hpp @@ -0,0 +1,65 @@ +// Copyright (c) 2020-2024 Gregor Daiß +// +// Distributed under the Boost Software License, Version 1.0.
(See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#ifndef HIP_RECYCLING_BUFFER_HPP +#define HIP_RECYCLING_BUFFER_HPP + +// import recycle_allocator_hip_device +#include "cppuddle/memory_recycling/hip_recycling_allocators.hpp" + +namespace cppuddle { +namespace memory_recycling { + +/// RAII wrapper for HIP device memory +template ::value, int> = 0> +struct hip_device_buffer { + recycle_allocator_hip_device allocator; + T *device_side_buffer; + size_t number_of_elements; + + hip_device_buffer(size_t number_of_elements, size_t device_id) + : allocator{device_id}, number_of_elements(number_of_elements) { + assert(device_id < max_number_gpus); + device_side_buffer = + allocator.allocate(number_of_elements); + } + ~hip_device_buffer() { + allocator.deallocate(device_side_buffer, number_of_elements); + } + // not yet implemented + hip_device_buffer(hip_device_buffer const &other) = delete; + hip_device_buffer operator=(hip_device_buffer const &other) = delete; + hip_device_buffer(hip_device_buffer const &&other) = delete; + hip_device_buffer operator=(hip_device_buffer const &&other) = delete; + +}; + +/// RAII wrapper for HIP device memory using a passed aggregated allocator +template ::value, int> = 0> +struct hip_aggregated_device_buffer { + T *device_side_buffer; + size_t number_of_elements; + hip_aggregated_device_buffer(size_t number_of_elements, Host_Allocator &alloc) + : number_of_elements(number_of_elements), alloc(alloc) { + device_side_buffer = + alloc.allocate(number_of_elements); + } + ~hip_aggregated_device_buffer() { + alloc.deallocate(device_side_buffer, number_of_elements); + } + // not yet implemented + hip_aggregated_device_buffer(hip_aggregated_device_buffer const &other) = delete; + hip_aggregated_device_buffer operator=(hip_aggregated_device_buffer const &other) = delete; + hip_aggregated_device_buffer(hip_aggregated_device_buffer const &&other) = delete; + hip_aggregated_device_buffer operator=(hip_aggregated_device_buffer const &&other) = delete; + +private: + Host_Allocator &alloc; // will stay valid for the entire aggregation region and hence + // for the entire lifetime of this buffer +}; + +} // namespace memory_recycling +} // end namespace cppuddle +#endif diff --git a/include/cppuddle/memory_recycling/recycling_kokkos_view.hpp b/include/cppuddle/memory_recycling/util/recycling_kokkos_view.hpp similarity index 98% rename from include/cppuddle/memory_recycling/recycling_kokkos_view.hpp rename to include/cppuddle/memory_recycling/util/recycling_kokkos_view.hpp index 98ce2799..1f0ed950 100644 --- a/include/cppuddle/memory_recycling/recycling_kokkos_view.hpp +++ b/include/cppuddle/memory_recycling/util/recycling_kokkos_view.hpp @@ -9,7 +9,7 @@ #include #include -#include "buffer_management_interface.hpp" +#include "cppuddle/memory_recycling/buffer_management_interface.hpp" namespace cppuddle { diff --git a/include/cuda_buffer_util.hpp b/include/cuda_buffer_util.hpp index 8d004bef..7aa44c9a 100644 --- a/include/cuda_buffer_util.hpp +++ b/include/cuda_buffer_util.hpp @@ -8,6 +8,7 @@ #include "buffer_manager.hpp" #include "cppuddle/memory_recycling/cuda_recycling_allocators.hpp" +#include "cppuddle/memory_recycling/util/cuda_recycling_device_buffer.hpp" namespace recycler { namespace detail { diff --git a/include/hip_buffer_util.hpp b/include/hip_buffer_util.hpp index 3f0b3034..dfd31cdc 100644 --- a/include/hip_buffer_util.hpp +++ b/include/hip_buffer_util.hpp @@ -6,7 +6,8 @@ #ifndef HIP_BUFFER_UTIL_HPP #define
HIP_BUFFER_UTIL_HPP -#include "/cppuddle/memory_recycling/hip_recycling_allocators.hpp" +#include "cppuddle/memory_recycling/hip_recycling_allocators.hpp" +#include "cppuddle/memory_recycling/util/hip_recycling_device_buffer.hpp" namespace recycler { diff --git a/include/kokkos_buffer_util.hpp b/include/kokkos_buffer_util.hpp index 54736ebe..66d1f8c4 100644 --- a/include/kokkos_buffer_util.hpp +++ b/include/kokkos_buffer_util.hpp @@ -5,7 +5,7 @@ #ifndef KOKKOS_BUFFER_UTIL_HPP #define KOKKOS_BUFFER_UTIL_HPP -#include "cppuddle/memory_recycling/recycling_kokkos_view.hpp" +#include "cppuddle/memory_recycling/util/recycling_kokkos_view.hpp" namespace recycler { diff --git a/tests/allocator_kokkos_executor_for_loop_test.cpp b/tests/allocator_kokkos_executor_for_loop_test.cpp index c38294d7..ad184ff5 100644 --- a/tests/allocator_kokkos_executor_for_loop_test.cpp +++ b/tests/allocator_kokkos_executor_for_loop_test.cpp @@ -23,7 +23,7 @@ #include "cppuddle/memory_recycling/std_recycling_allocators.hpp" #include "cppuddle/memory_recycling/cuda_recycling_allocators.hpp" -#include "cppuddle/memory_recycling/recycling_kokkos_view.hpp" +#include "cppuddle/memory_recycling/util/recycling_kokkos_view.hpp" // Assert during Release builds as well for this file: #undef NDEBUG diff --git a/tests/allocator_kokkos_test.cpp b/tests/allocator_kokkos_test.cpp index 5fb780e5..e231b557 100644 --- a/tests/allocator_kokkos_test.cpp +++ b/tests/allocator_kokkos_test.cpp @@ -23,7 +23,7 @@ #include "cppuddle/memory_recycling/std_recycling_allocators.hpp" #include "cppuddle/memory_recycling/cuda_recycling_allocators.hpp" -#include "cppuddle/memory_recycling/recycling_kokkos_view.hpp" +#include "cppuddle/memory_recycling/util/recycling_kokkos_view.hpp" using kokkos_array = Kokkos::View; diff --git a/tests/stream_test.hpp b/tests/stream_test.hpp index b793fe9c..63f25b27 100644 --- a/tests/stream_test.hpp +++ b/tests/stream_test.hpp @@ -10,6 +10,7 @@ #include #include #include "cppuddle/memory_recycling/cuda_recycling_allocators.hpp" +#include "cppuddle/memory_recycling/util/cuda_recycling_device_buffer.hpp" #include "cppuddle/executor_recycling/executor_pools_interface.hpp"" template diff --git a/tests/work_aggregation_cuda_triad.cpp b/tests/work_aggregation_cuda_triad.cpp index 63596423..f04e04a4 100644 --- a/tests/work_aggregation_cuda_triad.cpp +++ b/tests/work_aggregation_cuda_triad.cpp @@ -10,10 +10,9 @@ #include #include "cppuddle/memory_recycling/cuda_recycling_allocators.hpp" +#include "cppuddle/memory_recycling/util/cuda_recycling_device_buffer.hpp" #include "cppuddle/kernel_aggregation/kernel_aggregation_interface.hpp" - - //=============================================================================== //=============================================================================== // Stream benchmark diff --git a/tests/work_aggregation_test.cpp b/tests/work_aggregation_test.cpp index 25455633..7f5664f5 100644 --- a/tests/work_aggregation_test.cpp +++ b/tests/work_aggregation_test.cpp @@ -1,4 +1,4 @@ -// Copyright (c) 2022-2022 Gregor Daiß +// Copyright (c) 2022-2024 Gregor Daiß // // Distributed under the Boost Software License, Version 1.0. 
(See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) @@ -9,11 +9,15 @@ #include #include #include -#include "../include/aggregation_manager.hpp" -#include "../include/cuda_buffer_util.hpp" #include +#include "cppuddle/memory_recycling/cuda_recycling_allocators.hpp" +#include "cppuddle/memory_recycling/util/cuda_recycling_device_buffer.hpp" +#define DEBUG_AGGREGATION_CALLS 1 // enables checks if aggregated function calls are + // compatible across all participating tasks + // Must be defined before including the aggregation: +#include "cppuddle/kernel_aggregation/kernel_aggregation_interface.hpp" //=============================================================================== //=============================================================================== @@ -114,9 +118,9 @@ namespace hpx { namespace parallel { namespace execution { void sequential_test(void) { static const char kernelname[] = "kernel1"; - using kernel_pool1 = aggregation_pool>; - kernel_pool1::init(8, 2, Aggregated_Executor_Modes::STRICT); + kernel_pool1::init(8, 2, cppuddle::kernel_aggregation::aggregated_executor_modes::STRICT); // Sequential test hpx::cout << "Sequential test with all executor slices" << std::endl; hpx::cout << "----------------------------------------" << std::endl; @@ -260,8 +264,8 @@ void interruption_test(void) { hpx::cout << "Sequential test with interruption:" << std::endl; hpx::cout << "----------------------------------" << std::endl; { - Aggregated_Executor agg_exec{ - 4, Aggregated_Executor_Modes::EAGER}; + cppuddle::kernel_aggregation::aggregated_executor agg_exec{ + 4, cppuddle::kernel_aggregation::aggregated_executor_modes::EAGER}; std::vector> slices_done_futs; auto slice_fut1 = agg_exec.request_executor_slice(); @@ -326,8 +330,8 @@ void failure_test(bool type_error) { hpx::cout << "------------------------------------------------------" << std::endl; { - Aggregated_Executor agg_exec{ - 4, Aggregated_Executor_Modes::STRICT}; + cppuddle::kernel_aggregation::aggregated_executor agg_exec{ + 4, cppuddle::kernel_aggregation::aggregated_executor_modes::STRICT}; auto slice_fut1 = agg_exec.request_executor_slice(); @@ -405,9 +409,10 @@ void pointer_add_test(void) { hpx::cout << "--------------------------------------------------------" << std::endl; static const char kernelname2[] = "kernel2"; - using kernel_pool2 = aggregation_pool>; - kernel_pool2::init(8, 2, Aggregated_Executor_Modes::STRICT); + using kernel_pool2 = cppuddle::kernel_aggregation::aggregation_pool< + kernelname2, Dummy_Executor, round_robin_pool>; + kernel_pool2::init( + 8, 2, cppuddle::kernel_aggregation::aggregated_executor_modes::STRICT); { std::vector erg(512); std::vector> slices_done_futs; @@ -602,10 +607,11 @@ void references_add_test(void) { { /*Aggregated_Executor agg_exec{ 4, Aggregated_Executor_Modes::STRICT};*/ - auto &agg_exec = - std::get<0>(stream_pool::get_interface< - Aggregated_Executor, - round_robin_pool>>(0)); + auto &agg_exec = std::get<0>( + stream_pool::get_interface< + cppuddle::kernel_aggregation::aggregated_executor, + round_robin_pool>>(0)); std::vector erg(512); std::vector> slices_done_futs; @@ -831,9 +837,11 @@ int hpx_main(int argc, char *argv[]) { 8, 0, false); stream_pool::init>(8); - stream_pool::init, - round_robin_pool>>( - 8, 4, Aggregated_Executor_Modes::STRICT); + stream_pool::init< + cppuddle::kernel_aggregation::aggregated_executor, + round_robin_pool< + cppuddle::kernel_aggregation::aggregated_executor>>( + 8, 4, 
cppuddle::kernel_aggregation::aggregated_executor_modes::STRICT); /*hpx::cuda::experimental::cuda_executor executor1 = std::get<0>(stream_pool::get_interface< hpx::cuda::experimental::cuda_executor, From 72b486c3430035e9b05502d6dddc9e5ffb4f30e7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20Dai=C3=9F?= Date: Sat, 9 Mar 2024 02:55:28 +0100 Subject: [PATCH 15/19] Clean legacy calls from aggregation code --- .../aggregation_executors_and_allocators.hpp | 25 ++++++++---- tests/work_aggregation_cpu_triad.cpp | 12 ++++-- tests/work_aggregation_cuda_triad.cpp | 11 ++++-- tests/work_aggregation_test.cpp | 38 +++++++++++-------- 4 files changed, 56 insertions(+), 30 deletions(-) diff --git a/include/cppuddle/kernel_aggregation/detail/aggregation_executors_and_allocators.hpp b/include/cppuddle/kernel_aggregation/detail/aggregation_executors_and_allocators.hpp index 43f3c681..dfc76622 100644 --- a/include/cppuddle/kernel_aggregation/detail/aggregation_executors_and_allocators.hpp +++ b/include/cppuddle/kernel_aggregation/detail/aggregation_executors_and_allocators.hpp @@ -52,9 +52,11 @@ #include #include -#include "../include/buffer_manager.hpp" -#include "../include/stream_manager.hpp" #include "cppuddle/common/config.hpp" +// get direct access to the buffer management +#include "cppuddle/memory_recycling/detail/buffer_management.hpp" +// get normal access to the executor pools +#include "cppuddle/executor_recycling/executor_pools_interface.hpp" #ifndef CPPUDDLE_HAVE_HPX_MUTEX #pragma message \ @@ -406,7 +408,9 @@ template class aggregated_executor { /// Wrapper to the executor interface from the stream pool /// Automatically hooks into the stream_pools reference counting /// for cpu/gpu load balancing - std::unique_ptr>> executor_wrapper; + std::unique_ptr>> + executor_wrapper; public: size_t gpu_id; @@ -849,9 +853,14 @@ template class aggregated_executor { if (local_slice_id == 1) { // Redraw executor assert(!executor_wrapper); - stream_pool::select_device>(gpu_id); + cppuddle::executor_recycling::executor_pool::select_device< + Executor, cppuddle::executor_recycling::round_robin_pool_impl>( + gpu_id); executor_wrapper.reset( - new stream_interface>(gpu_id)); + new cppuddle::executor_recycling::executor_interface< + Executor, + cppuddle::executor_recycling::round_robin_pool_impl>( + gpu_id)); // Renew promise that all slices will be ready as the primary launch // criteria...
hpx::lcos::shared_future fut; @@ -860,8 +869,10 @@ template class aggregated_executor { // Fallback launch condidtion: Launch as soon as the underlying stream // is ready /* auto slices_full_fut = slices_full_promise.get_future(); */ - stream_pool::select_device>(gpu_id); - auto exec_fut = (*executor_wrapper).get_future(); + cppuddle::executor_recycling::executor_pool::select_device< + Executor, + cppuddle::executor_recycling::round_robin_pool_impl>(gpu_id); + auto exec_fut = (*executor_wrapper).get_future(); /* auto fut = hpx::when_any(exec_fut, slices_full_fut); */ fut = std::move(exec_fut); } else { diff --git a/tests/work_aggregation_cpu_triad.cpp b/tests/work_aggregation_cpu_triad.cpp index 98159ea0..d65c9668 100644 --- a/tests/work_aggregation_cpu_triad.cpp +++ b/tests/work_aggregation_cpu_triad.cpp @@ -5,6 +5,7 @@ #include #undef NDEBUG +#include "cppuddle/memory_recycling/std_recycling_allocators.hpp" #include "cppuddle/kernel_aggregation/kernel_aggregation_interface.hpp" #include @@ -179,11 +180,14 @@ int hpx_main(int argc, char *argv[]) { } } - stream_pool::init>( + cppuddle::executor_recycling::executor_pool::init< + Dummy_Executor, + cppuddle::executor_recycling::round_robin_pool_impl>( number_underlying_executors); static const char kernelname[] = "cpu_triad"; - using executor_pool = cppuddle::kernel_aggregation::aggregation_pool>; + using executor_pool = cppuddle::kernel_aggregation::aggregation_pool< + kernelname, Dummy_Executor, + cppuddle::executor_recycling::round_robin_pool_impl>; executor_pool::init(number_aggregation_executors, max_slices, executor_mode); using float_t = float; @@ -289,7 +293,7 @@ int hpx_main(int argc, char *argv[]) { std::flush(hpx::cout); sleep(1); - recycler::force_cleanup(); // Cleanup all buffers and the managers + cppuddle::memory_recycling::force_buffer_cleanup(); // Cleanup all buffers and the managers return hpx::finalize(); } diff --git a/tests/work_aggregation_cuda_triad.cpp b/tests/work_aggregation_cuda_triad.cpp index f04e04a4..75f7ad14 100644 --- a/tests/work_aggregation_cuda_triad.cpp +++ b/tests/work_aggregation_cuda_triad.cpp @@ -121,11 +121,14 @@ int hpx_main(int argc, char *argv[]) { hpx::cuda::experimental::detail::register_polling(hpx::resource::get_thread_pool(0)); using executor_t = hpx::cuda::experimental::cuda_executor; - stream_pool::init>( + cppuddle::executor_recycling::executor_pool::init< + executor_t, + cppuddle::executor_recycling::round_robin_pool_impl>( number_underlying_executors, 0, true); static const char kernelname2[] = "cuda_triad"; - using executor_pool = cppuddle::kernel_aggregation::aggregation_pool>; + using executor_pool = cppuddle::kernel_aggregation::aggregation_pool< + kernelname2, executor_t, + cppuddle::executor_recycling::round_robin_pool_impl>; executor_pool::init(number_aggregation_executors, max_slices, executor_mode); using float_t = float; @@ -418,7 +421,7 @@ int hpx_main(int argc, char *argv[]) { /* sleep(1); */ hpx::cuda::experimental::detail::unregister_polling(hpx::resource::get_thread_pool(0)); - recycler::force_cleanup(); // Cleanup all buffers and the managers + cppuddle::memory_recycling::force_buffer_cleanup(); // Cleanup all buffers and the managers return hpx::finalize(); } diff --git a/tests/work_aggregation_test.cpp b/tests/work_aggregation_test.cpp index 7f5664f5..abe827f4 100644 --- a/tests/work_aggregation_test.cpp +++ b/tests/work_aggregation_test.cpp @@ -14,6 +14,7 @@ #include "cppuddle/memory_recycling/cuda_recycling_allocators.hpp" #include 
"cppuddle/memory_recycling/util/cuda_recycling_device_buffer.hpp" +#include "cppuddle/executor_recycling/executor_pools_interface.hpp"" #define DEBUG_AGGREGATION_CALLS 1 // enables checks if aggregated function calls are // compatible across all participating tasks // Must be defined before including the aggregation: @@ -118,9 +119,11 @@ namespace hpx { namespace parallel { namespace execution { void sequential_test(void) { static const char kernelname[] = "kernel1"; - using kernel_pool1 = cppuddle::kernel_aggregation::aggregation_pool>; - kernel_pool1::init(8, 2, cppuddle::kernel_aggregation::aggregated_executor_modes::STRICT); + using kernel_pool1 = cppuddle::kernel_aggregation::aggregation_pool< + kernelname, Dummy_Executor, + cppuddle::executor_recycling::round_robin_pool_impl>; + kernel_pool1::init( + 8, 2, cppuddle::kernel_aggregation::aggregated_executor_modes::STRICT); // Sequential test hpx::cout << "Sequential test with all executor slices" << std::endl; hpx::cout << "----------------------------------------" << std::endl; @@ -410,7 +413,8 @@ void pointer_add_test(void) { << std::endl; static const char kernelname2[] = "kernel2"; using kernel_pool2 = cppuddle::kernel_aggregation::aggregation_pool< - kernelname2, Dummy_Executor, round_robin_pool>; + kernelname2, Dummy_Executor, + cppuddle::executor_recycling::round_robin_pool_impl>; kernel_pool2::init( 8, 2, cppuddle::kernel_aggregation::aggregated_executor_modes::STRICT); { @@ -608,10 +612,11 @@ void references_add_test(void) { /*Aggregated_Executor agg_exec{ 4, Aggregated_Executor_Modes::STRICT};*/ auto &agg_exec = std::get<0>( - stream_pool::get_interface< + cppuddle::executor_recycling::executor_pool::get_interface< cppuddle::kernel_aggregation::aggregated_executor, - round_robin_pool>>(0)); + cppuddle::executor_recycling::round_robin_pool_impl< + cppuddle::kernel_aggregation::aggregated_executor< + Dummy_Executor>>>(0)); std::vector erg(512); std::vector> slices_done_futs; @@ -832,14 +837,17 @@ int hpx_main(int argc, char *argv[]) { return hpx::finalize(); } - stream_pool::init>( - 8, 0, false); - stream_pool::init>(8); + cppuddle::executor_recycling::executor_pool::init< + hpx::cuda::experimental::cuda_executor, + cppuddle::executor_recycling::round_robin_pool_impl< + hpx::cuda::experimental::cuda_executor>>(8, 0, false); + cppuddle::executor_recycling::executor_pool::init< + Dummy_Executor, + cppuddle::executor_recycling::round_robin_pool_impl>(8); - stream_pool::init< + cppuddle::executor_recycling::executor_pool::init< cppuddle::kernel_aggregation::aggregated_executor, - round_robin_pool< + cppuddle::executor_recycling::round_robin_pool_impl< cppuddle::kernel_aggregation::aggregated_executor>>( 8, 4, cppuddle::kernel_aggregation::aggregated_executor_modes::STRICT); /*hpx::cuda::experimental::cuda_executor executor1 = @@ -871,8 +879,8 @@ int hpx_main(int argc, char *argv[]) { std::flush(hpx::cout); sleep(1); - recycler::print_performance_counters(); - recycler::force_cleanup(); // Cleanup all buffers and the managers + cppuddle::memory_recycling::print_buffer_counters(); + cppuddle::memory_recycling::force_buffer_cleanup(); // Cleanup all buffers and the managers return hpx::finalize(); } From 8ccbed09edab648059ff5d091f23f73050110032 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20Dai=C3=9F?= Date: Sat, 9 Mar 2024 02:59:53 +0100 Subject: [PATCH 16/19] Add deprecation file comments --- include/aggregation_manager.hpp | 4 ++++ include/aligned_buffer_util.hpp | 4 ++++ include/buffer_manager.hpp | 4 ++++ 
include/cuda_buffer_util.hpp | 4 ++++ include/hip_buffer_util.hpp | 4 ++++ include/kokkos_buffer_util.hpp | 4 ++++ include/stream_manager.hpp | 4 ++++ include/sycl_buffer_util.hpp | 4 ++++ 8 files changed, 32 insertions(+) diff --git a/include/aggregation_manager.hpp b/include/aggregation_manager.hpp index bb0fd83f..9b546cab 100644 --- a/include/aggregation_manager.hpp +++ b/include/aggregation_manager.hpp @@ -3,6 +3,10 @@ // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +// DEPRECATED: Do not use this file +// Only intended to make the old interface work a bit longer. +// See deprecation warnings for the new location of the functionality + #ifndef AGGREGATION_MANAGER_HPP #define AGGREGATION_MANAGER_HPP diff --git a/include/aligned_buffer_util.hpp b/include/aligned_buffer_util.hpp index 02a57104..64497a9d 100644 --- a/include/aligned_buffer_util.hpp +++ b/include/aligned_buffer_util.hpp @@ -3,6 +3,10 @@ // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +// DEPRECATED: Do not use this file +// Only intended to make the old interface work a bit longer. +// See deprecation warnings for the new location of the functionality + #ifndef ALIGNED_BUFFER_UTIL_HPP #define ALIGNED_BUFFER_UTIL_HPP diff --git a/include/buffer_manager.hpp b/include/buffer_manager.hpp index fb253990..baf807e4 100644 --- a/include/buffer_manager.hpp +++ b/include/buffer_manager.hpp @@ -3,6 +3,10 @@ // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +// DEPRECATED: Do not use this file +// Only intended to make the old interface work a bit longer. +// See deprecation warnings for the new location of the functionality + #ifndef BUFFER_MANAGER_HPP #define BUFFER_MANAGER_HPP diff --git a/include/cuda_buffer_util.hpp b/include/cuda_buffer_util.hpp index 7aa44c9a..7fbd07be 100644 --- a/include/cuda_buffer_util.hpp +++ b/include/cuda_buffer_util.hpp @@ -3,6 +3,10 @@ // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +// DEPRECATED: Do not use this file +// Only intended to make the old interface work a bit longer. +// See deprecation warnings for the new location of the functionality + #ifndef CUDA_BUFFER_UTIL_HPP #define CUDA_BUFFER_UTIL_HPP diff --git a/include/hip_buffer_util.hpp b/include/hip_buffer_util.hpp index dfd31cdc..720baf70 100644 --- a/include/hip_buffer_util.hpp +++ b/include/hip_buffer_util.hpp @@ -3,6 +3,10 @@ // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +// DEPRECATED: Do not use this file +// Only intended to make the old interface work a bit longer. +// See deprecation warnings for the new location of the functionality + #ifndef HIP_BUFFER_UTIL_HPP #define HIP_BUFFER_UTIL_HPP diff --git a/include/kokkos_buffer_util.hpp b/include/kokkos_buffer_util.hpp index 66d1f8c4..e84be4b6 100644 --- a/include/kokkos_buffer_util.hpp +++ b/include/kokkos_buffer_util.hpp @@ -3,6 +3,10 @@ // Distributed under the Boost Software License, Version 1.0. 
(See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +// DEPRECATED: Do not use this file +// Only intended to make the old interface work a bit longer. +// See deprecation warnings for the new location of the functionality + #ifndef KOKKOS_BUFFER_UTIL_HPP #define KOKKOS_BUFFER_UTIL_HPP #include "cppuddle/memory_recycling/util/recycling_kokkos_view.hpp" diff --git a/include/stream_manager.hpp b/include/stream_manager.hpp index 25c4a080..1e781442 100644 --- a/include/stream_manager.hpp +++ b/include/stream_manager.hpp @@ -3,6 +3,10 @@ // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +// DEPRECATED: Do not use this file +// Only intended to make the old interface work a bit longer. +// See deprecation warnings for the new location of the functionality + #ifndef STREAM_MANAGER_HPP #define STREAM_MANAGER_HPP diff --git a/include/sycl_buffer_util.hpp b/include/sycl_buffer_util.hpp index 7ce66d93..46922d17 100644 --- a/include/sycl_buffer_util.hpp +++ b/include/sycl_buffer_util.hpp @@ -3,6 +3,10 @@ // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +// DEPRECATED: Do not use this file +// Only intended to make the old interface work a bit longer. +// See deprecation warnings for the new location of the functionality + #ifndef SYCL_BUFFER_UTIL_HPP #define SYCL_BUFFER_UTIL_HPP From aefd0f6b999998f81701a366a579e4650437ad97 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20Dai=C3=9F?= Date: Sat, 9 Mar 2024 03:07:02 +0100 Subject: [PATCH 17/19] Begin cmakelist cleanup --- CMakeLists.txt | 18 +++--------------- 1 file changed, 3 insertions(+), 15 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 4a31995f..2212d40b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -21,7 +21,6 @@ set(CPPUDDLE_VERSION_STRING "${CPPUDDLE_VERSION_MAJOR}.${CPPUDDLE_VERSION_MINOR} # GPU-related options option(CPPUDDLE_WITH_CUDA "Enable CUDA tests/examples" OFF) -option(CPPUDDLE_WITH_MULTIGPU_SUPPORT "Enables experimental MultiGPU support" OFF) option(CPPUDDLE_WITH_KOKKOS "Enable KOKKOS tests/examples" OFF) set(CPPUDDLE_WITH_MAX_NUMBER_GPUS "1" CACHE STRING "Number of GPUs that will be used. Should match the number of GPUs used when using the maximum number of HPX worker threads. 
Should be 1 for non-HPX builds.") # HPX-related options @@ -151,6 +150,9 @@ endif() # Define library targets and installation # (also includes various warnings for non-optimal build configurations) +# TODO Cleanup targets: +# this is leftover from the days where cppuddle was not header-only + ## Interface targets add_library(buffer_manager INTERFACE) if (CPPUDDLE_WITH_HPX) @@ -319,9 +321,6 @@ if (CPPUDDLE_WITH_TESTS) COMPONENT_DEPENDENCIES iostreams SOURCES tests/work_aggregation_test.cpp - include/aggregation_manager.hpp - include/buffer_manager.hpp - include/stream_manager.hpp ) add_hpx_executable( @@ -331,9 +330,6 @@ if (CPPUDDLE_WITH_TESTS) COMPONENT_DEPENDENCIES iostreams SOURCES tests/work_aggregation_cpu_triad.cpp - include/aggregation_manager.hpp - include/buffer_manager.hpp - include/stream_manager.hpp ) add_hpx_executable( @@ -343,9 +339,6 @@ if (CPPUDDLE_WITH_TESTS) COMPONENT_DEPENDENCIES iostreams SOURCES tests/work_aggregation_cuda_triad.cpp - include/aggregation_manager.hpp - include/buffer_manager.hpp - include/stream_manager.hpp ) target_compile_definitions(work_aggregation_test PRIVATE CPPUDDLE_HAVE_CUDA) endif() # end WITH KOKKOS @@ -359,11 +352,6 @@ if (CPPUDDLE_WITH_TESTS) add_compile_definitions(CPPUDDLE_WITH_HPX) endif() - if (CPPUDDLE_WITH_MULTIGPU_SUPPORT) - add_compile_definitions(CPPUDDLE_HAVE_MULTIGPU) - message(WARNING, " Multi-GPU Support not yet properly tested!") - endif() - #------------------------------------------------------------------------------------------------------------ # Define actual tests (usually running the binary and checking its output for certain patterns via regex) From a60cccffac4eb0a39f9dff19292d26ecbca3b9f0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20Dai=C3=9F?= Date: Sat, 9 Mar 2024 04:11:01 +0100 Subject: [PATCH 18/19] Fix compatibility layer for deprecations --- include/buffer_manager.hpp | 16 +++++++++++++++- .../detail/hip_underlying_allocators.hpp | 2 +- .../detail/sycl_underlying_allocators.hpp | 2 +- .../util/recycling_kokkos_view.hpp | 4 ++-- include/kokkos_buffer_util.hpp | 4 ++-- 5 files changed, 21 insertions(+), 7 deletions(-) diff --git a/include/buffer_manager.hpp b/include/buffer_manager.hpp index baf807e4..69020e5b 100644 --- a/include/buffer_manager.hpp +++ b/include/buffer_manager.hpp @@ -10,18 +10,27 @@ #ifndef BUFFER_MANAGER_HPP #define BUFFER_MANAGER_HPP +#include "cppuddle/common/config.hpp" #include "cppuddle/memory_recycling/buffer_management_interface.hpp" +#include "cppuddle/memory_recycling/detail/buffer_management.hpp" #include "cppuddle/memory_recycling/std_recycling_allocators.hpp" namespace recycler { +namespace detail { +using buffer_recycler [[deprecated( + "Use buffer_interface from header " + "cppuddle/memory_recycling/detail/buffer_management.hpp instead")]] = + cppuddle::memory_recycling::detail::buffer_interface; +} + template ::value, int> = 0> using recycle_std [[deprecated("Use from header std_recycling_allocators.hpp instead")]] = cppuddle::memory_recycling::recycle_std; template ::value, int> = 0> -using aggressive_recycle_aligned +using aggressive_recycle_std [[deprecated("Use from header std_recycling_allocators.hpp instead")]] = cppuddle::memory_recycling::aggressive_recycle_std; @@ -41,6 +50,11 @@ inline void cleanup() { cppuddle::memory_recycling::unused_buffer_cleanup(); } [[deprecated("Use cppuddle::memory_recycling::finalize() instead")]] inline void finalize() { cppuddle::memory_recycling::finalize(); } +[[deprecated("Use cppuddle::max_number_gpus instead")]] constexpr auto 
max_number_gpus = + cppuddle::max_number_gpus; +[[deprecated("Use cppuddle::number_instances instead")]] constexpr auto number_instances = + cppuddle::number_instances; + } // namespace recycler #endif diff --git a/include/cppuddle/memory_recycling/detail/hip_underlying_allocators.hpp b/include/cppuddle/memory_recycling/detail/hip_underlying_allocators.hpp index 6668feaf..bfd7c2e1 100644 --- a/include/cppuddle/memory_recycling/detail/hip_underlying_allocators.hpp +++ b/include/cppuddle/memory_recycling/detail/hip_underlying_allocators.hpp @@ -1,4 +1,4 @@ -// Copyright (c) 2020-2024 Gregor Daiß +// Copyright (c) 2021-2024 Gregor Daiß // // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) diff --git a/include/cppuddle/memory_recycling/detail/sycl_underlying_allocators.hpp b/include/cppuddle/memory_recycling/detail/sycl_underlying_allocators.hpp index 1597eee7..3e3c9173 100644 --- a/include/cppuddle/memory_recycling/detail/sycl_underlying_allocators.hpp +++ b/include/cppuddle/memory_recycling/detail/sycl_underlying_allocators.hpp @@ -1,4 +1,4 @@ -// Copyright (c) 2020-2024 Gregor Daiß +// Copyright (c) 2023-2024 Gregor Daiß // // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) diff --git a/include/cppuddle/memory_recycling/util/recycling_kokkos_view.hpp b/include/cppuddle/memory_recycling/util/recycling_kokkos_view.hpp index 1f0ed950..b8ca526c 100644 --- a/include/cppuddle/memory_recycling/util/recycling_kokkos_view.hpp +++ b/include/cppuddle/memory_recycling/util/recycling_kokkos_view.hpp @@ -3,8 +3,8 @@ // Distributed under the Boost Software License, Version 1.0. 
(See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) -#ifndef KOKKOS_BUFFER_UTIL_HPP -#define KOKKOS_BUFFER_UTIL_HPP +#ifndef RECYCLING_KOKKOS_VIEW_HPP +#define RECYCLING_KOKKOS_VIEW_HPP #include #include #include diff --git a/include/kokkos_buffer_util.hpp b/include/kokkos_buffer_util.hpp index e84be4b6..716229a0 100644 --- a/include/kokkos_buffer_util.hpp +++ b/include/kokkos_buffer_util.hpp @@ -17,12 +17,12 @@ template using aggregated_recycled_view [[deprecated( "Use aggregated_recycle_view from header recycling_kokkos_view.hpp " "instead")]] = - cppuddle::aggregated_recycle_view; + cppuddle::memory_recycling::aggregated_recycling_view; template using recycled_view [[deprecated( "Use recycle_view from header recycling_kokkos_view.hpp instead")]] = - cppuddle::recycle_view; + cppuddle::memory_recycling::recycling_view; } // end namespace recycler From 1e719e7d913005a95b3f8f99b93d6b6c4a6f3eff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20Dai=C3=9F?= Date: Mon, 11 Mar 2024 13:05:41 +0100 Subject: [PATCH 19/19] Fix sycl namespace --- include/sycl_buffer_util.hpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/sycl_buffer_util.hpp b/include/sycl_buffer_util.hpp index 46922d17..4bf45b3f 100644 --- a/include/sycl_buffer_util.hpp +++ b/include/sycl_buffer_util.hpp @@ -19,24 +19,24 @@ namespace detail { template using sycl_host_default_allocator [[deprecated("Use from header sycl_recycling_allocators.hpp instead")]] = - cppuddle::detail::sycl_host_default_allocator; + cppuddle::memory_recycling::detail::sycl_host_default_allocator; template using sycl_device_default_allocator [[deprecated("Use from header sycl_recycling_allocators.hpp instead")]] = - cppuddle::detail::sycl_device_default_allocator; + cppuddle::memory_recycling::detail::sycl_device_default_allocator; } // end namespace detail template ::value, int> = 0> using recycle_allocator_sycl_host [[deprecated("Use from header sycl_recycling_allocators.hpp instead")]] = - cppuddle::recycle_allocator_sycl_host; + cppuddle::memory_recycling::recycle_allocator_sycl_host; template ::value, int> = 0> using recycle_allocator_sycl_device [[deprecated("Use from header sycl_recycling_allocators.hpp instead")]] = - cppuddle::recycle_allocator_sycl_device; + cppuddle::memory_recycling::recycle_allocator_sycl_device; } // end namespace recycler #endif
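For readers migrating off the deprecated compatibility headers touched above, a minimal usage sketch (illustrative only, not part of the patch series; the double element type, the 512-element size, and the assumption that include/ is on the include path are arbitrary choices made here) of how the old and the new spellings are expected to coexist while the [[deprecated]] aliases remain:

    // migration_sketch.cpp -- hypothetical example, not shipped with CPPuddle
    #include <vector>
    #include "buffer_manager.hpp" // deprecated compatibility header: compiles, but warns
    #include "cppuddle/memory_recycling/std_recycling_allocators.hpp" // new location

    int main() {
      // Old spelling: still valid through the deprecated alias in namespace recycler
      std::vector<double, recycler::recycle_std<double>> old_buf(512);
      // New spelling after the namespace rework
      std::vector<double, cppuddle::memory_recycling::recycle_std<double>> new_buf(512);
      // Counter printing and cleanup moved as well (cf. the hpx_main changes above)
      cppuddle::memory_recycling::print_buffer_counters();
      cppuddle::memory_recycling::force_buffer_cleanup();
      return 0;
    }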