From fa15cbe81cc820f2887e33cb6eb1e0598d6f5134 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20Dai=C3=9F?= Date: Wed, 23 Aug 2023 09:28:45 -0500 Subject: [PATCH] Backport multigpu interface changes [for 0.2.1] --- CMakeLists.txt | 4 +- include/aggregation_manager.hpp | 7 +++- include/buffer_manager.hpp | 30 +++++++++++++- include/cuda_buffer_util.hpp | 20 +++------- include/hip_buffer_util.hpp | 14 +++---- include/kokkos_buffer_util.hpp | 24 ++++++++++- include/stream_manager.hpp | 71 +++++++++++++++++++++++++++++---- 7 files changed, 132 insertions(+), 38 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index a2337a74..6a82c539 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -270,7 +270,7 @@ if (CPPUDDLE_WITH_TESTS) find_program(VALGRIND_COMMAND valgrind) if (VALGRIND_COMMAND) add_test(allocator_memcheck.valgrind - ${VALGRIND_COMMAND} --trace-children=yes --leak-check=full ./allocator_test --arraysize 5000000 --passes 200) + ${VALGRIND_COMMAND} --trace-children=yes --leak-check=full --undef-value-errors=no --show-error-list=yes ./allocator_test --arraysize 5000000 --passes 200) set_tests_properties(allocator_memcheck.valgrind PROPERTIES PASS_REGULAR_EXPRESSION "ERROR SUMMARY: 0 errors from 0 contexts" ) @@ -327,7 +327,7 @@ if (CPPUDDLE_WITH_TESTS) find_program(VALGRIND_COMMAND valgrind) if (VALGRIND_COMMAND) add_test(allocator_memcheck.valgrind - ${VALGRIND_COMMAND} --trace-children=yes --leak-check=full ./allocator_aligned_test --arraysize 5000000 --passes 200) + ${VALGRIND_COMMAND} --trace-children=yes --leak-check=full --undef-value-errors=no --show-error-list=yes ./allocator_aligned_test --arraysize 5000000 --passes 200) set_tests_properties(allocator_memcheck.valgrind PROPERTIES PASS_REGULAR_EXPRESSION "ERROR SUMMARY: 0 errors from 0 contexts" ) diff --git a/include/aggregation_manager.hpp b/include/aggregation_manager.hpp index cd0f3afb..ce30d27c 100644 --- a/include/aggregation_manager.hpp +++ b/include/aggregation_manager.hpp @@ -385,6 +385,7 
@@ template class Aggregated_Executor { Executor &executor; public: + const size_t gpu_id{0}; // Subclasses /// Slice class - meant as a scope interface to the aggregated executor @@ -895,7 +896,7 @@ template class Aggregated_Executor { Aggregated_Executor(const size_t number_slices, Aggregated_Executor_Modes mode) - : max_slices(number_slices), current_slices(0), slices_exhausted(false),dealloc_counter(0), + : gpu_id(0), max_slices(number_slices), current_slices(0), slices_exhausted(false), dealloc_counter(0), mode(mode), executor_slices_alive(false), buffers_in_use(false), executor_tuple( stream_pool::get_interface>()), @@ -988,7 +989,9 @@ class aggregation_pool { /// interface template static void init(size_t number_of_executors, size_t slices_per_executor, - Aggregated_Executor_Modes mode) { + Aggregated_Executor_Modes mode, size_t num_devices = 1) { + if (num_devices > 1) + throw std::runtime_error("Got num_devices > 1. MultiGPU not yet supported in v0.2.1"); std::lock_guard guard(instance.pool_mutex); assert(instance.aggregation_executor_pool.empty()); for (int i = 0; i < number_of_executors; i++) { diff --git a/include/buffer_manager.hpp b/include/buffer_manager.hpp index f7ccac7d..0cd96b6d 100644 --- a/include/buffer_manager.hpp +++ b/include/buffer_manager.hpp @@ -14,12 +14,16 @@ #include #include #include +#include #ifdef CPPUDDLE_HAVE_COUNTERS #include #endif + namespace recycler { +constexpr size_t number_instances = 1; +constexpr size_t max_number_gpus = 1; namespace detail { namespace util { @@ -51,7 +55,12 @@ class buffer_recycler { /// Returns and allocated buffer of the requested size - this may be a reused /// buffer template - static T *get(size_t number_elements, bool manage_content_lifetime = false) { + static T *get(size_t number_elements, bool manage_content_lifetime = false, + std::optional localtion_id = std::nullopt, std::optional device_id = std::nullopt) { + if (device_id) { + if (*device_id > 0) + throw std::runtime_error("Got device_id 
> 1. MultiGPU not yet supported in v0.2.1"); + } std::lock_guard guard(mut); if (!recycler_instance) { // NOLINTNEXTLINE(cppcoreguidelines-owning-memory) @@ -62,13 +71,21 @@ class buffer_recycler { } /// Marks an buffer as unused and fit for reusage template - static void mark_unused(T *p, size_t number_elements) { + static void mark_unused(T *p, size_t number_elements, + std::optional localtion_id = std::nullopt, std::optional device_id = std::nullopt) { std::lock_guard guard(mut); if (recycler_instance) { // if the instance was already destroyed all // buffers are destroyed anyway return buffer_manager::mark_unused(p, number_elements); } } + + template + static void register_allocator_counters_with_hpx(void) { + std::cerr << "Warning: CPPuddle v0.2.1 does not yet support HPX counters " + "-- this operation will be ignored!" + << std::endl; + } /// Increase the reference coutner of a buffer template static void increase_usage_counter(T *p, size_t number_elements) noexcept { @@ -590,6 +607,8 @@ std::unique_ptr> template struct recycle_allocator { using value_type = T; + using underlying_allocator_type = Host_Allocator; + static_assert(std::is_same_v); recycle_allocator() noexcept = default; template explicit recycle_allocator( @@ -627,6 +646,9 @@ operator!=(recycle_allocator const &, template struct aggressive_recycle_allocator { using value_type = T; + using underlying_allocator_type = Host_Allocator; + static_assert(std::is_same_v); + aggressive_recycle_allocator() noexcept = default; template explicit aggressive_recycle_allocator( @@ -675,6 +697,10 @@ using aggressive_recycle_std = /// Deletes all buffers (even ones still marked as used), delete the buffer /// managers and the recycler itself inline void force_cleanup() { detail::buffer_recycler::clean_all(); } +/// Dummy method that maps to clean_all for now - ensures interface +/// compatibility with 0.3.0 where finalize is a smarter cleanup that ensures no +/// further buffers can be added and static buffers
are properly cleaned +inline void finalize() { detail::buffer_recycler::clean_all(); } /// Deletes all buffers currently marked as unused inline void cleanup() { detail::buffer_recycler::clean_unused_buffers(); } diff --git a/include/cuda_buffer_util.hpp b/include/cuda_buffer_util.hpp index d2d0f596..4de0d323 100644 --- a/include/cuda_buffer_util.hpp +++ b/include/cuda_buffer_util.hpp @@ -160,29 +160,21 @@ struct cuda_aggregated_device_buffer { device_side_buffer = recycle_allocator_cuda_device{}.allocate(number_of_elements); } + explicit cuda_aggregated_device_buffer(size_t number_of_elements, Host_Allocator &alloc) + : number_of_elements(number_of_elements), alloc(alloc) { + device_side_buffer = + alloc.allocate(number_of_elements); + } explicit cuda_aggregated_device_buffer(size_t number_of_elements, size_t gpu_id, Host_Allocator &alloc) : gpu_id(gpu_id), number_of_elements(number_of_elements), set_id(true), alloc(alloc) { -#if defined(CPPUDDLE_HAVE_MULTIGPU) - cudaSetDevice(gpu_id); -#else // TODO It would be better to have separate method for this but it would change the interface - // This will have to do for some testing. If it's worth it, add separate method without cudaSetDevice + // This will have to do for some testing. If it's worth it, add separate method without cudaSetDevice // Allows for testing without any changes to other projects assert(gpu_id == 0); -#endif device_side_buffer = alloc.allocate(number_of_elements); } ~cuda_aggregated_device_buffer() { -#if defined(CPPUDDLE_HAVE_MULTIGPU) - if (set_id) - cudaSetDevice(gpu_id); -#else - // TODO It would be better to have separate method for this but it would change the interface - // This will have to do for some testing.
If it's worth it, add separate method without cudaSetDevice - // Allows for testing without any changes to other projects - assert(gpu_id == 0); -#endif alloc.deallocate(device_side_buffer, number_of_elements); } diff --git a/include/hip_buffer_util.hpp b/include/hip_buffer_util.hpp index 87d41e3b..9dc4dbaa 100644 --- a/include/hip_buffer_util.hpp +++ b/include/hip_buffer_util.hpp @@ -155,29 +155,25 @@ struct hip_aggregated_device_buffer { device_side_buffer = recycle_allocator_hip_device{}.allocate(number_of_elements); } + explicit hip_aggregated_device_buffer(size_t number_of_elements, Host_Allocator &alloc) + : number_of_elements(number_of_elements), alloc(alloc) { + device_side_buffer = + alloc.allocate(number_of_elements); + } explicit hip_aggregated_device_buffer(size_t number_of_elements, size_t gpu_id, Host_Allocator &alloc) : gpu_id(gpu_id), number_of_elements(number_of_elements), set_id(true), alloc(alloc) { -#if defined(CPPUDDLE_HAVE_MULTIGPU) - hipSetDevice(gpu_id); -#else // TODO It would be better to have separate method for this but it would change the interface // This will have to do for some testing. If it's worth it, add separate method without hipSetDevice // Allows for testing without any changes to other projects assert(gpu_id == 0); -#endif device_side_buffer = alloc.allocate(number_of_elements); } ~hip_aggregated_device_buffer() { -#if defined(CPPUDDLE_HAVE_MULTIGPU) - if (set_id) - hipSetDevice(gpu_id); -#else // TODO It would be better to have separate method for this but it would change the interface // This will have to do for some testing. 
If it's worth it, add separate method without hipSetDevice // Allows for testing without any changes to other projects assert(gpu_id == 0); -#endif alloc.deallocate(device_side_buffer, number_of_elements); } diff --git a/include/kokkos_buffer_util.hpp b/include/kokkos_buffer_util.hpp index 7e95a9c6..fa9a3349 100644 --- a/include/kokkos_buffer_util.hpp +++ b/include/kokkos_buffer_util.hpp @@ -84,7 +84,8 @@ class recycled_view : public kokkos_type { public: using view_type = kokkos_type; - template + template = true> explicit recycled_view(Args... args) : kokkos_type( allocator.allocate(kokkos_type::required_allocation_size(args...) / @@ -93,6 +94,26 @@ class recycled_view : public kokkos_type { total_elements(kokkos_type::required_allocation_size(args...) / sizeof(element_type)) {} + template = true> + recycled_view(const size_t device_id, Args... args) + : kokkos_type( + allocator.allocate(kokkos_type::required_allocation_size(args...) / + sizeof(element_type)), + args...), + total_elements(kokkos_type::required_allocation_size(args...) 
/ + sizeof(element_type)) {} + + template ::value, bool> = true> + recycled_view(std::size_t device_id, layout_t layout) + : kokkos_type( + allocator.allocate(kokkos_type::required_allocation_size(layout) / + sizeof(element_type)), + layout), + total_elements(kokkos_type::required_allocation_size(layout) / + sizeof(element_type)) {} + recycled_view( const recycled_view &other) : kokkos_type(other) { @@ -101,6 +122,7 @@ class recycled_view : public kokkos_type { allocator.increase_usage_counter(this->data(), this->total_elements); } + recycled_view & operator=(const recycled_view &other) { allocator.deallocate(this->data(), total_elements); diff --git a/include/stream_manager.hpp b/include/stream_manager.hpp index 0984678d..a3206ff2 100644 --- a/include/stream_manager.hpp +++ b/include/stream_manager.hpp @@ -13,6 +13,7 @@ #include #include #include +#include #include //#include @@ -221,35 +222,86 @@ class stream_pool { stream_pool_implementation::init( number_of_streams, std::forward(executor_args)...); } + // Dummy for interface compatibility with future 0.3.0 release + // Works the same as the init method here + template + static void init_all_executor_pools(size_t number_of_streams, Ts &&... executor_args) { + std::lock_guard guard(mut); + if (!access_instance) { + // NOLINTNEXTLINE(cppcoreguidelines-owning-memory) + access_instance.reset(new stream_pool()); + } + assert(access_instance); + stream_pool_implementation::init( + number_of_streams, std::forward(executor_args)...); + } + // Dummy for interface compatibility with future 0.3.0 release + // Works the same as the init method here + template + static void init_executor_pool(size_t device_id, size_t number_of_streams, Ts &&... executor_args) { + if (device_id > 0) + throw std::runtime_error("Got device_id > 0.
MultiGPU not yet supported in cppuddle v0.2.1"); + std::lock_guard guard(mut); + if (!access_instance) { + // NOLINTNEXTLINE(cppcoreguidelines-owning-memory) + access_instance.reset(new stream_pool()); + } + assert(access_instance); + stream_pool_implementation::init( + number_of_streams, std::forward(executor_args)...); + } template static void cleanup() { assert(access_instance); // should already be initialized stream_pool_implementation::cleanup(); } template - static std::tuple get_interface() { + static std::tuple get_interface(size_t device_id = 0) { + if (device_id > 0) + throw std::runtime_error("Got device_id > 0. MultiGPU not yet supported in cppuddle v0.2.1"); assert(access_instance); // should already be initialized return stream_pool_implementation::get_interface(); } template - static void release_interface(size_t index) noexcept { + static void release_interface(size_t index, size_t device_id = 0) { + if (device_id > 0) + throw std::runtime_error("Got device_id > 0. MultiGPU not yet supported in cppuddle v0.2.1"); assert(access_instance); // should already be initialized stream_pool_implementation::release_interface(index); } template - static bool interface_available(size_t load_limit) noexcept { + static bool interface_available(size_t load_limit, size_t device_id = 0) { + if (device_id > 0) + throw std::runtime_error("Got device_id > 0. MultiGPU not yet supported in cppuddle v0.2.1"); assert(access_instance); // should already be initialized return stream_pool_implementation::interface_available( load_limit); } template - static size_t get_current_load() noexcept { + static size_t get_current_load(size_t device_id = 0) { + if (device_id > 0) + throw std::runtime_error("Got device_id > 0. 
MultiGPU not yet supported in cppuddle v0.2.1"); assert(access_instance); // should already be initialized return stream_pool_implementation::get_current_load(); } template - static size_t get_next_device_id() noexcept { + static size_t get_next_device_id(size_t num_devices = 1) { + if (num_devices > 1) + throw std::runtime_error("Got num_devices > 1. MultiGPU not yet supported in cppuddle v0.2.1"); assert(access_instance); // should already be initialized - return stream_pool_implementation::get_next_device_id(); + return 0; + } + + template + static size_t select_device(size_t device_id = 0) { + if (device_id > 0) + throw std::runtime_error("Got device_id > 0. MultiGPU not yet supported in cppuddle v0.2.1"); + return 0; + } + + // Dummy for interface compatibility with future 0.3.0 release + template + static void set_device_selector(std::function select_gpu_function) { + } private: @@ -347,9 +399,12 @@ std::unique_ptr> template class stream_interface { public: - explicit stream_interface() + explicit stream_interface(size_t device_id = 0) : t(stream_pool::get_interface()), - interface(std::get<0>(t)), interface_index(std::get<1>(t)) {} + interface(std::get<0>(t)), interface_index(std::get<1>(t)) { + if (device_id > 0) + throw std::runtime_error("Got device_id > 0. MultiGPU not yet supported in cppuddle v0.2.1"); + } stream_interface(const stream_interface &other) = delete; stream_interface &operator=(const stream_interface &other) = delete;