Move Kokkos Kernel Aggregation Utils from Octo-Tiger to CPPuddle #31

Merged · 5 commits · Mar 12, 2024
2 changes: 1 addition & 1 deletion include/buffer_manager.hpp
@@ -15,7 +15,7 @@
#include "cppuddle/memory_recycling/detail/buffer_management.hpp"
#include "cppuddle/memory_recycling/std_recycling_allocators.hpp"

/// Deprectated LEGACY namespace. Kept around for compatiblity with old code for now
/// Deprecated LEGACY namespace. Kept around for compatibility with old code for now
namespace recycler {

namespace detail {
@@ -11,7 +11,9 @@

#include "cppuddle/executor_recycling/detail/executor_pools_management.hpp"

/// main CPPuddle namespace
/// Primary CPPuddle namespace containing the three main feature modules
/// (memory_recycling, executor_recycling and kernel_aggregation) in
/// sub-namespaces
namespace cppuddle {
/// CPPuddle namespace containing the executor pool functionality
namespace executor_recycling {
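For orientation, the doc comment above names CPPuddle's three feature modules. A minimal sketch of the memory_recycling module in use (the recycle_std allocator name is inferred from the std_recycling_allocators.hpp header included earlier; treat the exact spelling as an assumption):

    #include <vector>
    #include "cppuddle/memory_recycling/std_recycling_allocators.hpp"

    // A std::vector whose heap buffer is drawn from (and later returned to)
    // CPPuddle's buffer recycling pool instead of plain new/delete:
    std::vector<float, cppuddle::memory_recycling::recycle_std<float>> buf(1024);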
@@ -432,19 +432,26 @@ template <typename Executor> class aggregated_executor {
/// How many slices are there overall - required to check the launch
/// criteria
const size_t number_slices;
const size_t max_slices;
const size_t id;
using executor_t = Executor;
executor_slice(aggregated_executor &parent, const size_t slice_id,
const size_t number_slices)
const size_t number_slices, const size_t max_number_slices)
: parent(parent), notify_parent_about_destruction(true),
number_slices(number_slices), id(slice_id) {
}
number_slices(number_slices), id(slice_id), max_slices(max_number_slices) {
assert(parent.max_slices == max_slices);
assert(number_slices >= 1);
assert(number_slices <= max_slices);
}
~executor_slice(void) {
// Don't notify parent if we moved away from this executor_slice
if (notify_parent_about_destruction) {
// Executor should be done by the time of destruction
// -> check here before notifying parent

assert(parent.max_slices == max_slices);
assert(number_slices >= 1);
assert(number_slices <= max_slices);
// parent still in execution mode?
assert(parent.slices_exhausted == true);
// all kernel launches done?
@@ -459,7 +466,7 @@
: parent(other.parent), launch_counter(std::move(other.launch_counter)),
buffer_counter(std::move(other.buffer_counter)),
number_slices(std::move(other.number_slices)),
id(std::move(other.id)) {
id(std::move(other.id)), max_slices(std::move(other.max_slices)) {
other.notify_parent_about_destruction = false;
}
executor_slice &operator=(executor_slice &&other) {
@@ -468,6 +475,7 @@
buffer_counter = std::move(other.buffer_counter);
number_slices = std::move(other.number_slices);
id = std::move(other.id);
max_slices = std::move(other.max_slices);
other.notify_parent_about_destruction = false;
}
template <typename T, typename Host_Allocator>
@@ -844,7 +852,7 @@
} else {
launched_slices = current_slices;
ret_fut = hpx::make_ready_future(executor_slice{*this,
executor_slices.size(), launched_slices});
executor_slices.size(), launched_slices, max_slices});
}

// Are we the first slice? If yes, add continuation set the
@@ -888,7 +896,7 @@
size_t id = 0;
for (auto &slice_promise : executor_slices) {
slice_promise.set_value(
executor_slice{*this, id, launched_slices});
executor_slice{*this, id, launched_slices, max_slices});
id++;
}
executor_slices.clear();
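To show where the new max_slices member surfaces for callers, here is a hedged usage sketch (the request_executor_slice call and the future-based API are assumptions following the aggregation manager interface; the executor object names are invented):

    // agg_exec is an aggregated_executor<Executor> instance
    auto slice_fut = agg_exec.request_executor_slice();
    slice_fut.then([](auto &&fut) {
      auto slice = fut.get(); // executor_slice for this task
      // Each slice now carries both its position and the aggregation bound:
      assert(slice.number_slices >= 1 && slice.number_slices <= slice.max_slices);
      // slice.id identifies this task among the aggregated ones
    });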
197 changes: 197 additions & 0 deletions include/cppuddle/kernel_aggregation/util/kokkos_aggregation_util.hpp
@@ -0,0 +1,197 @@
// Copyright (c) 2022-2024 Gregor Daiß
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)

/// \file This file contains convenience functions for using the kernel
/// aggregation together with Kokkos kernels. It helps with copying aggregated
/// views, launching aggregated Kokkos kernels, and mapping aggregated views
/// to the correct subview for the current task

// I originally developed and tested these utilities within Octo-Tiger. See:
// STEllAR-GROUP/octotiger/pull/469 and STEllAR-GROUP/octotiger/pull/487
// However, I think they are a better fit for CPPuddle, as they can be used
// with the work aggregation independently of Octo-Tiger
#ifndef KOKKOS_AGGREGATION_UTIL_HPP
#define KOKKOS_AGGREGATION_UTIL_HPP
#include <hpx/futures/future.hpp>
//#define KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION
#include <hpx/kokkos/executors.hpp>
#include <Kokkos_Core.hpp>
#include <hpx/kokkos.hpp>

#include <cppuddle/executor_recycling/executor_pools_interface.hpp>
#include <aggregation_manager.hpp>
#ifdef __NVCC__
#include <cuda/std/tuple>
#if defined(HPX_CUDA_VERSION) && (HPX_CUDA_VERSION < 1202)
// cuda::std::tuple structured bindings are broken in CUDA versions before
// 12.2 (HPX_CUDA_VERSION < 1202).
// See https://github.com/NVIDIA/libcudacxx/issues/316
// According to https://github.com/NVIDIA/libcudacxx/pull/317, the fix is to
// move tuple_element and tuple_size into the std namespace, which the
// following snippet does. This is only necessary for old CUDA versions;
// newer ones contain a fix for this issue
namespace std {
template<size_t _Ip, class... _Tp>
struct tuple_element<_Ip, _CUDA_VSTD::tuple<_Tp...>>
: _CUDA_VSTD::tuple_element<_Ip, _CUDA_VSTD::tuple<_Tp...>> {};
template <class... _Tp>
struct tuple_size<_CUDA_VSTD::tuple<_Tp...>>
: _CUDA_VSTD::tuple_size<_CUDA_VSTD::tuple<_Tp...>> {};
}
#endif
#endif
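
// Illustration (editor's note, not part of this header): the specializations
// above are what allow device-side structured bindings on cuda::std::tuple
// to compile with old CUDA versions, e.g.:
//
//   cuda::std::tuple<int, float> t{1, 2.0f};
//   auto [i, f] = t; // needs std::tuple_size/std::tuple_element to be found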

#if defined(__CUDACC__)
#define CPPUDDLE_HOST_DEVICE_METHOD __host__ __device__
#elif (defined(__clang__) && defined(__HIP__)) // for HIP compilation
#define CPPUDDLE_HOST_DEVICE_METHOD __host__ __device__
#else
#define CPPUDDLE_HOST_DEVICE_METHOD
#endif

namespace cppuddle {
namespace kernel_aggregation {

/// Get subview for the current slice
template <typename Agg_view_t>
CPPUDDLE_HOST_DEVICE_METHOD typename Agg_view_t::view_type
get_slice_subview(const size_t slice_id, const size_t max_slices,
const Agg_view_t &agg_view) {
const size_t slice_size = agg_view.size() / max_slices;
return Kokkos::subview(
agg_view, std::make_pair<size_t, size_t>(slice_id * slice_size,
(slice_id + 1) * slice_size));
}
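
// Usage sketch (editor's illustration, not part of this header): given an
// aggregated view holding max_slices * N elements, each task selects its own
// window of N elements:
//
//   // slice i sees elements [i * N, (i + 1) * N)
//   auto my_view = get_slice_subview(slice_id, max_slices, agg_view);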

/// Convenience function mapping aggregated Kokkos views to the current
/// execution slice by using subviews
template <typename Integer,
std::enable_if_t<std::is_integral<Integer>::value, bool> = true,
typename Agg_view_t, typename... Args>
CPPUDDLE_HOST_DEVICE_METHOD auto
map_views_to_slice(const Integer slice_id, const Integer max_slices,
const Agg_view_t &current_arg, const Args &...rest) {
static_assert(Kokkos::is_view<typename Agg_view_t::view_type>::value,
"Argument not an aggregated view");
#if defined(HPX_COMPUTE_DEVICE_CODE) && defined(__NVCC__)
if constexpr (sizeof...(Args) > 0) {
return cuda::std::tuple_cat(
cuda::std::make_tuple(
get_slice_subview(slice_id, max_slices, current_arg)),
map_views_to_slice(slice_id, max_slices, rest...));
} else {
return cuda::std::make_tuple(
get_slice_subview(slice_id, max_slices, current_arg));
}
#else
if constexpr (sizeof...(Args) > 0) {
return std::tuple_cat(
std::make_tuple(get_slice_subview(slice_id, max_slices, current_arg)),
map_views_to_slice(slice_id, max_slices, rest...));
} else {
return std::make_tuple(
get_slice_subview(slice_id, max_slices, current_arg));
}
#endif
}
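
// Usage sketch (editor's illustration): several aggregated views can be
// mapped in one call and the resulting tuple unpacked with structured
// bindings:
//
//   auto [x_slice, y_slice] =
//       map_views_to_slice(slice_id, max_slices, x_agg, y_agg);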

/// Convenience function mapping aggregated Kokkos views to the current
/// execution slice by using subviews, with the slice ID and slice count
/// taken from the aggregation executor
template <
typename Agg_executor_t, typename Agg_view_t,
std::enable_if_t<Kokkos::is_view<typename Agg_view_t::view_type>::value,
bool> = true,
typename... Args>
CPPUDDLE_HOST_DEVICE_METHOD auto
map_views_to_slice(const Agg_executor_t &agg_exec,
const Agg_view_t &current_arg, const Args &...rest) {
const size_t slice_id = agg_exec.id;
const size_t max_slices = agg_exec.max_slices;
static_assert(Kokkos::is_view<typename Agg_view_t::view_type>::value,
"Argument not an aggregated view");
if constexpr (sizeof...(Args) > 0) {
return std::tuple_cat(
std::make_tuple(get_slice_subview(slice_id, max_slices, current_arg)),
map_views_to_slice(agg_exec, rest...));
} else {
return std::make_tuple(
get_slice_subview(slice_id, max_slices, current_arg));
}
}
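
// Usage sketch (editor's illustration): with this overload the slice ID and
// the slice count come from the executor slice itself:
//
//   auto [x_slice, y_slice] = map_views_to_slice(agg_exec, x_agg, y_agg);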

/// Convenience function to perform an aggregated deep copy
template <typename Agg_executor_t, typename TargetView_t, typename SourceView_t>
void aggregated_deep_copy(Agg_executor_t &agg_exec, TargetView_t &target,
SourceView_t &source) {
if (agg_exec.sync_aggregation_slices()) {
Kokkos::deep_copy(agg_exec.get_underlying_executor().instance(), target,
source);
}
}
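
// Usage sketch (editor's illustration): every aggregated task may call this,
// but sync_aggregation_slices() ensures only one of them actually enqueues
// the copy on the underlying executor:
//
//   aggregated_deep_copy(agg_exec, device_agg_view, host_agg_view);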

/// Convenience function to perform an aggregated deep copy
template <typename Agg_executor_t, typename TargetView_t, typename SourceView_t>
void aggregated_deep_copy(Agg_executor_t &agg_exec, TargetView_t &target,
SourceView_t &source, int elements_per_slice) {
if (agg_exec.sync_aggregation_slices()) {
const size_t number_slices = agg_exec.number_slices;
auto target_slices = Kokkos::subview(
target,
std::make_pair<size_t, size_t>(0, number_slices * elements_per_slice));
auto source_slices = Kokkos::subview(
source,
std::make_pair<size_t, size_t>(0, number_slices * elements_per_slice));
Kokkos::deep_copy(agg_exec.get_underlying_executor().instance(),
target_slices, source_slices);
}
}
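
// Usage sketch (editor's illustration): when fewer than max_slices tasks
// ended up aggregated, passing elements_per_slice restricts the copy to the
// first number_slices * elements_per_slice elements instead of the whole view:
//
//   aggregated_deep_copy(agg_exec, device_agg_view, host_agg_view, N);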

/// Convenience function to perform an aggregated deep copy asynchronously
/// and get a future back
template <typename executor_t, typename TargetView_t, typename SourceView_t>
hpx::shared_future<void> aggregrated_deep_copy_async(
typename Aggregated_Executor<executor_t>::Executor_Slice &agg_exec,
TargetView_t &target, SourceView_t &source) {
const size_t gpu_id = agg_exec.parent.gpu_id;
auto launch_copy_lambda =
[gpu_id](TargetView_t &target, SourceView_t &source,
executor_t &exec) -> hpx::shared_future<void> {
cppuddle::executor_recycling::executor_pool::select_device<
executor_t, cppuddle::executor_recycling::round_robin_pool_impl<executor_t>>(gpu_id);
return hpx::kokkos::deep_copy_async(exec.instance(), target, source);
};
return agg_exec.wrap_async(launch_copy_lambda, target, source,
agg_exec.get_underlying_executor());
}
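
// Usage sketch (editor's illustration; executor_t cannot be deduced from the
// slice argument and must be spelled out, e.g. an hpx::kokkos executor type):
//
//   auto copy_done = aggregrated_deep_copy_async<executor_t>(
//       agg_exec, device_agg_view, host_agg_view);
//   copy_done.get(); // or attach continuations instead of blocking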

/// Convenience function to asynchronously deep copy only the first
/// number_slices * elements_per_slice elements and get a future back
template <typename executor_t, typename TargetView_t, typename SourceView_t>
hpx::shared_future<void> aggregrated_deep_copy_async(
typename Aggregated_Executor<executor_t>::Executor_Slice &agg_exec,
TargetView_t &target, SourceView_t &source, int elements_per_slice) {
const size_t number_slices = agg_exec.number_slices;
const size_t gpu_id = agg_exec.parent.gpu_id;
auto launch_copy_lambda = [gpu_id, elements_per_slice, number_slices](
TargetView_t &target, SourceView_t &source,
executor_t &exec) -> hpx::shared_future<void> {
cppuddle::executor_recycling::executor_pool::select_device<
executor_t,
cppuddle::executor_recycling::round_robin_pool_impl<executor_t>>(
gpu_id);
auto target_slices = Kokkos::subview(
target,
std::make_pair<size_t, size_t>(0, number_slices * elements_per_slice));
auto source_slices = Kokkos::subview(
source,
std::make_pair<size_t, size_t>(0, number_slices * elements_per_slice));
return hpx::kokkos::deep_copy_async(exec.instance(), target_slices,
source_slices);
};
return agg_exec.wrap_async(launch_copy_lambda, target, source,
agg_exec.get_underlying_executor());
}

} // namespace kernel_aggregation
} // namespace cppuddle

#endif
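
Taken together, the new utilities support an aggregated Kokkos kernel flow along these lines (an end-to-end sketch under stated assumptions: request_executor_slice follows the legacy aggregation_manager.hpp interface, hpx::kokkos::cuda_executor comes from hpx-kokkos, and all view names and sizes are invented for illustration):

    using executor_t = hpx::kokkos::cuda_executor;
    auto slice_fut = agg_exec.request_executor_slice();
    auto done = slice_fut.then([&](auto &&fut) {
      auto slice = fut.get();
      // Map the shared aggregated device views to this task's windows
      auto [x, y] = cppuddle::kernel_aggregation::map_views_to_slice(
          slice, x_agg_device, y_agg_device);
      // Stage the input once per aggregated launch, not once per task
      cppuddle::kernel_aggregation::aggregated_deep_copy(
          slice, x_agg_device, x_agg_host);
      // ... launch the aggregated Kokkos kernel on the slice's executor ...
      // Copy the results back asynchronously and hand out the future
      return cppuddle::kernel_aggregation::aggregrated_deep_copy_async<executor_t>(
          slice, y_agg_host, y_agg_device);
    });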