Skip to content

Commit

Permalink
Fix and simplify aggregation example
Browse files Browse the repository at this point in the history
  • Loading branch information
G-071 committed May 31, 2024
1 parent a12fb02 commit 751adac
Showing 1 changed file with 20 additions and 18 deletions.
38 changes: 20 additions & 18 deletions examples/kernel-aggregation-with-hpx-kokkos.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -130,19 +130,26 @@ using host_executor_t = hpx::kokkos::hpx_executor;
// kernel with a Kokkos::parallel_for
template <typename executor_t, typename view_t>
void kernel_add(executor_t &executor, const size_t entries_per_task,
const view_t &input_a, const view_t &input_b, view_t &output_c)
{
const size_t number_slices, const size_t max_kernels_fused,
const view_t &input_a, const view_t &input_b,
view_t &output_c) {
// Define exeuction policy
auto execution_policy = Kokkos::Experimental::require(
Kokkos::RangePolicy<decltype(executor.instance())>(
executor.instance(), 0, entries_per_task),
executor.instance(), 0, entries_per_task * number_slices),
Kokkos::Experimental::WorkItemProperty::HintLightWeight);

// Run Kernel with execution policy (and give it some name ideally)
Kokkos::parallel_for(
"sample vector add kernel", execution_policy,
KOKKOS_LAMBDA(size_t index) {
output_c[index] = input_a[index] + input_b[index];
const size_t slice_id = index / entries_per_task;
const size_t entry_index = index % entries_per_task;
auto [a_slice, b_slice, c_slice] =
cppuddle::kernel_aggregation::map_views_to_slice(slice_id, max_kernels_fused, input_a,
input_b, output_c);

c_slice[entry_index] = a_slice[entry_index] + b_slice[entry_index];
});
}

Expand Down Expand Up @@ -182,10 +189,7 @@ void launch_gpu_kernel_task(
aggregation_region_name, device_executor_t,
void>(max_kernels_fused, [&](auto slice_id, auto number_slices,
auto &aggregation_executor) {
cppuddle::kernel_aggregation::allocator_slice<float_t, host_allocator_t,
device_executor_t>
alloc_host = aggregation_executor
.template make_allocator<float_t, host_allocator_t>();
auto alloc_host = aggregation_executor. template make_allocator<float_t, host_allocator_t>();
assert(number_slices >= 1);
assert(number_slices < max_kernels_fused);

Expand All @@ -211,11 +215,9 @@ void launch_gpu_kernel_task(
host_b_slice[i] = 2.0;
}

cppuddle::kernel_aggregation::allocator_slice<float_t, device_allocator_t,
device_executor_t>
alloc_device = aggregation_executor
.template make_allocator<float_t, device_allocator_t>();
aggregated_device_view_t device_a(alloc_device, entries_per_task * max_kernels_fused);
auto alloc_device = aggregation_executor. template make_allocator<float_t, device_allocator_t>();
aggregated_device_view_t device_a(alloc_device,
entries_per_task * max_kernels_fused);
aggregated_device_view_t device_b(alloc_device, entries_per_task * max_kernels_fused);
aggregated_device_view_t device_c(alloc_device, entries_per_task * max_kernels_fused);

Expand All @@ -225,20 +227,20 @@ void launch_gpu_kernel_task(
cppuddle::kernel_aggregation::aggregated_deep_copy(aggregation_executor,
device_b, host_b);

if (aggregation_executor.sync_aggregation_slices()) {
if (aggregation_executor.sync_aggregation_slices()) { // Only launch one per team
kernel_add(aggregation_executor.get_underlying_executor(),
entries_per_task, device_a, device_b, device_c);
entries_per_task, number_slices, max_kernels_fused, device_a, device_b, device_c);
}

auto transfer_fut =
cppuddle::kernel_aggregation::aggregrated_deep_copy_async<device_executor_t>(
aggregation_executor, host_c, device_c);
cppuddle::kernel_aggregation::aggregrated_deep_copy_async<
device_executor_t>(aggregation_executor, host_c, device_c);
transfer_fut.get();

// 5. Host-side postprocessing (usually: communication, here: check
// correctness)
for (size_t i = 0; i < entries_per_task; i++) {
if (host_c[i] != 1.0 + 2.0) {
if (host_c_slice[i] != 1.0 + 2.0) {
std::cerr << "Task " << task_id << " contained wrong results!!"
<< std::endl;
break;
Expand Down

0 comments on commit 751adac

Please sign in to comment.