From 801766e76e73b9fa10d17fc3de520c9c2942a9d1 Mon Sep 17 00:00:00 2001 From: Adam C Reyes Date: Thu, 4 Jul 2024 14:49:33 +0200 Subject: [PATCH] Enable `parthenon::par_reduce` for MD loops with Kokkos 1D Range (#1130) * wrap 3D flat loop abstractions * add 4D loop and test * add specialization for const int & * added mdrange loops to par_reduce tests * refactor flatloop specialization * clean up * formatting * linting * templating functor index types * moved to a single functor * Update CHANGELOG.md * remove unused Nj --------- Co-authored-by: Cloud User Co-authored-by: Luke Roberts --- CHANGELOG.md | 1 + src/kokkos_abstraction.hpp | 85 +++++++++++++++++++++++---------- tst/unit/kokkos_abstraction.cpp | 81 +++++++++++++++++++++++++++++-- 3 files changed, 137 insertions(+), 30 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3e391c6d746d..df70d7badbbc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,7 @@ ## Current develop ### Added (new features/APIs/variables/...) +- [[PR 1130]](https://github.com/parthenon-hpc-lab/parthenon/pull/1130) Enable `parthenon::par_reduce` for MD loops with Kokkos 1D Range - [[PR 1119]](https://github.com/parthenon-hpc-lab/parthenon/pull/1119) Formalize MeshData partitioning. - [[PR 1128]](https://github.com/parthenon-hpc-lab/parthenon/pull/1128) Add cycle and nbtotal to hst - [[PR 1099]](https://github.com/parthenon-hpc-lab/parthenon/pull/1099) Functionality for outputting task graphs in GraphViz format. diff --git a/src/kokkos_abstraction.hpp b/src/kokkos_abstraction.hpp index 2d74ce00932a..ca8c59ffe12e 100644 --- a/src/kokkos_abstraction.hpp +++ b/src/kokkos_abstraction.hpp @@ -258,6 +258,35 @@ par_dispatch(LoopPatternMDRange, const std::string &name, DevExecSpace exec_spac function, std::forward(args)...); } +template +class FlatFunctor; + +template +auto MakeFlatFunctor(F &function, Args... args) { + return FlatFunctor(function, std::forward(args)...); +} + +template +class FlatFunctor { + int NjNi, Ni, kl, jl, il; + Function function; + + public: + FlatFunctor(const Function _function, const int _NjNi, const int _Ni, const int _kl, + const int _jl, const int _il) + : function(_function), NjNi(_NjNi), Ni(_Ni), kl(_kl), jl(_jl), il(_il) {} + KOKKOS_INLINE_FUNCTION + void operator()(const int &idx, FArgs &&...fargs) const { + int k = idx / NjNi; + int j = (idx - k * NjNi) / Ni; + int i = idx - k * NjNi - j * Ni; + k += kl; + j += jl; + i += il; + function(k, j, i, std::forward(fargs)...); + } +}; + // 3D loop using Kokkos 1D Range template inline typename std::enable_if::type @@ -270,18 +299,9 @@ par_dispatch(LoopPatternFlatRange, const std::string &name, DevExecSpace exec_sp const int Ni = iu - il + 1; const int NkNjNi = Nk * Nj * Ni; const int NjNi = Nj * Ni; - kokkos_dispatch( - tag, name, Kokkos::RangePolicy<>(exec_space, 0, NkNjNi), - KOKKOS_LAMBDA(const int &idx) { - int k = idx / NjNi; - int j = (idx - k * NjNi) / Ni; - int i = idx - k * NjNi - j * Ni; - k += kl; - j += jl; - i += il; - function(k, j, i); - }, - std::forward(args)...); + kokkos_dispatch(tag, name, Kokkos::RangePolicy<>(exec_space, 0, NkNjNi), + MakeFlatFunctor(function, NjNi, Ni, kl, jl, il), + std::forward(args)...); } // 3D loop using MDRange loops @@ -372,6 +392,30 @@ inline void par_dispatch(LoopPatternSimdFor, const std::string &name, function(k, j, i); } +template +class FlatFunctor { + int NkNjNi, NjNi, Ni, nl, kl, jl, il; + Function function; + + public: + FlatFunctor(const Function _function, const int _NkNjNi, const int _NjNi, const int _Ni, + const int _nl, const int _kl, const int _jl, const int _il) + : function(_function), NkNjNi(_NkNjNi), NjNi(_NjNi), Ni(_Ni), nl(_nl), kl(_kl), + jl(_jl), il(_il) {} + KOKKOS_INLINE_FUNCTION + void operator()(const int &idx, FArgs &&...fargs) const { + int n = idx / NkNjNi; + int k = (idx - n * NkNjNi) / NjNi; + int j = (idx - n * NkNjNi - k * NjNi) / Ni; + int i = idx - n * NkNjNi - k * NjNi - j * Ni; + n += nl; + k += kl; + j += jl; + i += il; + function(n, k, j, i, std::forward(fargs)...); + } +}; + // 4D loop using Kokkos 1D Range template inline typename std::enable_if::type @@ -387,20 +431,9 @@ par_dispatch(LoopPatternFlatRange, const std::string &name, DevExecSpace exec_sp const int NnNkNjNi = Nn * Nk * Nj * Ni; const int NkNjNi = Nk * Nj * Ni; const int NjNi = Nj * Ni; - kokkos_dispatch( - tag, name, Kokkos::RangePolicy<>(exec_space, 0, NnNkNjNi), - KOKKOS_LAMBDA(const int &idx) { - int n = idx / NkNjNi; - int k = (idx - n * NkNjNi) / NjNi; - int j = (idx - n * NkNjNi - k * NjNi) / Ni; - int i = idx - n * NkNjNi - k * NjNi - j * Ni; - n += nl; - k += kl; - j += jl; - i += il; - function(n, k, j, i); - }, - std::forward(args)...); + kokkos_dispatch(tag, name, Kokkos::RangePolicy<>(exec_space, 0, NnNkNjNi), + MakeFlatFunctor(function, NkNjNi, NjNi, Ni, nl, kl, jl, il), + std::forward(args)...); } // 4D loop using MDRange loops diff --git a/tst/unit/kokkos_abstraction.cpp b/tst/unit/kokkos_abstraction.cpp index 0adc61c441b0..ae0e3fcb79e8 100644 --- a/tst/unit/kokkos_abstraction.cpp +++ b/tst/unit/kokkos_abstraction.cpp @@ -500,12 +500,85 @@ bool test_wrapper_reduce_1d(T loop_pattern, DevExecSpace exec_space) { return total == test_tot; } +template +bool test_wrapper_reduce_3d(T loop_pattern, DevExecSpace exec_space) { + constexpr int N = 10; + parthenon::ParArray3D buffer("Testing buffer", N, N, N); + // Initialize data + parthenon::par_for( + loop_pattern, "Initialize parallel reduce array", exec_space, 0, N - 1, 0, N - 1, 0, + N - 1, KOKKOS_LAMBDA(const int k, const int j, const int i) { + buffer(k, j, i) = i + j + k; + }); + int tot = 0; + for (int k = 0; k < N; ++k) { + for (int j = 0; j < N; ++j) { + for (int i = 0; i < N; ++i) { + tot += i + j + k; + } + } + } + int test_tot = 0; + parthenon::par_reduce( + loop_pattern, "Sum via par reduce", exec_space, 0, N - 1, 0, N - 1, 0, N - 1, + KOKKOS_LAMBDA(const int k, const int j, const int i, int &t) { t += i + j + k; }, + Kokkos::Sum(test_tot)); + return tot == test_tot; +} + +template +bool test_wrapper_reduce_4d(T loop_pattern, DevExecSpace exec_space) { + constexpr int N = 10; + parthenon::ParArray4D buffer("Testing buffer", N, N, N, N); + // Initialize data + parthenon::par_for( + loop_pattern, "Initialize parallel reduce array", exec_space, 0, N - 1, 0, N - 1, 0, + N - 1, 0, N - 1, KOKKOS_LAMBDA(const int n, const int k, const int j, const int i) { + buffer(n, k, j, i) = i + j + k + n; + }); + int tot = 0; + for (int n = 0; n < N; ++n) { + for (int k = 0; k < N; ++k) { + for (int j = 0; j < N; ++j) { + for (int i = 0; i < N; ++i) { + tot += i + j + k + n; + } + } + } + } + int test_tot = 0; + parthenon::par_reduce( + loop_pattern, "Sum via par reduce", exec_space, 0, N - 1, 0, N - 1, 0, N - 1, 0, + N - 1, + KOKKOS_LAMBDA(const int n, const int k, const int j, const int i, int &t) { + t += i + j + k + n; + }, + Kokkos::Sum(test_tot)); + return tot == test_tot; +} + TEST_CASE("Parallel reduce", "[par_reduce]") { auto default_exec_space = DevExecSpace(); - REQUIRE(test_wrapper_reduce_1d(parthenon::loop_pattern_flatrange_tag, - default_exec_space) == true); - if constexpr (std::is_same::value) { - REQUIRE(test_wrapper_reduce_1d(parthenon::loop_pattern_simdfor_tag, + SECTION("1D loops") { + REQUIRE(test_wrapper_reduce_1d(parthenon::loop_pattern_flatrange_tag, + default_exec_space) == true); + if constexpr (std::is_same::value) { + REQUIRE(test_wrapper_reduce_1d(parthenon::loop_pattern_simdfor_tag, + default_exec_space) == true); + } + } + + SECTION("3D loops") { + REQUIRE(test_wrapper_reduce_3d(parthenon::loop_pattern_flatrange_tag, + default_exec_space) == true); + REQUIRE(test_wrapper_reduce_3d(parthenon::loop_pattern_mdrange_tag, + default_exec_space) == true); + } + + SECTION("4D loops") { + REQUIRE(test_wrapper_reduce_4d(parthenon::loop_pattern_flatrange_tag, + default_exec_space) == true); + REQUIRE(test_wrapper_reduce_4d(parthenon::loop_pattern_mdrange_tag, default_exec_space) == true); } }