From b5594528e31cbcb3bd5d2aaa0f944d36d10b2c8f Mon Sep 17 00:00:00 2001 From: Jonah Miller Date: Fri, 22 Nov 2024 11:28:20 -0700 Subject: [PATCH 1/2] Update Kokkos version to 4.4.1 (#1191) * bump Kokkos version * changelog * Update Kokkos version to 4.4.1 * Fix View of View dealloc * Move view_alloc to host * Temp disable rocthrust on amd ci container * Fix doc creation. Rollback to ubuntu22-04 * Fix more view of views * Chasing more ViewOfView on host * Fix ViewAlloc func and add doc * Disable Ascent in testing * Revert "Disable Ascent in testing" This reverts commit ebbcc1704ed2c68c99808cf099eef41396b2d5ce. * Disable CUDA IPC * Fix more view of views * Fix missig update of private var * Make linter happy * Add ParArray#DRaw and hide Kokkos interface more * Add doc * Debug build in check compilers without symbols * Only check compilers prior to merge * Fix typo * Bump rocm image and change arch for debug hip build * Fix comment that broke the command line --------- Co-authored-by: Jonah Miller Co-authored-by: Patrick Mullen Co-authored-by: Philipp Grete --- .github/workflows/check-compilers.yml | 24 +++++++--- .github/workflows/ci-extended.yml | 4 +- .github/workflows/ci-short.yml | 8 ++-- CHANGELOG.md | 24 +++++----- README.md | 2 +- cmake/machinecfg/GitHubActions.cmake | 3 ++ doc/sphinx/src/development.rst | 41 +++++++++++++++- external/Kokkos | 2 +- scripts/docker/Dockerfile.hip-rocm | 14 +++--- src/bvals/bvals.hpp | 3 -- src/bvals/comms/bnd_info.cpp | 4 +- src/bvals/comms/bnd_info.hpp | 7 +-- src/bvals/comms/bvals_utils.hpp | 5 +- src/interface/mesh_data.hpp | 5 +- src/interface/sparse_pack_base.cpp | 9 ++-- src/interface/sparse_pack_base.hpp | 5 +- src/interface/swarm_pack_base.hpp | 15 +++--- src/interface/variable_pack.hpp | 36 +++++++------- src/kokkos_abstraction.hpp | 67 +++++++++++++++++++++------ src/mesh/meshblock_pack.hpp | 4 +- src/parthenon_array_generic.hpp | 4 ++ tst/unit/test_pararrays.cpp | 8 ++-- 22 files changed, 199 insertions(+), 95 deletions(-) diff --git a/.github/workflows/check-compilers.yml b/.github/workflows/check-compilers.yml index 4a445e79347a..5a55810971d9 100644 --- a/.github/workflows/check-compilers.yml +++ b/.github/workflows/check-compilers.yml @@ -1,6 +1,14 @@ name: Check compilers -on: [push, pull_request] +on: + # run every day at 06:00 UTC + schedule: + - cron: '0 6 * * *' + # when triggered manually + workflow_dispatch: + # when auto merge is enabled (hack to make sure it's run before merging) + pull_request: + types: [auto_merge_enabled] # Cancel "duplicated" workflows triggered by pushes to internal # branches with associated PRs. @@ -14,7 +22,7 @@ jobs: strategy: matrix: cxx: ['g++', 'clang++-15'] - cmake_build_type: ['Release', 'Debug'] + cmake_build_type: ['Release', 'DbgNoSym'] device: ['cuda', 'host'] parallel: ['serial', 'mpi'] exclude: @@ -23,7 +31,7 @@ jobs: # https://github.com/lanl/parthenon/issues/630 - cxx: clang++-15 device: cuda - cmake_build_type: Debug + cmake_build_type: DbgNoSym runs-on: ubuntu-latest container: image: ghcr.io/parthenon-hpc-lab/cuda11.6-mpi-hdf5-ascent @@ -48,12 +56,12 @@ jobs: strategy: matrix: cxx: ['hipcc'] - cmake_build_type: ['Release', 'Debug'] + cmake_build_type: ['Release', 'DbgNoSym'] device: ['hip'] parallel: ['serial', 'mpi'] runs-on: ubuntu-latest container: - image: ghcr.io/parthenon-hpc-lab/rocm5.4.3-mpi-hdf5 + image: ghcr.io/parthenon-hpc-lab/rocm6.2-mpi-hdf5 env: CMAKE_GENERATOR: Ninja steps: @@ -61,12 +69,16 @@ jobs: with: submodules: 'true' - name: CMake + # Manually chaning the arch for this (debug) build as the + # -O0 option causes compiler issue for the navi 1030 GPU at + # compile time, see https://github.com/parthenon-hpc-lab/parthenon/pull/1191#issuecomment-2492035364 run: | cmake -B builddir \ -DCMAKE_CXX_COMPILER=${{ matrix.cxx }} \ -DCMAKE_BUILD_TYPE=${{ matrix.cmake_build_type }} \ -DMACHINE_CFG=${PWD}/cmake/machinecfg/GitHubActions.cmake \ - -DMACHINE_VARIANT=${{ matrix.device }}_${{ matrix.parallel }} + -DMACHINE_VARIANT=${{ matrix.device }}_${{ matrix.parallel }} \ + -DKokkos_ARCH_AMD_GFX90A=ON -DKokkos_ARCH_NAVI1030=OFF - name: Build run: | cmake --build builddir --parallel 2 diff --git a/.github/workflows/ci-extended.yml b/.github/workflows/ci-extended.yml index 8ca646cfc2eb..1dd39c782477 100644 --- a/.github/workflows/ci-extended.yml +++ b/.github/workflows/ci-extended.yml @@ -21,6 +21,8 @@ env: CMAKE_BUILD_PARALLEL_LEVEL: 5 # num threads for build MACHINE_CFG: cmake/machinecfg/CI.cmake OMPI_MCA_mpi_common_cuda_event_max: 1000 + # CUDA IPC within docker repeated seem to cause issue on the CI machine + OMPI_MCA_btl_smcuda_use_cuda_ipc: 0 # https://github.com/open-mpi/ompi/issues/4948#issuecomment-395468231 OMPI_MCA_btl_vader_single_copy_mechanism: none @@ -34,7 +36,7 @@ jobs: container: image: ghcr.io/parthenon-hpc-lab/cuda11.6-mpi-hdf5-ascent # map to local user id on CI machine to allow writing to build cache - options: --user 1001 + options: --user 1001 --cap-add CAP_SYS_PTRACE --shm-size="8g" --ulimit memlock=134217728 steps: - uses: actions/checkout@v3 with: diff --git a/.github/workflows/ci-short.yml b/.github/workflows/ci-short.yml index ecb4052411ee..7e0fd8bf759a 100644 --- a/.github/workflows/ci-short.yml +++ b/.github/workflows/ci-short.yml @@ -13,6 +13,8 @@ env: CMAKE_BUILD_PARALLEL_LEVEL: 5 # num threads for build MACHINE_CFG: cmake/machinecfg/CI.cmake OMPI_MCA_mpi_common_cuda_event_max: 1000 + # CUDA IPC within docker repeated seem to cause issue on the CI machine + OMPI_MCA_btl_smcuda_use_cuda_ipc: 0 # https://github.com/open-mpi/ompi/issues/4948#issuecomment-395468231 OMPI_MCA_btl_vader_single_copy_mechanism: none @@ -22,7 +24,7 @@ jobs: container: image: ghcr.io/parthenon-hpc-lab/cuda11.6-mpi-hdf5-ascent # map to local user id on CI machine to allow writing to build cache - options: --user 1001 + options: --user 1001 --cap-add CAP_SYS_PTRACE --shm-size="8g" --ulimit memlock=134217728 steps: - uses: actions/checkout@v3 with: @@ -47,7 +49,7 @@ jobs: container: image: ghcr.io/parthenon-hpc-lab/cuda11.6-mpi-hdf5-ascent # map to local user id on CI machine to allow writing to build cache - options: --user 1001 + options: --user 1001 --cap-add CAP_SYS_PTRACE --shm-size="8g" --ulimit memlock=134217728 steps: - uses: actions/checkout@v3 with: @@ -79,7 +81,7 @@ jobs: container: image: ghcr.io/parthenon-hpc-lab/cuda11.6-mpi-hdf5-ascent # map to local user id on CI machine to allow writing to build cache - options: --user 1001 + options: --user 1001 --cap-add CAP_SYS_PTRACE --shm-size="8g" --ulimit memlock=134217728 steps: - uses: actions/checkout@v3 with: diff --git a/CHANGELOG.md b/CHANGELOG.md index a4e0a5ca9cd2..8336392611a8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,7 +4,7 @@ ### Added (new features/APIs/variables/...) - [[PR 1103]](https://github.com/parthenon-hpc-lab/parthenon/pull/1103) Add sparsity to vector wave equation test -- [[PR 1185]](https://github.com/parthenon-hpc-lab/parthenon/pull/1185/files) Bugfix to particle defragmentation +- [[PR 1185]](https://github.com/parthenon-hpc-lab/parthenon/pull/1185) Bugfix to particle defragmentation - [[PR 1184]](https://github.com/parthenon-hpc-lab/parthenon/pull/1184) Fix swarm block neighbor indexing in 1D, 2D - [[PR 1183]](https://github.com/parthenon-hpc-lab/parthenon/pull/1183) Fix particle leapfrog example initialization data - [[PR 1179]](https://github.com/parthenon-hpc-lab/parthenon/pull/1179) Make a global variable for whether simulation is a restart @@ -12,12 +12,12 @@ - [[PR 1161]](https://github.com/parthenon-hpc-lab/parthenon/pull/1161) Make flux field Metadata accessible, add Metadata::CellMemAligned flag, small perfomance upgrades ### Changed (changing behavior/API/variables/...) +- [[PR 1191]](https://github.com/parthenon-hpc-lab/parthenon/pull/1191) Update Kokkos version to 4.4.1 - [[PR 1209]](https://github.com/parthenon-hpc-lab/parthenon/pull/1209) Ordered history output - [[PR 1206]](https://github.com/parthenon-hpc-lab/parthenon/pull/1206) Leapfrog fix -- [[PR1203]](https://github.com/parthenon-hpc-lab/parthenon/pull/1203) Pin Ubuntu CI image -- [[PR1177]](https://github.com/parthenon-hpc-lab/parthenon/pull/1177) Make mesh-level boundary conditions usable without the "user" flag +- [[PR 1203]](https://github.com/parthenon-hpc-lab/parthenon/pull/1203) Pin Ubuntu CI image +- [[PR 1177]](https://github.com/parthenon-hpc-lab/parthenon/pull/1177) Make mesh-level boundary conditions usable without the "user" flag - [[PR 1187]](https://github.com/parthenon-hpc-lab/parthenon/pull/1187) Make DataCollection::Add safer and generalize MeshBlockData::Initialize -- [[Issue 1165]](https://github.com/parthenon-hpc-lab/parthenon/issues/1165) Bump Kokkos submodule to 4.4.1 - [[PR 1171]](https://github.com/parthenon-hpc-lab/parthenon/pull/1171) Add PARTHENON_USE_SYSTEM_PACKAGES build option - [[PR 1172]](https://github.com/parthenon-hpc-lab/parthenon/pull/1172) Make parthenon manager robust against external MPI init and finalize calls @@ -35,7 +35,7 @@ ### Incompatibilities (i.e. breaking changes) -- [[PR1177]](https://github.com/parthenon-hpc-lab/parthenon/pull/1177) Make mesh-level boundary conditions usable without the "user" flag +- [[PR 1177]](https://github.com/parthenon-hpc-lab/parthenon/pull/1177) Make mesh-level boundary conditions usable without the "user" flag ## Release 24.08 Date: 2024-08-30 @@ -159,12 +159,12 @@ Date: 2024-03-21 - [[PR 973]](https://github.com/parthenon-hpc-lab/parthenon/pull/973) Multigrid performance upgrades ### Fixed (not changing behavior/API/variables/...) -- [[PR1023]](https://github.com/parthenon-hpc-lab/parthenon/pull/1023) Fix broken param of a scalar bool -- [[PR1012]](https://github.com/parthenon-hpc-lab/parthenon/pull/1012) Remove accidentally duplicated code -- [[PR992]](https://github.com/parthenon-hpc-lab/parthenon/pull/992) Allow custom PR ops with sparse pools -- [[PR988]](https://github.com/parthenon-hpc-lab/parthenon/pull/988) Fix bug in neighbor finding routine for small, periodic, refined meshes -- [[PR986]](https://github.com/parthenon-hpc-lab/parthenon/pull/986) Fix bug in sparse boundary communication BndInfo cacheing -- [[PR978]](https://github.com/parthenon-hpc-lab/parthenon/pull/978) remove erroneous sparse check +- [[PR 1023]](https://github.com/parthenon-hpc-lab/parthenon/pull/1023) Fix broken param of a scalar bool +- [[PR 1012]](https://github.com/parthenon-hpc-lab/parthenon/pull/1012) Remove accidentally duplicated code +- [[PR 992]](https://github.com/parthenon-hpc-lab/parthenon/pull/992) Allow custom PR ops with sparse pools +- [[PR 988]](https://github.com/parthenon-hpc-lab/parthenon/pull/988) Fix bug in neighbor finding routine for small, periodic, refined meshes +- [[PR 986]](https://github.com/parthenon-hpc-lab/parthenon/pull/986) Fix bug in sparse boundary communication BndInfo cacheing +- [[PR 978]](https://github.com/parthenon-hpc-lab/parthenon/pull/978) remove erroneous sparse check ### Infrastructure (changes irrelevant to downstream codes) - [[PR 1027]](https://github.com/parthenon-hpc-lab/parthenon/pull/1027) Refactor RestartReader as abstract class @@ -231,7 +231,7 @@ Date: 2023-11-16 - [[PR 901]](https://github.com/parthenon-hpc-lab/parthenon/pull/901) Implement shared element ownership model ### Removed (removing behavior/API/varaibles/...) -- [[PR 930](https://github.com/parthenon-hpc-lab/parthenon/pull/930) Remove ParthenonManager::ParthenonInit as it is error-prone and the split functions are the recommended usage. +- [[PR 930]](https://github.com/parthenon-hpc-lab/parthenon/pull/930) Remove ParthenonManager::ParthenonInit as it is error-prone and the split functions are the recommended usage. ## Release 0.8.0 diff --git a/README.md b/README.md index ed6a1bb05a16..b874e5172889 100644 --- a/README.md +++ b/README.md @@ -30,7 +30,7 @@ Parthenon -- a performance portable block-structured adaptive mesh refinement fr * CMake 3.16 or greater * C++17 compatible compiler -* Kokkos 4.0.1 or greater +* Kokkos 4.4.1 or greater ## Optional (enabling features) diff --git a/cmake/machinecfg/GitHubActions.cmake b/cmake/machinecfg/GitHubActions.cmake index 663dcb38d682..b524483d996c 100644 --- a/cmake/machinecfg/GitHubActions.cmake +++ b/cmake/machinecfg/GitHubActions.cmake @@ -19,6 +19,9 @@ message(STATUS "Loading machine configuration for GitHub Actions CI. ") # common options set(NUM_MPI_PROC_TESTING "2" CACHE STRING "CI runs tests with 2 MPI ranks") +set(Kokkos_ENABLE_ROCTHRUST OFF CACHE BOOL "Temporarily disabled as the container needs to be updated to the `-complete` base image.") + +set(CMAKE_CXX_FLAGS_DBGNOSYM "-O0" CACHE STRING "Debug build without symbols") set(MACHINE_CXX_FLAGS "") if (${MACHINE_VARIANT} MATCHES "cuda") diff --git a/doc/sphinx/src/development.rst b/doc/sphinx/src/development.rst index dbab91d8d5ce..79ec79bd8e61 100644 --- a/doc/sphinx/src/development.rst +++ b/doc/sphinx/src/development.rst @@ -21,7 +21,10 @@ Kokkos wrappers/abstractions - ``par_for`` wrappers use inclusive bounds, i.e., the loop will include the last index given - ``ParArrayND`` arrays by default allocate on the *device* using - default precision configured + default precision configured and come with a `State` that can + be used to store additional metadata. +- ``ParArray#DRaw`` directly map to Kokkos ``Views`` that are allocated + on *device* using default precision. - To create an array on the host with identical layout to the device array either use @@ -62,6 +65,42 @@ parallelism interface that is needed for managing memory cached in tightly nested loops. The wrappers are documented :ref:`here `. +View of Views +------------- + +Special care needs to be taken when working with a ``View`` of ``Views``. + +To repeat the Kokkos documenation: `Don't use them `__ + +But if you have to (which is the case in some places inside Parthenon) +then follow this pattern: + +.. code:: c++ + + ParArray1DRaw> view_of_pararrays(parthenon::ViewOfViewAlloc("myname"), 10); + +The ``ViewOfViewAlloc`` ensures that the ``Kokkos::SequentialHostInit`` property is added, +which results in the (inner ``View`` ) deallocators being called on the host (rather than on +the device by default). +Also note the use of the "raw" ``ParArray1DRaw``, which directly maps to a Kokkos ``View`` +(that is required to process the allocation property as this interface is not exposed +in the more generic ``ParArrayND``). + +Similarly, when you create a host mirror of said ``View`` of ``View`` add the additional +property for the same reason. + +.. code:: c++ + + // explicit theoretical example -- don't use this + auto view_of_pararrays_h = + Kokkos::create_mirror_view(Kokkos::view_alloc(Kokkos::SequentialHostInit), view_of_pararrays); + + // but instead use this interface provided by Parthenon: + auto view_of_pararrays_h = create_view_of_view_mirror(view_of_pararrays); + + +Note that the ``SequentialHostInit`` was only added in Kokkos 4.4.1 (which is now the default in Parthenon). + The need for reductions within function handling ``MeshBlock`` data ------------------------------------------------------------------- diff --git a/external/Kokkos b/external/Kokkos index 62d2b6c879b7..15dc143e5f39 160000 --- a/external/Kokkos +++ b/external/Kokkos @@ -1 +1 @@ -Subproject commit 62d2b6c879b74b6ae7bd06eb3e5e80139c4708e6 +Subproject commit 15dc143e5f39949eece972a798e175c4b463d4b8 diff --git a/scripts/docker/Dockerfile.hip-rocm b/scripts/docker/Dockerfile.hip-rocm index 5d9d5c765b6a..2d1a3504af13 100644 --- a/scripts/docker/Dockerfile.hip-rocm +++ b/scripts/docker/Dockerfile.hip-rocm @@ -1,4 +1,4 @@ -FROM rocm/dev-ubuntu-20.04:5.4.3 +FROM rocm/dev-ubuntu-24.04:6.2 RUN apt-get clean && apt-get update -y && \ DEBIAN_FRONTEND="noninteractive" TZ=America/New_York apt-get install -y --no-install-recommends git python3-minimal libpython3-stdlib bc hwloc wget openssh-client python3-numpy python3-h5py python3-matplotlib lcov curl cmake ninja-build openmpi-bin libopenmpi-dev && \ @@ -14,12 +14,10 @@ RUN cd /tmp && \ cd / && \ rm -rf /tmp/hdf5-1.10.8* -# "mpic++ --showme" forgets open-pal in Ubuntu 20.04 + OpenMPI 4.0.3 -# https://bugs.launchpad.net/ubuntu/+source/openmpi/+bug/1941786 -# https://github.com/open-mpi/ompi/issues/9317 -ENV LDFLAGS="-lopen-pal" - RUN update-alternatives --install /usr/bin/python python /usr/bin/python3 10 -# uid 1000 maps to the one running the container on the CI host -RUN useradd --create-home --shell /bin/bash -u 1000 -G render ci +# Latest image has default user with uid 1000 (which maps to the one running the container on the CI host +# Need to add user to the group that can access the GPU +RUN usermod -a -G render ubuntu + +WORKDIR /home/ubuntu diff --git a/src/bvals/bvals.hpp b/src/bvals/bvals.hpp index bc6ed0ebed56..aaa4da22c0f0 100644 --- a/src/bvals/bvals.hpp +++ b/src/bvals/bvals.hpp @@ -101,9 +101,6 @@ class BoundarySwarm : public BoundaryCommunication { explicit BoundarySwarm(std::weak_ptr pmb, const std::string &label); ~BoundarySwarm() = default; - std::vector> vars_int; - std::vector> vars_real; - // (usuallly the std::size_t unsigned integer type) std::vector::size_type bswarm_index; diff --git a/src/bvals/comms/bnd_info.cpp b/src/bvals/comms/bnd_info.cpp index 736992260913..2b4948278c38 100644 --- a/src/bvals/comms/bnd_info.cpp +++ b/src/bvals/comms/bnd_info.cpp @@ -40,8 +40,8 @@ namespace parthenon { void ProResCache_t::Initialize(int n_regions, StateDescriptor *pkg) { - prores_info = ParArray1D("prores_info", n_regions); - prores_info_h = Kokkos::create_mirror_view(prores_info); + prores_info = ProResInfoArr_t(ViewOfViewAlloc("prores_info"), n_regions); + prores_info_h = create_view_of_view_mirror(prores_info); int nref_funcs = pkg->NumRefinementFuncs(); // Note that assignment of Kokkos views resets them, but // buffer_subset_sizes is a std::vector. It must be cleared, then diff --git a/src/bvals/comms/bnd_info.hpp b/src/bvals/comms/bnd_info.hpp index e6214ceba322..43dfb06ef478 100644 --- a/src/bvals/comms/bnd_info.hpp +++ b/src/bvals/comms/bnd_info.hpp @@ -26,6 +26,7 @@ #include "bvals/neighbor_block.hpp" #include "coordinates/coordinates.hpp" #include "interface/variable_state.hpp" +#include "kokkos_abstraction.hpp" #include "mesh/domain.hpp" #include "mesh/forest/logical_coordinate_transformation.hpp" #include "utils/communication_buffer.hpp" @@ -127,11 +128,11 @@ struct ProResInfo { int GetBufferSize(MeshBlock *pmb, const NeighborBlock &nb, std::shared_ptr> v); -using BndInfoArr_t = ParArray1D; +using BndInfoArr_t = ParArray1DRaw; using BndInfoArrHost_t = typename BndInfoArr_t::HostMirror; -using ProResInfoArr_t = ParArray1D; -using ProResInfoArrHost_t = typename ParArray1D::HostMirror; +using ProResInfoArr_t = ParArray1DRaw; +using ProResInfoArrHost_t = typename ProResInfoArr_t::HostMirror; class StateDescriptor; struct ProResCache_t { ProResInfoArr_t prores_info{}; diff --git a/src/bvals/comms/bvals_utils.hpp b/src/bvals/comms/bvals_utils.hpp index f185c1207747..0c406b33c8c2 100644 --- a/src/bvals/comms/bvals_utils.hpp +++ b/src/bvals/comms/bvals_utils.hpp @@ -28,6 +28,7 @@ #include "bvals/comms/bnd_info.hpp" #include "bvals/comms/bvals_in_one.hpp" #include "interface/variable.hpp" +#include "kokkos_abstraction.hpp" #include "mesh/domain.hpp" #include "mesh/mesh.hpp" #include "mesh/meshblock.hpp" @@ -215,8 +216,8 @@ inline void RebuildBufferCache(std::shared_ptr> md, int nbound, using namespace loops; using namespace loops::shorthands; BvarsSubCache_t &cache = md->GetBvarsCache().GetSubCache(BOUND_TYPE, SENDER); - cache.bnd_info = BndInfoArr_t("bnd_info", nbound); - cache.bnd_info_h = Kokkos::create_mirror_view(cache.bnd_info); + cache.bnd_info = BndInfoArr_t(ViewOfViewAlloc("bnd_info"), nbound); + cache.bnd_info_h = create_view_of_view_mirror(cache.bnd_info); // prolongation/restriction sub-sets // TODO(JMM): Right now I exclude fluxcorrection boundaries but if diff --git a/src/interface/mesh_data.hpp b/src/interface/mesh_data.hpp index 14ae3959c32a..6b6518ee83b9 100644 --- a/src/interface/mesh_data.hpp +++ b/src/interface/mesh_data.hpp @@ -26,6 +26,7 @@ #include "interface/sparse_pack_base.hpp" #include "interface/swarm_pack_base.hpp" #include "interface/variable_pack.hpp" +#include "kokkos_abstraction.hpp" #include "mesh/domain.hpp" #include "mesh/meshblock.hpp" #include "mesh/meshblock_pack.hpp" @@ -149,8 +150,8 @@ const MeshBlockPack

&PackOnMesh(M &map, BlockDataList_t &block_data_, } if (make_new_pack) { - ParArray1D

packs("MeshData::PackVariables::packs", nblocks); - auto packs_host = Kokkos::create_mirror_view(packs); + ParArray1DRaw

packs(ViewOfViewAlloc("MeshData::PackVariables::packs"), nblocks); + auto packs_host = create_view_of_view_mirror(packs); for (size_t i = 0; i < nblocks; i++) { const auto &pack = packing_function(block_data_[i], this_map, this_key); diff --git a/src/interface/sparse_pack_base.cpp b/src/interface/sparse_pack_base.cpp index 2a7a5b70c41c..1965bdc45103 100644 --- a/src/interface/sparse_pack_base.cpp +++ b/src/interface/sparse_pack_base.cpp @@ -30,6 +30,7 @@ #include "interface/sparse_pack_base.hpp" #include "interface/state_descriptor.hpp" #include "interface/variable.hpp" +#include "kokkos_abstraction.hpp" #include "utils/utils.hpp" namespace parthenon { namespace impl { @@ -151,8 +152,8 @@ SparsePackBase SparsePackBase::Build(T *pmd, const PackDescriptor &desc, } else if (contains_face_or_edge) { leading_dim += 2; } - pack.pack_ = pack_t("data_ptr", leading_dim, pack.nblocks_, max_size); - pack.pack_h_ = Kokkos::create_mirror_view(pack.pack_); + pack.pack_ = pack_t(ViewOfViewAlloc("data_ptr"), leading_dim, pack.nblocks_, max_size); + pack.pack_h_ = create_view_of_view_mirror(pack.pack_); // For non-flat packs, shape of pack is type x block x var x k x j x i // where type here might be a flux. @@ -167,8 +168,8 @@ SparsePackBase SparsePackBase::Build(T *pmd, const PackDescriptor &desc, pack.block_props_ = block_props_t("block_props", nblocks, 27 + 1); pack.block_props_h_ = Kokkos::create_mirror_view(pack.block_props_); - pack.coords_ = coords_t("coords", desc.flat ? max_size : nblocks); - auto coords_h = Kokkos::create_mirror_view(pack.coords_); + pack.coords_ = coords_t(ViewOfViewAlloc("coords"), desc.flat ? max_size : nblocks); + auto coords_h = create_view_of_view_mirror(pack.coords_); // Fill the views int idx = 0; diff --git a/src/interface/sparse_pack_base.hpp b/src/interface/sparse_pack_base.hpp index 0deca487a20a..827bf54f7c9e 100644 --- a/src/interface/sparse_pack_base.hpp +++ b/src/interface/sparse_pack_base.hpp @@ -30,6 +30,7 @@ #include "interface/state_descriptor.hpp" #include "interface/variable.hpp" #include "interface/variable_state.hpp" +#include "kokkos_abstraction.hpp" #include "utils/utils.hpp" namespace parthenon { @@ -55,13 +56,13 @@ class SparsePackBase { using alloc_t = std::vector; using include_t = std::vector; - using pack_t = ParArray3D>; + using pack_t = ParArray3DRaw>; using pack_h_t = typename pack_t::HostMirror; using bounds_t = ParArray3D; using bounds_h_t = typename bounds_t::HostMirror; using block_props_t = ParArray2D; using block_props_h_t = typename block_props_t::HostMirror; - using coords_t = ParArray1D>; + using coords_t = ParArray1DRaw>; // Returns a SparsePackBase object that is either newly created or taken // from the cache in pmd. The cache itself handles the all of this logic diff --git a/src/interface/swarm_pack_base.hpp b/src/interface/swarm_pack_base.hpp index 0733aa51f329..337e219f4962 100644 --- a/src/interface/swarm_pack_base.hpp +++ b/src/interface/swarm_pack_base.hpp @@ -28,6 +28,7 @@ #include "interface/state_descriptor.hpp" #include "interface/swarm_device_context.hpp" #include "interface/variable.hpp" +#include "kokkos_abstraction.hpp" #include "utils/utils.hpp" namespace parthenon { @@ -43,10 +44,10 @@ class SwarmPackBase { SwarmPackBase() = default; virtual ~SwarmPackBase() = default; - using pack_t = ParArray3D>; + using pack_t = ParArray3DRaw>; using bounds_t = ParArray3D; - using contexts_t = ParArray1D; - using contexts_h_t = typename ParArray1D::HostMirror; + using contexts_t = ParArray1DRaw; + using contexts_h_t = typename contexts_t::HostMirror; using max_active_indices_t = ParArray1D; using desc_t = impl::SwarmPackDescriptor; using idx_map_t = std::unordered_map; @@ -108,8 +109,8 @@ class SwarmPackBase { // Allocate the views int leading_dim = 1; - pack.pack_ = pack_t("data_ptr", leading_dim, nblocks, max_size); - auto pack_h = Kokkos::create_mirror_view(pack.pack_); + pack.pack_ = pack_t(ViewOfViewAlloc("data_ptr"), leading_dim, nblocks, max_size); + auto pack_h = create_view_of_view_mirror(pack.pack_); pack.bounds_ = bounds_t("bounds", 2, nblocks, nvar); auto bounds_h = Kokkos::create_mirror_view(pack.bounds_); @@ -153,8 +154,8 @@ class SwarmPackBase { Kokkos::deep_copy(pack.pack_, pack_h); Kokkos::deep_copy(pack.bounds_, bounds_h); - pack.contexts_ = contexts_t("contexts", nblocks); - pack.contexts_h_ = Kokkos::create_mirror_view(pack.contexts_); + pack.contexts_ = contexts_t(ViewOfViewAlloc("contexts"), nblocks); + pack.contexts_h_ = create_view_of_view_mirror(pack.contexts_); pack.max_active_indices_ = max_active_indices_t("max_active_indices", nblocks); pack.flat_index_map_ = max_active_indices_t("flat_index_map", nblocks + 1); BuildSupplemental(pmd, desc, pack); diff --git a/src/interface/variable_pack.hpp b/src/interface/variable_pack.hpp index 037731093ce1..409290419dd7 100644 --- a/src/interface/variable_pack.hpp +++ b/src/interface/variable_pack.hpp @@ -244,10 +244,10 @@ class PackIndexMap { }; template -using ViewOfParArrays = ParArray1D>; +using ViewOfParArrays = ParArray1DRaw>; template -using ViewOfParArrays1D = ParArray1D>; +using ViewOfParArrays1D = ParArray1DRaw>; // forward declaration template @@ -570,10 +570,10 @@ void FillVarView(const VariableVector &vars, int vsize, bool coarse, assert(vsize == sparse_id_out.size()); assert(vsize == vector_component_out.size()); - auto host_cv = Kokkos::create_mirror_view(Kokkos::HostSpace(), cv_out); - auto host_sp = Kokkos::create_mirror_view(Kokkos::HostSpace(), sparse_id_out); - auto host_vc = Kokkos::create_mirror_view(Kokkos::HostSpace(), vector_component_out); - auto host_al = Kokkos::create_mirror_view(Kokkos::HostSpace(), allocated_out); + auto host_cv = create_view_of_view_mirror(cv_out); + auto host_sp = Kokkos::create_mirror_view(sparse_id_out); + auto host_vc = Kokkos::create_mirror_view(vector_component_out); + auto host_al = Kokkos::create_mirror_view(allocated_out); int vindex = 0; for (const auto &v : vars) { @@ -634,7 +634,7 @@ void FillSwarmVarView(const vpack_types::SwarmVarList &vars, ViewOfParArrays1D &cv_out, PackIndexMap *pvmap) { using vpack_types::IndexPair; - auto host_cv = Kokkos::create_mirror_view(Kokkos::HostSpace(), cv_out); + auto host_cv = create_view_of_view_mirror(cv_out); int vindex = 0; for (const auto v : vars) { @@ -675,10 +675,10 @@ void FillFluxViews(const VariableVector &vars, const int ndim, PackIndexMap *pvmap) { using vpack_types::IndexPair; - auto host_f1 = Kokkos::create_mirror_view(Kokkos::HostSpace(), f1_out); - auto host_f2 = Kokkos::create_mirror_view(Kokkos::HostSpace(), f2_out); - auto host_f3 = Kokkos::create_mirror_view(Kokkos::HostSpace(), f3_out); - auto host_al = Kokkos::create_mirror_view(Kokkos::HostSpace(), flux_allocated_out); + auto host_f1 = create_view_of_view_mirror(f1_out); + auto host_f2 = create_view_of_view_mirror(f2_out); + auto host_f3 = create_view_of_view_mirror(f3_out); + auto host_al = Kokkos::create_mirror_view(flux_allocated_out); int vindex = 0; for (const auto &v : vars) { @@ -755,10 +755,11 @@ VariableFluxPack MakeFluxPack(const VarListWithKeys &var_list, } // make the outer view - ViewOfParArrays cv("MakeFluxPack::cv", vsize * (extra_components ? 3 : 1)); - ViewOfParArrays f1("MakeFluxPack::f1", fsize); - ViewOfParArrays f2("MakeFluxPack::f2", fsize); - ViewOfParArrays f3("MakeFluxPack::f3", fsize); + ViewOfParArrays cv(ViewOfViewAlloc("MakeFluxPack::cv"), + vsize * (extra_components ? 3 : 1)); + ViewOfParArrays f1(ViewOfViewAlloc("MakeFluxPack::f1"), fsize); + ViewOfParArrays f2(ViewOfViewAlloc("MakeFluxPack::f2"), fsize); + ViewOfParArrays f3(ViewOfViewAlloc("MakeFluxPack::f3"), fsize); ParArray1D flux_allocated("MakePack::allocated", fsize); ParArray1D sparse_id("MakeFluxPack::sparse_id", vsize); ParArray1D vector_component("MakeFluxPack::vector_component", vsize); @@ -809,7 +810,8 @@ VariablePack MakePack(const VarListWithKeys &var_list, bool coarse, } // make the outer view - ViewOfParArrays cv("MakePack::cv", vsize * (extra_components ? 3 : 1)); + ViewOfParArrays cv(ViewOfViewAlloc("MakePack::cv"), + vsize * (extra_components ? 3 : 1)); ParArray1D sparse_id("MakePack::sparse_id", vsize); ParArray1D vector_component("MakePack::vector_component", vsize); ParArray1D allocated("MakePack::allocated", vsize); @@ -842,7 +844,7 @@ SwarmVariablePack MakeSwarmPack(const vpack_types::SwarmVarList &vars, } // make the outer view - ViewOfParArrays1D cv("MakePack::cv", vsize); + ViewOfParArrays1D cv(ViewOfViewAlloc("MakePack::cv"), vsize); std::array cv_size{0, 0}; if (vsize > 0) { diff --git a/src/kokkos_abstraction.hpp b/src/kokkos_abstraction.hpp index 8fa89f82e95e..701a5121ec04 100644 --- a/src/kokkos_abstraction.hpp +++ b/src/kokkos_abstraction.hpp @@ -72,30 +72,45 @@ using BufArray1D = Kokkos::View; template using buf_pool_t = ObjectPool>; +// Raw ParArrays (that directly map a view) +template +using ParArray0DRaw = Kokkos::View; +template +using ParArray1DRaw = Kokkos::View; +template +using ParArray2DRaw = Kokkos::View; +template +using ParArray3DRaw = Kokkos::View; +template +using ParArray4DRaw = Kokkos::View; +template +using ParArray5DRaw = Kokkos::View; +template +using ParArray6DRaw = Kokkos::View; +template +using ParArray7DRaw = Kokkos::View; +template +using ParArray8DRaw = Kokkos::View; + +// Standard ParArrays that wrap a raw view and a Stage template -using ParArray0D = ParArrayGeneric, State>; +using ParArray0D = ParArrayGeneric, State>; template -using ParArray1D = ParArrayGeneric, State>; +using ParArray1D = ParArrayGeneric, State>; template -using ParArray2D = ParArrayGeneric, State>; +using ParArray2D = ParArrayGeneric, State>; template -using ParArray3D = - ParArrayGeneric, State>; +using ParArray3D = ParArrayGeneric, State>; template -using ParArray4D = - ParArrayGeneric, State>; +using ParArray4D = ParArrayGeneric, State>; template -using ParArray5D = - ParArrayGeneric, State>; +using ParArray5D = ParArrayGeneric, State>; template -using ParArray6D = - ParArrayGeneric, State>; +using ParArray6D = ParArrayGeneric, State>; template -using ParArray7D = - ParArrayGeneric, State>; +using ParArray7D = ParArrayGeneric, State>; template -using ParArray8D = - ParArrayGeneric, State>; +using ParArray8D = ParArrayGeneric, State>; // Host mirrors template @@ -1035,6 +1050,28 @@ par_reduce_inner(InnerLoopPatternTTR, team_mbr_t team_member, const int il, cons reduction); } +// For ViewOfView we need to call the destructor of the inner views on +// the host and not on the device (which would happen by default). +// Thus, we need to pass `SquentialHostInit` as allocator, but only if the ViewOfView is +// on the host. If the ViewOfViews in on the device, then `SequentialHostInit` should be +// passed when calling `create_mirror_view`, which is automatically handled by the +// create_view_of_view_mirror() function below. +template +auto ViewOfViewAlloc(const std::string &label) { + if constexpr (std::is_same_v) { + return Kokkos::view_alloc(Kokkos::SequentialHostInit, label); + } else { + return Kokkos::view_alloc(label); + } +} + +// Returns a host mirror view with the `SequentialHostInit` property for proper +// deallocations of inner views in views of views. +template +auto create_view_of_view_mirror(T view) { + return Kokkos::create_mirror_view(Kokkos::view_alloc(Kokkos::SequentialHostInit), view); +} + // reused from kokoks/core/perf_test/PerfTest_ExecSpacePartitioning.cpp // commit a0d011fb30022362c61b3bb000ae3de6906cb6a7 template diff --git a/src/mesh/meshblock_pack.hpp b/src/mesh/meshblock_pack.hpp index 5669e112109b..b131d6d349d4 100644 --- a/src/mesh/meshblock_pack.hpp +++ b/src/mesh/meshblock_pack.hpp @@ -38,7 +38,7 @@ class MeshBlockPack { using pack_type = T; MeshBlockPack() = default; - MeshBlockPack(const ParArray1D view, const std::array dims) + MeshBlockPack(const ParArray1DRaw view, const std::array dims) : v_(view), dims_(dims), ndim_((dims[2] > 1 ? 3 : (dims[1] > 1 ? 2 : 1))) {} KOKKOS_FORCEINLINE_FUNCTION @@ -85,7 +85,7 @@ class MeshBlockPack { const Coordinates_t &GetCoords(const int i) const { return v_(i).GetCoords(); } private: - ParArray1D v_; + ParArray1DRaw v_; std::array dims_; int ndim_; }; diff --git a/src/parthenon_array_generic.hpp b/src/parthenon_array_generic.hpp index d527707f9070..ac38b6cb5e5f 100644 --- a/src/parthenon_array_generic.hpp +++ b/src/parthenon_array_generic.hpp @@ -221,6 +221,8 @@ class ParArrayGeneric : public State { // return GetDim(1) * GetDim(2) * GetDim(3) * GetDim(4) * GetDim(5) * GetDim(6); } + // TODO(PG?) Can we use concepts here to add a + // Kokkos::view_alloc(Kokkos::SequentialHostInit) when the original is a ViewOfView? template auto GetMirror(MemSpace const &memspace) { auto mirror = Kokkos::create_mirror_view(memspace, data_); @@ -333,6 +335,8 @@ inline auto subview(std::index_sequence, return parthenon::ParArrayGeneric(v, arr); } +// TODO(PG?) Can we use concepts here to add a +// Kokkos::view_alloc(Kokkos::SequentialHostInit) when the original is a ViewOfView? template inline auto create_mirror_view_and_copy(Space const &space, const parthenon::ParArrayGeneric &arr) { diff --git a/tst/unit/test_pararrays.cpp b/tst/unit/test_pararrays.cpp index e79927c13b20..3911fc70632d 100644 --- a/tst/unit/test_pararrays.cpp +++ b/tst/unit/test_pararrays.cpp @@ -451,8 +451,8 @@ TEST_CASE("ParArray state", "[ParArrayND]") { } GIVEN("An array of ParArrays filled with the values contained in their state") { - parthenon::ParArray1D pack("test pack", NS); - auto pack_h = Kokkos::create_mirror_view(pack); + parthenon::ParArray1DRaw pack(parthenon::ViewOfViewAlloc("test pack"), NS); + auto pack_h = create_view_of_view_mirror(pack); for (int b = 0; b < NS; ++b) { state_t state(static_cast(b)); @@ -527,6 +527,8 @@ TEST_CASE("Check registry pressure", "[ParArrayND][performance]") { // view of views. See: // https://github.com/kokkos/kokkos/wiki/View#6232-whats-the-problem-with-a-view-of-views + // TODO(PG) depending on the results of the view of view discussion, we should add + // destructor or ViewOfViewAlloc with SequentialHostInit using view_3d_t = Kokkos::View; using arrays_t = Kokkos::View *, UVMSpace>; @@ -544,7 +546,7 @@ TEST_CASE("Check registry pressure", "[ParArrayND][performance]") { new (&views[n]) view_3d_t(Kokkos::view_alloc(label, Kokkos::WithoutInitializing), N, N, N); auto a_h = arrays(n).GetHostMirror(); - auto v_h = Kokkos::create_mirror_view(views(n)); + auto v_h = parthenon::create_view_of_view_mirror(views(n)); for (int k = 0; k < N; k++) { for (int j = 0; j < N; j++) { for (int i = 0; i < N; i++) { From 2fc423a17543498b25652fa26e7827af29de8aa6 Mon Sep 17 00:00:00 2001 From: Adam C Reyes Date: Mon, 25 Nov 2024 14:24:55 +0100 Subject: [PATCH 2/2] [trivial] remove inline from WriteTaskGraph (#1211) * remove inline from WriteTaskGraph * Update CHANGELOG.md --- CHANGELOG.md | 1 + src/tasks/tasks.cpp | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8336392611a8..3abf18c52538 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,6 +22,7 @@ - [[PR 1172]](https://github.com/parthenon-hpc-lab/parthenon/pull/1172) Make parthenon manager robust against external MPI init and finalize calls ### Fixed (not changing behavior/API/variables/...) +- [[PR 1211]](https://github.com/parthenon-hpc-lab/parthenon/pull/1211) remove inline from WriteTaskGraph - [[PR 1188]](https://github.com/parthenon-hpc-lab/parthenon/pull/1188) Fix hdf5 output issue for metadata none variables, update test. - [[PR 1170]](https://github.com/parthenon-hpc-lab/parthenon/pull/1170) Fixed incorrect initialization of array by a const not constexpr - [[PR 1189]](https://github.com/parthenon-hpc-lab/parthenon/pull/1189) Address CUDA MPI/ICP issue with Kokkos <=4.4.1 diff --git a/src/tasks/tasks.cpp b/src/tasks/tasks.cpp index a231c43bf380..1543b8d24470 100644 --- a/src/tasks/tasks.cpp +++ b/src/tasks/tasks.cpp @@ -70,8 +70,8 @@ bool Task::ready() { return go; } -inline std::ostream &WriteTaskGraph(std::ostream &stream, - const std::vector> &tasks) { +std::ostream &WriteTaskGraph(std::ostream &stream, + const std::vector> &tasks) { #ifndef HAS_CXX_ABI std::cout << "Warning: task graph output will not include function" "signatures since libcxxabi is unavailable.\n";