Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix remaining swarm D->H->D copies #1145

Merged
merged 32 commits into from
Aug 28, 2024
Merged
Show file tree
Hide file tree
Changes from 12 commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
60b216c
working
brryan Jul 30, 2024
3867965
Clean up implicit captures
brryan Jul 31, 2024
ed99964
OK something isnt working with updating empty_indices
brryan Jul 31, 2024
f19156b
Creating indices works but defragmentation is broken
brryan Jul 31, 2024
bf06351
Defrag works
brryan Jul 31, 2024
def49d2
Switch to persistent scratch memory
brryan Jul 31, 2024
e9b98c0
Clean up
brryan Jul 31, 2024
4a4110e
Fix GPU issues
brryan Jul 31, 2024
1cf0b8f
Fix compile error by cleaning up code
brryan Jul 31, 2024
b4487b1
Remove unnecessary check against non-null user swarm BCs
brryan Jul 31, 2024
d68e6dd
Remove unused function
brryan Jul 31, 2024
93efdfc
Formatting
brryan Jul 31, 2024
5e5a31a
Fiddle with send logic
brryan Jul 31, 2024
ee95934
Merge branch 'develop' into brryan/more_swarm_prefix_sums
lroberts36 Aug 1, 2024
b2891de
Merge branch 'develop' of github.com:lanl/parthenon into brryan/more_…
brryan Aug 20, 2024
51cc2f1
Merge branch 'brryan/more_swarm_prefix_sums' of github.com:lanl/parth…
brryan Aug 20, 2024
2a6a8ed
Perform swarm boundary logic on device (#1154)
brryan Aug 20, 2024
7326394
Oops ParArray1D isn't a host array when compiled for device
brryan Aug 20, 2024
36c6bc0
implicit this->
brryan Aug 20, 2024
753ca05
bug in nrecvd particles with 1 particle received...
brryan Aug 20, 2024
ed10e33
Found the bug
brryan Aug 21, 2024
85d7bd5
Fixed bug, cleaned up
brryan Aug 21, 2024
f17c013
kokkos parallel_scan -> par_scan
brryan Aug 21, 2024
fb536d1
Merge branch 'develop' into brryan/more_swarm_prefix_sums
brryan Aug 22, 2024
e4092f6
Merge branch 'develop' into brryan/more_swarm_prefix_sums
brryan Aug 23, 2024
87192a7
Fix types, clean up code
brryan Aug 26, 2024
5d93cac
Merge branch 'brryan/more_swarm_prefix_sums' of github.com:lanl/parth…
brryan Aug 26, 2024
fab017d
Merge branch 'develop' into brryan/more_swarm_prefix_sums
brryan Aug 26, 2024
fab834b
Merge branch 'develop' into brryan/more_swarm_prefix_sums
brryan Aug 26, 2024
a205b76
Add warning if using swarms but Real != double
brryan Aug 27, 2024
27879e9
Merge branch 'brryan/more_swarm_prefix_sums' of github.com:lanl/parth…
brryan Aug 27, 2024
5c46e5a
Cleanup from debugging
brryan Aug 27, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
- [[PR 1004]](https://github.com/parthenon-hpc-lab/parthenon/pull/1004) Allow parameter modification from an input file for restarts

### Fixed (not changing behavior/API/variables/...)
- [[PR 1145]](https://github.com/parthenon-hpc-lab/parthenon/pull/1145) Fix remaining swarm D->H->D copies
- [[PR 1132]](https://github.com/parthenon-hpc-lab/parthenon/pull/1132) Fix regional dependencies for iterative task lists and make solvers work for arbitrary MeshData partitioning
- [[PR 1139]](https://github.com/parthenon-hpc-lab/parthenon/pull/1139) only add --expt-relaxed-constexpr for COMPILE_LANGUAGE:CXX
- [[PR 1131]](https://github.com/parthenon-hpc-lab/parthenon/pull/1131) Make deallocation of fine and sparse fields work
Expand Down
249 changes: 136 additions & 113 deletions src/interface/swarm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -65,38 +65,31 @@ SwarmDeviceContext Swarm::GetDeviceContext() const {

Swarm::Swarm(const std::string &label, const Metadata &metadata, const int nmax_pool_in)
: label_(label), m_(metadata), nmax_pool_(nmax_pool_in), mask_("mask", nmax_pool_),
marked_for_removal_("mfr", nmax_pool_), block_index_("block_index_", nmax_pool_),
marked_for_removal_("mfr", nmax_pool_),
empty_indices_("empty_indices_", nmax_pool_),
block_index_("block_index_", nmax_pool_),
neighbor_indices_("neighbor_indices_", 4, 4, 4),
new_indices_("new_indices_", nmax_pool_),
from_to_indices_("from_to_indices_", nmax_pool_ + 1),
recv_neighbor_index_("recv_neighbor_index_", nmax_pool_),
recv_buffer_index_("recv_buffer_index_", nmax_pool_),
scratch_a_("scratch_a_", nmax_pool_), scratch_b_("scratch_b_", nmax_pool_),
num_particles_to_send_("num_particles_to_send_", NMAX_NEIGHBORS),
cell_sorted_("cell_sorted_", nmax_pool_), mpiStatus(true) {
PARTHENON_REQUIRE_THROWS(typeid(Coordinates_t) == typeid(UniformCartesian),
"SwarmDeviceContext only supports a uniform Cartesian mesh!");

uid_ = get_uid_(label_);

// Add default swarm fields
Add(swarm_position::x::name(), Metadata({Metadata::Real}));
Add(swarm_position::y::name(), Metadata({Metadata::Real}));
Add(swarm_position::z::name(), Metadata({Metadata::Real}));

// Initialize index metadata
num_active_ = 0;
max_active_index_ = inactive_max_active_index;

// TODO(BRR) Do this in a device kernel?
auto mask_h = Kokkos::create_mirror_view(HostMemSpace(), mask_);
auto marked_for_removal_h =
Kokkos::create_mirror_view(HostMemSpace(), marked_for_removal_);

for (int n = 0; n < nmax_pool_; n++) {
mask_h(n) = false;
marked_for_removal_h(n) = false;
free_indices_.push_back(n);
}

Kokkos::deep_copy(mask_, mask_h);
Kokkos::deep_copy(marked_for_removal_, marked_for_removal_h);
UpdateEmptyIndices();
}

void Swarm::Add(const std::vector<std::string> &label_array, const Metadata &metadata) {
Expand Down Expand Up @@ -196,25 +189,24 @@ void Swarm::Remove(const std::string &label) {
}
}

void Swarm::setPoolMax(const std::int64_t nmax_pool) {
void Swarm::SetPoolMax(const std::int64_t nmax_pool) {
PARTHENON_REQUIRE(nmax_pool > nmax_pool_, "Must request larger pool size!");
std::int64_t n_new_begin = nmax_pool_;
std::int64_t n_new = nmax_pool - nmax_pool_;

auto pmb = GetBlockPointer();
auto pm = pmb->pmy_mesh;

for (std::int64_t n = 0; n < n_new; n++) {
free_indices_.push_back(n + n_new_begin);
}

// Rely on Kokkos setting the newly added values to false for these arrays
Kokkos::resize(mask_, nmax_pool);
Kokkos::resize(marked_for_removal_, nmax_pool);
Kokkos::resize(empty_indices_, nmax_pool);
Kokkos::resize(new_indices_, nmax_pool);
Kokkos::resize(from_to_indices_, nmax_pool + 1);
Kokkos::resize(recv_neighbor_index_, nmax_pool);
Kokkos::resize(recv_buffer_index_, nmax_pool);
Kokkos::resize(scratch_a_, nmax_pool);
Kokkos::resize(scratch_b_, nmax_pool);

pmb->LogMemUsage(2 * n_new * sizeof(bool));

Kokkos::resize(cell_sorted_, nmax_pool);
Expand All @@ -240,7 +232,10 @@ void Swarm::setPoolMax(const std::int64_t nmax_pool) {

nmax_pool_ = nmax_pool;

// Eliminate any cached SwarmPacks, as they will need to be rebuilt following setPoolMax
// Populate new empty indices
UpdateEmptyIndices();

// Eliminate any cached SwarmPacks, as they will need to be rebuilt following SetPoolMax
pmb->meshblock_data.Get()->ClearSwarmCaches();
pm->mesh_data.Get("base")->ClearSwarmCaches();
for (auto &partition : pm->GetDefaultBlockPartitions()) {
Expand All @@ -251,129 +246,153 @@ void Swarm::setPoolMax(const std::int64_t nmax_pool) {
NewParticlesContext Swarm::AddEmptyParticles(const int num_to_add) {
PARTHENON_DEBUG_REQUIRE(num_to_add >= 0, "Cannot add negative numbers of particles!");

auto pmb = GetBlockPointer();

if (num_to_add > 0) {
while (free_indices_.size() < num_to_add) {
increasePoolMax();
while (nmax_pool_ - num_active_ < num_to_add) {
IncreasePoolMax();
}

// TODO(BRR) Use par_scan on device rather than do this on host
auto mask_h = Kokkos::create_mirror_view_and_copy(HostMemSpace(), mask_);

auto block_index_h = block_index_.GetHostMirrorAndCopy();
auto &new_indices = new_indices_;
auto &empty_indices = empty_indices_;
auto &mask = mask_;

auto free_index = free_indices_.begin();
int max_new_active_index = 0;
parthenon::par_reduce(
PARTHENON_AUTO_LABEL, 0, num_to_add - 1,
KOKKOS_LAMBDA(const int n, int &max_ind) {
new_indices(n) = empty_indices(n);
mask(new_indices(n)) = true;

auto new_indices_h = new_indices_.GetHostMirror();

// Don't bother sanitizing the memory
for (int n = 0; n < num_to_add; n++) {
mask_h(*free_index) = true;
block_index_h(*free_index) = this_block_;
max_active_index_ = std::max<int>(max_active_index_, *free_index);
new_indices_h(n) = *free_index;

free_index = free_indices_.erase(free_index);
}
// Record vote for max active index
max_ind = new_indices(n);
},
Kokkos::Max<int>(max_new_active_index));

new_indices_.DeepCopy(new_indices_h);
// Update max active index if necessary
max_active_index_ = std::max(max_active_index_, max_new_active_index);

new_indices_max_idx_ = num_to_add - 1;
num_active_ += num_to_add;

Kokkos::deep_copy(mask_, mask_h);
block_index_.DeepCopy(block_index_h);
new_indices_max_idx_ = num_to_add - 1;
UpdateEmptyIndices();
} else {
new_indices_max_idx_ = -1;
}

// Create and return NewParticlesContext
return NewParticlesContext(new_indices_max_idx_, new_indices_);
}

// Updates the empty_indices_ array so the first N elements contain an ascending list of
// indices into empty elements of the swarm pool, where N is the number of empty indices.
//
// A pool slot n counts as empty when mask_(n) is false. Elements of empty_indices_ at
// positions >= N are not written by this function and keep whatever they held before.
// scratch_a_ is used as working storage and is overwritten for every slot in the pool.
void Swarm::UpdateEmptyIndices() {
// Local references so the device lambdas below capture the arrays rather than `this`
// (presumably the reason for the aliases -- TODO confirm)
auto &mask = mask_;
auto &empty_indices = empty_indices_;

// Associate scratch memory
auto &empty_indices_scan = scratch_a_;

// Calculate prefix sum of empty indices
// After this scan, empty_indices_scan(n) holds the number of empty slots in [0, n]
// (inclusive of n, because `update` is incremented before it is stored).
Kokkos::parallel_scan(
// NOTE(review): the next two lines are review-page residue from the diff view, not code
brryan marked this conversation as resolved.
Show resolved Hide resolved
"Set empty indices prefix sum", nmax_pool_,
KOKKOS_LAMBDA(const int n, int &update, const bool &final) {
// 1 if slot n is empty, 0 if it holds a particle
const int val = !mask(n);
if (val) {
update += 1;
}

// Only store the running count when the scan flags this as the final pass
if (final) {
empty_indices_scan(n) = update;
}
});

// Update list of empty indices such that it is contiguous and in ascending order
// Each empty slot n writes itself at position (inclusive count - 1), i.e. its zero-based
// rank among the empty slots; occupied slots write nothing.
parthenon::par_for(
PARTHENON_AUTO_LABEL, 0, nmax_pool_ - 1, KOKKOS_LAMBDA(const int n) {
if (!mask(n)) {
empty_indices(empty_indices_scan(n) - 1) = n;
}
});
}

// No active particles: max_active_index_ = inactive_max_active_index (= -1)
// No particles removed: max_active_index_ unchanged
// Particles removed: max_active_index_ is new max active index
void Swarm::RemoveMarkedParticles() {
// TODO(BRR) Use par_scan to do this on device rather than host
auto mask_h = Kokkos::create_mirror_view_and_copy(HostMemSpace(), mask_);
auto marked_for_removal_h =
Kokkos::create_mirror_view_and_copy(HostMemSpace(), marked_for_removal_);

// loop backwards to keep free_indices_ updated correctly
for (int n = max_active_index_; n >= 0; n--) {
if (mask_h(n)) {
if (marked_for_removal_h(n)) {
mask_h(n) = false;
free_indices_.push_front(n);
num_active_ -= 1;
if (n == max_active_index_) {
max_active_index_ -= 1;
int &max_active_index = max_active_index_;

auto &mask = mask_;
auto &marked_for_removal = marked_for_removal_;

// Update mask, count number of removed particles
int num_removed = 0;
parthenon::par_reduce(
PARTHENON_AUTO_LABEL, 0, max_active_index,
brryan marked this conversation as resolved.
Show resolved Hide resolved
KOKKOS_LAMBDA(const int n, int &removed) {
if (mask(n)) {
if (marked_for_removal(n)) {
mask(n) = false;
marked_for_removal(n) = false;
removed += 1;
}
}
marked_for_removal_h(n) = false;
}
}
}
},
Kokkos::Sum<int>(num_removed));

Kokkos::deep_copy(mask_, mask_h);
Kokkos::deep_copy(marked_for_removal_, marked_for_removal_h);
num_active_ -= num_removed;

UpdateEmptyIndices();
}

void Swarm::Defrag() {
if (GetNumActive() == 0) {
return;
}
// TODO(BRR) Could this algorithm be more efficient? Does it matter?
// Add 1 to convert max index to max number
std::int64_t num_free = (max_active_index_ + 1) - num_active_;
auto pmb = GetBlockPointer();

auto from_to_indices_h = from_to_indices_.GetHostMirror();
// Associate scratch memory
auto &scan_scratch_toread = scratch_a_;
auto &map = scratch_b_;

auto mask_h = Kokkos::create_mirror_view_and_copy(HostMemSpace(), mask_);

for (int n = 0; n <= max_active_index_; n++) {
from_to_indices_h(n) = unset_index_;
}

std::list<int> new_free_indices;

free_indices_.sort();

int index = max_active_index_;
int num_to_move = std::min<int>(num_free, num_active_);
for (int n = 0; n < num_to_move; n++) {
while (mask_h(index) == false) {
index--;
}
int index_to_move_from = index;
index--;

// Below this number "moved" particles should actually stay in place
if (index_to_move_from < num_active_) {
break;
}
int index_to_move_to = free_indices_.front();
free_indices_.pop_front();
new_free_indices.push_back(index_to_move_from);
from_to_indices_h(index_to_move_from) = index_to_move_to;
}
auto &mask = mask_;

// TODO(BRR) Not all these sorts may be necessary
new_free_indices.sort();
free_indices_.merge(new_free_indices);
const int &num_active = num_active_;
Kokkos::parallel_scan(
"Set empty indices prefix sum", nmax_pool_ - num_active_,
KOKKOS_LAMBDA(const int nn, int &update, const bool &final) {
const int n = nn + num_active;
const int val = mask(n);
if (val) {
update += 1;
}
if (final) scan_scratch_toread(n) = update;
});

from_to_indices_.DeepCopy(from_to_indices_h);
parthenon::par_for(
PARTHENON_AUTO_LABEL, 0, nmax_pool_ - 1, KOKKOS_LAMBDA(const int n) {
if (n >= num_active) {
if (mask(n)) {
map(scan_scratch_toread(n) - 1) = n;
}
mask(n) = false;
}
});

auto from_to_indices = from_to_indices_;
// Reuse scratch memory
auto &scan_scratch_towrite = scan_scratch_toread;

auto &mask = mask_;
pmb->par_for(
PARTHENON_AUTO_LABEL, 0, max_active_index_, KOKKOS_LAMBDA(const int n) {
if (from_to_indices(n) >= 0) {
mask(from_to_indices(n)) = mask(n);
mask(n) = false;
// Update list of empty indices
Kokkos::parallel_scan(
"Set empty indices prefix sum", num_active_,
KOKKOS_LAMBDA(const int n, int &update, const bool &final) {
const int val = !mask(n);
if (val) {
update += 1;
}
if (final) scan_scratch_towrite(n) = update;
});

// Get all dynamical variables in swarm
auto &int_vector = std::get<getType<int>()>(vectors_);
auto &real_vector = std::get<getType<Real>()>(vectors_);
PackIndexMap real_imap;
Expand All @@ -387,15 +406,19 @@ void Swarm::Defrag() {
const int realPackDim = vreal.GetDim(2);
const int intPackDim = vint.GetDim(2);

pmb->par_for(
PARTHENON_AUTO_LABEL, 0, max_active_index_, KOKKOS_LAMBDA(const int n) {
if (from_to_indices(n) >= 0) {
// Loop over only the active number of particles, and if mask is empty, copy in particle
// using address from prefix sum
parthenon::par_for(
PARTHENON_AUTO_LABEL, 0, num_active_ - 1, KOKKOS_LAMBDA(const int n) {
if (!mask(n)) {
const int nread = map(scan_scratch_towrite(n) - 1);
for (int vidx = 0; vidx < realPackDim; vidx++) {
vreal(vidx, from_to_indices(n)) = vreal(vidx, n);
vreal(vidx, n) = vreal(vidx, nread);
}
for (int vidx = 0; vidx < intPackDim; vidx++) {
vint(vidx, from_to_indices(n)) = vint(vidx, n);
vint(vidx, n) = vint(vidx, nread);
}
mask(n) = true;
}
});

Expand Down
Loading
Loading