Skip to content

Commit

Permalink
Consolidate buffer packing functions with less atomics
Browse files Browse the repository at this point in the history
+ Fold in buffer sizing to the load buffer function
+ Use a sort and a discontinuity check instead of an
atomic_fetch_add inside of LoadBuffer_ to get particle
index into buffer
+ Reduce transcendental functions call in particle sourcing in
particle example
  • Loading branch information
alexrlongne committed Oct 24, 2024
1 parent 79d5d30 commit 433b63c
Show file tree
Hide file tree
Showing 2 changed files with 114 additions and 47 deletions.
30 changes: 16 additions & 14 deletions example/particles/particles.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -288,17 +288,18 @@ TaskStatus CreateSomeParticles(MeshBlock *pmb, const double t0) {
} while (r > 0.5);

// Randomly sample direction perpendicular to origin
Real theta = acos(2. * rng_gen.drand() - 1.);
Real phi = 2. * M_PI * rng_gen.drand();
v(0, n) = sin(theta) * cos(phi);
v(1, n) = sin(theta) * sin(phi);
v(2, n) = cos(theta);
const Real mu = 2.0 * rng_gen.drand() - 1.0;
const Real phi = 2. * M_PI * rng_gen.drand();
const Real stheta = std::sqrt(1.0 - mu * mu);
v(0, n) = stheta * cos(phi);
v(1, n) = stheta * sin(phi);
v(2, n) = mu;
// Project v onto plane normal to sphere
Real vdN = v(0, n) * x(n) + v(1, n) * y(n) + v(2, n) * z(n);
Real NdN = r * r;
v(0, n) = v(0, n) - vdN / NdN * x(n);
v(1, n) = v(1, n) - vdN / NdN * y(n);
v(2, n) = v(2, n) - vdN / NdN * z(n);
Real inverse_NdN = r * r;
v(0, n) = v(0, n) - vdN * inverse_NdN * x(n);
v(1, n) = v(1, n) - vdN * inverse_NdN * y(n);
v(2, n) = v(2, n) - vdN * inverse_NdN * z(n);

// Normalize
Real v_tmp = sqrt(v(0, n) * v(0, n) + v(1, n) * v(1, n) + v(2, n) * v(2, n));
Expand Down Expand Up @@ -335,11 +336,12 @@ TaskStatus CreateSomeParticles(MeshBlock *pmb, const double t0) {
z(n) = minx_k + nx_k * dx_k * rng_gen.drand();

// Randomly sample direction on the unit sphere, fixing speed
Real theta = acos(2. * rng_gen.drand() - 1.);
Real phi = 2. * M_PI * rng_gen.drand();
v(0, n) = vel * sin(theta) * cos(phi);
v(1, n) = vel * sin(theta) * sin(phi);
v(2, n) = vel * cos(theta);
const Real mu = 2.0 * rng_gen.drand() - 1.0;
const Real phi = 2. * M_PI * rng_gen.drand();
const Real stheta = std::sqrt(1.0 - mu * mu);
v(0, n) = vel * stheta * cos(phi);
v(1, n) = vel * stheta * sin(phi);
v(2, n) = vel * mu;

// Create particles at the beginning of the timestep
t(n) = t0;
Expand Down
131 changes: 98 additions & 33 deletions src/interface/swarm_comms.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -221,7 +221,11 @@ void Swarm::LoadBuffers_() {
auto swarm_d = GetDeviceContext();
auto pmb = GetBlockPointer();
const int particle_size = GetParticleDataSize();
vbswarm->particle_size = particle_size;
const int nneighbor = pmb->neighbors.size();
// Fence to make sure particles aren't currently being transported locally
// TODO(BRR) do this operation on device.
pmb->exec_space.fence();

auto &int_vector = std::get<getType<int>()>(vectors_);
auto &real_vector = std::get<getType<Real>()>(vectors_);
Expand All @@ -236,42 +240,102 @@ void Swarm::LoadBuffers_() {
auto &y = Get<Real>(swarm_position::y::name()).Get();
auto &z = Get<Real>(swarm_position::z::name()).Get();

// Zero buffer index counters
auto &buffer_counters = buffer_counters_;
pmb->par_for(
PARTHENON_AUTO_LABEL, 0, NMAX_NEIGHBORS - 1,
KOKKOS_LAMBDA(const int n) { buffer_counters[n] = 0; });
if(max_active_index_ >= 0) {
// Make an n particle sized array of index, buffer pairs (with SwarmKey struct)
ParArray1D<SwarmKey> buffer_sorted("buffer_sorted", max_active_index_+1);
ParArray1D<int> buffer_start("buffer_start", nneighbor);

auto &bdvar = vbswarm->bd_var_;
auto neighbor_buffer_index = neighbor_buffer_index_;
// Loop over active particles and use atomic operations to find indices into buffers if
// this particle will be sent.
pmb->par_for(
PARTHENON_AUTO_LABEL, 0, max_active_index_, KOKKOS_LAMBDA(const int n) {
if (swarm_d.IsActive(n)) {
bool on_current_mesh_block = true;
const int m =
swarm_d.GetNeighborBlockIndex(n, x(n), y(n), z(n), on_current_mesh_block);
const int bufid = neighbor_buffer_index(m);

if (m >= 0) {
const int bid = Kokkos::atomic_fetch_add(&buffer_counters(m), 1);
int buffer_index = bid * particle_size;
swarm_d.MarkParticleForRemoval(n);
for (int i = 0; i < realPackDim; i++) {
bdvar.send[bufid](buffer_index) = vreal(i, n);
buffer_index++;
}
for (int i = 0; i < intPackDim; i++) {
bdvar.send[bufid](buffer_index) = static_cast<Real>(vint(i, n));
buffer_index++;
pmb->par_for(
PARTHENON_AUTO_LABEL, 0, max_active_index_, KOKKOS_LAMBDA(const int n) {
if(swarm_d.IsActive(n)) {
bool on_current_mesh_block = true;
const int m =
swarm_d.GetNeighborBlockIndex(n, x(n), y(n), z(n), on_current_mesh_block);
buffer_sorted(n) = SwarmKey(m, n);
}
else {
buffer_sorted(n) = SwarmKey(-1, n);
}
});


// sort by buffer index
sort(buffer_sorted, SwarmKeyComparator(), 0, max_active_index_);

// use discontinuity check to determine start of a buffer in buffer_sorted array
auto &num_particles_to_send = num_particles_to_send_;
auto max_active_index = max_active_index_;

// Zero out number of particles to send before accumulating
pmb->par_for(
PARTHENON_AUTO_LABEL, 0, NMAX_NEIGHBORS - 1,
KOKKOS_LAMBDA(const int n) { num_particles_to_send[n] = 0; });

pmb->par_for(
PARTHENON_AUTO_LABEL, 0, max_active_index_, KOKKOS_LAMBDA(const int n) {
auto m = buffer_sorted(n).cell_idx_1d_;
// start checks (used for index of particle in buffer)
if (m >= 0 && n ==0 ) {
buffer_start(m) = 0;
}
else if (m >= 0 && m != buffer_sorted(n-1).cell_idx_1d_) {
buffer_start(m) = n;
}

// end checks (used to to size particle buffers)
if (m >= 0 && n == max_active_index ) {
num_particles_to_send(m) = n +1;
}
else if (m >= 0 && m != buffer_sorted(n+1).cell_idx_1d_ ) {
num_particles_to_send(m) = n +1;
}
});

// copy values back to host for buffer sizing
auto num_particles_to_send_h = num_particles_to_send_.GetHostMirrorAndCopy();
auto buffer_start_h = buffer_start.GetHostMirrorAndCopy();

// Resize send buffers if too small
for (int n = 0; n < pmb->neighbors.size(); n++) {
num_particles_to_send_h(n) -= buffer_start_h(n);
const int bufid = pmb->neighbors[n].bufid;
auto sendbuf = vbswarm->bd_var_.send[bufid];
if (sendbuf.extent(0) < num_particles_to_send_h(n) * particle_size) {
sendbuf = BufArray1D<Real>("Buffer", num_particles_to_send_h(n) * particle_size);
vbswarm->bd_var_.send[bufid] = sendbuf;
}
vbswarm->send_size[bufid] = num_particles_to_send_h(n) * particle_size;
}

auto &bdvar = vbswarm->bd_var_;
auto neighbor_buffer_index = neighbor_buffer_index_;
// Loop over active particles buffer_sorted, use index n and buffer_start to
/// get index in buffer m, pack that particle in buffer
pmb->par_for(
PARTHENON_AUTO_LABEL, 0, max_active_index_, KOKKOS_LAMBDA(const int n) {
auto p_index = buffer_sorted(n).swarm_idx_;
if (swarm_d.IsActive(p_index)) {
const int m = buffer_sorted(n).cell_idx_1d_;
const int bufid = neighbor_buffer_index(m);
if (m >= 0) {
const int bid = n - buffer_start[m];
int buffer_index = bid * particle_size;
swarm_d.MarkParticleForRemoval(p_index);
for (int i = 0; i < realPackDim; i++) {
bdvar.send[bufid](buffer_index) = vreal(i, p_index);
buffer_index++;
}
for (int i = 0; i < intPackDim; i++) {
bdvar.send[bufid](buffer_index) = static_cast<Real>(vint(i, p_index));
buffer_index++;
}
}
}
}
});
});

// Remove particles that were loaded to send to another block from this block
RemoveMarkedParticles();
// Remove particles that were loaded to send to another block from this block
RemoveMarkedParticles();
}
}

void Swarm::Send(BoundaryCommSubset phase) {
Expand All @@ -280,7 +344,7 @@ void Swarm::Send(BoundaryCommSubset phase) {
auto swarm_d = GetDeviceContext();

// Query particles for those to be sent
CountParticlesToSend_();
//CountParticlesToSend_();

// Prepare buffers for send operations
LoadBuffers_();
Expand Down Expand Up @@ -443,3 +507,4 @@ void Swarm::AllocateComms(std::weak_ptr<MeshBlock> wpmb) {
}

} // namespace parthenon

0 comments on commit 433b63c

Please sign in to comment.