From 433b63cd84173ebd073cb7e568e0c63b1bf78c51 Mon Sep 17 00:00:00 2001 From: "Alex R. Long" Date: Thu, 24 Oct 2024 12:26:33 -0600 Subject: [PATCH] Consolidate buffer packing functions with less atomics + Fold in buffer sizing to the load buffer function + Use a sort and a discontinuity check instead of an atomic_fetch_add inside of LoadBuffer_ to get particle index into buffer + Reduce transcendental functions call in particle sourcing in particle example --- example/particles/particles.cpp | 30 ++++---- src/interface/swarm_comms.cpp | 131 ++++++++++++++++++++++++-------- 2 files changed, 114 insertions(+), 47 deletions(-) diff --git a/example/particles/particles.cpp b/example/particles/particles.cpp index a01a121f75fa..e9e0c1a30363 100644 --- a/example/particles/particles.cpp +++ b/example/particles/particles.cpp @@ -288,17 +288,18 @@ TaskStatus CreateSomeParticles(MeshBlock *pmb, const double t0) { } while (r > 0.5); // Randomly sample direction perpendicular to origin - Real theta = acos(2. * rng_gen.drand() - 1.); - Real phi = 2. * M_PI * rng_gen.drand(); - v(0, n) = sin(theta) * cos(phi); - v(1, n) = sin(theta) * sin(phi); - v(2, n) = cos(theta); + const Real mu = 2.0 * rng_gen.drand() - 1.0; + const Real phi = 2. * M_PI * rng_gen.drand(); + const Real stheta = std::sqrt(1.0 - mu * mu); + v(0, n) = stheta * cos(phi); + v(1, n) = stheta * sin(phi); + v(2, n) = mu; // Project v onto plane normal to sphere Real vdN = v(0, n) * x(n) + v(1, n) * y(n) + v(2, n) * z(n); - Real NdN = r * r; - v(0, n) = v(0, n) - vdN / NdN * x(n); - v(1, n) = v(1, n) - vdN / NdN * y(n); - v(2, n) = v(2, n) - vdN / NdN * z(n); + Real inverse_NdN = r * r; + v(0, n) = v(0, n) - vdN * inverse_NdN * x(n); + v(1, n) = v(1, n) - vdN * inverse_NdN * y(n); + v(2, n) = v(2, n) - vdN * inverse_NdN * z(n); // Normalize Real v_tmp = sqrt(v(0, n) * v(0, n) + v(1, n) * v(1, n) + v(2, n) * v(2, n)); @@ -335,11 +336,12 @@ TaskStatus CreateSomeParticles(MeshBlock *pmb, const double t0) { z(n) = minx_k + nx_k * dx_k * rng_gen.drand(); // Randomly sample direction on the unit sphere, fixing speed - Real theta = acos(2. * rng_gen.drand() - 1.); - Real phi = 2. * M_PI * rng_gen.drand(); - v(0, n) = vel * sin(theta) * cos(phi); - v(1, n) = vel * sin(theta) * sin(phi); - v(2, n) = vel * cos(theta); + const Real mu = 2.0 * rng_gen.drand() - 1.0; + const Real phi = 2. * M_PI * rng_gen.drand(); + const Real stheta = std::sqrt(1.0 - mu * mu); + v(0, n) = vel * stheta * cos(phi); + v(1, n) = vel * stheta * sin(phi); + v(2, n) = vel * mu; // Create particles at the beginning of the timestep t(n) = t0; diff --git a/src/interface/swarm_comms.cpp b/src/interface/swarm_comms.cpp index 3ee4e2af7512..107f0e2f1ccf 100644 --- a/src/interface/swarm_comms.cpp +++ b/src/interface/swarm_comms.cpp @@ -221,7 +221,11 @@ void Swarm::LoadBuffers_() { auto swarm_d = GetDeviceContext(); auto pmb = GetBlockPointer(); const int particle_size = GetParticleDataSize(); + vbswarm->particle_size = particle_size; const int nneighbor = pmb->neighbors.size(); + // Fence to make sure particles aren't currently being transported locally + // TODO(BRR) do this operation on device. + pmb->exec_space.fence(); auto &int_vector = std::get()>(vectors_); auto &real_vector = std::get()>(vectors_); @@ -236,42 +240,102 @@ void Swarm::LoadBuffers_() { auto &y = Get(swarm_position::y::name()).Get(); auto &z = Get(swarm_position::z::name()).Get(); - // Zero buffer index counters - auto &buffer_counters = buffer_counters_; - pmb->par_for( - PARTHENON_AUTO_LABEL, 0, NMAX_NEIGHBORS - 1, - KOKKOS_LAMBDA(const int n) { buffer_counters[n] = 0; }); + if(max_active_index_ >= 0) { + // Make an n particle sized array of index, buffer pairs (with SwarmKey struct) + ParArray1D buffer_sorted("buffer_sorted", max_active_index_+1); + ParArray1D buffer_start("buffer_start", nneighbor); - auto &bdvar = vbswarm->bd_var_; - auto neighbor_buffer_index = neighbor_buffer_index_; - // Loop over active particles and use atomic operations to find indices into buffers if - // this particle will be sent. - pmb->par_for( - PARTHENON_AUTO_LABEL, 0, max_active_index_, KOKKOS_LAMBDA(const int n) { - if (swarm_d.IsActive(n)) { - bool on_current_mesh_block = true; - const int m = - swarm_d.GetNeighborBlockIndex(n, x(n), y(n), z(n), on_current_mesh_block); - const int bufid = neighbor_buffer_index(m); - - if (m >= 0) { - const int bid = Kokkos::atomic_fetch_add(&buffer_counters(m), 1); - int buffer_index = bid * particle_size; - swarm_d.MarkParticleForRemoval(n); - for (int i = 0; i < realPackDim; i++) { - bdvar.send[bufid](buffer_index) = vreal(i, n); - buffer_index++; - } - for (int i = 0; i < intPackDim; i++) { - bdvar.send[bufid](buffer_index) = static_cast(vint(i, n)); - buffer_index++; + pmb->par_for( + PARTHENON_AUTO_LABEL, 0, max_active_index_, KOKKOS_LAMBDA(const int n) { + if(swarm_d.IsActive(n)) { + bool on_current_mesh_block = true; + const int m = + swarm_d.GetNeighborBlockIndex(n, x(n), y(n), z(n), on_current_mesh_block); + buffer_sorted(n) = SwarmKey(m, n); + } + else { + buffer_sorted(n) = SwarmKey(-1, n); + } + }); + + + // sort by buffer index + sort(buffer_sorted, SwarmKeyComparator(), 0, max_active_index_); + + // use discontinuity check to determine start of a buffer in buffer_sorted array + auto &num_particles_to_send = num_particles_to_send_; + auto max_active_index = max_active_index_; + + // Zero out number of particles to send before accumulating + pmb->par_for( + PARTHENON_AUTO_LABEL, 0, NMAX_NEIGHBORS - 1, + KOKKOS_LAMBDA(const int n) { num_particles_to_send[n] = 0; }); + + pmb->par_for( + PARTHENON_AUTO_LABEL, 0, max_active_index_, KOKKOS_LAMBDA(const int n) { + auto m = buffer_sorted(n).cell_idx_1d_; + // start checks (used for index of particle in buffer) + if (m >= 0 && n ==0 ) { + buffer_start(m) = 0; + } + else if (m >= 0 && m != buffer_sorted(n-1).cell_idx_1d_) { + buffer_start(m) = n; + } + + // end checks (used to to size particle buffers) + if (m >= 0 && n == max_active_index ) { + num_particles_to_send(m) = n +1; + } + else if (m >= 0 && m != buffer_sorted(n+1).cell_idx_1d_ ) { + num_particles_to_send(m) = n +1; + } + }); + + // copy values back to host for buffer sizing + auto num_particles_to_send_h = num_particles_to_send_.GetHostMirrorAndCopy(); + auto buffer_start_h = buffer_start.GetHostMirrorAndCopy(); + + // Resize send buffers if too small + for (int n = 0; n < pmb->neighbors.size(); n++) { + num_particles_to_send_h(n) -= buffer_start_h(n); + const int bufid = pmb->neighbors[n].bufid; + auto sendbuf = vbswarm->bd_var_.send[bufid]; + if (sendbuf.extent(0) < num_particles_to_send_h(n) * particle_size) { + sendbuf = BufArray1D("Buffer", num_particles_to_send_h(n) * particle_size); + vbswarm->bd_var_.send[bufid] = sendbuf; + } + vbswarm->send_size[bufid] = num_particles_to_send_h(n) * particle_size; + } + + auto &bdvar = vbswarm->bd_var_; + auto neighbor_buffer_index = neighbor_buffer_index_; + // Loop over active particles buffer_sorted, use index n and buffer_start to + /// get index in buffer m, pack that particle in buffer + pmb->par_for( + PARTHENON_AUTO_LABEL, 0, max_active_index_, KOKKOS_LAMBDA(const int n) { + auto p_index = buffer_sorted(n).swarm_idx_; + if (swarm_d.IsActive(p_index)) { + const int m = buffer_sorted(n).cell_idx_1d_; + const int bufid = neighbor_buffer_index(m); + if (m >= 0) { + const int bid = n - buffer_start[m]; + int buffer_index = bid * particle_size; + swarm_d.MarkParticleForRemoval(p_index); + for (int i = 0; i < realPackDim; i++) { + bdvar.send[bufid](buffer_index) = vreal(i, p_index); + buffer_index++; + } + for (int i = 0; i < intPackDim; i++) { + bdvar.send[bufid](buffer_index) = static_cast(vint(i, p_index)); + buffer_index++; + } } } - } - }); + }); - // Remove particles that were loaded to send to another block from this block - RemoveMarkedParticles(); + // Remove particles that were loaded to send to another block from this block + RemoveMarkedParticles(); + } } void Swarm::Send(BoundaryCommSubset phase) { @@ -280,7 +344,7 @@ void Swarm::Send(BoundaryCommSubset phase) { auto swarm_d = GetDeviceContext(); // Query particles for those to be sent - CountParticlesToSend_(); + //CountParticlesToSend_(); // Prepare buffers for send operations LoadBuffers_(); @@ -443,3 +507,4 @@ void Swarm::AllocateComms(std::weak_ptr wpmb) { } } // namespace parthenon +