Consolidate buffer packing functions with less atomics

+ Fold in buffer sizing to the load buffer function + Use a sort and a discontinuity check instead of an atomic_fetch_add inside of LoadBuffer_ to get particle index into buffer + Reduce transcendental functions call in particle sourcing in particle example
parthenon-hpc-lab · Oct 24, 2024 · 433b63c · 433b63c
1 parent 79d5d30
commit 433b63c
Show file tree

Hide file tree

Showing 2 changed files with 114 additions and 47 deletions.
diff --git a/example/particles/particles.cpp b/example/particles/particles.cpp
@@ -288,17 +288,18 @@ TaskStatus CreateSomeParticles(MeshBlock *pmb, const double t0) {
           } while (r > 0.5);
 
           // Randomly sample direction perpendicular to origin
-          Real theta = acos(2. * rng_gen.drand() - 1.);
-          Real phi = 2. * M_PI * rng_gen.drand();
-          v(0, n) = sin(theta) * cos(phi);
-          v(1, n) = sin(theta) * sin(phi);
-          v(2, n) = cos(theta);
+          const Real mu = 2.0 * rng_gen.drand() - 1.0;
+          const Real phi = 2. * M_PI * rng_gen.drand();
+          const Real stheta = std::sqrt(1.0 - mu * mu);
+          v(0, n) = stheta * cos(phi);
+          v(1, n) = stheta * sin(phi);
+          v(2, n) = mu;
           // Project v onto plane normal to sphere
           Real vdN = v(0, n) * x(n) + v(1, n) * y(n) + v(2, n) * z(n);
-          Real NdN = r * r;
-          v(0, n) = v(0, n) - vdN / NdN * x(n);
-          v(1, n) = v(1, n) - vdN / NdN * y(n);
-          v(2, n) = v(2, n) - vdN / NdN * z(n);
+          Real inverse_NdN = r * r;
+          v(0, n) = v(0, n) - vdN * inverse_NdN * x(n);
+          v(1, n) = v(1, n) - vdN * inverse_NdN * y(n);
+          v(2, n) = v(2, n) - vdN * inverse_NdN * z(n);
 
           // Normalize
           Real v_tmp = sqrt(v(0, n) * v(0, n) + v(1, n) * v(1, n) + v(2, n) * v(2, n));
@@ -335,11 +336,12 @@ TaskStatus CreateSomeParticles(MeshBlock *pmb, const double t0) {
           z(n) = minx_k + nx_k * dx_k * rng_gen.drand();
 
           // Randomly sample direction on the unit sphere, fixing speed
-          Real theta = acos(2. * rng_gen.drand() - 1.);
-          Real phi = 2. * M_PI * rng_gen.drand();
-          v(0, n) = vel * sin(theta) * cos(phi);
-          v(1, n) = vel * sin(theta) * sin(phi);
-          v(2, n) = vel * cos(theta);
+          const Real mu = 2.0 * rng_gen.drand() - 1.0;
+          const Real phi = 2. * M_PI * rng_gen.drand();
+          const Real stheta = std::sqrt(1.0 - mu * mu);
+          v(0, n) = vel * stheta * cos(phi);
+          v(1, n) = vel * stheta * sin(phi);
+          v(2, n) = vel * mu;
 
           // Create particles at the beginning of the timestep
           t(n) = t0;

diff --git a/src/interface/swarm_comms.cpp b/src/interface/swarm_comms.cpp
@@ -221,7 +221,11 @@ void Swarm::LoadBuffers_() {
   auto swarm_d = GetDeviceContext();
   auto pmb = GetBlockPointer();
   const int particle_size = GetParticleDataSize();
+  vbswarm->particle_size = particle_size;
   const int nneighbor = pmb->neighbors.size();
+  // Fence to make sure particles aren't currently being transported locally
+  // TODO(BRR) do this operation on device.
+  pmb->exec_space.fence();
 
   auto &int_vector = std::get<getType<int>()>(vectors_);
   auto &real_vector = std::get<getType<Real>()>(vectors_);
@@ -236,42 +240,102 @@ void Swarm::LoadBuffers_() {
   auto &y = Get<Real>(swarm_position::y::name()).Get();
   auto &z = Get<Real>(swarm_position::z::name()).Get();
 
-  // Zero buffer index counters
-  auto &buffer_counters = buffer_counters_;
-  pmb->par_for(
-      PARTHENON_AUTO_LABEL, 0, NMAX_NEIGHBORS - 1,
-      KOKKOS_LAMBDA(const int n) { buffer_counters[n] = 0; });
+  if(max_active_index_ >= 0) {
+    // Make an n particle sized array of index, buffer pairs (with SwarmKey struct)
+    ParArray1D<SwarmKey> buffer_sorted("buffer_sorted", max_active_index_+1);
+    ParArray1D<int> buffer_start("buffer_start", nneighbor);
 
-  auto &bdvar = vbswarm->bd_var_;
-  auto neighbor_buffer_index = neighbor_buffer_index_;
-  // Loop over active particles and use atomic operations to find indices into buffers if
-  // this particle will be sent.
-  pmb->par_for(
-      PARTHENON_AUTO_LABEL, 0, max_active_index_, KOKKOS_LAMBDA(const int n) {
-        if (swarm_d.IsActive(n)) {
-          bool on_current_mesh_block = true;
-          const int m =
-              swarm_d.GetNeighborBlockIndex(n, x(n), y(n), z(n), on_current_mesh_block);
-          const int bufid = neighbor_buffer_index(m);
-
-          if (m >= 0) {
-            const int bid = Kokkos::atomic_fetch_add(&buffer_counters(m), 1);
-            int buffer_index = bid * particle_size;
-            swarm_d.MarkParticleForRemoval(n);
-            for (int i = 0; i < realPackDim; i++) {
-              bdvar.send[bufid](buffer_index) = vreal(i, n);
-              buffer_index++;
-            }
-            for (int i = 0; i < intPackDim; i++) {
-              bdvar.send[bufid](buffer_index) = static_cast<Real>(vint(i, n));
-              buffer_index++;
+    pmb->par_for(
+        PARTHENON_AUTO_LABEL, 0, max_active_index_, KOKKOS_LAMBDA(const int n) {
+          if(swarm_d.IsActive(n)) {
+            bool on_current_mesh_block = true;
+            const int m =
+            swarm_d.GetNeighborBlockIndex(n, x(n), y(n), z(n), on_current_mesh_block);
+            buffer_sorted(n) = SwarmKey(m, n);
+          }
+          else {
+            buffer_sorted(n) = SwarmKey(-1, n);
+          }
+        });
+
+
+    // sort by buffer index
+    sort(buffer_sorted, SwarmKeyComparator(), 0, max_active_index_);
+
+    // use discontinuity check to determine start of a buffer in buffer_sorted array
+    auto &num_particles_to_send = num_particles_to_send_;
+    auto max_active_index = max_active_index_;
+
+    // Zero out number of particles to send before accumulating
+    pmb->par_for(
+        PARTHENON_AUTO_LABEL, 0, NMAX_NEIGHBORS - 1,
+        KOKKOS_LAMBDA(const int n) { num_particles_to_send[n] = 0; });
+
+    pmb->par_for(
+        PARTHENON_AUTO_LABEL, 0, max_active_index_, KOKKOS_LAMBDA(const int n) {
+          auto m = buffer_sorted(n).cell_idx_1d_;
+          // start checks (used for index of particle in buffer)
+          if (m >= 0 && n ==0 ) {
+            buffer_start(m) = 0;
+          }
+          else if (m >= 0 && m != buffer_sorted(n-1).cell_idx_1d_) {
+            buffer_start(m) = n;
+          }
+
+          // end checks (used to to size particle buffers)
+          if (m >= 0 && n == max_active_index ) {
+            num_particles_to_send(m) = n +1;
+          }
+          else if (m >= 0 && m != buffer_sorted(n+1).cell_idx_1d_ ) {
+            num_particles_to_send(m) = n +1;
+          }
+         });
+
+    // copy values back to host for buffer sizing
+    auto num_particles_to_send_h = num_particles_to_send_.GetHostMirrorAndCopy();
+    auto buffer_start_h = buffer_start.GetHostMirrorAndCopy();
+
+    // Resize send buffers if too small
+    for (int n = 0; n < pmb->neighbors.size(); n++) {
+      num_particles_to_send_h(n) -= buffer_start_h(n);
+      const int bufid = pmb->neighbors[n].bufid;
+      auto sendbuf = vbswarm->bd_var_.send[bufid];
+      if (sendbuf.extent(0) < num_particles_to_send_h(n) * particle_size) {
+        sendbuf = BufArray1D<Real>("Buffer", num_particles_to_send_h(n) * particle_size);
+        vbswarm->bd_var_.send[bufid] = sendbuf;
+      }
+      vbswarm->send_size[bufid] = num_particles_to_send_h(n) * particle_size;
+    }
+
+    auto &bdvar = vbswarm->bd_var_;
+    auto neighbor_buffer_index = neighbor_buffer_index_;
+    // Loop over active particles buffer_sorted, use index n and buffer_start to
+    /// get index in buffer m, pack that particle in buffer
+    pmb->par_for(
+        PARTHENON_AUTO_LABEL, 0, max_active_index_, KOKKOS_LAMBDA(const int n) {
+          auto p_index = buffer_sorted(n).swarm_idx_;
+          if (swarm_d.IsActive(p_index)) {
+            const int m = buffer_sorted(n).cell_idx_1d_;
+            const int bufid = neighbor_buffer_index(m);
+            if (m >= 0) {
+              const int bid = n - buffer_start[m];
+              int buffer_index = bid * particle_size;
+              swarm_d.MarkParticleForRemoval(p_index);
+              for (int i = 0; i < realPackDim; i++) {
+                bdvar.send[bufid](buffer_index) = vreal(i, p_index);
+                buffer_index++;
+              }
+              for (int i = 0; i < intPackDim; i++) {
+                bdvar.send[bufid](buffer_index) = static_cast<Real>(vint(i, p_index));
+                buffer_index++;
+              }
             }
           }
-        }
-      });
+        });
 
-  // Remove particles that were loaded to send to another block from this block
-  RemoveMarkedParticles();
+    // Remove particles that were loaded to send to another block from this block
+    RemoveMarkedParticles();
+  }
 }
 
 void Swarm::Send(BoundaryCommSubset phase) {
@@ -280,7 +344,7 @@ void Swarm::Send(BoundaryCommSubset phase) {
   auto swarm_d = GetDeviceContext();
 
   // Query particles for those to be sent
-  CountParticlesToSend_();
+  //CountParticlesToSend_();
 
   // Prepare buffers for send operations
   LoadBuffers_();
@@ -443,3 +507,4 @@ void Swarm::AllocateComms(std::weak_ptr<MeshBlock> wpmb) {
 }
 
 } // namespace parthenon
+