diff --git a/CHANGELOG.md b/CHANGELOG.md
index 4f474633ce8e..af70ab73376b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -40,6 +40,7 @@
 - [[PR 1004]](https://github.com/parthenon-hpc-lab/parthenon/pull/1004) Allow parameter modification from an input file for restarts
 
 ### Fixed (not changing behavior/API/variables/...)
+- [[PR 1145]](https://github.com/parthenon-hpc-lab/parthenon/pull/1145) Fix remaining swarm D->H->D copies
 - [[PR 1150]](https://github.com/parthenon-hpc-lab/parthenon/pull/1150) Reduce memory consumption for buffer pool
 - [[PR 1146]](https://github.com/parthenon-hpc-lab/parthenon/pull/1146) Fix an issue outputting >4GB single variables per rank
 - [[PR 1152]](https://github.com/parthenon-hpc-lab/parthenon/pull/1152) Fix memory leak in task graph outputs related to `abi::__cxa_demangle`
diff --git a/example/particles/parthinput.particles b/example/particles/parthinput.particles
index bca9ec93292e..a79f774c6850 100644
--- a/example/particles/parthinput.particles
+++ b/example/particles/parthinput.particles
@@ -24,10 +24,8 @@ refinement = none
 nx1 = 16
 x1min = -0.5
 x1max = 0.5
-ix1_bc = user
-ox1_bc = user
-# ix1_bc = periodic # Optionally use periodic boundary conditions everywhere
-# ox1_bc = periodic
+ix1_bc = periodic
+ox1_bc = periodic
 
 nx2 = 16
 x2min = -0.5
diff --git a/example/particles/particles.cpp b/example/particles/particles.cpp
index 8a8d47ef111a..3f99407ded54 100644
--- a/example/particles/particles.cpp
+++ b/example/particles/particles.cpp
@@ -340,8 +340,7 @@ TaskStatus CreateSomeParticles(MeshBlock *pmb, const double t0) {
   return TaskStatus::complete;
 }
 
-TaskStatus TransportParticles(MeshBlock *pmb, const StagedIntegrator *integrator,
-                              const double t0) {
+TaskStatus TransportParticles(MeshBlock *pmb, const double t0, const double dt) {
   PARTHENON_INSTRUMENT
 
   auto swarm = pmb->meshblock_data.Get()->GetSwarmData()->Get("my_particles");
@@ -350,8 +349,6 @@ TaskStatus TransportParticles(MeshBlock *pmb, const StagedIntegrator *integrator
 
   int max_active_index = swarm->GetMaxActiveIndex();
 
-  Real dt = integrator->dt;
-
   auto &t = swarm->Get<Real>("t").Get();
   auto &x = swarm->Get<Real>(swarm_position::x::name()).Get();
   auto &y = swarm->Get<Real>(swarm_position::y::name()).Get();
@@ -469,97 +466,31 @@ TaskStatus TransportParticles(MeshBlock *pmb, const StagedIntegrator *integrator
 // Custom step function to allow for looping over MPI-related tasks until complete
 TaskListStatus ParticleDriver::Step() {
   TaskListStatus status;
-  integrator.dt = tm.dt;
+
+  PARTHENON_REQUIRE(integrator.nstages == 1,
+                    "Only first order time integration supported!");
 
   BlockList_t &blocks = pmesh->block_list;
   auto num_task_lists_executed_independently = blocks.size();
 
   // Create all the particles that will be created during the step
   status = MakeParticlesCreationTaskCollection().Execute();
+  PARTHENON_REQUIRE(status == TaskListStatus::complete,
+                    "ParticlesCreation task list failed!");
 
-  // Loop over repeated MPI calls until every particle is finished. This logic is
-  // required because long-distance particle pushes can lead to a large, unpredictable
-  // number of MPI sends and receives.
-  bool particles_update_done = false;
-  while (!particles_update_done) {
-    status = MakeParticlesUpdateTaskCollection().Execute();
-
-    particles_update_done = true;
-    for (auto &block : blocks) {
-      // TODO(BRR) Despite this "my_particles"-specific call, this function feels like it
-      // should be generalized
-      auto swarm = block->meshblock_data.Get()->GetSwarmData()->Get("my_particles");
-      if (!swarm->finished_transport) {
-        particles_update_done = false;
-      }
-    }
-  }
+  // Transport particles iteratively until all particles reach final time
+  status = IterativeTransport();
+  // status = MakeParticlesTransportTaskCollection().Execute();
+  PARTHENON_REQUIRE(status == TaskListStatus::complete,
+                    "IterativeTransport task list failed!");
 
   // Use a more traditional task list for predictable post-MPI evaluations.
   status = MakeFinalizationTaskCollection().Execute();
+  PARTHENON_REQUIRE(status == TaskListStatus::complete, "Finalization task list failed!");
 
   return status;
 }
 
-// TODO(BRR) This should really be in parthenon/src... but it can't just live in Swarm
-// because of the loop over blocks
-TaskStatus StopCommunicationMesh(const BlockList_t &blocks) {
-  PARTHENON_INSTRUMENT
-
-  int num_sent_local = 0;
-  for (auto &block : blocks) {
-    auto sc = block->meshblock_data.Get()->GetSwarmData();
-    auto swarm = sc->Get("my_particles");
-    swarm->finished_transport = false;
-    num_sent_local += swarm->num_particles_sent_;
-  }
-
-  int num_sent_global = num_sent_local; // potentially overwritten by following Allreduce
-#ifdef MPI_PARALLEL
-  for (auto &block : blocks) {
-    auto swarm = block->meshblock_data.Get()->GetSwarmData()->Get("my_particles");
-    for (int n = 0; n < block->neighbors.size(); n++) {
-      NeighborBlock &nb = block->neighbors[n];
-      // TODO(BRR) May want logic like this if we have non-blocking TaskRegions
-      // if (nb.snb.rank != Globals::my_rank) {
-      //  if (swarm->vbswarm->bd_var_.flag[nb.bufid] != BoundaryStatus::completed) {
-      //    return TaskStatus::incomplete;
-      //  }
-      //}
-
-      // TODO(BRR) May want to move this logic into a per-cycle initialization call
-      if (swarm->vbswarm->bd_var_.flag[nb.bufid] == BoundaryStatus::completed) {
-        swarm->vbswarm->bd_var_.req_send[nb.bufid] = MPI_REQUEST_NULL;
-      }
-    }
-  }
-
-  MPI_Allreduce(&num_sent_local, &num_sent_global, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
-#endif // MPI_PARALLEL
-
-  if (num_sent_global == 0) {
-    for (auto &block : blocks) {
-      auto &pmb = block;
-      auto sc = pmb->meshblock_data.Get()->GetSwarmData();
-      auto swarm = sc->Get("my_particles");
-      swarm->finished_transport = true;
-    }
-  }
-
-  // Reset boundary statuses
-  for (auto &block : blocks) {
-    auto &pmb = block;
-    auto sc = pmb->meshblock_data.Get()->GetSwarmData();
-    auto swarm = sc->Get("my_particles");
-    for (int n = 0; n < pmb->neighbors.size(); n++) {
-      auto &nb = block->neighbors[n];
-      swarm->vbswarm->bd_var_.flag[nb.bufid] = BoundaryStatus::waiting;
-    }
-  }
-
-  return TaskStatus::complete;
-}
-
 TaskCollection ParticleDriver::MakeParticlesCreationTaskCollection() const {
   TaskCollection tc;
   TaskID none(0);
@@ -577,40 +508,93 @@ TaskCollection ParticleDriver::MakeParticlesCreationTaskCollection() const {
   return tc;
 }
 
-TaskCollection ParticleDriver::MakeParticlesUpdateTaskCollection() const {
+TaskStatus CountNumSent(const BlockList_t &blocks, const double tf_, bool *done) {
+  int num_unfinished = 0;
+  for (auto &block : blocks) {
+    auto sc = block->meshblock_data.Get()->GetSwarmData();
+    auto swarm = sc->Get("my_particles");
+    int max_active_index = swarm->GetMaxActiveIndex();
+
+    auto &t = swarm->Get<Real>("t").Get();
+
+    auto swarm_d = swarm->GetDeviceContext();
+
+    const auto &tf = tf_;
+
+    parthenon::par_reduce(
+        PARTHENON_AUTO_LABEL, 0, max_active_index,
+        KOKKOS_LAMBDA(const int n, int &num_unfinished) {
+          if (swarm_d.IsActive(n)) {
+            if (t(n) < tf) {
+              num_unfinished++;
+            }
+          }
+        },
+        Kokkos::Sum<int>(num_unfinished));
+  }
+
+#ifdef MPI_PARALLEL
+  MPI_Allreduce(MPI_IN_PLACE, &num_unfinished, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
+#endif // MPI_PARALLEL
+
+  if (num_unfinished > 0) {
+    *done = false;
+  } else {
+    *done = true;
+  }
+
+  return TaskStatus::complete;
+}
+
+TaskCollection ParticleDriver::IterativeTransportTaskCollection(bool *done) const {
   TaskCollection tc;
   TaskID none(0);
-  const double t0 = tm.time;
   const BlockList_t &blocks = pmesh->block_list;
+  const int nblocks = blocks.size();
+  const double t0 = tm.time;
+  const double dt = tm.dt;
 
-  auto num_task_lists_executed_independently = blocks.size();
-
-  TaskRegion &async_region0 = tc.AddRegion(num_task_lists_executed_independently);
-  for (int i = 0; i < blocks.size(); i++) {
+  TaskRegion &async_region = tc.AddRegion(nblocks);
+  for (int i = 0; i < nblocks; i++) {
     auto &pmb = blocks[i];
-
     auto &sc = pmb->meshblock_data.Get()->GetSwarmData();
+    auto &tl = async_region[i];
 
-    auto &tl = async_region0[i];
-
-    auto transport_particles =
-        tl.AddTask(none, TransportParticles, pmb.get(), &integrator, t0);
-
-    auto send = tl.AddTask(transport_particles, &SwarmContainer::Send, sc.get(),
-                           BoundaryCommSubset::all);
+    auto transport = tl.AddTask(none, TransportParticles, pmb.get(), t0, dt);
+    auto reset_comms =
+        tl.AddTask(transport, &SwarmContainer::ResetCommunication, sc.get());
+    auto send =
+        tl.AddTask(reset_comms, &SwarmContainer::Send, sc.get(), BoundaryCommSubset::all);
     auto receive =
         tl.AddTask(send, &SwarmContainer::Receive, sc.get(), BoundaryCommSubset::all);
   }
 
-  TaskRegion &sync_region0 = tc.AddRegion(1);
+  TaskRegion &sync_region = tc.AddRegion(1);
   {
-    auto &tl = sync_region0[0];
-    auto stop_comm = tl.AddTask(none, StopCommunicationMesh, blocks);
+    auto &tl = sync_region[0];
+    auto check_completion = tl.AddTask(none, CountNumSent, blocks, t0 + dt, done);
   }
 
   return tc;
 }
 
+// TODO(BRR) to be replaced by iterative tasklist machinery
+TaskListStatus ParticleDriver::IterativeTransport() const {
+  TaskListStatus status;
+  bool transport_done = false;
+  int n_transport_iter = 0;
+  int n_transport_iter_max = 1000;
+  while (!transport_done) {
+    status = IterativeTransportTaskCollection(&transport_done).Execute();
+
+    n_transport_iter++;
+    PARTHENON_REQUIRE(n_transport_iter < n_transport_iter_max,
+                      "Too many transport iterations!");
+  }
+
+  return status;
+}
+
 TaskCollection ParticleDriver::MakeFinalizationTaskCollection() const {
   TaskCollection tc;
   TaskID none(0);
diff --git a/example/particles/particles.hpp b/example/particles/particles.hpp
index 6d74d59dbbb8..705572675247 100644
--- a/example/particles/particles.hpp
+++ b/example/particles/particles.hpp
@@ -33,7 +33,9 @@ class ParticleDriver : public EvolutionDriver {
   ParticleDriver(ParameterInput *pin, ApplicationInput *app_in, Mesh *pm)
       : EvolutionDriver(pin, app_in, pm), integrator(pin) {}
   TaskCollection MakeParticlesCreationTaskCollection() const;
-  TaskCollection MakeParticlesUpdateTaskCollection() const;
+  TaskCollection MakeParticlesTransportTaskCollection() const;
+  TaskListStatus IterativeTransport() const;
+  TaskCollection IterativeTransportTaskCollection(bool *done) const;
   TaskCollection MakeFinalizationTaskCollection() const;
   TaskListStatus Step();
 
diff --git a/src/interface/state_descriptor.cpp b/src/interface/state_descriptor.cpp
index aa3fa5e6af27..af73259ff6aa 100644
--- a/src/interface/state_descriptor.cpp
+++ b/src/interface/state_descriptor.cpp
@@ -452,6 +452,13 @@ StateDescriptor::CreateResolvedStateDescriptor(Packages_t &packages) {
     field_tracker.CategorizeCollection(name, field_dict, &field_provider);
     swarm_tracker.CategorizeCollection(name, package->AllSwarms(), &swarm_provider);
 
+    if (!package->AllSwarms().empty() && !std::is_same<Real, double>::value) {
+      PARTHENON_WARN(
+          "Swarms always use Real precision, even for ParticleVariables containing "
+          "time data, while Parthenon time variables are fixed to double precision. This "
+          "may cause inaccurate comparisons with cycle beginning and end times.")
+    }
+
     // Add package registered boundary conditions
     for (int i = 0; i < 6; ++i)
       state->UserBoundaryFunctions[i].insert(state->UserBoundaryFunctions[i].end(),
diff --git a/src/interface/swarm.cpp b/src/interface/swarm.cpp
index b6f1fb6fd991..8e715f1cb8db 100644
--- a/src/interface/swarm.cpp
+++ b/src/interface/swarm.cpp
@@ -65,38 +65,30 @@ SwarmDeviceContext Swarm::GetDeviceContext() const {
 
 Swarm::Swarm(const std::string &label, const Metadata &metadata, const int nmax_pool_in)
     : label_(label), m_(metadata), nmax_pool_(nmax_pool_in), mask_("mask", nmax_pool_),
-      marked_for_removal_("mfr", nmax_pool_), block_index_("block_index_", nmax_pool_),
+      marked_for_removal_("mfr", nmax_pool_),
+      empty_indices_("empty_indices_", nmax_pool_),
+      block_index_("block_index_", nmax_pool_),
       neighbor_indices_("neighbor_indices_", 4, 4, 4),
-      new_indices_("new_indices_", nmax_pool_),
-      from_to_indices_("from_to_indices_", nmax_pool_ + 1),
-      recv_neighbor_index_("recv_neighbor_index_", nmax_pool_),
-      recv_buffer_index_("recv_buffer_index_", nmax_pool_),
+      new_indices_("new_indices_", nmax_pool_), scratch_a_("scratch_a_", nmax_pool_),
+      scratch_b_("scratch_b_", nmax_pool_),
       num_particles_to_send_("num_particles_to_send_", NMAX_NEIGHBORS),
+      buffer_counters_("buffer_counters_", NMAX_NEIGHBORS),
+      neighbor_received_particles_("neighbor_received_particles_", NMAX_NEIGHBORS),
       cell_sorted_("cell_sorted_", nmax_pool_), mpiStatus(true) {
   PARTHENON_REQUIRE_THROWS(typeid(Coordinates_t) == typeid(UniformCartesian),
                            "SwarmDeviceContext only supports a uniform Cartesian mesh!");
 
   uid_ = get_uid_(label_);
 
+  // Add default swarm fields
   Add(swarm_position::x::name(), Metadata({Metadata::Real}));
   Add(swarm_position::y::name(), Metadata({Metadata::Real}));
   Add(swarm_position::z::name(), Metadata({Metadata::Real}));
+
+  // Initialize index metadata
   num_active_ = 0;
   max_active_index_ = inactive_max_active_index;
-
-  // TODO(BRR) Do this in a device kernel?
-  auto mask_h = Kokkos::create_mirror_view(HostMemSpace(), mask_);
-  auto marked_for_removal_h =
-      Kokkos::create_mirror_view(HostMemSpace(), marked_for_removal_);
-
-  for (int n = 0; n < nmax_pool_; n++) {
-    mask_h(n) = false;
-    marked_for_removal_h(n) = false;
-    free_indices_.push_back(n);
-  }
-
-  Kokkos::deep_copy(mask_, mask_h);
-  Kokkos::deep_copy(marked_for_removal_, marked_for_removal_h);
+  UpdateEmptyIndices();
 }
 
 void Swarm::Add(const std::vector<std::string> &label_array, const Metadata &metadata) {
@@ -196,25 +188,21 @@ void Swarm::Remove(const std::string &label) {
   }
 }
 
-void Swarm::setPoolMax(const std::int64_t nmax_pool) {
+void Swarm::SetPoolMax(const std::int64_t nmax_pool) {
   PARTHENON_REQUIRE(nmax_pool > nmax_pool_, "Must request larger pool size!");
-  std::int64_t n_new_begin = nmax_pool_;
   std::int64_t n_new = nmax_pool - nmax_pool_;
 
   auto pmb = GetBlockPointer();
   auto pm = pmb->pmy_mesh;
 
-  for (std::int64_t n = 0; n < n_new; n++) {
-    free_indices_.push_back(n + n_new_begin);
-  }
-
   // Rely on Kokkos setting the newly added values to false for these arrays
   Kokkos::resize(mask_, nmax_pool);
   Kokkos::resize(marked_for_removal_, nmax_pool);
+  Kokkos::resize(empty_indices_, nmax_pool);
   Kokkos::resize(new_indices_, nmax_pool);
-  Kokkos::resize(from_to_indices_, nmax_pool + 1);
-  Kokkos::resize(recv_neighbor_index_, nmax_pool);
-  Kokkos::resize(recv_buffer_index_, nmax_pool);
+  Kokkos::resize(scratch_a_, nmax_pool);
+  Kokkos::resize(scratch_b_, nmax_pool);
+
   pmb->LogMemUsage(2 * n_new * sizeof(bool));
 
   Kokkos::resize(cell_sorted_, nmax_pool);
@@ -240,7 +228,10 @@ void Swarm::setPoolMax(const std::int64_t nmax_pool) {
 
   nmax_pool_ = nmax_pool;
 
-  // Eliminate any cached SwarmPacks, as they will need to be rebuilt following setPoolMax
+  // Populate new empty indices
+  UpdateEmptyIndices();
+
+  // Eliminate any cached SwarmPacks, as they will need to be rebuilt following SetPoolMax
   pmb->meshblock_data.Get()->ClearSwarmCaches();
   pm->mesh_data.Get("base")->ClearSwarmCaches();
   for (auto &partition : pm->GetDefaultBlockPartitions()) {
@@ -251,129 +242,153 @@ void Swarm::setPoolMax(const std::int64_t nmax_pool) {
 NewParticlesContext Swarm::AddEmptyParticles(const int num_to_add) {
   PARTHENON_DEBUG_REQUIRE(num_to_add >= 0, "Cannot add negative numbers of particles!");
 
+  auto pmb = GetBlockPointer();
+
   if (num_to_add > 0) {
-    while (free_indices_.size() < num_to_add) {
-      increasePoolMax();
+    while (nmax_pool_ - num_active_ < num_to_add) {
+      IncreasePoolMax();
     }
 
-    // TODO(BRR) Use par_scan on device rather than do this on host
-    auto mask_h = Kokkos::create_mirror_view_and_copy(HostMemSpace(), mask_);
-
-    auto block_index_h = block_index_.GetHostMirrorAndCopy();
+    auto &new_indices = new_indices_;
+    auto &empty_indices = empty_indices_;
+    auto &mask = mask_;
 
-    auto free_index = free_indices_.begin();
+    int max_new_active_index = 0;
+    parthenon::par_reduce(
+        PARTHENON_AUTO_LABEL, 0, num_to_add - 1,
+        KOKKOS_LAMBDA(const int n, int &max_ind) {
+          new_indices(n) = empty_indices(n);
+          mask(new_indices(n)) = true;
 
-    auto new_indices_h = new_indices_.GetHostMirror();
-
-    // Don't bother sanitizing the memory
-    for (int n = 0; n < num_to_add; n++) {
-      mask_h(*free_index) = true;
-      block_index_h(*free_index) = this_block_;
-      max_active_index_ = std::max<int>(max_active_index_, *free_index);
-      new_indices_h(n) = *free_index;
-
-      free_index = free_indices_.erase(free_index);
-    }
+          // Record vote for max active index
+          max_ind = new_indices(n);
+        },
+        Kokkos::Max<int>(max_new_active_index));
 
-    new_indices_.DeepCopy(new_indices_h);
+    // Update max active index if necessary
+    max_active_index_ = std::max(max_active_index_, max_new_active_index);
 
+    new_indices_max_idx_ = num_to_add - 1;
     num_active_ += num_to_add;
 
-    Kokkos::deep_copy(mask_, mask_h);
-    block_index_.DeepCopy(block_index_h);
-    new_indices_max_idx_ = num_to_add - 1;
+    UpdateEmptyIndices();
   } else {
     new_indices_max_idx_ = -1;
   }
 
+  // Create and return NewParticlesContext
   return NewParticlesContext(new_indices_max_idx_, new_indices_);
 }
 
+// Updates the empty_indices_ array so the first N elements contain an ascending list of
+// indices into empty elements of the swarm pool, where N is the number of empty indices
+void Swarm::UpdateEmptyIndices() {
+  auto &mask = mask_;
+  auto &empty_indices = empty_indices_;
+
+  // Associate scratch memory
+  auto &empty_indices_scan = scratch_a_;
+
+  // Calculate prefix sum of empty indices
+  parthenon::par_scan(
+      "Set empty indices prefix sum", 0, nmax_pool_ - 1,
+      KOKKOS_LAMBDA(const int n, int &update, const bool &final) {
+        const int val = !mask(n);
+        if (val) {
+          update += 1;
+        }
+
+        if (final) {
+          empty_indices_scan(n) = update;
+        }
+      });
+
+  // Update list of empty indices such that it is contiguous and in ascending order
+  parthenon::par_for(
+      PARTHENON_AUTO_LABEL, 0, nmax_pool_ - 1, KOKKOS_LAMBDA(const int n) {
+        if (!mask(n)) {
+          empty_indices(empty_indices_scan(n) - 1) = n;
+        }
+      });
+}
+
 // No active particles: nmax_active_index = inactive_max_active_index (= -1)
 // No particles removed: nmax_active_index unchanged
 // Particles removed: nmax_active_index is new max active index
 void Swarm::RemoveMarkedParticles() {
-  // TODO(BRR) Use par_scan to do this on device rather than host
-  auto mask_h = Kokkos::create_mirror_view_and_copy(HostMemSpace(), mask_);
-  auto marked_for_removal_h =
-      Kokkos::create_mirror_view_and_copy(HostMemSpace(), marked_for_removal_);
-
-  // loop backwards to keep free_indices_ updated correctly
-  for (int n = max_active_index_; n >= 0; n--) {
-    if (mask_h(n)) {
-      if (marked_for_removal_h(n)) {
-        mask_h(n) = false;
-        free_indices_.push_front(n);
-        num_active_ -= 1;
-        if (n == max_active_index_) {
-          max_active_index_ -= 1;
+  int &max_active_index = max_active_index_;
+
+  auto &mask = mask_;
+  auto &marked_for_removal = marked_for_removal_;
+
+  // Update mask, count number of removed particles
+  int num_removed = 0;
+  parthenon::par_reduce(
+      PARTHENON_AUTO_LABEL, 0, max_active_index,
+      KOKKOS_LAMBDA(const int n, int &removed) {
+        if (mask(n)) {
+          if (marked_for_removal(n)) {
+            mask(n) = false;
+            marked_for_removal(n) = false;
+            removed += 1;
+          }
         }
-        marked_for_removal_h(n) = false;
-      }
-    }
-  }
+      },
+      Kokkos::Sum<int>(num_removed));
 
-  Kokkos::deep_copy(mask_, mask_h);
-  Kokkos::deep_copy(marked_for_removal_, marked_for_removal_h);
+  num_active_ -= num_removed;
+
+  UpdateEmptyIndices();
 }
 
 void Swarm::Defrag() {
   if (GetNumActive() == 0) {
     return;
   }
-  // TODO(BRR) Could this algorithm be more efficient? Does it matter?
-  // Add 1 to convert max index to max number
-  std::int64_t num_free = (max_active_index_ + 1) - num_active_;
-  auto pmb = GetBlockPointer();
 
-  auto from_to_indices_h = from_to_indices_.GetHostMirror();
+  // Associate scratch memory
+  auto &scan_scratch_toread = scratch_a_;
+  auto &map = scratch_b_;
 
-  auto mask_h = Kokkos::create_mirror_view_and_copy(HostMemSpace(), mask_);
-
-  for (int n = 0; n <= max_active_index_; n++) {
-    from_to_indices_h(n) = unset_index_;
-  }
-
-  std::list<int> new_free_indices;
-
-  free_indices_.sort();
-
-  int index = max_active_index_;
-  int num_to_move = std::min<int>(num_free, num_active_);
-  for (int n = 0; n < num_to_move; n++) {
-    while (mask_h(index) == false) {
-      index--;
-    }
-    int index_to_move_from = index;
-    index--;
-
-    // Below this number "moved" particles should actually stay in place
-    if (index_to_move_from < num_active_) {
-      break;
-    }
-    int index_to_move_to = free_indices_.front();
-    free_indices_.pop_front();
-    new_free_indices.push_back(index_to_move_from);
-    from_to_indices_h(index_to_move_from) = index_to_move_to;
-  }
+  auto &mask = mask_;
 
-  // TODO(BRR) Not all these sorts may be necessary
-  new_free_indices.sort();
-  free_indices_.merge(new_free_indices);
+  const int &num_active = num_active_;
+  parthenon::par_scan(
+      "Set empty indices prefix sum", 0, nmax_pool_ - num_active_ - 1,
+      KOKKOS_LAMBDA(const int nn, int &update, const bool &final) {
+        const int n = nn + num_active;
+        const int val = mask(n);
+        if (val) {
+          update += 1;
+        }
+        if (final) scan_scratch_toread(n) = update;
+      });
 
-  from_to_indices_.DeepCopy(from_to_indices_h);
+  parthenon::par_for(
+      PARTHENON_AUTO_LABEL, 0, nmax_pool_ - 1, KOKKOS_LAMBDA(const int n) {
+        if (n >= num_active) {
+          if (mask(n)) {
+            map(scan_scratch_toread(n) - 1) = n;
+          }
+          mask(n) = false;
+        }
+      });
 
-  auto from_to_indices = from_to_indices_;
+  // Reuse scratch memory
+  auto &scan_scratch_towrite = scan_scratch_toread;
 
-  auto &mask = mask_;
-  pmb->par_for(
-      PARTHENON_AUTO_LABEL, 0, max_active_index_, KOKKOS_LAMBDA(const int n) {
-        if (from_to_indices(n) >= 0) {
-          mask(from_to_indices(n)) = mask(n);
-          mask(n) = false;
+  // Update list of empty indices
+  parthenon::par_scan(
+      "Set empty indices prefix sum", 0, num_active_ - 1,
+      KOKKOS_LAMBDA(const int n, int &update, const bool &final) {
+        const int val = !mask(n);
+        if (val) {
+          update += 1;
         }
+        if (final) scan_scratch_towrite(n) = update;
       });
 
+  // Get all dynamical variables in swarm
   auto &int_vector = std::get<getType<int>()>(vectors_);
   auto &real_vector = std::get<getType<Real>()>(vectors_);
   PackIndexMap real_imap;
@@ -387,15 +402,19 @@ void Swarm::Defrag() {
   const int realPackDim = vreal.GetDim(2);
   const int intPackDim = vint.GetDim(2);
 
-  pmb->par_for(
-      PARTHENON_AUTO_LABEL, 0, max_active_index_, KOKKOS_LAMBDA(const int n) {
-        if (from_to_indices(n) >= 0) {
+  // Loop over only the active number of particles, and if mask is empty, copy in particle
+  // using address from prefix sum
+  parthenon::par_for(
+      PARTHENON_AUTO_LABEL, 0, num_active_ - 1, KOKKOS_LAMBDA(const int n) {
+        if (!mask(n)) {
+          const int nread = map(scan_scratch_towrite(n) - 1);
           for (int vidx = 0; vidx < realPackDim; vidx++) {
-            vreal(vidx, from_to_indices(n)) = vreal(vidx, n);
+            vreal(vidx, n) = vreal(vidx, nread);
           }
           for (int vidx = 0; vidx < intPackDim; vidx++) {
-            vint(vidx, from_to_indices(n)) = vint(vidx, n);
+            vint(vidx, n) = vint(vidx, nread);
           }
+          mask(n) = true;
         }
       });
 
diff --git a/src/interface/swarm.hpp b/src/interface/swarm.hpp
index a3e366789986..8eb31a3d1cd7 100644
--- a/src/interface/swarm.hpp
+++ b/src/interface/swarm.hpp
@@ -161,10 +161,10 @@ class Swarm {
   std::string info() const { return info_; }
 
   /// Expand pool size geometrically as necessary
-  void increasePoolMax() { setPoolMax(2 * nmax_pool_); }
+  void IncreasePoolMax() { SetPoolMax(2 * nmax_pool_); }
 
   /// Set max pool size
-  void setPoolMax(const std::int64_t nmax_pool);
+  void SetPoolMax(const std::int64_t nmax_pool);
 
   /// Check whether metadata bit is set
   bool IsSet(const MetadataFlag bit) const { return m_.IsSet(bit); }
@@ -182,6 +182,9 @@ class Swarm {
   /// indicates gaps in the list.
   Real GetPackingEfficiency() const { return num_active_ / (max_active_index_ + 1); }
 
+  // Update sorted array of empty indices in the current memory pool
+  void UpdateEmptyIndices();
+
   /// Remove particles marked for removal and update internal indexing
   void RemoveMarkedParticles();
 
@@ -227,14 +230,10 @@ class Swarm {
   SwarmVariablePack<T> PackVariables(const std::vector<std::string> &name,
                                      PackIndexMap &vmap);
 
-  // Temporarily public
-  int num_particles_sent_;
-  bool finished_transport;
-
-  void LoadBuffers_(const int max_indices_size);
+  void LoadBuffers_();
   void UnloadBuffers_();
 
-  int CountParticlesToSend_(); // Must be public for launching kernel
+  void CountParticlesToSend_(); // Must be public for launching kernel
 
   template <typename T>
   const auto &GetVariableVector() const {
@@ -251,10 +250,6 @@ class Swarm {
 
   void SetNeighborIndices_();
 
-  void CountReceivedParticles_();
-  void UpdateNeighborBufferReceiveIndices_(ParArray1D<int> &neighbor_index,
-                                           ParArray1D<int> &buffer_index);
-
   template <class T>
   SwarmVariablePack<T> PackAllVariables_(PackIndexMap &vmap);
 
@@ -273,28 +268,25 @@ class Swarm {
 
   std::tuple<MapToParticle<int>, MapToParticle<Real>> maps_;
 
-  std::list<int> free_indices_;
   ParArray1D<bool> mask_;
   ParArray1D<bool> marked_for_removal_;
+  ParArray1D<int> empty_indices_; // Indices of empty slots in particle pool
   ParArrayND<int> block_index_; // Neighbor index for each particle. -1 for current block.
   ParArrayND<int> neighbor_indices_; // Indexing of vbvar's neighbor array. -1 for same.
                                      // k,j indices unused in 3D&2D, 2D, respectively
-  ParArray1D<int> new_indices_;     // Persistent array that provides the new indices when
-                                    // AddEmptyParticles is called. Always defragmented.
-  int new_indices_max_idx_;         // Maximum valid index of new_indices_ array.
-  ParArray1D<int> from_to_indices_; // Array used for sorting particles during defragment
-                                    // step (size nmax_pool + 1).
-  ParArray1D<int> recv_neighbor_index_; // Neighbor indices for received particles
-  ParArray1D<int> recv_buffer_index_;   // Buffer indices for received particles
+  ParArray1D<int> new_indices_; // Persistent array that provides the new indices when
+                                // AddEmptyParticles is called. Always defragmented.
+  int new_indices_max_idx_;     // Maximum valid index of new_indices_ array.
+  ParArray1D<int> scratch_a_;   // Scratch memory for index sorting
+  ParArray1D<int> scratch_b_;   // Scratch memory for index sorting
 
   constexpr static int no_block_ = -2;
   constexpr static int this_block_ = -1;
   constexpr static int unset_index_ = -1;
 
   ParArray1D<int> num_particles_to_send_;
-  ParArrayND<int> particle_indices_to_send_;
-
-  std::vector<int> neighbor_received_particles_;
+  ParArray1D<int> buffer_counters_;
+  ParArray1D<int> neighbor_received_particles_;
   int total_received_particles_;
 
   ParArrayND<int> neighbor_buffer_index_; // Map from neighbor index to neighbor bufid
diff --git a/src/interface/swarm_comms.cpp b/src/interface/swarm_comms.cpp
index 07be992ac727..650bcef09304 100644
--- a/src/interface/swarm_comms.cpp
+++ b/src/interface/swarm_comms.cpp
@@ -130,36 +130,6 @@ void Swarm::SetNeighborIndices_() {
     }
   }
 
-  // Draft alternative approach due to LFR utilizing mesh offset comparisons at the
-  // highest refinement level
-  // auto ll_block = pmb->loc.GetDaughter(0, 0, 0);
-  // int finest_level = pmb->loc.level() + 1;
-  // for (auto &n : pmb->neighbors) {
-  //  std::vector<LogicalLocation> dlocs;
-  //  auto &nloc =
-  //      n.loc; // Would need to use the location in the coordinates of the origin tree
-  //  if (nloc.level() == finest_level) {
-  //    dlocs.emplace_back(nloc);
-  //  } else if (nloc.level() == finest_level) {
-  //    dlocs = nloc.GetDaughters(ndim);
-  //  } else if (nloc.level() == finest_level - 2) {
-  //    auto tlocs = nloc.GetDaughters(ndim);
-  //    for (auto &t : tlocs) {
-  //      auto gdlocs = t.GetDaughters(ndim);
-  //      dlocs.insert(dlocs.end(), gdlocs.begin(), gdlocs.end());
-  //    }
-  //  } else {
-  //    PARTHENON_FAIL("Proper nesting is not being respected.");
-  //  }
-  //  for (auto &d : dlocs) {
-  //    const int k = d.lx3() - ll_block.lx3() + 1;
-  //    const int j = d.lx2() - ll_block.lx2() + 1;
-  //    const int i = d.lx1() - ll_block.lx1() + 1;
-  //    if (i >= 0 && i <= 3 && j >= 0 && j <= 3 && k >= 0 && k <= 3)
-  //      neighbor_indices_h(k, j, i) = n.gid;
-  //  }
-  //}
-
   neighbor_indices_.DeepCopy(neighbor_indices_h);
 }
 
@@ -174,8 +144,6 @@ void Swarm::SetupPersistentMPI() {
   // Build up convenience array of neighbor indices
   SetNeighborIndices_();
 
-  neighbor_received_particles_.resize(nbmax);
-
   // Build device array mapping neighbor index to neighbor bufid
   if (pmb->neighbors.size() > 0) {
     ParArrayND<int> neighbor_buffer_index("Neighbor buffer index", pmb->neighbors.size());
@@ -188,7 +156,7 @@ void Swarm::SetupPersistentMPI() {
   }
 }
 
-int Swarm::CountParticlesToSend_() {
+void Swarm::CountParticlesToSend_() {
   auto mask_h = Kokkos::create_mirror_view_and_copy(HostMemSpace(), mask_);
   auto swarm_d = GetDeviceContext();
   auto pmb = GetBlockPointer();
@@ -197,10 +165,6 @@ int Swarm::CountParticlesToSend_() {
   // Fence to make sure particles aren't currently being transported locally
   // TODO(BRR) do this operation on device.
   pmb->exec_space.fence();
-  auto num_particles_to_send_h = num_particles_to_send_.GetHostMirror();
-  for (int n = 0; n < pmb->neighbors.size(); n++) {
-    num_particles_to_send_h(n) = 0;
-  }
   const int particle_size = GetParticleDataSize();
   vbswarm->particle_size = particle_size;
 
@@ -218,63 +182,31 @@ int Swarm::CountParticlesToSend_() {
         }
       });
 
-  int max_indices_size = 0;
-  int total_noblock_particles = 0;
-  auto block_index_h = block_index_.GetHostMirrorAndCopy();
-  for (int n = 0; n <= max_active_index_; n++) {
-    if (mask_h(n)) {
-      // This particle should be sent
-      if (block_index_h(n) >= 0) {
-        num_particles_to_send_h(block_index_h(n))++;
-        if (max_indices_size < num_particles_to_send_h(block_index_h(n))) {
-          max_indices_size = num_particles_to_send_h(block_index_h(n));
-        }
-      }
-      if (block_index_h(n) == no_block_) {
-        total_noblock_particles++;
-      }
-    }
-  }
-  // Size-0 arrays not permitted but we don't want to short-circuit subsequent logic
-  // that indicates completed communications
-  max_indices_size = std::max<int>(1, max_indices_size);
-
-  // Not a ragged-right array, just for convenience
-  if (total_noblock_particles > 0) {
-    auto noblock_indices =
-        ParArray1D<int>("Particles with no block", total_noblock_particles);
-    auto noblock_indices_h = noblock_indices.GetHostMirror();
-    int counter = 0;
-    for (int n = 0; n <= max_active_index_; n++) {
-      if (mask_h(n)) {
-        if (block_index_h(n) == no_block_) {
-          noblock_indices_h(counter) = n;
-          counter++;
+  // Facilitate lambda captures
+  auto &block_index = block_index_;
+  auto &num_particles_to_send = num_particles_to_send_;
+
+  // Zero out number of particles to send before accumulating
+  pmb->par_for(
+      PARTHENON_AUTO_LABEL, 0, NMAX_NEIGHBORS - 1,
+      KOKKOS_LAMBDA(const int n) { num_particles_to_send[n] = 0; });
+
+  parthenon::par_for(
+      PARTHENON_AUTO_LABEL, 0, max_active_index, KOKKOS_LAMBDA(const int n) {
+        if (swarm_d.IsActive(n)) {
+          bool on_current_mesh_block = true;
+          swarm_d.GetNeighborBlockIndex(n, x(n), y(n), z(n), on_current_mesh_block);
+
+          if (block_index(n) >= 0) {
+            Kokkos::atomic_add(&num_particles_to_send(block_index(n)), 1);
+          }
         }
-      }
-    }
-    noblock_indices.DeepCopy(noblock_indices_h);
-  }
+      });
 
-  // TODO(BRR) don't allocate dynamically
-  particle_indices_to_send_ =
-      ParArrayND<int>("Particle indices to send", nbmax, max_indices_size);
-  auto particle_indices_to_send_h = particle_indices_to_send_.GetHostMirror();
-  std::vector<int> counter(nbmax, 0);
-  for (int n = 0; n <= max_active_index_; n++) {
-    if (mask_h(n)) {
-      if (block_index_h(n) >= 0) {
-        particle_indices_to_send_h(block_index_h(n), counter[block_index_h(n)]) = n;
-        counter[block_index_h(n)]++;
-      }
-    }
-  }
-  num_particles_to_send_.DeepCopy(num_particles_to_send_h);
-  particle_indices_to_send_.DeepCopy(particle_indices_to_send_h);
+  auto num_particles_to_send_h = num_particles_to_send_.GetHostMirrorAndCopy();
 
-  num_particles_sent_ = 0;
+  // Resize send buffers if too small
   for (int n = 0; n < pmb->neighbors.size(); n++) {
-    // Resize buffer if too small
     const int bufid = pmb->neighbors[n].bufid;
     auto sendbuf = vbswarm->bd_var_.send[bufid];
     if (sendbuf.extent(0) < num_particles_to_send_h(n) * particle_size) {
@@ -282,13 +214,10 @@ int Swarm::CountParticlesToSend_() {
       vbswarm->bd_var_.send[bufid] = sendbuf;
     }
     vbswarm->send_size[bufid] = num_particles_to_send_h(n) * particle_size;
-    num_particles_sent_ += num_particles_to_send_h(n);
   }
-
-  return max_indices_size;
 }
 
-void Swarm::LoadBuffers_(const int max_indices_size) {
+void Swarm::LoadBuffers_() {
   auto swarm_d = GetDeviceContext();
   auto pmb = GetBlockPointer();
   const int particle_size = GetParticleDataSize();
@@ -303,34 +232,45 @@ void Swarm::LoadBuffers_(const int max_indices_size) {
   const int realPackDim = vreal.GetDim(2);
   const int intPackDim = vint.GetDim(2);
 
-  // Pack index:
-  // [variable start] [swarm idx]
+  auto &x = Get<Real>(swarm_position::x::name()).Get();
+  auto &y = Get<Real>(swarm_position::y::name()).Get();
+  auto &z = Get<Real>(swarm_position::z::name()).Get();
+
+  // Zero buffer index counters
+  auto &buffer_counters = buffer_counters_;
+  pmb->par_for(
+      PARTHENON_AUTO_LABEL, 0, NMAX_NEIGHBORS - 1,
+      KOKKOS_LAMBDA(const int n) { buffer_counters[n] = 0; });
 
   auto &bdvar = vbswarm->bd_var_;
-  auto num_particles_to_send = num_particles_to_send_;
-  auto particle_indices_to_send = particle_indices_to_send_;
   auto neighbor_buffer_index = neighbor_buffer_index_;
+  // Loop over active particles and use atomic operations to find indices into buffers if
+  // this particle will be sent.
   pmb->par_for(
-      PARTHENON_AUTO_LABEL, 0, max_indices_size - 1,
-      KOKKOS_LAMBDA(const int n) {            // Max index
-        for (int m = 0; m < nneighbor; m++) { // Number of neighbors
+      PARTHENON_AUTO_LABEL, 0, max_active_index_, KOKKOS_LAMBDA(const int n) {
+        if (swarm_d.IsActive(n)) {
+          bool on_current_mesh_block = true;
+          const int m =
+              swarm_d.GetNeighborBlockIndex(n, x(n), y(n), z(n), on_current_mesh_block);
           const int bufid = neighbor_buffer_index(m);
-          if (n < num_particles_to_send(m)) {
-            const int sidx = particle_indices_to_send(m, n);
-            int buffer_index = n * particle_size;
-            swarm_d.MarkParticleForRemoval(sidx);
+
+          if (m >= 0) {
+            const int bid = Kokkos::atomic_fetch_add(&buffer_counters(m), 1);
+            int buffer_index = bid * particle_size;
+            swarm_d.MarkParticleForRemoval(n);
             for (int i = 0; i < realPackDim; i++) {
-              bdvar.send[bufid](buffer_index) = vreal(i, sidx);
+              bdvar.send[bufid](buffer_index) = vreal(i, n);
               buffer_index++;
             }
             for (int i = 0; i < intPackDim; i++) {
-              bdvar.send[bufid](buffer_index) = static_cast<Real>(vint(i, sidx));
+              bdvar.send[bufid](buffer_index) = static_cast<Real>(vint(i, n));
               buffer_index++;
             }
           }
         }
       });
 
+  // Remove particles that were loaded to send to another block from this block
   RemoveMarkedParticles();
 }
 
@@ -339,99 +279,42 @@ void Swarm::Send(BoundaryCommSubset phase) {
   const int nneighbor = pmb->neighbors.size();
   auto swarm_d = GetDeviceContext();
 
-  if (nneighbor == 0) {
-    // TODO(BRR) Do we ever reach this branch?
-    // Process physical boundary conditions on "sent" particles
-    auto block_index_h = block_index_.GetHostMirrorAndCopy();
-    auto mask_h = Kokkos::create_mirror_view_and_copy(HostMemSpace(), mask_);
-
-    int total_sent_particles = 0;
-    pmb->par_reduce(
-        PARTHENON_AUTO_LABEL, 0, max_active_index_,
-        KOKKOS_LAMBDA(int n, int &total_sent_particles) {
-          if (swarm_d.IsActive(n)) {
-            if (!swarm_d.IsOnCurrentMeshBlock(n)) {
-              total_sent_particles++;
-            }
-          }
-        },
-        Kokkos::Sum<int>(total_sent_particles));
-
-    if (total_sent_particles > 0) {
-      ParArray1D<int> new_indices("new indices", total_sent_particles);
-      auto new_indices_h = new_indices.GetHostMirrorAndCopy();
-      int sent_particle_index = 0;
-      for (int n = 0; n <= max_active_index_; n++) {
-        if (mask_h(n)) {
-          if (block_index_h(n) >= 0 || block_index_h(n) == no_block_) {
-            new_indices_h(sent_particle_index) = n;
-            sent_particle_index++;
-          }
-        }
-      }
-      new_indices.DeepCopy(new_indices_h);
-    }
-  } else {
-    // Query particles for those to be sent
-    int max_indices_size = CountParticlesToSend_();
+  // Query particles for those to be sent
+  CountParticlesToSend_();
 
-    // Prepare buffers for send operations
-    LoadBuffers_(max_indices_size);
+  // Prepare buffers for send operations
+  LoadBuffers_();
 
-    // Send buffer data
-    vbswarm->Send(phase);
-  }
+  // Send buffer data
+  vbswarm->Send(phase);
 }
 
-void Swarm::CountReceivedParticles_() {
+void Swarm::UnloadBuffers_() {
   auto pmb = GetBlockPointer();
+
+  // Count received particles
   total_received_particles_ = 0;
+  auto &neighbor_received_particles = neighbor_received_particles_;
+  auto neighbor_received_particles_h = neighbor_received_particles.GetHostMirror();
   for (int n = 0; n < pmb->neighbors.size(); n++) {
     const int bufid = pmb->neighbors[n].bufid;
     if (vbswarm->bd_var_.flag[bufid] == BoundaryStatus::arrived) {
       PARTHENON_DEBUG_REQUIRE(vbswarm->recv_size[bufid] % vbswarm->particle_size == 0,
                               "Receive buffer is not divisible by particle size!");
-      neighbor_received_particles_[n] =
+      neighbor_received_particles_h(n) =
           vbswarm->recv_size[bufid] / vbswarm->particle_size;
-      total_received_particles_ += neighbor_received_particles_[n];
+      total_received_particles_ += neighbor_received_particles_h(n);
     } else {
-      neighbor_received_particles_[n] = 0;
+      neighbor_received_particles_h(n) = 0;
     }
   }
-}
-
-void Swarm::UpdateNeighborBufferReceiveIndices_(ParArray1D<int> &neighbor_index,
-                                                ParArray1D<int> &buffer_index) {
-  auto pmb = GetBlockPointer();
-  auto neighbor_index_h = neighbor_index.GetHostMirror();
-  auto buffer_index_h =
-      buffer_index.GetHostMirror(); // Index of each particle in its received buffer
-
-  int id = 0;
-  for (int n = 0; n < pmb->neighbors.size(); n++) {
-    for (int m = 0; m < neighbor_received_particles_[n]; m++) {
-      neighbor_index_h(id) = n;
-      buffer_index_h(id) = m;
-      id++;
-    }
-  }
-  neighbor_index.DeepCopy(neighbor_index_h);
-  buffer_index.DeepCopy(buffer_index_h);
-}
-
-void Swarm::UnloadBuffers_() {
-  auto pmb = GetBlockPointer();
-
-  CountReceivedParticles_();
 
   auto &bdvar = vbswarm->bd_var_;
+  const int nbmax = vbswarm->bd_var_.nbmax;
 
   if (total_received_particles_ > 0) {
     auto newParticlesContext = AddEmptyParticles(total_received_particles_);
 
-    auto &recv_neighbor_index = recv_neighbor_index_;
-    auto &recv_buffer_index = recv_buffer_index_;
-    UpdateNeighborBufferReceiveIndices_(recv_neighbor_index, recv_buffer_index);
     auto neighbor_buffer_index = neighbor_buffer_index_;
 
     auto &int_vector = std::get<getType<int>()>(vectors_);
@@ -443,18 +326,37 @@ void Swarm::UnloadBuffers_() {
     int realPackDim = vreal.GetDim(2);
     int intPackDim = vint.GetDim(2);
 
-    // construct map from buffer index to swarm index (or just return vector of
-    // indices!)
     const int particle_size = GetParticleDataSize();
     auto swarm_d = GetDeviceContext();
 
+    // Change meaning of neighbor_received_particles from particles per neighbor to
+    // cumulative particles per neighbor
+    int val_prev = 0;
+    for (int n = 0; n < nbmax; n++) {
+      int val_curr = neighbor_received_particles_h(n);
+      neighbor_received_particles_h(n) += val_prev;
+      val_prev += val_curr;
+    }
+    neighbor_received_particles.DeepCopy(neighbor_received_particles_h);
+
+    auto &x = Get<Real>(swarm_position::x::name()).Get();
+    auto &y = Get<Real>(swarm_position::y::name()).Get();
+    auto &z = Get<Real>(swarm_position::z::name()).Get();
+
     pmb->par_for(
         PARTHENON_AUTO_LABEL, 0, newParticlesContext.GetNewParticlesMaxIndex(),
         // n is both new particle index and index over buffer values
         KOKKOS_LAMBDA(const int n) {
           const int sid = newParticlesContext.GetNewParticleIndex(n);
-          const int nid = recv_neighbor_index(n);
-          int bid = recv_buffer_index(n) * particle_size;
+          // Search for neighbor id over cumulative indices
+          int nid = 0;
+          while (n >= neighbor_received_particles(nid) && nid < nbmax - 1) {
+            nid++;
+          }
+
+          // Convert neighbor id to buffer id
+          int bid = nid == 0 ? n * particle_size
+                             : (n - neighbor_received_particles(nid - 1)) * particle_size;
           const int nbid = neighbor_buffer_index(nid);
           for (int i = 0; i < realPackDim; i++) {
             vreal(i, sid) = bdvar.recv[nbid](bid);
diff --git a/src/interface/swarm_container.cpp b/src/interface/swarm_container.cpp
index c449ef084c6a..9fc26a213e03 100644
--- a/src/interface/swarm_container.cpp
+++ b/src/interface/swarm_container.cpp
@@ -36,36 +36,6 @@ void SwarmContainer::Initialize(const std::shared_ptr<StateDescriptor> resolved_
   }
 }
 
-void SwarmContainer::InitializeBoundaries(const std::shared_ptr<MeshBlock> pmb) {
-  if (swarmVector_.empty()) {
-    // No Swarms in this container, so no need to initialize boundaries
-    // This allows default reflecting boundary conditions to be used when no
-    // swarms are present in a parthenon calculation.
-    // NOTE SwarmContainer::Initialize must have already been called.
-    return;
-  }
-
-  std::stringstream msg;
-  auto &bcs = pmb->pmy_mesh->mesh_bcs;
-  // Check that, if we are using user BCs, they are actually enrolled, and unsupported BCs
-  // are not being used
-  for (int iFace = 0; iFace < 6; iFace++) {
-    if (bcs[iFace] == BoundaryFlag::user) {
-      if (pmb->pmy_mesh->forest.GetTreePtr(pmb->loc.tree())->SwarmBndryFnctn[iFace] ==
-          nullptr) {
-        msg << (iFace % 2 == 0 ? "i" : "o") << "x" << iFace / 2 + 1
-            << " user boundary requested but provided function is null!";
-        PARTHENON_FAIL(msg);
-      }
-    } else if (bcs[iFace] != BoundaryFlag::outflow &&
-               bcs[iFace] != BoundaryFlag::periodic) {
-      msg << (iFace % 2 == 0 ? "i" : "o") << "x" << iFace / 2 + 1 << " boundary flag "
-          << static_cast<int>(bcs[iFace]) << " not supported!";
-      PARTHENON_FAIL(msg);
-    }
-  }
-}
-
 void SwarmContainer::Add(const std::vector<std::string> &labelArray,
                          const Metadata &metadata) {
   // generate the vector and call Add
diff --git a/src/interface/swarm_container.hpp b/src/interface/swarm_container.hpp
index 3e2072db3a64..1a1486b2a44c 100644
--- a/src/interface/swarm_container.hpp
+++ b/src/interface/swarm_container.hpp
@@ -90,8 +90,6 @@ class SwarmContainer {
   void Initialize(const std::shared_ptr<StateDescriptor> resolved_packages,
                   const std::shared_ptr<MeshBlock> pmb);
 
-  void InitializeBoundaries(const std::shared_ptr<MeshBlock> pmb);
-
   void Add(std::shared_ptr<Swarm> swarm) {
     swarmVector_.push_back(swarm);
     swarmMap_[swarm->label()] = swarm;
diff --git a/src/mesh/forest/tree.cpp b/src/mesh/forest/tree.cpp
index d33fd757e353..fe1803882401 100644
--- a/src/mesh/forest/tree.cpp
+++ b/src/mesh/forest/tree.cpp
@@ -442,8 +442,6 @@ void Tree::EnrollBndryFncts(
       break;
     case BoundaryFlag::user:
       if (app_in->swarm_boundary_conditions[f] != nullptr) {
-        // This is checked to be non-null later in Swarm::AllocateBoundaries, in case user
-        // boundaries are requested but no swarms are used.
         SwarmBndryFnctn[f] = app_in->swarm_boundary_conditions[f];
       }
       break;
diff --git a/src/mesh/meshblock.cpp b/src/mesh/meshblock.cpp
index 3f84ce48293b..0bccf068576d 100644
--- a/src/mesh/meshblock.cpp
+++ b/src/mesh/meshblock.cpp
@@ -144,9 +144,6 @@ void MeshBlock::Initialize(int igid, int ilid, LogicalLocation iloc,
   auto &real_container = meshblock_data.Get();
   real_container->Initialize(shared_from_this());
 
-  // Initialize swarm boundary condition flags
-  real_container->GetSwarmData()->InitializeBoundaries(shared_from_this());
-
   // TODO(jdolence): Should these loops be moved to Variable creation
   // TODO(JMM): What variables should be in vars_cc_? They are used
   // for counting load-balance cost. Should it be different than the
diff --git a/src/tasks/tasks.hpp b/src/tasks/tasks.hpp
index 91b9e103d6aa..d432f0a7ea21 100644
--- a/src/tasks/tasks.hpp
+++ b/src/tasks/tasks.hpp
@@ -544,6 +544,8 @@ class TaskRegion {
 
  public:
   TaskRegion() = delete;
+  TaskRegion(const TaskRegion &) = delete; // Prevent copying TaskRegions during AddRegion
+                                           // calls which is a segfault
   explicit TaskRegion(const int num_lists) : task_lists(num_lists) {
     for (int i = 0; i < num_lists; i++)
       task_lists[i].SetID(i);
diff --git a/tst/unit/test_swarm.cpp b/tst/unit/test_swarm.cpp
index f365362ddcc6..bcff16106110 100644
--- a/tst/unit/test_swarm.cpp
+++ b/tst/unit/test_swarm.cpp
@@ -212,8 +212,8 @@ TEST_CASE("Swarm memory management", "[Swarm]") {
 
   // Check that data was moved during defrag
   x_h = swarm->Get<Real>(swarm_position::x::name()).Get().GetHostMirrorAndCopy();
-  REQUIRE(x_h(2) == 1.2);
-  REQUIRE(x_h(4) == 1.1);
+  REQUIRE(x_h(2) == 1.1);
+  REQUIRE(x_h(4) == 1.2);
   i_h = swarm->Get<int>("i").Get().GetHostMirrorAndCopy();
   REQUIRE(i_h(1) == 2);