diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 4b95511d82f5..9e1cbcfad76f 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -12,9 +12,12 @@ on: jobs: docs: name: build and deploy docs - runs-on: ubuntu-latest + # not using latest due to issues with pip user packages, see + # https://github.com/actions/runner-images/issues/10781 and + # https://github.com/actions/runner-images/issues/10636 + runs-on: ubuntu-22.04 - steps: + steps: - name: Checkout code uses: actions/checkout@v2 with: @@ -23,9 +26,9 @@ jobs: run: export DEBIAN_FRONTEND=noninteractive - name: install dependencies run: | - pip install --break-system-packages sphinx - pip install --break-system-packages sphinx-rtd-theme - pip install --break-system-packages sphinx-multiversion + pip install sphinx + pip install sphinx-rtd-theme + pip install sphinx-multiversion - name: build docs run: | echo "Repo = ${GITHUB_REPOSITORY}" diff --git a/CHANGELOG.md b/CHANGELOG.md index 56dcce3b2177..1d361fc0567f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,7 @@ ## Current develop ### Added (new features/APIs/variables/...) +- [[PR 1103]](https://github.com/parthenon-hpc-lab/parthenon/pull/1103) Add sparsity to vector wave equation test - [[PR 1185]](https://github.com/parthenon-hpc-lab/parthenon/pull/1185/files) Bugfix to particle defragmentation - [[PR 1184]](https://github.com/parthenon-hpc-lab/parthenon/pull/1184) Fix swarm block neighbor indexing in 1D, 2D - [[PR 1183]](https://github.com/parthenon-hpc-lab/parthenon/pull/1183) Fix particle leapfrog example initialization data @@ -11,6 +12,8 @@ - [[PR 1161]](https://github.com/parthenon-hpc-lab/parthenon/pull/1161) Make flux field Metadata accessible, add Metadata::CellMemAligned flag, small perfomance upgrades ### Changed (changing behavior/API/variables/...) +- [[PR 1206]](https://github.com/parthenon-hpc-lab/parthenon/pull/1206) Leapfrog fix +- [[PR 1203]](https://github.com/parthenon-hpc-lab/parthenon/pull/1203) Pin Ubuntu CI image - [[PR1177]](https://github.com/parthenon-hpc-lab/parthenon/pull/1177) Make mesh-level boundary conditions usable without the "user" flag - [[PR 1187]](https://github.com/parthenon-hpc-lab/parthenon/pull/1187) Make DataCollection::Add safer and generalize MeshBlockData::Initialize - [[Issue 1165]](https://github.com/parthenon-hpc-lab/parthenon/issues/1165) Bump Kokkos submodule to 4.4.1 @@ -19,6 +22,7 @@ ### Fixed (not changing behavior/API/variables/...) - [[PR 1188]](https://github.com/parthenon-hpc-lab/parthenon/pull/1188) Fix hdf5 output issue for metadata none variables, update test. +- [[PR 1170]](https://github.com/parthenon-hpc-lab/parthenon/pull/1170) Fixed incorrect initialization of an array by a const (not constexpr) value - [[PR 1189]](https://github.com/parthenon-hpc-lab/parthenon/pull/1189) Address CUDA MPI/ICP issue with Kokkos <=4.4.1 - [[PR 1178]](https://github.com/parthenon-hpc-lab/parthenon/pull/1178) Fix issue with mesh pointer when using relative residual tolerance in BiCGSTAB solver. - [[PR 1173]](https://github.com/parthenon-hpc-lab/parthenon/pull/1173) Make debugging easier by making parthenon throw an error if ParameterInput is different on multiple MPI ranks.
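The sparse machinery that PR 1103 exercises in the fine_advection example (see advection_package.cpp further down) follows a short registration pattern: build a Metadata object that carries Metadata::Sparse, attach allocation/deallocation thresholds, and register a pool of sparse indices with the package. The sketch below restates that pattern in isolation; the thresholds and single-component shape mirror the defaults read from the input block in this patch, while the variable type phi is assumed to be the one the example declares with its VARIABLE macro, and the typed AddSparsePool call is written by analogy with the example's other typed registrations.

// Sketch only: mirrors the sparse-pool registration pattern used in
// advection_package.cpp below; `phi` and the function name are illustrative.
#include <memory>
#include <numeric>
#include <vector>

#include "interface/metadata.hpp"
#include "interface/state_descriptor.hpp"

std::shared_ptr<parthenon::StateDescriptor> RegisterSparseAdvectedField(int sparse_size) {
  using parthenon::Metadata;
  auto pkg = std::make_shared<parthenon::StateDescriptor>("advection_package");

  // Cell-centered, evolved field; the Sparse flag makes per-block allocation optional.
  Metadata m({Metadata::Cell, Metadata::Independent, Metadata::WithFluxes,
              Metadata::FillGhost, Metadata::Sparse},
             std::vector<int>{1});
  // Allocate a block's copy above 1e-6, deallocate below 5e-7, default value 0.0
  // (the same thresholds the example reads from its input file).
  m.SetSparseThresholds(1.e-6, 5.e-7, 0.0);

  // One pool entry per sparse id 0, 1, ..., sparse_size - 1.
  std::vector<int> sparse_idxs(sparse_size);
  std::iota(sparse_idxs.begin(), sparse_idxs.end(), 0);
  pkg->AddSparsePool<phi>(m, sparse_idxs);

  return pkg;
}

With a registration like this, blocks can drop storage for sparse ids whose values stay below the deallocation threshold, which is what the SparseDealloc task added to the driver's task list below relies on.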
diff --git a/example/fine_advection/advection_driver.cpp b/example/fine_advection/advection_driver.cpp index c02b4cd555ff..ca55c04f8278 100644 --- a/example/fine_advection/advection_driver.cpp +++ b/example/fine_advection/advection_driver.cpp @@ -21,6 +21,7 @@ #include "amr_criteria/refinement_package.hpp" #include "bvals/comms/bvals_in_one.hpp" #include "interface/metadata.hpp" +#include "interface/state_descriptor.hpp" #include "interface/update.hpp" #include "mesh/meshblock_pack.hpp" #include "parthenon/driver.hpp" @@ -72,6 +73,12 @@ TaskCollection AdvectionDriver::MakeTaskCollection(BlockList_t &blocks, const in TaskCollection tc; TaskID none(0); + std::shared_ptr pkg = + pmesh->packages.Get("advection_package"); + const auto do_regular_advection = pkg->Param("do_regular_advection"); + const auto do_fine_advection = pkg->Param("do_fine_advection"); + const auto do_CT_advection = pkg->Param("do_CT_advection"); + // Build MeshBlockData containers that will be included in MeshData containers. It is // gross that this has to be done by hand. const auto &stage_name = integrator->stage_name; @@ -118,35 +125,49 @@ TaskCollection AdvectionDriver::MakeTaskCollection(BlockList_t &blocks, const in auto flx = none; auto flx_fine = none; for (auto face : faces) { - flx = flx | tl.AddTask(none, advection_package::CalculateFluxes, desc, - face, parthenon::CellLevel::same, mc0.get()); - flx_fine = flx_fine | - tl.AddTask(none, advection_package::CalculateFluxes, - desc_fine, face, parthenon::CellLevel::fine, mc0.get()); + if (do_regular_advection) { + flx = flx | tl.AddTask(none, advection_package::CalculateFluxes, + desc, face, parthenon::CellLevel::same, mc0.get()); + } + if (do_fine_advection) { + flx_fine = flx_fine | + tl.AddTask(none, advection_package::CalculateFluxes, + desc_fine, face, parthenon::CellLevel::fine, mc0.get()); + } } auto vf_dep = none; - for (auto edge : std::vector{TE::E1, TE::E2, TE::E3}) { - vf_dep = tl.AddTask(vf_dep, advection_package::CalculateVectorFluxes, edge, - parthenon::CellLevel::same, 1.0, mc0.get()); - vf_dep = tl.AddTask(vf_dep, advection_package::CalculateVectorFluxes, edge, - parthenon::CellLevel::same, -1.0, mc0.get()); + if (do_CT_advection) { + for (auto edge : std::vector{TE::E1, TE::E2, TE::E3}) { + vf_dep = tl.AddTask(vf_dep, advection_package::CalculateVectorFluxes, edge, + parthenon::CellLevel::same, 1.0, mc0.get()); + vf_dep = tl.AddTask(vf_dep, advection_package::CalculateVectorFluxes, edge, + parthenon::CellLevel::same, -1.0, mc0.get()); + } } auto set_flx = parthenon::AddFluxCorrectionTasks( start_flxcor | flx | flx_fine | vf_dep, tl, mc0, pmesh->multilevel); - auto update = - AddUpdateTasks(set_flx, tl, parthenon::CellLevel::same, TT::Cell, beta, dt, desc, - mbase.get(), mc0.get(), mdudt.get(), mc1.get()); + auto update = set_flx; + if (do_regular_advection) { + update = AddUpdateTasks(set_flx, tl, parthenon::CellLevel::same, TT::Cell, beta, dt, + desc, mbase.get(), mc0.get(), mdudt.get(), mc1.get()); + } - auto update_fine = - AddUpdateTasks(set_flx, tl, parthenon::CellLevel::fine, TT::Cell, beta, dt, - desc_fine, mbase.get(), mc0.get(), mdudt.get(), mc1.get()); + auto update_fine = set_flx; + if (do_fine_advection) { + update_fine = + AddUpdateTasks(set_flx, tl, parthenon::CellLevel::fine, TT::Cell, beta, dt, + desc_fine, mbase.get(), mc0.get(), mdudt.get(), mc1.get()); + } - auto update_vec = - AddUpdateTasks(set_flx, tl, parthenon::CellLevel::same, TT::Face, beta, dt, - desc_vec, mbase.get(), mc0.get(), mdudt.get(), mc1.get()); + auto 
update_vec = set_flx; + if (do_CT_advection) { + update_vec = + AddUpdateTasks(set_flx, tl, parthenon::CellLevel::same, TT::Face, beta, dt, + desc_vec, mbase.get(), mc0.get(), mdudt.get(), mc1.get()); + } auto boundaries = parthenon::AddBoundaryExchangeTasks( update | update_vec | update_fine | start_send, tl, mc1, pmesh->multilevel); @@ -155,7 +176,8 @@ TaskCollection AdvectionDriver::MakeTaskCollection(BlockList_t &blocks, const in tl.AddTask(boundaries, parthenon::Update::FillDerived>, mc1.get()); if (stage == integrator->nstages) { - auto new_dt = tl.AddTask(fill_derived, EstimateTimestep>, mc1.get()); + auto dealloc = tl.AddTask(fill_derived, SparseDealloc, mc1.get()); + auto new_dt = tl.AddTask(dealloc, EstimateTimestep>, mc1.get()); if (pmesh->adaptive) { auto tag_refine = tl.AddTask(new_dt, parthenon::Refinement::Tag>, mc1.get()); diff --git a/example/fine_advection/advection_package.cpp b/example/fine_advection/advection_package.cpp index 8ca6c6458b20..428d91b9245f 100644 --- a/example/fine_advection/advection_package.cpp +++ b/example/fine_advection/advection_package.cpp @@ -65,34 +65,58 @@ std::shared_ptr Initialize(ParameterInput *pin) { } pkg->AddParam<>("profile", profile_str); - pkg->AddField( - Metadata({Metadata::Cell, Metadata::Fine, Metadata::Independent, - Metadata::WithFluxes, Metadata::FillGhost})); - - pkg->AddField(Metadata({Metadata::Cell, Metadata::Independent, - Metadata::WithFluxes, Metadata::FillGhost})); - pkg->AddField( - Metadata({Metadata::Cell, Metadata::Derived, Metadata::OneCopy})); - - Metadata m( - {Metadata::Face, Metadata::Independent, Metadata::WithFluxes, Metadata::FillGhost}); - m.RegisterRefinementOps(); - pkg->AddField(m); - pkg->AddField(m); - pkg->AddField(Metadata( - {Metadata::Cell, Metadata::Derived, Metadata::OneCopy}, std::vector{4})); - pkg->AddField(Metadata( - {Metadata::Face, Metadata::Derived, Metadata::OneCopy}, std::vector{2})); - pkg->AddField(Metadata( - {Metadata::Cell, Metadata::Derived, Metadata::OneCopy}, std::vector{3})); - pkg->AddField(Metadata( - {Metadata::Cell, Metadata::Derived, Metadata::OneCopy}, std::vector{3})); - pkg->AddField( - Metadata({Metadata::Cell, Metadata::Derived, Metadata::OneCopy})); - pkg->AddField( - Metadata({Metadata::Cell, Metadata::Derived, Metadata::OneCopy})); + bool do_regular_advection = + pin->GetOrAddBoolean("Advection", "do_regular_advection", true); + pkg->AddParam<>("do_regular_advection", do_regular_advection); + if (do_regular_advection) { + int shape_size = pin->GetOrAddInteger("Advection", "shape_size", 1); + int sparse_size = pin->GetOrAddInteger("Advection", "sparse_size", 1); + pkg->AddParam<>("sparse_size", sparse_size); + Real alloc_threshold = pin->GetOrAddReal("Advection", "alloc_threshold", 1.e-6); + Real dealloc_threshold = pin->GetOrAddReal("Advection", "dealloc_threshold", 5.e-7); + Metadata m({Metadata::Cell, Metadata::Independent, Metadata::WithFluxes, + Metadata::FillGhost, Metadata::Sparse}, + std::vector{shape_size}); + m.SetSparseThresholds(alloc_threshold, dealloc_threshold, 0.0); + std::vector sparse_idxs(sparse_size); + std::iota(sparse_idxs.begin(), sparse_idxs.end(), 0); + pkg->AddSparsePool(m, sparse_idxs); + } + + bool do_fine_advection = pin->GetOrAddBoolean("Advection", "do_fine_advection", true); + pkg->AddParam<>("do_fine_advection", do_fine_advection); + if (do_fine_advection) { + pkg->AddField( + Metadata({Metadata::Cell, Metadata::Fine, Metadata::Independent, + Metadata::WithFluxes, Metadata::FillGhost})); + + pkg->AddField( + Metadata({Metadata::Cell, 
Metadata::Derived, Metadata::OneCopy})); + } + + bool do_CT_advection = pin->GetOrAddBoolean("Advection", "do_CT_advection", true); + pkg->AddParam<>("do_CT_advection", do_CT_advection); + if (do_CT_advection) { + auto m = Metadata({Metadata::Face, Metadata::Independent, Metadata::WithFluxes, + Metadata::FillGhost}); + m.RegisterRefinementOps(); + pkg->AddField(m); + pkg->AddField(m); + pkg->AddField(Metadata( + {Metadata::Cell, Metadata::Derived, Metadata::OneCopy}, std::vector{4})); + pkg->AddField(Metadata( + {Metadata::Face, Metadata::Derived, Metadata::OneCopy}, std::vector{2})); + pkg->AddField(Metadata( + {Metadata::Cell, Metadata::Derived, Metadata::OneCopy}, std::vector{3})); + pkg->AddField(Metadata( + {Metadata::Cell, Metadata::Derived, Metadata::OneCopy}, std::vector{3})); + pkg->AddField( + Metadata({Metadata::Cell, Metadata::Derived, Metadata::OneCopy})); + pkg->AddField( + Metadata({Metadata::Cell, Metadata::Derived, Metadata::OneCopy})); + } pkg->CheckRefinementBlock = CheckRefinement; pkg->EstimateTimestepMesh = EstimateTimestep; @@ -101,35 +125,42 @@ std::shared_ptr Initialize(ParameterInput *pin) { } AmrTag CheckRefinement(MeshBlockData *rc) { - // refine on advected, for example. could also be a derived quantity - static auto desc = parthenon::MakePackDescriptor(rc); - auto pack = desc.GetPack(rc); + std::shared_ptr pkg = + rc->GetMeshPointer()->packages.Get("advection_package"); + auto do_regular_advection = pkg->Param("do_regular_advection"); + if (do_regular_advection) { + // refine on advected, for example. could also be a derived quantity + static auto desc = parthenon::MakePackDescriptor(rc); + auto pack = desc.GetPack(rc); - auto pmb = rc->GetBlockPointer(); - IndexRange ib = pmb->cellbounds.GetBoundsI(IndexDomain::entire); - IndexRange jb = pmb->cellbounds.GetBoundsJ(IndexDomain::entire); - IndexRange kb = pmb->cellbounds.GetBoundsK(IndexDomain::entire); + auto pmb = rc->GetBlockPointer(); + IndexRange ib = pmb->cellbounds.GetBoundsI(IndexDomain::entire); + IndexRange jb = pmb->cellbounds.GetBoundsJ(IndexDomain::entire); + IndexRange kb = pmb->cellbounds.GetBoundsK(IndexDomain::entire); - typename Kokkos::MinMax::value_type minmax; - parthenon::par_reduce( - parthenon::loop_pattern_mdrange_tag, PARTHENON_AUTO_LABEL, DevExecSpace(), - pack.GetLowerBoundHost(0), pack.GetUpperBoundHost(0), kb.s, kb.e, jb.s, jb.e, ib.s, - ib.e, - KOKKOS_LAMBDA(const int n, const int k, const int j, const int i, - typename Kokkos::MinMax::value_type &lminmax) { - lminmax.min_val = - (pack(n, k, j, i) < lminmax.min_val ? pack(n, k, j, i) : lminmax.min_val); - lminmax.max_val = - (pack(n, k, j, i) > lminmax.max_val ? pack(n, k, j, i) : lminmax.max_val); - }, - Kokkos::MinMax(minmax)); + typename Kokkos::MinMax::value_type minmax; + parthenon::par_reduce( + parthenon::loop_pattern_mdrange_tag, PARTHENON_AUTO_LABEL, DevExecSpace(), 0, + pack.GetNBlocks() - 1, // Runs from [0, 0] since pack built from MeshBlockData + pack.GetLowerBoundHost(0), pack.GetUpperBoundHost(0), kb.s, kb.e, jb.s, jb.e, + ib.s, ib.e, + KOKKOS_LAMBDA(const int b, const int n, const int k, const int j, const int i, + typename Kokkos::MinMax::value_type &lminmax) { + lminmax.min_val = (pack(b, n, k, j, i) < lminmax.min_val ? pack(b, n, k, j, i) + : lminmax.min_val); + lminmax.max_val = (pack(b, n, k, j, i) > lminmax.max_val ? 
pack(b, n, k, j, i) + : lminmax.max_val); + }, + Kokkos::MinMax(minmax)); - auto pkg = pmb->packages.Get("advection_package"); - const auto &refine_tol = pkg->Param("refine_tol"); - const auto &derefine_tol = pkg->Param("derefine_tol"); + auto pkg = pmb->packages.Get("advection_package"); + const auto &refine_tol = pkg->Param("refine_tol"); + const auto &derefine_tol = pkg->Param("derefine_tol"); - if (minmax.max_val > refine_tol && minmax.min_val < derefine_tol) return AmrTag::refine; - if (minmax.max_val < derefine_tol) return AmrTag::derefine; + if (minmax.max_val > refine_tol && minmax.min_val < derefine_tol) + return AmrTag::refine; + if (minmax.max_val < derefine_tol) return AmrTag::derefine; + } return AmrTag::same; } @@ -175,74 +206,84 @@ TaskStatus FillDerived(MeshData *md) { md); auto pack = desc.GetPack(md); + std::shared_ptr pkg = + md->GetMeshPointer()->packages.Get("advection_package"); + IndexRange ib = md->GetBoundsI(IndexDomain::interior); IndexRange jb = md->GetBoundsJ(IndexDomain::interior); IndexRange kb = md->GetBoundsK(IndexDomain::interior); const int ndim = md->GetMeshPointer()->ndim; const int nghost = parthenon::Globals::nghost; - parthenon::par_for( - PARTHENON_AUTO_LABEL, 0, pack.GetNBlocks() - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, - KOKKOS_LAMBDA(const int b, const int k, const int j, const int i) { - const int kf = (ndim > 2) ? (k - nghost) * 2 + nghost : k; - const int jf = (ndim > 1) ? (j - nghost) * 2 + nghost : j; - const int fi = (ndim > 0) ? (i - nghost) * 2 + nghost : i; - pack(b, Conserved::phi_fine_restricted(), k, j, i) = 0.0; - Real ntot = 0.0; - for (int ioff = 0; ioff <= (ndim > 0); ++ioff) - for (int joff = 0; joff <= (ndim > 1); ++joff) - for (int koff = 0; koff <= (ndim > 2); ++koff) { - ntot += 1.0; - pack(b, Conserved::phi_fine_restricted(), k, j, i) += - pack(b, Conserved::phi_fine(), kf + koff, jf + joff, fi + ioff); - } - pack(b, Conserved::phi_fine_restricted(), k, j, i) /= ntot; - }); - - using TE = parthenon::TopologicalElement; - parthenon::par_for( - PARTHENON_AUTO_LABEL, 0, pack.GetNBlocks() - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, - KOKKOS_LAMBDA(const int b, const int k, const int j, const int i) { - pack(b, Conserved::C_cc(0), k, j, i) = - 0.5 * (pack(b, TE::F1, Conserved::C(), k, j, i) + - pack(b, TE::F1, Conserved::C(), k, j, i + (ndim > 0))); - pack(b, Conserved::C_cc(1), k, j, i) = - 0.5 * (pack(b, TE::F2, Conserved::C(), k, j, i) + - pack(b, TE::F2, Conserved::C(), k, j + (ndim > 1), i)); - pack(b, Conserved::C_cc(2), k, j, i) = - 0.5 * (pack(b, TE::F3, Conserved::C(), k, j, i) + - pack(b, TE::F3, Conserved::C(), k + (ndim > 2), j, i)); - auto &coords = pack.GetCoordinates(b); - pack(b, Conserved::divC(), k, j, i) = - (pack(b, TE::F1, Conserved::C(), k, j, i + (ndim > 0)) - - pack(b, TE::F1, Conserved::C(), k, j, i)) / - coords.Dxc(k, j, i) + - (pack(b, TE::F2, Conserved::C(), k, j + (ndim > 1), i) - - pack(b, TE::F2, Conserved::C(), k, j, i)) / - coords.Dxc(k, j, i) + - (pack(b, TE::F3, Conserved::C(), k + (ndim > 2), j, i) - - pack(b, TE::F3, Conserved::C(), k, j, i)) / - coords.Dxc(k, j, i); - - pack(b, Conserved::D_cc(0), k, j, i) = - 0.5 * (pack(b, TE::F1, Conserved::D(), k, j, i) + - pack(b, TE::F1, Conserved::D(), k, j, i + (ndim > 0))); - pack(b, Conserved::D_cc(1), k, j, i) = - 0.5 * (pack(b, TE::F2, Conserved::D(), k, j, i) + - pack(b, TE::F2, Conserved::D(), k, j + (ndim > 1), i)); - pack(b, Conserved::D_cc(2), k, j, i) = - 0.5 * (pack(b, TE::F3, Conserved::D(), k, j, i) + - pack(b, TE::F3, Conserved::D(), k + 
(ndim > 2), j, i)); - pack(b, Conserved::divD(), k, j, i) = - (pack(b, TE::F1, Conserved::D(), k, j, i + (ndim > 0)) - - pack(b, TE::F1, Conserved::D(), k, j, i)) / - coords.Dxc(k, j, i) + - (pack(b, TE::F2, Conserved::D(), k, j + (ndim > 1), i) - - pack(b, TE::F2, Conserved::D(), k, j, i)) / - coords.Dxc(k, j, i) + - (pack(b, TE::F3, Conserved::D(), k + (ndim > 2), j, i) - - pack(b, TE::F3, Conserved::D(), k, j, i)) / - coords.Dxc(k, j, i); - }); + + auto do_fine_advection = pkg->Param("do_fine_advection"); + if (do_fine_advection) { + parthenon::par_for( + PARTHENON_AUTO_LABEL, 0, pack.GetNBlocks() - 1, kb.s, kb.e, jb.s, jb.e, ib.s, + ib.e, KOKKOS_LAMBDA(const int b, const int k, const int j, const int i) { + const int kf = (ndim > 2) ? (k - nghost) * 2 + nghost : k; + const int jf = (ndim > 1) ? (j - nghost) * 2 + nghost : j; + const int fi = (ndim > 0) ? (i - nghost) * 2 + nghost : i; + pack(b, Conserved::phi_fine_restricted(), k, j, i) = 0.0; + Real ntot = 0.0; + for (int koff = 0; koff <= (ndim > 2); ++koff) + for (int joff = 0; joff <= (ndim > 1); ++joff) + for (int ioff = 0; ioff <= (ndim > 0); ++ioff) { + ntot += 1.0; + pack(b, Conserved::phi_fine_restricted(), k, j, i) += + pack(b, Conserved::phi_fine(), kf + koff, jf + joff, fi + ioff); + } + pack(b, Conserved::phi_fine_restricted(), k, j, i) /= ntot; + }); + } + + auto do_CT_advection = pkg->Param("do_CT_advection"); + if (do_CT_advection) { + using TE = parthenon::TopologicalElement; + parthenon::par_for( + PARTHENON_AUTO_LABEL, 0, pack.GetNBlocks() - 1, kb.s, kb.e, jb.s, jb.e, ib.s, + ib.e, KOKKOS_LAMBDA(const int b, const int k, const int j, const int i) { + pack(b, Conserved::C_cc(0), k, j, i) = + 0.5 * (pack(b, TE::F1, Conserved::C(), k, j, i) + + pack(b, TE::F1, Conserved::C(), k, j, i + (ndim > 0))); + pack(b, Conserved::C_cc(1), k, j, i) = + 0.5 * (pack(b, TE::F2, Conserved::C(), k, j, i) + + pack(b, TE::F2, Conserved::C(), k, j + (ndim > 1), i)); + pack(b, Conserved::C_cc(2), k, j, i) = + 0.5 * (pack(b, TE::F3, Conserved::C(), k, j, i) + + pack(b, TE::F3, Conserved::C(), k + (ndim > 2), j, i)); + auto &coords = pack.GetCoordinates(b); + pack(b, Conserved::divC(), k, j, i) = + (pack(b, TE::F1, Conserved::C(), k, j, i + (ndim > 0)) - + pack(b, TE::F1, Conserved::C(), k, j, i)) / + coords.Dxc(k, j, i) + + (pack(b, TE::F2, Conserved::C(), k, j + (ndim > 1), i) - + pack(b, TE::F2, Conserved::C(), k, j, i)) / + coords.Dxc(k, j, i) + + (pack(b, TE::F3, Conserved::C(), k + (ndim > 2), j, i) - + pack(b, TE::F3, Conserved::C(), k, j, i)) / + coords.Dxc(k, j, i); + + pack(b, Conserved::D_cc(0), k, j, i) = + 0.5 * (pack(b, TE::F1, Conserved::D(), k, j, i) + + pack(b, TE::F1, Conserved::D(), k, j, i + (ndim > 0))); + pack(b, Conserved::D_cc(1), k, j, i) = + 0.5 * (pack(b, TE::F2, Conserved::D(), k, j, i) + + pack(b, TE::F2, Conserved::D(), k, j + (ndim > 1), i)); + pack(b, Conserved::D_cc(2), k, j, i) = + 0.5 * (pack(b, TE::F3, Conserved::D(), k, j, i) + + pack(b, TE::F3, Conserved::D(), k + (ndim > 2), j, i)); + pack(b, Conserved::divD(), k, j, i) = + (pack(b, TE::F1, Conserved::D(), k, j, i + (ndim > 0)) - + pack(b, TE::F1, Conserved::D(), k, j, i)) / + coords.Dxc(k, j, i) + + (pack(b, TE::F2, Conserved::D(), k, j + (ndim > 1), i) - + pack(b, TE::F2, Conserved::D(), k, j, i)) / + coords.Dxc(k, j, i) + + (pack(b, TE::F3, Conserved::D(), k + (ndim > 2), j, i) - + pack(b, TE::F3, Conserved::D(), k, j, i)) / + coords.Dxc(k, j, i); + }); + } return TaskStatus::complete; } } // namespace advection_package diff --git 
a/example/fine_advection/advection_package.hpp b/example/fine_advection/advection_package.hpp index 87e0c0b2edab..12b6b9b65aad 100644 --- a/example/fine_advection/advection_package.hpp +++ b/example/fine_advection/advection_package.hpp @@ -19,6 +19,7 @@ #include #include +#include #include #define VARIABLE(ns, varname) \ @@ -79,13 +80,20 @@ TaskStatus CalculateFluxes(pack_desc_t &desc, parthenon::TopologicalElement FACE IndexRange ib = md->GetBoundsI(cl, IndexDomain::interior, FACE); IndexRange jb = md->GetBoundsJ(cl, IndexDomain::interior, FACE); IndexRange kb = md->GetBoundsK(cl, IndexDomain::interior, FACE); - parthenon::par_for( - PARTHENON_AUTO_LABEL, 0, pack.GetNBlocks() - 1, pack.GetLowerBoundHost(0), - pack.GetUpperBoundHost(0), // Warning: only works for dense variables - kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, - KOKKOS_LAMBDA(const int b, const int l, const int k, const int j, const int i) { - // Calculate the flux using upwind donor cell reconstruction - pack.flux(b, FACE, l, k, j, i) = v * pack(b, l, k + koff, j + joff, i + ioff); + constexpr int unused_scratch_size = 0; + constexpr int unused_scratch_level = 1; + parthenon::par_for_outer( + PARTHENON_AUTO_LABEL, unused_scratch_size, unused_scratch_level, 0, + pack.GetNBlocks() - 1, kb.s, kb.e, + KOKKOS_LAMBDA(parthenon::team_mbr_t member, const int b, const int k) { + parthenon::Indexer2D idxer({jb.s, jb.e}, {ib.s, ib.e}); + for (int l = pack.GetLowerBound(b); l <= pack.GetUpperBound(b); ++l) { + parthenon::par_for_inner(member, 0, idxer.size() - 1, [&](const int idx) { + const auto [j, i] = idxer(idx); + // Calculate the flux using upwind donor cell reconstruction + pack.flux(b, FACE, l, k, j, i) = v * pack(b, l, k + koff, j + joff, i + ioff); + }); + } }); return TaskStatus::complete; } @@ -116,23 +124,33 @@ TaskStatus CalculateVectorFluxes(parthenon::TopologicalElement edge, int koff = edge == TE::E3 ? ndim > 2 : 0; int joff = edge == TE::E2 ? ndim > 1 : 0; int ioff = edge == TE::E1 ? 
ndim > 0 : 0; - parthenon::par_for( - PARTHENON_AUTO_LABEL, 0, pack.GetNBlocks() - 1, kb.s - (ndim > 2), - kb.e + (ndim > 2), jb.s - (ndim > 1), jb.e + (ndim > 1), ib.s - 1, ib.e + 1, - KOKKOS_LAMBDA(const int b, const int k, const int j, const int i) { - // Piecewise linear in the orthogonal directions, could do something better here - pack(b, TE::CC, recon(0), k, j, i) = - 0.5 * (pack(b, fe, flux_var(), k, j, i) + - pack(b, fe, flux_var(), k + koff, j + joff, i + ioff)); - pack(b, TE::CC, recon(1), k, j, i) = - 0.5 * (pack(b, fe, flux_var(), k, j, i) + - pack(b, fe, flux_var(), k + koff, j + joff, i + ioff)); - pack(b, TE::CC, recon(2), k, j, i) = - 0.5 * (pack(b, fe, flux_var(), k, j, i) + - pack(b, fe, flux_var(), k + koff, j + joff, i + ioff)); - pack(b, TE::CC, recon(3), k, j, i) = - 0.5 * (pack(b, fe, flux_var(), k, j, i) + - pack(b, fe, flux_var(), k + koff, j + joff, i + ioff)); + constexpr int scratch_size = 0; + constexpr int scratch_level = 1; + parthenon::par_for_outer( + PARTHENON_AUTO_LABEL, scratch_size, scratch_level, 0, pack.GetNBlocks() - 1, + kb.s - (ndim > 2), kb.e + (ndim > 2), + KOKKOS_LAMBDA(parthenon::team_mbr_t member, const int b, const int k) { + parthenon::Indexer2D idxer({jb.s - (ndim > 1), jb.e + (ndim > 1)}, + {ib.s - (ndim > 0), ib.e + (ndim > 0)}); + if (pack.GetLowerBound(b, flux_var()) <= pack.GetUpperBound(b, flux_var())) { + parthenon::par_for_inner(member, 0, idxer.size() - 1, [&](const int idx) { + const auto [j, i] = idxer(idx); + // Piecewise linear in the orthogonal directions, could do something better + // here + pack(b, TE::CC, recon(0), k, j, i) = + 0.5 * (pack(b, fe, flux_var(), k, j, i) + + pack(b, fe, flux_var(), k + koff, j + joff, i + ioff)); + pack(b, TE::CC, recon(1), k, j, i) = + 0.5 * (pack(b, fe, flux_var(), k, j, i) + + pack(b, fe, flux_var(), k + koff, j + joff, i + ioff)); + pack(b, TE::CC, recon(2), k, j, i) = + 0.5 * (pack(b, fe, flux_var(), k, j, i) + + pack(b, fe, flux_var(), k + koff, j + joff, i + ioff)); + pack(b, TE::CC, recon(3), k, j, i) = + 0.5 * (pack(b, fe, flux_var(), k, j, i) + + pack(b, fe, flux_var(), k + koff, j + joff, i + ioff)); + }); + } }); // 2. Calculate the quartant averaged flux @@ -143,20 +161,26 @@ TaskStatus CalculateVectorFluxes(parthenon::TopologicalElement edge, ib = md->GetBoundsI(cl, IndexDomain::interior, edge); jb = md->GetBoundsJ(cl, IndexDomain::interior, edge); kb = md->GetBoundsK(cl, IndexDomain::interior, edge); - parthenon::par_for( - PARTHENON_AUTO_LABEL, 0, pack.GetNBlocks() - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, - KOKKOS_LAMBDA(const int b, const int k, const int j, const int i) { - pack.flux(b, edge, var(), k, j, i) = 0.0; - for (int dk = -koff; dk < 1; ++dk) - for (int dj = -joff; dj < 1; ++dj) - for (int di = -ioff; di < 1; ++di) { - // TODO(LFR): Pick out the correct component of the reconstructed flux, - // current version is not an issue for piecewise constant flux though. 
- pack.flux(b, edge, var(), k, j, i) += - pack(b, TE::CC, recon(0), k + dk, j + dj, i + di); - } - pack.flux(b, edge, var(), k, j, i) /= npoints; - pack.flux(b, edge, var(), k, j, i) *= fac; + parthenon::par_for_outer( + PARTHENON_AUTO_LABEL, scratch_size, scratch_level, 0, pack.GetNBlocks() - 1, kb.s, + kb.e, KOKKOS_LAMBDA(parthenon::team_mbr_t member, const int b, const int k) { + parthenon::Indexer2D idxer({jb.s, jb.e}, {ib.s, ib.e}); + if (pack.GetLowerBound(b, var()) <= pack.GetUpperBound(b, var())) { + parthenon::par_for_inner(member, 0, idxer.size() - 1, [&](const int idx) { + const auto [j, i] = idxer(idx); + pack.flux(b, edge, var(), k, j, i) = 0.0; + for (int dk = -koff; dk < 1; ++dk) + for (int dj = -joff; dj < 1; ++dj) + for (int di = -ioff; di < 1; ++di) { + // TODO(LFR): Pick out the correct component of the reconstructed flux, + // current version is not an issue for piecewise constant flux though. + pack.flux(b, edge, var(), k, j, i) += + pack(b, TE::CC, recon(0), k + dk, j + dj, i + di); + } + pack.flux(b, edge, var(), k, j, i) /= npoints; + pack.flux(b, edge, var(), k, j, i) *= fac; + }); + } }); // 3. Reconstruct the transverse components of the advected field at the edge @@ -167,13 +191,21 @@ TaskStatus CalculateVectorFluxes(parthenon::TopologicalElement edge, ib = md->GetBoundsI(cl, IndexDomain::interior, f); jb = md->GetBoundsJ(cl, IndexDomain::interior, f); kb = md->GetBoundsK(cl, IndexDomain::interior, f); - parthenon::par_for( - PARTHENON_AUTO_LABEL, 0, pack.GetNBlocks() - 1, kb.s - (ndim > 2), - kb.e + (ndim > 2), jb.s - (ndim > 1), jb.e + (ndim > 1), ib.s - 1, ib.e + 1, - KOKKOS_LAMBDA(const int b, const int k, const int j, const int i) { - // Piecewise linear in the orthogonal directions, could do something better here - pack(b, f, recon_f(0), k, j, i) = pack(b, f, var(), k, j, i); - pack(b, f, recon_f(1), k, j, i) = pack(b, f, var(), k, j, i); + parthenon::par_for_outer( + PARTHENON_AUTO_LABEL, scratch_size, scratch_level, 0, pack.GetNBlocks() - 1, + kb.s - (ndim > 2), kb.e + (ndim > 2), + KOKKOS_LAMBDA(parthenon::team_mbr_t member, const int b, const int k) { + parthenon::Indexer2D idxer({jb.s - (ndim > 1), jb.e + (ndim > 1)}, + {ib.s - (ndim > 0), ib.e + (ndim > 0)}); + if (pack.GetLowerBound(b, var()) <= pack.GetUpperBound(b, var())) { + parthenon::par_for_inner(member, 0, idxer.size() - 1, [&](const int idx) { + const auto [j, i] = idxer(idx); + // Piecewise linear in the orthogonal directions, could do something better + // here + pack(b, f, recon_f(0), k, j, i) = pack(b, f, var(), k, j, i); + pack(b, f, recon_f(1), k, j, i) = pack(b, f, var(), k, j, i); + }); + } }); } @@ -189,15 +221,21 @@ TaskStatus CalculateVectorFluxes(parthenon::TopologicalElement edge, d1[static_cast(f1) % 3] = 0; d2[static_cast(edge) % 3] = 0; d2[static_cast(f2) % 3] = 0; - parthenon::par_for( - PARTHENON_AUTO_LABEL, 0, pack.GetNBlocks() - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, - KOKKOS_LAMBDA(const int b, const int k, const int j, const int i) { - pack.flux(b, edge, var(), k, j, i) += - 0.5 * (pack(b, f1, recon_f(0), k, j, i) - - pack(b, f1, recon_f(1), k - d1[2], j - d1[1], i - d1[0])); - pack.flux(b, edge, var(), k, j, i) -= - 0.5 * (pack(b, f2, recon_f(0), k, j, i) - - pack(b, f2, recon_f(1), k - d2[2], j - d2[1], i - d2[0])); + parthenon::par_for_outer( + PARTHENON_AUTO_LABEL, scratch_size, scratch_level, 0, pack.GetNBlocks() - 1, kb.s, + kb.e, KOKKOS_LAMBDA(parthenon::team_mbr_t member, const int b, const int k) { + parthenon::Indexer2D idxer({jb.s, jb.e}, {ib.s, ib.e}); + if 
(pack.GetLowerBound(b, var()) <= pack.GetUpperBound(b, var())) { + parthenon::par_for_inner(member, 0, idxer.size() - 1, [&](const int idx) { + const auto [j, i] = idxer(idx); + pack.flux(b, edge, var(), k, j, i) += + 0.5 * (pack(b, f1, recon_f(0), k, j, i) - + pack(b, f1, recon_f(1), k - d1[2], j - d1[1], i - d1[0])); + pack.flux(b, edge, var(), k, j, i) -= + 0.5 * (pack(b, f2, recon_f(0), k, j, i) - + pack(b, f2, recon_f(1), k - d2[2], j - d2[1], i - d2[0])); + }); + } }); // Add in the diffusive component diff --git a/example/fine_advection/parthenon_app_inputs.cpp b/example/fine_advection/parthenon_app_inputs.cpp index ab970175e9cf..8025970f35ca 100644 --- a/example/fine_advection/parthenon_app_inputs.cpp +++ b/example/fine_advection/parthenon_app_inputs.cpp @@ -1,5 +1,5 @@ //======================================================================================== -// (C) (or copyright) 2020-2021. Triad National Security, LLC. All rights reserved. +// (C) (or copyright) 2020-2024. Triad National Security, LLC. All rights reserved. // // This program was produced under U.S. Government contract 89233218CNA000001 for Los // Alamos National Laboratory (LANL), which is operated by Triad National Security, LLC @@ -50,6 +50,9 @@ void ProblemGenerator(MeshBlock *pmb, ParameterInput *pin) { auto pkg = pmb->packages.Get("advection_package"); const auto &profile = pkg->Param("profile"); + auto do_regular_advection = pkg->Param("do_regular_advection"); + auto do_fine_advection = pkg->Param("do_fine_advection"); + auto do_CT_advection = pkg->Param("do_CT_advection"); auto cellbounds = pmb->cellbounds; IndexRange ib = cellbounds.GetBoundsI(IndexDomain::interior); @@ -59,6 +62,11 @@ void ProblemGenerator(MeshBlock *pmb, ParameterInput *pin) { auto coords = pmb->coords; using namespace advection_package::Conserved; + if (do_regular_advection) { + const int sparse_size = pkg->Param("sparse_size"); + for (int s = 0; s < sparse_size; ++s) + pmb->AllocSparseID(phi::name(), s); + } static auto desc = parthenon::MakePackDescriptor(data.get()); auto pack = desc.GetPack(data.get()); @@ -68,78 +76,105 @@ void ProblemGenerator(MeshBlock *pmb, ParameterInput *pin) { if (profile == "hard_sphere") profile_type = 2; if (profile == "block") profile_type = 3; Real amp = 1.0; - const int b = 0; const int ndim = pmb->pmy_mesh->ndim; const int nghost = parthenon::Globals::nghost; - parthenon::par_for( - PARTHENON_AUTO_LABEL, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, - KOKKOS_LAMBDA(const int k, const int j, const int i) { - const int kf = ndim > 2 ? (k - nghost) * 2 + nghost : k; - const int jf = ndim > 1 ? (j - nghost) * 2 + nghost : j; - const int fi = ndim > 0 ? (i - nghost) * 2 + nghost : i; - if (profile_type == 0) { - PARTHENON_FAIL("Profile type wave not implemented."); - } else if (profile_type == 1) { - Real rsq = coords.Xc<1>(i) * coords.Xc<1>(i) + - coords.Xc<2>(j) * coords.Xc<2>(j) + - coords.Xc<3>(k) * coords.Xc<3>(k); - pack(b, phi(), k, j, i) = 1. + amp * exp(-100.0 * rsq); - for (int ioff = 0; ioff <= (ndim > 0); ++ioff) - for (int joff = 0; joff <= (ndim > 1); ++joff) - for (int koff = 0; koff <= (ndim > 2); ++koff) { - pack(b, phi_fine(), kf + koff, jf + joff, fi + ioff) = - 1. + amp * exp(-100.0 * rsq); - } - } else if (profile_type == 2) { - Real rsq = coords.Xc<1>(i) * coords.Xc<1>(i) + - coords.Xc<2>(j) * coords.Xc<2>(j) + - coords.Xc<3>(k) * coords.Xc<3>(k); - pack(b, phi(), k, j, i) = (rsq < 0.15 * 0.15 ? 
1.0 : 0.0); - for (int ioff = 0; ioff <= (ndim > 0); ++ioff) - for (int joff = 0; joff <= (ndim > 1); ++joff) - for (int koff = 0; koff <= (ndim > 2); ++koff) { - pack(b, phi_fine(), kf + koff, jf + joff, fi + ioff) = - (rsq < 0.15 * 0.15 ? 1.0 : 0.0); - } - } else { - pack(b, phi(), k, j, i) = 0.0; - for (int ioff = 0; ioff <= (ndim > 0); ++ioff) - for (int joff = 0; joff <= (ndim > 1); ++joff) - for (int koff = 0; koff <= (ndim > 2); ++koff) { - pack(b, phi_fine(), kf + koff, jf + joff, fi + ioff) = 0.0; - } - } - }); - - ib = cellbounds.GetBoundsI(IndexDomain::interior, TE::F2); - jb = cellbounds.GetBoundsJ(IndexDomain::interior, TE::F2); - kb = cellbounds.GetBoundsK(IndexDomain::interior, TE::F2); - const Real x0 = 0.2; - parthenon::par_for( - PARTHENON_AUTO_LABEL, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, - KOKKOS_LAMBDA(const int k, const int j, const int i) { - auto &coords = pack.GetCoordinates(b); - Real xlo = coords.X(k, j, i); - Real xhi = coords.X(k, j, i + 1); - Real Alo = std::abs(xlo) < x0 ? sign(xlo, xhi) * (x0 - std::abs(xlo)) : 0.0; - Real Ahi = std::abs(xhi) < x0 ? sign(xhi, xlo) * (x0 - std::abs(xhi)) : 0.0; - pack(b, TE::F2, C(), k, j, i) = (Alo - Ahi) / (xhi - xlo); - }); - - ib = cellbounds.GetBoundsI(IndexDomain::interior, TE::F3); - jb = cellbounds.GetBoundsJ(IndexDomain::interior, TE::F3); - kb = cellbounds.GetBoundsK(IndexDomain::interior, TE::F3); - parthenon::par_for( - PARTHENON_AUTO_LABEL, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, - KOKKOS_LAMBDA(const int k, const int j, const int i) { - auto &coords = pack.GetCoordinates(b); - Real xlo = coords.X(k, j, i); - Real xhi = coords.X(k, j, i + 1); - Real Alo = std::abs(xlo) < x0 ? sign(xlo, xhi) * (x0 - std::abs(xlo)) : 0.0; - Real Ahi = std::abs(xhi) < x0 ? sign(xhi, xlo) * (x0 - std::abs(xhi)) : 0.0; - pack(b, TE::F3, D(), k, j, i) = (Alo - Ahi) / (xhi - xlo); - }); + + if (do_regular_advection) { + parthenon::par_for( + PARTHENON_AUTO_LABEL, pack.GetLowerBoundHost(b, phi()), + pack.GetUpperBoundHost(b, phi()), kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, + KOKKOS_LAMBDA(const int l, const int k, const int j, const int i) { + const int kf = ndim > 2 ? (k - nghost) * 2 + nghost : k; + const int jf = ndim > 1 ? (j - nghost) * 2 + nghost : j; + const int fi = ndim > 0 ? (i - nghost) * 2 + nghost : i; + if (profile_type == 0) { + PARTHENON_FAIL("Profile type wave not implemented."); + } else if (profile_type == 1) { + Real rsq = coords.Xc<1>(i) * coords.Xc<1>(i) + + coords.Xc<2>(j) * coords.Xc<2>(j) + + coords.Xc<3>(k) * coords.Xc<3>(k); + pack(b, phi(l), k, j, i) = 1. + amp * exp(-100.0 * rsq); + } else if (profile_type == 2) { + Real rsq = coords.Xc<1>(i) * coords.Xc<1>(i) + + coords.Xc<2>(j) * coords.Xc<2>(j) + + coords.Xc<3>(k) * coords.Xc<3>(k); + pack(b, phi(l), k, j, i) = (rsq < 0.15 * 0.15 ? 1.0 : 0.0); + } else { + pack(b, phi(l), k, j, i) = 0.0; + } + }); + } + + if (do_fine_advection) { + parthenon::par_for( + PARTHENON_AUTO_LABEL, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, + KOKKOS_LAMBDA(const int k, const int j, const int i) { + const int kf = ndim > 2 ? (k - nghost) * 2 + nghost : k; + const int jf = ndim > 1 ? (j - nghost) * 2 + nghost : j; + const int fi = ndim > 0 ? 
(i - nghost) * 2 + nghost : i; + if (profile_type == 0) { + PARTHENON_FAIL("Profile type wave not implemented."); + } else if (profile_type == 1) { + Real rsq = coords.Xc<1>(i) * coords.Xc<1>(i) + + coords.Xc<2>(j) * coords.Xc<2>(j) + + coords.Xc<3>(k) * coords.Xc<3>(k); + for (int koff = 0; koff <= (ndim > 2); ++koff) + for (int joff = 0; joff <= (ndim > 1); ++joff) + for (int ioff = 0; ioff <= (ndim > 0); ++ioff) { + pack(b, phi_fine(), kf + koff, jf + joff, fi + ioff) = + 1. + amp * exp(-100.0 * rsq); + } + } else if (profile_type == 2) { + Real rsq = coords.Xc<1>(i) * coords.Xc<1>(i) + + coords.Xc<2>(j) * coords.Xc<2>(j) + + coords.Xc<3>(k) * coords.Xc<3>(k); + for (int koff = 0; koff <= (ndim > 2); ++koff) + for (int joff = 0; joff <= (ndim > 1); ++joff) + for (int ioff = 0; ioff <= (ndim > 0); ++ioff) { + pack(b, phi_fine(), kf + koff, jf + joff, fi + ioff) = + (rsq < 0.15 * 0.15 ? 1.0 : 0.0); + } + } else { + for (int koff = 0; koff <= (ndim > 2); ++koff) + for (int joff = 0; joff <= (ndim > 1); ++joff) + for (int ioff = 0; ioff <= (ndim > 0); ++ioff) { + pack(b, phi_fine(), kf + koff, jf + joff, fi + ioff) = 0.0; + } + } + }); + } + + if (do_CT_advection) { + ib = cellbounds.GetBoundsI(IndexDomain::interior, TE::F2); + jb = cellbounds.GetBoundsJ(IndexDomain::interior, TE::F2); + kb = cellbounds.GetBoundsK(IndexDomain::interior, TE::F2); + const Real x0 = 0.2; + parthenon::par_for( + PARTHENON_AUTO_LABEL, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, + KOKKOS_LAMBDA(const int k, const int j, const int i) { + auto &coords = pack.GetCoordinates(b); + Real xlo = coords.X(k, j, i); + Real xhi = coords.X(k, j, i + 1); + Real Alo = std::abs(xlo) < x0 ? sign(xlo, xhi) * (x0 - std::abs(xlo)) : 0.0; + Real Ahi = std::abs(xhi) < x0 ? sign(xhi, xlo) * (x0 - std::abs(xhi)) : 0.0; + pack(b, TE::F2, C(), k, j, i) = (Alo - Ahi) / (xhi - xlo); + }); + + ib = cellbounds.GetBoundsI(IndexDomain::interior, TE::F3); + jb = cellbounds.GetBoundsJ(IndexDomain::interior, TE::F3); + kb = cellbounds.GetBoundsK(IndexDomain::interior, TE::F3); + parthenon::par_for( + PARTHENON_AUTO_LABEL, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, + KOKKOS_LAMBDA(const int k, const int j, const int i) { + auto &coords = pack.GetCoordinates(b); + Real xlo = coords.X(k, j, i); + Real xhi = coords.X(k, j, i + 1); + Real Alo = std::abs(xlo) < x0 ? sign(xlo, xhi) * (x0 - std::abs(xlo)) : 0.0; + Real Ahi = std::abs(xhi) < x0 ? 
sign(xhi, xlo) * (x0 - std::abs(xhi)) : 0.0; + pack(b, TE::F3, D(), k, j, i) = (Alo - Ahi) / (xhi - xlo); + }); + } } Packages_t ProcessPackages(std::unique_ptr &pin) { diff --git a/example/fine_advection/parthinput.advection b/example/fine_advection/parthinput.advection index 31ee255c5053..b3bf7ca10d0c 100644 --- a/example/fine_advection/parthinput.advection +++ b/example/fine_advection/parthinput.advection @@ -57,6 +57,13 @@ profile = hard_sphere refine_tol = 0.3 # control the package specific refinement tagging function derefine_tol = 0.03 +# Include advection equation for regular resolution CC fields +do_regular_advection = true +# Include advection equation for fine resolution CC fields +do_fine_advection = true +# Include vector wave equation using CT +do_CT_advection = true + file_type = rst dt = 0.05 diff --git a/example/fine_advection/stokes.hpp b/example/fine_advection/stokes.hpp index 27b529def8a5..fdc3ec408402 100644 --- a/example/fine_advection/stokes.hpp +++ b/example/fine_advection/stokes.hpp @@ -19,6 +19,7 @@ #include #include +#include namespace advection_example { using namespace parthenon::driver::prelude; @@ -36,21 +37,27 @@ TaskStatus WeightedSumDataElement(parthenon::CellLevel cl, IndexRange jb = in1->GetBoundsJ(cl, IndexDomain::entire, te); IndexRange kb = in1->GetBoundsK(cl, IndexDomain::entire, te); - PARTHENON_REQUIRE(pack1.GetLowerBoundHost(0) == pack2.GetLowerBoundHost(0), - "Packs are different size."); - PARTHENON_REQUIRE(pack1.GetLowerBoundHost(0) == pack_out.GetLowerBoundHost(0), - "Packs are different size."); - PARTHENON_REQUIRE(pack1.GetUpperBoundHost(0) == pack2.GetUpperBoundHost(0), - "Packs are different size."); - PARTHENON_REQUIRE(pack1.GetUpperBoundHost(0) == pack_out.GetUpperBoundHost(0), - "Packs are different size."); - parthenon::par_for( - PARTHENON_AUTO_LABEL, 0, pack1.GetNBlocks() - 1, pack1.GetLowerBoundHost(0), - pack1.GetUpperBoundHost(0), // This is safe for dense vars - kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, - KOKKOS_LAMBDA(const int b, const int l, const int k, const int j, const int i) { - pack_out(b, te, l, k, j, i) = - w1 * pack1(b, te, l, k, j, i) + w2 * pack2(b, te, l, k, j, i); + constexpr int scratch_size = 0; + constexpr int scratch_level = 1; + parthenon::par_for_outer( + PARTHENON_AUTO_LABEL, scratch_size, scratch_level, 0, pack1.GetNBlocks() - 1, kb.s, + kb.e, KOKKOS_LAMBDA(parthenon::team_mbr_t member, const int b, const int k) { + parthenon::Indexer2D idxer({jb.s, jb.e}, {ib.s, ib.e}); + PARTHENON_REQUIRE(pack1.GetLowerBound(b) == pack2.GetLowerBound(b), + "Packs are different size."); + PARTHENON_REQUIRE(pack1.GetLowerBound(b) == pack_out.GetLowerBound(b), + "Packs are different size."); + PARTHENON_REQUIRE(pack1.GetUpperBound(b) == pack2.GetUpperBound(b), + "Packs are different size."); + PARTHENON_REQUIRE(pack1.GetUpperBound(b) == pack_out.GetUpperBound(b), + "Packs are different size."); + for (int l = pack1.GetLowerBound(b); l <= pack1.GetUpperBound(b); ++l) { + parthenon::par_for_inner(member, 0, idxer.size() - 1, [&](const int idx) { + const auto [j, i] = idxer(idx); + pack_out(b, te, l, k, j, i) = + w1 * pack1(b, te, l, k, j, i) + w2 * pack2(b, te, l, k, j, i); + }); + } }); return TaskStatus::complete; } @@ -73,12 +80,18 @@ void StokesZero(parthenon::CellLevel cl, parthenon::TopologicalElement TeVar, IndexRange jb = out->GetBoundsJ(cl, IndexDomain::interior, TeVar); IndexRange kb = out->GetBoundsK(cl, IndexDomain::interior, TeVar); - parthenon::par_for( - PARTHENON_AUTO_LABEL, 0, pack_out.GetNBlocks() - 1, 
pack_out.GetLowerBoundHost(0), - pack_out.GetUpperBoundHost(0), // This is safe for dense vars only - kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, - KOKKOS_LAMBDA(const int b, const int l, const int k, const int j, const int i) { - pack_out(b, TeVar, l, k, j, i) = 0.0; + constexpr int scratch_size = 0; + constexpr int scratch_level = 1; + parthenon::par_for_outer( + PARTHENON_AUTO_LABEL, scratch_size, scratch_level, 0, pack_out.GetNBlocks() - 1, + kb.s, kb.e, KOKKOS_LAMBDA(parthenon::team_mbr_t member, const int b, const int k) { + parthenon::Indexer2D idxer({jb.s, jb.e}, {ib.s, ib.e}); + for (int l = pack_out.GetLowerBound(b); l <= pack_out.GetUpperBound(b); ++l) { + parthenon::par_for_inner(member, 0, idxer.size() - 1, [&](const int idx) { + const auto [j, i] = idxer(idx); + pack_out(b, TeVar, l, k, j, i) = 0.0; + }); + } }); } @@ -103,22 +116,29 @@ void StokesComponent(Real fac, parthenon::CellLevel cl, koff = ndim > 2 ? koff : 0; joff = ndim > 1 ? joff : 0; - PARTHENON_REQUIRE(pack_in.GetLowerBoundHost(0) == pack_out.GetLowerBoundHost(0), - "Packs are different size."); - PARTHENON_REQUIRE(pack_in.GetUpperBoundHost(0) == pack_out.GetUpperBoundHost(0), - "Packs are different size."); - parthenon::par_for( - PARTHENON_AUTO_LABEL, 0, pack_in.GetNBlocks() - 1, pack_in.GetLowerBoundHost(0), - pack_in.GetUpperBoundHost(0), // This is safe for dense vars only - kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, - KOKKOS_LAMBDA(const int b, const int l, const int k, const int j, const int i) { + constexpr int scratch_size = 0; + constexpr int scratch_level = 1; + parthenon::par_for_outer( + PARTHENON_AUTO_LABEL, scratch_size, scratch_level, 0, pack_out.GetNBlocks() - 1, + kb.s, kb.e, KOKKOS_LAMBDA(parthenon::team_mbr_t member, const int b, const int k) { auto &coords = pack_in.GetCoordinates(b); - pack_out(b, TeVar, l, k, j, i) += - fac * - (coords.Volume(cl, TeFlux, k, j, i) * pack_in.flux(b, TeFlux, l, k, j, i) - - coords.Volume(cl, TeFlux, k + koff, j + joff, i + ioff) * - pack_in.flux(b, TeFlux, l, k + koff, j + joff, i + ioff)) / - coords.Volume(cl, TeVar, k, j, i); + parthenon::Indexer2D idxer({jb.s, jb.e}, {ib.s, ib.e}); + PARTHENON_REQUIRE(pack_in.GetLowerBound(b) == pack_out.GetLowerBound(b), + "Packs are different size."); + PARTHENON_REQUIRE(pack_in.GetUpperBound(b) == pack_out.GetUpperBound(b), + "Packs are different size."); + for (int l = pack_out.GetLowerBound(b); l <= pack_out.GetUpperBound(b); ++l) { + parthenon::par_for_inner(member, 0, idxer.size() - 1, [&](const int idx) { + const auto [j, i] = idxer(idx); + pack_out(b, TeVar, l, k, j, i) += + fac * + (coords.Volume(cl, TeFlux, k, j, i) * + pack_in.flux(b, TeFlux, l, k, j, i) - + coords.Volume(cl, TeFlux, k + koff, j + joff, i + ioff) * + pack_in.flux(b, TeFlux, l, k + koff, j + joff, i + ioff)) / + coords.Volume(cl, TeVar, k, j, i); + }); + } }); } diff --git a/example/particles/particles.cpp b/example/particles/particles.cpp index a01a121f75fa..0f250e9ab02f 100644 --- a/example/particles/particles.cpp +++ b/example/particles/particles.cpp @@ -288,17 +288,18 @@ TaskStatus CreateSomeParticles(MeshBlock *pmb, const double t0) { } while (r > 0.5); // Randomly sample direction perpendicular to origin - Real theta = acos(2. * rng_gen.drand() - 1.); - Real phi = 2. * M_PI * rng_gen.drand(); - v(0, n) = sin(theta) * cos(phi); - v(1, n) = sin(theta) * sin(phi); - v(2, n) = cos(theta); + const Real mu = 2.0 * rng_gen.drand() - 1.0; + const Real phi = 2. 
* M_PI * rng_gen.drand(); + const Real stheta = std::sqrt(1.0 - mu * mu); + v(0, n) = stheta * cos(phi); + v(1, n) = stheta * sin(phi); + v(2, n) = mu; // Project v onto plane normal to sphere Real vdN = v(0, n) * x(n) + v(1, n) * y(n) + v(2, n) * z(n); - Real NdN = r * r; - v(0, n) = v(0, n) - vdN / NdN * x(n); - v(1, n) = v(1, n) - vdN / NdN * y(n); - v(2, n) = v(2, n) - vdN / NdN * z(n); + Real inverse_NdN = 1. / (r * r); + v(0, n) = v(0, n) - vdN * inverse_NdN * x(n); + v(1, n) = v(1, n) - vdN * inverse_NdN * y(n); + v(2, n) = v(2, n) - vdN * inverse_NdN * z(n); // Normalize Real v_tmp = sqrt(v(0, n) * v(0, n) + v(1, n) * v(1, n) + v(2, n) * v(2, n)); @@ -335,11 +336,12 @@ TaskStatus CreateSomeParticles(MeshBlock *pmb, const double t0) { z(n) = minx_k + nx_k * dx_k * rng_gen.drand(); // Randomly sample direction on the unit sphere, fixing speed - Real theta = acos(2. * rng_gen.drand() - 1.); - Real phi = 2. * M_PI * rng_gen.drand(); - v(0, n) = vel * sin(theta) * cos(phi); - v(1, n) = vel * sin(theta) * sin(phi); - v(2, n) = vel * cos(theta); + const Real mu = 2.0 * rng_gen.drand() - 1.0; + const Real phi = 2. * M_PI * rng_gen.drand(); + const Real stheta = std::sqrt(1.0 - mu * mu); + v(0, n) = vel * stheta * cos(phi); + v(1, n) = vel * stheta * sin(phi); + v(2, n) = vel * mu; // Create particles at the beginning of the timestep t(n) = t0; diff --git a/src/interface/swarm.cpp b/src/interface/swarm.cpp index 294338dee0d2..39f1e8a28123 100644 --- a/src/interface/swarm.cpp +++ b/src/interface/swarm.cpp @@ -33,6 +33,7 @@ SwarmDeviceContext Swarm::GetDeviceContext() const { context.block_index_ = block_index_; context.neighbor_indices_ = neighbor_indices_; context.cell_sorted_ = cell_sorted_; + context.buffer_sorted_ = buffer_sorted_; context.cell_sorted_begin_ = cell_sorted_begin_; context.cell_sorted_number_ = cell_sorted_number_; @@ -73,9 +74,10 @@ Swarm::Swarm(const std::string &label, const Metadata &metadata, const int nmax_ new_indices_("new_indices_", nmax_pool_), scratch_a_("scratch_a_", nmax_pool_), scratch_b_("scratch_b_", nmax_pool_), num_particles_to_send_("num_particles_to_send_", NMAX_NEIGHBORS), - buffer_counters_("buffer_counters_", NMAX_NEIGHBORS), + buffer_start_("buffer_start_", NMAX_NEIGHBORS), neighbor_received_particles_("neighbor_received_particles_", NMAX_NEIGHBORS), - cell_sorted_("cell_sorted_", nmax_pool_), mpiStatus(true) { + cell_sorted_("cell_sorted_", nmax_pool_), + buffer_sorted_("buffer_sorted_", nmax_pool_), mpiStatus(true) { PARTHENON_REQUIRE_THROWS(typeid(Coordinates_t) == typeid(UniformCartesian), "SwarmDeviceContext only supports a uniform Cartesian mesh!"); @@ -209,6 +211,9 @@ void Swarm::SetPoolMax(const std::int64_t nmax_pool) { Kokkos::resize(cell_sorted_, nmax_pool); pmb->LogMemUsage(n_new * sizeof(SwarmKey)); + Kokkos::resize(buffer_sorted_, nmax_pool); + pmb->LogMemUsage(n_new * sizeof(SwarmKey)); + block_index_.Resize(nmax_pool); pmb->LogMemUsage(n_new * sizeof(int)); @@ -490,35 +495,35 @@ void Swarm::SortParticlesByCell() { break; } - if (cell_sorted(start_index).cell_idx_1d_ == cell_idx_1d) { + if (cell_sorted(start_index).sort_idx_ == cell_idx_1d) { if (start_index == 0) { break; - } else if (cell_sorted(start_index - 1).cell_idx_1d_ != cell_idx_1d) { + } else if (cell_sorted(start_index - 1).sort_idx_ != cell_idx_1d) { break; } else { start_index--; continue; } } - if (cell_sorted(start_index).cell_idx_1d_ >= cell_idx_1d) { + if (cell_sorted(start_index).sort_idx_ >= cell_idx_1d) { start_index--; if (start_index < 0) { start_index = -1; 
break; } - if (cell_sorted(start_index).cell_idx_1d_ < cell_idx_1d) { + if (cell_sorted(start_index).sort_idx_ < cell_idx_1d) { start_index = -1; break; } continue; } - if (cell_sorted(start_index).cell_idx_1d_ < cell_idx_1d) { + if (cell_sorted(start_index).sort_idx_ < cell_idx_1d) { start_index++; if (start_index > max_active_index) { start_index = -1; break; } - if (cell_sorted(start_index).cell_idx_1d_ > cell_idx_1d) { + if (cell_sorted(start_index).sort_idx_ > cell_idx_1d) { start_index = -1; break; } @@ -532,7 +537,7 @@ void Swarm::SortParticlesByCell() { int number = 0; int current_index = start_index; while (current_index <= max_active_index && - cell_sorted(current_index).cell_idx_1d_ == cell_idx_1d) { + cell_sorted(current_index).sort_idx_ == cell_idx_1d) { current_index++; number++; cell_sorted_number(k, j, i) = number; diff --git a/src/interface/swarm.hpp b/src/interface/swarm.hpp index 2058dc58f894..49cd906c2a15 100644 --- a/src/interface/swarm.hpp +++ b/src/interface/swarm.hpp @@ -289,7 +289,7 @@ class Swarm { constexpr static int unset_index_ = -1; ParArray1D num_particles_to_send_; - ParArray1D buffer_counters_; + ParArray1D buffer_start_; ParArray1D neighbor_received_particles_; int total_received_particles_; @@ -298,6 +298,9 @@ class Swarm { ParArray1D cell_sorted_; // 1D per-cell sorted array of key-value swarm memory indices + ParArray1D + buffer_sorted_; // 1D per-buffer sorted array of key-value swarm memory indices + ParArrayND cell_sorted_begin_; // Per-cell array of starting indices in cell_sorted_ diff --git a/src/interface/swarm_comms.cpp b/src/interface/swarm_comms.cpp index 3ee4e2af7512..054ded34fabb 100644 --- a/src/interface/swarm_comms.cpp +++ b/src/interface/swarm_comms.cpp @@ -156,72 +156,14 @@ void Swarm::SetupPersistentMPI() { } } -void Swarm::CountParticlesToSend_() { - auto mask_h = Kokkos::create_mirror_view_and_copy(HostMemSpace(), mask_); - auto swarm_d = GetDeviceContext(); - auto pmb = GetBlockPointer(); - const int nbmax = vbswarm->bd_var_.nbmax; - - // Fence to make sure particles aren't currently being transported locally - // TODO(BRR) do this operation on device. 
- pmb->exec_space.fence(); - const int particle_size = GetParticleDataSize(); - vbswarm->particle_size = particle_size; - - // TODO(BRR) This kernel launch should be folded into the subsequent logic once we - // convert that to kernel-based reductions - auto &x = Get(swarm_position::x::name()).Get(); - auto &y = Get(swarm_position::y::name()).Get(); - auto &z = Get(swarm_position::z::name()).Get(); - const int max_active_index = GetMaxActiveIndex(); - pmb->par_for( - PARTHENON_AUTO_LABEL, 0, max_active_index, KOKKOS_LAMBDA(const int n) { - if (swarm_d.IsActive(n)) { - bool on_current_mesh_block = true; - swarm_d.GetNeighborBlockIndex(n, x(n), y(n), z(n), on_current_mesh_block); - } - }); - - // Facilitate lambda captures - auto &block_index = block_index_; - auto &num_particles_to_send = num_particles_to_send_; - - // Zero out number of particles to send before accumulating - pmb->par_for( - PARTHENON_AUTO_LABEL, 0, NMAX_NEIGHBORS - 1, - KOKKOS_LAMBDA(const int n) { num_particles_to_send[n] = 0; }); - - parthenon::par_for( - PARTHENON_AUTO_LABEL, 0, max_active_index, KOKKOS_LAMBDA(const int n) { - if (swarm_d.IsActive(n)) { - bool on_current_mesh_block = true; - swarm_d.GetNeighborBlockIndex(n, x(n), y(n), z(n), on_current_mesh_block); - - if (block_index(n) >= 0) { - Kokkos::atomic_add(&num_particles_to_send(block_index(n)), 1); - } - } - }); - - auto num_particles_to_send_h = num_particles_to_send_.GetHostMirrorAndCopy(); - - // Resize send buffers if too small - for (int n = 0; n < pmb->neighbors.size(); n++) { - const int bufid = pmb->neighbors[n].bufid; - auto sendbuf = vbswarm->bd_var_.send[bufid]; - if (sendbuf.extent(0) < num_particles_to_send_h(n) * particle_size) { - sendbuf = BufArray1D("Buffer", num_particles_to_send_h(n) * particle_size); - vbswarm->bd_var_.send[bufid] = sendbuf; - } - vbswarm->send_size[bufid] = num_particles_to_send_h(n) * particle_size; - } -} - void Swarm::LoadBuffers_() { auto swarm_d = GetDeviceContext(); auto pmb = GetBlockPointer(); const int particle_size = GetParticleDataSize(); + vbswarm->particle_size = particle_size; const int nneighbor = pmb->neighbors.size(); + // Fence to make sure particles aren't currently being transported locally + pmb->exec_space.fence(); auto &int_vector = std::get()>(vectors_); auto &real_vector = std::get()>(vectors_); @@ -236,42 +178,103 @@ void Swarm::LoadBuffers_() { auto &y = Get(swarm_position::y::name()).Get(); auto &z = Get(swarm_position::z::name()).Get(); - // Zero buffer index counters - auto &buffer_counters = buffer_counters_; - pmb->par_for( - PARTHENON_AUTO_LABEL, 0, NMAX_NEIGHBORS - 1, - KOKKOS_LAMBDA(const int n) { buffer_counters[n] = 0; }); + if (max_active_index_ >= 0) { + auto &buffer_sorted = buffer_sorted_; + auto &buffer_start = buffer_start_; - auto &bdvar = vbswarm->bd_var_; - auto neighbor_buffer_index = neighbor_buffer_index_; - // Loop over active particles and use atomic operations to find indices into buffers if - // this particle will be sent. 
- pmb->par_for( - PARTHENON_AUTO_LABEL, 0, max_active_index_, KOKKOS_LAMBDA(const int n) { - if (swarm_d.IsActive(n)) { - bool on_current_mesh_block = true; - const int m = - swarm_d.GetNeighborBlockIndex(n, x(n), y(n), z(n), on_current_mesh_block); - const int bufid = neighbor_buffer_index(m); - - if (m >= 0) { - const int bid = Kokkos::atomic_fetch_add(&buffer_counters(m), 1); - int buffer_index = bid * particle_size; - swarm_d.MarkParticleForRemoval(n); - for (int i = 0; i < realPackDim; i++) { - bdvar.send[bufid](buffer_index) = vreal(i, n); - buffer_index++; - } - for (int i = 0; i < intPackDim; i++) { - bdvar.send[bufid](buffer_index) = static_cast(vint(i, n)); - buffer_index++; + pmb->par_for( + PARTHENON_AUTO_LABEL, 0, max_active_index_, KOKKOS_LAMBDA(const int n) { + if (swarm_d.IsActive(n)) { + bool on_current_mesh_block = true; + const int m = + swarm_d.GetNeighborBlockIndex(n, x(n), y(n), z(n), on_current_mesh_block); + buffer_sorted(n) = SwarmKey(m, n); + } else { + buffer_sorted(n) = SwarmKey(this_block_, n); + } + }); + + // sort by buffer index + sort(buffer_sorted, SwarmKeyComparator(), 0, max_active_index_); + + // use discontinuity check to determine start of a buffer in buffer_sorted array + auto &num_particles_to_send = num_particles_to_send_; + auto max_active_index = max_active_index_; + + // Zero out number of particles to send before accumulating + pmb->par_for( + PARTHENON_AUTO_LABEL, 0, NMAX_NEIGHBORS - 1, KOKKOS_LAMBDA(const int n) { + num_particles_to_send[n] = 0; + buffer_start[n] = 0; + }); + + pmb->par_for( + PARTHENON_AUTO_LABEL, 0, max_active_index_, KOKKOS_LAMBDA(const int n) { + auto m = buffer_sorted(n).sort_idx_; + // start checks (used for index of particle in buffer) + if (m >= 0 && n == 0) { + buffer_start(m) = 0; + } else if (m >= 0 && m != buffer_sorted(n - 1).sort_idx_) { + buffer_start(m) = n; + } + // end checks (used to size particle buffers) + if (m >= 0 && n == max_active_index) { + num_particles_to_send(m) = n + 1; + } else if (m >= 0 && m != buffer_sorted(n + 1).sort_idx_) { + num_particles_to_send(m) = n + 1; + } + }); + + // copy values back to host for buffer sizing + auto num_particles_to_send_h = num_particles_to_send_.GetHostMirrorAndCopy(); + auto buffer_start_h = buffer_start.GetHostMirrorAndCopy(); + + // Resize send buffers if too small + for (int n = 0; n < pmb->neighbors.size(); n++) { + num_particles_to_send_h(n) -= buffer_start_h(n); + const int bufid = pmb->neighbors[n].bufid; + auto sendbuf = vbswarm->bd_var_.send[bufid]; + if (sendbuf.extent(0) < num_particles_to_send_h(n) * particle_size) { + sendbuf = BufArray1D("Buffer", num_particles_to_send_h(n) * particle_size); + vbswarm->bd_var_.send[bufid] = sendbuf; + } + vbswarm->send_size[bufid] = num_particles_to_send_h(n) * particle_size; + } + + auto &bdvar = vbswarm->bd_var_; + auto neighbor_buffer_index = neighbor_buffer_index_; + // Loop over active particles in buffer_sorted order; use index n and buffer_start to + // get the index in buffer m, then pack that particle into its buffer + pmb->par_for( + PARTHENON_AUTO_LABEL, 0, max_active_index_, KOKKOS_LAMBDA(const int n) { + auto p_index = buffer_sorted(n).swarm_idx_; + if (swarm_d.IsActive(p_index)) { + const int m = buffer_sorted(n).sort_idx_; + const int bufid = neighbor_buffer_index(m); + if (m >= 0) { + const int bid = n - buffer_start[m]; + int buffer_index = bid * particle_size; + swarm_d.MarkParticleForRemoval(p_index); + for (int i = 0; i < realPackDim; i++) { + bdvar.send[bufid](buffer_index) = vreal(i, p_index); + buffer_index++; 
+    pmb->par_for(
+        PARTHENON_AUTO_LABEL, 0, max_active_index_, KOKKOS_LAMBDA(const int n) {
+          auto m = buffer_sorted(n).sort_idx_;
+          // start checks (used for index of particle in buffer)
+          if (m >= 0 && n == 0) {
+            buffer_start(m) = 0;
+          } else if (m >= 0 && m != buffer_sorted(n - 1).sort_idx_) {
+            buffer_start(m) = n;
+          }
+          // end checks (used to size particle buffers)
+          if (m >= 0 && n == max_active_index) {
+            num_particles_to_send(m) = n + 1;
+          } else if (m >= 0 && m != buffer_sorted(n + 1).sort_idx_) {
+            num_particles_to_send(m) = n + 1;
+          }
+        });
+
+    // copy values back to host for buffer sizing
+    auto num_particles_to_send_h = num_particles_to_send_.GetHostMirrorAndCopy();
+    auto buffer_start_h = buffer_start.GetHostMirrorAndCopy();
+
+    // Resize send buffers if too small
+    for (int n = 0; n < pmb->neighbors.size(); n++) {
+      num_particles_to_send_h(n) -= buffer_start_h(n);
+      const int bufid = pmb->neighbors[n].bufid;
+      auto sendbuf = vbswarm->bd_var_.send[bufid];
+      if (sendbuf.extent(0) < num_particles_to_send_h(n) * particle_size) {
+        sendbuf = BufArray1D<Real>("Buffer", num_particles_to_send_h(n) * particle_size);
+        vbswarm->bd_var_.send[bufid] = sendbuf;
+      }
+      vbswarm->send_size[bufid] = num_particles_to_send_h(n) * particle_size;
+    }
+
+    auto &bdvar = vbswarm->bd_var_;
+    auto neighbor_buffer_index = neighbor_buffer_index_;
+    // Loop over active particles in buffer_sorted; use index n and buffer_start to
+    // get the index within buffer m, then pack that particle into its buffer
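+    // Each particle occupies particle_size consecutive Real slots of its neighbor's
+    // send buffer, starting at (n - buffer_start[m]) * particle_size: real components
+    // are written first, followed by integer components cast to Real.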
+    pmb->par_for(
+        PARTHENON_AUTO_LABEL, 0, max_active_index_, KOKKOS_LAMBDA(const int n) {
+          auto p_index = buffer_sorted(n).swarm_idx_;
+          if (swarm_d.IsActive(p_index)) {
+            const int m = buffer_sorted(n).sort_idx_;
+            if (m >= 0) {
+              const int bufid = neighbor_buffer_index(m);
+              const int bid = n - buffer_start[m];
+              int buffer_index = bid * particle_size;
+              swarm_d.MarkParticleForRemoval(p_index);
+              for (int i = 0; i < realPackDim; i++) {
+                bdvar.send[bufid](buffer_index) = vreal(i, p_index);
+                buffer_index++;
+              }
+              for (int i = 0; i < intPackDim; i++) {
+                bdvar.send[bufid](buffer_index) = static_cast<Real>(vint(i, p_index));
+                buffer_index++;
+              }
             }
           }
-        }
-      });
+        });
 
-  // Remove particles that were loaded to send to another block from this block
-  RemoveMarkedParticles();
+    // Remove particles that were loaded to send to another block from this block
+    RemoveMarkedParticles();
+  } else {
+    for (int n = 0; n < pmb->neighbors.size(); n++) {
+      const int bufid = pmb->neighbors[n].bufid;
+      vbswarm->send_size[bufid] = 0;
+    }
+  }
 }
 
 void Swarm::Send(BoundaryCommSubset phase) {
@@ -279,10 +282,8 @@ void Swarm::Send(BoundaryCommSubset phase) {
   const int nneighbor = pmb->neighbors.size();
   auto swarm_d = GetDeviceContext();
 
-  // Query particles for those to be sent
-  CountParticlesToSend_();
-
-  // Prepare buffers for send operations
+  // Resize send buffers if needed, compute each particle's position in its target
+  // buffer, and pack the buffers for sending
   LoadBuffers_();
 
   // Send buffer data
diff --git a/src/interface/swarm_device_context.hpp b/src/interface/swarm_device_context.hpp
index 936d2d56ad35..b16daf0d0cdf 100644
--- a/src/interface/swarm_device_context.hpp
+++ b/src/interface/swarm_device_context.hpp
@@ -25,16 +25,16 @@ struct SwarmKey {
   SwarmKey() {}
   KOKKOS_INLINE_FUNCTION
   SwarmKey(const int cell_idx_1d, const int swarm_idx_1d)
-      : cell_idx_1d_(cell_idx_1d), swarm_idx_(swarm_idx_1d) {}
+      : sort_idx_(cell_idx_1d), swarm_idx_(swarm_idx_1d) {}
 
-  int cell_idx_1d_;
+  int sort_idx_;
   int swarm_idx_;
 };
 
 struct SwarmKeyComparator {
   KOKKOS_INLINE_FUNCTION
   bool operator()(const SwarmKey &s1, const SwarmKey &s2) {
-    return s1.cell_idx_1d_ < s2.cell_idx_1d_;
+    return s1.sort_idx_ < s2.sort_idx_;
   }
 };
 
@@ -139,6 +139,7 @@ class SwarmDeviceContext {
   ParArrayND<int> block_index_;
   ParArrayND<int> neighbor_indices_; // 4x4x4 array of possible block AMR regions
   ParArray1D<SwarmKey> cell_sorted_;
+  ParArray1D<SwarmKey> buffer_sorted_;
   ParArrayND<int> cell_sorted_begin_;
   ParArrayND<int> cell_sorted_number_;
   int ndim_;
diff --git a/src/outputs/output_utils.hpp b/src/outputs/output_utils.hpp
index 3b7e370cde9d..e9fcee04c986 100644
--- a/src/outputs/output_utils.hpp
+++ b/src/outputs/output_utils.hpp
@@ -50,7 +50,7 @@ namespace OutputUtils {
 // Helper struct containing some information about a variable
 struct VarInfo {
  public:
-  static constexpr int VNDIM = MAX_VARIABLE_DIMENSION;
+  static constexpr const int VNDIM = MAX_VARIABLE_DIMENSION;
   std::string label;
   int num_components;
   int tensor_rank; // 0- to 3-D for cell-centered variables, 0- to 6-D for arbitrary shape
diff --git a/src/outputs/restart_hdf5.cpp b/src/outputs/restart_hdf5.cpp
index cababddbfafe..8078fbd5fa9a 100644
--- a/src/outputs/restart_hdf5.cpp
+++ b/src/outputs/restart_hdf5.cpp
@@ -3,7 +3,7 @@
 // Copyright(C) 2020-2024 The Parthenon collaboration
 // Licensed under the 3-clause BSD License, see LICENSE file for details
 //========================================================================================
-// (C) (or copyright) 2020-2021. Triad National Security, LLC. All rights reserved.
+// (C) (or copyright) 2020-2024. Triad National Security, LLC. All rights reserved.
 //
 // This program was produced under U.S. Government contract 89233218CNA000001 for Los
 // Alamos National Laboratory (LANL), which is operated by Triad National Security, LLC
@@ -213,7 +213,7 @@ void RestartReaderHDF5::ReadBlocks(const std::string &name, IndexRange range,
 #else  // HDF5 enabled
   auto hdl = OpenDataset<Real>(name);
 
-  const int VNDIM = info.VNDIM;
+  constexpr int VNDIM = OutputUtils::VarInfo::VNDIM;
 
   /** Select hyperslab in dataset **/
   int total_dim = 0;
diff --git a/src/utils/hash.hpp b/src/utils/hash.hpp
index 77642a64dba3..4f1ff6557c74 100644
--- a/src/utils/hash.hpp
+++ b/src/utils/hash.hpp
@@ -31,9 +31,8 @@ std::size_t hash_combine(std::size_t lhs, const T &v, Rest &&...rest) {
   lhs ^= rhs + 0x9e3779b9 + (lhs << 6) + (lhs >> 2);
   if constexpr (sizeof...(Rest) > 0) {
     return hash_combine(lhs, std::forward<Rest>(rest)...);
-  } else {
-    return lhs;
   }
+  return lhs;
 }
 
 template <class T, int N = std::tuple_size<T>::value - 1>
diff --git a/src/utils/sort.hpp b/src/utils/sort.hpp
index 97e9c77a88e4..a4ab8139dab3 100644
--- a/src/utils/sort.hpp
+++ b/src/utils/sort.hpp
@@ -61,7 +61,7 @@ void sort(ParArray1D<Key> data, KeyComparator comparator, size_t min_idx,
           size_t max_idx) {
   PARTHENON_DEBUG_REQUIRE(min_idx < data.extent(0), "Invalid minimum sort index!");
   PARTHENON_DEBUG_REQUIRE(max_idx < data.extent(0), "Invalid maximum sort index!");
-#ifdef KOKKOS_ENABLE_CUDA
+#if defined(KOKKOS_ENABLE_CUDA)
 #ifdef __clang__
   PARTHENON_FAIL("sort is using thrust and there exists an incompatibility with clang, "
                  "see https://github.com/lanl/parthenon/issues/647 for more details. We "
@@ -74,6 +74,13 @@ void sort(ParArray1D<Key> data, KeyComparator comparator, size_t min_idx,
   thrust::device_ptr<Key> last_d = thrust::device_pointer_cast(data.data()) + max_idx + 1;
   thrust::sort(first_d, last_d, comparator);
 #endif
+#elif defined(KOKKOS_ENABLE_HIP)
+  auto data_h = Kokkos::create_mirror_view_and_copy(HostMemSpace(), data);
+  std::sort(data_h.data() + min_idx, data_h.data() + max_idx + 1, comparator);
+  Kokkos::deep_copy(data, data_h);
+  // TODO(BRR) With Kokkos 4.4, switch to Kokkos::sort
+  // auto sub_data = Kokkos::subview(data, std::make_pair(min_idx, max_idx + 1));
+  // Kokkos::sort(sub_data, comparator);
 #else
   if (std::is_same<DevMemSpace, HostMemSpace>::value) {
     std::sort(data.data() + min_idx, data.data() + max_idx + 1, comparator);
@@ -89,7 +96,7 @@ template <class Key>
 void sort(ParArray1D<Key> data, size_t min_idx, size_t max_idx) {
   PARTHENON_DEBUG_REQUIRE(min_idx < data.extent(0), "Invalid minimum sort index!");
   PARTHENON_DEBUG_REQUIRE(max_idx < data.extent(0), "Invalid maximum sort index!");
-#ifdef KOKKOS_ENABLE_CUDA
+#if defined(KOKKOS_ENABLE_CUDA)
 #ifdef __clang__
   PARTHENON_FAIL("sort is using thrust and there exists an incompatibility with clang, "
                  "see https://github.com/lanl/parthenon/issues/647 for more details. We "
@@ -102,6 +109,13 @@ void sort(ParArray1D<Key> data, size_t min_idx, size_t max_idx) {
   thrust::device_ptr<Key> last_d = thrust::device_pointer_cast(data.data()) + max_idx + 1;
   thrust::sort(first_d, last_d);
 #endif
+#elif defined(KOKKOS_ENABLE_HIP)
+  auto data_h = Kokkos::create_mirror_view_and_copy(HostMemSpace(), data);
+  std::sort(data_h.data() + min_idx, data_h.data() + max_idx + 1);
+  Kokkos::deep_copy(data, data_h);
+  // TODO(BRR) With Kokkos 4.4, switch to Kokkos::sort
+  // auto sub_data = Kokkos::subview(data, std::make_pair(min_idx, max_idx + 1));
+  // Kokkos::sort(sub_data);
 #else
   if (std::is_same<DevMemSpace, HostMemSpace>::value) {
     std::sort(data.data() + min_idx, data.data() + max_idx + 1);