diff --git a/.gitignore b/.gitignore
index 48439bce0..488cc1fd5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,3 +2,4 @@
 *.swp
 .#*
 /build*
+.vscode
\ No newline at end of file
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 389a6bcdf..15b7e5b7f 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -17,3 +17,7 @@ if(Boost_FOUND)
   add_subdirectory(raytracing)
   add_subdirectory(brute_force)
 endif()
+
+if(ARBORX_ENABLE_MPI)
+  add_subdirectory(moving_least_squares)
+endif()
diff --git a/examples/moving_least_squares/CMakeLists.txt b/examples/moving_least_squares/CMakeLists.txt
new file mode 100644
index 000000000..a22bdbe30
--- /dev/null
+++ b/examples/moving_least_squares/CMakeLists.txt
@@ -0,0 +1,3 @@
+add_executable(ArborX_Example_MovingLeastSquares.exe moving_least_squares.cpp)
+target_link_libraries(ArborX_Example_MovingLeastSquares.exe ArborX::ArborX)
+add_test(NAME ArborX_Example_MovingLeastSquares COMMAND ArborX_Example_MovingLeastSquares.exe)
diff --git a/examples/moving_least_squares/DetailsDistributedTreePostQueryComms.hpp b/examples/moving_least_squares/DetailsDistributedTreePostQueryComms.hpp
new file mode 100644
index 000000000..6c4e7ec34
--- /dev/null
+++ b/examples/moving_least_squares/DetailsDistributedTreePostQueryComms.hpp
@@ -0,0 +1,205 @@
+/****************************************************************************
+ * Copyright (c) 2023 by the ArborX authors                                 *
+ * All rights reserved.                                                     *
+ *                                                                          *
+ * This file is part of the ArborX library. ArborX is                       *
+ * distributed under a BSD 3-clause license. For the licensing terms see    *
+ * the LICENSE file in the top-level directory.                             *
+ *                                                                          *
+ * SPDX-License-Identifier: BSD-3-Clause                                    *
+ ****************************************************************************/
+
+#pragma once
+
+#include <ArborX.hpp>
+
+#include <Kokkos_Core.hpp>
+
+#include <memory>
+#include <optional>
+
+#include <mpi.h>
+
+namespace Details
+{
+
+template <typename MemorySpace>
+class DistributedTreePostQueryComms
+{
+public:
+  DistributedTreePostQueryComms() = default;
+
+  template <typename ExecutionSpace, typename IndicesAndRanks>
+  DistributedTreePostQueryComms(MPI_Comm comm, ExecutionSpace const &space,
+                                IndicesAndRanks const &indices_and_ranks)
+  {
+    std::size_t data_len = indices_and_ranks.extent(0);
+
+    _comm.reset(
+        [comm]() {
+          auto p = new MPI_Comm;
+          MPI_Comm_dup(comm, p);
+          return p;
+        }(),
+        [](MPI_Comm *p) {
+          int mpi_finalized;
+          MPI_Finalized(&mpi_finalized);
+          if (!mpi_finalized)
+            MPI_Comm_free(p);
+          delete p;
+        });
+
+    int rank;
+    MPI_Comm_rank(*_comm, &rank);
+
+    Kokkos::View<int *, MemorySpace> mpi_tmp(
+        Kokkos::view_alloc(Kokkos::WithoutInitializing, "Example::DTPQC::tmp"),
+        data_len);
+
+    // Split indices/ranks
+    Kokkos::Array<Kokkos::View<int *, MemorySpace>, 2> split_indices_ranks =
+        indicesAndRanksSplit(space, indices_and_ranks, data_len);
+    Kokkos::View<int *, MemorySpace> indices = split_indices_ranks[0];
+    Kokkos::View<int *, MemorySpace> ranks = split_indices_ranks[1];
+
+    // Computes what will be common to every exchange. Every time
+    // someone wants to get the value from the same set of elements,
+    // they will use the same list of recv and send indices.
+    // The rank data will be saved inside the back distributor,
+    // as the front one is not relevant once the recv indices
+    // are computed.
+
+    // This builds for each process a local array indicating how much
+    // informatiom will be gathered
+    ArborX::Details::Distributor<MemorySpace> distributor_forth(*_comm);
+    _num_requests = distributor_forth.createFromSends(space, ranks);
+
+    // This creates the temporary buffer that will help when producing the
+    // array that rebuilds the output
+    Kokkos::View<int *, MemorySpace> mpi_rev_indices(
+        Kokkos::view_alloc(Kokkos::WithoutInitializing,
+                           "Example::DTPQC::rev_indices"),
+        _num_requests);
+    ArborX::iota(space, mpi_tmp);
+    ArborX::Details::DistributedTreeImpl<MemorySpace>::sendAcrossNetwork(
+        space, distributor_forth, mpi_tmp, mpi_rev_indices);
+
+    // This retrieves which source index a process wants and gives it to
+    // the process owning the source
+    _mpi_send_indices = Kokkos::View<int *, MemorySpace>(
+        Kokkos::view_alloc(Kokkos::WithoutInitializing,
+                           "Example::DTPQC::send_indices"),
+        _num_requests);
+    ArborX::Details::DistributedTreeImpl<MemorySpace>::sendAcrossNetwork(
+        space, distributor_forth, indices, _mpi_send_indices);
+
+    // This builds the temporary buffer that will create the reverse
+    // distributor to dispatch the values
+    Kokkos::View<int *, MemorySpace> mpi_rev_ranks(
+        Kokkos::view_alloc(Kokkos::WithoutInitializing,
+                           "Example::DTPQC::rev_ranks"),
+        _num_requests);
+    Kokkos::deep_copy(space, mpi_tmp, rank);
+    ArborX::Details::DistributedTreeImpl<MemorySpace>::sendAcrossNetwork(
+        space, distributor_forth, mpi_tmp, mpi_rev_ranks);
+
+    // This will create the reverse of the previous distributor
+    _distributor = ArborX::Details::Distributor<MemorySpace>(*_comm);
+    _num_responses = _distributor->createFromSends(space, mpi_rev_ranks);
+
+    // There should be enough responses to perfectly fill what was requested
+    // i.e. _num_responses == data_len
+
+    // The we send back the requested indices so that each process can rebuild
+    // their output
+    _mpi_recv_indices = Kokkos::View<int *, MemorySpace>(
+        Kokkos::view_alloc(Kokkos::WithoutInitializing,
+                           "Example::DTPQC::recv_indices"),
+        _num_responses);
+    ArborX::Details::DistributedTreeImpl<MemorySpace>::sendAcrossNetwork(
+        space, *_distributor, mpi_rev_indices, _mpi_recv_indices);
+  }
+
+  template <typename ExecutionSpace, typename Values>
+  Kokkos::View<typename ArborX::Details::AccessTraitsHelper<
+                   ArborX::AccessTraits<Values, ArborX::PrimitivesTag>>::type *,
+               typename ArborX::AccessTraits<
+                   Values, ArborX::PrimitivesTag>::memory_space>
+  distribute(ExecutionSpace const &space, Values const &source)
+  {
+    using src_acc = ArborX::AccessTraits<Values, ArborX::PrimitivesTag>;
+    using value_t = typename ArborX::Details::AccessTraitsHelper<src_acc>::type;
+    using memory_space = typename src_acc::memory_space;
+
+    // We know what each process want so we prepare the data to be sent
+    Kokkos::View<value_t *, MemorySpace> data_to_send(
+        Kokkos::view_alloc(Kokkos::WithoutInitializing,
+                           "Example::DTPQC::data_to_send"),
+        _num_requests);
+    Kokkos::parallel_for(
+        "Example::DTPQC::data_to_send_fill",
+        Kokkos::RangePolicy<ExecutionSpace>(space, 0, _num_requests),
+        KOKKOS_CLASS_LAMBDA(int const i) {
+          data_to_send(i) = src_acc::get(source, _mpi_send_indices(i));
+        });
+
+    // We properly send the data, and each process has what it wants, but in the
+    // wrong order
+    Kokkos::View<value_t *, MemorySpace> data_to_recv(
+        Kokkos::view_alloc(Kokkos::WithoutInitializing,
+                           "Example::DTPQC::data_to_recv"),
+        _num_responses);
+    ArborX::Details::DistributedTreeImpl<MemorySpace>::sendAcrossNetwork(
+        space, *_distributor, data_to_send, data_to_recv);
+
+    // So we fix this by moving everything
+    Kokkos::View<value_t *, memory_space> output(
+        Kokkos::view_alloc(Kokkos::WithoutInitializing,
+                           "Example::DTPQC::output"),
+        _num_responses);
+    Kokkos::parallel_for(
+        "Example::DTPQC::output_fill",
+        Kokkos::RangePolicy<ExecutionSpace>(space, 0, _num_responses),
+        KOKKOS_CLASS_LAMBDA(int const i) {
+          output(_mpi_recv_indices(i)) = data_to_recv(i);
+        });
+
+    return output;
+  }
+
+  template <typename ExecutionSpace, typename IndicesAndRanks>
+  static Kokkos::Array<Kokkos::View<int *, MemorySpace>, 2>
+  indicesAndRanksSplit(ExecutionSpace const &space,
+                       IndicesAndRanks const &indices_and_ranks,
+                       std::size_t data_len)
+  {
+    Kokkos::View<int *, MemorySpace> indices(
+        Kokkos::view_alloc(Kokkos::WithoutInitializing,
+                           "Example::DTPQC::indices"),
+        data_len);
+    Kokkos::View<int *, MemorySpace> ranks(
+        Kokkos::view_alloc(Kokkos::WithoutInitializing,
+                           "Example::DTPQC::ranks"),
+        data_len);
+
+    Kokkos::parallel_for(
+        "Example::DTPQC::indices_and_ranks_split",
+        Kokkos::RangePolicy<ExecutionSpace>(space, 0, data_len),
+        KOKKOS_LAMBDA(int const i) {
+          indices(i) = indices_and_ranks(i).index;
+          ranks(i) = indices_and_ranks(i).rank;
+        });
+
+    return {{indices, ranks}};
+  }
+
+private:
+  std::shared_ptr<MPI_Comm> _comm;
+  Kokkos::View<int *, MemorySpace> _mpi_send_indices;
+  Kokkos::View<int *, MemorySpace> _mpi_recv_indices;
+  std::optional<ArborX::Details::Distributor<MemorySpace>> _distributor;
+  std::size_t _num_requests;
+  std::size_t _num_responses;
+};
+
+} // namespace Details
diff --git a/examples/moving_least_squares/DetailsMovingLeastSquaresComputation.hpp b/examples/moving_least_squares/DetailsMovingLeastSquaresComputation.hpp
new file mode 100644
index 000000000..f3dad773b
--- /dev/null
+++ b/examples/moving_least_squares/DetailsMovingLeastSquaresComputation.hpp
@@ -0,0 +1,328 @@
+/****************************************************************************
+ * Copyright (c) 2023 by the ArborX authors                                 *
+ * All rights reserved.                                                     *
+ *                                                                          *
+ * This file is part of the ArborX library. ArborX is                       *
+ * distributed under a BSD 3-clause license. For the licensing terms see    *
+ * the LICENSE file in the top-level directory.                             *
+ *                                                                          *
+ * SPDX-License-Identifier: BSD-3-Clause                                    *
+ ****************************************************************************/
+
+#pragma once
+
+#include <ArborX.hpp>
+
+#include <Kokkos_Core.hpp>
+
+#include "DetailsPolynomialBasis.hpp"
+#include "DetailsSymmetricPseudoInverseSVD.hpp"
+
+namespace Details
+{
+
+template <typename Point>
+using PointEquivalence = ArborX::ExperimentalHyperGeometry::Point<
+    ArborX::GeometryTraits::dimension_v<Point>,
+    typename ArborX::GeometryTraits::coordinate_type<Point>::type>;
+
+template <typename Points>
+using PointEquivalenceFromAT =
+    PointEquivalence<typename ArborX::Details::AccessTraitsHelper<
+        ArborX::AccessTraits<Points, ArborX::PrimitivesTag>>::type>;
+
+template <typename MemorySpace, typename CoefficientType>
+class MovingLeastSquaresComputation
+{
+public:
+  MovingLeastSquaresComputation() = default;
+
+  template <typename ExecutionSpace, typename PolynomialDegree,
+            typename RadialBasisFunction, typename SourcePoints,
+            typename TargetPoints>
+  MovingLeastSquaresComputation(ExecutionSpace const &space,
+                                SourcePoints const &source_points,
+                                TargetPoints const &target_points,
+                                PolynomialDegree const &pd,
+                                RadialBasisFunction const &rbf)
+  {
+    using src_acc = ArborX::AccessTraits<SourcePoints, ArborX::PrimitivesTag>;
+    using tgt_acc = ArborX::AccessTraits<TargetPoints, ArborX::PrimitivesTag>;
+    using point_t = PointEquivalenceFromAT<SourcePoints>;
+
+    _num_targets = tgt_acc::size(target_points);
+    _num_neighbors = src_acc::size(source_points) / _num_targets;
+
+    static constexpr std::size_t polynomialBasisSize =
+        polynomialBasisSizeFromAT<SourcePoints, PolynomialDegree::value>;
+
+    // We center each group of points around the target as it ables us to
+    // optimize the final computation and transfer point types into ours
+    // TODO: Use multidimensional points!
+    Kokkos::View<point_t **, MemorySpace> source_ref_target =
+        sourceRefTargetFill(space, source_points, target_points, _num_targets,
+                            _num_neighbors);
+
+    // To properly use the RBF, we need to decide for a radius around each
+    // target point that encapsulates all of the points
+    Kokkos::View<CoefficientType *, MemorySpace> radii = radiiComputation(
+        space, source_ref_target, _num_targets, _num_neighbors);
+
+    // Once the radius is computed, the wieght follows by evaluating the RBF at
+    // each source point with their proper radii
+    Kokkos::View<CoefficientType **, MemorySpace> phi = weightComputation(
+        space, source_ref_target, radii, _num_targets, _num_neighbors, rbf);
+
+    // We then need to create the Vandermonde matrix for each source point
+    // Instead of relying on an external type, could it be produced
+    // automatically?
+    Kokkos::View<CoefficientType ***, MemorySpace> p = vandermondeComputation(
+        space, source_ref_target, _num_targets, _num_neighbors, pd);
+
+    // From the weight and Vandermonde matrices, we can compute the moment
+    // matrix as A = P^T.PHI.P
+    Kokkos::View<CoefficientType ***, MemorySpace> a = momentComputation(
+        space, phi, p, _num_targets, _num_neighbors, polynomialBasisSize);
+
+    // We then take the pseudo-inverse of that moment matrix.
+    Kokkos::View<CoefficientType ***, MemorySpace> a_inv =
+        symmetricPseudoInverseSVD(space, a);
+
+    // We finally build the coefficients as C = [1 0 0 ...].A^-1.P^T.PHI
+    _coeffs = coefficientsComputation(space, phi, p, a_inv, _num_targets,
+                                      _num_neighbors, polynomialBasisSize);
+  }
+
+  template <typename ExecutionSpace, typename SourceValues>
+  Kokkos::View<typename SourceValues::non_const_value_type *,
+               typename SourceValues::memory_space>
+  apply(ExecutionSpace const &space, SourceValues const &source_values)
+  {
+    using value_t = typename SourceValues::non_const_value_type;
+    using memory_space = typename SourceValues::memory_space;
+
+    std::size_t num_neighbors = _num_neighbors;
+    Kokkos::View<CoefficientType **, MemorySpace> coeffs = _coeffs;
+
+    Kokkos::View<value_t *, memory_space> target_values(
+        Kokkos::view_alloc(Kokkos::WithoutInitializing,
+                           "Example::MLSC::target_values"),
+        _num_targets);
+
+    Kokkos::parallel_for(
+        "Example::MLSC::target_interpolation",
+        Kokkos::RangePolicy<ExecutionSpace>(space, 0, _num_targets),
+        KOKKOS_LAMBDA(int const i) {
+          value_t tmp = 0;
+
+          for (int j = 0; j < num_neighbors; j++)
+          {
+            tmp += coeffs(i, j) * source_values(i * num_neighbors + j);
+          }
+
+          target_values(i) = tmp;
+        });
+
+    return target_values;
+  }
+
+  template <typename ExecutionSpace, typename SourcePoints,
+            typename TargetPoints>
+  static Kokkos::View<PointEquivalenceFromAT<SourcePoints> **, MemorySpace>
+  sourceRefTargetFill(ExecutionSpace const &space,
+                      SourcePoints const &source_points,
+                      TargetPoints const &target_points,
+                      std::size_t num_targets, std::size_t num_neighbors)
+  {
+    using src_acc = ArborX::AccessTraits<SourcePoints, ArborX::PrimitivesTag>;
+    using tgt_acc = ArborX::AccessTraits<TargetPoints, ArborX::PrimitivesTag>;
+    using point_t = PointEquivalenceFromAT<SourcePoints>;
+
+    Kokkos::View<point_t **, MemorySpace> source_ref_target(
+        Kokkos::view_alloc(Kokkos::WithoutInitializing,
+                           "Example::MLSC::source_ref_target"),
+        num_targets, num_neighbors);
+
+    Kokkos::parallel_for(
+        "Example::MLSC::source_ref_target_fill",
+        Kokkos::MDRangePolicy<ExecutionSpace, Kokkos::Rank<2>>(
+            space, {0, 0}, {num_targets, num_neighbors}),
+        KOKKOS_LAMBDA(int const i, int const j) {
+          auto src = src_acc::get(source_points, i * num_neighbors + j);
+          auto tgt = tgt_acc::get(target_points, i);
+          point_t t{};
+
+          for (int k = 0; k < ArborX::GeometryTraits::dimension_v<point_t>; k++)
+            t[k] = src[k] - tgt[k];
+
+          source_ref_target(i, j) = t;
+        });
+
+    return source_ref_target;
+  }
+
+  template <typename ExecutionSpace, typename Point>
+  static Kokkos::View<CoefficientType *, MemorySpace>
+  radiiComputation(ExecutionSpace const &space,
+                   Kokkos::View<Point **, MemorySpace> const &source_ref_target,
+                   std::size_t num_targets, std::size_t num_neighbors)
+  {
+    constexpr CoefficientType epsilon =
+        std::numeric_limits<CoefficientType>::epsilon();
+    constexpr Point origin{};
+
+    Kokkos::View<CoefficientType *, MemorySpace> radii(
+        Kokkos::view_alloc(Kokkos::WithoutInitializing, "Example::MLSC::radii"),
+        num_targets);
+
+    Kokkos::parallel_for(
+        "Example::MLSC::radii_computation",
+        Kokkos::RangePolicy<ExecutionSpace>(space, 0, num_targets),
+        KOKKOS_LAMBDA(int const i) {
+          CoefficientType radius = 10 * epsilon;
+
+          for (int j = 0; j < num_neighbors; j++)
+          {
+            CoefficientType norm =
+                ArborX::Details::distance(source_ref_target(i, j), origin);
+            radius = (radius < norm) ? norm : radius;
+          }
+
+          // The one at the limit would be valued at 0 due to how RBF works
+          radii(i) = 1.1 * radius;
+        });
+
+    return radii;
+  }
+
+  template <typename ExecutionSpace, typename RadialBasisFunction,
+            typename Point>
+  static Kokkos::View<CoefficientType **, MemorySpace> weightComputation(
+      ExecutionSpace const &space,
+      Kokkos::View<Point **, MemorySpace> const &source_ref_target,
+      Kokkos::View<CoefficientType *, MemorySpace> const &radii,
+      std::size_t num_targets, std::size_t num_neighbors,
+      RadialBasisFunction const &)
+  {
+    constexpr Point origin{};
+
+    Kokkos::View<CoefficientType **, MemorySpace> phi(
+        Kokkos::view_alloc(Kokkos::WithoutInitializing, "Example::MLSC::phi"),
+        num_targets, num_neighbors);
+
+    Kokkos::parallel_for(
+        "Example::MLSC::phi_computation",
+        Kokkos::MDRangePolicy<ExecutionSpace, Kokkos::Rank<2>>(
+            space, {0, 0}, {num_targets, num_neighbors}),
+        KOKKOS_LAMBDA(int const i, int const j) {
+          CoefficientType norm =
+              ArborX::Details::distance(source_ref_target(i, j), origin);
+          phi(i, j) = RadialBasisFunction::apply(norm / radii(i));
+        });
+
+    return phi;
+  }
+
+  template <typename ExecutionSpace, typename PolynomialDegree, typename Point>
+  static Kokkos::View<CoefficientType ***, MemorySpace> vandermondeComputation(
+      ExecutionSpace const &space,
+      Kokkos::View<Point **, MemorySpace> const &source_ref_target,
+      std::size_t num_targets, std::size_t num_neighbors,
+      PolynomialDegree const &)
+  {
+    static constexpr std::size_t polynomialBasisSize =
+        polynomialBasisSizeFromT<Point, PolynomialDegree::value>;
+
+    Kokkos::View<CoefficientType ***, MemorySpace> p(
+        Kokkos::view_alloc(Kokkos::WithoutInitializing,
+                           "Example::MLSC::vandermonde"),
+        num_targets, num_neighbors, polynomialBasisSize);
+
+    Kokkos::parallel_for(
+        "Example::MLSC::vandermonde_computation",
+        Kokkos::MDRangePolicy<ExecutionSpace, Kokkos::Rank<2>>(
+            space, {0, 0}, {num_targets, num_neighbors}),
+        KOKKOS_LAMBDA(int const i, int const j) {
+          auto basis = polynomialBasis<Point, PolynomialDegree::value>(
+              source_ref_target(i, j));
+
+          for (int k = 0; k < polynomialBasisSize; k++)
+          {
+            p(i, j, k) = basis[k];
+          }
+        });
+
+    return p;
+  }
+
+  template <typename ExecutionSpace>
+  static Kokkos::View<CoefficientType ***, MemorySpace>
+  momentComputation(ExecutionSpace const &space,
+                    Kokkos::View<CoefficientType **, MemorySpace> const &phi,
+                    Kokkos::View<CoefficientType ***, MemorySpace> const &p,
+                    std::size_t num_targets, std::size_t num_neighbors,
+                    std::size_t polynomialBasisSize)
+  {
+    Kokkos::View<CoefficientType ***, MemorySpace> a(
+        Kokkos::view_alloc(Kokkos::WithoutInitializing,
+                           "Example::MLSC::moment"),
+        num_targets, polynomialBasisSize, polynomialBasisSize);
+
+    Kokkos::parallel_for(
+        "Example::MLSC::moment_computation",
+        Kokkos::MDRangePolicy<ExecutionSpace, Kokkos::Rank<3>>(
+            space, {0, 0, 0},
+            {num_targets, polynomialBasisSize, polynomialBasisSize}),
+        KOKKOS_LAMBDA(int const i, int const j, int const k) {
+          CoefficientType tmp = 0;
+
+          for (int l = 0; l < num_neighbors; l++)
+          {
+            tmp += p(i, l, j) * p(i, l, k) * phi(i, l);
+          }
+
+          a(i, j, k) = tmp;
+        });
+
+    return a;
+  }
+
+  template <typename ExecutionSpace>
+  static Kokkos::View<CoefficientType **, MemorySpace> coefficientsComputation(
+      ExecutionSpace const &space,
+      Kokkos::View<CoefficientType **, MemorySpace> const &phi,
+      Kokkos::View<CoefficientType ***, MemorySpace> const &p,
+      Kokkos::View<CoefficientType ***, MemorySpace> const &a_inv,
+      std::size_t num_targets, std::size_t num_neighbors,
+      std::size_t polynomialBasisSize)
+  {
+    Kokkos::View<CoefficientType **, MemorySpace> coeffs(
+        Kokkos::view_alloc(Kokkos::WithoutInitializing,
+                           "Example::MLSC::coefficients"),
+        num_targets, num_neighbors);
+
+    Kokkos::parallel_for(
+        "Example::MLSC::coefficients_computation",
+        Kokkos::MDRangePolicy<ExecutionSpace, Kokkos::Rank<2>>(
+            space, {0, 0}, {num_targets, num_neighbors}),
+        KOKKOS_LAMBDA(int const i, int const j) {
+          CoefficientType tmp = 0;
+
+          for (int k = 0; k < polynomialBasisSize; k++)
+          {
+            tmp += a_inv(i, 0, k) * p(i, j, k) * phi(i, j);
+          }
+
+          coeffs(i, j) = tmp;
+        });
+
+    return coeffs;
+  }
+
+private:
+  Kokkos::View<CoefficientType **, MemorySpace> _coeffs;
+  std::size_t _num_targets;
+  std::size_t _num_neighbors;
+};
+
+} // namespace Details
diff --git a/examples/moving_least_squares/DetailsPolynomialBasis.hpp b/examples/moving_least_squares/DetailsPolynomialBasis.hpp
new file mode 100644
index 000000000..14dfde948
--- /dev/null
+++ b/examples/moving_least_squares/DetailsPolynomialBasis.hpp
@@ -0,0 +1,101 @@
+/****************************************************************************
+ * Copyright (c) 2023 by the ArborX authors                                 *
+ * All rights reserved.                                                     *
+ *                                                                          *
+ * This file is part of the ArborX library. ArborX is                       *
+ * distributed under a BSD 3-clause license. For the licensing terms see    *
+ * the LICENSE file in the top-level directory.                             *
+ *                                                                          *
+ * SPDX-License-Identifier: BSD-3-Clause                                    *
+ ****************************************************************************/
+
+#pragma once
+
+#include <ArborX.hpp>
+
+#include <Kokkos_Core.hpp>
+
+#include <type_traits>
+
+namespace Details
+{
+
+template <std::size_t Dim, std::size_t Deg>
+KOKKOS_FUNCTION constexpr Kokkos::Array<Kokkos::Array<std::size_t, Dim>, Deg>
+polynomialBasisColumnSizes()
+{
+  Kokkos::Array<Kokkos::Array<std::size_t, Dim>, Deg> arr{};
+
+  for (std::size_t dim = 0; dim < Dim; dim++)
+    arr[0][dim] = 1;
+  for (std::size_t deg = 0; deg < Deg; deg++)
+    arr[deg][0] = 1;
+
+  for (std::size_t deg = 1; deg < Deg; deg++)
+    for (std::size_t dim = 1; dim < Dim; dim++)
+      arr[deg][dim] = arr[deg - 1][dim] + arr[deg][dim - 1];
+
+  return arr;
+}
+
+template <std::size_t Dim, std::size_t Deg>
+KOKKOS_FUNCTION constexpr std::size_t polynomialBasisSize()
+{
+  auto arr = polynomialBasisColumnSizes<Dim, Deg>();
+  std::size_t size = 1;
+
+  for (std::size_t deg = 0; deg < Deg; deg++)
+    for (std::size_t dim = 0; dim < Dim; dim++)
+      size += arr[deg][dim];
+
+  return size;
+}
+
+template <typename Point, std::size_t Deg>
+static constexpr std::size_t polynomialBasisSizeFromT =
+    polynomialBasisSize<ArborX::GeometryTraits::dimension_v<Point>, Deg>();
+
+template <typename Points, std::size_t Deg>
+static constexpr std::size_t polynomialBasisSizeFromAT =
+    polynomialBasisSizeFromT<
+        typename ArborX::Details::AccessTraitsHelper<
+            ArborX::AccessTraits<Points, ArborX::PrimitivesTag>>::type,
+        Deg>;
+
+template <typename Point, std::size_t Deg>
+KOKKOS_FUNCTION auto polynomialBasis(Point const &p)
+{
+  static constexpr std::size_t dimension =
+      ArborX::GeometryTraits::dimension_v<Point>;
+  static constexpr auto column_details =
+      polynomialBasisColumnSizes<dimension, Deg>();
+  using value_t = typename ArborX::GeometryTraits::coordinate_type<Point>::type;
+
+  Kokkos::Array<value_t, polynomialBasisSize<dimension, Deg>()> arr{};
+  arr[0] = value_t(1);
+
+  std::size_t prev_col = 0;
+  std::size_t curr_col = 1;
+  for (std::size_t deg = 0; deg < Deg; deg++)
+  {
+    std::size_t loc_offset = curr_col;
+    for (std::size_t dim = 0; dim < dimension; dim++)
+    {
+      // copy the previous column and multply by p[dim]
+      for (std::size_t i = 0; i < column_details[deg][dim]; i++)
+        arr[loc_offset + i] = arr[prev_col + i] * p[dim];
+
+      loc_offset += column_details[deg][dim];
+    }
+
+    prev_col = curr_col;
+    curr_col = loc_offset;
+  }
+
+  return arr;
+}
+
+template <std::size_t Deg>
+static constexpr std::integral_constant<std::size_t, Deg> degree{};
+
+} // namespace Details
diff --git a/examples/moving_least_squares/DetailsRadialBasisFunctions.hpp b/examples/moving_least_squares/DetailsRadialBasisFunctions.hpp
new file mode 100644
index 000000000..ad357852d
--- /dev/null
+++ b/examples/moving_least_squares/DetailsRadialBasisFunctions.hpp
@@ -0,0 +1,77 @@
+/****************************************************************************
+ * Copyright (c) 2023 by the ArborX authors                                 *
+ * All rights reserved.                                                     *
+ *                                                                          *
+ * This file is part of the ArborX library. ArborX is                       *
+ * distributed under a BSD 3-clause license. For the licensing terms see    *
+ * the LICENSE file in the top-level directory.                             *
+ *                                                                          *
+ * SPDX-License-Identifier: BSD-3-Clause                                    *
+ ****************************************************************************/
+
+#pragma once
+
+#include <Kokkos_Core.hpp>
+
+#include <cmath>
+
+#define RBF_DECL(name)                                                         \
+  template <int K>                                                             \
+  struct __##name;                                                             \
+                                                                               \
+  template <int K>                                                             \
+  static constexpr __##name<K> name                                            \
+  {}
+
+#define RBF_DEF(name, n, func)                                                 \
+  template <>                                                                  \
+  struct __##name<n>                                                           \
+  {                                                                            \
+    template <typename T>                                                      \
+    KOKKOS_INLINE_FUNCTION static T apply(T x)                                 \
+    {                                                                          \
+      return func;                                                             \
+    }                                                                          \
+  }
+
+namespace Details
+{
+
+RBF_DECL(wendland);
+RBF_DEF(wendland, 0, (1 - x) * (1 - x));
+RBF_DEF(wendland, 2, (1 - x) * (1 - x) * (1 - x) * (1 - x) * (4 * x + 1));
+RBF_DEF(wendland, 4,
+        (1 - x) * (1 - x) * (1 - x) * (1 - x) * (1 - x) * (1 - x) *
+            (35 * x * x + 18 * x + 3));
+RBF_DEF(wendland, 6,
+        (1 - x) * (1 - x) * (1 - x) * (1 - x) * (1 - x) * (1 - x) * (1 - x) *
+            (1 - x) * (32 * x * x * x + 25 * x * x + 8 * x + 1));
+
+RBF_DECL(wu);
+RBF_DEF(wu, 2,
+        (1 - x) * (1 - x) * (1 - x) * (1 - x) *
+            (3 * x * x * x + 12 * x + 16 * x + 4));
+RBF_DEF(wu, 4,
+        (1 - x) * (1 - x) * (1 - x) * (1 - x) * (1 - x) * (1 - x) *
+            (5 * x * x * x * x * x + 30 * x * x * x * x + 72 * x * x * x +
+             82 * x * x + 36 * x + 6));
+
+RBF_DECL(buhmann);
+RBF_DEF(buhmann, 2,
+        2 * x * x * x * x * log(x) - T(7) / 2 * x * x * x * x +
+            T(16) / 3 * x * x * x - 2 * x * x + T(1) / 6);
+RBF_DEF(buhmann, 3,
+        1 * x * x * x * x * x * x * x * x - T(84) / 5 * x * x * x * x * x * x +
+            T(1024) / 5 * x * x * x * x * sqrt(x) - 378 * x * x * x * x +
+            T(1024) / 5 * x * x * x * sqrt(x) - T(84) / 5 * x * x + 1);
+RBF_DEF(buhmann, 4,
+        T(99) / 35 * x * x * x * x * x * x * x * x -
+            132 * x * x * x * x * x * x +
+            T(9216) / 35 * x * x * x * x * x * sqrt(x) -
+            T(11264) / 35 * x * x * x * x * sqrt(x) + 198 * x * x * x * x -
+            T(396) / 5 * x * x + 1);
+
+} // namespace Details
+
+#undef RBF_DECL
+#undef RBF_DEF
\ No newline at end of file
diff --git a/examples/moving_least_squares/DetailsSymmetricPseudoInverseSVD.hpp b/examples/moving_least_squares/DetailsSymmetricPseudoInverseSVD.hpp
new file mode 100644
index 000000000..6f55db2f5
--- /dev/null
+++ b/examples/moving_least_squares/DetailsSymmetricPseudoInverseSVD.hpp
@@ -0,0 +1,206 @@
+/****************************************************************************
+ * Copyright (c) 2023 by the ArborX authors                                 *
+ * All rights reserved.                                                     *
+ *                                                                          *
+ * This file is part of the ArborX library. ArborX is                       *
+ * distributed under a BSD 3-clause license. For the licensing terms see    *
+ * the LICENSE file in the top-level directory.                             *
+ *                                                                          *
+ * SPDX-License-Identifier: BSD-3-Clause                                    *
+ ****************************************************************************/
+
+#pragma once
+
+#include <Kokkos_Core.hpp>
+
+#include <cmath>
+#include <limits>
+
+namespace Details
+{
+
+// This finds the biggest off-diagonal value of E.S as well as its
+// coordinates. Being symmetric, we can always check on the upper
+// triangle (and always have q > p)
+template <typename Matrices>
+KOKKOS_FUNCTION typename Matrices::non_const_value_type
+spisvdArgmaxOffDiagonal(Matrices const &es, int const i, int &p, int &q)
+{
+  using value_t = typename Matrices::non_const_value_type;
+
+  std::size_t const size = es.extent(1);
+  value_t max = 0;
+  p = q = 0;
+
+  for (int j = 0; j < size; j++)
+  {
+    for (int k = j + 1; k < size; k++)
+    {
+      value_t val = Kokkos::abs(es(i, j, k));
+      if (max < val)
+      {
+        max = val;
+        p = j;
+        q = k;
+      }
+    }
+  }
+
+  return max;
+}
+
+// Pseudo-inverse of symmetric matrices using SVD
+// We must find U, E (diagonal and positive) and V such that A = U.E.V^T
+// We also know that A is symmetric (by construction), so U = SV where S is
+// a sign matrix (only 1 or -1 in the diagonal, 0 elsewhere).
+// Thus A = U.E.S.U^T and A^-1 = U.[ E^-1.S ].U^T
+template <typename ExecutionSpace, typename Matrices>
+Kokkos::View<typename Matrices::non_const_value_type ***,
+             typename Matrices::memory_space>
+symmetricPseudoInverseSVD(ExecutionSpace const &space, Matrices const &mats)
+{
+  using value_t = typename Matrices::non_const_value_type;
+  using memory_space = typename Matrices::memory_space;
+
+  std::size_t const num_matrices = mats.extent(0);
+  std::size_t const size = mats.extent(1);
+  constexpr value_t epsilon = std::numeric_limits<value_t>::epsilon();
+  constexpr value_t pi_4 = value_t(M_PI_4);
+
+  // ==> Initialisation
+  // E.S is the input matrix
+  // U is the identity
+  Kokkos::View<value_t ***, memory_space> es(
+      Kokkos::view_alloc(Kokkos::WithoutInitializing, "Example::SPISVD::ES"),
+      mats.layout());
+  Kokkos::View<value_t ***, memory_space> u(
+      Kokkos::view_alloc(Kokkos::WithoutInitializing, "Example::SPISVD::U"),
+      mats.layout());
+  Kokkos::parallel_for(
+      "Example::SPISVD::ES_U_init",
+      Kokkos::MDRangePolicy<ExecutionSpace, Kokkos::Rank<3>>(
+          space, {0, 0, 0}, {num_matrices, size, size}),
+      KOKKOS_LAMBDA(int const i, int const j, int const k) {
+        es(i, j, k) = value_t(mats(i, j, k));
+        u(i, j, k) = value_t((j == k));
+      });
+
+  // ==> Loop
+  // Iterative approach, we will "deconstruct" E.S until only the diagonal
+  // is relevent inside the matrix
+  // It is possible to prove that, at each step, the "norm" of the matrix
+  // is strictly less that of the previous
+  // For all the loops, the following equality holds: A = U.E.S.U^T
+  Kokkos::parallel_for(
+      "Example::SPISVD::compute_ES_U",
+      Kokkos::RangePolicy<ExecutionSpace>(space, 0, num_matrices),
+      KOKKOS_LAMBDA(int const i) {
+        int p, q;
+        value_t norm = spisvdArgmaxOffDiagonal(es, i, p, q);
+        while (norm > epsilon)
+        {
+          value_t a = es(i, p, p);
+          value_t b = es(i, p, q);
+          value_t c = es(i, q, q);
+
+          // Our submatrix is now
+          // +----------+----------+   +---+---+
+          // | es(p, p) | es(p, q) |   | a | b |
+          // +----------+----------+ = +---+---+
+          // | es(q, p) | es(q, q) |   | b | c |
+          // +----------+----------+   +---+---+
+
+          // Lets compute x, y and theta such that
+          // +---+---+              +---+---+
+          // | a | b |              | x | 0 |
+          // +---+---+ = R(theta) * +---+---+ * R(theta)^T
+          // | b | c |              | 0 | y |
+          // +---+---+              +---+---+
+
+          value_t theta, x, y;
+          if (a == c) // <-- better to check if |a - c| < epsilon?
+          {
+            theta = pi_4;
+            x = a + b;
+            y = a - b;
+          }
+          else
+          {
+            theta = Kokkos::atan((2 * b) / (a - c)) / 2;
+            value_t a_c_cos2 = (a - c) / Kokkos::cos(2 * theta);
+            x = (a + c + a_c_cos2) / 2;
+            y = (a + c - a_c_cos2) / 2;
+          }
+          value_t cos = Kokkos::cos(theta);
+          value_t sin = Kokkos::sin(theta);
+
+          // Now lets compute the following new values for U amd E.S
+          // E.S <- R'(theta)^T . E.S . R'(theta)
+          // U  <- U . R'(theta)
+
+          // R'(theta)^T . E.S
+          for (int j = 0; j < size; j++)
+          {
+            value_t es_ipj = es(i, p, j);
+            value_t es_iqj = es(i, q, j);
+            es(i, p, j) = cos * es_ipj + sin * es_iqj;
+            es(i, q, j) = -sin * es_ipj + cos * es_iqj;
+          }
+
+          // [R'(theta)^T . E.S] . R'(theta)
+          for (int j = 0; j < size; j++)
+          {
+            value_t es_ijp = es(i, j, p);
+            value_t es_ijq = es(i, j, q);
+            es(i, j, p) = cos * es_ijp + sin * es_ijq;
+            es(i, j, q) = -sin * es_ijp + cos * es_ijq;
+          }
+
+          // U . R'(theta)
+          for (int j = 0; j < size; j++)
+          {
+            value_t u_ijp = u(i, j, p);
+            value_t u_ijq = u(i, j, q);
+            u(i, j, p) = cos * u_ijp + sin * u_ijq;
+            u(i, j, q) = -sin * u_ijp + cos * u_ijq;
+          }
+
+          // These should theorically hold but is it ok to force them to their
+          // real value?
+          es(i, p, p) = x;
+          es(i, q, q) = y;
+          es(i, p, q) = 0;
+          es(i, q, p) = 0;
+
+          norm = spisvdArgmaxOffDiagonal(es, i, p, q);
+        }
+      });
+
+  // ==> Output
+  // U and E.S are computed, we can now build the inverse
+  // U.[ E^-1.S ].U^T
+  Kokkos::View<value_t ***, memory_space> inv(
+      Kokkos::view_alloc(Kokkos::WithoutInitializing, "Example::SPISVD::inv"),
+      mats.layout());
+  Kokkos::parallel_for(
+      "Example::SPISVD::inv_fill",
+      Kokkos::MDRangePolicy<ExecutionSpace, Kokkos::Rank<3>>(
+          space, {0, 0, 0}, {num_matrices, size, size}),
+      KOKKOS_LAMBDA(int const i, int const j, int const k) {
+        value_t value = 0;
+        for (int l = 0; l < size; l++)
+        {
+          value_t v = es(i, l, l);
+          if (Kokkos::abs(v) > epsilon)
+          {
+            value += u(i, j, l) * u(i, k, l) / v;
+          }
+        }
+
+        inv(i, j, k) = value;
+      });
+
+  return inv;
+}
+
+} // namespace Details
\ No newline at end of file
diff --git a/examples/moving_least_squares/MovingLeastSquares.hpp b/examples/moving_least_squares/MovingLeastSquares.hpp
new file mode 100644
index 000000000..0f44b25a3
--- /dev/null
+++ b/examples/moving_least_squares/MovingLeastSquares.hpp
@@ -0,0 +1,110 @@
+/****************************************************************************
+ * Copyright (c) 2023 by the ArborX authors                                 *
+ * All rights reserved.                                                     *
+ *                                                                          *
+ * This file is part of the ArborX library. ArborX is                       *
+ * distributed under a BSD 3-clause license. For the licensing terms see    *
+ * the LICENSE file in the top-level directory.                             *
+ *                                                                          *
+ * SPDX-License-Identifier: BSD-3-Clause                                    *
+ ****************************************************************************/
+
+#pragma once
+
+#include <ArborX.hpp>
+
+#include <Kokkos_Core.hpp>
+
+#include "DetailsDistributedTreePostQueryComms.hpp"
+#include "DetailsMovingLeastSquaresComputation.hpp"
+#include "DetailsPolynomialBasis.hpp"
+
+namespace Details
+{
+
+// This is done to avoid clashing with another predicate access trait
+template <typename Points>
+struct TargetPointsPredicateWrapper
+{
+  Points target_points;
+  std::size_t num_neighbors;
+};
+
+} // namespace Details
+
+template <typename Points>
+struct ArborX::AccessTraits<Details::TargetPointsPredicateWrapper<Points>,
+                            ArborX::PredicatesTag>
+{
+  static KOKKOS_FUNCTION std::size_t
+  size(::Details::TargetPointsPredicateWrapper<Points> const &tp)
+  {
+    return ArborX::AccessTraits<Points, ArborX::PrimitivesTag>::size(
+        tp.target_points);
+  }
+
+  static KOKKOS_FUNCTION auto
+  get(::Details::TargetPointsPredicateWrapper<Points> const &tp, std::size_t i)
+  {
+    return ArborX::nearest(
+        ArborX::AccessTraits<Points, ArborX::PrimitivesTag>::get(
+            tp.target_points, i),
+        tp.num_neighbors);
+  }
+
+  using memory_space =
+      typename ArborX::AccessTraits<Points,
+                                    ArborX::PrimitivesTag>::memory_space;
+};
+
+// Public interface to compute the moving least squares approximation between a
+// souce and target point cloud
+template <typename MemorySpace, typename FloatingCalculationType = float>
+class MovingLeastSquares
+{
+public:
+  template <typename ExecutionSpace, typename RadialBasisFunction,
+            typename PolynomialDegree, typename SourcePoints,
+            typename TargetPoints>
+  MovingLeastSquares(
+      MPI_Comm comm, ExecutionSpace const &space,
+      SourcePoints const &source_points, TargetPoints const &target_points,
+      PolynomialDegree const &pd, RadialBasisFunction const &rbf,
+      std::size_t num_neighbors = Details::polynomialBasisSizeFromAT<
+          SourcePoints, PolynomialDegree::value>)
+  {
+    // Organize the source points as a tree and create the predicates
+    ArborX::DistributedTree<MemorySpace> source_tree(comm, space,
+                                                     source_points);
+    Details::TargetPointsPredicateWrapper<TargetPoints> predicates{
+        target_points, num_neighbors};
+
+    // Makes the NN query
+    Kokkos::View<ArborX::PairIndexRank *, MemorySpace> indices_and_ranks(
+        "Example::MLS::indices_and_ranks", 0);
+    Kokkos::View<int *, MemorySpace> offsets("Example::MLS::offsets", 0);
+    source_tree.query(space, predicates, indices_and_ranks, offsets);
+
+    // Set up comms and collect the points for a local MLS
+    _comms = Details::DistributedTreePostQueryComms<MemorySpace>(
+        comm, space, indices_and_ranks);
+    auto local_source_points = _comms.distribute(space, source_points);
+
+    // Finally, compute the local MLS for the local target points
+    _mlsc = Details::MovingLeastSquaresComputation<MemorySpace,
+                                                   FloatingCalculationType>(
+        space, local_source_points, target_points, pd, rbf);
+  }
+
+  template <typename ExecutionSpace, typename SourceValues>
+  auto apply(ExecutionSpace const &space, SourceValues const &source_values)
+  {
+    // Distribute and compute the result
+    return _mlsc.apply(space, _comms.distribute(space, source_values));
+  }
+
+private:
+  Details::MovingLeastSquaresComputation<MemorySpace, FloatingCalculationType>
+      _mlsc;
+  Details::DistributedTreePostQueryComms<MemorySpace> _comms;
+};
\ No newline at end of file
diff --git a/examples/moving_least_squares/moving_least_squares.cpp b/examples/moving_least_squares/moving_least_squares.cpp
new file mode 100644
index 000000000..2a7b75583
--- /dev/null
+++ b/examples/moving_least_squares/moving_least_squares.cpp
@@ -0,0 +1,263 @@
+/****************************************************************************
+ * Copyright (c) 2023 by the ArborX authors                                 *
+ * All rights reserved.                                                     *
+ *                                                                          *
+ * This file is part of the ArborX library. ArborX is                       *
+ * distributed under a BSD 3-clause license. For the licensing terms see    *
+ * the LICENSE file in the top-level directory.                             *
+ *                                                                          *
+ * SPDX-License-Identifier: BSD-3-Clause                                    *
+ ****************************************************************************/
+
+#include <ArborX.hpp>
+
+#include <Kokkos_Core.hpp>
+
+#include <iostream>
+#include <tuple>
+#include <vector>
+
+#include "../../benchmarks/point_clouds/point_clouds.hpp"
+#include "DetailsRadialBasisFunctions.hpp"
+#include "MovingLeastSquares.hpp"
+#include <mpi.h>
+
+using ExecutionSpace = Kokkos::DefaultExecutionSpace;
+using MemorySpace = ExecutionSpace::memory_space;
+
+using HostExecutionSpace = Kokkos::DefaultHostExecutionSpace;
+using HostMemorySpace = HostExecutionSpace::memory_space;
+
+// Function to approximate
+struct Step
+{
+  KOKKOS_INLINE_FUNCTION static float eval(ArborX::Point const &p)
+  {
+    return !Kokkos::signbit(p[0]) * 1.f;
+  }
+
+  template <class... Properties>
+  static Kokkos::View<float *, Properties...>
+  map(ExecutionSpace const &space,
+      Kokkos::View<ArborX::Point *, Properties...> const &ps)
+  {
+    Kokkos::View<float *, Properties...> evals("Example::evals", ps.extent(0));
+    Kokkos::parallel_for(
+        "Example::evaluation",
+        Kokkos::RangePolicy<ExecutionSpace>(space, 0, ps.extent(0)),
+        KOKKOS_LAMBDA(int const i) { evals(i) = eval(ps(i)); });
+    return evals;
+  }
+};
+
+Kokkos::Array<Kokkos::View<ArborX::Point *, MemorySpace>, 2>
+createPointClouds(HostExecutionSpace const &hspace, ExecutionSpace const &space,
+                  MPI_Comm comm, std::size_t points_num)
+{
+  int mpi_size, mpi_rank;
+  MPI_Comm_size(comm, &mpi_size);
+  MPI_Comm_rank(comm, &mpi_rank);
+
+  Kokkos::Array<Kokkos::View<ArborX::Point *, HostMemorySpace>, 2>
+      point_clouds_host{Kokkos::View<ArborX::Point *, HostMemorySpace>(
+                            Kokkos::view_alloc(Kokkos::WithoutInitializing,
+                                               "Example::points_cloud_0"),
+                            points_num),
+                        Kokkos::View<ArborX::Point *, HostMemorySpace>(
+                            Kokkos::view_alloc(Kokkos::WithoutInitializing,
+                                               "Example::points_cloud_1"),
+                            points_num)};
+
+  if (mpi_rank == 0)
+  {
+    Kokkos::Array<Kokkos::View<ArborX::Point *, HostMemorySpace>, 2>
+        all_point_clouds{Kokkos::View<ArborX::Point *, HostMemorySpace>(
+                             Kokkos::view_alloc(Kokkos::WithoutInitializing,
+                                                "Example::all_points_cloud_0"),
+                             points_num * mpi_size),
+                         Kokkos::View<ArborX::Point *, HostMemorySpace>(
+                             Kokkos::view_alloc(Kokkos::WithoutInitializing,
+                                                "Example::all_points_cloud_1"),
+                             points_num * mpi_size)};
+
+    filledBoxCloud(.5, all_point_clouds[0]);
+    filledBoxCloud(.5, all_point_clouds[1]);
+
+    MPI_Scatter(all_point_clouds[0].data(), points_num * 3 * sizeof(float),
+                MPI_BYTE, point_clouds_host[0].data(),
+                points_num * 3 * sizeof(float), MPI_BYTE, 0, comm);
+    MPI_Scatter(all_point_clouds[1].data(), points_num * 3 * sizeof(float),
+                MPI_BYTE, point_clouds_host[1].data(),
+                points_num * 3 * sizeof(float), MPI_BYTE, 0, comm);
+  }
+  else
+  {
+    MPI_Scatter(nullptr, 0, MPI_BYTE, point_clouds_host[0].data(),
+                points_num * 3 * sizeof(float), MPI_BYTE, 0, comm);
+    MPI_Scatter(nullptr, 0, MPI_BYTE, point_clouds_host[1].data(),
+                points_num * 3 * sizeof(float), MPI_BYTE, 0, comm);
+  }
+
+  Kokkos::parallel_for(
+      "Example::flatten_points",
+      Kokkos::RangePolicy<HostExecutionSpace>(hspace, 0, points_num),
+      KOKKOS_LAMBDA(int const i) {
+        point_clouds_host[0](i)[2] = 0;
+        point_clouds_host[1](i)[2] = 0;
+      });
+
+  Kokkos::Array<Kokkos::View<ArborX::Point *, MemorySpace>, 2> point_clouds{
+      Kokkos::View<ArborX::Point *, MemorySpace>(
+          Kokkos::view_alloc(Kokkos::WithoutInitializing,
+                             "Example::points_cloud_0"),
+          points_num),
+      Kokkos::View<ArborX::Point *, MemorySpace>(
+          Kokkos::view_alloc(Kokkos::WithoutInitializing,
+                             "Example::points_cloud_1"),
+          points_num)};
+  Kokkos::deep_copy(space, point_clouds[0], point_clouds_host[0]);
+  Kokkos::deep_copy(space, point_clouds[1], point_clouds_host[1]);
+
+  return point_clouds;
+}
+
+template <typename Deg, typename RBF>
+Kokkos::Array<MovingLeastSquares<MemorySpace, float>, 2> createMLSObjects(
+    MPI_Comm comm, ExecutionSpace const &space,
+    Kokkos::View<ArborX::Point *, MemorySpace> const &point_clouds_0,
+    Kokkos::View<ArborX::Point *, MemorySpace> const &point_clouds_1,
+    Deg const &deg, RBF const &rbf)
+{
+  return {MovingLeastSquares<MemorySpace, float>(comm, space, point_clouds_0,
+                                                 point_clouds_1, deg, rbf),
+          MovingLeastSquares<MemorySpace, float>(comm, space, point_clouds_1,
+                                                 point_clouds_0, deg, rbf)};
+}
+
+void doError(MPI_Comm comm, ExecutionSpace const &space,
+             Kokkos::View<ArborX::Point *, MemorySpace> const &points,
+             Kokkos::View<float *, MemorySpace> const &approx,
+             Kokkos::View<float *, MemorySpace> const &values)
+{
+  int mpi_size, mpi_rank;
+  MPI_Comm_size(comm, &mpi_size);
+  MPI_Comm_rank(comm, &mpi_rank);
+
+  // Compute local error
+  using ErrType = typename Kokkos::MaxLoc<float, std::size_t>::value_type;
+  ErrType error{0, 0};
+  float error_sum = 0;
+  Kokkos::parallel_reduce(
+      "Example::error_computation",
+      Kokkos::RangePolicy<ExecutionSpace>(space, 0, approx.extent(0)),
+      KOKKOS_LAMBDA(int const i, ErrType &loc_error, float &loc_error_sum) {
+        float abs_error = Kokkos::abs(approx(i) - values(i));
+
+        loc_error_sum += abs_error;
+        if (loc_error.val < abs_error)
+        {
+          loc_error.val = abs_error;
+          loc_error.loc = i;
+        }
+      },
+      Kokkos::MaxLoc<float, std::size_t>(error), Kokkos::Sum<float>(error_sum));
+
+  auto approx_host = Kokkos::create_mirror_view(approx);
+  auto points_host = Kokkos::create_mirror_view(points);
+  Kokkos::deep_copy(space, approx_host, approx);
+  Kokkos::deep_copy(space, points_host, points);
+
+  std::tuple<float, ArborX::Point, float> error_obj{
+      error.val, points_host(error.loc), approx_host(error.loc)};
+
+  // Compute global error
+  if (mpi_rank == 0)
+  {
+    float error_sum_global;
+    std::vector<decltype(error_obj)> all_error_obj(mpi_size);
+    MPI_Gather(&error_obj, sizeof(decltype(error_obj)), MPI_BYTE,
+               all_error_obj.data(), sizeof(decltype(error_obj)), MPI_BYTE, 0,
+               comm);
+    MPI_Reduce(&error_sum, &error_sum_global, 1, MPI_FLOAT, MPI_SUM, 0, comm);
+
+    for (int i = 0; i < mpi_size; i++)
+      if (std::get<0>(error_obj) < std::get<0>(all_error_obj[i]))
+        error_obj = all_error_obj[i];
+
+    float error = std::get<0>(error_obj), approx = std::get<2>(error_obj);
+    auto point = std::get<1>(error_obj);
+    std::cout << "Mean error: "
+              << error_sum_global / (points.extent(0) * mpi_size)
+              << "\nMaximum error: " << error << " at point " << point[0]
+              << ", " << point[1] << "\n  True value:  " << Step::eval(point)
+              << "\n  Computed:    " << approx << std::endl;
+  }
+  else
+  {
+    MPI_Gather(&error_obj, sizeof(decltype(error_obj)), MPI_BYTE, nullptr,
+               sizeof(decltype(error_obj)), MPI_BYTE, 0, comm);
+    MPI_Reduce(&error_sum, nullptr, 1, MPI_FLOAT, MPI_SUM, 0, comm);
+  }
+}
+
+Kokkos::View<float *, MemorySpace>
+doOne(MPI_Comm comm, ExecutionSpace const &space,
+      Kokkos::View<ArborX::Point *, MemorySpace> const &tgt,
+      Kokkos::View<float *, MemorySpace> const &values,
+      Kokkos::View<float *, MemorySpace> const &true_values,
+      MovingLeastSquares<MemorySpace, float> &mls)
+{
+  auto tgt_values = mls.apply(space, values);
+  doError(comm, space, tgt, tgt_values, true_values);
+  return tgt_values;
+}
+
+int main(int argc, char *argv[])
+{
+  static constexpr std::size_t total_points = 1024 * 128;
+  static constexpr std::size_t num_back_forth = 50;
+  static constexpr auto deg = Details::degree<4>;
+  static constexpr auto rbf = Details::wu<2>;
+
+  MPI_Init(&argc, &argv);
+  Kokkos::ScopeGuard guard(argc, argv);
+
+  ExecutionSpace space{};
+  HostExecutionSpace host_space{};
+  MPI_Comm mpi_comm = MPI_COMM_WORLD;
+  int mpi_size, mpi_rank;
+  MPI_Comm_size(mpi_comm, &mpi_size);
+  MPI_Comm_rank(mpi_comm, &mpi_rank);
+
+  auto point_clouds =
+      createPointClouds(host_space, space, mpi_comm, total_points / mpi_size);
+
+  // Create the transform from a point cloud to another
+  auto mlss = createMLSObjects(mpi_comm, space, point_clouds[0],
+                               point_clouds[1], deg, rbf);
+
+  Kokkos::Array<Kokkos::View<float *, MemorySpace>, 2> true_values{
+      Step::map(space, point_clouds[0]), Step::map(space, point_clouds[1])};
+
+  Kokkos::View<float *, MemorySpace> source_values = true_values[0];
+  for (int i = 0; i < num_back_forth * 2; i++)
+  {
+    if (mpi_rank == 0)
+      std::cout << "=== TURN " << i + 1 << std::endl;
+
+    Kokkos::View<ArborX::Point *, MemorySpace> target =
+        point_clouds[1 - (i % 2)];
+    Kokkos::View<float *, MemorySpace> tgt_true_values =
+        true_values[1 - (i % 2)];
+    MovingLeastSquares<MemorySpace, float> &mls = mlss[i % 2];
+
+    source_values =
+        doOne(mpi_comm, space, target, source_values, tgt_true_values, mls);
+
+    if (mpi_rank == 0)
+      std::cout << "===\n" << std::endl;
+  }
+
+  MPI_Finalize();
+  return 0;
+}