From 12105e86effa76970776821e1e1689e643269bcc Mon Sep 17 00:00:00 2001
From: Yohann Bosqued <bosq.yohann@gmail.com>
Date: Fri, 28 Jul 2023 13:38:57 -0400
Subject: [PATCH 01/44] New example program

---
 .gitignore                                         |  1 +
 examples/CMakeLists.txt                            |  1 +
 examples/moving_least_squares/CMakeLists.txt       |  3 +++
 .../moving_least_squares/moving_least_squares.cpp  | 14 ++++++++++++++
 4 files changed, 19 insertions(+)
 create mode 100644 examples/moving_least_squares/CMakeLists.txt
 create mode 100644 examples/moving_least_squares/moving_least_squares.cpp

diff --git a/.gitignore b/.gitignore
index 48439bce0..488cc1fd5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,3 +2,4 @@
 *.swp
 .#*
 /build*
+.vscode
\ No newline at end of file
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 389a6bcdf..6d486bc72 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -10,6 +10,7 @@ add_subdirectory(callback)
 add_subdirectory(dbscan)
 add_subdirectory(molecular_dynamics)
 add_subdirectory(simple_intersection)
+add_subdirectory(moving_least_squares)
 
 find_package(Boost COMPONENTS program_options)
 if(Boost_FOUND)
diff --git a/examples/moving_least_squares/CMakeLists.txt b/examples/moving_least_squares/CMakeLists.txt
new file mode 100644
index 000000000..d9d9c6e45
--- /dev/null
+++ b/examples/moving_least_squares/CMakeLists.txt
@@ -0,0 +1,3 @@
+add_executable(ArborX_Example_MovingLeastSquare.exe moving_least_squares.cpp)
+target_link_libraries(ArborX_Example_MovingLeastSquare.exe ArborX::ArborX)
+add_test(NAME ArborX_Example_MovingLeastSquare COMMAND ArborX_Example_MovingLeastSquare.exe)
diff --git a/examples/moving_least_squares/moving_least_squares.cpp b/examples/moving_least_squares/moving_least_squares.cpp
new file mode 100644
index 000000000..fb4fc5368
--- /dev/null
+++ b/examples/moving_least_squares/moving_least_squares.cpp
@@ -0,0 +1,14 @@
+/****************************************************************************
+ * Copyright (c) 2017-2023 by the ArborX authors                            *
+ * All rights reserved.                                                     *
+ *                                                                          *
+ * This file is part of the ArborX library. ArborX is                       *
+ * distributed under a BSD 3-clause license. For the licensing terms see    *
+ * the LICENSE file in the top-level directory.                             *
+ *                                                                          *
+ * SPDX-License-Identifier: BSD-3-Clause                                    *
+ ****************************************************************************/
+
+#include <ArborX.hpp>
+
+int main(int argc, char *argv[]) { return 0; }

From 9289fc65f2442db21261fa141d01b3fd7f9bdb17 Mon Sep 17 00:00:00 2001
From: Yohann Bosqued <bosq.yohann@gmail.com>
Date: Fri, 28 Jul 2023 16:29:47 -0400
Subject: [PATCH 02/44] Source and target point creation

---
 .../moving_least_squares.cpp                  | 105 +++++++++++++++++-
 1 file changed, 104 insertions(+), 1 deletion(-)

diff --git a/examples/moving_least_squares/moving_least_squares.cpp b/examples/moving_least_squares/moving_least_squares.cpp
index fb4fc5368..9fc54a098 100644
--- a/examples/moving_least_squares/moving_least_squares.cpp
+++ b/examples/moving_least_squares/moving_least_squares.cpp
@@ -9,6 +9,109 @@
  * SPDX-License-Identifier: BSD-3-Clause                                    *
  ****************************************************************************/
 
+// Example taken from DataTransferKit
+// (https://github.com/ORNL-CEES/DataTransferKit)
+
 #include <ArborX.hpp>
+#include <Kokkos_Core.hpp>
+#include <Kokkos_Random.hpp>
+
+using ExecutionSpace = Kokkos::DefaultExecutionSpace;
+using MemorySpace = ExecutionSpace::memory_space;
+
+std::ostream &operator<<(std::ostream &os, ArborX::Point const &p)
+{
+  os << '(' << p[0] << ',' << p[1] << ',' << p[2] << ')';
+  return os;
+}
+
+class RBFWendland_0
+{
+public:
+  RBFWendland_0(double radius)
+      : _radius(radius)
+  {}
+
+  KOKKOS_INLINE_FUNCTION double operator()(double x)
+  {
+    x /= _radius;
+    return (1. - x) * (1. - x);
+  }
+
+private:
+  double _radius;
+};
+
+struct MVPolynomialBasis_Quad_3D
+{
+  static constexpr std::size_t size = 10;
+
+  template <typename Double3D>
+  KOKKOS_INLINE_FUNCTION Kokkos::Array<double, size>
+  operator()(Double3D const &p) const
+  {
+    return {{1., p[0], p[1], p[2], p[0] * p[0], p[0] * p[1], p[0] * p[2],
+             p[1] * p[1], p[1] * p[2], p[2] * p[2]}};
+  }
+};
+
+// Func to evaluate
+template <typename Double3D>
+KOKKOS_INLINE_FUNCTION double func(Double3D const &p) {
+  return Kokkos::sin(p[0]) * Kokkos::cos(p[1]) + p[2];
+} 
+
+int main(int argc, char *argv[])
+{
+  Kokkos::ScopeGuard guard(argc, argv);
+
+  constexpr float cube_half_side = 10.;              // [-10, 10]^3 cube
+  constexpr float cube_side = 2 * cube_half_side;
+  constexpr std::size_t source_points_side = 100; // [-10, 10]^3 grid
+  constexpr std::size_t target_points_num = 10'000;   // random [-10, 10]^3
+
+  constexpr std::size_t source_points_num =
+    source_points_side * source_points_side * source_points_side;
+
+  auto source_points = Kokkos::View<ArborX::Point *, MemorySpace>(
+    "source_points", source_points_num);
+  auto target_points = Kokkos::View<ArborX::Point *, MemorySpace>(
+    "target_points", target_points_num);
+
+  // Generate source points
+  Kokkos::parallel_for(
+    "source_fill",
+    Kokkos::MDRangePolicy<ExecutionSpace, Kokkos::Rank<3>>(
+      {0, 0, 0},
+      {source_points_side, source_points_side, source_points_side}),
+    KOKKOS_LAMBDA (int const i, int const j, int const k) {
+      source_points(
+        i * source_points_side * source_points_side +
+        j * source_points_side +
+        k
+      ) = ArborX::Point {
+        (static_cast<float>(i) / (source_points_side - 1) - .5f) * cube_side,
+        (static_cast<float>(j) / (source_points_side - 1) - .5f) * cube_side,
+        (static_cast<float>(k) / (source_points_side - 1) - .5f) * cube_side
+      };
+  });
+
+  // Generate target points
+  auto random_pool =
+    Kokkos::Random_XorShift64_Pool<ExecutionSpace>(time(nullptr));
+  Kokkos::parallel_for(
+    "target_fill",
+    Kokkos::RangePolicy<ExecutionSpace>(0, target_points_num),
+    KOKKOS_LAMBDA (const int i) {
+      auto gen = random_pool.get_state();
+      target_points(i) = ArborX::Point {
+        gen.frand(0., 1.),
+        gen.frand(0., 1.),
+        gen.frand(0., 1.),
+      };
+    });
 
-int main(int argc, char *argv[]) { return 0; }
+  // Arrange source points as tree
+  auto source_tree =
+    ArborX::BVH<MemorySpace>(ExecutionSpace{}, source_points);
+}

From 8c97beb345e7bfbeaa6ddc16b5948371451f6723 Mon Sep 17 00:00:00 2001
From: Yohann Bosqued <bosq.yohann@gmail.com>
Date: Mon, 31 Jul 2023 16:30:45 -0400
Subject: [PATCH 03/44] Completion of MLS (not tested)

---
 .../moving_least_squares.cpp                  | 216 +++++++++++++++++-
 1 file changed, 207 insertions(+), 9 deletions(-)

diff --git a/examples/moving_least_squares/moving_least_squares.cpp b/examples/moving_least_squares/moving_least_squares.cpp
index 9fc54a098..63da4f979 100644
--- a/examples/moving_least_squares/moving_least_squares.cpp
+++ b/examples/moving_least_squares/moving_least_squares.cpp
@@ -11,11 +11,16 @@
 
 // Example taken from DataTransferKit
 // (https://github.com/ORNL-CEES/DataTransferKit)
+// with MLS resolution from
+// (http://dx.doi.org/10.1016/j.jcp.2015.11.055)
 
 #include <ArborX.hpp>
 #include <Kokkos_Core.hpp>
 #include <Kokkos_Random.hpp>
 
+#include <limits>
+#include <iomanip>
+
 using ExecutionSpace = Kokkos::DefaultExecutionSpace;
 using MemorySpace = ExecutionSpace::memory_space;
 
@@ -25,20 +30,14 @@ std::ostream &operator<<(std::ostream &os, ArborX::Point const &p)
   return os;
 }
 
-class RBFWendland_0
+struct RBFWendland_0
 {
-public:
-  RBFWendland_0(double radius)
-      : _radius(radius)
-  {}
-
   KOKKOS_INLINE_FUNCTION double operator()(double x)
   {
     x /= _radius;
     return (1. - x) * (1. - x);
   }
 
-private:
   double _radius;
 };
 
@@ -65,10 +64,11 @@ int main(int argc, char *argv[])
 {
   Kokkos::ScopeGuard guard(argc, argv);
 
-  constexpr float cube_half_side = 10.;              // [-10, 10]^3 cube
+  constexpr float cube_half_side = 10.; // [-10, 10]^3 cube
   constexpr float cube_side = 2 * cube_half_side;
   constexpr std::size_t source_points_side = 100; // [-10, 10]^3 grid
-  constexpr std::size_t target_points_num = 10'000;   // random [-10, 10]^3
+  constexpr std::size_t target_points_num = 10'000; // random [-10, 10]^3
+  constexpr std::size_t num_neighbors = MVPolynomialBasis_Quad_3D::size; // ???
 
   constexpr std::size_t source_points_num =
     source_points_side * source_points_side * source_points_side;
@@ -114,4 +114,202 @@ int main(int argc, char *argv[])
   // Arrange source points as tree
   auto source_tree =
     ArborX::BVH<MemorySpace>(ExecutionSpace{}, source_points);
+  
+  // Create the queries
+  // For each target point we query the closest source points
+  auto queries = Kokkos::View<ArborX::Nearest<ArborX::Point>*, MemorySpace>(
+    "queries", target_points_num);
+  Kokkos::parallel_for(
+    "make_queries",
+    Kokkos::RangePolicy<ExecutionSpace>(0, target_points_num),
+    KOKKOS_LAMBDA (const int i) {
+      queries(i) = ArborX::nearest(target_points(i), num_neighbors);
+  });
+
+  // Perform the query
+  auto indices = Kokkos::View<int *, MemorySpace>("indices", 0);
+  auto offsets = Kokkos::View<int *, MemorySpace>("offsets", 0);
+  source_tree.query(ExecutionSpace{}, queries, indices, offsets);
+
+  // Now that we have the neighbors, we recompute their position using
+  // their target point as the origin.
+  // This is used as an optimisation later in the algorithm
+  auto tr_source_points = Kokkos::View<ArborX::Point**, MemorySpace>(
+    "tr_source_points", target_points_num, num_neighbors);
+  Kokkos::parallel_for(
+    "transform_source_points",
+    Kokkos::RangePolicy<ExecutionSpace>(0, target_points_num),
+    KOKKOS_LAMBDA (const int i) {
+      for (int j = offsets(i); j < offsets(i+1); j++) {
+        tr_source_points(i, j - offsets(i)) = ArborX::Point {
+          source_points(j)[0] - target_points(i)[0],
+          source_points(j)[1] - target_points(i)[1],
+          source_points(j)[2] - target_points(i)[2],
+        };
+      }
+  });
+
+  // Compute the radii for the weight (phi) vector
+  auto radii = Kokkos::View<double*, MemorySpace>(
+    "radii", target_points_num);
+  constexpr double epsilon = std::numeric_limits<double>::epsilon();
+  Kokkos::parallel_for(
+    "radii_computation",
+    Kokkos::RangePolicy<ExecutionSpace>(0, target_points_num),
+    KOKKOS_LAMBDA (const int i) {
+      double radius = 10. * epsilon;
+
+      for (int j = 0; j < num_neighbors; j++) {
+        double norm = ArborX::Details::distance(
+          tr_source_points(i, j),
+          ArborX::Point{0., 0., 0.});
+        radius = (radius < norm) ? norm : radius;
+      }
+
+      radii(i) = 1.1 * radius;
+  });
+
+  // Compute the weight (phi) vector
+  auto phi = Kokkos::View<double**, MemorySpace>(
+    "phi", target_points_num, num_neighbors);
+  Kokkos::parallel_for(
+    "phi_computation",
+    Kokkos::RangePolicy<ExecutionSpace>(0, phi.extent(0)),
+    KOKKOS_LAMBDA (const int i) {
+      auto rbf = RBFWendland_0 { radii(i) };
+
+      for (int j = 0; j < phi.extent(1); j++) {
+        double norm = ArborX::Details::distance(
+          tr_source_points(i, j),
+          ArborX::Point{0., 0., 0.});
+        phi(i, j) = rbf(norm);
+      }
+  });
+
+  // Compute multivariable Vandermonde (P) matrix
+  auto p = Kokkos::View<double***, MemorySpace>(
+    "vandermonde",
+      target_points_num,
+      num_neighbors,
+      MVPolynomialBasis_Quad_3D::size
+  );
+  Kokkos::parallel_for(
+    "vandermonde_computation",
+    Kokkos::MDRangePolicy<ExecutionSpace, Kokkos::Rank<2>>(
+      {0, 0}, {target_points_num, num_neighbors}),
+    KOKKOS_LAMBDA (const int i, const int j) {
+      auto basis = MVPolynomialBasis_Quad_3D{}(tr_source_points(i, j));
+
+      for (int k = 0; k < MVPolynomialBasis_Quad_3D::size; k++) {
+        p(i, j, k) = basis[k];
+      }
+  });
+
+  // Compute moment (A) matrix
+  auto a = Kokkos::View<double***, MemorySpace>(
+    "A",
+      target_points_num,
+      MVPolynomialBasis_Quad_3D::size,
+      MVPolynomialBasis_Quad_3D::size
+  );
+  Kokkos::parallel_for(
+    "A_computation",
+    Kokkos::MDRangePolicy<ExecutionSpace, Kokkos::Rank<3>>(
+      {0, 0, 0},
+      {
+        target_points_num,
+        MVPolynomialBasis_Quad_3D::size,
+        MVPolynomialBasis_Quad_3D::size
+    }),
+    KOKKOS_LAMBDA (const int i, const int j, const int k) {
+      double tmp = 0;
+      for (int l = 0; l < num_neighbors; l++) {
+        tmp += p(i, l, j) * p(i, l, k) * phi(i, l);
+      }
+
+      a(i, j, k) = tmp;
+  });
+
+  // Inverse moment matrix
+  // Gauss-Jordan elimination: a_inv starts as the identity and every row
+  // operation performed on a is also applied to a_inv
+  // Kind of works, errors out quite often.
+  // A better method should be employed (SVD?)
+  auto a_inv = Kokkos::View<double***, MemorySpace>(
+    "A_inv",
+      target_points_num,
+      MVPolynomialBasis_Quad_3D::size,
+      MVPolynomialBasis_Quad_3D::size
+  );
+  Kokkos::parallel_for(
+    "A_inv_computation",
+    Kokkos::RangePolicy<ExecutionSpace>(0, target_points_num),
+    KOKKOS_LAMBDA (const int i) {
+      for (int j = 0; j < MVPolynomialBasis_Quad_3D::size; j++) {
+        for (int k = 0; k < MVPolynomialBasis_Quad_3D::size; k++) {
+          a_inv(i, j, k) = (j == k) * 1.;
+        }
+      }
+
+      // This needs to be done for every column
+      for (int j = 0; j < MVPolynomialBasis_Quad_3D::size; j++) {
+
+        // We find the line with a non-negative element on column j
+        int k = j;
+        for (; k < MVPolynomialBasis_Quad_3D::size; k++) {
+          if (a(i, k, j) != 0.0) break;
+        }
+
+        // We divide the line with said value
+        double tmp = a(i, k, j);
+        for (int l = 0; l < MVPolynomialBasis_Quad_3D::size; l++) {
+          a(i, k, l) /= tmp;
+          a_inv(i, k, l) /= tmp;
+        }
+
+        // If line and column are not the same, move the column to the top
+        if (k != j) {
+          for (int l = 0; l < MVPolynomialBasis_Quad_3D::size; l++) {
+            double tmp = a(i, k, l);
+            a(i, k, l) = a(i, j, l);
+            a(i, j, l) = tmp;
+
+            tmp = a_inv(i, k, l);
+            a_inv(i, k, l) = a_inv(i, j, l);
+            a_inv(i, j, l) = tmp;
+          }
+        }
+
+        // Now, set at zero all other elements of the column (Ll <- Ll - a*Lj)
+        for (int l = 0; l < MVPolynomialBasis_Quad_3D::size; l++) {
+          if (l == j || a(i, l, j) == 0.0) continue;
+          double mul = a(i, l, j);
+
+          for (int m = 0; m < MVPolynomialBasis_Quad_3D::size; m++) {
+            a(i, l, m) -= mul * a(i, j, m);
+            a_inv(i, l, m) -= mul * a_inv(i, j, m);
+          }
+          a(i, l, j) = 0.0;
+        }
+
+        // Once every column has been processed, a_inv should hold the inverse of a
+      }
+  });
+
+  // Compute the coefficients
+  auto coeffs = Kokkos::View<double**, MemorySpace>(
+    "coefficients", target_points_num, MVPolynomialBasis_Quad_3D::size);
+  Kokkos::parallel_for(
+    "coefficients_computation",
+    Kokkos::MDRangePolicy<ExecutionSpace, Kokkos::Rank<2>>(
+      {0, 0}, {target_points_num, MVPolynomialBasis_Quad_3D::size}),
+    KOKKOS_LAMBDA (const int i, const int j) {
+      double tmp = 0;
+
+      for (int k = 0; k < MVPolynomialBasis_Quad_3D::size; k++) {
+        tmp += a_inv(i, 0, j) * p(i, k, j) * phi(i, k);
+      }
+
+      coeffs(i, j) = tmp;
+  });
 }

From ea3a2b4d946a09453084ed3eca02fde3ea8f3daf Mon Sep 17 00:00:00 2001
From: Yohann Bosqued <bosq.yohann@gmail.com>
Date: Tue, 1 Aug 2023 11:38:36 -0400
Subject: [PATCH 04/44] Small fixes and values computation

---
 .../moving_least_squares.cpp                  | 170 ++++++++++--------
 1 file changed, 95 insertions(+), 75 deletions(-)

diff --git a/examples/moving_least_squares/moving_least_squares.cpp b/examples/moving_least_squares/moving_least_squares.cpp
index 63da4f979..8c6fc7eb4 100644
--- a/examples/moving_least_squares/moving_least_squares.cpp
+++ b/examples/moving_least_squares/moving_least_squares.cpp
@@ -16,20 +16,12 @@
 
 #include <ArborX.hpp>
 #include <Kokkos_Core.hpp>
-#include <Kokkos_Random.hpp>
 
 #include <limits>
-#include <iomanip>
 
 using ExecutionSpace = Kokkos::DefaultExecutionSpace;
 using MemorySpace = ExecutionSpace::memory_space;
 
-std::ostream &operator<<(std::ostream &os, ArborX::Point const &p)
-{
-  os << '(' << p[0] << ',' << p[1] << ',' << p[2] << ')';
-  return os;
-}
-
 struct RBFWendland_0
 {
   KOKKOS_INLINE_FUNCTION double operator()(double x)
@@ -41,75 +33,57 @@ struct RBFWendland_0
   double _radius;
 };
 
-struct MVPolynomialBasis_Quad_3D
+struct MVPolynomialBasis_3D
 {
-  static constexpr std::size_t size = 10;
+  static constexpr std::size_t size = 4;
 
   template <typename Double3D>
   KOKKOS_INLINE_FUNCTION Kokkos::Array<double, size>
   operator()(Double3D const &p) const
   {
-    return {{1., p[0], p[1], p[2], p[0] * p[0], p[0] * p[1], p[0] * p[2],
-             p[1] * p[1], p[1] * p[2], p[2] * p[2]}};
+    return {{1., p[0], p[1], p[2]}};
   }
 };
 
 // Func to evaluate
 template <typename Double3D>
 KOKKOS_INLINE_FUNCTION double func(Double3D const &p) {
-  return Kokkos::sin(p[0]) * Kokkos::cos(p[1]) + p[2];
+  return Kokkos::sin(p[0]) + Kokkos::cos(p[1]) + p[2];
 } 
 
 int main(int argc, char *argv[])
 {
   Kokkos::ScopeGuard guard(argc, argv);
-
-  constexpr float cube_half_side = 10.; // [-10, 10]^3 cube
-  constexpr float cube_side = 2 * cube_half_side;
-  constexpr std::size_t source_points_side = 100; // [-10, 10]^3 grid
-  constexpr std::size_t target_points_num = 10'000; // random [-10, 10]^3
-  constexpr std::size_t num_neighbors = MVPolynomialBasis_Quad_3D::size; // ???
-
-  constexpr std::size_t source_points_num =
-    source_points_side * source_points_side * source_points_side;
+  constexpr std::size_t num_neighbors = 10;
+  constexpr std::size_t source_points_num = 10;
+  constexpr std::size_t target_points_num = 4;
 
   auto source_points = Kokkos::View<ArborX::Point *, MemorySpace>(
     "source_points", source_points_num);
   auto target_points = Kokkos::View<ArborX::Point *, MemorySpace>(
     "target_points", target_points_num);
+  auto source_points_host = Kokkos::create_mirror_view(source_points);
+  auto target_points_host = Kokkos::create_mirror_view(target_points);
 
   // Generate source points
-  Kokkos::parallel_for(
-    "source_fill",
-    Kokkos::MDRangePolicy<ExecutionSpace, Kokkos::Rank<3>>(
-      {0, 0, 0},
-      {source_points_side, source_points_side, source_points_side}),
-    KOKKOS_LAMBDA (int const i, int const j, int const k) {
-      source_points(
-        i * source_points_side * source_points_side +
-        j * source_points_side +
-        k
-      ) = ArborX::Point {
-        (static_cast<float>(i) / (source_points_side - 1) - .5f) * cube_side,
-        (static_cast<float>(j) / (source_points_side - 1) - .5f) * cube_side,
-        (static_cast<float>(k) / (source_points_side - 1) - .5f) * cube_side
-      };
-  });
+  source_points_host(0) = ArborX::Point {  1.,  1.,  0. };
+  source_points_host(1) = ArborX::Point { -1.,  1.,  0. };
+  source_points_host(2) = ArborX::Point { -1., -1.,  0. };
+  source_points_host(3) = ArborX::Point {  1., -1.,  0. };
+  source_points_host(4) = ArborX::Point {  0.,  0.,  1. };
+  source_points_host(5) = ArborX::Point {  1.,  1.,  2. };
+  source_points_host(6) = ArborX::Point { -1.,  1.,  2. };
+  source_points_host(7) = ArborX::Point { -1., -1.,  2. };
+  source_points_host(8) = ArborX::Point {  1., -1.,  2. };
+  source_points_host(9) = ArborX::Point {  0.,  0., -1. };
+  Kokkos::deep_copy(source_points, source_points_host);
 
   // Generate target points
-  auto random_pool =
-    Kokkos::Random_XorShift64_Pool<ExecutionSpace>(time(nullptr));
-  Kokkos::parallel_for(
-    "target_fill",
-    Kokkos::RangePolicy<ExecutionSpace>(0, target_points_num),
-    KOKKOS_LAMBDA (const int i) {
-      auto gen = random_pool.get_state();
-      target_points(i) = ArborX::Point {
-        gen.frand(0., 1.),
-        gen.frand(0., 1.),
-        gen.frand(0., 1.),
-      };
-    });
+  target_points_host(0) = ArborX::Point {  0.,  0.,  0. };
+  target_points_host(1) = ArborX::Point {  .5,  .5,  0. };
+  target_points_host(2) = ArborX::Point { -.5,  .5,  1. };
+  target_points_host(3) = ArborX::Point {  0., -.5, 1.5 };
+  Kokkos::deep_copy(target_points, target_points_host);
 
   // Arrange source points as tree
   auto source_tree =
@@ -142,9 +116,9 @@ int main(int argc, char *argv[])
     KOKKOS_LAMBDA (const int i) {
       for (int j = offsets(i); j < offsets(i+1); j++) {
         tr_source_points(i, j - offsets(i)) = ArborX::Point {
-          source_points(j)[0] - target_points(i)[0],
-          source_points(j)[1] - target_points(i)[1],
-          source_points(j)[2] - target_points(i)[2],
+          source_points(indices(j))[0] - target_points(i)[0],
+          source_points(indices(j))[1] - target_points(i)[1],
+          source_points(indices(j))[2] - target_points(i)[2],
         };
       }
   });
@@ -191,16 +165,16 @@ int main(int argc, char *argv[])
     "vandermonde",
       target_points_num,
       num_neighbors,
-      MVPolynomialBasis_Quad_3D::size
+      MVPolynomialBasis_3D::size
   );
   Kokkos::parallel_for(
     "vandermonde_computation",
     Kokkos::MDRangePolicy<ExecutionSpace, Kokkos::Rank<2>>(
       {0, 0}, {target_points_num, num_neighbors}),
     KOKKOS_LAMBDA (const int i, const int j) {
-      auto basis = MVPolynomialBasis_Quad_3D{}(tr_source_points(i, j));
+      auto basis = MVPolynomialBasis_3D{}(tr_source_points(i, j));
 
-      for (int k = 0; k < MVPolynomialBasis_Quad_3D::size; k++) {
+      for (int k = 0; k < MVPolynomialBasis_3D::size; k++) {
         p(i, j, k) = basis[k];
       }
   });
@@ -209,8 +183,8 @@ int main(int argc, char *argv[])
   auto a = Kokkos::View<double***, MemorySpace>(
     "A",
       target_points_num,
-      MVPolynomialBasis_Quad_3D::size,
-      MVPolynomialBasis_Quad_3D::size
+      MVPolynomialBasis_3D::size,
+      MVPolynomialBasis_3D::size
   );
   Kokkos::parallel_for(
     "A_computation",
@@ -218,8 +192,8 @@ int main(int argc, char *argv[])
       {0, 0, 0},
       {
         target_points_num,
-        MVPolynomialBasis_Quad_3D::size,
-        MVPolynomialBasis_Quad_3D::size
+        MVPolynomialBasis_3D::size,
+        MVPolynomialBasis_3D::size
     }),
     KOKKOS_LAMBDA (const int i, const int j, const int k) {
       double tmp = 0;
@@ -238,38 +212,38 @@ int main(int argc, char *argv[])
   auto a_inv = Kokkos::View<double***, MemorySpace>(
     "A_inv",
       target_points_num,
-      MVPolynomialBasis_Quad_3D::size,
-      MVPolynomialBasis_Quad_3D::size
+      MVPolynomialBasis_3D::size,
+      MVPolynomialBasis_3D::size
   );
   Kokkos::parallel_for(
     "A_inv_computation",
     Kokkos::RangePolicy<ExecutionSpace>(0, target_points_num),
     KOKKOS_LAMBDA (const int i) {
-      for (int j = 0; j < MVPolynomialBasis_Quad_3D::size; j++) {
-        for (int k = 0; k < MVPolynomialBasis_Quad_3D::size; k++) {
+      for (int j = 0; j < MVPolynomialBasis_3D::size; j++) {
+        for (int k = 0; k < MVPolynomialBasis_3D::size; k++) {
           a_inv(i, j, k) = (j == k) * 1.;
         }
       }
 
       // This needs to be done for every column
-      for (int j = 0; j < MVPolynomialBasis_Quad_3D::size; j++) {
+      for (int j = 0; j < MVPolynomialBasis_3D::size; j++) {
 
         // We find the line with a non-negative element on column j
         int k = j;
-        for (; k < MVPolynomialBasis_Quad_3D::size; k++) {
+        for (; k < MVPolynomialBasis_3D::size; k++) {
           if (a(i, k, j) != 0.0) break;
         }
 
         // We divide the line with said value
         double tmp = a(i, k, j);
-        for (int l = 0; l < MVPolynomialBasis_Quad_3D::size; l++) {
+        for (int l = 0; l < MVPolynomialBasis_3D::size; l++) {
           a(i, k, l) /= tmp;
           a_inv(i, k, l) /= tmp;
         }
 
         // If line and column are not the same, move the column to the top
         if (k != j) {
-          for (int l = 0; l < MVPolynomialBasis_Quad_3D::size; l++) {
+          for (int l = 0; l < MVPolynomialBasis_3D::size; l++) {
             double tmp = a(i, k, l);
             a(i, k, l) = a(i, j, l);
             a(i, j, l) = tmp;
@@ -281,11 +255,11 @@ int main(int argc, char *argv[])
         }
 
         // Now, set at zero all other elements of the column (Ll <- Ll - a*Lj)
-        for (int l = 0; l < MVPolynomialBasis_Quad_3D::size; l++) {
+        for (int l = 0; l < MVPolynomialBasis_3D::size; l++) {
           if (l == j || a(i, l, j) == 0.0) continue;
           double mul = a(i, l, j);
 
-          for (int m = 0; m < MVPolynomialBasis_Quad_3D::size; m++) {
+          for (int m = 0; m < MVPolynomialBasis_3D::size; m++) {
             a(i, l, m) -= mul * a(i, j, m);
             a_inv(i, l, m) -= mul * a_inv(i, j, m);
           }
@@ -298,18 +272,64 @@ int main(int argc, char *argv[])
 
   // Compute the coefficients
   auto coeffs = Kokkos::View<double**, MemorySpace>(
-    "coefficients", target_points_num, MVPolynomialBasis_Quad_3D::size);
+    "coefficients", target_points_num, num_neighbors);
   Kokkos::parallel_for(
     "coefficients_computation",
     Kokkos::MDRangePolicy<ExecutionSpace, Kokkos::Rank<2>>(
-      {0, 0}, {target_points_num, MVPolynomialBasis_Quad_3D::size}),
+      {0, 0}, {target_points_num, num_neighbors}),
     KOKKOS_LAMBDA (const int i, const int j) {
       double tmp = 0;
 
-      for (int k = 0; k < MVPolynomialBasis_Quad_3D::size; k++) {
-        tmp += a_inv(i, 0, j) * p(i, k, j) * phi(i, k);
+      for (int k = 0; k < MVPolynomialBasis_3D::size; k++) {
+        tmp += a_inv(i, 0, k) * p(i, j, k) * phi(i, j);
       }
 
       coeffs(i, j) = tmp;
   });
+
+  // Compute source values
+  auto source_values = Kokkos::View<double*, MemorySpace>(
+    "source_values", source_points_num);
+  Kokkos::parallel_for(
+    "source_evaluation",
+    Kokkos::RangePolicy<ExecutionSpace>(0, source_points_num),
+    KOKKOS_LAMBDA (const int i) {
+      source_values(i) = func(source_points(i));
+  });
+
+  // Compute target values via interpolation
+  auto target_values = Kokkos::View<double*, MemorySpace>(
+    "target_values", target_points_num);
+  Kokkos::parallel_for(
+    "target_interpolation",
+    Kokkos::RangePolicy<ExecutionSpace>(0, target_points_num),
+    KOKKOS_LAMBDA (const int i) {
+      double tmp = 0;
+      for (int j = offsets(i); j < offsets(i+1); j++) { // neighbors of target i: [offsets(i), offsets(i+1))
+        tmp += coeffs(i, j - offsets(i)) * source_values(indices(j)); // j - offsets(i) is the neighbor's local slot
+      }
+      target_values(i) = tmp; // weighted sum of the neighbors' source values
+  });
+
+  // Compute target values via evaluation
+  auto target_values_exact = Kokkos::View<double*, MemorySpace>(
+    "target_values_exact", target_points_num);
+  Kokkos::parallel_for(
+    "target_evaluation",
+    Kokkos::RangePolicy<ExecutionSpace>(0, target_points_num),
+    KOKKOS_LAMBDA (const int i) {
+      target_values_exact(i) = func(target_points(i));
+  });
+
+  // Show difference
+  auto target_values_host = Kokkos::create_mirror_view(target_values);
+  Kokkos::deep_copy(target_values_host, target_values);
+  auto target_values_exact_host = Kokkos::create_mirror_view(target_values_exact);
+  Kokkos::deep_copy(target_values_exact_host, target_values_exact);
+
+  for (int i = 0; i < target_points_num; i++) {
+    std::cout << "====\n"
+              << target_values_host(i) << '\n'
+              << target_values_exact_host(i) << "\n====\n";
+  }
 }

From 357bda3043757afc71490af1c282a71c23a763e2 Mon Sep 17 00:00:00 2001
From: Yohann Bosqued <bosq.yohann@gmail.com>
Date: Tue, 1 Aug 2023 15:39:20 -0400
Subject: [PATCH 05/44] Small fixup (memory and kernel names, removing
 templates, ...)

---
 .../moving_least_squares.cpp                  | 97 ++++++++++---------
 1 file changed, 51 insertions(+), 46 deletions(-)

diff --git a/examples/moving_least_squares/moving_least_squares.cpp b/examples/moving_least_squares/moving_least_squares.cpp
index 8c6fc7eb4..be10663da 100644
--- a/examples/moving_least_squares/moving_least_squares.cpp
+++ b/examples/moving_least_squares/moving_least_squares.cpp
@@ -1,5 +1,5 @@
 /****************************************************************************
- * Copyright (c) 2017-2023 by the ArborX authors                            *
+ * Copyright (c) 2023 by the ArborX authors                                 *
  * All rights reserved.                                                     *
  *                                                                          *
  * This file is part of the ArborX library. ArborX is                       *
@@ -37,31 +37,32 @@ struct MVPolynomialBasis_3D
 {
   static constexpr std::size_t size = 4;
 
-  template <typename Double3D>
   KOKKOS_INLINE_FUNCTION Kokkos::Array<double, size>
-  operator()(Double3D const &p) const
+  operator()(ArborX::Point const &p) const
   {
     return {{1., p[0], p[1], p[2]}};
   }
 };
 
-// Func to evaluate
-template <typename Double3D>
-KOKKOS_INLINE_FUNCTION double func(Double3D const &p) {
-  return Kokkos::sin(p[0]) + Kokkos::cos(p[1]) + p[2];
+// Function to approximate
+KOKKOS_INLINE_FUNCTION double manufactured_solution(ArborX::Point const &p)
+{
+  return p[2] + p[0];
 } 
 
 int main(int argc, char *argv[])
 {
   Kokkos::ScopeGuard guard(argc, argv);
-  constexpr std::size_t num_neighbors = 10;
+  constexpr std::size_t num_neighbors = 5;
   constexpr std::size_t source_points_num = 10;
   constexpr std::size_t target_points_num = 4;
 
   auto source_points = Kokkos::View<ArborX::Point *, MemorySpace>(
-    "source_points", source_points_num);
+    Kokkos::view_alloc(Kokkos::WithoutInitializing, "MLS_EX::source_points"),
+    source_points_num);
   auto target_points = Kokkos::View<ArborX::Point *, MemorySpace>(
-    "target_points", target_points_num);
+    Kokkos::view_alloc(Kokkos::WithoutInitializing, "MLS_EX::target_points"),
+    target_points_num);
   auto source_points_host = Kokkos::create_mirror_view(source_points);
   auto target_points_host = Kokkos::create_mirror_view(target_points);
 
@@ -80,38 +81,37 @@ int main(int argc, char *argv[])
 
   // Generate target points
   target_points_host(0) = ArborX::Point {  0.,  0.,  0. };
-  target_points_host(1) = ArborX::Point {  .5,  .5,  0. };
+  target_points_host(1) = ArborX::Point {  .5,  .5,  .5 };
   target_points_host(2) = ArborX::Point { -.5,  .5,  1. };
-  target_points_host(3) = ArborX::Point {  0., -.5, 1.5 };
+  target_points_host(3) = ArborX::Point {  .1, -.33, 1.5 };
   Kokkos::deep_copy(target_points, target_points_host);
 
-  // Arrange source points as tree
-  auto source_tree =
-    ArborX::BVH<MemorySpace>(ExecutionSpace{}, source_points);
+  // Organize source points as tree
+  ArborX::BVH<MemorySpace> source_tree(ExecutionSpace{}, source_points);
   
   // Create the queries
   // For each target point we query the closest source points
   auto queries = Kokkos::View<ArborX::Nearest<ArborX::Point>*, MemorySpace>(
-    "queries", target_points_num);
+    "MLS_EX::queries", target_points_num);
   Kokkos::parallel_for(
-    "make_queries",
+    "MLS_EX::make_queries",
     Kokkos::RangePolicy<ExecutionSpace>(0, target_points_num),
     KOKKOS_LAMBDA (const int i) {
       queries(i) = ArborX::nearest(target_points(i), num_neighbors);
   });
 
   // Perform the query
-  auto indices = Kokkos::View<int *, MemorySpace>("indices", 0);
-  auto offsets = Kokkos::View<int *, MemorySpace>("offsets", 0);
+  auto indices = Kokkos::View<int *, MemorySpace>("MLS_EX::indices", 0);
+  auto offsets = Kokkos::View<int *, MemorySpace>("MLS_EX::offsets", 0);
   source_tree.query(ExecutionSpace{}, queries, indices, offsets);
 
   // Now that we have the neighbors, we recompute their position using
   // their target point as the origin.
   // This is used as an optimisation later in the algorithm
   auto tr_source_points = Kokkos::View<ArborX::Point**, MemorySpace>(
-    "tr_source_points", target_points_num, num_neighbors);
+    "MLS_EX::tr_source_points", target_points_num, num_neighbors);
   Kokkos::parallel_for(
-    "transform_source_points",
+    "MLS_EX::transform_source_points",
     Kokkos::RangePolicy<ExecutionSpace>(0, target_points_num),
     KOKKOS_LAMBDA (const int i) {
       for (int j = offsets(i); j < offsets(i+1); j++) {
@@ -125,10 +125,10 @@ int main(int argc, char *argv[])
 
   // Compute the radii for the weight (phi) vector
   auto radii = Kokkos::View<double*, MemorySpace>(
-    "radii", target_points_num);
+    "MLS_EX::radii", target_points_num);
   constexpr double epsilon = std::numeric_limits<double>::epsilon();
   Kokkos::parallel_for(
-    "radii_computation",
+    "MLS_EX::radii_computation",
     Kokkos::RangePolicy<ExecutionSpace>(0, target_points_num),
     KOKKOS_LAMBDA (const int i) {
       double radius = 10. * epsilon;
@@ -145,9 +145,9 @@ int main(int argc, char *argv[])
 
   // Compute the weight (phi) vector
   auto phi = Kokkos::View<double**, MemorySpace>(
-    "phi", target_points_num, num_neighbors);
+    "MLS_EX::phi", target_points_num, num_neighbors);
   Kokkos::parallel_for(
-    "phi_computation",
+    "MLS_EX::phi_computation",
     Kokkos::RangePolicy<ExecutionSpace>(0, phi.extent(0)),
     KOKKOS_LAMBDA (const int i) {
       auto rbf = RBFWendland_0 { radii(i) };
@@ -162,13 +162,13 @@ int main(int argc, char *argv[])
 
   // Compute multivariable Vandermonde (P) matrix
   auto p = Kokkos::View<double***, MemorySpace>(
-    "vandermonde",
+    "MLS_EX::vandermonde",
       target_points_num,
       num_neighbors,
       MVPolynomialBasis_3D::size
   );
   Kokkos::parallel_for(
-    "vandermonde_computation",
+    "MLS_EX::vandermonde_computation",
     Kokkos::MDRangePolicy<ExecutionSpace, Kokkos::Rank<2>>(
       {0, 0}, {target_points_num, num_neighbors}),
     KOKKOS_LAMBDA (const int i, const int j) {
@@ -181,13 +181,13 @@ int main(int argc, char *argv[])
 
   // Compute moment (A) matrix
   auto a = Kokkos::View<double***, MemorySpace>(
-    "A",
+    "MLS_EX::A",
       target_points_num,
       MVPolynomialBasis_3D::size,
       MVPolynomialBasis_3D::size
   );
   Kokkos::parallel_for(
-    "A_computation",
+    "MLS_EX::A_computation",
     Kokkos::MDRangePolicy<ExecutionSpace, Kokkos::Rank<3>>(
       {0, 0, 0},
       {
@@ -210,13 +210,13 @@ int main(int argc, char *argv[])
   // Kind of works, errors out quite often.
   // A better method should be employed (SVD?)
   auto a_inv = Kokkos::View<double***, MemorySpace>(
-    "A_inv",
+    "MLS_EX::A_inv",
       target_points_num,
       MVPolynomialBasis_3D::size,
       MVPolynomialBasis_3D::size
   );
   Kokkos::parallel_for(
-    "A_inv_computation",
+    "MLS_EX::A_inv_computation",
     Kokkos::RangePolicy<ExecutionSpace>(0, target_points_num),
     KOKKOS_LAMBDA (const int i) {
       for (int j = 0; j < MVPolynomialBasis_3D::size; j++) {
@@ -272,9 +272,9 @@ int main(int argc, char *argv[])
 
   // Compute the coefficients
   auto coeffs = Kokkos::View<double**, MemorySpace>(
-    "coefficients", target_points_num, num_neighbors);
+    "MLS_EX::coefficients", target_points_num, num_neighbors);
   Kokkos::parallel_for(
-    "coefficients_computation",
+    "MLS_EX::coefficients_computation",
     Kokkos::MDRangePolicy<ExecutionSpace, Kokkos::Rank<2>>(
       {0, 0}, {target_points_num, num_neighbors}),
     KOKKOS_LAMBDA (const int i, const int j) {
@@ -289,19 +289,19 @@ int main(int argc, char *argv[])
 
   // Compute source values
   auto source_values = Kokkos::View<double*, MemorySpace>(
-    "source_values", source_points_num);
+    "MLS_EX::source_values", source_points_num);
   Kokkos::parallel_for(
-    "source_evaluation",
+    "MLS_EX::source_evaluation",
     Kokkos::RangePolicy<ExecutionSpace>(0, source_points_num),
     KOKKOS_LAMBDA (const int i) {
-      source_values(i) = func(source_points(i));
+      source_values(i) = manufactured_solution(source_points(i));
   });
 
   // Compute target values via interpolation
   auto target_values = Kokkos::View<double*, MemorySpace>(
-    "target_values", target_points_num);
+    "MLS_EX::target_values", target_points_num);
   Kokkos::parallel_for(
-    "target_interpolation",
+    "MLS_EX::target_interpolation",
     Kokkos::RangePolicy<ExecutionSpace>(0, target_points_num),
     KOKKOS_LAMBDA (const int i) {
       double tmp = 0;
@@ -313,23 +313,28 @@ int main(int argc, char *argv[])
 
   // Compute target values via evaluation
   auto target_values_exact = Kokkos::View<double*, MemorySpace>(
-    "target_values_exact", target_points_num);
+    "MLS_EX::target_values_exact", target_points_num);
   Kokkos::parallel_for(
-    "target_evaluation",
+    "MLS_EX::target_evaluation",
     Kokkos::RangePolicy<ExecutionSpace>(0, target_points_num),
     KOKKOS_LAMBDA (const int i) {
-      target_values_exact(i) = func(target_points(i));
+      target_values_exact(i) = manufactured_solution(target_points(i));
   });
 
   // Show difference
-  auto target_values_host = Kokkos::create_mirror_view(target_values);
+  auto target_values_host =
+    Kokkos::create_mirror_view(target_values);
   Kokkos::deep_copy(target_values_host, target_values);
-  auto target_values_exact_host = Kokkos::create_mirror_view(target_values_exact);
+  auto target_values_exact_host =
+    Kokkos::create_mirror_view(target_values_exact);
   Kokkos::deep_copy(target_values_exact_host, target_values_exact);
 
   for (int i = 0; i < target_points_num; i++) {
-    std::cout << "====\n"
-              << target_values_host(i) << '\n'
-              << target_values_exact_host(i) << "\n====\n";
+    double error =
+      Kokkos::abs(target_values_host(i) - target_values_exact_host(i));
+    std::cout << "==== Target " << i << '\n'
+              << "Interpolation: " << target_values_host(i) << '\n'
+              << "Real value   : " << target_values_exact_host(i) << '\n'
+              << "Absolute err.: " << error << "\n====\n";
   }
 }

From 4c22a2285b0807e2e089de3bdf1e23983cc55434 Mon Sep 17 00:00:00 2001
From: Yohann Bosqued <bosq.yohann@gmail.com>
Date: Tue, 1 Aug 2023 15:51:04 -0400
Subject: [PATCH 06/44] source points generation (4x4x4 grid in a 20x20x20 cube)

---
 .../moving_least_squares.cpp                  | 38 ++++++++++---------
 1 file changed, 20 insertions(+), 18 deletions(-)

diff --git a/examples/moving_least_squares/moving_least_squares.cpp b/examples/moving_least_squares/moving_least_squares.cpp
index be10663da..a30a8d47b 100644
--- a/examples/moving_least_squares/moving_least_squares.cpp
+++ b/examples/moving_least_squares/moving_least_squares.cpp
@@ -54,7 +54,8 @@ int main(int argc, char *argv[])
 {
   Kokkos::ScopeGuard guard(argc, argv);
   constexpr std::size_t num_neighbors = 5;
-  constexpr std::size_t source_points_num = 10;
+  constexpr std::size_t cube_side = 4;
+  constexpr std::size_t source_points_num = cube_side * cube_side * cube_side;
   constexpr std::size_t target_points_num = 4;
 
   auto source_points = Kokkos::View<ArborX::Point *, MemorySpace>(
@@ -63,27 +64,28 @@ int main(int argc, char *argv[])
   auto target_points = Kokkos::View<ArborX::Point *, MemorySpace>(
     Kokkos::view_alloc(Kokkos::WithoutInitializing, "MLS_EX::target_points"),
     target_points_num);
-  auto source_points_host = Kokkos::create_mirror_view(source_points);
   auto target_points_host = Kokkos::create_mirror_view(target_points);
 
-  // Generate source points
-  source_points_host(0) = ArborX::Point {  1.,  1.,  0. };
-  source_points_host(1) = ArborX::Point { -1.,  1.,  0. };
-  source_points_host(2) = ArborX::Point { -1., -1.,  0. };
-  source_points_host(3) = ArborX::Point {  1., -1.,  0. };
-  source_points_host(4) = ArborX::Point {  0.,  0.,  1. };
-  source_points_host(5) = ArborX::Point {  1.,  1.,  2. };
-  source_points_host(6) = ArborX::Point { -1.,  1.,  2. };
-  source_points_host(7) = ArborX::Point { -1., -1.,  2. };
-  source_points_host(8) = ArborX::Point {  1., -1.,  2. };
-  source_points_host(9) = ArborX::Point {  0.,  0., -1. };
-  Kokkos::deep_copy(source_points, source_points_host);
+  // Generate source points (Organized within a [-10, 10]^3 cube)
+  Kokkos::parallel_for(
+    "MLS_EX::source_points_init",
+    Kokkos::MDRangePolicy<ExecutionSpace, Kokkos::Rank<3>>(
+      {0, 0, 0}, {cube_side, cube_side, cube_side}),
+    KOKKOS_LAMBDA (const int i, const int j, const int k) {
+      source_points(i * cube_side * cube_side +
+                    j * cube_side +
+                    k ) = ArborX::Point {
+        20.f * (float(i) / (cube_side - 1) - .5f),
+        20.f * (float(j) / (cube_side - 1) - .5f),
+        20.f * (float(k) / (cube_side - 1) - .5f)
+      };
+  });
 
   // Generate target points
-  target_points_host(0) = ArborX::Point {  0.,  0.,  0. };
-  target_points_host(1) = ArborX::Point {  .5,  .5,  .5 };
-  target_points_host(2) = ArborX::Point { -.5,  .5,  1. };
-  target_points_host(3) = ArborX::Point {  .1, -.33, 1.5 };
+  target_points_host(0) = ArborX::Point {  0.f,   0.f,  0.f };
+  target_points_host(1) = ArborX::Point {  5.f,   5.f,  5.f };
+  target_points_host(2) = ArborX::Point { -5.f,   5.f,  3.f };
+  target_points_host(3) = ArborX::Point {  1.f, -3.3f,  7.f };
   Kokkos::deep_copy(target_points, target_points_host);
 
   // Organize source points as tree

From 25864002aad4cba9e1dbaffa359b9b5e4ca34173 Mon Sep 17 00:00:00 2001
From: Yohann Bosqued <bosq.yohann@gmail.com>
Date: Tue, 1 Aug 2023 15:59:51 -0400
Subject: [PATCH 07/44] double to float

---
 .../moving_least_squares.cpp                  | 48 +++++++++----------
 1 file changed, 24 insertions(+), 24 deletions(-)

diff --git a/examples/moving_least_squares/moving_least_squares.cpp b/examples/moving_least_squares/moving_least_squares.cpp
index a30a8d47b..bcec2991c 100644
--- a/examples/moving_least_squares/moving_least_squares.cpp
+++ b/examples/moving_least_squares/moving_least_squares.cpp
@@ -24,20 +24,20 @@ using MemorySpace = ExecutionSpace::memory_space;
 
 struct RBFWendland_0
 {
-  KOKKOS_INLINE_FUNCTION double operator()(double x)
+  KOKKOS_INLINE_FUNCTION float operator()(float x)
   {
     x /= _radius;
     return (1. - x) * (1. - x);
   }
 
-  double _radius;
+  float _radius;
 };
 
 struct MVPolynomialBasis_3D
 {
   static constexpr std::size_t size = 4;
 
-  KOKKOS_INLINE_FUNCTION Kokkos::Array<double, size>
+  KOKKOS_INLINE_FUNCTION Kokkos::Array<float, size>
   operator()(ArborX::Point const &p) const
   {
     return {{1., p[0], p[1], p[2]}};
@@ -45,7 +45,7 @@ struct MVPolynomialBasis_3D
 };
 
 // Function to approximate
-KOKKOS_INLINE_FUNCTION double manufactured_solution(ArborX::Point const &p)
+KOKKOS_INLINE_FUNCTION float manufactured_solution(ArborX::Point const &p)
 {
   return p[2] + p[0];
 } 
@@ -126,17 +126,17 @@ int main(int argc, char *argv[])
   });
 
   // Compute the radii for the weight (phi) vector
-  auto radii = Kokkos::View<double*, MemorySpace>(
+  auto radii = Kokkos::View<float*, MemorySpace>(
     "MLS_EX::radii", target_points_num);
-  constexpr double epsilon = std::numeric_limits<double>::epsilon();
+  constexpr float epsilon = std::numeric_limits<float>::epsilon();
   Kokkos::parallel_for(
     "MLS_EX::radii_computation",
     Kokkos::RangePolicy<ExecutionSpace>(0, target_points_num),
     KOKKOS_LAMBDA (const int i) {
-      double radius = 10. * epsilon;
+      float radius = 10. * epsilon;
 
       for (int j = 0; j < num_neighbors; j++) {
-        double norm = ArborX::Details::distance(
+        float norm = ArborX::Details::distance(
           tr_source_points(i, j),
           ArborX::Point{0., 0., 0.});
         radius = (radius < norm) ? norm : radius;
@@ -146,7 +146,7 @@ int main(int argc, char *argv[])
   });
 
   // Compute the weight (phi) vector
-  auto phi = Kokkos::View<double**, MemorySpace>(
+  auto phi = Kokkos::View<float**, MemorySpace>(
     "MLS_EX::phi", target_points_num, num_neighbors);
   Kokkos::parallel_for(
     "MLS_EX::phi_computation",
@@ -155,7 +155,7 @@ int main(int argc, char *argv[])
       auto rbf = RBFWendland_0 { radii(i) };
 
       for (int j = 0; j < phi.extent(1); j++) {
-        double norm = ArborX::Details::distance(
+        float norm = ArborX::Details::distance(
           tr_source_points(i, j),
           ArborX::Point{0., 0., 0.});
         phi(i, j) = rbf(norm);
@@ -163,7 +163,7 @@ int main(int argc, char *argv[])
   });
 
   // Compute multivariable Vandermonde (P) matrix
-  auto p = Kokkos::View<double***, MemorySpace>(
+  auto p = Kokkos::View<float***, MemorySpace>(
     "MLS_EX::vandermonde",
       target_points_num,
       num_neighbors,
@@ -182,7 +182,7 @@ int main(int argc, char *argv[])
   });
 
   // Compute moment (A) matrix
-  auto a = Kokkos::View<double***, MemorySpace>(
+  auto a = Kokkos::View<float***, MemorySpace>(
     "MLS_EX::A",
       target_points_num,
       MVPolynomialBasis_3D::size,
@@ -198,7 +198,7 @@ int main(int argc, char *argv[])
         MVPolynomialBasis_3D::size
     }),
     KOKKOS_LAMBDA (const int i, const int j, const int k) {
-      double tmp = 0;
+      float tmp = 0;
       for (int l = 0; l < num_neighbors; l++) {
         tmp += p(i, l, j) * p(i, l, k) * phi(i, l);
       }
@@ -211,7 +211,7 @@ int main(int argc, char *argv[])
   // first one are applied to the second
   // Kind of works, errors out quite often.
   // A better method should be employed (SVD?)
-  auto a_inv = Kokkos::View<double***, MemorySpace>(
+  auto a_inv = Kokkos::View<float***, MemorySpace>(
     "MLS_EX::A_inv",
       target_points_num,
       MVPolynomialBasis_3D::size,
@@ -237,7 +237,7 @@ int main(int argc, char *argv[])
         }
 
         // We divide the line with said value
-        double tmp = a(i, k, j);
+        float tmp = a(i, k, j);
         for (int l = 0; l < MVPolynomialBasis_3D::size; l++) {
           a(i, k, l) /= tmp;
           a_inv(i, k, l) /= tmp;
@@ -246,7 +246,7 @@ int main(int argc, char *argv[])
         // If line and column are not the same, move the column to the top
         if (k != j) {
           for (int l = 0; l < MVPolynomialBasis_3D::size; l++) {
-            double tmp = a(i, k, l);
+            float tmp = a(i, k, l);
             a(i, k, l) = a(i, j, l);
             a(i, j, l) = tmp;
 
@@ -259,7 +259,7 @@ int main(int argc, char *argv[])
         // Now, set at zero all other elements of the column (Ll <- Ll - a*Lj)
         for (int l = 0; l < MVPolynomialBasis_3D::size; l++) {
           if (l == j || a(i, l, j) == 0.0) continue;
-          double mul = a(i, l, j);
+          float mul = a(i, l, j);
 
           for (int m = 0; m < MVPolynomialBasis_3D::size; m++) {
             a(i, l, m) -= mul * a(i, j, m);
@@ -273,14 +273,14 @@ int main(int argc, char *argv[])
   });
 
   // Compute the coefficients
-  auto coeffs = Kokkos::View<double**, MemorySpace>(
+  auto coeffs = Kokkos::View<float**, MemorySpace>(
     "MLS_EX::coefficients", target_points_num, num_neighbors);
   Kokkos::parallel_for(
     "MLS_EX::coefficients_computation",
     Kokkos::MDRangePolicy<ExecutionSpace, Kokkos::Rank<2>>(
       {0, 0}, {target_points_num, num_neighbors}),
     KOKKOS_LAMBDA (const int i, const int j) {
-      double tmp = 0;
+      float tmp = 0;
 
       for (int k = 0; k < MVPolynomialBasis_3D::size; k++) {
         tmp += a_inv(i, 0, k) * p(i, j, k) * phi(i, j);
@@ -290,7 +290,7 @@ int main(int argc, char *argv[])
   });
 
   // Compute source values
-  auto source_values = Kokkos::View<double*, MemorySpace>(
+  auto source_values = Kokkos::View<float*, MemorySpace>(
     "MLS_EX::source_values", source_points_num);
   Kokkos::parallel_for(
     "MLS_EX::source_evaluation",
@@ -300,13 +300,13 @@ int main(int argc, char *argv[])
   });
 
   // Compute target values via interpolation
-  auto target_values = Kokkos::View<double*, MemorySpace>(
+  auto target_values = Kokkos::View<float*, MemorySpace>(
     "MLS_EX::target_values", target_points_num);
   Kokkos::parallel_for(
     "MLS_EX::target_interpolation",
     Kokkos::RangePolicy<ExecutionSpace>(0, target_points_num),
     KOKKOS_LAMBDA (const int i) {
-      double tmp = 0;
+      float tmp = 0;
       for (int j = offsets(i); j < offsets(i+i); j++) {
         tmp += coeffs(i, j - offsets(i)) * source_values(indices(j));
       }
@@ -314,7 +314,7 @@ int main(int argc, char *argv[])
   });
 
   // Compute target values via evaluation
-  auto target_values_exact = Kokkos::View<double*, MemorySpace>(
+  auto target_values_exact = Kokkos::View<float*, MemorySpace>(
     "MLS_EX::target_values_exact", target_points_num);
   Kokkos::parallel_for(
     "MLS_EX::target_evaluation",
@@ -332,7 +332,7 @@ int main(int argc, char *argv[])
   Kokkos::deep_copy(target_values_exact_host, target_values_exact);
 
   for (int i = 0; i < target_points_num; i++) {
-    double error =
+    float error =
       Kokkos::abs(target_values_host(i) - target_values_exact_host(i));
     std::cout << "==== Target " << i << '\n'
               << "Interpolation: " << target_values_host(i) << '\n'

From da4671bbb5142aa1b59341bbe5f209a6660fa745 Mon Sep 17 00:00:00 2001
From: Yohann Bosqued <bosq.yohann@gmail.com>
Date: Tue, 1 Aug 2023 16:04:47 -0400
Subject: [PATCH 08/44] clang format

---
 .../moving_least_squares.cpp                  | 405 +++++++++---------
 1 file changed, 203 insertions(+), 202 deletions(-)

diff --git a/examples/moving_least_squares/moving_least_squares.cpp b/examples/moving_least_squares/moving_least_squares.cpp
index bcec2991c..af23ba5eb 100644
--- a/examples/moving_least_squares/moving_least_squares.cpp
+++ b/examples/moving_least_squares/moving_least_squares.cpp
@@ -15,6 +15,7 @@
 // (http://dx.doi.org/10.1016/j.jcp.2015.11.055)
 
 #include <ArborX.hpp>
+
 #include <Kokkos_Core.hpp>
 
 #include <limits>
@@ -48,7 +49,7 @@ struct MVPolynomialBasis_3D
 KOKKOS_INLINE_FUNCTION float manufactured_solution(ArborX::Point const &p)
 {
   return p[2] + p[0];
-} 
+}
 
 int main(int argc, char *argv[])
 {
@@ -59,48 +60,45 @@ int main(int argc, char *argv[])
   constexpr std::size_t target_points_num = 4;
 
   auto source_points = Kokkos::View<ArborX::Point *, MemorySpace>(
-    Kokkos::view_alloc(Kokkos::WithoutInitializing, "MLS_EX::source_points"),
-    source_points_num);
+      Kokkos::view_alloc(Kokkos::WithoutInitializing, "MLS_EX::source_points"),
+      source_points_num);
   auto target_points = Kokkos::View<ArborX::Point *, MemorySpace>(
-    Kokkos::view_alloc(Kokkos::WithoutInitializing, "MLS_EX::target_points"),
-    target_points_num);
+      Kokkos::view_alloc(Kokkos::WithoutInitializing, "MLS_EX::target_points"),
+      target_points_num);
   auto target_points_host = Kokkos::create_mirror_view(target_points);
 
   // Generate source points (Organized within a [-10, 10]^3 cube)
   Kokkos::parallel_for(
-    "MLS_EX::source_points_init",
-    Kokkos::MDRangePolicy<ExecutionSpace, Kokkos::Rank<3>>(
-      {0, 0, 0}, {cube_side, cube_side, cube_side}),
-    KOKKOS_LAMBDA (const int i, const int j, const int k) {
-      source_points(i * cube_side * cube_side +
-                    j * cube_side +
-                    k ) = ArborX::Point {
-        20.f * (float(i) / (cube_side - 1) - .5f),
-        20.f * (float(j) / (cube_side - 1) - .5f),
-        20.f * (float(k) / (cube_side - 1) - .5f)
-      };
-  });
+      "MLS_EX::source_points_init",
+      Kokkos::MDRangePolicy<ExecutionSpace, Kokkos::Rank<3>>(
+          {0, 0, 0}, {cube_side, cube_side, cube_side}),
+      KOKKOS_LAMBDA(int const i, int const j, int const k) {
+        source_points(i * cube_side * cube_side + j * cube_side + k) =
+            ArborX::Point{20.f * (float(i) / (cube_side - 1) - .5f),
+                          20.f * (float(j) / (cube_side - 1) - .5f),
+                          20.f * (float(k) / (cube_side - 1) - .5f)};
+      });
 
   // Generate target points
-  target_points_host(0) = ArborX::Point {  0.f,   0.f,  0.f };
-  target_points_host(1) = ArborX::Point {  5.f,   5.f,  5.f };
-  target_points_host(2) = ArborX::Point { -5.f,   5.f,  3.f };
-  target_points_host(3) = ArborX::Point {  1.f, -3.3f,  7.f };
+  target_points_host(0) = ArborX::Point{0.f, 0.f, 0.f};
+  target_points_host(1) = ArborX::Point{5.f, 5.f, 5.f};
+  target_points_host(2) = ArborX::Point{-5.f, 5.f, 3.f};
+  target_points_host(3) = ArborX::Point{1.f, -3.3f, 7.f};
   Kokkos::deep_copy(target_points, target_points_host);
 
   // Organize source points as tree
   ArborX::BVH<MemorySpace> source_tree(ExecutionSpace{}, source_points);
-  
+
   // Create the queries
   // For each target point we query the closest source points
-  auto queries = Kokkos::View<ArborX::Nearest<ArborX::Point>*, MemorySpace>(
-    "MLS_EX::queries", target_points_num);
+  auto queries = Kokkos::View<ArborX::Nearest<ArborX::Point> *, MemorySpace>(
+      "MLS_EX::queries", target_points_num);
   Kokkos::parallel_for(
-    "MLS_EX::make_queries",
-    Kokkos::RangePolicy<ExecutionSpace>(0, target_points_num),
-    KOKKOS_LAMBDA (const int i) {
-      queries(i) = ArborX::nearest(target_points(i), num_neighbors);
-  });
+      "MLS_EX::make_queries",
+      Kokkos::RangePolicy<ExecutionSpace>(0, target_points_num),
+      KOKKOS_LAMBDA(int const i) {
+        queries(i) = ArborX::nearest(target_points(i), num_neighbors);
+      });
 
   // Perform the query
   auto indices = Kokkos::View<int *, MemorySpace>("MLS_EX::indices", 0);
@@ -110,230 +108,233 @@ int main(int argc, char *argv[])
   // Now that we have the neighbors, we recompute their position using
   // their target point as the origin.
   // This is used as an optimisation later in the algorithm
-  auto tr_source_points = Kokkos::View<ArborX::Point**, MemorySpace>(
-    "MLS_EX::tr_source_points", target_points_num, num_neighbors);
+  auto tr_source_points = Kokkos::View<ArborX::Point **, MemorySpace>(
+      "MLS_EX::tr_source_points", target_points_num, num_neighbors);
   Kokkos::parallel_for(
-    "MLS_EX::transform_source_points",
-    Kokkos::RangePolicy<ExecutionSpace>(0, target_points_num),
-    KOKKOS_LAMBDA (const int i) {
-      for (int j = offsets(i); j < offsets(i+1); j++) {
-        tr_source_points(i, j - offsets(i)) = ArborX::Point {
-          source_points(indices(j))[0] - target_points(i)[0],
-          source_points(indices(j))[1] - target_points(i)[1],
-          source_points(indices(j))[2] - target_points(i)[2],
-        };
-      }
-  });
+      "MLS_EX::transform_source_points",
+      Kokkos::RangePolicy<ExecutionSpace>(0, target_points_num),
+      KOKKOS_LAMBDA(int const i) {
+        for (int j = offsets(i); j < offsets(i + 1); j++)
+        {
+          tr_source_points(i, j - offsets(i)) = ArborX::Point{
+              source_points(indices(j))[0] - target_points(i)[0],
+              source_points(indices(j))[1] - target_points(i)[1],
+              source_points(indices(j))[2] - target_points(i)[2],
+          };
+        }
+      });
 
   // Compute the radii for the weight (phi) vector
-  auto radii = Kokkos::View<float*, MemorySpace>(
-    "MLS_EX::radii", target_points_num);
+  auto radii =
+      Kokkos::View<float *, MemorySpace>("MLS_EX::radii", target_points_num);
   constexpr float epsilon = std::numeric_limits<float>::epsilon();
   Kokkos::parallel_for(
-    "MLS_EX::radii_computation",
-    Kokkos::RangePolicy<ExecutionSpace>(0, target_points_num),
-    KOKKOS_LAMBDA (const int i) {
-      float radius = 10. * epsilon;
-
-      for (int j = 0; j < num_neighbors; j++) {
-        float norm = ArborX::Details::distance(
-          tr_source_points(i, j),
-          ArborX::Point{0., 0., 0.});
-        radius = (radius < norm) ? norm : radius;
-      }
+      "MLS_EX::radii_computation",
+      Kokkos::RangePolicy<ExecutionSpace>(0, target_points_num),
+      KOKKOS_LAMBDA(int const i) {
+        float radius = 10. * epsilon;
+
+        for (int j = 0; j < num_neighbors; j++)
+        {
+          float norm = ArborX::Details::distance(tr_source_points(i, j),
+                                                 ArborX::Point{0., 0., 0.});
+          radius = (radius < norm) ? norm : radius;
+        }
 
-      radii(i) = 1.1 * radius;
-  });
+        radii(i) = 1.1 * radius;
+      });
 
   // Compute the weight (phi) vector
-  auto phi = Kokkos::View<float**, MemorySpace>(
-    "MLS_EX::phi", target_points_num, num_neighbors);
+  auto phi = Kokkos::View<float **, MemorySpace>(
+      "MLS_EX::phi", target_points_num, num_neighbors);
   Kokkos::parallel_for(
-    "MLS_EX::phi_computation",
-    Kokkos::RangePolicy<ExecutionSpace>(0, phi.extent(0)),
-    KOKKOS_LAMBDA (const int i) {
-      auto rbf = RBFWendland_0 { radii(i) };
-
-      for (int j = 0; j < phi.extent(1); j++) {
-        float norm = ArborX::Details::distance(
-          tr_source_points(i, j),
-          ArborX::Point{0., 0., 0.});
-        phi(i, j) = rbf(norm);
-      }
-  });
+      "MLS_EX::phi_computation",
+      Kokkos::RangePolicy<ExecutionSpace>(0, phi.extent(0)),
+      KOKKOS_LAMBDA(int const i) {
+        auto rbf = RBFWendland_0{radii(i)};
+
+        for (int j = 0; j < phi.extent(1); j++)
+        {
+          float norm = ArborX::Details::distance(tr_source_points(i, j),
+                                                 ArborX::Point{0., 0., 0.});
+          phi(i, j) = rbf(norm);
+        }
+      });
 
   // Compute multivariable Vandermonde (P) matrix
-  auto p = Kokkos::View<float***, MemorySpace>(
-    "MLS_EX::vandermonde",
-      target_points_num,
-      num_neighbors,
-      MVPolynomialBasis_3D::size
-  );
+  auto p = Kokkos::View<float ***, MemorySpace>(
+      "MLS_EX::vandermonde", target_points_num, num_neighbors,
+      MVPolynomialBasis_3D::size);
   Kokkos::parallel_for(
-    "MLS_EX::vandermonde_computation",
-    Kokkos::MDRangePolicy<ExecutionSpace, Kokkos::Rank<2>>(
-      {0, 0}, {target_points_num, num_neighbors}),
-    KOKKOS_LAMBDA (const int i, const int j) {
-      auto basis = MVPolynomialBasis_3D{}(tr_source_points(i, j));
-
-      for (int k = 0; k < MVPolynomialBasis_3D::size; k++) {
-        p(i, j, k) = basis[k];
-      }
-  });
+      "MLS_EX::vandermonde_computation",
+      Kokkos::MDRangePolicy<ExecutionSpace, Kokkos::Rank<2>>(
+          {0, 0}, {target_points_num, num_neighbors}),
+      KOKKOS_LAMBDA(int const i, int const j) {
+        auto basis = MVPolynomialBasis_3D{}(tr_source_points(i, j));
+
+        for (int k = 0; k < MVPolynomialBasis_3D::size; k++)
+        {
+          p(i, j, k) = basis[k];
+        }
+      });
 
   // Compute moment (A) matrix
-  auto a = Kokkos::View<float***, MemorySpace>(
-    "MLS_EX::A",
-      target_points_num,
-      MVPolynomialBasis_3D::size,
-      MVPolynomialBasis_3D::size
-  );
+  auto a = Kokkos::View<float ***, MemorySpace>("MLS_EX::A", target_points_num,
+                                                MVPolynomialBasis_3D::size,
+                                                MVPolynomialBasis_3D::size);
   Kokkos::parallel_for(
-    "MLS_EX::A_computation",
-    Kokkos::MDRangePolicy<ExecutionSpace, Kokkos::Rank<3>>(
-      {0, 0, 0},
-      {
-        target_points_num,
-        MVPolynomialBasis_3D::size,
-        MVPolynomialBasis_3D::size
-    }),
-    KOKKOS_LAMBDA (const int i, const int j, const int k) {
-      float tmp = 0;
-      for (int l = 0; l < num_neighbors; l++) {
-        tmp += p(i, l, j) * p(i, l, k) * phi(i, l);
-      }
-
-      a(i, j, k) = tmp;
-  });
+      "MLS_EX::A_computation",
+      Kokkos::MDRangePolicy<ExecutionSpace, Kokkos::Rank<3>>(
+          {0, 0, 0}, {target_points_num, MVPolynomialBasis_3D::size,
+                      MVPolynomialBasis_3D::size}),
+      KOKKOS_LAMBDA(int const i, int const j, int const k) {
+        float tmp = 0;
+        for (int l = 0; l < num_neighbors; l++)
+        {
+          tmp += p(i, l, j) * p(i, l, k) * phi(i, l);
+        }
+
+        a(i, j, k) = tmp;
+      });
 
   // Inverse moment matrix
   // Gaussian inverse method. Both matrix are used and modifications on the
   // first one are applied to the second
   // Kind of works, errors out quite often.
   // A better method should be employed (SVD?)
-  auto a_inv = Kokkos::View<float***, MemorySpace>(
-    "MLS_EX::A_inv",
-      target_points_num,
-      MVPolynomialBasis_3D::size,
-      MVPolynomialBasis_3D::size
-  );
+  auto a_inv = Kokkos::View<float ***, MemorySpace>(
+      "MLS_EX::A_inv", target_points_num, MVPolynomialBasis_3D::size,
+      MVPolynomialBasis_3D::size);
   Kokkos::parallel_for(
-    "MLS_EX::A_inv_computation",
-    Kokkos::RangePolicy<ExecutionSpace>(0, target_points_num),
-    KOKKOS_LAMBDA (const int i) {
-      for (int j = 0; j < MVPolynomialBasis_3D::size; j++) {
-        for (int k = 0; k < MVPolynomialBasis_3D::size; k++) {
-          a_inv(i, j, k) = (j == k) * 1.;
-        }
-      }
-
-      // This needs to be done for every column
-      for (int j = 0; j < MVPolynomialBasis_3D::size; j++) {
-
-        // We find the line with a non-negative element on column j
-        int k = j;
-        for (; k < MVPolynomialBasis_3D::size; k++) {
-          if (a(i, k, j) != 0.0) break;
+      "MLS_EX::A_inv_computation",
+      Kokkos::RangePolicy<ExecutionSpace>(0, target_points_num),
+      KOKKOS_LAMBDA(int const i) {
+        for (int j = 0; j < MVPolynomialBasis_3D::size; j++)
+        {
+          for (int k = 0; k < MVPolynomialBasis_3D::size; k++)
+          {
+            a_inv(i, j, k) = (j == k) * 1.;
+          }
         }
 
-        // We divide the line with said value
-        float tmp = a(i, k, j);
-        for (int l = 0; l < MVPolynomialBasis_3D::size; l++) {
-          a(i, k, l) /= tmp;
-          a_inv(i, k, l) /= tmp;
-        }
+        // This needs to be done for every column
+        for (int j = 0; j < MVPolynomialBasis_3D::size; j++)
+        {
 
-        // If line and column are not the same, move the column to the top
-        if (k != j) {
-          for (int l = 0; l < MVPolynomialBasis_3D::size; l++) {
-            float tmp = a(i, k, l);
-            a(i, k, l) = a(i, j, l);
-            a(i, j, l) = tmp;
+          // We find the line with a non-negative element on column j
+          int k = j;
+          for (; k < MVPolynomialBasis_3D::size; k++)
+          {
+            if (a(i, k, j) != 0.0)
+              break;
+          }
 
-            tmp = a_inv(i, k, l);
-            a_inv(i, k, l) = a_inv(i, j, l);
-            a_inv(i, j, l) = tmp;
+          // We divide the line with said value
+          float tmp = a(i, k, j);
+          for (int l = 0; l < MVPolynomialBasis_3D::size; l++)
+          {
+            a(i, k, l) /= tmp;
+            a_inv(i, k, l) /= tmp;
           }
-        }
 
-        // Now, set at zero all other elements of the column (Ll <- Ll - a*Lj)
-        for (int l = 0; l < MVPolynomialBasis_3D::size; l++) {
-          if (l == j || a(i, l, j) == 0.0) continue;
-          float mul = a(i, l, j);
+          // If line and column are not the same, move the column to the top
+          if (k != j)
+          {
+            for (int l = 0; l < MVPolynomialBasis_3D::size; l++)
+            {
+              float tmp = a(i, k, l);
+              a(i, k, l) = a(i, j, l);
+              a(i, j, l) = tmp;
+
+              tmp = a_inv(i, k, l);
+              a_inv(i, k, l) = a_inv(i, j, l);
+              a_inv(i, j, l) = tmp;
+            }
+          }
 
-          for (int m = 0; m < MVPolynomialBasis_3D::size; m++) {
-            a(i, l, m) -= mul * a(i, j, m);
-            a_inv(i, l, m) -= mul * a_inv(i, j, m);
+          // Now, set at zero all other elements of the column (Ll <- Ll - a*Lj)
+          for (int l = 0; l < MVPolynomialBasis_3D::size; l++)
+          {
+            if (l == j || a(i, l, j) == 0.0)
+              continue;
+            float mul = a(i, l, j);
+
+            for (int m = 0; m < MVPolynomialBasis_3D::size; m++)
+            {
+              a(i, l, m) -= mul * a(i, j, m);
+              a_inv(i, l, m) -= mul * a_inv(i, j, m);
+            }
+            a(i, l, j) = 0.0;
           }
-          a(i, l, j) = 0.0;
-        }
 
-        // Now a_inv should contain the inverse of a
-      }
-  });
+          // Now a_inv should contain the inverse of a
+        }
+      });
 
   // Compute the coefficients
-  auto coeffs = Kokkos::View<float**, MemorySpace>(
-    "MLS_EX::coefficients", target_points_num, num_neighbors);
+  auto coeffs = Kokkos::View<float **, MemorySpace>(
+      "MLS_EX::coefficients", target_points_num, num_neighbors);
   Kokkos::parallel_for(
-    "MLS_EX::coefficients_computation",
-    Kokkos::MDRangePolicy<ExecutionSpace, Kokkos::Rank<2>>(
-      {0, 0}, {target_points_num, num_neighbors}),
-    KOKKOS_LAMBDA (const int i, const int j) {
-      float tmp = 0;
-
-      for (int k = 0; k < MVPolynomialBasis_3D::size; k++) {
-        tmp += a_inv(i, 0, k) * p(i, j, k) * phi(i, j);
-      }
+      "MLS_EX::coefficients_computation",
+      Kokkos::MDRangePolicy<ExecutionSpace, Kokkos::Rank<2>>(
+          {0, 0}, {target_points_num, num_neighbors}),
+      KOKKOS_LAMBDA(int const i, int const j) {
+        float tmp = 0;
+
+        for (int k = 0; k < MVPolynomialBasis_3D::size; k++)
+        {
+          tmp += a_inv(i, 0, k) * p(i, j, k) * phi(i, j);
+        }
 
-      coeffs(i, j) = tmp;
-  });
+        coeffs(i, j) = tmp;
+      });
 
   // Compute source values
-  auto source_values = Kokkos::View<float*, MemorySpace>(
-    "MLS_EX::source_values", source_points_num);
+  auto source_values = Kokkos::View<float *, MemorySpace>(
+      "MLS_EX::source_values", source_points_num);
   Kokkos::parallel_for(
-    "MLS_EX::source_evaluation",
-    Kokkos::RangePolicy<ExecutionSpace>(0, source_points_num),
-    KOKKOS_LAMBDA (const int i) {
-      source_values(i) = manufactured_solution(source_points(i));
-  });
+      "MLS_EX::source_evaluation",
+      Kokkos::RangePolicy<ExecutionSpace>(0, source_points_num),
+      KOKKOS_LAMBDA(int const i) {
+        source_values(i) = manufactured_solution(source_points(i));
+      });
 
   // Compute target values via interpolation
-  auto target_values = Kokkos::View<float*, MemorySpace>(
-    "MLS_EX::target_values", target_points_num);
+  auto target_values = Kokkos::View<float *, MemorySpace>(
+      "MLS_EX::target_values", target_points_num);
   Kokkos::parallel_for(
-    "MLS_EX::target_interpolation",
-    Kokkos::RangePolicy<ExecutionSpace>(0, target_points_num),
-    KOKKOS_LAMBDA (const int i) {
-      float tmp = 0;
-      for (int j = offsets(i); j < offsets(i+i); j++) {
-        tmp += coeffs(i, j - offsets(i)) * source_values(indices(j));
-      }
-      target_values(i) = tmp;
-  });
+      "MLS_EX::target_interpolation",
+      Kokkos::RangePolicy<ExecutionSpace>(0, target_points_num),
+      KOKKOS_LAMBDA(int const i) {
+        float tmp = 0;
+        for (int j = offsets(i); j < offsets(i + i); j++)
+        {
+          tmp += coeffs(i, j - offsets(i)) * source_values(indices(j));
+        }
+        target_values(i) = tmp;
+      });
 
   // Compute target values via evaluation
-  auto target_values_exact = Kokkos::View<float*, MemorySpace>(
-    "MLS_EX::target_values_exact", target_points_num);
+  auto target_values_exact = Kokkos::View<float *, MemorySpace>(
+      "MLS_EX::target_values_exact", target_points_num);
   Kokkos::parallel_for(
-    "MLS_EX::target_evaluation",
-    Kokkos::RangePolicy<ExecutionSpace>(0, target_points_num),
-    KOKKOS_LAMBDA (const int i) {
-      target_values_exact(i) = manufactured_solution(target_points(i));
-  });
+      "MLS_EX::target_evaluation",
+      Kokkos::RangePolicy<ExecutionSpace>(0, target_points_num),
+      KOKKOS_LAMBDA(int const i) {
+        target_values_exact(i) = manufactured_solution(target_points(i));
+      });
 
   // Show difference
-  auto target_values_host =
-    Kokkos::create_mirror_view(target_values);
+  auto target_values_host = Kokkos::create_mirror_view(target_values);
   Kokkos::deep_copy(target_values_host, target_values);
   auto target_values_exact_host =
-    Kokkos::create_mirror_view(target_values_exact);
+      Kokkos::create_mirror_view(target_values_exact);
   Kokkos::deep_copy(target_values_exact_host, target_values_exact);
 
-  for (int i = 0; i < target_points_num; i++) {
+  for (int i = 0; i < target_points_num; i++)
+  {
     float error =
-      Kokkos::abs(target_values_host(i) - target_values_exact_host(i));
+        Kokkos::abs(target_values_host(i) - target_values_exact_host(i));
     std::cout << "==== Target " << i << '\n'
               << "Interpolation: " << target_values_host(i) << '\n'
               << "Real value   : " << target_values_exact_host(i) << '\n'

From 4f44e1c453c06cceb9b9161edecb9f5052dfc725 Mon Sep 17 00:00:00 2001
From: Yohann Bosqued <bosq.yohann@gmail.com>
Date: Wed, 2 Aug 2023 11:46:57 -0400
Subject: [PATCH 09/44] Fixup (memory and kernel names, floats, error, exec
 space)

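---
Review note (text between this separator and the diff is not part of the
commit): the "target_interpolation" kernel appears below only as unchanged
context, so this patch keeps its loop
`for (int j = offsets(i); j < offsets(i + i); j++)` as is. The upper bound
looks like a typo for `offsets(i + 1)`, the form used by the
"transform_source_points" loop. With `i + i` the loop is empty for i == 0 and
reads an unrelated (possibly out-of-range) entry of `offsets` for larger i.
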
 examples/moving_least_squares/CMakeLists.txt  |   6 +-
 .../moving_least_squares.cpp                  | 119 +++++++++---------
 2 files changed, 65 insertions(+), 60 deletions(-)

diff --git a/examples/moving_least_squares/CMakeLists.txt b/examples/moving_least_squares/CMakeLists.txt
index d9d9c6e45..a22bdbe30 100644
--- a/examples/moving_least_squares/CMakeLists.txt
+++ b/examples/moving_least_squares/CMakeLists.txt
@@ -1,3 +1,3 @@
-add_executable(ArborX_Example_MovingLeastSquare.exe moving_least_squares.cpp)
-target_link_libraries(ArborX_Example_MovingLeastSquare.exe ArborX::ArborX)
-add_test(NAME ArborX_Example_MovingLeastSquare COMMAND ArborX_Example_MovingLeastSquare.exe)
+add_executable(ArborX_Example_MovingLeastSquares.exe moving_least_squares.cpp)
+target_link_libraries(ArborX_Example_MovingLeastSquares.exe ArborX::ArborX)
+add_test(NAME ArborX_Example_MovingLeastSquares COMMAND ArborX_Example_MovingLeastSquares.exe)
diff --git a/examples/moving_least_squares/moving_least_squares.cpp b/examples/moving_least_squares/moving_least_squares.cpp
index af23ba5eb..847a9ac21 100644
--- a/examples/moving_least_squares/moving_least_squares.cpp
+++ b/examples/moving_least_squares/moving_least_squares.cpp
@@ -28,7 +28,7 @@ struct RBFWendland_0
   KOKKOS_INLINE_FUNCTION float operator()(float x)
   {
     x /= _radius;
-    return (1. - x) * (1. - x);
+    return (1.f - x) * (1.f - x);
   }
 
   float _radius;
@@ -41,7 +41,7 @@ struct MVPolynomialBasis_3D
   KOKKOS_INLINE_FUNCTION Kokkos::Array<float, size>
   operator()(ArborX::Point const &p) const
   {
-    return {{1., p[0], p[1], p[2]}};
+    return {{1.f, p[0], p[1], p[2]}};
   }
 };
 
@@ -59,19 +59,21 @@ int main(int argc, char *argv[])
   constexpr std::size_t source_points_num = cube_side * cube_side * cube_side;
   constexpr std::size_t target_points_num = 4;
 
+  ExecutionSpace space{};
+
   auto source_points = Kokkos::View<ArborX::Point *, MemorySpace>(
-      Kokkos::view_alloc(Kokkos::WithoutInitializing, "MLS_EX::source_points"),
+      Kokkos::view_alloc(Kokkos::WithoutInitializing, "Example::source_points"),
       source_points_num);
   auto target_points = Kokkos::View<ArborX::Point *, MemorySpace>(
-      Kokkos::view_alloc(Kokkos::WithoutInitializing, "MLS_EX::target_points"),
+      Kokkos::view_alloc(Kokkos::WithoutInitializing, "Example::target_points"),
       target_points_num);
   auto target_points_host = Kokkos::create_mirror_view(target_points);
 
   // Generate source points (Organized within a [-10, 10]^3 cube)
   Kokkos::parallel_for(
-      "MLS_EX::source_points_init",
+      "Example::source_points_init",
       Kokkos::MDRangePolicy<ExecutionSpace, Kokkos::Rank<3>>(
-          {0, 0, 0}, {cube_side, cube_side, cube_side}),
+          space, {0, 0, 0}, {cube_side, cube_side, cube_side}),
       KOKKOS_LAMBDA(int const i, int const j, int const k) {
         source_points(i * cube_side * cube_side + j * cube_side + k) =
             ArborX::Point{20.f * (float(i) / (cube_side - 1) - .5f),
@@ -87,32 +89,32 @@ int main(int argc, char *argv[])
   Kokkos::deep_copy(target_points, target_points_host);
 
   // Organize source points as tree
-  ArborX::BVH<MemorySpace> source_tree(ExecutionSpace{}, source_points);
+  ArborX::BVH<MemorySpace> source_tree(space, source_points);
 
   // Create the queries
   // For each target point we query the closest source points
   auto queries = Kokkos::View<ArborX::Nearest<ArborX::Point> *, MemorySpace>(
-      "MLS_EX::queries", target_points_num);
+      "Example::queries", target_points_num);
   Kokkos::parallel_for(
-      "MLS_EX::make_queries",
-      Kokkos::RangePolicy<ExecutionSpace>(0, target_points_num),
+      "Example::make_queries",
+      Kokkos::RangePolicy<ExecutionSpace>(space, 0, target_points_num),
       KOKKOS_LAMBDA(int const i) {
         queries(i) = ArborX::nearest(target_points(i), num_neighbors);
       });
 
   // Perform the query
-  auto indices = Kokkos::View<int *, MemorySpace>("MLS_EX::indices", 0);
-  auto offsets = Kokkos::View<int *, MemorySpace>("MLS_EX::offsets", 0);
-  source_tree.query(ExecutionSpace{}, queries, indices, offsets);
+  auto indices = Kokkos::View<int *, MemorySpace>("Example::indices", 0);
+  auto offsets = Kokkos::View<int *, MemorySpace>("Example::offsets", 0);
+  source_tree.query(space, queries, indices, offsets);
 
   // Now that we have the neighbors, we recompute their position using
   // their target point as the origin.
   // This is used as an optimisation later in the algorithm
   auto tr_source_points = Kokkos::View<ArborX::Point **, MemorySpace>(
-      "MLS_EX::tr_source_points", target_points_num, num_neighbors);
+      "Example::tr_source_points", target_points_num, num_neighbors);
   Kokkos::parallel_for(
-      "MLS_EX::transform_source_points",
-      Kokkos::RangePolicy<ExecutionSpace>(0, target_points_num),
+      "Example::transform_source_points",
+      Kokkos::RangePolicy<ExecutionSpace>(space, 0, target_points_num),
       KOKKOS_LAMBDA(int const i) {
         for (int j = offsets(i); j < offsets(i + 1); j++)
         {
@@ -126,49 +128,49 @@ int main(int argc, char *argv[])
 
   // Compute the radii for the weight (phi) vector
   auto radii =
-      Kokkos::View<float *, MemorySpace>("MLS_EX::radii", target_points_num);
+      Kokkos::View<float *, MemorySpace>("Example::radii", target_points_num);
   constexpr float epsilon = std::numeric_limits<float>::epsilon();
   Kokkos::parallel_for(
-      "MLS_EX::radii_computation",
-      Kokkos::RangePolicy<ExecutionSpace>(0, target_points_num),
+      "Example::radii_computation",
+      Kokkos::RangePolicy<ExecutionSpace>(space, 0, target_points_num),
       KOKKOS_LAMBDA(int const i) {
-        float radius = 10. * epsilon;
+        float radius = 10.f * epsilon;
 
         for (int j = 0; j < num_neighbors; j++)
         {
           float norm = ArborX::Details::distance(tr_source_points(i, j),
-                                                 ArborX::Point{0., 0., 0.});
+                                                 ArborX::Point{0.f, 0.f, 0.f});
           radius = (radius < norm) ? norm : radius;
         }
 
-        radii(i) = 1.1 * radius;
+        radii(i) = 1.1f * radius;
       });
 
   // Compute the weight (phi) vector
   auto phi = Kokkos::View<float **, MemorySpace>(
-      "MLS_EX::phi", target_points_num, num_neighbors);
+      "Example::phi", target_points_num, num_neighbors);
   Kokkos::parallel_for(
-      "MLS_EX::phi_computation",
-      Kokkos::RangePolicy<ExecutionSpace>(0, phi.extent(0)),
+      "Example::phi_computation",
+      Kokkos::RangePolicy<ExecutionSpace>(space, 0, target_points_num),
       KOKKOS_LAMBDA(int const i) {
         auto rbf = RBFWendland_0{radii(i)};
 
-        for (int j = 0; j < phi.extent(1); j++)
+        for (int j = 0; j < num_neighbors; j++)
         {
           float norm = ArborX::Details::distance(tr_source_points(i, j),
-                                                 ArborX::Point{0., 0., 0.});
+                                                 ArborX::Point{0.f, 0.f, 0.f});
           phi(i, j) = rbf(norm);
         }
       });
 
   // Compute multivariable Vandermonde (P) matrix
   auto p = Kokkos::View<float ***, MemorySpace>(
-      "MLS_EX::vandermonde", target_points_num, num_neighbors,
+      "Example::vandermonde", target_points_num, num_neighbors,
       MVPolynomialBasis_3D::size);
   Kokkos::parallel_for(
-      "MLS_EX::vandermonde_computation",
+      "Example::vandermonde_computation",
       Kokkos::MDRangePolicy<ExecutionSpace, Kokkos::Rank<2>>(
-          {0, 0}, {target_points_num, num_neighbors}),
+          space, {0, 0}, {target_points_num, num_neighbors}),
       KOKKOS_LAMBDA(int const i, int const j) {
         auto basis = MVPolynomialBasis_3D{}(tr_source_points(i, j));
 
@@ -179,14 +181,15 @@ int main(int argc, char *argv[])
       });
 
   // Compute moment (A) matrix
-  auto a = Kokkos::View<float ***, MemorySpace>("MLS_EX::A", target_points_num,
+  auto a = Kokkos::View<float ***, MemorySpace>("Example::A", target_points_num,
                                                 MVPolynomialBasis_3D::size,
                                                 MVPolynomialBasis_3D::size);
   Kokkos::parallel_for(
-      "MLS_EX::A_computation",
+      "Example::A_computation",
       Kokkos::MDRangePolicy<ExecutionSpace, Kokkos::Rank<3>>(
-          {0, 0, 0}, {target_points_num, MVPolynomialBasis_3D::size,
-                      MVPolynomialBasis_3D::size}),
+          space, {0, 0, 0},
+          {target_points_num, MVPolynomialBasis_3D::size,
+           MVPolynomialBasis_3D::size}),
       KOKKOS_LAMBDA(int const i, int const j, int const k) {
         float tmp = 0;
         for (int l = 0; l < num_neighbors; l++)
@@ -203,17 +206,17 @@ int main(int argc, char *argv[])
   // Kind of works, errors out quite often.
   // A better method should be employed (SVD?)
   auto a_inv = Kokkos::View<float ***, MemorySpace>(
-      "MLS_EX::A_inv", target_points_num, MVPolynomialBasis_3D::size,
+      "Example::A_inv", target_points_num, MVPolynomialBasis_3D::size,
       MVPolynomialBasis_3D::size);
   Kokkos::parallel_for(
-      "MLS_EX::A_inv_computation",
-      Kokkos::RangePolicy<ExecutionSpace>(0, target_points_num),
+      "Example::A_inv_computation",
+      Kokkos::RangePolicy<ExecutionSpace>(space, 0, target_points_num),
       KOKKOS_LAMBDA(int const i) {
         for (int j = 0; j < MVPolynomialBasis_3D::size; j++)
         {
           for (int k = 0; k < MVPolynomialBasis_3D::size; k++)
           {
-            a_inv(i, j, k) = (j == k) * 1.;
+            a_inv(i, j, k) = (j == k) * 1.f;
           }
         }
 
@@ -225,7 +228,7 @@ int main(int argc, char *argv[])
           int k = j;
           for (; k < MVPolynomialBasis_3D::size; k++)
           {
-            if (a(i, k, j) != 0.0)
+            if (a(i, k, j) != 0.f)
               break;
           }
 
@@ -255,7 +258,7 @@ int main(int argc, char *argv[])
           // Now, set at zero all other elements of the column (Ll <- Ll - a*Lj)
           for (int l = 0; l < MVPolynomialBasis_3D::size; l++)
           {
-            if (l == j || a(i, l, j) == 0.0)
+            if (l == j || a(i, l, j) == 0.f)
               continue;
             float mul = a(i, l, j);
 
@@ -264,7 +267,7 @@ int main(int argc, char *argv[])
               a(i, l, m) -= mul * a(i, j, m);
               a_inv(i, l, m) -= mul * a_inv(i, j, m);
             }
-            a(i, l, j) = 0.0;
+            a(i, l, j) = 0.f;
           }
 
           // Now a_inv should contain the inverse of a
@@ -273,11 +276,11 @@ int main(int argc, char *argv[])
 
   // Compute the coefficients
   auto coeffs = Kokkos::View<float **, MemorySpace>(
-      "MLS_EX::coefficients", target_points_num, num_neighbors);
+      "Example::coefficients", target_points_num, num_neighbors);
   Kokkos::parallel_for(
-      "MLS_EX::coefficients_computation",
+      "Example::coefficients_computation",
       Kokkos::MDRangePolicy<ExecutionSpace, Kokkos::Rank<2>>(
-          {0, 0}, {target_points_num, num_neighbors}),
+          space, {0, 0}, {target_points_num, num_neighbors}),
       KOKKOS_LAMBDA(int const i, int const j) {
         float tmp = 0;
 
@@ -291,20 +294,20 @@ int main(int argc, char *argv[])
 
   // Compute source values
   auto source_values = Kokkos::View<float *, MemorySpace>(
-      "MLS_EX::source_values", source_points_num);
+      "Example::source_values", source_points_num);
   Kokkos::parallel_for(
-      "MLS_EX::source_evaluation",
-      Kokkos::RangePolicy<ExecutionSpace>(0, source_points_num),
+      "Example::source_evaluation",
+      Kokkos::RangePolicy<ExecutionSpace>(space, 0, source_points_num),
       KOKKOS_LAMBDA(int const i) {
         source_values(i) = manufactured_solution(source_points(i));
       });
 
   // Compute target values via interpolation
   auto target_values = Kokkos::View<float *, MemorySpace>(
-      "MLS_EX::target_values", target_points_num);
+      "Example::target_values", target_points_num);
   Kokkos::parallel_for(
-      "MLS_EX::target_interpolation",
-      Kokkos::RangePolicy<ExecutionSpace>(0, target_points_num),
+      "Example::target_interpolation",
+      Kokkos::RangePolicy<ExecutionSpace>(space, 0, target_points_num),
       KOKKOS_LAMBDA(int const i) {
         float tmp = 0;
         for (int j = offsets(i); j < offsets(i + i); j++)
@@ -316,10 +319,10 @@ int main(int argc, char *argv[])
 
   // Compute target values via evaluation
   auto target_values_exact = Kokkos::View<float *, MemorySpace>(
-      "MLS_EX::target_values_exact", target_points_num);
+      "Example::target_values_exact", target_points_num);
   Kokkos::parallel_for(
-      "MLS_EX::target_evaluation",
-      Kokkos::RangePolicy<ExecutionSpace>(0, target_points_num),
+      "Example::target_evaluation",
+      Kokkos::RangePolicy<ExecutionSpace>(space, 0, target_points_num),
       KOKKOS_LAMBDA(int const i) {
         target_values_exact(i) = manufactured_solution(target_points(i));
       });
@@ -331,13 +334,15 @@ int main(int argc, char *argv[])
       Kokkos::create_mirror_view(target_values_exact);
   Kokkos::deep_copy(target_values_exact_host, target_values_exact);
 
+  float error = 0.f;
   for (int i = 0; i < target_points_num; i++)
   {
-    float error =
-        Kokkos::abs(target_values_host(i) - target_values_exact_host(i));
+    error = Kokkos::max(
+        Kokkos::abs(target_values_host(i) - target_values_exact_host(i)),
+        error);
     std::cout << "==== Target " << i << '\n'
               << "Interpolation: " << target_values_host(i) << '\n'
-              << "Real value   : " << target_values_exact_host(i) << '\n'
-              << "Absolute err.: " << error << "\n====\n";
+              << "Real value   : " << target_values_exact_host(i) << '\n';
   }
+  std::cout << "====\nMaximum error: " << error << std::endl;
 }

From 15b150bc3b275cef382791a3d48ca42f3bc2d278 Mon Sep 17 00:00:00 2001
From: Yohann Bosqued <bosq.yohann@gmail.com>
Date: Wed, 2 Aug 2023 13:35:41 -0400
Subject: [PATCH 10/44] Replace predicates array with AccessTraits

---
 .../moving_least_squares.cpp                  | 38 ++++++++++++-------
 1 file changed, 25 insertions(+), 13 deletions(-)

diff --git a/examples/moving_least_squares/moving_least_squares.cpp b/examples/moving_least_squares/moving_least_squares.cpp
index 847a9ac21..4440e8ef7 100644
--- a/examples/moving_least_squares/moving_least_squares.cpp
+++ b/examples/moving_least_squares/moving_least_squares.cpp
@@ -45,6 +45,28 @@ struct MVPolynomialBasis_3D
   }
 };
 
+struct TargetPoints
+{
+  Kokkos::View<ArborX::Point *, MemorySpace> target_points;
+  std::size_t num_neighbors;
+};
+
+template <>
+struct ArborX::AccessTraits<TargetPoints, ArborX::PredicatesTag>
+{
+  static KOKKOS_FUNCTION std::size_t size(TargetPoints const &tp)
+  {
+    return tp.target_points.extent(0);
+  }
+
+  static KOKKOS_FUNCTION auto get(TargetPoints const &tp, std::size_t i)
+  {
+    return ArborX::nearest(tp.target_points(i), tp.num_neighbors);
+  }
+
+  using memory_space = MemorySpace;
+};
+
 // Function to approximate
 KOKKOS_INLINE_FUNCTION float manufactured_solution(ArborX::Point const &p)
 {
@@ -54,7 +76,7 @@ KOKKOS_INLINE_FUNCTION float manufactured_solution(ArborX::Point const &p)
 int main(int argc, char *argv[])
 {
   Kokkos::ScopeGuard guard(argc, argv);
-  constexpr std::size_t num_neighbors = 5;
+  constexpr std::size_t num_neighbors = 7;
   constexpr std::size_t cube_side = 4;
   constexpr std::size_t source_points_num = cube_side * cube_side * cube_side;
   constexpr std::size_t target_points_num = 4;
@@ -91,21 +113,11 @@ int main(int argc, char *argv[])
   // Organize source points as tree
   ArborX::BVH<MemorySpace> source_tree(space, source_points);
 
-  // Create the queries
-  // For each target point we query the closest source points
-  auto queries = Kokkos::View<ArborX::Nearest<ArborX::Point> *, MemorySpace>(
-      "Example::queries", target_points_num);
-  Kokkos::parallel_for(
-      "Example::make_queries",
-      Kokkos::RangePolicy<ExecutionSpace>(space, 0, target_points_num),
-      KOKKOS_LAMBDA(int const i) {
-        queries(i) = ArborX::nearest(target_points(i), num_neighbors);
-      });
-
   // Perform the query
   auto indices = Kokkos::View<int *, MemorySpace>("Example::indices", 0);
   auto offsets = Kokkos::View<int *, MemorySpace>("Example::offsets", 0);
-  source_tree.query(space, queries, indices, offsets);
+  source_tree.query(space, TargetPoints{target_points, num_neighbors}, indices,
+                    offsets);
 
   // Now that we have the neighbors, we recompute their position using
   // their target point as the origin.

From d69f348ad305907b4014b975fe4f46d34d115cb7 Mon Sep 17 00:00:00 2001
From: Yohann Bosqued <bosq.yohann@gmail.com>
Date: Wed, 2 Aug 2023 13:42:46 -0400
Subject: [PATCH 11/44] Correct declaration convention

---
 .../moving_least_squares.cpp                  | 47 +++++++++----------
 1 file changed, 23 insertions(+), 24 deletions(-)

diff --git a/examples/moving_least_squares/moving_least_squares.cpp b/examples/moving_least_squares/moving_least_squares.cpp
index 4440e8ef7..182c66e90 100644
--- a/examples/moving_least_squares/moving_least_squares.cpp
+++ b/examples/moving_least_squares/moving_least_squares.cpp
@@ -83,10 +83,10 @@ int main(int argc, char *argv[])
 
   ExecutionSpace space{};
 
-  auto source_points = Kokkos::View<ArborX::Point *, MemorySpace>(
+  Kokkos::View<ArborX::Point *, MemorySpace> source_points(
       Kokkos::view_alloc(Kokkos::WithoutInitializing, "Example::source_points"),
       source_points_num);
-  auto target_points = Kokkos::View<ArborX::Point *, MemorySpace>(
+  Kokkos::View<ArborX::Point *, MemorySpace> target_points(
       Kokkos::view_alloc(Kokkos::WithoutInitializing, "Example::target_points"),
       target_points_num);
   auto target_points_host = Kokkos::create_mirror_view(target_points);
@@ -114,15 +114,15 @@ int main(int argc, char *argv[])
   ArborX::BVH<MemorySpace> source_tree(space, source_points);
 
   // Perform the query
-  auto indices = Kokkos::View<int *, MemorySpace>("Example::indices", 0);
-  auto offsets = Kokkos::View<int *, MemorySpace>("Example::offsets", 0);
+  Kokkos::View<int *, MemorySpace> indices("Example::indices", 0);
+  Kokkos::View<int *, MemorySpace> offsets("Example::offsets", 0);
   source_tree.query(space, TargetPoints{target_points, num_neighbors}, indices,
                     offsets);
 
   // Now that we have the neighbors, we recompute their position using
   // their target point as the origin.
   // This is used as an optimisation later in the algorithm
-  auto tr_source_points = Kokkos::View<ArborX::Point **, MemorySpace>(
+  Kokkos::View<ArborX::Point **, MemorySpace> tr_source_points(
       "Example::tr_source_points", target_points_num, num_neighbors);
   Kokkos::parallel_for(
       "Example::transform_source_points",
@@ -139,8 +139,7 @@ int main(int argc, char *argv[])
       });
 
   // Compute the radii for the weight (phi) vector
-  auto radii =
-      Kokkos::View<float *, MemorySpace>("Example::radii", target_points_num);
+  Kokkos::View<float *, MemorySpace> radii("Example::radii", target_points_num);
   constexpr float epsilon = std::numeric_limits<float>::epsilon();
   Kokkos::parallel_for(
       "Example::radii_computation",
@@ -159,13 +158,13 @@ int main(int argc, char *argv[])
       });
 
   // Compute the weight (phi) vector
-  auto phi = Kokkos::View<float **, MemorySpace>(
-      "Example::phi", target_points_num, num_neighbors);
+  Kokkos::View<float **, MemorySpace> phi("Example::phi", target_points_num,
+                                          num_neighbors);
   Kokkos::parallel_for(
       "Example::phi_computation",
       Kokkos::RangePolicy<ExecutionSpace>(space, 0, target_points_num),
       KOKKOS_LAMBDA(int const i) {
-        auto rbf = RBFWendland_0{radii(i)};
+        RBFWendland_0 rbf{radii(i)};
 
         for (int j = 0; j < num_neighbors; j++)
         {
@@ -176,9 +175,9 @@ int main(int argc, char *argv[])
       });
 
   // Compute multivariable Vandermonde (P) matrix
-  auto p = Kokkos::View<float ***, MemorySpace>(
-      "Example::vandermonde", target_points_num, num_neighbors,
-      MVPolynomialBasis_3D::size);
+  Kokkos::View<float ***, MemorySpace> p("Example::vandermonde",
+                                         target_points_num, num_neighbors,
+                                         MVPolynomialBasis_3D::size);
   Kokkos::parallel_for(
       "Example::vandermonde_computation",
       Kokkos::MDRangePolicy<ExecutionSpace, Kokkos::Rank<2>>(
@@ -193,9 +192,9 @@ int main(int argc, char *argv[])
       });
 
   // Compute moment (A) matrix
-  auto a = Kokkos::View<float ***, MemorySpace>("Example::A", target_points_num,
-                                                MVPolynomialBasis_3D::size,
-                                                MVPolynomialBasis_3D::size);
+  Kokkos::View<float ***, MemorySpace> a("Example::A", target_points_num,
+                                         MVPolynomialBasis_3D::size,
+                                         MVPolynomialBasis_3D::size);
   Kokkos::parallel_for(
       "Example::A_computation",
       Kokkos::MDRangePolicy<ExecutionSpace, Kokkos::Rank<3>>(
@@ -217,7 +216,7 @@ int main(int argc, char *argv[])
   // first one are applied to the second
   // Kind of works, errors out quite often.
   // A better method should be employed (SVD?)
-  auto a_inv = Kokkos::View<float ***, MemorySpace>(
+  Kokkos::View<float ***, MemorySpace> a_inv(
       "Example::A_inv", target_points_num, MVPolynomialBasis_3D::size,
       MVPolynomialBasis_3D::size);
   Kokkos::parallel_for(
@@ -287,8 +286,8 @@ int main(int argc, char *argv[])
       });
 
   // Compute the coefficients
-  auto coeffs = Kokkos::View<float **, MemorySpace>(
-      "Example::coefficients", target_points_num, num_neighbors);
+  Kokkos::View<float **, MemorySpace> coeffs("Example::coefficients",
+                                             target_points_num, num_neighbors);
   Kokkos::parallel_for(
       "Example::coefficients_computation",
       Kokkos::MDRangePolicy<ExecutionSpace, Kokkos::Rank<2>>(
@@ -305,8 +304,8 @@ int main(int argc, char *argv[])
       });
 
   // Compute source values
-  auto source_values = Kokkos::View<float *, MemorySpace>(
-      "Example::source_values", source_points_num);
+  Kokkos::View<float *, MemorySpace> source_values("Example::source_values",
+                                                   source_points_num);
   Kokkos::parallel_for(
       "Example::source_evaluation",
       Kokkos::RangePolicy<ExecutionSpace>(space, 0, source_points_num),
@@ -315,8 +314,8 @@ int main(int argc, char *argv[])
       });
 
   // Compute target values via interpolation
-  auto target_values = Kokkos::View<float *, MemorySpace>(
-      "Example::target_values", target_points_num);
+  Kokkos::View<float *, MemorySpace> target_values("Example::target_values",
+                                                   target_points_num);
   Kokkos::parallel_for(
       "Example::target_interpolation",
       Kokkos::RangePolicy<ExecutionSpace>(space, 0, target_points_num),
@@ -330,7 +329,7 @@ int main(int argc, char *argv[])
       });
 
   // Compute target values via evaluation
-  auto target_values_exact = Kokkos::View<float *, MemorySpace>(
+  Kokkos::View<float *, MemorySpace> target_values_exact(
       "Example::target_values_exact", target_points_num);
   Kokkos::parallel_for(
       "Example::target_evaluation",

From f6cd686882a2d99119a07ecee169237521f104f5 Mon Sep 17 00:00:00 2001
From: Yohann Bosqued <bosq.yohann@gmail.com>
Date: Wed, 2 Aug 2023 15:15:11 -0400
Subject: [PATCH 12/44] Typo fix, execution spaces in deep copies and range
 policies simplification

---
 .../moving_least_squares.cpp                  | 49 ++++++++++---------
 1 file changed, 25 insertions(+), 24 deletions(-)

diff --git a/examples/moving_least_squares/moving_least_squares.cpp b/examples/moving_least_squares/moving_least_squares.cpp
index 182c66e90..9962c0b21 100644
--- a/examples/moving_least_squares/moving_least_squares.cpp
+++ b/examples/moving_least_squares/moving_least_squares.cpp
@@ -36,12 +36,13 @@ struct RBFWendland_0
 
 struct MVPolynomialBasis_3D
 {
-  static constexpr std::size_t size = 4;
+  static constexpr std::size_t size = 10;
 
   KOKKOS_INLINE_FUNCTION Kokkos::Array<float, size>
   operator()(ArborX::Point const &p) const
   {
-    return {{1.f, p[0], p[1], p[2]}};
+    return {{1.f, p[0], p[1], p[2], p[0] * p[0], p[0] * p[1], p[0] * p[2],
+             p[1] * p[1], p[1] * p[2], p[2] * p[2]}};
   }
 };
 
@@ -70,13 +71,13 @@ struct ArborX::AccessTraits<TargetPoints, ArborX::PredicatesTag>
 // Function to approximate
 KOKKOS_INLINE_FUNCTION float manufactured_solution(ArborX::Point const &p)
 {
-  return p[2] + p[0];
+  return p[2] * p[1] + p[0];
 }
 
 int main(int argc, char *argv[])
 {
   Kokkos::ScopeGuard guard(argc, argv);
-  constexpr std::size_t num_neighbors = 7;
+  constexpr std::size_t num_neighbors = 10;
   constexpr std::size_t cube_side = 4;
   constexpr std::size_t source_points_num = cube_side * cube_side * cube_side;
   constexpr std::size_t target_points_num = 4;
@@ -94,8 +95,8 @@ int main(int argc, char *argv[])
   // Generate source points (Organized within a [-10, 10]^3 cube)
   Kokkos::parallel_for(
       "Example::source_points_init",
-      Kokkos::MDRangePolicy<ExecutionSpace, Kokkos::Rank<3>>(
-          space, {0, 0, 0}, {cube_side, cube_side, cube_side}),
+      Kokkos::MDRangePolicy<Kokkos::Rank<3>>(space, {0, 0, 0},
+                                             {cube_side, cube_side, cube_side}),
       KOKKOS_LAMBDA(int const i, int const j, int const k) {
         source_points(i * cube_side * cube_side + j * cube_side + k) =
             ArborX::Point{20.f * (float(i) / (cube_side - 1) - .5f),
@@ -104,11 +105,11 @@ int main(int argc, char *argv[])
       });
 
   // Generate target points
-  target_points_host(0) = ArborX::Point{0.f, 0.f, 0.f};
+  target_points_host(0) = ArborX::Point{1.f, 0.f, 1.f};
   target_points_host(1) = ArborX::Point{5.f, 5.f, 5.f};
   target_points_host(2) = ArborX::Point{-5.f, 5.f, 3.f};
   target_points_host(3) = ArborX::Point{1.f, -3.3f, 7.f};
-  Kokkos::deep_copy(target_points, target_points_host);
+  Kokkos::deep_copy(space, target_points, target_points_host);
 
   // Organize source points as tree
   ArborX::BVH<MemorySpace> source_tree(space, source_points);
@@ -126,7 +127,7 @@ int main(int argc, char *argv[])
       "Example::tr_source_points", target_points_num, num_neighbors);
   Kokkos::parallel_for(
       "Example::transform_source_points",
-      Kokkos::RangePolicy<ExecutionSpace>(space, 0, target_points_num),
+      Kokkos::RangePolicy(space, 0, target_points_num),
       KOKKOS_LAMBDA(int const i) {
         for (int j = offsets(i); j < offsets(i + 1); j++)
         {
@@ -143,7 +144,7 @@ int main(int argc, char *argv[])
   constexpr float epsilon = std::numeric_limits<float>::epsilon();
   Kokkos::parallel_for(
       "Example::radii_computation",
-      Kokkos::RangePolicy<ExecutionSpace>(space, 0, target_points_num),
+      Kokkos::RangePolicy(space, 0, target_points_num),
       KOKKOS_LAMBDA(int const i) {
         float radius = 10.f * epsilon;
 
@@ -162,7 +163,7 @@ int main(int argc, char *argv[])
                                           num_neighbors);
   Kokkos::parallel_for(
       "Example::phi_computation",
-      Kokkos::RangePolicy<ExecutionSpace>(space, 0, target_points_num),
+      Kokkos::RangePolicy(space, 0, target_points_num),
       KOKKOS_LAMBDA(int const i) {
         RBFWendland_0 rbf{radii(i)};
 
@@ -180,7 +181,7 @@ int main(int argc, char *argv[])
                                          MVPolynomialBasis_3D::size);
   Kokkos::parallel_for(
       "Example::vandermonde_computation",
-      Kokkos::MDRangePolicy<ExecutionSpace, Kokkos::Rank<2>>(
+      Kokkos::MDRangePolicy<Kokkos::Rank<2>>(
           space, {0, 0}, {target_points_num, num_neighbors}),
       KOKKOS_LAMBDA(int const i, int const j) {
         auto basis = MVPolynomialBasis_3D{}(tr_source_points(i, j));
@@ -197,10 +198,10 @@ int main(int argc, char *argv[])
                                          MVPolynomialBasis_3D::size);
   Kokkos::parallel_for(
       "Example::A_computation",
-      Kokkos::MDRangePolicy<ExecutionSpace, Kokkos::Rank<3>>(
-          space, {0, 0, 0},
-          {target_points_num, MVPolynomialBasis_3D::size,
-           MVPolynomialBasis_3D::size}),
+      Kokkos::MDRangePolicy<Kokkos::Rank<3>>(space, {0, 0, 0},
+                                             {target_points_num,
+                                              MVPolynomialBasis_3D::size,
+                                              MVPolynomialBasis_3D::size}),
       KOKKOS_LAMBDA(int const i, int const j, int const k) {
         float tmp = 0;
         for (int l = 0; l < num_neighbors; l++)
@@ -221,7 +222,7 @@ int main(int argc, char *argv[])
       MVPolynomialBasis_3D::size);
   Kokkos::parallel_for(
       "Example::A_inv_computation",
-      Kokkos::RangePolicy<ExecutionSpace>(space, 0, target_points_num),
+      Kokkos::RangePolicy(space, 0, target_points_num),
       KOKKOS_LAMBDA(int const i) {
         for (int j = 0; j < MVPolynomialBasis_3D::size; j++)
         {
@@ -290,7 +291,7 @@ int main(int argc, char *argv[])
                                              target_points_num, num_neighbors);
   Kokkos::parallel_for(
       "Example::coefficients_computation",
-      Kokkos::MDRangePolicy<ExecutionSpace, Kokkos::Rank<2>>(
+      Kokkos::MDRangePolicy<Kokkos::Rank<2>>(
           space, {0, 0}, {target_points_num, num_neighbors}),
       KOKKOS_LAMBDA(int const i, int const j) {
         float tmp = 0;
@@ -308,7 +309,7 @@ int main(int argc, char *argv[])
                                                    source_points_num);
   Kokkos::parallel_for(
       "Example::source_evaluation",
-      Kokkos::RangePolicy<ExecutionSpace>(space, 0, source_points_num),
+      Kokkos::RangePolicy(space, 0, source_points_num),
       KOKKOS_LAMBDA(int const i) {
         source_values(i) = manufactured_solution(source_points(i));
       });
@@ -318,10 +319,10 @@ int main(int argc, char *argv[])
                                                    target_points_num);
   Kokkos::parallel_for(
       "Example::target_interpolation",
-      Kokkos::RangePolicy<ExecutionSpace>(space, 0, target_points_num),
+      Kokkos::RangePolicy(space, 0, target_points_num),
       KOKKOS_LAMBDA(int const i) {
         float tmp = 0;
-        for (int j = offsets(i); j < offsets(i + i); j++)
+        for (int j = offsets(i); j < offsets(i + 1); j++)
         {
           tmp += coeffs(i, j - offsets(i)) * source_values(indices(j));
         }
@@ -333,17 +334,17 @@ int main(int argc, char *argv[])
       "Example::target_values_exact", target_points_num);
   Kokkos::parallel_for(
       "Example::target_evaluation",
-      Kokkos::RangePolicy<ExecutionSpace>(space, 0, target_points_num),
+      Kokkos::RangePolicy(space, 0, target_points_num),
       KOKKOS_LAMBDA(int const i) {
         target_values_exact(i) = manufactured_solution(target_points(i));
       });
 
   // Show difference
   auto target_values_host = Kokkos::create_mirror_view(target_values);
-  Kokkos::deep_copy(target_values_host, target_values);
+  Kokkos::deep_copy(space, target_values_host, target_values);
   auto target_values_exact_host =
       Kokkos::create_mirror_view(target_values_exact);
-  Kokkos::deep_copy(target_values_exact_host, target_values_exact);
+  Kokkos::deep_copy(space, target_values_exact_host, target_values_exact);
 
   float error = 0.f;
   for (int i = 0; i < target_points_num; i++)

From 793a5f228858c972fcb88771063d3250cb5fc4ca Mon Sep 17 00:00:00 2001
From: Yohann Bosqued <bosq.yohann@gmail.com>
Date: Thu, 3 Aug 2023 16:26:57 -0400
Subject: [PATCH 13/44] Switching from gaussian inverse to SVD

---
 .../moving_least_squares.cpp                  | 167 +++++++++++++-----
 1 file changed, 122 insertions(+), 45 deletions(-)

diff --git a/examples/moving_least_squares/moving_least_squares.cpp b/examples/moving_least_squares/moving_least_squares.cpp
index 9962c0b21..8daa56a9e 100644
--- a/examples/moving_least_squares/moving_least_squares.cpp
+++ b/examples/moving_least_squares/moving_least_squares.cpp
@@ -18,6 +18,7 @@
 
 #include <Kokkos_Core.hpp>
 
+#include <cmath>
 #include <limits>
 
 using ExecutionSpace = Kokkos::DefaultExecutionSpace;
@@ -77,8 +78,10 @@ KOKKOS_INLINE_FUNCTION float manufactured_solution(ArborX::Point const &p)
 int main(int argc, char *argv[])
 {
   Kokkos::ScopeGuard guard(argc, argv);
-  constexpr std::size_t num_neighbors = 10;
-  constexpr std::size_t cube_side = 4;
+
+  constexpr float epsilon = std::numeric_limits<float>::epsilon();
+  constexpr std::size_t num_neighbors = 20;
+  constexpr std::size_t cube_side = 10;
   constexpr std::size_t source_points_num = cube_side * cube_side * cube_side;
   constexpr std::size_t target_points_num = 4;
 
@@ -141,7 +144,6 @@ int main(int argc, char *argv[])
 
   // Compute the radii for the weight (phi) vector
   Kokkos::View<float *, MemorySpace> radii("Example::radii", target_points_num);
-  constexpr float epsilon = std::numeric_limits<float>::epsilon();
   Kokkos::parallel_for(
       "Example::radii_computation",
       Kokkos::RangePolicy(space, 0, target_points_num),
@@ -212,14 +214,22 @@ int main(int argc, char *argv[])
         a(i, j, k) = tmp;
       });
 
-  // Inverse moment matrix
-  // Gaussian inverse method. Both matrix are used and modifications on the
-  // first one are applied to the second
-  // Kind of works, errors out quite often.
-  // A better method should be employed (SVD?)
+  // Pseudo-inverse moment matrix using SVD
+  // We must find U, E (diagonal and positive) and V such that A = U.E.V^T
+  // We also know that A is symmetric (by construction), so U = SV where S is
+  // a sign matrix (only 1 or -1 in the diagonal, 0 elsewhere).
+  // Thus A = U.E.S.U^T
+  static constexpr float pi_4 = M_PI_4;
   Kokkos::View<float ***, MemorySpace> a_inv(
       "Example::A_inv", target_points_num, MVPolynomialBasis_3D::size,
       MVPolynomialBasis_3D::size);
+  Kokkos::View<float ***, MemorySpace> svd_u(
+      "Example::SVD::U", target_points_num, MVPolynomialBasis_3D::size,
+      MVPolynomialBasis_3D::size);
+  Kokkos::View<float ***, MemorySpace> svd_es(
+      "Example::SVD::E.S", target_points_num, MVPolynomialBasis_3D::size,
+      MVPolynomialBasis_3D::size);
+  Kokkos::deep_copy(space, svd_es, a);
   Kokkos::parallel_for(
       "Example::A_inv_computation",
       Kokkos::RangePolicy(space, 0, target_points_num),
@@ -228,61 +238,128 @@ int main(int argc, char *argv[])
         {
           for (int k = 0; k < MVPolynomialBasis_3D::size; k++)
           {
-            a_inv(i, j, k) = (j == k) * 1.f;
+            svd_u(i, j, k) = (j == k) * 1.f;
           }
         }
 
-        // This needs to be done for every column
-        for (int j = 0; j < MVPolynomialBasis_3D::size; j++)
-        {
-
-          // We find the line with a non-negative element on column j
-          int k = j;
-          for (; k < MVPolynomialBasis_3D::size; k++)
+        // This finds the biggest off-diagonal value of E.S as well as its
+        // coordinates. Being symmetric, we can always check on the upper
+        // triangle (and always have q > p)
+        auto argmax = [=](int &p, int &q) {
+          float max = 0.f;
+          p = -1;
+          q = -1;
+          for (int j = 0; j < MVPolynomialBasis_3D::size; j++)
           {
-            if (a(i, k, j) != 0.f)
-              break;
+            for (int k = j + 1; k < MVPolynomialBasis_3D::size; k++)
+            {
+              float val = Kokkos::abs(svd_es(i, j, k));
+              if (max < val)
+              {
+                max = val;
+                p = j;
+                q = k;
+              }
+            }
           }
 
-          // We divide the line with said value
-          float tmp = a(i, k, j);
-          for (int l = 0; l < MVPolynomialBasis_3D::size; l++)
+          return max;
+        };
+
+        // Iterative approach, we will "deconstruct" E.S until only the diagonal
+        // is relevant inside the matrix
+        // It is possible to prove that, at each step, the "norm" of the matrix
+        // is strictly less than that of the previous one
+        int p, q;
+        float norm = argmax(p, q);
+        while (norm > epsilon)
+        {
+          // Our submatrix is now
+          // +----------+----------+   +---+---+
+          // | es(p, p) | es(p, q) |   | a | b |
+          // +----------+----------+ = +---+---+
+          // | es(q, p) | es(q, q) |   | b | c |
+          // +----------+----------+   +---+---+
+          float a = svd_es(i, p, p);
+          float b = svd_es(i, p, q);
+          float c = svd_es(i, q, q);
+
+          float theta, u, v;
+          if (a == c)
           {
-            a(i, k, l) /= tmp;
-            a_inv(i, k, l) /= tmp;
+            theta = pi_4;
+            u = a + b;
+            v = a - b;
           }
+          else
+          {
+            theta = .5f * Kokkos::atanf((2.f * b) / (a - c));
+            float cos2 = Kokkos::cosf(2.f * theta);
+            u = .5f * (a + c + (a - c) / cos2);
+            v = .5f * (a + c - (a - c) / cos2);
+          }
+          float cos = Kokkos::cosf(theta);
+          float sin = Kokkos::sinf(theta);
 
-          // If line and column are not the same, move the column to the top
-          if (k != j)
+          // We must now apply the rotation matrix to the left
+          // and right of E.S and on the right of U
+
+          // Left of E.S (mult by R(theta)^T)
+          for (int j = 0; j < MVPolynomialBasis_3D::size; j++)
           {
-            for (int l = 0; l < MVPolynomialBasis_3D::size; l++)
-            {
-              float tmp = a(i, k, l);
-              a(i, k, l) = a(i, j, l);
-              a(i, j, l) = tmp;
+            float es_ipj = svd_es(i, p, j);
+            float es_iqj = svd_es(i, q, j);
+            svd_es(i, p, j) = cos * es_ipj + sin * es_iqj;
+            svd_es(i, q, j) = -sin * es_ipj + cos * es_iqj;
+          }
 
-              tmp = a_inv(i, k, l);
-              a_inv(i, k, l) = a_inv(i, j, l);
-              a_inv(i, j, l) = tmp;
-            }
+          // Right of E.S (mult by R(theta))
+          for (int j = 0; j < MVPolynomialBasis_3D::size; j++)
+          {
+            float es_ijp = svd_es(i, j, p);
+            float es_ijq = svd_es(i, j, q);
+            svd_es(i, j, p) = cos * es_ijp + sin * es_ijq;
+            svd_es(i, j, q) = -sin * es_ijp + cos * es_ijq;
           }
 
-          // Now, set at zero all other elements of the column (Ll <- Ll - a*Lj)
-          for (int l = 0; l < MVPolynomialBasis_3D::size; l++)
+          // Right of U (mult by R(theta))
+          for (int j = 0; j < MVPolynomialBasis_3D::size; j++)
           {
-            if (l == j || a(i, l, j) == 0.f)
-              continue;
-            float mul = a(i, l, j);
+            float u_ijp = svd_u(i, j, p);
+            float u_ijq = svd_u(i, j, q);
+            svd_u(i, j, p) = cos * u_ijp + sin * u_ijq;
+            svd_u(i, j, q) = -sin * u_ijp + cos * u_ijq;
+          }
+
+          // These should theoretically hold, but is it ok to force them to
+          // their real value?
+          svd_es(i, p, p) = u;
+          svd_es(i, q, q) = v;
+          svd_es(i, p, q) = 0.f;
+          svd_es(i, q, p) = 0.f;
+
+          norm = argmax(p, q);
+        }
 
-            for (int m = 0; m < MVPolynomialBasis_3D::size; m++)
+        // We should now have a correct U and E.S
+        // We'll compute the pseudo inverse of A by taking the
+        // pseudo inverse of E.S which is simply inverting the diagonal of
+        // E.S
+        for (int j = 0; j < MVPolynomialBasis_3D::size; j++)
+        {
+          for (int k = 0; k < MVPolynomialBasis_3D::size; k++)
+          {
+            float value = 0.;
+            for (int l = 0; l < MVPolynomialBasis_3D::size; l++)
             {
-              a(i, l, m) -= mul * a(i, j, m);
-              a_inv(i, l, m) -= mul * a_inv(i, j, m);
+              if (Kokkos::abs(svd_es(i, l, l)) >= epsilon)
+              {
+                value += svd_u(i, j, l) * svd_u(i, l, k) / svd_es(i, l, l);
+              }
             }
-            a(i, l, j) = 0.f;
-          }
 
-          // Now a_inv should contain the inverse of a
+            a_inv(i, j, k) = value;
+          }
         }
       });
 

From 15ca7a18cb4449acec1ff440381c989d122c0131 Mon Sep 17 00:00:00 2001
From: Yohann Bosqued <bosq.yohann@gmail.com>
Date: Fri, 4 Aug 2023 11:00:06 -0400
Subject: [PATCH 14/44] Specifying ExecutionSpace in RangePolicies

---
 .../moving_least_squares/moving_least_squares.cpp  | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/examples/moving_least_squares/moving_least_squares.cpp b/examples/moving_least_squares/moving_least_squares.cpp
index 8daa56a9e..ba563753e 100644
--- a/examples/moving_least_squares/moving_least_squares.cpp
+++ b/examples/moving_least_squares/moving_least_squares.cpp
@@ -130,7 +130,7 @@ int main(int argc, char *argv[])
       "Example::tr_source_points", target_points_num, num_neighbors);
   Kokkos::parallel_for(
       "Example::transform_source_points",
-      Kokkos::RangePolicy(space, 0, target_points_num),
+      Kokkos::RangePolicy<ExecutionSpace>(space, 0, target_points_num),
       KOKKOS_LAMBDA(int const i) {
         for (int j = offsets(i); j < offsets(i + 1); j++)
         {
@@ -146,7 +146,7 @@ int main(int argc, char *argv[])
   Kokkos::View<float *, MemorySpace> radii("Example::radii", target_points_num);
   Kokkos::parallel_for(
       "Example::radii_computation",
-      Kokkos::RangePolicy(space, 0, target_points_num),
+      Kokkos::RangePolicy<ExecutionSpace>(space, 0, target_points_num),
       KOKKOS_LAMBDA(int const i) {
         float radius = 10.f * epsilon;
 
@@ -165,7 +165,7 @@ int main(int argc, char *argv[])
                                           num_neighbors);
   Kokkos::parallel_for(
       "Example::phi_computation",
-      Kokkos::RangePolicy(space, 0, target_points_num),
+      Kokkos::RangePolicy<ExecutionSpace>(space, 0, target_points_num),
       KOKKOS_LAMBDA(int const i) {
         RBFWendland_0 rbf{radii(i)};
 
@@ -232,7 +232,7 @@ int main(int argc, char *argv[])
   Kokkos::deep_copy(space, svd_es, a);
   Kokkos::parallel_for(
       "Example::A_inv_computation",
-      Kokkos::RangePolicy(space, 0, target_points_num),
+      Kokkos::RangePolicy<ExecutionSpace>(space, 0, target_points_num),
       KOKKOS_LAMBDA(int const i) {
         for (int j = 0; j < MVPolynomialBasis_3D::size; j++)
         {
@@ -386,7 +386,7 @@ int main(int argc, char *argv[])
                                                    source_points_num);
   Kokkos::parallel_for(
       "Example::source_evaluation",
-      Kokkos::RangePolicy(space, 0, source_points_num),
+      Kokkos::RangePolicy<ExecutionSpace>(space, 0, source_points_num),
       KOKKOS_LAMBDA(int const i) {
         source_values(i) = manufactured_solution(source_points(i));
       });
@@ -396,7 +396,7 @@ int main(int argc, char *argv[])
                                                    target_points_num);
   Kokkos::parallel_for(
       "Example::target_interpolation",
-      Kokkos::RangePolicy(space, 0, target_points_num),
+      Kokkos::RangePolicy<ExecutionSpace>(space, 0, target_points_num),
       KOKKOS_LAMBDA(int const i) {
         float tmp = 0;
         for (int j = offsets(i); j < offsets(i + 1); j++)
@@ -411,7 +411,7 @@ int main(int argc, char *argv[])
       "Example::target_values_exact", target_points_num);
   Kokkos::parallel_for(
       "Example::target_evaluation",
-      Kokkos::RangePolicy(space, 0, target_points_num),
+      Kokkos::RangePolicy<ExecutionSpace>(space, 0, target_points_num),
       KOKKOS_LAMBDA(int const i) {
         target_values_exact(i) = manufactured_solution(target_points(i));
       });

From 62de5ed3c817e9971ee53cd711cd6e3efc3e2d69 Mon Sep 17 00:00:00 2001
From: Yohann Bosqued <bosq.yohann@gmail.com>
Date: Mon, 7 Aug 2023 11:51:24 -0400
Subject: [PATCH 15/44] Fixing wrong SVD calculation

---
 .../moving_least_squares/moving_least_squares.cpp    | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/examples/moving_least_squares/moving_least_squares.cpp b/examples/moving_least_squares/moving_least_squares.cpp
index ba563753e..046f17bf7 100644
--- a/examples/moving_least_squares/moving_least_squares.cpp
+++ b/examples/moving_least_squares/moving_least_squares.cpp
@@ -72,7 +72,7 @@ struct ArborX::AccessTraits<TargetPoints, ArborX::PredicatesTag>
 // Function to approximate
 KOKKOS_INLINE_FUNCTION float manufactured_solution(ArborX::Point const &p)
 {
-  return p[2] * p[1] + p[0];
+  return Kokkos::sin(p[0]) * p[2] + p[1];
 }
 
 int main(int argc, char *argv[])
@@ -80,7 +80,7 @@ int main(int argc, char *argv[])
   Kokkos::ScopeGuard guard(argc, argv);
 
   constexpr float epsilon = std::numeric_limits<float>::epsilon();
-  constexpr std::size_t num_neighbors = 20;
+  constexpr std::size_t num_neighbors = MVPolynomialBasis_3D::size;
   constexpr std::size_t cube_side = 10;
   constexpr std::size_t source_points_num = cube_side * cube_side * cube_side;
   constexpr std::size_t target_points_num = 4;
@@ -344,17 +344,17 @@ int main(int argc, char *argv[])
         // We should now have a correct U and E.S
         // We'll compute the pseudo inverse of A by taking the
         // pseudo inverse of E.S which is simply inverting the diagonal of
-        // E.S
+        // E.S. We have pseudoA = U.pseudoES.U^T
         for (int j = 0; j < MVPolynomialBasis_3D::size; j++)
         {
           for (int k = 0; k < MVPolynomialBasis_3D::size; k++)
           {
-            float value = 0.;
+            float value = 0.f;
             for (int l = 0; l < MVPolynomialBasis_3D::size; l++)
             {
               if (Kokkos::abs(svd_es(i, l, l)) >= epsilon)
               {
-                value += svd_u(i, j, l) * svd_u(i, l, k) / svd_es(i, l, l);
+                value += svd_u(i, j, l) * svd_u(i, k, l) / svd_es(i, l, l);
               }
             }
 
@@ -371,7 +371,7 @@ int main(int argc, char *argv[])
       Kokkos::MDRangePolicy<Kokkos::Rank<2>>(
           space, {0, 0}, {target_points_num, num_neighbors}),
       KOKKOS_LAMBDA(int const i, int const j) {
-        float tmp = 0;
+        float tmp = 0.f;
 
         for (int k = 0; k < MVPolynomialBasis_3D::size; k++)
         {

From e7f7918d8a8981278ec60c083162a3afb3b10347 Mon Sep 17 00:00:00 2001
From: Yohann Bosqued <bosq.yohann@gmail.com>
Date: Tue, 8 Aug 2023 15:25:25 -0400
Subject: [PATCH 16/44] Adding MPI (unstable)

---
 .../moving_least_squares.cpp                  | 192 ++++++++++++++++--
 1 file changed, 172 insertions(+), 20 deletions(-)

diff --git a/examples/moving_least_squares/moving_least_squares.cpp b/examples/moving_least_squares/moving_least_squares.cpp
index 046f17bf7..72a046320 100644
--- a/examples/moving_least_squares/moving_least_squares.cpp
+++ b/examples/moving_least_squares/moving_least_squares.cpp
@@ -20,9 +20,13 @@
 
 #include <cmath>
 #include <limits>
+#include <sstream>
+
+#include <mpi.h>
 
 using ExecutionSpace = Kokkos::DefaultExecutionSpace;
 using MemorySpace = ExecutionSpace::memory_space;
+using DeviceSpace = Kokkos::Device<ExecutionSpace, MemorySpace>;
 
 struct RBFWendland_0
 {
@@ -69,6 +73,23 @@ struct ArborX::AccessTraits<TargetPoints, ArborX::PredicatesTag>
   using memory_space = MemorySpace;
 };
 
+/*
+0: ==== Target 0
+0: Interpolation: 0.717408
+0: Real value   : 0.841471
+0: ==== Target 1
+0: Interpolation: 0.210617
+0: Real value   : 0.205379
+0: ==== Target 2
+0: Interpolation: 7.36529
+0: Real value   : 7.87677
+0: ==== Target 3
+0: Interpolation: 1.41947
+0: Real value   : 2.5903
+0: ====
+0: Maximum error: 1.17083
+*/
+
 // Function to approximate
 KOKKOS_INLINE_FUNCTION float manufactured_solution(ArborX::Point const &p)
 {
@@ -77,19 +98,26 @@ KOKKOS_INLINE_FUNCTION float manufactured_solution(ArborX::Point const &p)
 
 int main(int argc, char *argv[])
 {
+  MPI_Init(&argc, &argv);
   Kokkos::ScopeGuard guard(argc, argv);
 
   constexpr float epsilon = std::numeric_limits<float>::epsilon();
   constexpr std::size_t num_neighbors = MVPolynomialBasis_3D::size;
-  constexpr std::size_t cube_side = 10;
+  constexpr std::size_t cube_side = 20;
   constexpr std::size_t source_points_num = cube_side * cube_side * cube_side;
   constexpr std::size_t target_points_num = 4;
 
   ExecutionSpace space{};
+  MPI_Comm mpi_comm = MPI_COMM_WORLD;
+  int mpi_size, mpi_rank;
+  MPI_Comm_size(mpi_comm, &mpi_size);
+  MPI_Comm_rank(mpi_comm, &mpi_rank);
+
+  std::size_t local_source_points_num = source_points_num / mpi_size;
 
   Kokkos::View<ArborX::Point *, MemorySpace> source_points(
       Kokkos::view_alloc(Kokkos::WithoutInitializing, "Example::source_points"),
-      source_points_num);
+      local_source_points_num);
   Kokkos::View<ArborX::Point *, MemorySpace> target_points(
       Kokkos::view_alloc(Kokkos::WithoutInitializing, "Example::target_points"),
       target_points_num);
@@ -98,13 +126,14 @@ int main(int argc, char *argv[])
   // Generate source points (Organized within a [-10, 10]^3 cube)
   Kokkos::parallel_for(
       "Example::source_points_init",
-      Kokkos::MDRangePolicy<Kokkos::Rank<3>>(space, {0, 0, 0},
-                                             {cube_side, cube_side, cube_side}),
+      Kokkos::MDRangePolicy<Kokkos::Rank<3>>(
+          space, {0, 0, 0}, {cube_side, cube_side, cube_side / mpi_size}),
       KOKKOS_LAMBDA(int const i, int const j, int const k) {
         source_points(i * cube_side * cube_side + j * cube_side + k) =
             ArborX::Point{20.f * (float(i) / (cube_side - 1) - .5f),
                           20.f * (float(j) / (cube_side - 1) - .5f),
-                          20.f * (float(k) / (cube_side - 1) - .5f)};
+                          20.f * (float(k) / (cube_side - 1) - .5f +
+                                  (float(mpi_rank) / mpi_size))};
       });
 
   // Generate target points
@@ -115,13 +144,102 @@ int main(int argc, char *argv[])
   Kokkos::deep_copy(space, target_points, target_points_host);
 
   // Organize source points as tree
-  ArborX::BVH<MemorySpace> source_tree(space, source_points);
+  ArborX::DistributedTree<MemorySpace> source_tree(mpi_comm, space,
+                                                   source_points);
 
-  // Perform the query
-  Kokkos::View<int *, MemorySpace> indices("Example::indices", 0);
+  // Perform the query and split the indices/ranks
+  Kokkos::View<Kokkos::pair<int, int> *, MemorySpace> index_ranks(
+      "Example::index_ranks", 0);
   Kokkos::View<int *, MemorySpace> offsets("Example::offsets", 0);
-  source_tree.query(space, TargetPoints{target_points, num_neighbors}, indices,
-                    offsets);
+  source_tree.query(space, TargetPoints{target_points, num_neighbors},
+                    index_ranks, offsets);
+  Kokkos::View<int *, MemorySpace> local_indices(
+      "Example::local_indices", target_points_num * num_neighbors);
+  Kokkos::View<int *, MemorySpace> local_ranks(
+      "Example::local_ranks", target_points_num * num_neighbors);
+  Kokkos::parallel_for(
+      "Example::index_ranks_split",
+      Kokkos::RangePolicy<ExecutionSpace>(space, 0,
+                                          target_points_num * num_neighbors),
+      KOKKOS_LAMBDA(int const i) {
+        local_indices(i) = index_ranks(i).first;
+        local_ranks(i) = index_ranks(i).second;
+      });
+
+  // Before moving on, we must gather the coordinates of all the requested
+  // source points. DTK does that by distributing in a "who wants what" manner.
+  // The distribution is done in two phases. A first pass where every process
+  // receives the information on "who wants what" from them. Then a second pass
+  // is done where values are set up and sent back to processes
+
+  // First pass setup
+  ArborX::Details::Distributor<DeviceSpace> distributor_first(mpi_comm);
+  int const local_requests_num =
+      distributor_first.createFromSends(space, local_ranks);
+
+  // "Middlemen" buffers
+  // - mpi_mid_in_indices(i) corresponds to an index that will be used to
+  // construct the final value
+  // - mpi_mid_ranks(i) corresponds to the request origin for value (i)
+  // - mpi_mid_indices(i) corresponds to the point's index in the nn query
+  // to which mpi_mid_points(i) is attached
+  Kokkos::View<int *, MemorySpace> mpi_mid_in_indices(
+      "Example::mpi_mid_in_indices", local_requests_num);
+  Kokkos::View<int *, MemorySpace> mpi_mid_indices("Example::mpi_mid_indices",
+                                                   local_requests_num);
+  Kokkos::View<int *, MemorySpace> mpi_mid_ranks("Example::mpi_mid_ranks",
+                                                 local_requests_num);
+  Kokkos::View<ArborX::Point *, MemorySpace> mpi_mid_points(
+      "Example::mpi_mid_points", local_requests_num);
+
+  // First pass comms
+  Kokkos::View<int *, MemorySpace> mpi_tmp("Example::mpi_tmp",
+                                           target_points_num * num_neighbors);
+  ArborX::iota(space, mpi_tmp);
+  ArborX::Details::DistributedTreeImpl<DeviceSpace>::sendAcrossNetwork(
+      space, distributor_first, mpi_tmp, mpi_mid_in_indices);
+  ArborX::Details::DistributedTreeImpl<DeviceSpace>::sendAcrossNetwork(
+      space, distributor_first, local_indices, mpi_mid_indices);
+  Kokkos::deep_copy(space, mpi_tmp, mpi_rank);
+  ArborX::Details::DistributedTreeImpl<DeviceSpace>::sendAcrossNetwork(
+      space, distributor_first, mpi_tmp, mpi_mid_ranks);
+  Kokkos::parallel_for(
+      "Example::mpi_mid_points_fill",
+      Kokkos::RangePolicy<ExecutionSpace>(space, 0, local_requests_num),
+      KOKKOS_LAMBDA(int const i) {
+        mpi_mid_points(i) = source_points(mpi_mid_indices(i));
+      });
+
+  // This process now knows "who wants what" and is ready to send everything
+  // back
+
+  // Second pass setup
+  ArborX::Details::Distributor<DeviceSpace> distributor_second(mpi_comm);
+  int const local_responses_num =
+      distributor_second.createFromSends(space, mpi_mid_ranks);
+  Kokkos::View<ArborX::Point *, MemorySpace> local_untreated_source_points(
+      "Example::local_untreated_source_points",
+      target_points_num * num_neighbors);
+  // We have local_responses_num == target_points_num * num_neighbors
+
+  // Temporary buffers
+  Kokkos::View<int *, MemorySpace> mpi_tmp_in_indices(
+      "Examples::mpi_tmp_in_indices", local_responses_num);
+  Kokkos::View<ArborX::Point *, MemorySpace> mpi_tmp_points(
+      "Examples::mpi_tmp_points", local_responses_num);
+
+  // Second pass comms
+  ArborX::Details::DistributedTreeImpl<DeviceSpace>::sendAcrossNetwork(
+      space, distributor_second, mpi_mid_points, mpi_tmp_points);
+  ArborX::Details::DistributedTreeImpl<DeviceSpace>::sendAcrossNetwork(
+      space, distributor_second, mpi_mid_in_indices, mpi_tmp_in_indices);
+  Kokkos::parallel_for(
+      "Example::local_untreated_source_points_fill",
+      Kokkos::RangePolicy<ExecutionSpace>(space, 0, local_responses_num),
+      KOKKOS_LAMBDA(int const i) {
+        local_untreated_source_points(mpi_tmp_in_indices(i)) =
+            mpi_tmp_points(i);
+      });
 
   // Now that we have the neighbors, we recompute their position using
   // their target point as the origin.
@@ -135,9 +253,9 @@ int main(int argc, char *argv[])
         for (int j = offsets(i); j < offsets(i + 1); j++)
         {
           tr_source_points(i, j - offsets(i)) = ArborX::Point{
-              source_points(indices(j))[0] - target_points(i)[0],
-              source_points(indices(j))[1] - target_points(i)[1],
-              source_points(indices(j))[2] - target_points(i)[2],
+              local_untreated_source_points(j)[0] - target_points(i)[0],
+              local_untreated_source_points(j)[1] - target_points(i)[1],
+              local_untreated_source_points(j)[2] - target_points(i)[2],
           };
         }
       });
@@ -383,14 +501,40 @@ int main(int argc, char *argv[])
 
   // Compute source values
   Kokkos::View<float *, MemorySpace> source_values("Example::source_values",
-                                                   source_points_num);
+                                                   local_source_points_num);
   Kokkos::parallel_for(
       "Example::source_evaluation",
-      Kokkos::RangePolicy<ExecutionSpace>(space, 0, source_points_num),
+      Kokkos::RangePolicy<ExecutionSpace>(space, 0, local_source_points_num),
       KOKKOS_LAMBDA(int const i) {
         source_values(i) = manufactured_solution(source_points(i));
       });
 
+  // To approximate the function, we have to gather the correct source values
+  // We have to redo part of the earlier passes
+  Kokkos::View<float *, MemorySpace> mpi_mid_values("Example::mpi_mid_values",
+                                                    local_requests_num);
+  Kokkos::parallel_for(
+      "Example::mpi_mid_values_fill",
+      Kokkos::RangePolicy<ExecutionSpace>(space, 0, local_requests_num),
+      KOKKOS_LAMBDA(int const i) {
+        mpi_mid_values(i) = source_values(mpi_mid_indices(i));
+      });
+
+  Kokkos::View<float *, MemorySpace> local_untreated_source_values(
+      "Example::local_untreated_source_values",
+      target_points_num * num_neighbors);
+  Kokkos::View<float *, MemorySpace> mpi_tmp_values("Examples::mpi_tmp_values",
+                                                    local_responses_num);
+  ArborX::Details::DistributedTreeImpl<DeviceSpace>::sendAcrossNetwork(
+      space, distributor_second, mpi_mid_values, mpi_tmp_values);
+  Kokkos::parallel_for(
+      "Example::local_untreated_source_values_fill",
+      Kokkos::RangePolicy<ExecutionSpace>(space, 0, local_responses_num),
+      KOKKOS_LAMBDA(int const i) {
+        local_untreated_source_values(mpi_tmp_in_indices(i)) =
+            mpi_tmp_values(i);
+      });
+
   // Compute target values via interpolation
   Kokkos::View<float *, MemorySpace> target_values("Example::target_values",
                                                    target_points_num);
@@ -401,7 +545,7 @@ int main(int argc, char *argv[])
         float tmp = 0;
         for (int j = offsets(i); j < offsets(i + 1); j++)
         {
-          tmp += coeffs(i, j - offsets(i)) * source_values(indices(j));
+          tmp += coeffs(i, j - offsets(i)) * local_untreated_source_values(j);
         }
         target_values(i) = tmp;
       });
@@ -423,15 +567,23 @@ int main(int argc, char *argv[])
       Kokkos::create_mirror_view(target_values_exact);
   Kokkos::deep_copy(space, target_values_exact_host, target_values_exact);
 
+  std::stringstream ss{};
   float error = 0.f;
   for (int i = 0; i < target_points_num; i++)
   {
     error = Kokkos::max(
         Kokkos::abs(target_values_host(i) - target_values_exact_host(i)),
         error);
-    std::cout << "==== Target " << i << '\n'
-              << "Interpolation: " << target_values_host(i) << '\n'
-              << "Real value   : " << target_values_exact_host(i) << '\n';
+    ss << mpi_rank << ": ==== Target " << i << '\n'
+              << mpi_rank << ": Interpolation: " << target_values_host(i)
+              << '\n'
+              << mpi_rank << ": Real value   : " << target_values_exact_host(i)
+              << '\n';
   }
-  std::cout << "====\nMaximum error: " << error << std::endl;
+  ss << mpi_rank << ": ====\n"
+            << mpi_rank << ": Maximum error: " << error << std::endl;
+
+  std::cout << ss.str();
+  MPI_Finalize();
+  return 0;
 }

From d0932edcc44f93f805c213059cda66afd9fdc54c Mon Sep 17 00:00:00 2001
From: Yohann Bosqued <bosq.yohann@gmail.com>
Date: Tue, 8 Aug 2023 16:31:53 -0400
Subject: [PATCH 17/44] Relative error and misc fixes

---
 .../moving_least_squares.cpp                  | 46 ++++++-------------
 1 file changed, 15 insertions(+), 31 deletions(-)

diff --git a/examples/moving_least_squares/moving_least_squares.cpp b/examples/moving_least_squares/moving_least_squares.cpp
index 72a046320..58cf30b91 100644
--- a/examples/moving_least_squares/moving_least_squares.cpp
+++ b/examples/moving_least_squares/moving_least_squares.cpp
@@ -73,23 +73,6 @@ struct ArborX::AccessTraits<TargetPoints, ArborX::PredicatesTag>
   using memory_space = MemorySpace;
 };
 
-/*
-0: ==== Target 0
-0: Interpolation: 0.717408
-0: Real value   : 0.841471
-0: ==== Target 1
-0: Interpolation: 0.210617
-0: Real value   : 0.205379
-0: ==== Target 2
-0: Interpolation: 7.36529
-0: Real value   : 7.87677
-0: ==== Target 3
-0: Interpolation: 1.41947
-0: Real value   : 2.5903
-0: ====
-0: Maximum error: 1.17083
-*/
-
 // Function to approximate
 KOKKOS_INLINE_FUNCTION float manufactured_solution(ArborX::Point const &p)
 {
@@ -124,16 +107,17 @@ int main(int argc, char *argv[])
   auto target_points_host = Kokkos::create_mirror_view(target_points);
 
   // Generate source points (Organized within a [-10, 10]^3 cube)
+  std::size_t thickness = cube_side / mpi_size;
   Kokkos::parallel_for(
       "Example::source_points_init",
-      Kokkos::MDRangePolicy<Kokkos::Rank<3>>(
-          space, {0, 0, 0}, {cube_side, cube_side, cube_side / mpi_size}),
+      Kokkos::MDRangePolicy<Kokkos::Rank<3>>(space, {0, 0, 0},
+                                             {cube_side, cube_side, thickness}),
       KOKKOS_LAMBDA(int const i, int const j, int const k) {
-        source_points(i * cube_side * cube_side + j * cube_side + k) =
-            ArborX::Point{20.f * (float(i) / (cube_side - 1) - .5f),
-                          20.f * (float(j) / (cube_side - 1) - .5f),
-                          20.f * (float(k) / (cube_side - 1) - .5f +
-                                  (float(mpi_rank) / mpi_size))};
+        source_points(i * cube_side * cube_side + j * cube_side +
+                      k) = ArborX::Point{
+            20.f * (float(i) / (cube_side - 1) - .5f),
+            20.f * (float(j) / (cube_side - 1) - .5f),
+            20.f * (float(k + thickness * mpi_rank) / (cube_side - 1) - .5f)};
       });
 
   // Generate target points
@@ -178,7 +162,7 @@ int main(int argc, char *argv[])
       distributor_first.createFromSends(space, local_ranks);
 
   // "Middlemen" buffers
-  // - mpi_mid_in_indices(i) corresponds to an index that zill be used to
+  // - mpi_mid_in_indices(i) corresponds to an index that will be used to
   // construct the final value
   // - mpi_mid_rank(i) corresponds to the request origin for value (i)
   // - mpi_mid_indices(i) corresponds to the point's index in the nn query
@@ -572,16 +556,16 @@ int main(int argc, char *argv[])
   for (int i = 0; i < target_points_num; i++)
   {
     error = Kokkos::max(
-        Kokkos::abs(target_values_host(i) - target_values_exact_host(i)),
+        Kokkos::abs(target_values_host(i) - target_values_exact_host(i)) /
+            Kokkos::abs(target_values_exact_host(i)),
         error);
     ss << mpi_rank << ": ==== Target " << i << '\n'
-              << mpi_rank << ": Interpolation: " << target_values_host(i)
-              << '\n'
-              << mpi_rank << ": Real value   : " << target_values_exact_host(i)
-              << '\n';
+       << mpi_rank << ": Interpolation: " << target_values_host(i) << '\n'
+       << mpi_rank << ": Real value   : " << target_values_exact_host(i)
+       << '\n';
   }
   ss << mpi_rank << ": ====\n"
-            << mpi_rank << ": Maximum error: " << error << std::endl;
+     << mpi_rank << ": Maximum relative error: " << error << std::endl;
 
   std::cout << ss.str();
   MPI_Finalize();

From 67db96a86a1ddcbbb43fe0716332f8812edcaaf4 Mon Sep 17 00:00:00 2001
From: Yohann Bosqued <bosq.yohann@gmail.com>
Date: Wed, 9 Aug 2023 15:34:25 -0400
Subject: [PATCH 18/44] Extract SVD pseudo-inverse into templated SymmPseudoInverseSVD class

---
 .../moving_least_squares.cpp                  | 185 ++------------
 .../symmetric_pseudoinverse_svd.hpp           | 229 ++++++++++++++++++
 2 files changed, 252 insertions(+), 162 deletions(-)
 create mode 100644 examples/moving_least_squares/symmetric_pseudoinverse_svd.hpp

diff --git a/examples/moving_least_squares/moving_least_squares.cpp b/examples/moving_least_squares/moving_least_squares.cpp
index 58cf30b91..36b069621 100644
--- a/examples/moving_least_squares/moving_least_squares.cpp
+++ b/examples/moving_least_squares/moving_least_squares.cpp
@@ -22,6 +22,7 @@
 #include <limits>
 #include <sstream>
 
+#include "symmetric_pseudoinverse_svd.hpp"
 #include <mpi.h>
 
 using ExecutionSpace = Kokkos::DefaultExecutionSpace;
@@ -316,154 +317,10 @@ int main(int argc, char *argv[])
         a(i, j, k) = tmp;
       });
 
-  // Pseudo-inverse moment matrix using SVD
-  // We must find U, E (diagonal and positive) and V such that A = U.E.V^T
-  // We also know that A is symmetric (by construction), so U = SV where S is
-  // a sign matrix (only 1 or -1 in the diagonal, 0 elsewhere).
-  // Thus A = U.E.S.U^T
-  static constexpr float pi_4 = M_PI_4;
-  Kokkos::View<float ***, MemorySpace> a_inv(
-      "Example::A_inv", target_points_num, MVPolynomialBasis_3D::size,
-      MVPolynomialBasis_3D::size);
-  Kokkos::View<float ***, MemorySpace> svd_u(
-      "Example::SVD::U", target_points_num, MVPolynomialBasis_3D::size,
-      MVPolynomialBasis_3D::size);
-  Kokkos::View<float ***, MemorySpace> svd_es(
-      "Example::SVD::E.S", target_points_num, MVPolynomialBasis_3D::size,
-      MVPolynomialBasis_3D::size);
-  Kokkos::deep_copy(space, svd_es, a);
-  Kokkos::parallel_for(
-      "Example::A_inv_computation",
-      Kokkos::RangePolicy<ExecutionSpace>(space, 0, target_points_num),
-      KOKKOS_LAMBDA(int const i) {
-        for (int j = 0; j < MVPolynomialBasis_3D::size; j++)
-        {
-          for (int k = 0; k < MVPolynomialBasis_3D::size; k++)
-          {
-            svd_u(i, j, k) = (j == k) * 1.f;
-          }
-        }
-
-        // This finds the biggest off-diagonal value of E.S as well as its
-        // coordinates. Being symmetric, we can always check on the upper
-        // triangle (and always have q > p)
-        auto argmax = [=](int &p, int &q) {
-          float max = 0.f;
-          p = -1;
-          q = -1;
-          for (int j = 0; j < MVPolynomialBasis_3D::size; j++)
-          {
-            for (int k = j + 1; k < MVPolynomialBasis_3D::size; k++)
-            {
-              float val = Kokkos::abs(svd_es(i, j, k));
-              if (max < val)
-              {
-                max = val;
-                p = j;
-                q = k;
-              }
-            }
-          }
-
-          return max;
-        };
-
-        // Iterative approach, we will "deconstruct" E.S until only the diagonal
-        // is relevent inside the matrix
-        // It is possible to prove that, at each step, the "norm" of the matrix
-        // is strictly less that of the previous
-        int p, q;
-        float norm = argmax(p, q);
-        while (norm > epsilon)
-        {
-          // Our submatrix is now
-          // +----------+----------+   +---+---+
-          // | es(p, p) | es(p, q) |   | a | b |
-          // +----------+----------+ = +---+---+
-          // | es(q, p) | es(q, q) |   | b | c |
-          // +----------+----------+   +---+---+
-          float a = svd_es(i, p, p);
-          float b = svd_es(i, p, q);
-          float c = svd_es(i, q, q);
-
-          float theta, u, v;
-          if (a == c)
-          {
-            theta = pi_4;
-            u = a + b;
-            v = a - b;
-          }
-          else
-          {
-            theta = .5f * Kokkos::atanf((2.f * b) / (a - c));
-            float cos2 = Kokkos::cosf(2.f * theta);
-            u = .5f * (a + c + (a - c) / cos2);
-            v = .5f * (a + c - (a - c) / cos2);
-          }
-          float cos = Kokkos::cosf(theta);
-          float sin = Kokkos::sinf(theta);
-
-          // We must now apply the rotation matrix to the left
-          // and right of E.S and on the right of U
-
-          // Left of E.S (mult by R(theta)^T)
-          for (int j = 0; j < MVPolynomialBasis_3D::size; j++)
-          {
-            float es_ipj = svd_es(i, p, j);
-            float es_iqj = svd_es(i, q, j);
-            svd_es(i, p, j) = cos * es_ipj + sin * es_iqj;
-            svd_es(i, q, j) = -sin * es_ipj + cos * es_iqj;
-          }
-
-          // Right of E.S (mult by R(theta))
-          for (int j = 0; j < MVPolynomialBasis_3D::size; j++)
-          {
-            float es_ijp = svd_es(i, j, p);
-            float es_ijq = svd_es(i, j, q);
-            svd_es(i, j, p) = cos * es_ijp + sin * es_ijq;
-            svd_es(i, j, q) = -sin * es_ijp + cos * es_ijq;
-          }
-
-          // Right of U (mult by R(theta))
-          for (int j = 0; j < MVPolynomialBasis_3D::size; j++)
-          {
-            float u_ijp = svd_u(i, j, p);
-            float u_ijq = svd_u(i, j, q);
-            svd_u(i, j, p) = cos * u_ijp + sin * u_ijq;
-            svd_u(i, j, q) = -sin * u_ijp + cos * u_ijq;
-          }
-
-          // These should theorically hold but is it ok to force them to their
-          // real value?
-          svd_es(i, p, p) = u;
-          svd_es(i, q, q) = v;
-          svd_es(i, p, q) = 0.f;
-          svd_es(i, q, p) = 0.f;
-
-          norm = argmax(p, q);
-        }
-
-        // We should now have a correct U and E.S
-        // We'll compute the pseudo inverse of A by taking the
-        // pseudo inverse of E.S which is simply inverting the diagonal of
-        // E.S. We have pseudoA = U^T.pseudoES.U
-        for (int j = 0; j < MVPolynomialBasis_3D::size; j++)
-        {
-          for (int k = 0; k < MVPolynomialBasis_3D::size; k++)
-          {
-            float value = 0.f;
-            for (int l = 0; l < MVPolynomialBasis_3D::size; l++)
-            {
-              if (Kokkos::abs(svd_es(i, l, l)) >= epsilon)
-              {
-                value += svd_u(i, j, l) * svd_u(i, k, l) / svd_es(i, l, l);
-              }
-            }
-
-            a_inv(i, j, k) = value;
-          }
-        }
-      });
+  // Compute the pseudo inverse
+  auto a_inv =
+      SymmPseudoInverseSVD<float, ExecutionSpace,
+                           MemorySpace>::compute_pseudo_inverses(space, a);
 
   // Compute the coefficients
   Kokkos::View<float **, MemorySpace> coeffs("Example::coefficients",
@@ -551,23 +408,27 @@ int main(int argc, char *argv[])
       Kokkos::create_mirror_view(target_values_exact);
   Kokkos::deep_copy(space, target_values_exact_host, target_values_exact);
 
-  std::stringstream ss{};
-  float error = 0.f;
-  for (int i = 0; i < target_points_num; i++)
+  if (mpi_rank == 0)
   {
-    error = Kokkos::max(
-        Kokkos::abs(target_values_host(i) - target_values_exact_host(i)) /
-            Kokkos::abs(target_values_exact_host(i)),
-        error);
-    ss << mpi_rank << ": ==== Target " << i << '\n'
-       << mpi_rank << ": Interpolation: " << target_values_host(i) << '\n'
-       << mpi_rank << ": Real value   : " << target_values_exact_host(i)
-       << '\n';
+    std::stringstream ss{};
+    float error = 0.f;
+    for (int i = 0; i < target_points_num; i++)
+    {
+      error = Kokkos::max(
+          Kokkos::abs(target_values_host(i) - target_values_exact_host(i)) /
+              Kokkos::abs(target_values_exact_host(i)),
+          error);
+      ss << mpi_rank << ": ==== Target " << i << '\n'
+         << mpi_rank << ": Interpolation: " << target_values_host(i) << '\n'
+         << mpi_rank << ": Real value   : " << target_values_exact_host(i)
+         << '\n';
+    }
+    ss << mpi_rank << ": ====\n"
+       << mpi_rank << ": Maximum relative error: " << error << std::endl;
+
+    std::cout << ss.str();
   }
-  ss << mpi_rank << ": ====\n"
-     << mpi_rank << ": Maximum relative error: " << error << std::endl;
 
-  std::cout << ss.str();
   MPI_Finalize();
   return 0;
 }
diff --git a/examples/moving_least_squares/symmetric_pseudoinverse_svd.hpp b/examples/moving_least_squares/symmetric_pseudoinverse_svd.hpp
new file mode 100644
index 000000000..57f33c665
--- /dev/null
+++ b/examples/moving_least_squares/symmetric_pseudoinverse_svd.hpp
@@ -0,0 +1,229 @@
+/****************************************************************************
+ * Copyright (c) 2023 by the ArborX authors                                 *
+ * All rights reserved.                                                     *
+ *                                                                          *
+ * This file is part of the ArborX library. ArborX is                       *
+ * distributed under a BSD 3-clause license. For the licensing terms see    *
+ * the LICENSE file in the top-level directory.                             *
+ *                                                                          *
+ * SPDX-License-Identifier: BSD-3-Clause                                    *
+ ****************************************************************************/
+
+#pragma once
+
+#include <Kokkos_Core.hpp>
+
+#include <cassert>
+#include <cmath>
+#include <limits>
+
+// Pseudo-inverse moment matrix using SVD
+// We must find U, E (diagonal and positive) and V such that A = U.E.V^T
+// We also know that A is symmetric (by construction), so U = SV where S is
+// a sign matrix (only 1 or -1 in the diagonal, 0 elsewhere).
+// Thus A = U.E.S.U^T
+template <class ValueType, typename ExecutionSpace, typename MemorySpace>
+class SymmPseudoInverseSVD
+{
+public:
+  static Kokkos::View<ValueType ***, MemorySpace>
+  compute_pseudo_inverses(ExecutionSpace const &space,
+                          Kokkos::View<ValueType ***, MemorySpace> const &mats)
+  {
+    SymmPseudoInverseSVD spis(space, mats);
+
+    // Iterative approach, we will "deconstruct" E.S until only the diagonal
+    // is relevent inside the matrix
+    // It is possible to prove that, at each step, the "norm" of the matrix
+    // is strictly less that of the previous
+    Kokkos::parallel_for(
+        "Example::SVD::compute_U_ES",
+        Kokkos::RangePolicy<ExecutionSpace>(space, 0, spis._num_matrices),
+        KOKKOS_LAMBDA(std::size_t i) {
+          std::size_t p, q;
+          ValueType norm = spis.argmax_off_diagonal(i, p, q);
+          while (norm > spis._epsilon)
+          {
+            spis.compute_u_es_single(i, p, q);
+            norm = spis.argmax_off_diagonal(i, p, q);
+          }
+        });
+
+    // From the SVD results, the pseudo inverse would be
+    // U . [ E^-1.S ] . U^T
+    Kokkos::parallel_for(
+        "Example::SVD::fill_inv",
+        Kokkos::MDRangePolicy<Kokkos::Rank<3>>(
+            space, {0, 0, 0}, {spis._num_matrices, spis._size, spis._size}),
+        KOKKOS_LAMBDA(std::size_t i, std::size_t j, std::size_t k) {
+          spis.fill_inv(i, j, k);
+        });
+
+    return spis._inv;
+  }
+
+private:
+  // U and E.S are computed, we can now build the inverse
+  // U . [ E^-1.S ] . U^T
+  KOKKOS_FUNCTION void fill_inv(std::size_t i, std::size_t j, std::size_t k) const
+  {
+    ValueType value = _zero;
+    for (std::size_t l = 0; l < _size; l++)
+    {
+      ValueType v = _es(i, l, l);
+      if (Kokkos::abs(v) > _epsilon)
+      {
+        value += _u(i, j, l) * _u(i, k, l) / v;
+      }
+    }
+
+    _inv(i, j, k) = value;
+  }
+
+  // We found the biggest value in our off-diagonal. We will remove it by
+  // computing a "local" svd and update U and E.S
+  KOKKOS_FUNCTION void compute_u_es_single(std::size_t i, std::size_t p,
+                                           std::size_t q) const
+  {
+    ValueType a = _es(i, p, p);
+    ValueType b = _es(i, p, q);
+    ValueType c = _es(i, q, q);
+
+    // Our submatrix is now
+    // +----------+----------+   +---+---+
+    // | es(p, p) | es(p, q) |   | a | b |
+    // +----------+----------+ = +---+---+
+    // | es(q, p) | es(q, q) |   | b | c |
+    // +----------+----------+   +---+---+
+
+    // Lets compute u, v and theta such that
+    // +---+---+              +---+---+
+    // | a | b |              | u | 0 |
+    // +---+---+ = R(theta) * +---+---+ * R(theta)^T
+    // | b | c |              | 0 | v |
+    // +---+---+              +---+---+
+
+    ValueType theta, u, v;
+    if (a == c) // <-- better to check if |a - c| < epsilon?
+    {
+      theta = _pi_4;
+      u = a + b;
+      v = a - b;
+    }
+    else
+    {
+      theta = _half * Kokkos::atan((_two * b) / (a - c));
+      ValueType a_c_cos2 = (a - c) / Kokkos::cos(_two * theta);
+      u = _half * (a + c + a_c_cos2);
+      v = _half * (a + c - a_c_cos2);
+    }
+    ValueType cos = Kokkos::cos(theta);
+    ValueType sin = Kokkos::sin(theta);
+
+    // Now lets compute the following new values for U amd E.S
+    // E.S <- R'(theta)^T . E.S . R'(theta)
+    // U  <- U . R'(theta)
+
+    // R'(theta)^T . E.S
+    for (std::size_t j = 0; j < _size; j++)
+    {
+      float es_ipj = _es(i, p, j);
+      float es_iqj = _es(i, q, j);
+      _es(i, p, j) = cos * es_ipj + sin * es_iqj;
+      _es(i, q, j) = -sin * es_ipj + cos * es_iqj;
+    }
+
+    // [R'(theta)^T . E.S] . R'(theta)
+    for (std::size_t j = 0; j < _size; j++)
+    {
+      float es_ijp = _es(i, j, p);
+      float es_ijq = _es(i, j, q);
+      _es(i, j, p) = cos * es_ijp + sin * es_ijq;
+      _es(i, j, q) = -sin * es_ijp + cos * es_ijq;
+    }
+
+    // U . R'(theta)
+    for (std::size_t j = 0; j < _size; j++)
+    {
+      float u_ijp = _u(i, j, p);
+      float u_ijq = _u(i, j, q);
+      _u(i, j, p) = cos * u_ijp + sin * u_ijq;
+      _u(i, j, q) = -sin * u_ijp + cos * u_ijq;
+    }
+
+    // These should theorically hold but is it ok to force them to their
+    // real value?
+    _es(i, p, p) = u;
+    _es(i, q, q) = v;
+    _es(i, p, q) = _zero;
+    _es(i, q, p) = _zero;
+  }
+
+  // This finds the biggest off-diagonal value of E.S as well as its
+  // coordinates. Being symmetric, we can always check on the upper
+  // triangle (and always have q > p)
+  KOKKOS_FUNCTION ValueType argmax_off_diagonal(std::size_t i, std::size_t &p,
+                                                std::size_t &q) const
+  {
+    ValueType max = _zero;
+    p = q = 0;
+    for (std::size_t j = 0; j < _size; j++)
+    {
+      for (std::size_t k = j + 1; k < _size; k++)
+      {
+        ValueType val = Kokkos::abs(_es(i, j, k));
+        if (max < val)
+        {
+          max = val;
+          p = j;
+          q = k;
+        }
+      }
+    }
+
+    return max;
+  }
+
+  KOKKOS_FUNCTION
+  SymmPseudoInverseSVD(ExecutionSpace const &space,
+                       Kokkos::View<ValueType ***, MemorySpace> const &mats)
+      : _num_matrices(mats.extent(0))
+      , _size(mats.extent(1))
+  {
+    // mats must be an array of (symmetric) square matrices
+    assert(mats.extent(1) == mats.extent(2));
+
+    _es = Kokkos::View<ValueType ***, MemorySpace>(
+        Kokkos::view_alloc(Kokkos::WithoutInitializing, "Example::SVD::ES"),
+        mats.layout());
+    Kokkos::deep_copy(space, _es, mats);
+
+    _u = Kokkos::View<ValueType ***, MemorySpace>(
+        Kokkos::view_alloc(Kokkos::WithoutInitializing, "Example::SVD::U"),
+        mats.layout());
+    Kokkos::parallel_for(
+        "Example::SVD::U_init",
+        Kokkos::MDRangePolicy<Kokkos::Rank<3>>(space, {0, 0, 0},
+                                               {_num_matrices, _size, _size}),
+        KOKKOS_LAMBDA(std::size_t i, std::size_t j, std::size_t k) {
+          _u(i, j, k) = ValueType((j == k));
+        });
+
+    _inv = Kokkos::View<ValueType ***, MemorySpace>(
+        Kokkos::view_alloc(Kokkos::WithoutInitializing, "Example::SVD::inv"),
+        mats.layout());
+  }
+
+  Kokkos::View<ValueType ***, MemorySpace> _es;
+  Kokkos::View<ValueType ***, MemorySpace> _u;
+  Kokkos::View<ValueType ***, MemorySpace> _inv;
+  std::size_t _num_matrices;
+  std::size_t _size;
+
+  static constexpr ValueType _pi_4 = ValueType(M_PI_4);
+  static constexpr ValueType _epsilon =
+      std::numeric_limits<ValueType>::epsilon();
+  static constexpr ValueType _half = ValueType(0.5);
+  static constexpr ValueType _two = ValueType(2);
+  static constexpr ValueType _zero = ValueType(0);
+};
\ No newline at end of file

From fdf04435b567ab113cce79ce30b93571378b8ae3 Mon Sep 17 00:00:00 2001
From: Yohann Bosqued <bosq.yohann@gmail.com>
Date: Wed, 9 Aug 2023 15:55:19 -0400
Subject: [PATCH 19/44] Fix source point indexing for per-rank slabs under MPI

---
 examples/moving_least_squares/moving_least_squares.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/moving_least_squares/moving_least_squares.cpp b/examples/moving_least_squares/moving_least_squares.cpp
index 36b069621..d9ecfd1bd 100644
--- a/examples/moving_least_squares/moving_least_squares.cpp
+++ b/examples/moving_least_squares/moving_least_squares.cpp
@@ -114,7 +114,7 @@ int main(int argc, char *argv[])
       Kokkos::MDRangePolicy<Kokkos::Rank<3>>(space, {0, 0, 0},
                                              {cube_side, cube_side, thickness}),
       KOKKOS_LAMBDA(int const i, int const j, int const k) {
-        source_points(i * cube_side * cube_side + j * cube_side +
+        source_points(i * cube_side * thickness + j * thickness +
                       k) = ArborX::Point{
             20.f * (float(i) / (cube_side - 1) - .5f),
             20.f * (float(j) / (cube_side - 1) - .5f),

From 7ccec9e1cb2690182ad8a0773b7982f4ada9fd5c Mon Sep 17 00:00:00 2001
From: Yohann Bosqued <bosq.yohann@gmail.com>
Date: Wed, 9 Aug 2023 16:17:04 -0400
Subject: [PATCH 20/44] clang format

---
 .../moving_least_squares.cpp                  | 33 +++++++++----------
 .../symmetric_pseudoinverse_svd.hpp           |  3 +-
 2 files changed, 17 insertions(+), 19 deletions(-)

diff --git a/examples/moving_least_squares/moving_least_squares.cpp b/examples/moving_least_squares/moving_least_squares.cpp
index d9ecfd1bd..b036fcfce 100644
--- a/examples/moving_least_squares/moving_least_squares.cpp
+++ b/examples/moving_least_squares/moving_least_squares.cpp
@@ -408,26 +408,23 @@ int main(int argc, char *argv[])
       Kokkos::create_mirror_view(target_values_exact);
   Kokkos::deep_copy(space, target_values_exact_host, target_values_exact);
 
-  if (mpi_rank == 0)
+  std::stringstream ss{};
+  float error = 0.f;
+  for (int i = 0; i < target_points_num; i++)
   {
-    std::stringstream ss{};
-    float error = 0.f;
-    for (int i = 0; i < target_points_num; i++)
-    {
-      error = Kokkos::max(
-          Kokkos::abs(target_values_host(i) - target_values_exact_host(i)) /
-              Kokkos::abs(target_values_exact_host(i)),
-          error);
-      ss << mpi_rank << ": ==== Target " << i << '\n'
-         << mpi_rank << ": Interpolation: " << target_values_host(i) << '\n'
-         << mpi_rank << ": Real value   : " << target_values_exact_host(i)
-         << '\n';
-    }
-    ss << mpi_rank << ": ====\n"
-       << mpi_rank << ": Maximum relative error: " << error << std::endl;
-
-    std::cout << ss.str();
+    error = Kokkos::max(
+        Kokkos::abs(target_values_host(i) - target_values_exact_host(i)) /
+            Kokkos::abs(target_values_exact_host(i)),
+        error);
+    /*
+    ss << mpi_rank << ": ==== Target " << i << '\n'
+        << mpi_rank << ": Interpolation: " << target_values_host(i) << '\n'
+        << mpi_rank << ": Real value   : " << target_values_exact_host(i)
+        << '\n'; */
   }
+  ss << mpi_rank << ": Maximum relative error: " << error << std::endl;
+
+  std::cout << ss.str();
 
   MPI_Finalize();
   return 0;
diff --git a/examples/moving_least_squares/symmetric_pseudoinverse_svd.hpp b/examples/moving_least_squares/symmetric_pseudoinverse_svd.hpp
index 57f33c665..62d7a08f7 100644
--- a/examples/moving_least_squares/symmetric_pseudoinverse_svd.hpp
+++ b/examples/moving_least_squares/symmetric_pseudoinverse_svd.hpp
@@ -65,7 +65,8 @@ class SymmPseudoInverseSVD
 private:
   // U and E.S are computed, we can now build the inverse
   // U . [ E^-1.S ] . U^T
-  KOKKOS_FUNCTION void fill_inv(std::size_t i, std::size_t j, std::size_t k) const
+  KOKKOS_FUNCTION void fill_inv(std::size_t i, std::size_t j,
+                                std::size_t k) const
   {
     ValueType value = _zero;
     for (std::size_t l = 0; l < _size; l++)

From df15ad8c03025331dcbdc17de340cd74e3cdc2ca Mon Sep 17 00:00:00 2001
From: Yohann Bosqued <bosq.yohann@gmail.com>
Date: Thu, 10 Aug 2023 10:36:53 -0400
Subject: [PATCH 21/44] Extract MPI communication into templated MPIComms helper

---
 .../moving_least_squares.cpp                  | 120 ++-------------
 examples/moving_least_squares/mpi_comms.hpp   | 145 ++++++++++++++++++
 2 files changed, 158 insertions(+), 107 deletions(-)
 create mode 100644 examples/moving_least_squares/mpi_comms.hpp

diff --git a/examples/moving_least_squares/moving_least_squares.cpp b/examples/moving_least_squares/moving_least_squares.cpp
index b036fcfce..dc5f2459c 100644
--- a/examples/moving_least_squares/moving_least_squares.cpp
+++ b/examples/moving_least_squares/moving_least_squares.cpp
@@ -22,6 +22,7 @@
 #include <limits>
 #include <sstream>
 
+#include "mpi_comms.hpp"
 #include "symmetric_pseudoinverse_svd.hpp"
 #include <mpi.h>
 
@@ -151,80 +152,9 @@ int main(int argc, char *argv[])
         local_ranks(i) = index_ranks(i).second;
       });
 
-  // Before moving on, we must gather the coordinates of all the requested
-  // source points. DTK does that by distributing in a "who wants what" matter
-  // The distribution is done in two phases. A first pass where every process
-  // receives the information on "who wants what" from them. Then a second pass
-  // is done where values are set up and sent back to processes
-
-  // First pass setup
-  ArborX::Details::Distributor<DeviceSpace> distributor_first(mpi_comm);
-  int const local_requests_num =
-      distributor_first.createFromSends(space, local_ranks);
-
-  // "Middlemen" buffers
-  // - mpi_mid_in_indices(i) corresponds to an index that will be used to
-  // construct the final value
-  // - mpi_mid_rank(i) corresponds to the request origin for value (i)
-  // - mpi_mid_indices(i) corresponds to the point's index in the nn query
-  // from which mpi_mid_points(i) is attached to
-  Kokkos::View<int *, MemorySpace> mpi_mid_in_indices(
-      "Example::mpi_mid_in_indices", local_requests_num);
-  Kokkos::View<int *, MemorySpace> mpi_mid_indices("Example::mpi_mid_indices",
-                                                   local_requests_num);
-  Kokkos::View<int *, MemorySpace> mpi_mid_ranks("Example::mpi_mid_ranks",
-                                                 local_requests_num);
-  Kokkos::View<ArborX::Point *, MemorySpace> mpi_mid_points(
-      "Example::mpi_mid_points", local_requests_num);
-
-  // First pass comms
-  Kokkos::View<int *, MemorySpace> mpi_tmp("Example::mpi_tmp",
-                                           target_points_num * num_neighbors);
-  ArborX::iota(space, mpi_tmp);
-  ArborX::Details::DistributedTreeImpl<DeviceSpace>::sendAcrossNetwork(
-      space, distributor_first, mpi_tmp, mpi_mid_in_indices);
-  ArborX::Details::DistributedTreeImpl<DeviceSpace>::sendAcrossNetwork(
-      space, distributor_first, local_indices, mpi_mid_indices);
-  Kokkos::deep_copy(space, mpi_tmp, mpi_rank);
-  ArborX::Details::DistributedTreeImpl<DeviceSpace>::sendAcrossNetwork(
-      space, distributor_first, mpi_tmp, mpi_mid_ranks);
-  Kokkos::parallel_for(
-      "Example::mpi_mid_points_fill",
-      Kokkos::RangePolicy<ExecutionSpace>(space, 0, local_requests_num),
-      KOKKOS_LAMBDA(int const i) {
-        mpi_mid_points(i) = source_points(mpi_mid_indices(i));
-      });
-
-  // This process now knows "who wants what" and is ready to send everything
-  // back
-
-  // Second pass setup
-  ArborX::Details::Distributor<DeviceSpace> distributor_second(mpi_comm);
-  int const local_responses_num =
-      distributor_second.createFromSends(space, mpi_mid_ranks);
-  Kokkos::View<ArborX::Point *, MemorySpace> local_untreated_source_points(
-      "Example::local_untreated_source_points",
-      target_points_num * num_neighbors);
-  // We have local_responses_num == target_points_num * num_neighbors
-
-  // Temporary buffers
-  Kokkos::View<int *, MemorySpace> mpi_tmp_in_indices(
-      "Examples::mpi_tmp_in_indices", local_responses_num);
-  Kokkos::View<ArborX::Point *, MemorySpace> mpi_tmp_points(
-      "Examples::mpi_tmp_points", local_responses_num);
-
-  // Second pass comms
-  ArborX::Details::DistributedTreeImpl<DeviceSpace>::sendAcrossNetwork(
-      space, distributor_second, mpi_mid_points, mpi_tmp_points);
-  ArborX::Details::DistributedTreeImpl<DeviceSpace>::sendAcrossNetwork(
-      space, distributor_second, mpi_mid_in_indices, mpi_tmp_in_indices);
-  Kokkos::parallel_for(
-      "Example::local_untreated_source_points_fill",
-      Kokkos::RangePolicy<ExecutionSpace>(space, 0, local_responses_num),
-      KOKKOS_LAMBDA(int const i) {
-        local_untreated_source_points(mpi_tmp_in_indices(i)) =
-            mpi_tmp_points(i);
-      });
+  MPIComms<ExecutionSpace, MemorySpace> comms(space, mpi_comm, local_indices,
+                                              local_ranks);
+  auto local_source_points = comms.distribute(space, source_points);
 
   // Now that we have the neighbors, we recompute their position using
   // their target point as the origin.
@@ -238,9 +168,9 @@ int main(int argc, char *argv[])
         for (int j = offsets(i); j < offsets(i + 1); j++)
         {
           tr_source_points(i, j - offsets(i)) = ArborX::Point{
-              local_untreated_source_points(j)[0] - target_points(i)[0],
-              local_untreated_source_points(j)[1] - target_points(i)[1],
-              local_untreated_source_points(j)[2] - target_points(i)[2],
+              local_source_points(j)[0] - target_points(i)[0],
+              local_source_points(j)[1] - target_points(i)[1],
+              local_source_points(j)[2] - target_points(i)[2],
           };
         }
       });
@@ -350,31 +280,7 @@ int main(int argc, char *argv[])
         source_values(i) = manufactured_solution(source_points(i));
       });
 
-  // To approximate the function, we have to gather the correct source values
-  // We have to redo part of the earlier passes
-  Kokkos::View<float *, MemorySpace> mpi_mid_values("Example::mpi_mid_values",
-                                                    local_requests_num);
-  Kokkos::parallel_for(
-      "Example::mpi_mid_values_fill",
-      Kokkos::RangePolicy<ExecutionSpace>(space, 0, local_requests_num),
-      KOKKOS_LAMBDA(int const i) {
-        mpi_mid_values(i) = source_values(mpi_mid_indices(i));
-      });
-
-  Kokkos::View<float *, MemorySpace> local_untreated_source_values(
-      "Example::local_untreated_source_values",
-      target_points_num * num_neighbors);
-  Kokkos::View<float *, MemorySpace> mpi_tmp_values("Examples::mpi_tmp_values",
-                                                    local_responses_num);
-  ArborX::Details::DistributedTreeImpl<DeviceSpace>::sendAcrossNetwork(
-      space, distributor_second, mpi_mid_values, mpi_tmp_values);
-  Kokkos::parallel_for(
-      "Example::local_untreated_source_values_fill",
-      Kokkos::RangePolicy<ExecutionSpace>(space, 0, local_responses_num),
-      KOKKOS_LAMBDA(int const i) {
-        local_untreated_source_values(mpi_tmp_in_indices(i)) =
-            mpi_tmp_values(i);
-      });
+  auto local_source_values = comms.distribute(space, source_values);
 
   // Compute target values via interpolation
   Kokkos::View<float *, MemorySpace> target_values("Example::target_values",
@@ -386,7 +292,7 @@ int main(int argc, char *argv[])
         float tmp = 0;
         for (int j = offsets(i); j < offsets(i + 1); j++)
         {
-          tmp += coeffs(i, j - offsets(i)) * local_untreated_source_values(j);
+          tmp += coeffs(i, j - offsets(i)) * local_source_values(j);
         }
         target_values(i) = tmp;
       });
@@ -416,11 +322,11 @@ int main(int argc, char *argv[])
         Kokkos::abs(target_values_host(i) - target_values_exact_host(i)) /
             Kokkos::abs(target_values_exact_host(i)),
         error);
-    /*
+
     ss << mpi_rank << ": ==== Target " << i << '\n'
-        << mpi_rank << ": Interpolation: " << target_values_host(i) << '\n'
-        << mpi_rank << ": Real value   : " << target_values_exact_host(i)
-        << '\n'; */
+       << mpi_rank << ": Interpolation: " << target_values_host(i) << '\n'
+       << mpi_rank << ": Real value   : " << target_values_exact_host(i)
+       << '\n';
   }
   ss << mpi_rank << ": Maximum relative error: " << error << std::endl;
 
diff --git a/examples/moving_least_squares/mpi_comms.hpp b/examples/moving_least_squares/mpi_comms.hpp
new file mode 100644
index 000000000..8fc04ffda
--- /dev/null
+++ b/examples/moving_least_squares/mpi_comms.hpp
@@ -0,0 +1,145 @@
+/****************************************************************************
+ * Copyright (c) 2023 by the ArborX authors                                 *
+ * All rights reserved.                                                     *
+ *                                                                          *
+ * This file is part of the ArborX library. ArborX is                       *
+ * distributed under a BSD 3-clause license. For the licensing terms see    *
+ * the LICENSE file in the top-level directory.                             *
+ *                                                                          *
+ * SPDX-License-Identifier: BSD-3-Clause                                    *
+ ****************************************************************************/
+
+#pragma once
+
+#include <ArborX.hpp>
+
+#include <Kokkos_Core.hpp>
+
+#include <cassert>
+
+#include <mpi.h>
+
+template <typename ExecutionSpace, typename MemorySpace>
+class MPIComms
+{
+public:
+  MPIComms(ExecutionSpace const &space, MPI_Comm comm,
+        Kokkos::View<int *, MemorySpace> indices,
+        Kokkos::View<int *, MemorySpace> ranks)
+      : _distributor_back(comm)
+  {
+    assert(indices.extent(0) == ranks.extent(0));
+    std::size_t data_len = indices.extent(0);
+    int rank;
+    MPI_Comm_rank(comm, &rank);
+
+    Kokkos::View<int *, MemorySpace> mpi_tmp(
+        Kokkos::view_alloc(Kokkos::WithoutInitializing, "Example::MPI::tmp"),
+        data_len);
+
+    // Computes what will be common to every exchange. Every time
+    // someone wants to get the values of the same set of elements,
+    // they will use the same lists of recv and send indices.
+    // The rank data will be saved inside the back distributor,
+    // as the forth one is not relevant once the recv indices
+    // are computed.
+
+    // This builds for each process a local array indicating how much
+    // information will be gathered
+    ArborX::Details::Distributor<device> distributor_forth(comm);
+    _num_requests = distributor_forth.createFromSends(space, ranks);
+
+    // This creates the temporary buffer that will help when producing the
+    // array that rebuilds the output
+    Kokkos::View<int *, MemorySpace> mpi_rev_indices(
+        Kokkos::view_alloc(Kokkos::WithoutInitializing,
+                           "Example::MPI::rev_indices"),
+        _num_requests);
+    ArborX::iota(space, mpi_tmp);
+    ArborX::Details::DistributedTreeImpl<device>::sendAcrossNetwork(
+        space, distributor_forth, mpi_tmp, mpi_rev_indices);
+
+    // This retrieves which source index a process wants and gives it to
+    // the process owning the source
+    _mpi_send_indices = Kokkos::View<int *, MemorySpace>(
+        Kokkos::view_alloc(Kokkos::WithoutInitializing,
+                           "Example::MPI::send_indices"),
+        _num_requests);
+    ArborX::Details::DistributedTreeImpl<device>::sendAcrossNetwork(
+        space, distributor_forth, indices, _mpi_send_indices);
+
+    // This builds the temporary buffer that will create the reverse
+    // distributor to dispatch the values
+    Kokkos::View<int *, MemorySpace> mpi_rev_ranks(
+        Kokkos::view_alloc(Kokkos::WithoutInitializing,
+                           "Example::MPI::rev_ranks"),
+        _num_requests);
+    Kokkos::deep_copy(space, mpi_tmp, rank);
+    ArborX::Details::DistributedTreeImpl<device>::sendAcrossNetwork(
+        space, distributor_forth, mpi_tmp, mpi_rev_ranks);
+
+    // This will create the reverse of the previous distributor
+    _num_responses = _distributor_back.createFromSends(space, mpi_rev_ranks);
+
+    // There should be enough responses to perfectly fill what was requested
+    assert(_num_responses == data_len);
+
+    // Then we send back the requested indices so that each process can
+    // rebuild the output in its original order
+    _mpi_recv_indices = Kokkos::View<int *, MemorySpace>(
+        Kokkos::view_alloc(Kokkos::WithoutInitializing,
+                           "Example::MPI::recv_indices"),
+        _num_responses);
+    ArborX::Details::DistributedTreeImpl<device>::sendAcrossNetwork(
+        space, _distributor_back, mpi_rev_indices, _mpi_recv_indices);
+  }
+
+  template <typename ValueType>
+  Kokkos::View<ValueType *, MemorySpace>
+  distribute(ExecutionSpace const &space,
+             Kokkos::View<ValueType *, MemorySpace> const &source)
+  {
+    // We know what each process wants, so we prepare the data to be sent
+    Kokkos::View<ValueType *, MemorySpace> data_to_send(
+        Kokkos::view_alloc(Kokkos::WithoutInitializing,
+                           "Example::MPI::data_to_send"),
+        _num_requests);
+    Kokkos::parallel_for(
+        "Example::MPI::data_to_send_fill",
+        Kokkos::RangePolicy<ExecutionSpace>(space, 0, _num_requests),
+        KOKKOS_CLASS_LAMBDA(int const i) {
+          data_to_send(i) = source(_mpi_send_indices(i));
+        });
+
+    // Then we properly send it, and each process has what it wants, but in the
+    // wrong order
+    Kokkos::View<ValueType *, MemorySpace> data_to_recv(
+        Kokkos::view_alloc(Kokkos::WithoutInitializing,
+                           "Example::MPI::data_to_recv"),
+        _num_responses);
+    ArborX::Details::DistributedTreeImpl<device>::sendAcrossNetwork(
+        space, _distributor_back, data_to_send, data_to_recv);
+
+    // So we fix this by scattering each value to its recorded output index
+    Kokkos::View<ValueType *, MemorySpace> output(
+        Kokkos::view_alloc(Kokkos::WithoutInitializing, "Example::MPI::output"),
+        _num_responses);
+    Kokkos::parallel_for(
+        "Example::MPI::output_fill",
+        Kokkos::RangePolicy<ExecutionSpace>(space, 0, _num_responses),
+        KOKKOS_CLASS_LAMBDA(int const i) {
+          output(_mpi_recv_indices(i)) = data_to_recv(i);
+        });
+
+    return output;
+  }
+
+private:
+  using device = Kokkos::Device<ExecutionSpace, MemorySpace>;
+
+  Kokkos::View<int *, MemorySpace> _mpi_send_indices;
+  Kokkos::View<int *, MemorySpace> _mpi_recv_indices;
+  ArborX::Details::Distributor<device> _distributor_back;
+  std::size_t _num_requests;  // length of _mpi_send_indices
+  std::size_t _num_responses; // length of _mpi_recv_indices
+};
\ No newline at end of file

From 0d3f7239a3cc376923cab1501c8d1908ce7c94c2 Mon Sep 17 00:00:00 2001
From: Yohann Bosqued <bosq.yohann@gmail.com>
Date: Thu, 10 Aug 2023 12:45:40 -0400
Subject: [PATCH 22/44] clang format

---
 examples/moving_least_squares/mpi_comms.hpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/moving_least_squares/mpi_comms.hpp b/examples/moving_least_squares/mpi_comms.hpp
index 8fc04ffda..96aef8c0e 100644
--- a/examples/moving_least_squares/mpi_comms.hpp
+++ b/examples/moving_least_squares/mpi_comms.hpp
@@ -24,8 +24,8 @@ class MPIComms
 {
 public:
   MPIComms(ExecutionSpace const &space, MPI_Comm comm,
-        Kokkos::View<int *, MemorySpace> indices,
-        Kokkos::View<int *, MemorySpace> ranks)
+           Kokkos::View<int *, MemorySpace> indices,
+           Kokkos::View<int *, MemorySpace> ranks)
       : _distributor_back(comm)
   {
     assert(indices.extent(0) == ranks.extent(0));

From 4edbe1972ac9c6cf7a2980ebe7f12e74c2634362 Mon Sep 17 00:00:00 2001
From: Yohann Bosqued <bosq.yohann@gmail.com>
Date: Thu, 10 Aug 2023 13:36:40 -0400
Subject: [PATCH 23/44] Switching from std::size_t to int const and replacing
 leftover hard-coded floats with ValueType

---
 .../symmetric_pseudoinverse_svd.hpp           | 43 +++++++++----------
 1 file changed, 21 insertions(+), 22 deletions(-)

diff --git a/examples/moving_least_squares/symmetric_pseudoinverse_svd.hpp b/examples/moving_least_squares/symmetric_pseudoinverse_svd.hpp
index 62d7a08f7..6be2cf5d7 100644
--- a/examples/moving_least_squares/symmetric_pseudoinverse_svd.hpp
+++ b/examples/moving_least_squares/symmetric_pseudoinverse_svd.hpp
@@ -39,8 +39,8 @@ class SymmPseudoInverseSVD
     Kokkos::parallel_for(
         "Example::SVD::compute_U_ES",
         Kokkos::RangePolicy<ExecutionSpace>(space, 0, spis._num_matrices),
-        KOKKOS_LAMBDA(std::size_t i) {
-          std::size_t p, q;
+        KOKKOS_LAMBDA(int const i) {
+          int p, q;
           ValueType norm = spis.argmax_off_diagonal(i, p, q);
           while (norm > spis._epsilon)
           {
@@ -55,7 +55,7 @@ class SymmPseudoInverseSVD
         "Example::SVD::fill_inv",
         Kokkos::MDRangePolicy<Kokkos::Rank<3>>(
             space, {0, 0, 0}, {spis._num_matrices, spis._size, spis._size}),
-        KOKKOS_LAMBDA(std::size_t i, std::size_t j, std::size_t k) {
+        KOKKOS_LAMBDA(int const i, int const j, int const k) {
           spis.fill_inv(i, j, k);
         });
 
@@ -65,11 +65,10 @@ class SymmPseudoInverseSVD
 private:
   // U and E.S are computed, we can now build the inverse
   // U . [ E^-1.S ] . U^T
-  KOKKOS_FUNCTION void fill_inv(std::size_t i, std::size_t j,
-                                std::size_t k) const
+  KOKKOS_FUNCTION void fill_inv(int const i, int const j, int const k) const
   {
     ValueType value = _zero;
-    for (std::size_t l = 0; l < _size; l++)
+    for (int l = 0; l < _size; l++)
     {
       ValueType v = _es(i, l, l);
       if (Kokkos::abs(v) > _epsilon)
@@ -83,8 +82,8 @@ class SymmPseudoInverseSVD
 
   // We found the biggest value in our off-diagonal. We will remove it by
   // computing a "local" svd and update U and E.S
-  KOKKOS_FUNCTION void compute_u_es_single(std::size_t i, std::size_t p,
-                                           std::size_t q) const
+  KOKKOS_FUNCTION void compute_u_es_single(int const i, int const p,
+                                           int const q) const
   {
     ValueType a = _es(i, p, p);
     ValueType b = _es(i, p, q);
@@ -126,28 +125,28 @@ class SymmPseudoInverseSVD
     // U  <- U . R'(theta)
 
     // R'(theta)^T . E.S
-    for (std::size_t j = 0; j < _size; j++)
+    for (int j = 0; j < _size; j++)
     {
-      float es_ipj = _es(i, p, j);
-      float es_iqj = _es(i, q, j);
+      ValueType es_ipj = _es(i, p, j);
+      ValueType es_iqj = _es(i, q, j);
       _es(i, p, j) = cos * es_ipj + sin * es_iqj;
       _es(i, q, j) = -sin * es_ipj + cos * es_iqj;
     }
 
     // [R'(theta)^T . E.S] . R'(theta)
-    for (std::size_t j = 0; j < _size; j++)
+    for (int j = 0; j < _size; j++)
     {
-      float es_ijp = _es(i, j, p);
-      float es_ijq = _es(i, j, q);
+      ValueType es_ijp = _es(i, j, p);
+      ValueType es_ijq = _es(i, j, q);
       _es(i, j, p) = cos * es_ijp + sin * es_ijq;
       _es(i, j, q) = -sin * es_ijp + cos * es_ijq;
     }
 
     // U . R'(theta)
-    for (std::size_t j = 0; j < _size; j++)
+    for (int j = 0; j < _size; j++)
     {
-      float u_ijp = _u(i, j, p);
-      float u_ijq = _u(i, j, q);
+      ValueType u_ijp = _u(i, j, p);
+      ValueType u_ijq = _u(i, j, q);
       _u(i, j, p) = cos * u_ijp + sin * u_ijq;
       _u(i, j, q) = -sin * u_ijp + cos * u_ijq;
     }
@@ -163,14 +162,14 @@ class SymmPseudoInverseSVD
   // This finds the biggest off-diagonal value of E.S as well as its
   // coordinates. Being symmetric, we can always check on the upper
   // triangle (and always have q > p)
-  KOKKOS_FUNCTION ValueType argmax_off_diagonal(std::size_t i, std::size_t &p,
-                                                std::size_t &q) const
+  KOKKOS_FUNCTION ValueType argmax_off_diagonal(int const i, int &p,
+                                                int &q) const
   {
     ValueType max = _zero;
     p = q = 0;
-    for (std::size_t j = 0; j < _size; j++)
+    for (int j = 0; j < _size; j++)
     {
-      for (std::size_t k = j + 1; k < _size; k++)
+      for (int k = j + 1; k < _size; k++)
       {
         ValueType val = Kokkos::abs(_es(i, j, k));
         if (max < val)
@@ -206,7 +205,7 @@ class SymmPseudoInverseSVD
         "Example::SVD::U_init",
         Kokkos::MDRangePolicy<Kokkos::Rank<3>>(space, {0, 0, 0},
                                                {_num_matrices, _size, _size}),
-        KOKKOS_LAMBDA(std::size_t i, std::size_t j, std::size_t k) {
+        KOKKOS_LAMBDA(int const i, int const j, int const k) {
           _u(i, j, k) = ValueType((j == k));
         });
 

From 3a9afcf570202d8f272d8252ba812ebc6343752e Mon Sep 17 00:00:00 2001
From: Yohann Bosqued <bosq.yohann@gmail.com>
Date: Thu, 10 Aug 2023 14:37:59 -0400
Subject: [PATCH 24/44] Templating of the proper MLS computation

---
 .../moving_least_squares/mls_computation.hpp  | 233 ++++++++++++++++++
 .../moving_least_squares.cpp                  | 137 +---------
 2 files changed, 240 insertions(+), 130 deletions(-)
 create mode 100644 examples/moving_least_squares/mls_computation.hpp

diff --git a/examples/moving_least_squares/mls_computation.hpp b/examples/moving_least_squares/mls_computation.hpp
new file mode 100644
index 000000000..02c6c580a
--- /dev/null
+++ b/examples/moving_least_squares/mls_computation.hpp
@@ -0,0 +1,233 @@
+/****************************************************************************
+ * Copyright (c) 2023 by the ArborX authors                                 *
+ * All rights reserved.                                                     *
+ *                                                                          *
+ * This file is part of the ArborX library. ArborX is                       *
+ * distributed under a BSD 3-clause license. For the licensing terms see    *
+ * the LICENSE file in the top-level directory.                             *
+ *                                                                          *
+ * SPDX-License-Identifier: BSD-3-Clause                                    *
+ ****************************************************************************/
+
+#pragma once
+
+#include <ArborX.hpp>
+
+#include <Kokkos_Core.hpp>
+
+#include <cassert>
+
+#include "symmetric_pseudoinverse_svd.hpp"
+
+// Precomputes MLS coefficients per target; eval() then interpolates values.
+template <typename ValueType, typename PolynomialBasis, typename RBF,
+          typename ExecutionSpace, typename MemorySpace>
+class MLSComputation
+{
+public:
+  MLSComputation(
+      ExecutionSpace const &space,
+      Kokkos::View<ArborX::Point *, MemorySpace> const &source_points,
+      Kokkos::View<ArborX::Point *, MemorySpace> const &target_points)
+  {
+    // There must be a list of num_neighbors source points for each target
+    // point. Without targets there are no neighbors (avoids dividing by zero).
+    _num_targets = target_points.extent(0);
+    _num_neighbors = _num_targets ? source_points.extent(0) / _num_targets : 0;
+    assert(source_points.extent(0) == _num_targets * _num_neighbors);
+
+    auto source_ref_target =
+        translate_to_target(space, source_points, target_points);
+
+    auto radii = compute_radii(space, source_ref_target);
+    auto phi = compute_weight(space, source_ref_target, radii);
+    auto p = compute_vandermonde(space, source_ref_target);
+
+    auto a = compute_moment(space, phi, p);
+    auto a_inv =
+        SymmPseudoInverseSVD<ValueType, ExecutionSpace,
+                             MemorySpace>::compute_pseudo_inverses(space, a);
+
+    compute_coefficients(space, phi, p, a_inv);
+  }
+
+  Kokkos::View<ValueType *, MemorySpace>
+  eval(ExecutionSpace const &space,
+       Kokkos::View<ValueType *, MemorySpace> const &source_values)
+  {
+    Kokkos::View<ValueType *, MemorySpace> target_values(
+        "Example::MLS::target_values", _num_targets);
+    Kokkos::parallel_for(
+        "Example::MLS::target_interpolation",
+        Kokkos::RangePolicy<ExecutionSpace>(space, 0, _num_targets),
+        KOKKOS_CLASS_LAMBDA(int const i) {
+          ValueType tmp = _zero;
+          for (int j = 0; j < _num_neighbors; j++)
+          {
+            tmp += _coeffs(i, j) * source_values(i * _num_neighbors + j);
+          }
+          target_values(i) = tmp;
+        });
+
+    return target_values;
+  }
+
+private:
+  Kokkos::View<ArborX::Point **, MemorySpace> translate_to_target(
+      ExecutionSpace const &space,
+      Kokkos::View<ArborX::Point *, MemorySpace> const &source_points,
+      Kokkos::View<ArborX::Point *, MemorySpace> const &target_points)
+  {
+    // We center each group around its target, as it enables us to
+    // optimize the final computation
+    Kokkos::View<ArborX::Point **, MemorySpace> source_ref_target(
+        Kokkos::view_alloc(Kokkos::WithoutInitializing,
+                           "Example::MLS::source_ref_target"),
+        _num_targets, _num_neighbors);
+    Kokkos::parallel_for(
+        "Example::MLS::source_ref_target_fill",
+        Kokkos::MDRangePolicy<Kokkos::Rank<2>>(space, {0, 0},
+                                               {_num_targets, _num_neighbors}),
+        KOKKOS_CLASS_LAMBDA(int const i, int const j) {
+          ArborX::Point src = source_points(i * _num_neighbors + j);
+          ArborX::Point tgt = target_points(i);
+          source_ref_target(i, j) = ArborX::Point{
+              src[0] - tgt[0],
+              src[1] - tgt[1],
+              src[2] - tgt[2],
+          };
+        });
+
+    return source_ref_target;
+  }
+
+  Kokkos::View<ValueType *, MemorySpace> compute_radii(
+      ExecutionSpace const &space,
+      Kokkos::View<ArborX::Point **, MemorySpace> const &source_ref_target)
+  {
+    Kokkos::View<ValueType *, MemorySpace> radii(
+        Kokkos::view_alloc(Kokkos::WithoutInitializing, "Example::MLS::radii"),
+        _num_targets);
+    Kokkos::parallel_for(
+        "Example::MLS::radii_computation",
+        Kokkos::RangePolicy<ExecutionSpace>(space, 0, _num_targets),
+        KOKKOS_CLASS_LAMBDA(int const i) {
+          ValueType radius = _ten * _epsilon;
+          for (int j = 0; j < _num_neighbors; j++)
+          {
+            ValueType norm = ArborX::Details::distance(
+                source_ref_target(i, j), ArborX::Point{0.f, 0.f, 0.f});
+            radius = (radius < norm) ? norm : radius;
+          }
+          radii(i) = _one_extra * radius;
+        });
+
+    return radii;
+  }
+
+  Kokkos::View<ValueType **, MemorySpace> compute_weight(
+      ExecutionSpace const &space,
+      Kokkos::View<ArborX::Point **, MemorySpace> const &source_ref_target,
+      Kokkos::View<ValueType *, MemorySpace> const &radii)
+  {
+    Kokkos::View<ValueType **, MemorySpace> phi(
+        Kokkos::view_alloc(Kokkos::WithoutInitializing, "Example::MLS::phi"),
+        _num_targets, _num_neighbors);
+    Kokkos::parallel_for(
+        "Example::MLS::phi_computation",
+        Kokkos::MDRangePolicy<Kokkos::Rank<2>>(space, {0, 0},
+                                               {_num_targets, _num_neighbors}),
+        KOKKOS_LAMBDA(int const i, int const j) {
+          RBF rbf{radii(i)};
+          ValueType norm = ArborX::Details::distance(
+              source_ref_target(i, j), ArborX::Point{0.f, 0.f, 0.f});
+          phi(i, j) = rbf(norm);
+        });
+
+    return phi;
+  }
+
+  Kokkos::View<ValueType ***, MemorySpace> compute_vandermonde(
+      ExecutionSpace const &space,
+      Kokkos::View<ArborX::Point **, MemorySpace> const &source_ref_target)
+  {
+    // Instead of relying on an external type, could it be produced
+    // automatically?
+    Kokkos::View<ValueType ***, MemorySpace> p(
+        Kokkos::view_alloc(Kokkos::WithoutInitializing,
+                           "Example::MLS::vandermonde"),
+        _num_targets, _num_neighbors, PolynomialBasis::size);
+    Kokkos::parallel_for(
+        "Example::MLS::vandermonde_computation",
+        Kokkos::MDRangePolicy<Kokkos::Rank<2>>(space, {0, 0},
+                                               {_num_targets, _num_neighbors}),
+        KOKKOS_LAMBDA(int const i, int const j) {
+          auto basis = PolynomialBasis::basis(source_ref_target(i, j));
+          for (int k = 0; k < PolynomialBasis::size; k++)
+          {
+            p(i, j, k) = basis[k];
+          }
+        });
+
+    return p;
+  }
+
+  Kokkos::View<ValueType ***, MemorySpace>
+  compute_moment(ExecutionSpace const &space,
+                 Kokkos::View<ValueType **, MemorySpace> const &phi,
+                 Kokkos::View<ValueType ***, MemorySpace> const &p)
+  {
+    Kokkos::View<ValueType ***, MemorySpace> a(
+        Kokkos::view_alloc(Kokkos::WithoutInitializing, "Example::MLS::moment"),
+        _num_targets, PolynomialBasis::size, PolynomialBasis::size);
+    Kokkos::parallel_for(
+        "Example::MLS::moment_computation",
+        Kokkos::MDRangePolicy<Kokkos::Rank<3>>(
+            space, {0, 0, 0},
+            {_num_targets, PolynomialBasis::size, PolynomialBasis::size}),
+        KOKKOS_CLASS_LAMBDA(int const i, int const j, int const k) {
+          ValueType tmp = _zero;
+          for (int l = 0; l < _num_neighbors; l++)
+          {
+            tmp += p(i, l, j) * p(i, l, k) * phi(i, l);
+          }
+          a(i, j, k) = tmp;
+        });
+
+    return a;
+  }
+
+  void
+  compute_coefficients(ExecutionSpace const &space,
+                       Kokkos::View<ValueType **, MemorySpace> const &phi,
+                       Kokkos::View<ValueType ***, MemorySpace> const &p,
+                       Kokkos::View<ValueType ***, MemorySpace> const &a_inv)
+  {
+    _coeffs = Kokkos::View<ValueType **, MemorySpace>(
+        Kokkos::view_alloc(Kokkos::WithoutInitializing,
+                           "Example::MLS::coefficients"),
+        _num_targets, _num_neighbors);
+    Kokkos::parallel_for(
+        "Example::MLS::coefficients",
+        Kokkos::MDRangePolicy<Kokkos::Rank<2>>(space, {0, 0},
+                                               {_num_targets, _num_neighbors}),
+        KOKKOS_CLASS_LAMBDA(int const i, int const j) {
+          ValueType tmp = _zero;
+          for (int k = 0; k < PolynomialBasis::size; k++)
+          {
+            tmp += a_inv(i, 0, k) * p(i, j, k) * phi(i, j);
+          }
+          _coeffs(i, j) = tmp;
+        });
+  }
+
+  Kokkos::View<ValueType **, MemorySpace> _coeffs;
+  std::size_t _num_targets;
+  std::size_t _num_neighbors;
+
+  static constexpr ValueType _zero = ValueType(0);
+  static constexpr ValueType _ten = ValueType(10);
+  static constexpr ValueType _epsilon =
+      std::numeric_limits<ValueType>::epsilon();
+  static constexpr ValueType _one_extra = ValueType(1.1);
+};
\ No newline at end of file
diff --git a/examples/moving_least_squares/moving_least_squares.cpp b/examples/moving_least_squares/moving_least_squares.cpp
index dc5f2459c..a9b7ae8f4 100644
--- a/examples/moving_least_squares/moving_least_squares.cpp
+++ b/examples/moving_least_squares/moving_least_squares.cpp
@@ -22,8 +22,8 @@
 #include <limits>
 #include <sstream>
 
+#include "mls_computation.hpp"
 #include "mpi_comms.hpp"
-#include "symmetric_pseudoinverse_svd.hpp"
 #include <mpi.h>
 
 using ExecutionSpace = Kokkos::DefaultExecutionSpace;
@@ -45,8 +45,8 @@ struct MVPolynomialBasis_3D
 {
   static constexpr std::size_t size = 10;
 
-  KOKKOS_INLINE_FUNCTION Kokkos::Array<float, size>
-  operator()(ArborX::Point const &p) const
+  KOKKOS_INLINE_FUNCTION static Kokkos::Array<float, size>
+  basis(ArborX::Point const &p)
   {
     return {{1.f, p[0], p[1], p[2], p[0] * p[0], p[0] * p[1], p[0] * p[2],
              p[1] * p[1], p[1] * p[2], p[2] * p[2]}};
@@ -156,119 +156,9 @@ int main(int argc, char *argv[])
                                               local_ranks);
   auto local_source_points = comms.distribute(space, source_points);
 
-  // Now that we have the neighbors, we recompute their position using
-  // their target point as the origin.
-  // This is used as an optimisation later in the algorithm
-  Kokkos::View<ArborX::Point **, MemorySpace> tr_source_points(
-      "Example::tr_source_points", target_points_num, num_neighbors);
-  Kokkos::parallel_for(
-      "Example::transform_source_points",
-      Kokkos::RangePolicy<ExecutionSpace>(space, 0, target_points_num),
-      KOKKOS_LAMBDA(int const i) {
-        for (int j = offsets(i); j < offsets(i + 1); j++)
-        {
-          tr_source_points(i, j - offsets(i)) = ArborX::Point{
-              local_source_points(j)[0] - target_points(i)[0],
-              local_source_points(j)[1] - target_points(i)[1],
-              local_source_points(j)[2] - target_points(i)[2],
-          };
-        }
-      });
-
-  // Compute the radii for the weight (phi) vector
-  Kokkos::View<float *, MemorySpace> radii("Example::radii", target_points_num);
-  Kokkos::parallel_for(
-      "Example::radii_computation",
-      Kokkos::RangePolicy<ExecutionSpace>(space, 0, target_points_num),
-      KOKKOS_LAMBDA(int const i) {
-        float radius = 10.f * epsilon;
-
-        for (int j = 0; j < num_neighbors; j++)
-        {
-          float norm = ArborX::Details::distance(tr_source_points(i, j),
-                                                 ArborX::Point{0.f, 0.f, 0.f});
-          radius = (radius < norm) ? norm : radius;
-        }
-
-        radii(i) = 1.1f * radius;
-      });
-
-  // Compute the weight (phi) vector
-  Kokkos::View<float **, MemorySpace> phi("Example::phi", target_points_num,
-                                          num_neighbors);
-  Kokkos::parallel_for(
-      "Example::phi_computation",
-      Kokkos::RangePolicy<ExecutionSpace>(space, 0, target_points_num),
-      KOKKOS_LAMBDA(int const i) {
-        RBFWendland_0 rbf{radii(i)};
-
-        for (int j = 0; j < num_neighbors; j++)
-        {
-          float norm = ArborX::Details::distance(tr_source_points(i, j),
-                                                 ArborX::Point{0.f, 0.f, 0.f});
-          phi(i, j) = rbf(norm);
-        }
-      });
-
-  // Compute multivariable Vandermonde (P) matrix
-  Kokkos::View<float ***, MemorySpace> p("Example::vandermonde",
-                                         target_points_num, num_neighbors,
-                                         MVPolynomialBasis_3D::size);
-  Kokkos::parallel_for(
-      "Example::vandermonde_computation",
-      Kokkos::MDRangePolicy<Kokkos::Rank<2>>(
-          space, {0, 0}, {target_points_num, num_neighbors}),
-      KOKKOS_LAMBDA(int const i, int const j) {
-        auto basis = MVPolynomialBasis_3D{}(tr_source_points(i, j));
-
-        for (int k = 0; k < MVPolynomialBasis_3D::size; k++)
-        {
-          p(i, j, k) = basis[k];
-        }
-      });
-
-  // Compute moment (A) matrix
-  Kokkos::View<float ***, MemorySpace> a("Example::A", target_points_num,
-                                         MVPolynomialBasis_3D::size,
-                                         MVPolynomialBasis_3D::size);
-  Kokkos::parallel_for(
-      "Example::A_computation",
-      Kokkos::MDRangePolicy<Kokkos::Rank<3>>(space, {0, 0, 0},
-                                             {target_points_num,
-                                              MVPolynomialBasis_3D::size,
-                                              MVPolynomialBasis_3D::size}),
-      KOKKOS_LAMBDA(int const i, int const j, int const k) {
-        float tmp = 0;
-        for (int l = 0; l < num_neighbors; l++)
-        {
-          tmp += p(i, l, j) * p(i, l, k) * phi(i, l);
-        }
-
-        a(i, j, k) = tmp;
-      });
-
-  // Compute the pseudo inverse
-  auto a_inv =
-      SymmPseudoInverseSVD<float, ExecutionSpace,
-                           MemorySpace>::compute_pseudo_inverses(space, a);
-
-  // Compute the coefficients
-  Kokkos::View<float **, MemorySpace> coeffs("Example::coefficients",
-                                             target_points_num, num_neighbors);
-  Kokkos::parallel_for(
-      "Example::coefficients_computation",
-      Kokkos::MDRangePolicy<Kokkos::Rank<2>>(
-          space, {0, 0}, {target_points_num, num_neighbors}),
-      KOKKOS_LAMBDA(int const i, int const j) {
-        float tmp = 0.f;
-
-        for (int k = 0; k < MVPolynomialBasis_3D::size; k++)
-        {
-          tmp += a_inv(i, 0, k) * p(i, j, k) * phi(i, j);
-        }
-
-        coeffs(i, j) = tmp;
-      });
+  MLSComputation<float, MVPolynomialBasis_3D, RBFWendland_0, ExecutionSpace,
+                 MemorySpace>
+      mlsc(space, local_source_points, target_points);
 
   // Compute source values
   Kokkos::View<float *, MemorySpace> source_values("Example::source_values",
@@ -282,20 +172,7 @@ int main(int argc, char *argv[])
 
   auto local_source_values = comms.distribute(space, source_values);
 
-  // Compute target values via interpolation
-  Kokkos::View<float *, MemorySpace> target_values("Example::target_values",
-                                                   target_points_num);
-  Kokkos::parallel_for(
-      "Example::target_interpolation",
-      Kokkos::RangePolicy<ExecutionSpace>(space, 0, target_points_num),
-      KOKKOS_LAMBDA(int const i) {
-        float tmp = 0;
-        for (int j = offsets(i); j < offsets(i + 1); j++)
-        {
-          tmp += coeffs(i, j - offsets(i)) * local_source_values(j);
-        }
-        target_values(i) = tmp;
-      });
+  auto target_values = mlsc.eval(space, local_source_values);
 
   // Compute target values via evaluation
   Kokkos::View<float *, MemorySpace> target_values_exact(

From 68e199ac7ea4f33d701fc0c823a75917dd644776 Mon Sep 17 00:00:00 2001
From: Yohann Bosqued <bosq.yohann@gmail.com>
Date: Thu, 10 Aug 2023 14:41:22 -0400
Subject: [PATCH 25/44] CMake MPI check

---
 examples/CMakeLists.txt | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 6d486bc72..15b7e5b7f 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -10,7 +10,6 @@ add_subdirectory(callback)
 add_subdirectory(dbscan)
 add_subdirectory(molecular_dynamics)
 add_subdirectory(simple_intersection)
-add_subdirectory(moving_least_squares)
 
 find_package(Boost COMPONENTS program_options)
 if(Boost_FOUND)
@@ -18,3 +17,7 @@ if(Boost_FOUND)
   add_subdirectory(raytracing)
   add_subdirectory(brute_force)
 endif()
+
+if(ARBORX_ENABLE_MPI)
+  add_subdirectory(moving_least_squares)
+endif()

From 894e80f68bcb9cc4bec99ef3aea7a39f60809b62 Mon Sep 17 00:00:00 2001
From: Yohann Bosqued <bosq.yohann@gmail.com>
Date: Mon, 14 Aug 2023 13:01:32 -0400
Subject: [PATCH 26/44] Templated Moving Least Squares

---
 examples/moving_least_squares/mls.hpp         | 115 ++++++++++++++++++
 .../moving_least_squares/mls_computation.hpp  |  43 ++++---
 .../moving_least_squares.cpp                  |  66 +---------
 examples/moving_least_squares/mpi_comms.hpp   |   4 +
 4 files changed, 149 insertions(+), 79 deletions(-)
 create mode 100644 examples/moving_least_squares/mls.hpp

diff --git a/examples/moving_least_squares/mls.hpp b/examples/moving_least_squares/mls.hpp
new file mode 100644
index 000000000..1ffb90c90
--- /dev/null
+++ b/examples/moving_least_squares/mls.hpp
@@ -0,0 +1,115 @@
+/****************************************************************************
+ * Copyright (c) 2023 by the ArborX authors                                 *
+ * All rights reserved.                                                     *
+ *                                                                          *
+ * This file is part of the ArborX library. ArborX is                       *
+ * distributed under a BSD 3-clause license. For the licensing terms see    *
+ * the LICENSE file in the top-level directory.                             *
+ *                                                                          *
+ * SPDX-License-Identifier: BSD-3-Clause                                    *
+ ****************************************************************************/
+
+#pragma once
+
+#include <ArborX.hpp>
+
+#include <Kokkos_Core.hpp>
+
+#include <cassert>
+
+#include "mls_computation.hpp"
+#include "mpi_comms.hpp"
+
+template <typename MemorySpace>
+struct TargetPoints
+{
+  Kokkos::View<ArborX::Point *, MemorySpace> target_points;
+  std::size_t num_neighbors;
+};
+
+template <typename ValueType, typename PolynomialBasis, typename RBF,
+          typename ExecutionSpace, typename MemorySpace>
+class MLS
+{
+public:
+  MLS(ExecutionSpace const &space, MPI_Comm comm, std::size_t num_neighbors,
+      Kokkos::View<ArborX::Point *, MemorySpace> const &source_points,
+      Kokkos::View<ArborX::Point *, MemorySpace> const &target_points)
+      : _num_neighbors(num_neighbors)
+      , _src_size(source_points.extent(0))
+      , _tgt_size(target_points.extent(0))
+      , _comms(comm)
+  {
+    // There must be enough source points
+    assert(_src_size >= _num_neighbors);
+
+    // Organize source points as tree
+    ArborX::DistributedTree<MemorySpace> source_tree(comm, space,
+                                                     source_points);
+
+    // Perform the query
+    Kokkos::View<Kokkos::pair<int, int> *, MemorySpace> index_ranks(
+        "Example::MLS::index_ranks", 0);
+    Kokkos::View<int *, MemorySpace> offsets("Example::MLS::offsets", 0);
+    source_tree.query(space,
+                      TargetPoints<MemorySpace>{target_points, _num_neighbors},
+                      index_ranks, offsets);
+
+    // Split indices/ranks
+    Kokkos::View<int *, MemorySpace> local_indices(
+        "Example::MLS::local_indices", _tgt_size * _num_neighbors);
+    Kokkos::View<int *, MemorySpace> local_ranks("Example::MLS::local_ranks",
+                                                 _tgt_size * _num_neighbors);
+    Kokkos::parallel_for(
+        "Example::MLS::index_ranks_split",
+        Kokkos::RangePolicy<ExecutionSpace>(space, 0,
+                                            _tgt_size * _num_neighbors),
+        KOKKOS_LAMBDA(int const i) {
+          local_indices(i) = index_ranks(i).first;
+          local_ranks(i) = index_ranks(i).second;
+        });
+
+    // Set up comms and local source points
+    _comms = MPIComms<ExecutionSpace, MemorySpace>(space, comm, local_indices,
+                                                   local_ranks);
+    auto local_source_points = _comms.distribute(space, source_points);
+
+    // Compute the internal MLS
+    _mlsc =
+        MLSComputation<ValueType, PolynomialBasis, RBF, ExecutionSpace,
+                       MemorySpace>(space, local_source_points, target_points);
+  }
+
+  Kokkos::View<ValueType *, MemorySpace>
+  evaluate(ExecutionSpace const &space,
+           Kokkos::View<ValueType *, MemorySpace> const &source_values)
+  {
+    assert(source_values.extent(0) == _src_size);
+    return _mlsc.evaluate(space, _comms.distribute(space, source_values));
+  }
+
+private:
+  MLSComputation<ValueType, PolynomialBasis, RBF, ExecutionSpace, MemorySpace>
+      _mlsc;
+  MPIComms<ExecutionSpace, MemorySpace> _comms;
+  std::size_t _num_neighbors;
+  std::size_t _src_size;
+  std::size_t _tgt_size;
+};
+
+template <typename MemorySpace>
+struct ArborX::AccessTraits<TargetPoints<MemorySpace>, ArborX::PredicatesTag>
+{
+  static KOKKOS_FUNCTION std::size_t size(TargetPoints<MemorySpace> const &tp)
+  {
+    return tp.target_points.extent(0);
+  }
+
+  static KOKKOS_FUNCTION auto get(TargetPoints<MemorySpace> const &tp,
+                                  std::size_t i)
+  {
+    return ArborX::nearest(tp.target_points(i), tp.num_neighbors);
+  }
+
+  using memory_space = MemorySpace;
+};
\ No newline at end of file
diff --git a/examples/moving_least_squares/mls_computation.hpp b/examples/moving_least_squares/mls_computation.hpp
index 02c6c580a..ac13df190 100644
--- a/examples/moving_least_squares/mls_computation.hpp
+++ b/examples/moving_least_squares/mls_computation.hpp
@@ -24,16 +24,18 @@ template <typename ValueType, typename PolynomialBasis, typename RBF,
 class MLSComputation
 {
 public:
+  MLSComputation() = default;
+
   MLSComputation(
       ExecutionSpace const &space,
       Kokkos::View<ArborX::Point *, MemorySpace> const &source_points,
       Kokkos::View<ArborX::Point *, MemorySpace> const &target_points)
+      : _num_neighbors(source_points.extent(0) / target_points.extent(0))
+      , _num_targets(target_points.extent(0))
   {
     // There must be a list of num_neighbors source points for each
     // target point
-    _num_neighbors = source_points.extent(0) / target_points.extent(0);
-    assert(source_points.extent(0) == target_points.extent(0) * _num_neighbors);
-    _num_targets = target_points.extent(0);
+    assert(source_points.extent(0) == _num_targets * _num_neighbors);
 
     auto source_ref_target =
         translate_to_target(space, source_points, target_points);
@@ -51,13 +53,15 @@ class MLSComputation
   }
 
   Kokkos::View<ValueType *>
-  eval(ExecutionSpace const &space,
-       Kokkos::View<ValueType *, MemorySpace> const &source_values)
+  evaluate(ExecutionSpace const &space,
+           Kokkos::View<ValueType *, MemorySpace> const &source_values)
   {
+    assert(source_values.extent(0) == _num_targets * _num_neighbors);
+
     Kokkos::View<ValueType *, MemorySpace> target_values(
-        "Example::MLS::target_values", _num_targets);
+        "Example::MLSC::target_values", _num_targets);
     Kokkos::parallel_for(
-        "Example::MLS::target_interpolation",
+        "Example::MLSC::target_interpolation",
         Kokkos::RangePolicy<ExecutionSpace>(space, 0, _num_targets),
         KOKKOS_LAMBDA(int const i) {
           ValueType tmp = _zero;
@@ -81,10 +85,10 @@ class MLSComputation
     // optimize the final computation
     Kokkos::View<ArborX::Point **, MemorySpace> source_ref_target(
         Kokkos::view_alloc(Kokkos::WithoutInitializing,
-                           "Example::MLS::source_ref_target"),
+                           "Example::MLSC::source_ref_target"),
         _num_targets, _num_neighbors);
     Kokkos::parallel_for(
-        "Example::MLS::source_ref_target_fill",
+        "Example::MLSC::source_ref_target_fill",
         Kokkos::MDRangePolicy<Kokkos::Rank<2>>(space, {0, 0},
                                                {_num_targets, _num_neighbors}),
         KOKKOS_LAMBDA(int const i, int const j) {
@@ -105,10 +109,10 @@ class MLSComputation
       Kokkos::View<ArborX::Point **, MemorySpace> const &source_ref_target)
   {
     Kokkos::View<ValueType *, MemorySpace> radii(
-        Kokkos::view_alloc(Kokkos::WithoutInitializing, "Example::MLS::radii"),
+        Kokkos::view_alloc(Kokkos::WithoutInitializing, "Example::MLSC::radii"),
         _num_targets);
     Kokkos::parallel_for(
-        "Example::MLS::radii_computation",
+        "Example::MLSC::radii_computation",
         Kokkos::RangePolicy<ExecutionSpace>(space, 0, _num_targets),
         KOKKOS_LAMBDA(int const i) {
           ValueType radius = _ten * _epsilon;
@@ -130,10 +134,10 @@ class MLSComputation
       Kokkos::View<ValueType *, MemorySpace> const &radii)
   {
     Kokkos::View<ValueType **, MemorySpace> phi(
-        Kokkos::view_alloc(Kokkos::WithoutInitializing, "Example::MLS::phi"),
+        Kokkos::view_alloc(Kokkos::WithoutInitializing, "Example::MLSC::phi"),
         _num_targets, _num_neighbors);
     Kokkos::parallel_for(
-        "Example::MLS::phi_computation",
+        "Example::MLSC::phi_computation",
         Kokkos::MDRangePolicy<Kokkos::Rank<2>>(space, {0, 0},
                                                {_num_targets, _num_neighbors}),
         KOKKOS_LAMBDA(int const i, int const j) {
@@ -154,10 +158,10 @@ class MLSComputation
     // automatically?
     Kokkos::View<ValueType ***, MemorySpace> p(
         Kokkos::view_alloc(Kokkos::WithoutInitializing,
-                           "Example::MLS::vandermonde"),
+                           "Example::MLSC::vandermonde"),
         _num_targets, _num_neighbors, PolynomialBasis::size);
     Kokkos::parallel_for(
-        "Example::MLS::vandermonde_computation",
+        "Example::MLSC::vandermonde_computation",
         Kokkos::MDRangePolicy<Kokkos::Rank<2>>(space, {0, 0},
                                                {_num_targets, _num_neighbors}),
         KOKKOS_LAMBDA(int const i, int const j) {
@@ -177,10 +181,11 @@ class MLSComputation
                  Kokkos::View<ValueType ***, MemorySpace> const &p)
   {
     Kokkos::View<ValueType ***, MemorySpace> a(
-        Kokkos::view_alloc(Kokkos::WithoutInitializing, "Example::MLS::moment"),
+        Kokkos::view_alloc(Kokkos::WithoutInitializing,
+                           "Example::MLSC::moment"),
         _num_targets, PolynomialBasis::size, PolynomialBasis::size);
     Kokkos::parallel_for(
-        "Example::MLS::moment_computation",
+        "Example::MLSC::moment_computation",
         Kokkos::MDRangePolicy<Kokkos::Rank<3>>(
             space, {0, 0, 0},
             {_num_targets, PolynomialBasis::size, PolynomialBasis::size}),
@@ -204,10 +209,10 @@ class MLSComputation
   {
     _coeffs = Kokkos::View<ValueType **, MemorySpace>(
         Kokkos::view_alloc(Kokkos::WithoutInitializing,
-                           "Example::MLS::coefficients"),
+                           "Example::MLSC::coefficients"),
         _num_targets, _num_neighbors);
     Kokkos::parallel_for(
-        "Example::MLS::coefficients",
+        "Example::MLSC::coefficients",
         Kokkos::MDRangePolicy<Kokkos::Rank<2>>(space, {0, 0},
                                                {_num_targets, _num_neighbors}),
         KOKKOS_LAMBDA(int const i, int const j) {
diff --git a/examples/moving_least_squares/moving_least_squares.cpp b/examples/moving_least_squares/moving_least_squares.cpp
index a9b7ae8f4..f77fa8567 100644
--- a/examples/moving_least_squares/moving_least_squares.cpp
+++ b/examples/moving_least_squares/moving_least_squares.cpp
@@ -18,12 +18,9 @@
 
 #include <Kokkos_Core.hpp>
 
-#include <cmath>
-#include <limits>
 #include <sstream>
 
-#include "mls_computation.hpp"
-#include "mpi_comms.hpp"
+#include "mls.hpp"
 #include <mpi.h>
 
 using ExecutionSpace = Kokkos::DefaultExecutionSpace;
@@ -53,28 +50,6 @@ struct MVPolynomialBasis_3D
   }
 };
 
-struct TargetPoints
-{
-  Kokkos::View<ArborX::Point *, MemorySpace> target_points;
-  std::size_t num_neighbors;
-};
-
-template <>
-struct ArborX::AccessTraits<TargetPoints, ArborX::PredicatesTag>
-{
-  static KOKKOS_FUNCTION std::size_t size(TargetPoints const &tp)
-  {
-    return tp.target_points.extent(0);
-  }
-
-  static KOKKOS_FUNCTION auto get(TargetPoints const &tp, std::size_t i)
-  {
-    return ArborX::nearest(tp.target_points(i), tp.num_neighbors);
-  }
-
-  using memory_space = MemorySpace;
-};
-
 // Function to approximate
 KOKKOS_INLINE_FUNCTION float manufactured_solution(ArborX::Point const &p)
 {
@@ -86,7 +61,6 @@ int main(int argc, char *argv[])
   MPI_Init(&argc, &argv);
   Kokkos::ScopeGuard guard(argc, argv);
 
-  constexpr float epsilon = std::numeric_limits<float>::epsilon();
   constexpr std::size_t num_neighbors = MVPolynomialBasis_3D::size;
   constexpr std::size_t cube_side = 20;
   constexpr std::size_t source_points_num = cube_side * cube_side * cube_side;
@@ -129,36 +103,9 @@ int main(int argc, char *argv[])
   target_points_host(3) = ArborX::Point{1.f, -3.3f, 7.f};
   Kokkos::deep_copy(space, target_points, target_points_host);
 
-  // Organize source points as tree
-  ArborX::DistributedTree<MemorySpace> source_tree(mpi_comm, space,
-                                                   source_points);
-
-  // Perform the query and split the indices/ranks
-  Kokkos::View<Kokkos::pair<int, int> *, MemorySpace> index_ranks(
-      "Example::index_ranks", 0);
-  Kokkos::View<int *, MemorySpace> offsets("Example::offsets", 0);
-  source_tree.query(space, TargetPoints{target_points, num_neighbors},
-                    index_ranks, offsets);
-  Kokkos::View<int *, MemorySpace> local_indices(
-      "Example::local_indices", target_points_num * num_neighbors);
-  Kokkos::View<int *, MemorySpace> local_ranks(
-      "Example::local_ranks", target_points_num * num_neighbors);
-  Kokkos::parallel_for(
-      "Example::index_ranks_split",
-      Kokkos::RangePolicy<ExecutionSpace>(space, 0,
-                                          target_points_num * num_neighbors),
-      KOKKOS_LAMBDA(int const i) {
-        local_indices(i) = index_ranks(i).first;
-        local_ranks(i) = index_ranks(i).second;
-      });
-
-  MPIComms<ExecutionSpace, MemorySpace> comms(space, mpi_comm, local_indices,
-                                              local_ranks);
-  auto local_source_points = comms.distribute(space, source_points);
-
-  MLSComputation<float, MVPolynomialBasis_3D, RBFWendland_0, ExecutionSpace,
-                 MemorySpace>
-      mlsc(space, local_source_points, target_points);
+  // Create the transform from a point cloud to another
+  MLS<float, MVPolynomialBasis_3D, RBFWendland_0, ExecutionSpace, MemorySpace>
+      mls(space, mpi_comm, num_neighbors, source_points, target_points);
 
   // Compute source values
   Kokkos::View<float *, MemorySpace> source_values("Example::source_values",
@@ -170,9 +117,8 @@ int main(int argc, char *argv[])
         source_values(i) = manufactured_solution(source_points(i));
       });
 
-  auto local_source_values = comms.distribute(space, source_values);
-
-  auto target_values = mlsc.eval(space, local_source_values);
+  // Compute target values from source ones
+  auto target_values = mls.evaluate(space, source_values);
 
   // Compute target values via evaluation
   Kokkos::View<float *, MemorySpace> target_values_exact(
diff --git a/examples/moving_least_squares/mpi_comms.hpp b/examples/moving_least_squares/mpi_comms.hpp
index 96aef8c0e..754b3db4d 100644
--- a/examples/moving_least_squares/mpi_comms.hpp
+++ b/examples/moving_least_squares/mpi_comms.hpp
@@ -23,6 +23,10 @@ template <typename ExecutionSpace, typename MemorySpace>
 class MPIComms
 {
 public:
+  MPIComms(MPI_Comm comm)
+      : _distributor_back(comm)
+  {}
+
   MPIComms(ExecutionSpace const &space, MPI_Comm comm,
            Kokkos::View<int *, MemorySpace> indices,
            Kokkos::View<int *, MemorySpace> ranks)

From 1e6d4a5bb9a152dda56678a040b5056ddc654a24 Mon Sep 17 00:00:00 2001
From: Yohann Bosqued <bosq.yohann@gmail.com>
Date: Mon, 14 Aug 2023 16:17:44 -0400
Subject: [PATCH 27/44] Better RBF

---
 examples/moving_least_squares/mls_computation.hpp      | 3 +--
 examples/moving_least_squares/moving_least_squares.cpp | 5 +----
 2 files changed, 2 insertions(+), 6 deletions(-)

diff --git a/examples/moving_least_squares/mls_computation.hpp b/examples/moving_least_squares/mls_computation.hpp
index ac13df190..cd2358096 100644
--- a/examples/moving_least_squares/mls_computation.hpp
+++ b/examples/moving_least_squares/mls_computation.hpp
@@ -141,10 +141,9 @@ class MLSComputation
         Kokkos::MDRangePolicy<Kokkos::Rank<2>>(space, {0, 0},
                                                {_num_targets, _num_neighbors}),
         KOKKOS_LAMBDA(int const i, int const j) {
-          RBF rbf{radii(i)};
           ValueType norm =
               ArborX::Details::distance(source_ref_target(i, j), _origin);
-          phi(i, j) = rbf(norm);
+          phi(i, j) = RBF::evaluate(norm / radii(i));
         });
 
     return phi;
diff --git a/examples/moving_least_squares/moving_least_squares.cpp b/examples/moving_least_squares/moving_least_squares.cpp
index f77fa8567..108c6f841 100644
--- a/examples/moving_least_squares/moving_least_squares.cpp
+++ b/examples/moving_least_squares/moving_least_squares.cpp
@@ -29,13 +29,10 @@ using DeviceSpace = Kokkos::Device<ExecutionSpace, MemorySpace>;
 
 struct RBFWendland_0
 {
-  KOKKOS_INLINE_FUNCTION float operator()(float x)
+  KOKKOS_INLINE_FUNCTION static float evaluate(float x)
   {
-    x /= _radius;
     return (1.f - x) * (1.f - x);
   }
-
-  float _radius;
 };
 
 struct MVPolynomialBasis_3D

From 52fe5e11c57304913dcd0c62360346a234a02552 Mon Sep 17 00:00:00 2001
From: Yohann Bosqued <bosq.yohann@gmail.com>
Date: Tue, 15 Aug 2023 09:41:56 -0400
Subject: [PATCH 28/44] Removed DeviceType, duplicated communicators

---
 examples/moving_least_squares/mls.hpp         |  7 ++-
 .../moving_least_squares/mls_computation.hpp  |  6 +--
 .../moving_least_squares.cpp                  |  5 +-
 examples/moving_least_squares/mpi_comms.hpp   | 50 ++++++++++++-------
 4 files changed, 41 insertions(+), 27 deletions(-)

diff --git a/examples/moving_least_squares/mls.hpp b/examples/moving_least_squares/mls.hpp
index 1ffb90c90..39ad86f8a 100644
--- a/examples/moving_least_squares/mls.hpp
+++ b/examples/moving_least_squares/mls.hpp
@@ -38,7 +38,6 @@ class MLS
       : _num_neighbors(num_neighbors)
       , _src_size(source_points.extent(0))
       , _tgt_size(target_points.extent(0))
-      , _comms(comm)
   {
     // There must be enough source points
     assert(_src_size >= _num_neighbors);
@@ -81,11 +80,11 @@ class MLS
   }
 
   Kokkos::View<ValueType *, MemorySpace>
-  evaluate(ExecutionSpace const &space,
-           Kokkos::View<ValueType *, MemorySpace> const &source_values)
+  apply(ExecutionSpace const &space,
+        Kokkos::View<ValueType *, MemorySpace> const &source_values)
   {
     assert(source_values.extent(0) == _src_size);
-    return _mlsc.evaluate(space, _comms.distribute(space, source_values));
+    return _mlsc.apply(space, _comms.distribute(space, source_values));
   }
 
 private:
diff --git a/examples/moving_least_squares/mls_computation.hpp b/examples/moving_least_squares/mls_computation.hpp
index cd2358096..032fda78f 100644
--- a/examples/moving_least_squares/mls_computation.hpp
+++ b/examples/moving_least_squares/mls_computation.hpp
@@ -53,8 +53,8 @@ class MLSComputation
   }
 
   Kokkos::View<ValueType *>
-  evaluate(ExecutionSpace const &space,
-           Kokkos::View<ValueType *, MemorySpace> const &source_values)
+  apply(ExecutionSpace const &space,
+        Kokkos::View<ValueType *, MemorySpace> const &source_values)
   {
     assert(source_values.extent(0) == _num_targets * _num_neighbors);
 
@@ -143,7 +143,7 @@ class MLSComputation
         KOKKOS_LAMBDA(int const i, int const j) {
           ValueType norm =
               ArborX::Details::distance(source_ref_target(i, j), _origin);
-          phi(i, j) = RBF::evaluate(norm / radii(i));
+          phi(i, j) = RBF::apply(norm / radii(i));
         });
 
     return phi;
diff --git a/examples/moving_least_squares/moving_least_squares.cpp b/examples/moving_least_squares/moving_least_squares.cpp
index 108c6f841..425465b2d 100644
--- a/examples/moving_least_squares/moving_least_squares.cpp
+++ b/examples/moving_least_squares/moving_least_squares.cpp
@@ -25,11 +25,10 @@
 
 using ExecutionSpace = Kokkos::DefaultExecutionSpace;
 using MemorySpace = ExecutionSpace::memory_space;
-using DeviceSpace = Kokkos::Device<ExecutionSpace, MemorySpace>;
 
 struct RBFWendland_0
 {
-  KOKKOS_INLINE_FUNCTION static float evaluate(float x)
+  KOKKOS_INLINE_FUNCTION static float apply(float x)
   {
     return (1.f - x) * (1.f - x);
   }
@@ -115,7 +114,7 @@ int main(int argc, char *argv[])
       });
 
   // Compute target values from source ones
-  auto target_values = mls.evaluate(space, source_values);
+  auto target_values = mls.apply(space, source_values);
 
   // Compute target values via evaluation
   Kokkos::View<float *, MemorySpace> target_values_exact(
diff --git a/examples/moving_least_squares/mpi_comms.hpp b/examples/moving_least_squares/mpi_comms.hpp
index 754b3db4d..e83bb1565 100644
--- a/examples/moving_least_squares/mpi_comms.hpp
+++ b/examples/moving_least_squares/mpi_comms.hpp
@@ -16,6 +16,8 @@
 #include <Kokkos_Core.hpp>
 
 #include <cassert>
+#include <memory>
+#include <optional>
 
 #include <mpi.h>
 
@@ -23,19 +25,31 @@ template <typename ExecutionSpace, typename MemorySpace>
 class MPIComms
 {
 public:
-  MPIComms(MPI_Comm comm)
-      : _distributor_back(comm)
-  {}
+  MPIComms() = default;
 
   MPIComms(ExecutionSpace const &space, MPI_Comm comm,
            Kokkos::View<int *, MemorySpace> indices,
            Kokkos::View<int *, MemorySpace> ranks)
-      : _distributor_back(comm)
   {
     assert(indices.extent(0) == ranks.extent(0));
     std::size_t data_len = indices.extent(0);
+
+    _comm.reset(
+        [comm]() {
+          auto p = new MPI_Comm;
+          MPI_Comm_dup(comm, p);
+          return p;
+        }(),
+        [](MPI_Comm *p) {
+          int mpi_finalized;
+          MPI_Finalized(&mpi_finalized);
+          if (!mpi_finalized)
+            MPI_Comm_free(p);
+          delete p;
+        });
+
     int rank;
-    MPI_Comm_rank(comm, &rank);
+    MPI_Comm_rank(*_comm, &rank);
 
     Kokkos::View<int *, MemorySpace> mpi_tmp(
         Kokkos::view_alloc(Kokkos::WithoutInitializing, "Example::MPI::tmp"),
@@ -50,7 +64,7 @@ class MPIComms
 
     // This builds for each process a local array indicating how much
     // informatiom will be gathered
-    ArborX::Details::Distributor<device> distributor_forth(comm);
+    ArborX::Details::Distributor<MemorySpace> distributor_forth(*_comm);
     _num_requests = distributor_forth.createFromSends(space, ranks);
 
     // This creates the temporary buffer that will help when producing the
@@ -60,7 +74,7 @@ class MPIComms
                            "Example::MPI::rev_indices"),
         _num_requests);
     ArborX::iota(space, mpi_tmp);
-    ArborX::Details::DistributedTreeImpl<device>::sendAcrossNetwork(
+    ArborX::Details::DistributedTreeImpl<MemorySpace>::sendAcrossNetwork(
         space, distributor_forth, mpi_tmp, mpi_rev_indices);
 
     // This retrieves which source index a process wants and gives it to
@@ -69,7 +83,7 @@ class MPIComms
         Kokkos::view_alloc(Kokkos::WithoutInitializing,
                            "Example::MPI::send_indices"),
         _num_requests);
-    ArborX::Details::DistributedTreeImpl<device>::sendAcrossNetwork(
+    ArborX::Details::DistributedTreeImpl<MemorySpace>::sendAcrossNetwork(
         space, distributor_forth, indices, _mpi_send_indices);
 
     // This builds the temporary buffer that will create the reverse
@@ -79,11 +93,12 @@ class MPIComms
                            "Example::MPI::rev_ranks"),
         _num_requests);
     Kokkos::deep_copy(space, mpi_tmp, rank);
-    ArborX::Details::DistributedTreeImpl<device>::sendAcrossNetwork(
+    ArborX::Details::DistributedTreeImpl<MemorySpace>::sendAcrossNetwork(
         space, distributor_forth, mpi_tmp, mpi_rev_ranks);
 
     // This will create the reverse of the previous distributor
-    _num_responses = _distributor_back.createFromSends(space, mpi_rev_ranks);
+    _distributor_back = ArborX::Details::Distributor<MemorySpace>(*_comm);
+    _num_responses = _distributor_back->createFromSends(space, mpi_rev_ranks);
 
     // There should be enough responses to perfectly fill what was requested
     assert(_num_responses == data_len);
@@ -94,8 +109,8 @@ class MPIComms
         Kokkos::view_alloc(Kokkos::WithoutInitializing,
                            "Example::MPI::recv_indices"),
         _num_responses);
-    ArborX::Details::DistributedTreeImpl<device>::sendAcrossNetwork(
-        space, _distributor_back, mpi_rev_indices, _mpi_recv_indices);
+    ArborX::Details::DistributedTreeImpl<MemorySpace>::sendAcrossNetwork(
+        space, *_distributor_back, mpi_rev_indices, _mpi_recv_indices);
   }
 
   template <typename ValueType>
@@ -103,6 +118,8 @@ class MPIComms
   distribute(ExecutionSpace const &space,
              Kokkos::View<ValueType *, MemorySpace> const &source)
   {
+    assert(_distributor_back.has_value());
+
     // We know what each process want so we prepare the data to be sent
     Kokkos::View<ValueType *, MemorySpace> data_to_send(
         Kokkos::view_alloc(Kokkos::WithoutInitializing,
@@ -121,8 +138,8 @@ class MPIComms
         Kokkos::view_alloc(Kokkos::WithoutInitializing,
                            "Example::MPI::data_to_recv"),
         _num_responses);
-    ArborX::Details::DistributedTreeImpl<device>::sendAcrossNetwork(
-        space, _distributor_back, data_to_send, data_to_recv);
+    ArborX::Details::DistributedTreeImpl<MemorySpace>::sendAcrossNetwork(
+        space, *_distributor_back, data_to_send, data_to_recv);
 
     // So we fix this by moving everything
     Kokkos::View<ValueType *, MemorySpace> output(
@@ -139,11 +156,10 @@ class MPIComms
   }
 
 private:
-  using device = Kokkos::Device<ExecutionSpace, MemorySpace>;
-
+  std::shared_ptr<MPI_Comm> _comm;
   Kokkos::View<int *, MemorySpace> _mpi_send_indices;
   Kokkos::View<int *, MemorySpace> _mpi_recv_indices;
-  ArborX::Details::Distributor<device> _distributor_back;
+  std::optional<ArborX::Details::Distributor<MemorySpace>> _distributor_back;
   std::size_t _num_requests;
   std::size_t _num_responses;
 };
\ No newline at end of file

From db39716690710bd9031d2503893649ef908624f1 Mon Sep 17 00:00:00 2001
From: Yohann Bosqued <bosq.yohann@gmail.com>
Date: Tue, 15 Aug 2023 09:53:40 -0400
Subject: [PATCH 29/44] Style fixes and num_neighbors as an optional arg

---
 examples/moving_least_squares/mls.hpp         |  5 +--
 .../moving_least_squares/mls_computation.hpp  | 36 +++++++++----------
 .../moving_least_squares.cpp                  |  3 +-
 .../symmetric_pseudoinverse_svd.hpp           | 21 ++++++-----
 4 files changed, 32 insertions(+), 33 deletions(-)

diff --git a/examples/moving_least_squares/mls.hpp b/examples/moving_least_squares/mls.hpp
index 39ad86f8a..57ad2f961 100644
--- a/examples/moving_least_squares/mls.hpp
+++ b/examples/moving_least_squares/mls.hpp
@@ -32,9 +32,10 @@ template <typename ValueType, typename PolynomialBasis, typename RBF,
 class MLS
 {
 public:
-  MLS(ExecutionSpace const &space, MPI_Comm comm, std::size_t num_neighbors,
+  MLS(ExecutionSpace const &space, MPI_Comm comm,
       Kokkos::View<ArborX::Point *, MemorySpace> const &source_points,
-      Kokkos::View<ArborX::Point *, MemorySpace> const &target_points)
+      Kokkos::View<ArborX::Point *, MemorySpace> const &target_points,
+      std::size_t num_neighbors = PolynomialBasis::size)
       : _num_neighbors(num_neighbors)
       , _src_size(source_points.extent(0))
       , _tgt_size(target_points.extent(0))
diff --git a/examples/moving_least_squares/mls_computation.hpp b/examples/moving_least_squares/mls_computation.hpp
index 032fda78f..8b1cd97ec 100644
--- a/examples/moving_least_squares/mls_computation.hpp
+++ b/examples/moving_least_squares/mls_computation.hpp
@@ -38,18 +38,18 @@ class MLSComputation
     assert(source_points.extent(0) == _num_targets * _num_neighbors);
 
     auto source_ref_target =
-        translate_to_target(space, source_points, target_points);
+        translateToTarget(space, source_points, target_points);
 
-    auto radii = compute_radii(space, source_ref_target);
-    auto phi = compute_weight(space, source_ref_target, radii);
-    auto p = compute_vandermonde(space, source_ref_target);
+    auto radii = computeRadii(space, source_ref_target);
+    auto phi = computeWeight(space, source_ref_target, radii);
+    auto p = computeVandermonde(space, source_ref_target);
 
-    auto a = compute_moment(space, phi, p);
+    auto a = computeMoment(space, phi, p);
     auto a_inv =
         SymmPseudoInverseSVD<ValueType, ExecutionSpace,
-                             MemorySpace>::compute_pseudo_inverses(space, a);
+                             MemorySpace>::computePseudoInverses(space, a);
 
-    compute_coefficients(space, phi, p, a_inv);
+    computeCoefficients(space, phi, p, a_inv);
   }
 
   Kokkos::View<ValueType *>
@@ -76,7 +76,7 @@ class MLSComputation
   }
 
 private:
-  Kokkos::View<ArborX::Point **, MemorySpace> translate_to_target(
+  Kokkos::View<ArborX::Point **, MemorySpace> translateToTarget(
       ExecutionSpace const &space,
       Kokkos::View<ArborX::Point *, MemorySpace> const &source_points,
       Kokkos::View<ArborX::Point *, MemorySpace> const &target_points)
@@ -104,7 +104,7 @@ class MLSComputation
     return source_ref_target;
   }
 
-  Kokkos::View<ValueType *, MemorySpace> compute_radii(
+  Kokkos::View<ValueType *, MemorySpace> computeRadii(
       ExecutionSpace const &space,
       Kokkos::View<ArborX::Point **, MemorySpace> const &source_ref_target)
   {
@@ -128,7 +128,7 @@ class MLSComputation
     return radii;
   }
 
-  Kokkos::View<ValueType **, MemorySpace> compute_weight(
+  Kokkos::View<ValueType **, MemorySpace> computeWeight(
       ExecutionSpace const &space,
       Kokkos::View<ArborX::Point **, MemorySpace> const &source_ref_target,
       Kokkos::View<ValueType *, MemorySpace> const &radii)
@@ -149,7 +149,7 @@ class MLSComputation
     return phi;
   }
 
-  Kokkos::View<ValueType ***, MemorySpace> compute_vandermonde(
+  Kokkos::View<ValueType ***, MemorySpace> computeVandermonde(
       ExecutionSpace const &space,
       Kokkos::View<ArborX::Point **, MemorySpace> const &source_ref_target)
   {
@@ -175,9 +175,9 @@ class MLSComputation
   }
 
   Kokkos::View<ValueType ***, MemorySpace>
-  compute_moment(ExecutionSpace const &space,
-                 Kokkos::View<ValueType **, MemorySpace> const &phi,
-                 Kokkos::View<ValueType ***, MemorySpace> const &p)
+  computeMoment(ExecutionSpace const &space,
+                Kokkos::View<ValueType **, MemorySpace> const &phi,
+                Kokkos::View<ValueType ***, MemorySpace> const &p)
   {
     Kokkos::View<ValueType ***, MemorySpace> a(
         Kokkos::view_alloc(Kokkos::WithoutInitializing,
@@ -201,10 +201,10 @@ class MLSComputation
   }
 
   void
-  compute_coefficients(ExecutionSpace const &space,
-                       Kokkos::View<ValueType **, MemorySpace> const &phi,
-                       Kokkos::View<ValueType ***, MemorySpace> const &p,
-                       Kokkos::View<ValueType ***, MemorySpace> const &a_inv)
+  computeCoefficients(ExecutionSpace const &space,
+                      Kokkos::View<ValueType **, MemorySpace> const &phi,
+                      Kokkos::View<ValueType ***, MemorySpace> const &p,
+                      Kokkos::View<ValueType ***, MemorySpace> const &a_inv)
   {
     _coeffs = Kokkos::View<ValueType **, MemorySpace>(
         Kokkos::view_alloc(Kokkos::WithoutInitializing,
diff --git a/examples/moving_least_squares/moving_least_squares.cpp b/examples/moving_least_squares/moving_least_squares.cpp
index 425465b2d..8ea6f0bb1 100644
--- a/examples/moving_least_squares/moving_least_squares.cpp
+++ b/examples/moving_least_squares/moving_least_squares.cpp
@@ -57,7 +57,6 @@ int main(int argc, char *argv[])
   MPI_Init(&argc, &argv);
   Kokkos::ScopeGuard guard(argc, argv);
 
-  constexpr std::size_t num_neighbors = MVPolynomialBasis_3D::size;
   constexpr std::size_t cube_side = 20;
   constexpr std::size_t source_points_num = cube_side * cube_side * cube_side;
   constexpr std::size_t target_points_num = 4;
@@ -101,7 +100,7 @@ int main(int argc, char *argv[])
 
   // Create the transform from a point cloud to another
   MLS<float, MVPolynomialBasis_3D, RBFWendland_0, ExecutionSpace, MemorySpace>
-      mls(space, mpi_comm, num_neighbors, source_points, target_points);
+      mls(space, mpi_comm, source_points, target_points);
 
   // Compute source values
   Kokkos::View<float *, MemorySpace> source_values("Example::source_values",
diff --git a/examples/moving_least_squares/symmetric_pseudoinverse_svd.hpp b/examples/moving_least_squares/symmetric_pseudoinverse_svd.hpp
index 6be2cf5d7..833d6c5aa 100644
--- a/examples/moving_least_squares/symmetric_pseudoinverse_svd.hpp
+++ b/examples/moving_least_squares/symmetric_pseudoinverse_svd.hpp
@@ -27,8 +27,8 @@ class SymmPseudoInverseSVD
 {
 public:
   static Kokkos::View<ValueType ***, MemorySpace>
-  compute_pseudo_inverses(ExecutionSpace const &space,
-                          Kokkos::View<ValueType ***, MemorySpace> const &mats)
+  computePseudoInverses(ExecutionSpace const &space,
+                        Kokkos::View<ValueType ***, MemorySpace> const &mats)
   {
     SymmPseudoInverseSVD spis(space, mats);
 
@@ -41,11 +41,11 @@ class SymmPseudoInverseSVD
         Kokkos::RangePolicy<ExecutionSpace>(space, 0, spis._num_matrices),
         KOKKOS_LAMBDA(int const i) {
           int p, q;
-          ValueType norm = spis.argmax_off_diagonal(i, p, q);
+          ValueType norm = spis.argmaxOffDiagonal(i, p, q);
           while (norm > spis._epsilon)
           {
-            spis.compute_u_es_single(i, p, q);
-            norm = spis.argmax_off_diagonal(i, p, q);
+            spis.computeUESSingle(i, p, q);
+            norm = spis.argmaxOffDiagonal(i, p, q);
           }
         });
 
@@ -56,7 +56,7 @@ class SymmPseudoInverseSVD
         Kokkos::MDRangePolicy<Kokkos::Rank<3>>(
             space, {0, 0, 0}, {spis._num_matrices, spis._size, spis._size}),
         KOKKOS_LAMBDA(int const i, int const j, int const k) {
-          spis.fill_inv(i, j, k);
+          spis.fillInv(i, j, k);
         });
 
     return spis._inv;
@@ -65,7 +65,7 @@ class SymmPseudoInverseSVD
 private:
   // U and E.S are computed, we can now build the inverse
   // U . [ E^-1.S ] . U^T
-  KOKKOS_FUNCTION void fill_inv(int const i, int const j, int const k) const
+  KOKKOS_FUNCTION void fillInv(int const i, int const j, int const k) const
   {
     ValueType value = _zero;
     for (int l = 0; l < _size; l++)
@@ -82,8 +82,8 @@ class SymmPseudoInverseSVD
 
   // We found the biggest value in our off-diagonal. We will remove it by
   // computing a "local" svd and update U and E.S
-  KOKKOS_FUNCTION void compute_u_es_single(int const i, int const p,
-                                           int const q) const
+  KOKKOS_FUNCTION void computeUESSingle(int const i, int const p,
+                                        int const q) const
   {
     ValueType a = _es(i, p, p);
     ValueType b = _es(i, p, q);
@@ -162,8 +162,7 @@ class SymmPseudoInverseSVD
   // This finds the biggest off-diagonal value of E.S as well as its
   // coordinates. Being symmetric, we can always check on the upper
   // triangle (and always have q > p)
-  KOKKOS_FUNCTION ValueType argmax_off_diagonal(int const i, int &p,
-                                                int &q) const
+  KOKKOS_FUNCTION ValueType argmaxOffDiagonal(int const i, int &p, int &q) const
   {
     ValueType max = _zero;
     p = q = 0;

From 03b600c94791ea4c0c8906b5654e4ae33d890cff Mon Sep 17 00:00:00 2001
From: Yohann Bosqued <bosq.yohann@gmail.com>
Date: Tue, 15 Aug 2023 10:03:22 -0400
Subject: [PATCH 30/44] Moving ExecutionSpace templates

---
 examples/moving_least_squares/mls.hpp              | 13 +++++++------
 examples/moving_least_squares/mls_computation.hpp  | 14 +++++++++++---
 .../moving_least_squares/moving_least_squares.cpp  |  2 +-
 examples/moving_least_squares/mpi_comms.hpp        |  5 +++--
 .../symmetric_pseudoinverse_svd.hpp                |  5 +++--
 5 files changed, 25 insertions(+), 14 deletions(-)

diff --git a/examples/moving_least_squares/mls.hpp b/examples/moving_least_squares/mls.hpp
index 57ad2f961..05a461933 100644
--- a/examples/moving_least_squares/mls.hpp
+++ b/examples/moving_least_squares/mls.hpp
@@ -28,10 +28,11 @@ struct TargetPoints
 };
 
 template <typename ValueType, typename PolynomialBasis, typename RBF,
-          typename ExecutionSpace, typename MemorySpace>
+          typename MemorySpace>
 class MLS
 {
 public:
+  template <typename ExecutionSpace>
   MLS(ExecutionSpace const &space, MPI_Comm comm,
       Kokkos::View<ArborX::Point *, MemorySpace> const &source_points,
       Kokkos::View<ArborX::Point *, MemorySpace> const &target_points,
@@ -70,16 +71,16 @@ class MLS
         });
 
     // Set up comms and local source points
-    _comms = MPIComms<ExecutionSpace, MemorySpace>(space, comm, local_indices,
-                                                   local_ranks);
+    _comms = MPIComms<MemorySpace>(space, comm, local_indices, local_ranks);
     auto local_source_points = _comms.distribute(space, source_points);
 
     // Compute the internal MLS
     _mlsc =
-        MLSComputation<ValueType, PolynomialBasis, RBF, ExecutionSpace,
+        MLSComputation<ValueType, PolynomialBasis, RBF,
                        MemorySpace>(space, local_source_points, target_points);
   }
 
+  template <typename ExecutionSpace>
   Kokkos::View<ValueType *, MemorySpace>
   apply(ExecutionSpace const &space,
         Kokkos::View<ValueType *, MemorySpace> const &source_values)
@@ -89,9 +90,9 @@ class MLS
   }
 
 private:
-  MLSComputation<ValueType, PolynomialBasis, RBF, ExecutionSpace, MemorySpace>
+  MLSComputation<ValueType, PolynomialBasis, RBF, MemorySpace>
       _mlsc;
-  MPIComms<ExecutionSpace, MemorySpace> _comms;
+  MPIComms<MemorySpace> _comms;
   std::size_t _num_neighbors;
   std::size_t _src_size;
   std::size_t _tgt_size;
diff --git a/examples/moving_least_squares/mls_computation.hpp b/examples/moving_least_squares/mls_computation.hpp
index 8b1cd97ec..99e754770 100644
--- a/examples/moving_least_squares/mls_computation.hpp
+++ b/examples/moving_least_squares/mls_computation.hpp
@@ -20,12 +20,13 @@
 #include "symmetric_pseudoinverse_svd.hpp"
 
 template <typename ValueType, typename PolynomialBasis, typename RBF,
-          typename ExecutionSpace, typename MemorySpace>
+          typename MemorySpace>
 class MLSComputation
 {
 public:
   MLSComputation() = default;
 
+  template <typename ExecutionSpace>
   MLSComputation(
       ExecutionSpace const &space,
       Kokkos::View<ArborX::Point *, MemorySpace> const &source_points,
@@ -46,12 +47,13 @@ class MLSComputation
 
     auto a = computeMoment(space, phi, p);
     auto a_inv =
-        SymmPseudoInverseSVD<ValueType, ExecutionSpace,
-                             MemorySpace>::computePseudoInverses(space, a);
+        SymmPseudoInverseSVD<ValueType, MemorySpace>::computePseudoInverses(
+            space, a);
 
     computeCoefficients(space, phi, p, a_inv);
   }
 
+  template <typename ExecutionSpace>
   Kokkos::View<ValueType *>
   apply(ExecutionSpace const &space,
         Kokkos::View<ValueType *, MemorySpace> const &source_values)
@@ -76,6 +78,7 @@ class MLSComputation
   }
 
 private:
+  template <typename ExecutionSpace>
   Kokkos::View<ArborX::Point **, MemorySpace> translateToTarget(
       ExecutionSpace const &space,
       Kokkos::View<ArborX::Point *, MemorySpace> const &source_points,
@@ -104,6 +107,7 @@ class MLSComputation
     return source_ref_target;
   }
 
+  template <typename ExecutionSpace>
   Kokkos::View<ValueType *, MemorySpace> computeRadii(
       ExecutionSpace const &space,
       Kokkos::View<ArborX::Point **, MemorySpace> const &source_ref_target)
@@ -128,6 +132,7 @@ class MLSComputation
     return radii;
   }
 
+  template <typename ExecutionSpace>
   Kokkos::View<ValueType **, MemorySpace> computeWeight(
       ExecutionSpace const &space,
       Kokkos::View<ArborX::Point **, MemorySpace> const &source_ref_target,
@@ -149,6 +154,7 @@ class MLSComputation
     return phi;
   }
 
+  template <typename ExecutionSpace>
   Kokkos::View<ValueType ***, MemorySpace> computeVandermonde(
       ExecutionSpace const &space,
       Kokkos::View<ArborX::Point **, MemorySpace> const &source_ref_target)
@@ -174,6 +180,7 @@ class MLSComputation
     return p;
   }
 
+  template <typename ExecutionSpace>
   Kokkos::View<ValueType ***, MemorySpace>
   computeMoment(ExecutionSpace const &space,
                 Kokkos::View<ValueType **, MemorySpace> const &phi,
@@ -200,6 +207,7 @@ class MLSComputation
     return a;
   }
 
+  template <typename ExecutionSpace>
   void
   computeCoefficients(ExecutionSpace const &space,
                       Kokkos::View<ValueType **, MemorySpace> const &phi,
diff --git a/examples/moving_least_squares/moving_least_squares.cpp b/examples/moving_least_squares/moving_least_squares.cpp
index 8ea6f0bb1..632ab1dac 100644
--- a/examples/moving_least_squares/moving_least_squares.cpp
+++ b/examples/moving_least_squares/moving_least_squares.cpp
@@ -99,7 +99,7 @@ int main(int argc, char *argv[])
   Kokkos::deep_copy(space, target_points, target_points_host);
 
   // Create the transform from a point cloud to another
-  MLS<float, MVPolynomialBasis_3D, RBFWendland_0, ExecutionSpace, MemorySpace>
+  MLS<float, MVPolynomialBasis_3D, RBFWendland_0, MemorySpace>
       mls(space, mpi_comm, source_points, target_points);
 
   // Compute source values
diff --git a/examples/moving_least_squares/mpi_comms.hpp b/examples/moving_least_squares/mpi_comms.hpp
index e83bb1565..492d0512d 100644
--- a/examples/moving_least_squares/mpi_comms.hpp
+++ b/examples/moving_least_squares/mpi_comms.hpp
@@ -21,12 +21,13 @@
 
 #include <mpi.h>
 
-template <typename ExecutionSpace, typename MemorySpace>
+template <typename MemorySpace>
 class MPIComms
 {
 public:
   MPIComms() = default;
 
+  template <typename ExecutionSpace>
   MPIComms(ExecutionSpace const &space, MPI_Comm comm,
            Kokkos::View<int *, MemorySpace> indices,
            Kokkos::View<int *, MemorySpace> ranks)
@@ -113,7 +114,7 @@ class MPIComms
         space, *_distributor_back, mpi_rev_indices, _mpi_recv_indices);
   }
 
-  template <typename ValueType>
+  template <typename ExecutionSpace, typename ValueType>
   Kokkos::View<ValueType *, MemorySpace>
   distribute(ExecutionSpace const &space,
              Kokkos::View<ValueType *, MemorySpace> const &source)
diff --git a/examples/moving_least_squares/symmetric_pseudoinverse_svd.hpp b/examples/moving_least_squares/symmetric_pseudoinverse_svd.hpp
index 833d6c5aa..ade92775c 100644
--- a/examples/moving_least_squares/symmetric_pseudoinverse_svd.hpp
+++ b/examples/moving_least_squares/symmetric_pseudoinverse_svd.hpp
@@ -22,10 +22,11 @@
 // We also know that A is symmetric (by construction), so U = SV where S is
 // a sign matrix (only 1 or -1 in the diagonal, 0 elsewhere).
 // Thus A = U.E.S.U^T
-template <class ValueType, typename ExecutionSpace, typename MemorySpace>
+template <class ValueType, typename MemorySpace>
 class SymmPseudoInverseSVD
 {
 public:
+  template <typename ExecutionSpace>
   static Kokkos::View<ValueType ***, MemorySpace>
   computePseudoInverses(ExecutionSpace const &space,
                         Kokkos::View<ValueType ***, MemorySpace> const &mats)
@@ -183,7 +184,7 @@ class SymmPseudoInverseSVD
     return max;
   }
 
-  KOKKOS_FUNCTION
+  template <typename ExecutionSpace>
   SymmPseudoInverseSVD(ExecutionSpace const &space,
                        Kokkos::View<ValueType ***, MemorySpace> const &mats)
       : _num_matrices(mats.extent(0))

From dec46be7e0899fd3eb7a57dd9b024824160b474c Mon Sep 17 00:00:00 2001
From: Yohann Bosqued <bosq.yohann@gmail.com>
Date: Tue, 15 Aug 2023 11:39:13 -0400
Subject: [PATCH 31/44] Switching to AccessTraits for user inputs (attempt)

---
 examples/moving_least_squares/common.hpp      | 24 +++++++
 examples/moving_least_squares/mls.hpp         | 71 ++++++++++---------
 .../moving_least_squares/mls_computation.hpp  | 37 ++++++----
 .../moving_least_squares.cpp                  |  4 +-
 examples/moving_least_squares/mpi_comms.hpp   | 54 +++++++++++---
 5 files changed, 128 insertions(+), 62 deletions(-)
 create mode 100644 examples/moving_least_squares/common.hpp

diff --git a/examples/moving_least_squares/common.hpp b/examples/moving_least_squares/common.hpp
new file mode 100644
index 000000000..2e3a32da2
--- /dev/null
+++ b/examples/moving_least_squares/common.hpp
@@ -0,0 +1,24 @@
+/****************************************************************************
+ * Copyright (c) 2023 by the ArborX authors                                 *
+ * All rights reserved.                                                     *
+ *                                                                          *
+ * This file is part of the ArborX library. ArborX is                       *
+ * distributed under a BSD 3-clause license. For the licensing terms see    *
+ * the LICENSE file in the top-level directory.                             *
+ *                                                                          *
+ * SPDX-License-Identifier: BSD-3-Clause                                    *
+ ****************************************************************************/
+
+#pragma once
+
+#include <ArborX.hpp>
+
+#include <type_traits>
+
+namespace Details
+{
+template <typename T>
+using inner_value_t = std::decay_t<std::invoke_result_t<
+    decltype(ArborX::AccessTraits<T, ArborX::PrimitivesTag>::get), T const &,
+    int>>;
+} // namespace Details
\ No newline at end of file
diff --git a/examples/moving_least_squares/mls.hpp b/examples/moving_least_squares/mls.hpp
index 05a461933..99a139b8b 100644
--- a/examples/moving_least_squares/mls.hpp
+++ b/examples/moving_least_squares/mls.hpp
@@ -20,26 +20,46 @@
 #include "mls_computation.hpp"
 #include "mpi_comms.hpp"
 
-template <typename MemorySpace>
+template <typename MemorySpace, typename Points>
 struct TargetPoints
 {
-  Kokkos::View<ArborX::Point *, MemorySpace> target_points;
+  Points target_points;
   std::size_t num_neighbors;
 };
 
+template <typename MemorySpace, typename Points>
+struct ArborX::AccessTraits<TargetPoints<MemorySpace, Points>,
+                            ArborX::PredicatesTag>
+{
+  static KOKKOS_FUNCTION std::size_t
+  size(TargetPoints<MemorySpace, Points> const &tp)
+  {
+    return tp.target_points.extent(0);
+  }
+
+  static KOKKOS_FUNCTION auto get(TargetPoints<MemorySpace, Points> const &tp,
+                                  std::size_t i)
+  {
+    return ArborX::nearest(tp.target_points(i), tp.num_neighbors);
+  }
+
+  using memory_space = MemorySpace;
+};
+
 template <typename ValueType, typename PolynomialBasis, typename RBF,
           typename MemorySpace>
 class MLS
 {
 public:
-  template <typename ExecutionSpace>
-  MLS(ExecutionSpace const &space, MPI_Comm comm,
-      Kokkos::View<ArborX::Point *, MemorySpace> const &source_points,
-      Kokkos::View<ArborX::Point *, MemorySpace> const &target_points,
+  template <typename ExecutionSpace, typename Points>
+  MLS(ExecutionSpace const &space, MPI_Comm comm, Points const &source_points,
+      Points const &target_points,
       std::size_t num_neighbors = PolynomialBasis::size)
       : _num_neighbors(num_neighbors)
-      , _src_size(source_points.extent(0))
-      , _tgt_size(target_points.extent(0))
+      , _src_size(ArborX::AccessTraits<Points, ArborX::PrimitivesTag>::size(
+            source_points))
+      , _tgt_size(ArborX::AccessTraits<Points, ArborX::PrimitivesTag>::size(
+            target_points))
   {
     // There must be enough source points
     assert(_src_size >= _num_neighbors);
@@ -52,9 +72,9 @@ class MLS
     Kokkos::View<Kokkos::pair<int, int> *, MemorySpace> index_ranks(
         "Example::MLS::index_ranks", 0);
     Kokkos::View<int *, MemorySpace> offsets("Example::MLS::offsets", 0);
-    source_tree.query(space,
-                      TargetPoints<MemorySpace>{target_points, _num_neighbors},
-                      index_ranks, offsets);
+    source_tree.query(
+        space, TargetPoints<MemorySpace, Points>{target_points, _num_neighbors},
+        index_ranks, offsets);
 
     // Split indices/ranks
     Kokkos::View<int *, MemorySpace> local_indices(
@@ -72,12 +92,11 @@ class MLS
 
     // Set up comms and local source points
     _comms = MPIComms<MemorySpace>(space, comm, local_indices, local_ranks);
-    auto local_source_points = _comms.distribute(space, source_points);
+    auto local_source_points = _comms.distributeArborX(space, source_points);
 
     // Compute the internal MLS
-    _mlsc =
-        MLSComputation<ValueType, PolynomialBasis, RBF,
-                       MemorySpace>(space, local_source_points, target_points);
+    _mlsc = MLSComputation<ValueType, PolynomialBasis, RBF, MemorySpace>(
+        space, local_source_points, target_points);
   }
 
   template <typename ExecutionSpace>
@@ -86,31 +105,13 @@ class MLS
         Kokkos::View<ValueType *, MemorySpace> const &source_values)
   {
     assert(source_values.extent(0) == _src_size);
-    return _mlsc.apply(space, _comms.distribute(space, source_values));
+    return _mlsc.apply(space, _comms.distributeView(space, source_values));
   }
 
 private:
-  MLSComputation<ValueType, PolynomialBasis, RBF, MemorySpace>
-      _mlsc;
+  MLSComputation<ValueType, PolynomialBasis, RBF, MemorySpace> _mlsc;
   MPIComms<MemorySpace> _comms;
   std::size_t _num_neighbors;
   std::size_t _src_size;
   std::size_t _tgt_size;
-};
-
-template <typename MemorySpace>
-struct ArborX::AccessTraits<TargetPoints<MemorySpace>, ArborX::PredicatesTag>
-{
-  static KOKKOS_FUNCTION std::size_t size(TargetPoints<MemorySpace> const &tp)
-  {
-    return tp.target_points.extent(0);
-  }
-
-  static KOKKOS_FUNCTION auto get(TargetPoints<MemorySpace> const &tp,
-                                  std::size_t i)
-  {
-    return ArborX::nearest(tp.target_points(i), tp.num_neighbors);
-  }
-
-  using memory_space = MemorySpace;
 };
\ No newline at end of file
diff --git a/examples/moving_least_squares/mls_computation.hpp b/examples/moving_least_squares/mls_computation.hpp
index 99e754770..fb1f641d0 100644
--- a/examples/moving_least_squares/mls_computation.hpp
+++ b/examples/moving_least_squares/mls_computation.hpp
@@ -17,6 +17,7 @@
 
 #include <cassert>
 
+#include "common.hpp"
 #include "symmetric_pseudoinverse_svd.hpp"
 
 template <typename ValueType, typename PolynomialBasis, typename RBF,
@@ -26,13 +27,17 @@ class MLSComputation
 public:
   MLSComputation() = default;
 
-  template <typename ExecutionSpace>
-  MLSComputation(
-      ExecutionSpace const &space,
-      Kokkos::View<ArborX::Point *, MemorySpace> const &source_points,
-      Kokkos::View<ArborX::Point *, MemorySpace> const &target_points)
-      : _num_neighbors(source_points.extent(0) / target_points.extent(0))
-      , _num_targets(target_points.extent(0))
+  template <typename ExecutionSpace, typename Points>
+  MLSComputation(ExecutionSpace const &space,
+                 Kokkos::View<Details::inner_value_t<Points> *,
+                              MemorySpace> const &source_points,
+                 Points const &target_points)
+      : _num_neighbors(
+            source_points.extent(0) /
+            ArborX::AccessTraits<Points, ArborX::PrimitivesTag>::size(
+                target_points))
+      , _num_targets(ArborX::AccessTraits<Points, ArborX::PrimitivesTag>::size(
+            target_points))
   {
     // There must be a list of num_neighbors source points for each
     // target point
@@ -78,12 +83,16 @@ class MLSComputation
   }
 
 private:
-  template <typename ExecutionSpace>
-  Kokkos::View<ArborX::Point **, MemorySpace> translateToTarget(
-      ExecutionSpace const &space,
-      Kokkos::View<ArborX::Point *, MemorySpace> const &source_points,
-      Kokkos::View<ArborX::Point *, MemorySpace> const &target_points)
+  template <typename ExecutionSpace, typename Points>
+  Kokkos::View<ArborX::Point **, MemorySpace>
+  translateToTarget(ExecutionSpace const &space,
+                    Kokkos::View<Details::inner_value_t<Points> *,
+                                 MemorySpace> const &source_points,
+                    Points const &target_points)
   {
+    using point_t = Details::inner_value_t<Points>;
+    using access = ArborX::AccessTraits<Points, ArborX::PrimitivesTag>;
+
     // We center each group around the target as it ables you to
     // optimize the final computation
     Kokkos::View<ArborX::Point **, MemorySpace> source_ref_target(
@@ -95,8 +104,8 @@ class MLSComputation
         Kokkos::MDRangePolicy<Kokkos::Rank<2>>(space, {0, 0},
                                                {_num_targets, _num_neighbors}),
         KOKKOS_LAMBDA(int const i, int const j) {
-          ArborX::Point src = source_points(i * _num_neighbors + j);
-          ArborX::Point tgt = target_points(i);
+          point_t src = source_points(i * _num_neighbors + j);
+          point_t tgt = access::get(target_points, i);
           source_ref_target(i, j) = ArborX::Point{
               src[0] - tgt[0],
               src[1] - tgt[1],
diff --git a/examples/moving_least_squares/moving_least_squares.cpp b/examples/moving_least_squares/moving_least_squares.cpp
index 632ab1dac..fb7262fe5 100644
--- a/examples/moving_least_squares/moving_least_squares.cpp
+++ b/examples/moving_least_squares/moving_least_squares.cpp
@@ -99,8 +99,8 @@ int main(int argc, char *argv[])
   Kokkos::deep_copy(space, target_points, target_points_host);
 
   // Create the transform from a point cloud to another
-  MLS<float, MVPolynomialBasis_3D, RBFWendland_0, MemorySpace>
-      mls(space, mpi_comm, source_points, target_points);
+  MLS<float, MVPolynomialBasis_3D, RBFWendland_0, MemorySpace> mls(
+      space, mpi_comm, source_points, target_points);
 
   // Compute source values
   Kokkos::View<float *, MemorySpace> source_values("Example::source_values",
diff --git a/examples/moving_least_squares/mpi_comms.hpp b/examples/moving_least_squares/mpi_comms.hpp
index 492d0512d..bcd01d98e 100644
--- a/examples/moving_least_squares/mpi_comms.hpp
+++ b/examples/moving_least_squares/mpi_comms.hpp
@@ -19,6 +19,7 @@
 #include <memory>
 #include <optional>
 
+#include "common.hpp"
 #include <mpi.h>
 
 template <typename MemorySpace>
@@ -114,10 +115,33 @@ class MPIComms
         space, *_distributor_back, mpi_rev_indices, _mpi_recv_indices);
   }
 
+  template <typename ExecutionSpace, typename Values>
+  Kokkos::View<Details::inner_value_t<Values> *, MemorySpace>
+  distributeArborX(ExecutionSpace const &space, Values const &source)
+  {
+    using value_t = Details::inner_value_t<Values>;
+    using access = ArborX::AccessTraits<Values, ArborX::PrimitivesTag>;
+    assert(_distributor_back.has_value());
+
+    // We know what each process want so we prepare the data to be sent
+    Kokkos::View<value_t *, MemorySpace> data_to_send(
+        Kokkos::view_alloc(Kokkos::WithoutInitializing,
+                           "Example::MPI::data_to_send"),
+        _num_requests);
+    Kokkos::parallel_for(
+        "Example::MPI::data_to_send_fill",
+        Kokkos::RangePolicy<ExecutionSpace>(space, 0, _num_requests),
+        KOKKOS_CLASS_LAMBDA(int const i) {
+          data_to_send(i) = access::get(source, _mpi_send_indices(i));
+        });
+
+    return distribute(space, data_to_send);
+  }
+
   template <typename ExecutionSpace, typename ValueType>
   Kokkos::View<ValueType *, MemorySpace>
-  distribute(ExecutionSpace const &space,
-             Kokkos::View<ValueType *, MemorySpace> const &source)
+  distributeView(ExecutionSpace const &space,
+                 Kokkos::View<ValueType *, MemorySpace> const &source)
   {
     assert(_distributor_back.has_value());
 
@@ -133,7 +157,23 @@ class MPIComms
           data_to_send(i) = source(_mpi_send_indices(i));
         });
 
-    // Then we properly send it, and each process has what it wants, but in the
+    return distribute(space, data_to_send);
+  }
+
+private:
+  std::shared_ptr<MPI_Comm> _comm;
+  Kokkos::View<int *, MemorySpace> _mpi_send_indices;
+  Kokkos::View<int *, MemorySpace> _mpi_recv_indices;
+  std::optional<ArborX::Details::Distributor<MemorySpace>> _distributor_back;
+  std::size_t _num_requests;
+  std::size_t _num_responses;
+
+  template <typename ExecutionSpace, typename ValueType>
+  Kokkos::View<ValueType *, MemorySpace>
+  distribute(ExecutionSpace const &space,
+             Kokkos::View<ValueType *, MemorySpace> const &data_to_send)
+  {
+    // We properly send the data, and each process has what it wants, but in the
     // wrong order
     Kokkos::View<ValueType *, MemorySpace> data_to_recv(
         Kokkos::view_alloc(Kokkos::WithoutInitializing,
@@ -155,12 +195,4 @@ class MPIComms
 
     return output;
   }
-
-private:
-  std::shared_ptr<MPI_Comm> _comm;
-  Kokkos::View<int *, MemorySpace> _mpi_send_indices;
-  Kokkos::View<int *, MemorySpace> _mpi_recv_indices;
-  std::optional<ArborX::Details::Distributor<MemorySpace>> _distributor_back;
-  std::size_t _num_requests;
-  std::size_t _num_responses;
 };
\ No newline at end of file

From 210243db50ed6fb7a371a05b890eabf847a5ff1f Mon Sep 17 00:00:00 2001
From: Yohann Bosqued <bosq.yohann@gmail.com>
Date: Tue, 15 Aug 2023 16:22:25 -0400
Subject: [PATCH 32/44] Simplification of traits access

---
 examples/moving_least_squares/common.hpp      |  8 +++--
 examples/moving_least_squares/mls.hpp         | 29 ++++++++-----------
 .../moving_least_squares/mls_computation.hpp  | 12 +++-----
 examples/moving_least_squares/mpi_comms.hpp   |  4 +--
 4 files changed, 23 insertions(+), 30 deletions(-)

diff --git a/examples/moving_least_squares/common.hpp b/examples/moving_least_squares/common.hpp
index 2e3a32da2..df0c0abdd 100644
--- a/examples/moving_least_squares/common.hpp
+++ b/examples/moving_least_squares/common.hpp
@@ -18,7 +18,9 @@
 namespace Details
 {
 template <typename T>
-using inner_value_t = std::decay_t<std::invoke_result_t<
-    decltype(ArborX::AccessTraits<T, ArborX::PrimitivesTag>::get), T const &,
-    int>>;
+using access = ArborX::AccessTraits<T, ArborX::PrimitivesTag>;
+
+template <typename T>
+using inner_value_t = std::decay_t<
+    std::invoke_result_t<decltype(access<T>::get), T const &, int>>;
 } // namespace Details
\ No newline at end of file
diff --git a/examples/moving_least_squares/mls.hpp b/examples/moving_least_squares/mls.hpp
index 99a139b8b..5c264f299 100644
--- a/examples/moving_least_squares/mls.hpp
+++ b/examples/moving_least_squares/mls.hpp
@@ -17,33 +17,31 @@
 
 #include <cassert>
 
+#include "common.hpp"
 #include "mls_computation.hpp"
 #include "mpi_comms.hpp"
 
-template <typename MemorySpace, typename Points>
+template <typename Points>
 struct TargetPoints
 {
   Points target_points;
   std::size_t num_neighbors;
 };
 
-template <typename MemorySpace, typename Points>
-struct ArborX::AccessTraits<TargetPoints<MemorySpace, Points>,
-                            ArborX::PredicatesTag>
+template <typename Points>
+struct ArborX::AccessTraits<TargetPoints<Points>, ArborX::PredicatesTag>
 {
-  static KOKKOS_FUNCTION std::size_t
-  size(TargetPoints<MemorySpace, Points> const &tp)
+  static KOKKOS_FUNCTION std::size_t size(TargetPoints<Points> const &tp)
   {
     return tp.target_points.extent(0);
   }
 
-  static KOKKOS_FUNCTION auto get(TargetPoints<MemorySpace, Points> const &tp,
-                                  std::size_t i)
+  static KOKKOS_FUNCTION auto get(TargetPoints<Points> const &tp, std::size_t i)
   {
     return ArborX::nearest(tp.target_points(i), tp.num_neighbors);
   }
 
-  using memory_space = MemorySpace;
+  using memory_space = typename ::Details::access<Points>::memory_space;
 };
 
 template <typename ValueType, typename PolynomialBasis, typename RBF,
@@ -56,12 +54,9 @@ class MLS
       Points const &target_points,
       std::size_t num_neighbors = PolynomialBasis::size)
       : _num_neighbors(num_neighbors)
-      , _src_size(ArborX::AccessTraits<Points, ArborX::PrimitivesTag>::size(
-            source_points))
-      , _tgt_size(ArborX::AccessTraits<Points, ArborX::PrimitivesTag>::size(
-            target_points))
+      , _src_size(Details::access<Points>::size(source_points))
+      , _tgt_size(Details::access<Points>::size(target_points))
   {
-    // There must be enough source points
     assert(_src_size >= _num_neighbors);
 
     // Organize source points as tree
@@ -72,9 +67,9 @@ class MLS
     Kokkos::View<Kokkos::pair<int, int> *, MemorySpace> index_ranks(
         "Example::MLS::index_ranks", 0);
     Kokkos::View<int *, MemorySpace> offsets("Example::MLS::offsets", 0);
-    source_tree.query(
-        space, TargetPoints<MemorySpace, Points>{target_points, _num_neighbors},
-        index_ranks, offsets);
+    source_tree.query(space,
+                      TargetPoints<Points>{target_points, _num_neighbors},
+                      index_ranks, offsets);
 
     // Split indices/ranks
     Kokkos::View<int *, MemorySpace> local_indices(
diff --git a/examples/moving_least_squares/mls_computation.hpp b/examples/moving_least_squares/mls_computation.hpp
index fb1f641d0..7202cf42d 100644
--- a/examples/moving_least_squares/mls_computation.hpp
+++ b/examples/moving_least_squares/mls_computation.hpp
@@ -32,12 +32,9 @@ class MLSComputation
                  Kokkos::View<Details::inner_value_t<Points> *,
                               MemorySpace> const &source_points,
                  Points const &target_points)
-      : _num_neighbors(
-            source_points.extent(0) /
-            ArborX::AccessTraits<Points, ArborX::PrimitivesTag>::size(
-                target_points))
-      , _num_targets(ArborX::AccessTraits<Points, ArborX::PrimitivesTag>::size(
-            target_points))
+      : _num_neighbors(source_points.extent(0) /
+                       Details::access<Points>::size(target_points))
+      , _num_targets(Details::access<Points>::size(target_points))
   {
     // There must be a list of num_neighbors source points for each
     // target point
@@ -91,7 +88,6 @@ class MLSComputation
                     Points const &target_points)
   {
     using point_t = Details::inner_value_t<Points>;
-    using access = ArborX::AccessTraits<Points, ArborX::PrimitivesTag>;
 
     // We center each group around the target as it ables you to
     // optimize the final computation
@@ -105,7 +101,7 @@ class MLSComputation
                                                {_num_targets, _num_neighbors}),
         KOKKOS_LAMBDA(int const i, int const j) {
           point_t src = source_points(i * _num_neighbors + j);
-          point_t tgt = access::get(target_points, i);
+          point_t tgt = Details::access<Points>::get(target_points, i);
           source_ref_target(i, j) = ArborX::Point{
               src[0] - tgt[0],
               src[1] - tgt[1],
diff --git a/examples/moving_least_squares/mpi_comms.hpp b/examples/moving_least_squares/mpi_comms.hpp
index bcd01d98e..ad6abb6f2 100644
--- a/examples/moving_least_squares/mpi_comms.hpp
+++ b/examples/moving_least_squares/mpi_comms.hpp
@@ -120,7 +120,6 @@ class MPIComms
   distributeArborX(ExecutionSpace const &space, Values const &source)
   {
     using value_t = Details::inner_value_t<Values>;
-    using access = ArborX::AccessTraits<Values, ArborX::PrimitivesTag>;
     assert(_distributor_back.has_value());
 
     // We know what each process want so we prepare the data to be sent
@@ -132,7 +131,8 @@ class MPIComms
         "Example::MPI::data_to_send_fill",
         Kokkos::RangePolicy<ExecutionSpace>(space, 0, _num_requests),
         KOKKOS_CLASS_LAMBDA(int const i) {
-          data_to_send(i) = access::get(source, _mpi_send_indices(i));
+          data_to_send(i) =
+              Details::access<Values>::get(source, _mpi_send_indices(i));
         });
 
     return distribute(space, data_to_send);

From b1267dd455a893e24194de16f865de43badf790b Mon Sep 17 00:00:00 2001
From: Yohann Bosqued <bosq.yohann@gmail.com>
Date: Tue, 15 Aug 2023 16:43:45 -0400
Subject: [PATCH 33/44] Assertions for public interfaces

---
 examples/moving_least_squares/mls.hpp                | 12 ++++++++++++
 examples/moving_least_squares/mls_computation.hpp    | 11 +++++++++++
 examples/moving_least_squares/mpi_comms.hpp          | 12 ++++++++++++
 .../symmetric_pseudoinverse_svd.hpp                  |  5 +++++
 4 files changed, 40 insertions(+)

diff --git a/examples/moving_least_squares/mls.hpp b/examples/moving_least_squares/mls.hpp
index 5c264f299..999e27710 100644
--- a/examples/moving_least_squares/mls.hpp
+++ b/examples/moving_least_squares/mls.hpp
@@ -12,6 +12,7 @@
 #pragma once
 
 #include <ArborX.hpp>
+#include <ArborX_DetailsKokkosExtAccessibilityTraits.hpp>
 
 #include <Kokkos_Core.hpp>
 
@@ -57,6 +58,15 @@ class MLS
       , _src_size(Details::access<Points>::size(source_points))
       , _tgt_size(Details::access<Points>::size(target_points))
   {
+    static_assert(
+        KokkosExt::is_accessible_from<MemorySpace, ExecutionSpace>::value);
+    static_assert(KokkosExt::is_accessible_from<
+                  typename Details::access<Points>::memory_space,
+                  ExecutionSpace>::value);
+    ArborX::Details::check_valid_access_traits(ArborX::PrimitivesTag{},
+                                               source_points);
+
+    // A minimum number of source points is needed
     assert(_src_size >= _num_neighbors);
 
     // Organize source points as tree
@@ -99,6 +109,8 @@ class MLS
   apply(ExecutionSpace const &space,
         Kokkos::View<ValueType *, MemorySpace> const &source_values)
   {
+    static_assert(
+        KokkosExt::is_accessible_from<MemorySpace, ExecutionSpace>::value);
     assert(source_values.extent(0) == _src_size);
     return _mlsc.apply(space, _comms.distributeView(space, source_values));
   }
diff --git a/examples/moving_least_squares/mls_computation.hpp b/examples/moving_least_squares/mls_computation.hpp
index 7202cf42d..24adc5710 100644
--- a/examples/moving_least_squares/mls_computation.hpp
+++ b/examples/moving_least_squares/mls_computation.hpp
@@ -12,6 +12,7 @@
 #pragma once
 
 #include <ArborX.hpp>
+#include <ArborX_DetailsKokkosExtAccessibilityTraits.hpp>
 
 #include <Kokkos_Core.hpp>
 
@@ -36,6 +37,14 @@ class MLSComputation
                        Details::access<Points>::size(target_points))
       , _num_targets(Details::access<Points>::size(target_points))
   {
+    static_assert(
+        KokkosExt::is_accessible_from<MemorySpace, ExecutionSpace>::value);
+    static_assert(KokkosExt::is_accessible_from<
+                  typename Details::access<Points>::memory_space,
+                  ExecutionSpace>::value);
+    ArborX::Details::check_valid_access_traits(ArborX::PrimitivesTag{},
+                                               target_points);
+
     // There must be a list of num_neighbors source points for each
     // target point
     assert(source_points.extent(0) == _num_targets * _num_neighbors);
@@ -60,6 +69,8 @@ class MLSComputation
   apply(ExecutionSpace const &space,
         Kokkos::View<ValueType *, MemorySpace> const &source_values)
   {
+    static_assert(
+        KokkosExt::is_accessible_from<MemorySpace, ExecutionSpace>::value);
     assert(source_values.extent(0) == _num_targets * _num_neighbors);
 
     Kokkos::View<ValueType *, MemorySpace> target_values(
diff --git a/examples/moving_least_squares/mpi_comms.hpp b/examples/moving_least_squares/mpi_comms.hpp
index ad6abb6f2..f32eddb9a 100644
--- a/examples/moving_least_squares/mpi_comms.hpp
+++ b/examples/moving_least_squares/mpi_comms.hpp
@@ -12,6 +12,7 @@
 #pragma once
 
 #include <ArborX.hpp>
+#include <ArborX_DetailsKokkosExtAccessibilityTraits.hpp>
 
 #include <Kokkos_Core.hpp>
 
@@ -33,6 +34,8 @@ class MPIComms
            Kokkos::View<int *, MemorySpace> indices,
            Kokkos::View<int *, MemorySpace> ranks)
   {
+    static_assert(
+        KokkosExt::is_accessible_from<MemorySpace, ExecutionSpace>::value);
     assert(indices.extent(0) == ranks.extent(0));
     std::size_t data_len = indices.extent(0);
 
@@ -120,6 +123,13 @@ class MPIComms
   distributeArborX(ExecutionSpace const &space, Values const &source)
   {
     using value_t = Details::inner_value_t<Values>;
+    static_assert(
+        KokkosExt::is_accessible_from<MemorySpace, ExecutionSpace>::value);
+    static_assert(KokkosExt::is_accessible_from<
+                  typename Details::access<Values>::memory_space,
+                  ExecutionSpace>::value);
+    ArborX::Details::check_valid_access_traits(ArborX::PrimitivesTag{}, source);
+
     assert(_distributor_back.has_value());
 
     // We know what each process want so we prepare the data to be sent
@@ -143,6 +153,8 @@ class MPIComms
   distributeView(ExecutionSpace const &space,
                  Kokkos::View<ValueType *, MemorySpace> const &source)
   {
+    static_assert(
+        KokkosExt::is_accessible_from<MemorySpace, ExecutionSpace>::value);
     assert(_distributor_back.has_value());
 
     // We know what each process want so we prepare the data to be sent
diff --git a/examples/moving_least_squares/symmetric_pseudoinverse_svd.hpp b/examples/moving_least_squares/symmetric_pseudoinverse_svd.hpp
index ade92775c..e901f660c 100644
--- a/examples/moving_least_squares/symmetric_pseudoinverse_svd.hpp
+++ b/examples/moving_least_squares/symmetric_pseudoinverse_svd.hpp
@@ -11,6 +11,8 @@
 
 #pragma once
 
+#include <ArborX_DetailsKokkosExtAccessibilityTraits.hpp>
+
 #include <Kokkos_Core.hpp>
 
 #include <cassert>
@@ -31,6 +33,9 @@ class SymmPseudoInverseSVD
   computePseudoInverses(ExecutionSpace const &space,
                         Kokkos::View<ValueType ***, MemorySpace> const &mats)
   {
+    static_assert(
+        KokkosExt::is_accessible_from<MemorySpace, ExecutionSpace>::value);
+
     SymmPseudoInverseSVD spis(space, mats);
 
     // Iterative approach, we will "deconstruct" E.S until only the diagonal

From 9315a4c1a33d7cfe965c5be4342253025fa55fe7 Mon Sep 17 00:00:00 2001
From: Yohann Bosqued <bosq.yohann@gmail.com>
Date: Thu, 17 Aug 2023 11:57:22 -0400
Subject: [PATCH 34/44] Replacing code using ArborX's code and convention

---
 examples/moving_least_squares/common.hpp      | 26 ----------
 examples/moving_least_squares/mls.hpp         | 47 ++++++++++++-------
 .../moving_least_squares/mls_computation.hpp  | 45 +++++++++++-------
 .../moving_least_squares.cpp                  |  2 +-
 examples/moving_least_squares/mpi_comms.hpp   | 21 +++++----
 5 files changed, 71 insertions(+), 70 deletions(-)
 delete mode 100644 examples/moving_least_squares/common.hpp

diff --git a/examples/moving_least_squares/common.hpp b/examples/moving_least_squares/common.hpp
deleted file mode 100644
index df0c0abdd..000000000
--- a/examples/moving_least_squares/common.hpp
+++ /dev/null
@@ -1,26 +0,0 @@
-/****************************************************************************
- * Copyright (c) 2023 by the ArborX authors                                 *
- * All rights reserved.                                                     *
- *                                                                          *
- * This file is part of the ArborX library. ArborX is                       *
- * distributed under a BSD 3-clause license. For the licensing terms see    *
- * the LICENSE file in the top-level directory.                             *
- *                                                                          *
- * SPDX-License-Identifier: BSD-3-Clause                                    *
- ****************************************************************************/
-
-#pragma once
-
-#include <ArborX.hpp>
-
-#include <type_traits>
-
-namespace Details
-{
-template <typename T>
-using access = ArborX::AccessTraits<T, ArborX::PrimitivesTag>;
-
-template <typename T>
-using inner_value_t = std::decay_t<
-    std::invoke_result_t<decltype(access<T>::get), T const &, int>>;
-} // namespace Details
\ No newline at end of file
diff --git a/examples/moving_least_squares/mls.hpp b/examples/moving_least_squares/mls.hpp
index 999e27710..0566515cc 100644
--- a/examples/moving_least_squares/mls.hpp
+++ b/examples/moving_least_squares/mls.hpp
@@ -18,7 +18,6 @@
 
 #include <cassert>
 
-#include "common.hpp"
 #include "mls_computation.hpp"
 #include "mpi_comms.hpp"
 
@@ -34,15 +33,21 @@ struct ArborX::AccessTraits<TargetPoints<Points>, ArborX::PredicatesTag>
 {
   static KOKKOS_FUNCTION std::size_t size(TargetPoints<Points> const &tp)
   {
-    return tp.target_points.extent(0);
+    return ArborX::AccessTraits<Points, ArborX::PrimitivesTag>::size(
+        tp.target_points);
   }
 
   static KOKKOS_FUNCTION auto get(TargetPoints<Points> const &tp, std::size_t i)
   {
-    return ArborX::nearest(tp.target_points(i), tp.num_neighbors);
+    return ArborX::nearest(
+        ArborX::AccessTraits<Points, ArborX::PrimitivesTag>::get(
+            tp.target_points, i),
+        tp.num_neighbors);
   }
 
-  using memory_space = typename ::Details::access<Points>::memory_space;
+  using memory_space =
+      typename ArborX::AccessTraits<Points,
+                                    ArborX::PrimitivesTag>::memory_space;
 };
 
 template <typename ValueType, typename PolynomialBasis, typename RBF,
@@ -51,18 +56,22 @@ class MLS
 {
 public:
   template <typename ExecutionSpace, typename Points>
-  MLS(ExecutionSpace const &space, MPI_Comm comm, Points const &source_points,
+  MLS(MPI_Comm comm, ExecutionSpace const &space, Points const &source_points,
       Points const &target_points,
       std::size_t num_neighbors = PolynomialBasis::size)
       : _num_neighbors(num_neighbors)
-      , _src_size(Details::access<Points>::size(source_points))
-      , _tgt_size(Details::access<Points>::size(target_points))
+      , _src_size(ArborX::AccessTraits<Points, ArborX::PrimitivesTag>::size(
+            source_points))
+      , _tgt_size(ArborX::AccessTraits<Points, ArborX::PrimitivesTag>::size(
+            target_points))
   {
     static_assert(
         KokkosExt::is_accessible_from<MemorySpace, ExecutionSpace>::value);
-    static_assert(KokkosExt::is_accessible_from<
-                  typename Details::access<Points>::memory_space,
-                  ExecutionSpace>::value);
+    static_assert(
+        KokkosExt::is_accessible_from<
+            typename ArborX::AccessTraits<Points,
+                                          ArborX::PrimitivesTag>::memory_space,
+            ExecutionSpace>::value);
     ArborX::Details::check_valid_access_traits(ArborX::PrimitivesTag{},
                                                source_points);
 
@@ -74,7 +83,7 @@ class MLS
                                                      source_points);
 
     // Perform the query
-    Kokkos::View<Kokkos::pair<int, int> *, MemorySpace> index_ranks(
+    Kokkos::View<ArborX::PairIndexRank *, MemorySpace> index_ranks(
         "Example::MLS::index_ranks", 0);
     Kokkos::View<int *, MemorySpace> offsets("Example::MLS::offsets", 0);
     source_tree.query(space,
@@ -83,20 +92,24 @@ class MLS
 
     // Split indices/ranks
     Kokkos::View<int *, MemorySpace> local_indices(
-        "Example::MLS::local_indices", _tgt_size * _num_neighbors);
-    Kokkos::View<int *, MemorySpace> local_ranks("Example::MLS::local_ranks",
-                                                 _tgt_size * _num_neighbors);
+        Kokkos::view_alloc(Kokkos::WithoutInitializing,
+                           "Example::MLS::local_indices"),
+        _tgt_size * _num_neighbors);
+    Kokkos::View<int *, MemorySpace> local_ranks(
+        Kokkos::view_alloc(Kokkos::WithoutInitializing,
+                           "Example::MLS::local_ranks"),
+        _tgt_size * _num_neighbors);
     Kokkos::parallel_for(
         "Example::MLS::index_ranks_split",
         Kokkos::RangePolicy<ExecutionSpace>(space, 0,
                                             _tgt_size * _num_neighbors),
         KOKKOS_LAMBDA(int const i) {
-          local_indices(i) = index_ranks(i).first;
-          local_ranks(i) = index_ranks(i).second;
+          local_indices(i) = index_ranks(i).index;
+          local_ranks(i) = index_ranks(i).rank;
         });
 
     // Set up comms and local source points
-    _comms = MPIComms<MemorySpace>(space, comm, local_indices, local_ranks);
+    _comms = MPIComms<MemorySpace>(comm, space, local_indices, local_ranks);
     auto local_source_points = _comms.distributeArborX(space, source_points);
 
     // Compute the internal MLS
diff --git a/examples/moving_least_squares/mls_computation.hpp b/examples/moving_least_squares/mls_computation.hpp
index 24adc5710..6734d2f1b 100644
--- a/examples/moving_least_squares/mls_computation.hpp
+++ b/examples/moving_least_squares/mls_computation.hpp
@@ -18,7 +18,6 @@
 
 #include <cassert>
 
-#include "common.hpp"
 #include "symmetric_pseudoinverse_svd.hpp"
 
 template <typename ValueType, typename PolynomialBasis, typename RBF,
@@ -29,24 +28,29 @@ class MLSComputation
   MLSComputation() = default;
 
   template <typename ExecutionSpace, typename Points>
-  MLSComputation(ExecutionSpace const &space,
-                 Kokkos::View<Details::inner_value_t<Points> *,
-                              MemorySpace> const &source_points,
-                 Points const &target_points)
-      : _num_neighbors(source_points.extent(0) /
-                       Details::access<Points>::size(target_points))
-      , _num_targets(Details::access<Points>::size(target_points))
+  MLSComputation(
+      ExecutionSpace const &space,
+      Kokkos::View<
+          typename ArborX::Details::AccessTraitsHelper<
+              ArborX::AccessTraits<Points, ArborX::PrimitivesTag>>::type *,
+          MemorySpace> const &source_points,
+      Points const &target_points)
+      : _num_targets(ArborX::AccessTraits<Points, ArborX::PrimitivesTag>::size(
+            target_points))
   {
     static_assert(
         KokkosExt::is_accessible_from<MemorySpace, ExecutionSpace>::value);
-    static_assert(KokkosExt::is_accessible_from<
-                  typename Details::access<Points>::memory_space,
-                  ExecutionSpace>::value);
+    static_assert(
+        KokkosExt::is_accessible_from<
+            typename ArborX::AccessTraits<Points,
+                                          ArborX::PrimitivesTag>::memory_space,
+            ExecutionSpace>::value);
     ArborX::Details::check_valid_access_traits(ArborX::PrimitivesTag{},
                                                target_points);
 
     // There must be a list of num_neighbors source points for each
     // target point
+    _num_neighbors = source_points.extent(0) / _num_targets;
     assert(source_points.extent(0) == _num_targets * _num_neighbors);
 
     auto source_ref_target =
@@ -92,13 +96,16 @@ class MLSComputation
 
 private:
   template <typename ExecutionSpace, typename Points>
-  Kokkos::View<ArborX::Point **, MemorySpace>
-  translateToTarget(ExecutionSpace const &space,
-                    Kokkos::View<Details::inner_value_t<Points> *,
-                                 MemorySpace> const &source_points,
-                    Points const &target_points)
+  Kokkos::View<ArborX::Point **, MemorySpace> translateToTarget(
+      ExecutionSpace const &space,
+      Kokkos::View<
+          typename ArborX::Details::AccessTraitsHelper<
+              ArborX::AccessTraits<Points, ArborX::PrimitivesTag>>::type *,
+          MemorySpace> const &source_points,
+      Points const &target_points)
   {
-    using point_t = Details::inner_value_t<Points>;
+    using point_t = typename ArborX::Details::AccessTraitsHelper<
+        ArborX::AccessTraits<Points, ArborX::PrimitivesTag>>::type;
 
     // We center each group around the target as it ables you to
     // optimize the final computation
@@ -112,7 +119,9 @@ class MLSComputation
                                                {_num_targets, _num_neighbors}),
         KOKKOS_LAMBDA(int const i, int const j) {
           point_t src = source_points(i * _num_neighbors + j);
-          point_t tgt = Details::access<Points>::get(target_points, i);
+          point_t tgt =
+              ArborX::AccessTraits<Points, ArborX::PrimitivesTag>::get(
+                  target_points, i);
           source_ref_target(i, j) = ArborX::Point{
               src[0] - tgt[0],
               src[1] - tgt[1],
diff --git a/examples/moving_least_squares/moving_least_squares.cpp b/examples/moving_least_squares/moving_least_squares.cpp
index fb7262fe5..c1cc293c7 100644
--- a/examples/moving_least_squares/moving_least_squares.cpp
+++ b/examples/moving_least_squares/moving_least_squares.cpp
@@ -100,7 +100,7 @@ int main(int argc, char *argv[])
 
   // Create the transform from a point cloud to another
   MLS<float, MVPolynomialBasis_3D, RBFWendland_0, MemorySpace> mls(
-      space, mpi_comm, source_points, target_points);
+      mpi_comm, space, source_points, target_points);
 
   // Compute source values
   Kokkos::View<float *, MemorySpace> source_values("Example::source_values",
diff --git a/examples/moving_least_squares/mpi_comms.hpp b/examples/moving_least_squares/mpi_comms.hpp
index f32eddb9a..e39a22f99 100644
--- a/examples/moving_least_squares/mpi_comms.hpp
+++ b/examples/moving_least_squares/mpi_comms.hpp
@@ -20,7 +20,6 @@
 #include <memory>
 #include <optional>
 
-#include "common.hpp"
 #include <mpi.h>
 
 template <typename MemorySpace>
@@ -30,7 +29,7 @@ class MPIComms
   MPIComms() = default;
 
   template <typename ExecutionSpace>
-  MPIComms(ExecutionSpace const &space, MPI_Comm comm,
+  MPIComms(MPI_Comm comm, ExecutionSpace const &space,
            Kokkos::View<int *, MemorySpace> indices,
            Kokkos::View<int *, MemorySpace> ranks)
   {
@@ -119,15 +118,20 @@ class MPIComms
   }
 
   template <typename ExecutionSpace, typename Values>
-  Kokkos::View<Details::inner_value_t<Values> *, MemorySpace>
+  Kokkos::View<typename ArborX::Details::AccessTraitsHelper<
+                   ArborX::AccessTraits<Values, ArborX::PrimitivesTag>>::type *,
+               MemorySpace>
   distributeArborX(ExecutionSpace const &space, Values const &source)
   {
-    using value_t = Details::inner_value_t<Values>;
+    using value_t = typename ArborX::Details::AccessTraitsHelper<
+        ArborX::AccessTraits<Values, ArborX::PrimitivesTag>>::type;
     static_assert(
         KokkosExt::is_accessible_from<MemorySpace, ExecutionSpace>::value);
-    static_assert(KokkosExt::is_accessible_from<
-                  typename Details::access<Values>::memory_space,
-                  ExecutionSpace>::value);
+    static_assert(
+        KokkosExt::is_accessible_from<
+            typename ArborX::AccessTraits<Values,
+                                          ArborX::PrimitivesTag>::memory_space,
+            ExecutionSpace>::value);
     ArborX::Details::check_valid_access_traits(ArborX::PrimitivesTag{}, source);
 
     assert(_distributor_back.has_value());
@@ -142,7 +146,8 @@ class MPIComms
         Kokkos::RangePolicy<ExecutionSpace>(space, 0, _num_requests),
         KOKKOS_CLASS_LAMBDA(int const i) {
           data_to_send(i) =
-              Details::access<Values>::get(source, _mpi_send_indices(i));
+              ArborX::AccessTraits<Values, ArborX::PrimitivesTag>::get(
+                  source, _mpi_send_indices(i));
         });
 
     return distribute(space, data_to_send);

From daf9822ffe222c09156b5ca3a6c6e662aabbe87c Mon Sep 17 00:00:00 2001
From: Yohann Bosqued <bosq.yohann@gmail.com>
Date: Thu, 17 Aug 2023 15:19:27 -0400
Subject: [PATCH 35/44] Better symmetric pseudo inverse (free function and
 better template)

---
 .../DetailsSymmetricPseudoInverseSVD.hpp      | 209 ++++++++++++++++
 .../moving_least_squares/mls_computation.hpp  |   6 +-
 .../symmetric_pseudoinverse_svd.hpp           | 234 ------------------
 3 files changed, 211 insertions(+), 238 deletions(-)
 create mode 100644 examples/moving_least_squares/DetailsSymmetricPseudoInverseSVD.hpp
 delete mode 100644 examples/moving_least_squares/symmetric_pseudoinverse_svd.hpp

diff --git a/examples/moving_least_squares/DetailsSymmetricPseudoInverseSVD.hpp b/examples/moving_least_squares/DetailsSymmetricPseudoInverseSVD.hpp
new file mode 100644
index 000000000..1ed19be09
--- /dev/null
+++ b/examples/moving_least_squares/DetailsSymmetricPseudoInverseSVD.hpp
@@ -0,0 +1,209 @@
+/****************************************************************************
+ * Copyright (c) 2023 by the ArborX authors                                 *
+ * All rights reserved.                                                     *
+ *                                                                          *
+ * This file is part of the ArborX library. ArborX is                       *
+ * distributed under a BSD 3-clause license. For the licensing terms see    *
+ * the LICENSE file in the top-level directory.                             *
+ *                                                                          *
+ * SPDX-License-Identifier: BSD-3-Clause                                    *
+ ****************************************************************************/
+
+#pragma once
+
+#include <ArborX_DetailsKokkosExtAccessibilityTraits.hpp>
+
+#include <Kokkos_Core.hpp>
+
+#include <cassert>
+#include <cmath>
+#include <limits>
+
+namespace Details
+{
+
+// This finds the biggest off-diagonal value of E.S as well as its
+// coordinates. Being symmetric, we can always check on the upper
+// triangle (and always have q > p)
+template <typename Matrices>
+KOKKOS_FUNCTION typename Matrices::non_const_value_type
+spisvdArgmaxOffDiagonal(Matrices const &es, int const i, int &p, int &q)
+{
+  using value_t = typename Matrices::non_const_value_type;
+
+  std::size_t const size = es.extent(1);
+  value_t max = 0;
+  p = q = 0;
+
+  for (int j = 0; j < size; j++)
+  {
+    for (int k = j + 1; k < size; k++)
+    {
+      value_t val = Kokkos::abs(es(i, j, k));
+      if (max < val)
+      {
+        max = val;
+        p = j;
+        q = k;
+      }
+    }
+  }
+
+  return max;
+}
+
+// Pseudo-inverse of symmetric matrices using SVD
+// We must find U, E (diagonal and positive) and V such that A = U.E.V^T
+// We also know that A is symmetric (by construction), so U = SV where S is
+// a sign matrix (only 1 or -1 in the diagonal, 0 elsewhere).
+// Thus A = U.E.S.U^T and A^-1 = U.[ E^-1.S ].U^T
+template <typename ExecutionSpace, typename Matrices>
+Kokkos::View<typename Matrices::non_const_value_type ***,
+             typename Matrices::memory_space>
+symmetricPseudoInverseSVD(ExecutionSpace const &space, Matrices const &mats)
+{
+  using value_t = typename Matrices::non_const_value_type;
+  using memory_space = typename Matrices::memory_space;
+
+  std::size_t const num_matrices = mats.extent(0);
+  std::size_t const size = mats.extent(1);
+  constexpr value_t epsilon = std::numeric_limits<value_t>::epsilon();
+  constexpr value_t pi_4 = value_t(M_PI_4);
+
+  // ==> Initialisation
+  // E.S is the input matrix
+  // U is the identity
+  Kokkos::View<value_t ***, memory_space> es(
+      Kokkos::view_alloc(Kokkos::WithoutInitializing, "Example::SPISVD::ES"),
+      mats.layout());
+  Kokkos::View<value_t ***, memory_space> u(
+      Kokkos::view_alloc(Kokkos::WithoutInitializing, "Example::SPISVD::U"),
+      mats.layout());
+  Kokkos::parallel_for(
+      "Example::SPISVD::ES_U_init",
+      Kokkos::MDRangePolicy<Kokkos::Rank<3>>(space, {0, 0, 0},
+                                             {num_matrices, size, size}),
+      KOKKOS_LAMBDA(int const i, int const j, int const k) {
+        es(i, j, k) = value_t(mats(i, j, k));
+        u(i, j, k) = value_t((j == k));
+      });
+
+  // ==> Loop
+  // Iterative approach, we will "deconstruct" E.S until only the diagonal
+  // is relevant inside the matrix
+  // It is possible to prove that, at each step, the "norm" of the matrix
+  // is strictly less than that of the previous
+  // For all the loops, the following equality holds: A = U.E.S.U^T
+  Kokkos::parallel_for(
+      "Example::SPISVD::compute_ES_U",
+      Kokkos::RangePolicy<ExecutionSpace>(space, 0, num_matrices),
+      KOKKOS_LAMBDA(int const i) {
+        int p, q;
+        value_t norm = spisvdArgmaxOffDiagonal(es, i, p, q);
+        while (norm > epsilon)
+        {
+          value_t a = es(i, p, p);
+          value_t b = es(i, p, q);
+          value_t c = es(i, q, q);
+
+          // Our submatrix is now
+          // +----------+----------+   +---+---+
+          // | es(p, p) | es(p, q) |   | a | b |
+          // +----------+----------+ = +---+---+
+          // | es(q, p) | es(q, q) |   | b | c |
+          // +----------+----------+   +---+---+
+
+          // Let's compute x, y and theta such that
+          // +---+---+              +---+---+
+          // | a | b |              | x | 0 |
+          // +---+---+ = R(theta) * +---+---+ * R(theta)^T
+          // | b | c |              | 0 | y |
+          // +---+---+              +---+---+
+
+          value_t theta, x, y;
+          if (a == c) // <-- better to check if |a - c| < epsilon?
+          {
+            theta = pi_4;
+            x = a + b;
+            y = a - b;
+          }
+          else
+          {
+            theta = Kokkos::atan((2 * b) / (a - c)) / 2;
+            value_t a_c_cos2 = (a - c) / Kokkos::cos(2 * theta);
+            x = (a + c + a_c_cos2) / 2;
+            y = (a + c - a_c_cos2) / 2;
+          }
+          value_t cos = Kokkos::cos(theta);
+          value_t sin = Kokkos::sin(theta);
+
+          // Now let's compute the following new values for U and E.S
+          // E.S <- R'(theta)^T . E.S . R'(theta)
+          // U  <- U . R'(theta)
+
+          // R'(theta)^T . E.S
+          for (int j = 0; j < size; j++)
+          {
+            value_t es_ipj = es(i, p, j);
+            value_t es_iqj = es(i, q, j);
+            es(i, p, j) = cos * es_ipj + sin * es_iqj;
+            es(i, q, j) = -sin * es_ipj + cos * es_iqj;
+          }
+
+          // [R'(theta)^T . E.S] . R'(theta)
+          for (int j = 0; j < size; j++)
+          {
+            value_t es_ijp = es(i, j, p);
+            value_t es_ijq = es(i, j, q);
+            es(i, j, p) = cos * es_ijp + sin * es_ijq;
+            es(i, j, q) = -sin * es_ijp + cos * es_ijq;
+          }
+
+          // U . R'(theta)
+          for (int j = 0; j < size; j++)
+          {
+            value_t u_ijp = u(i, j, p);
+            value_t u_ijq = u(i, j, q);
+            u(i, j, p) = cos * u_ijp + sin * u_ijq;
+            u(i, j, q) = -sin * u_ijp + cos * u_ijq;
+          }
+
+          // These should theoretically hold, but is it OK to force them to
+          // their real value?
+          es(i, p, p) = x;
+          es(i, q, q) = y;
+          es(i, p, q) = 0;
+          es(i, q, p) = 0;
+
+          norm = spisvdArgmaxOffDiagonal(es, i, p, q);
+        }
+      });
+
+  // ==> Output
+  // U and E.S are computed, we can now build the inverse
+  // U.[ E^-1.S ].U^T
+  Kokkos::View<value_t ***, memory_space> inv(
+      Kokkos::view_alloc(Kokkos::WithoutInitializing, "Example::SPISVD::inv"),
+      mats.layout());
+  Kokkos::parallel_for(
+      "Example::SPISVD::inv_fill",
+      Kokkos::MDRangePolicy<Kokkos::Rank<3>>(space, {0, 0, 0},
+                                             {num_matrices, size, size}),
+      KOKKOS_LAMBDA(int const i, int const j, int const k) {
+        value_t value = 0;
+        for (int l = 0; l < size; l++)
+        {
+          value_t v = es(i, l, l);
+          if (Kokkos::abs(v) > epsilon)
+          {
+            value += u(i, j, l) * u(i, k, l) / v;
+          }
+        }
+
+        inv(i, j, k) = value;
+      });
+
+  return inv;
+}
+
+} // namespace Details
\ No newline at end of file
diff --git a/examples/moving_least_squares/mls_computation.hpp b/examples/moving_least_squares/mls_computation.hpp
index 6734d2f1b..d79ba0be9 100644
--- a/examples/moving_least_squares/mls_computation.hpp
+++ b/examples/moving_least_squares/mls_computation.hpp
@@ -18,7 +18,7 @@
 
 #include <cassert>
 
-#include "symmetric_pseudoinverse_svd.hpp"
+#include "DetailsSymmetricPseudoInverseSVD.hpp"
 
 template <typename ValueType, typename PolynomialBasis, typename RBF,
           typename MemorySpace>
@@ -61,9 +61,7 @@ class MLSComputation
     auto p = computeVandermonde(space, source_ref_target);
 
     auto a = computeMoment(space, phi, p);
-    auto a_inv =
-        SymmPseudoInverseSVD<ValueType, MemorySpace>::computePseudoInverses(
-            space, a);
+    auto a_inv = Details::symmetricPseudoInverseSVD(space, a);
 
     computeCoefficients(space, phi, p, a_inv);
   }
diff --git a/examples/moving_least_squares/symmetric_pseudoinverse_svd.hpp b/examples/moving_least_squares/symmetric_pseudoinverse_svd.hpp
deleted file mode 100644
index e901f660c..000000000
--- a/examples/moving_least_squares/symmetric_pseudoinverse_svd.hpp
+++ /dev/null
@@ -1,234 +0,0 @@
-/****************************************************************************
- * Copyright (c) 2023 by the ArborX authors                                 *
- * All rights reserved.                                                     *
- *                                                                          *
- * This file is part of the ArborX library. ArborX is                       *
- * distributed under a BSD 3-clause license. For the licensing terms see    *
- * the LICENSE file in the top-level directory.                             *
- *                                                                          *
- * SPDX-License-Identifier: BSD-3-Clause                                    *
- ****************************************************************************/
-
-#pragma once
-
-#include <ArborX_DetailsKokkosExtAccessibilityTraits.hpp>
-
-#include <Kokkos_Core.hpp>
-
-#include <cassert>
-#include <cmath>
-#include <limits>
-
-// Pseudo-inverse moment matrix using SVD
-// We must find U, E (diagonal and positive) and V such that A = U.E.V^T
-// We also know that A is symmetric (by construction), so U = SV where S is
-// a sign matrix (only 1 or -1 in the diagonal, 0 elsewhere).
-// Thus A = U.E.S.U^T
-template <class ValueType, typename MemorySpace>
-class SymmPseudoInverseSVD
-{
-public:
-  template <typename ExecutionSpace>
-  static Kokkos::View<ValueType ***, MemorySpace>
-  computePseudoInverses(ExecutionSpace const &space,
-                        Kokkos::View<ValueType ***, MemorySpace> const &mats)
-  {
-    static_assert(
-        KokkosExt::is_accessible_from<MemorySpace, ExecutionSpace>::value);
-
-    SymmPseudoInverseSVD spis(space, mats);
-
-    // Iterative approach, we will "deconstruct" E.S until only the diagonal
-    // is relevent inside the matrix
-    // It is possible to prove that, at each step, the "norm" of the matrix
-    // is strictly less that of the previous
-    Kokkos::parallel_for(
-        "Example::SVD::compute_U_ES",
-        Kokkos::RangePolicy<ExecutionSpace>(space, 0, spis._num_matrices),
-        KOKKOS_LAMBDA(int const i) {
-          int p, q;
-          ValueType norm = spis.argmaxOffDiagonal(i, p, q);
-          while (norm > spis._epsilon)
-          {
-            spis.computeUESSingle(i, p, q);
-            norm = spis.argmaxOffDiagonal(i, p, q);
-          }
-        });
-
-    // From the SVD results, the pseudo inverse would be
-    // U . [ E^-1.S ] . U^T
-    Kokkos::parallel_for(
-        "Example::SVD::fill_inv",
-        Kokkos::MDRangePolicy<Kokkos::Rank<3>>(
-            space, {0, 0, 0}, {spis._num_matrices, spis._size, spis._size}),
-        KOKKOS_LAMBDA(int const i, int const j, int const k) {
-          spis.fillInv(i, j, k);
-        });
-
-    return spis._inv;
-  }
-
-private:
-  // U and E.S are computed, we can now build the inverse
-  // U . [ E^-1.S ] . U^T
-  KOKKOS_FUNCTION void fillInv(int const i, int const j, int const k) const
-  {
-    ValueType value = _zero;
-    for (int l = 0; l < _size; l++)
-    {
-      ValueType v = _es(i, l, l);
-      if (Kokkos::abs(v) > _epsilon)
-      {
-        value += _u(i, j, l) * _u(i, k, l) / v;
-      }
-    }
-
-    _inv(i, j, k) = value;
-  }
-
-  // We found the biggest value in our off-diagonal. We will remove it by
-  // computing a "local" svd and update U and E.S
-  KOKKOS_FUNCTION void computeUESSingle(int const i, int const p,
-                                        int const q) const
-  {
-    ValueType a = _es(i, p, p);
-    ValueType b = _es(i, p, q);
-    ValueType c = _es(i, q, q);
-
-    // Our submatrix is now
-    // +----------+----------+   +---+---+
-    // | es(p, p) | es(p, q) |   | a | b |
-    // +----------+----------+ = +---+---+
-    // | es(q, p) | es(q, q) |   | b | c |
-    // +----------+----------+   +---+---+
-
-    // Lets compute u, v and theta such that
-    // +---+---+              +---+---+
-    // | a | b |              | u | 0 |
-    // +---+---+ = R(theta) * +---+---+ * R(theta)^T
-    // | b | c |              | 0 | v |
-    // +---+---+              +---+---+
-
-    ValueType theta, u, v;
-    if (a == c) // <-- better to check if |a - c| < epsilon?
-    {
-      theta = _pi_4;
-      u = a + b;
-      v = a - b;
-    }
-    else
-    {
-      theta = _half * Kokkos::atan((_two * b) / (a - c));
-      ValueType a_c_cos2 = (a - c) / Kokkos::cos(_two * theta);
-      u = _half * (a + c + a_c_cos2);
-      v = _half * (a + c - a_c_cos2);
-    }
-    ValueType cos = Kokkos::cos(theta);
-    ValueType sin = Kokkos::sin(theta);
-
-    // Now lets compute the following new values for U amd E.S
-    // E.S <- R'(theta)^T . E.S . R'(theta)
-    // U  <- U . R'(theta)
-
-    // R'(theta)^T . E.S
-    for (int j = 0; j < _size; j++)
-    {
-      ValueType es_ipj = _es(i, p, j);
-      ValueType es_iqj = _es(i, q, j);
-      _es(i, p, j) = cos * es_ipj + sin * es_iqj;
-      _es(i, q, j) = -sin * es_ipj + cos * es_iqj;
-    }
-
-    // [R'(theta)^T . E.S] . R'(theta)
-    for (int j = 0; j < _size; j++)
-    {
-      ValueType es_ijp = _es(i, j, p);
-      ValueType es_ijq = _es(i, j, q);
-      _es(i, j, p) = cos * es_ijp + sin * es_ijq;
-      _es(i, j, q) = -sin * es_ijp + cos * es_ijq;
-    }
-
-    // U . R'(theta)
-    for (int j = 0; j < _size; j++)
-    {
-      ValueType u_ijp = _u(i, j, p);
-      ValueType u_ijq = _u(i, j, q);
-      _u(i, j, p) = cos * u_ijp + sin * u_ijq;
-      _u(i, j, q) = -sin * u_ijp + cos * u_ijq;
-    }
-
-    // These should theorically hold but is it ok to force them to their
-    // real value?
-    _es(i, p, p) = u;
-    _es(i, q, q) = v;
-    _es(i, p, q) = _zero;
-    _es(i, q, p) = _zero;
-  }
-
-  // This finds the biggest off-diagonal value of E.S as well as its
-  // coordinates. Being symmetric, we can always check on the upper
-  // triangle (and always have q > p)
-  KOKKOS_FUNCTION ValueType argmaxOffDiagonal(int const i, int &p, int &q) const
-  {
-    ValueType max = _zero;
-    p = q = 0;
-    for (int j = 0; j < _size; j++)
-    {
-      for (int k = j + 1; k < _size; k++)
-      {
-        ValueType val = Kokkos::abs(_es(i, j, k));
-        if (max < val)
-        {
-          max = val;
-          p = j;
-          q = k;
-        }
-      }
-    }
-
-    return max;
-  }
-
-  template <typename ExecutionSpace>
-  SymmPseudoInverseSVD(ExecutionSpace const &space,
-                       Kokkos::View<ValueType ***, MemorySpace> const &mats)
-      : _num_matrices(mats.extent(0))
-      , _size(mats.extent(1))
-  {
-    // mats must be an array of (symmetric) square matrices
-    assert(mats.extent(1) == mats.extent(2));
-
-    _es = Kokkos::View<ValueType ***, MemorySpace>(
-        Kokkos::view_alloc(Kokkos::WithoutInitializing, "Example::SVD::ES"),
-        mats.layout());
-    Kokkos::deep_copy(space, _es, mats);
-
-    _u = Kokkos::View<ValueType ***, MemorySpace>(
-        Kokkos::view_alloc(Kokkos::WithoutInitializing, "Example::SVD::U"),
-        mats.layout());
-    Kokkos::parallel_for(
-        "Example::SVD::U_init",
-        Kokkos::MDRangePolicy<Kokkos::Rank<3>>(space, {0, 0, 0},
-                                               {_num_matrices, _size, _size}),
-        KOKKOS_LAMBDA(int const i, int const j, int const k) {
-          _u(i, j, k) = ValueType((j == k));
-        });
-
-    _inv = Kokkos::View<ValueType ***, MemorySpace>(
-        Kokkos::view_alloc(Kokkos::WithoutInitializing, "Example::SVD::inv"),
-        mats.layout());
-  }
-
-  Kokkos::View<ValueType ***, MemorySpace> _es;
-  Kokkos::View<ValueType ***, MemorySpace> _u;
-  Kokkos::View<ValueType ***, MemorySpace> _inv;
-  std::size_t _num_matrices;
-  std::size_t _size;
-
-  static constexpr ValueType _pi_4 = ValueType(M_PI_4);
-  static constexpr ValueType _epsilon =
-      std::numeric_limits<ValueType>::epsilon();
-  static constexpr ValueType _half = ValueType(0.5);
-  static constexpr ValueType _two = ValueType(2);
-  static constexpr ValueType _zero = ValueType(0);
-};
\ No newline at end of file

From 0c6af8134fcef2c308dfb6c5bd01025cb20bdbed Mon Sep 17 00:00:00 2001
From: Yohann Bosqued <bosq.yohann@gmail.com>
Date: Fri, 18 Aug 2023 08:53:28 -0400
Subject: [PATCH 36/44] Better MLS computations, more permissive templates

---
 .../DetailsMovingLeastSquaresComputation.hpp  | 206 ++++++++++++++
 .../DetailsSymmetricPseudoInverseSVD.hpp      |   2 -
 examples/moving_least_squares/mls.hpp         |   8 +-
 .../moving_least_squares/mls_computation.hpp  | 268 ------------------
 4 files changed, 210 insertions(+), 274 deletions(-)
 create mode 100644 examples/moving_least_squares/DetailsMovingLeastSquaresComputation.hpp
 delete mode 100644 examples/moving_least_squares/mls_computation.hpp

diff --git a/examples/moving_least_squares/DetailsMovingLeastSquaresComputation.hpp b/examples/moving_least_squares/DetailsMovingLeastSquaresComputation.hpp
new file mode 100644
index 000000000..b789b4351
--- /dev/null
+++ b/examples/moving_least_squares/DetailsMovingLeastSquaresComputation.hpp
@@ -0,0 +1,206 @@
+/****************************************************************************
+ * Copyright (c) 2023 by the ArborX authors                                 *
+ * All rights reserved.                                                     *
+ *                                                                          *
+ * This file is part of the ArborX library. ArborX is                       *
+ * distributed under a BSD 3-clause license. For the licensing terms see    *
+ * the LICENSE file in the top-level directory.                             *
+ *                                                                          *
+ * SPDX-License-Identifier: BSD-3-Clause                                    *
+ ****************************************************************************/
+
+#pragma once
+
+#include <ArborX.hpp>
+
+#include <Kokkos_Core.hpp>
+
+#include <cassert>
+
+#include "DetailsSymmetricPseudoInverseSVD.hpp"
+
+namespace Details
+{
+
+// Computes, for each target, the moving least squares coefficients of its
+// source neighbors (group i of source_points belongs to target i).
+template <typename CoefficientType, typename MemorySpace>
+class MovingLeastSquaresComputation
+{
+public:
+  MovingLeastSquaresComputation() = default;
+
+  // PolynomialBasis (size, basis(point)) and RadialBasisFunction
+  // (apply(scalar)) are only used through their static members.
+  template <typename ExecutionSpace, typename PolynomialBasis,
+            typename RadialBasisFunction, typename SourcePoints,
+            typename TargetPoints>
+  MovingLeastSquaresComputation(ExecutionSpace const &space,
+                                SourcePoints const &source_points,
+                                TargetPoints const &target_points,
+                                PolynomialBasis const &,
+                                RadialBasisFunction const &)
+  {
+    using src_acc = ArborX::AccessTraits<SourcePoints, ArborX::PrimitivesTag>;
+    using tgt_acc = ArborX::AccessTraits<TargetPoints, ArborX::PrimitivesTag>;
+    using policy_2d = Kokkos::MDRangePolicy<ExecutionSpace, Kokkos::Rank<2>>;
+    using policy_3d = Kokkos::MDRangePolicy<ExecutionSpace, Kokkos::Rank<3>>;
+
+    // Without targets there are no groups to split the sources into
+    _num_targets = tgt_acc::size(target_points);
+    _num_neighbors =
+        (_num_targets == 0) ? 0 : src_acc::size(source_points) / _num_targets;
+    // Kernels only use local copies: naming a member would capture `this`
+    std::size_t const num_targets = _num_targets;
+    std::size_t const num_neighbors = _num_neighbors;
+    constexpr auto epsilon = std::numeric_limits<CoefficientType>::epsilon();
+    constexpr ArborX::Point origin = ArborX::Point{0, 0, 0};
+
+    // We center each group of points around the target as it enables us to
+    // optimize the final computation and transfer point types into ours
+    // TODO: Use multidimensional points!
+    Kokkos::View<ArborX::Point **, MemorySpace> source_ref_target(
+        Kokkos::view_alloc(Kokkos::WithoutInitializing,
+                           "Example::MLSC::source_ref_target"),
+        num_targets, num_neighbors);
+    Kokkos::parallel_for(
+        "Example::MLSC::source_ref_target_fill",
+        policy_2d(space, {0, 0}, {num_targets, num_neighbors}),
+        KOKKOS_LAMBDA(int const i, int const j) {
+          auto src = src_acc::get(source_points, i * num_neighbors + j);
+          auto tgt = tgt_acc::get(target_points, i);
+          source_ref_target(i, j) = ArborX::Point{
+              src[0] - tgt[0], src[1] - tgt[1], src[2] - tgt[2]};
+        });
+
+    // To properly use the RBF, we need to decide for a radius around each
+    // target point that encapsulates all of the points
+    Kokkos::View<CoefficientType *, MemorySpace> radii(
+        Kokkos::view_alloc(Kokkos::WithoutInitializing, "Example::MLSC::radii"),
+        num_targets);
+    Kokkos::parallel_for(
+        "Example::MLSC::radii_computation",
+        Kokkos::RangePolicy<ExecutionSpace>(space, 0, num_targets),
+        KOKKOS_LAMBDA(int const i) {
+          CoefficientType radius = 10 * epsilon;
+          for (std::size_t j = 0; j < num_neighbors; j++)
+          {
+            CoefficientType norm =
+                ArborX::Details::distance(source_ref_target(i, j), origin);
+            radius = (radius < norm) ? norm : radius;
+          }
+          // The one at the limit would be valued at 0 due to how RBF works
+          radii(i) = 1.1 * radius;
+        });
+
+    // Once the radius is computed, the weight follows by evaluating the RBF at
+    // each source point with their proper radii
+    Kokkos::View<CoefficientType **, MemorySpace> phi(
+        Kokkos::view_alloc(Kokkos::WithoutInitializing, "Example::MLSC::phi"),
+        num_targets, num_neighbors);
+    Kokkos::parallel_for(
+        "Example::MLSC::phi_computation",
+        policy_2d(space, {0, 0}, {num_targets, num_neighbors}),
+        KOKKOS_LAMBDA(int const i, int const j) {
+          CoefficientType norm =
+              ArborX::Details::distance(source_ref_target(i, j), origin);
+          phi(i, j) = RadialBasisFunction::apply(norm / radii(i));
+        });
+
+    // We then need to create the Vandermonde matrix for each source point
+    // (could it be produced automatically instead of from an external type?)
+    Kokkos::View<CoefficientType ***, MemorySpace> p(
+        Kokkos::view_alloc(Kokkos::WithoutInitializing,
+                           "Example::MLSC::vandermonde"),
+        num_targets, num_neighbors, PolynomialBasis::size);
+    Kokkos::parallel_for(
+        "Example::MLSC::vandermonde_computation",
+        policy_2d(space, {0, 0}, {num_targets, num_neighbors}),
+        KOKKOS_LAMBDA(int const i, int const j) {
+          auto basis = PolynomialBasis::basis(source_ref_target(i, j));
+          for (int k = 0; k < PolynomialBasis::size; k++)
+          {
+            p(i, j, k) = basis[k];
+          }
+        });
+
+    // From the weight and Vandermonde matrices, we can compute the moment
+    // matrix as A = P^T.PHI.P
+    Kokkos::View<CoefficientType ***, MemorySpace> a(
+        Kokkos::view_alloc(Kokkos::WithoutInitializing,
+                           "Example::MLSC::moment"),
+        num_targets, PolynomialBasis::size, PolynomialBasis::size);
+    Kokkos::parallel_for(
+        "Example::MLSC::moment_computation",
+        policy_3d(space, {0, 0, 0},
+                  {num_targets, PolynomialBasis::size, PolynomialBasis::size}),
+        KOKKOS_LAMBDA(int const i, int const j, int const k) {
+          CoefficientType tmp = 0;
+          for (std::size_t l = 0; l < num_neighbors; l++)
+          {
+            tmp += p(i, l, j) * p(i, l, k) * phi(i, l);
+          }
+          a(i, j, k) = tmp;
+        });
+
+    // We then take the pseudo-inverse of that moment matrix.
+    auto a_inv = symmetricPseudoInverseSVD(space, a);
+
+    // We finally build the coefficients as C = [1 0 0 ...].A^-1.P^T.PHI
+    Kokkos::View<CoefficientType **, MemorySpace> coeffs(
+        Kokkos::view_alloc(Kokkos::WithoutInitializing,
+                           "Example::MLSC::coefficients"),
+        num_targets, num_neighbors);
+    Kokkos::parallel_for(
+        "Example::MLSC::coefficients",
+        policy_2d(space, {0, 0}, {num_targets, num_neighbors}),
+        KOKKOS_LAMBDA(int const i, int const j) {
+          CoefficientType tmp = 0;
+          for (int k = 0; k < PolynomialBasis::size; k++)
+          {
+            tmp += a_inv(i, 0, k) * p(i, j, k) * phi(i, j);
+          }
+          coeffs(i, j) = tmp;
+        });
+    _coeffs = coeffs;
+  }
+
+  // Returns, for each target i, the sum over its neighbors j of
+  // coefficient(i, j) * source_values(i * num_neighbors + j).
+  template <typename ExecutionSpace, typename SourceValues>
+  Kokkos::View<typename SourceValues::non_const_value_type *,
+               typename SourceValues::memory_space>
+  apply(ExecutionSpace const &space, SourceValues const &source_values) const
+  {
+    using value_t = typename SourceValues::non_const_value_type;
+    using memory_space = typename SourceValues::memory_space;
+
+    // Local copies, so that the kernel does not capture `this`
+    auto const coeffs = _coeffs;
+    std::size_t const num_targets = _num_targets;
+    std::size_t const num_neighbors = _num_neighbors;
+
+    Kokkos::View<value_t *, memory_space> target_values(
+        "Example::MLSC::target_values", num_targets);
+    Kokkos::parallel_for(
+        "Example::MLSC::target_interpolation",
+        Kokkos::RangePolicy<ExecutionSpace>(space, 0, num_targets),
+        KOKKOS_LAMBDA(int const i) {
+          value_t tmp = 0;
+          for (std::size_t j = 0; j < num_neighbors; j++)
+          {
+            tmp += coeffs(i, j) * source_values(i * num_neighbors + j);
+          }
+          target_values(i) = tmp;
+        });
+
+    return target_values;
+  }
+
+private:
+  Kokkos::View<CoefficientType **, MemorySpace> _coeffs;
+  std::size_t _num_targets = 0;
+  std::size_t _num_neighbors = 0;
+};
+
+} // namespace Details
diff --git a/examples/moving_least_squares/DetailsSymmetricPseudoInverseSVD.hpp b/examples/moving_least_squares/DetailsSymmetricPseudoInverseSVD.hpp
index 1ed19be09..7d697a435 100644
--- a/examples/moving_least_squares/DetailsSymmetricPseudoInverseSVD.hpp
+++ b/examples/moving_least_squares/DetailsSymmetricPseudoInverseSVD.hpp
@@ -11,8 +11,6 @@
 
 #pragma once
 
-#include <ArborX_DetailsKokkosExtAccessibilityTraits.hpp>
-
 #include <Kokkos_Core.hpp>
 
 #include <cassert>
diff --git a/examples/moving_least_squares/mls.hpp b/examples/moving_least_squares/mls.hpp
index 0566515cc..d213056c2 100644
--- a/examples/moving_least_squares/mls.hpp
+++ b/examples/moving_least_squares/mls.hpp
@@ -18,7 +18,7 @@
 
 #include <cassert>
 
-#include "mls_computation.hpp"
+#include "DetailsMovingLeastSquaresComputation.hpp"
 #include "mpi_comms.hpp"
 
 template <typename Points>
@@ -113,8 +113,8 @@ class MLS
     auto local_source_points = _comms.distributeArborX(space, source_points);
 
     // Compute the internal MLS
-    _mlsc = MLSComputation<ValueType, PolynomialBasis, RBF, MemorySpace>(
-        space, local_source_points, target_points);
+    _mlsc = Details::MovingLeastSquaresComputation<ValueType, MemorySpace>(
+        space, local_source_points, target_points, PolynomialBasis{}, RBF{});
   }
 
   template <typename ExecutionSpace>
@@ -129,7 +129,7 @@ class MLS
   }
 
 private:
-  MLSComputation<ValueType, PolynomialBasis, RBF, MemorySpace> _mlsc;
+  Details::MovingLeastSquaresComputation<ValueType, MemorySpace> _mlsc;
   MPIComms<MemorySpace> _comms;
   std::size_t _num_neighbors;
   std::size_t _src_size;
diff --git a/examples/moving_least_squares/mls_computation.hpp b/examples/moving_least_squares/mls_computation.hpp
deleted file mode 100644
index d79ba0be9..000000000
--- a/examples/moving_least_squares/mls_computation.hpp
+++ /dev/null
@@ -1,268 +0,0 @@
-/****************************************************************************
- * Copyright (c) 2023 by the ArborX authors                                 *
- * All rights reserved.                                                     *
- *                                                                          *
- * This file is part of the ArborX library. ArborX is                       *
- * distributed under a BSD 3-clause license. For the licensing terms see    *
- * the LICENSE file in the top-level directory.                             *
- *                                                                          *
- * SPDX-License-Identifier: BSD-3-Clause                                    *
- ****************************************************************************/
-
-#pragma once
-
-#include <ArborX.hpp>
-#include <ArborX_DetailsKokkosExtAccessibilityTraits.hpp>
-
-#include <Kokkos_Core.hpp>
-
-#include <cassert>
-
-#include "DetailsSymmetricPseudoInverseSVD.hpp"
-
-template <typename ValueType, typename PolynomialBasis, typename RBF,
-          typename MemorySpace>
-class MLSComputation
-{
-public:
-  MLSComputation() = default;
-
-  template <typename ExecutionSpace, typename Points>
-  MLSComputation(
-      ExecutionSpace const &space,
-      Kokkos::View<
-          typename ArborX::Details::AccessTraitsHelper<
-              ArborX::AccessTraits<Points, ArborX::PrimitivesTag>>::type *,
-          MemorySpace> const &source_points,
-      Points const &target_points)
-      : _num_targets(ArborX::AccessTraits<Points, ArborX::PrimitivesTag>::size(
-            target_points))
-  {
-    static_assert(
-        KokkosExt::is_accessible_from<MemorySpace, ExecutionSpace>::value);
-    static_assert(
-        KokkosExt::is_accessible_from<
-            typename ArborX::AccessTraits<Points,
-                                          ArborX::PrimitivesTag>::memory_space,
-            ExecutionSpace>::value);
-    ArborX::Details::check_valid_access_traits(ArborX::PrimitivesTag{},
-                                               target_points);
-
-    // There must be a list of num_neighbors source points for each
-    // target point
-    _num_neighbors = source_points.extent(0) / _num_targets;
-    assert(source_points.extent(0) == _num_targets * _num_neighbors);
-
-    auto source_ref_target =
-        translateToTarget(space, source_points, target_points);
-
-    auto radii = computeRadii(space, source_ref_target);
-    auto phi = computeWeight(space, source_ref_target, radii);
-    auto p = computeVandermonde(space, source_ref_target);
-
-    auto a = computeMoment(space, phi, p);
-    auto a_inv = Details::symmetricPseudoInverseSVD(space, a);
-
-    computeCoefficients(space, phi, p, a_inv);
-  }
-
-  template <typename ExecutionSpace>
-  Kokkos::View<ValueType *>
-  apply(ExecutionSpace const &space,
-        Kokkos::View<ValueType *, MemorySpace> const &source_values)
-  {
-    static_assert(
-        KokkosExt::is_accessible_from<MemorySpace, ExecutionSpace>::value);
-    assert(source_values.extent(0) == _num_targets * _num_neighbors);
-
-    Kokkos::View<ValueType *, MemorySpace> target_values(
-        "Example::MLSC::target_values", _num_targets);
-    Kokkos::parallel_for(
-        "Example::MLSC::target_interpolation",
-        Kokkos::RangePolicy<ExecutionSpace>(space, 0, _num_targets),
-        KOKKOS_LAMBDA(int const i) {
-          ValueType tmp = _zero;
-          for (int j = 0; j < _num_neighbors; j++)
-          {
-            tmp += _coeffs(i, j) * source_values(i * _num_neighbors + j);
-          }
-          target_values(i) = tmp;
-        });
-
-    return target_values;
-  }
-
-private:
-  template <typename ExecutionSpace, typename Points>
-  Kokkos::View<ArborX::Point **, MemorySpace> translateToTarget(
-      ExecutionSpace const &space,
-      Kokkos::View<
-          typename ArborX::Details::AccessTraitsHelper<
-              ArborX::AccessTraits<Points, ArborX::PrimitivesTag>>::type *,
-          MemorySpace> const &source_points,
-      Points const &target_points)
-  {
-    using point_t = typename ArborX::Details::AccessTraitsHelper<
-        ArborX::AccessTraits<Points, ArborX::PrimitivesTag>>::type;
-
-    // We center each group around the target as it ables you to
-    // optimize the final computation
-    Kokkos::View<ArborX::Point **, MemorySpace> source_ref_target(
-        Kokkos::view_alloc(Kokkos::WithoutInitializing,
-                           "Example::MLSC::source_ref_target"),
-        _num_targets, _num_neighbors);
-    Kokkos::parallel_for(
-        "Example::MLSC::source_ref_target_fill",
-        Kokkos::MDRangePolicy<Kokkos::Rank<2>>(space, {0, 0},
-                                               {_num_targets, _num_neighbors}),
-        KOKKOS_LAMBDA(int const i, int const j) {
-          point_t src = source_points(i * _num_neighbors + j);
-          point_t tgt =
-              ArborX::AccessTraits<Points, ArborX::PrimitivesTag>::get(
-                  target_points, i);
-          source_ref_target(i, j) = ArborX::Point{
-              src[0] - tgt[0],
-              src[1] - tgt[1],
-              src[2] - tgt[2],
-          };
-        });
-
-    return source_ref_target;
-  }
-
-  template <typename ExecutionSpace>
-  Kokkos::View<ValueType *, MemorySpace> computeRadii(
-      ExecutionSpace const &space,
-      Kokkos::View<ArborX::Point **, MemorySpace> const &source_ref_target)
-  {
-    Kokkos::View<ValueType *, MemorySpace> radii(
-        Kokkos::view_alloc(Kokkos::WithoutInitializing, "Example::MLSC::radii"),
-        _num_targets);
-    Kokkos::parallel_for(
-        "Example::MLSC::radii_computation",
-        Kokkos::RangePolicy<ExecutionSpace>(space, 0, _num_targets),
-        KOKKOS_LAMBDA(int const i) {
-          ValueType radius = _ten * _epsilon;
-          for (int j = 0; j < _num_neighbors; j++)
-          {
-            ValueType norm =
-                ArborX::Details::distance(source_ref_target(i, j), _origin);
-            radius = (radius < norm) ? norm : radius;
-          }
-          radii(i) = _one_extra * radius;
-        });
-
-    return radii;
-  }
-
-  template <typename ExecutionSpace>
-  Kokkos::View<ValueType **, MemorySpace> computeWeight(
-      ExecutionSpace const &space,
-      Kokkos::View<ArborX::Point **, MemorySpace> const &source_ref_target,
-      Kokkos::View<ValueType *, MemorySpace> const &radii)
-  {
-    Kokkos::View<ValueType **, MemorySpace> phi(
-        Kokkos::view_alloc(Kokkos::WithoutInitializing, "Example::MLSC::phi"),
-        _num_targets, _num_neighbors);
-    Kokkos::parallel_for(
-        "Example::MLSC::phi_computation",
-        Kokkos::MDRangePolicy<Kokkos::Rank<2>>(space, {0, 0},
-                                               {_num_targets, _num_neighbors}),
-        KOKKOS_LAMBDA(int const i, int const j) {
-          ValueType norm =
-              ArborX::Details::distance(source_ref_target(i, j), _origin);
-          phi(i, j) = RBF::apply(norm / radii(i));
-        });
-
-    return phi;
-  }
-
-  template <typename ExecutionSpace>
-  Kokkos::View<ValueType ***, MemorySpace> computeVandermonde(
-      ExecutionSpace const &space,
-      Kokkos::View<ArborX::Point **, MemorySpace> const &source_ref_target)
-  {
-    // Instead of relying on an external type, could it be produced
-    // automatically?
-    Kokkos::View<ValueType ***, MemorySpace> p(
-        Kokkos::view_alloc(Kokkos::WithoutInitializing,
-                           "Example::MLSC::vandermonde"),
-        _num_targets, _num_neighbors, PolynomialBasis::size);
-    Kokkos::parallel_for(
-        "Example::MLSC::vandermonde_computation",
-        Kokkos::MDRangePolicy<Kokkos::Rank<2>>(space, {0, 0},
-                                               {_num_targets, _num_neighbors}),
-        KOKKOS_LAMBDA(int const i, int const j) {
-          auto basis = PolynomialBasis::basis(source_ref_target(i, j));
-          for (int k = 0; k < PolynomialBasis::size; k++)
-          {
-            p(i, j, k) = basis[k];
-          }
-        });
-
-    return p;
-  }
-
-  template <typename ExecutionSpace>
-  Kokkos::View<ValueType ***, MemorySpace>
-  computeMoment(ExecutionSpace const &space,
-                Kokkos::View<ValueType **, MemorySpace> const &phi,
-                Kokkos::View<ValueType ***, MemorySpace> const &p)
-  {
-    Kokkos::View<ValueType ***, MemorySpace> a(
-        Kokkos::view_alloc(Kokkos::WithoutInitializing,
-                           "Example::MLSC::moment"),
-        _num_targets, PolynomialBasis::size, PolynomialBasis::size);
-    Kokkos::parallel_for(
-        "Example::MLSC::moment_computation",
-        Kokkos::MDRangePolicy<Kokkos::Rank<3>>(
-            space, {0, 0, 0},
-            {_num_targets, PolynomialBasis::size, PolynomialBasis::size}),
-        KOKKOS_LAMBDA(int const i, int const j, int const k) {
-          ValueType tmp = _zero;
-          for (int l = 0; l < _num_neighbors; l++)
-          {
-            tmp += p(i, l, j) * p(i, l, k) * phi(i, l);
-          }
-          a(i, j, k) = tmp;
-        });
-
-    return a;
-  }
-
-  template <typename ExecutionSpace>
-  void
-  computeCoefficients(ExecutionSpace const &space,
-                      Kokkos::View<ValueType **, MemorySpace> const &phi,
-                      Kokkos::View<ValueType ***, MemorySpace> const &p,
-                      Kokkos::View<ValueType ***, MemorySpace> const &a_inv)
-  {
-    _coeffs = Kokkos::View<ValueType **, MemorySpace>(
-        Kokkos::view_alloc(Kokkos::WithoutInitializing,
-                           "Example::MLSC::coefficients"),
-        _num_targets, _num_neighbors);
-    Kokkos::parallel_for(
-        "Example::MLSC::coefficients",
-        Kokkos::MDRangePolicy<Kokkos::Rank<2>>(space, {0, 0},
-                                               {_num_targets, _num_neighbors}),
-        KOKKOS_LAMBDA(int const i, int const j) {
-          ValueType tmp = _zero;
-          for (int k = 0; k < PolynomialBasis::size; k++)
-          {
-            tmp += a_inv(i, 0, k) * p(i, j, k) * phi(i, j);
-          }
-          _coeffs(i, j) = tmp;
-        });
-  }
-
-  Kokkos::View<ValueType **, MemorySpace> _coeffs;
-  std::size_t _num_targets;
-  std::size_t _num_neighbors;
-
-  static constexpr ValueType _zero = ValueType(0);
-  static constexpr ValueType _ten = ValueType(10);
-  static constexpr ValueType _epsilon =
-      std::numeric_limits<ValueType>::epsilon();
-  static constexpr ValueType _one_extra = ValueType(1.1);
-  static constexpr ArborX::Point _origin = ArborX::Point{0, 0, 0};
-};
\ No newline at end of file

From a1bd2917467c22a0da89d4568f44a7b3abb62b01 Mon Sep 17 00:00:00 2001
From: Yohann Bosqued <bosq.yohann@gmail.com>
Date: Fri, 18 Aug 2023 09:44:32 -0400
Subject: [PATCH 37/44] Proper usage of AccessTraits for MPI comms

---
 ... DetailsDistributedTreePostQueryComms.hpp} | 151 ++++++++----------
 .../DetailsMovingLeastSquaresComputation.hpp  |   2 -
 .../DetailsSymmetricPseudoInverseSVD.hpp      |   1 -
 examples/moving_least_squares/mls.hpp         |  29 +---
 4 files changed, 69 insertions(+), 114 deletions(-)
 rename examples/moving_least_squares/{mpi_comms.hpp => DetailsDistributedTreePostQueryComms.hpp} (61%)

diff --git a/examples/moving_least_squares/mpi_comms.hpp b/examples/moving_least_squares/DetailsDistributedTreePostQueryComms.hpp
similarity index 61%
rename from examples/moving_least_squares/mpi_comms.hpp
rename to examples/moving_least_squares/DetailsDistributedTreePostQueryComms.hpp
index e39a22f99..6d56aceea 100644
--- a/examples/moving_least_squares/mpi_comms.hpp
+++ b/examples/moving_least_squares/DetailsDistributedTreePostQueryComms.hpp
@@ -12,31 +12,28 @@
 #pragma once
 
 #include <ArborX.hpp>
-#include <ArborX_DetailsKokkosExtAccessibilityTraits.hpp>
 
 #include <Kokkos_Core.hpp>
 
-#include <cassert>
 #include <memory>
 #include <optional>
 
 #include <mpi.h>
 
+namespace Details
+{
+
 template <typename MemorySpace>
-class MPIComms
+class DistributedTreePostQueryComms
 {
 public:
-  MPIComms() = default;
+  DistributedTreePostQueryComms() = default;
 
-  template <typename ExecutionSpace>
-  MPIComms(MPI_Comm comm, ExecutionSpace const &space,
-           Kokkos::View<int *, MemorySpace> indices,
-           Kokkos::View<int *, MemorySpace> ranks)
+  template <typename ExecutionSpace, typename IndicesAndRanks>
+  DistributedTreePostQueryComms(MPI_Comm comm, ExecutionSpace const &space,
+                                IndicesAndRanks const &indices_and_ranks)
   {
-    static_assert(
-        KokkosExt::is_accessible_from<MemorySpace, ExecutionSpace>::value);
-    assert(indices.extent(0) == ranks.extent(0));
-    std::size_t data_len = indices.extent(0);
+    std::size_t data_len = indices_and_ranks.extent(0);
 
     _comm.reset(
         [comm]() {
@@ -56,8 +53,25 @@ class MPIComms
     MPI_Comm_rank(*_comm, &rank);
 
     Kokkos::View<int *, MemorySpace> mpi_tmp(
-        Kokkos::view_alloc(Kokkos::WithoutInitializing, "Example::MPI::tmp"),
+        Kokkos::view_alloc(Kokkos::WithoutInitializing, "Example::DTPQC::tmp"),
+        data_len);
+
+    // Split indices/ranks
+    Kokkos::View<int *, MemorySpace> indices(
+        Kokkos::view_alloc(Kokkos::WithoutInitializing,
+                           "Example::DTPQC::indices"),
         data_len);
+    Kokkos::View<int *, MemorySpace> ranks(
+        Kokkos::view_alloc(Kokkos::WithoutInitializing,
+                           "Example::DTPQC::ranks"),
+        data_len);
+    Kokkos::parallel_for(
+        "Example::DTPQC::indices_and_ranks_split",
+        Kokkos::RangePolicy<ExecutionSpace>(space, 0, data_len),
+        KOKKOS_LAMBDA(int const i) {
+          indices(i) = indices_and_ranks(i).index;
+          ranks(i) = indices_and_ranks(i).rank;
+        });
 
     // Computes what will be common to every exchange. Every time
     // someone wants to get the value from the same set of elements,
@@ -75,7 +89,7 @@ class MPIComms
     // array that rebuilds the output
     Kokkos::View<int *, MemorySpace> mpi_rev_indices(
         Kokkos::view_alloc(Kokkos::WithoutInitializing,
-                           "Example::MPI::rev_indices"),
+                           "Example::DTPQC::rev_indices"),
         _num_requests);
     ArborX::iota(space, mpi_tmp);
     ArborX::Details::DistributedTreeImpl<MemorySpace>::sendAcrossNetwork(
@@ -85,7 +99,7 @@ class MPIComms
     // the process owning the source
     _mpi_send_indices = Kokkos::View<int *, MemorySpace>(
         Kokkos::view_alloc(Kokkos::WithoutInitializing,
-                           "Example::MPI::send_indices"),
+                           "Example::DTPQC::send_indices"),
         _num_requests);
     ArborX::Details::DistributedTreeImpl<MemorySpace>::sendAcrossNetwork(
         space, distributor_forth, indices, _mpi_send_indices);
@@ -94,117 +108,68 @@ class MPIComms
     // distributor to dispatch the values
     Kokkos::View<int *, MemorySpace> mpi_rev_ranks(
         Kokkos::view_alloc(Kokkos::WithoutInitializing,
-                           "Example::MPI::rev_ranks"),
+                           "Example::DTPQC::rev_ranks"),
         _num_requests);
     Kokkos::deep_copy(space, mpi_tmp, rank);
     ArborX::Details::DistributedTreeImpl<MemorySpace>::sendAcrossNetwork(
         space, distributor_forth, mpi_tmp, mpi_rev_ranks);
 
     // This will create the reverse of the previous distributor
-    _distributor_back = ArborX::Details::Distributor<MemorySpace>(*_comm);
-    _num_responses = _distributor_back->createFromSends(space, mpi_rev_ranks);
+    _distributor = ArborX::Details::Distributor<MemorySpace>(*_comm);
+    _num_responses = _distributor->createFromSends(space, mpi_rev_ranks);
 
     // There should be enough responses to perfectly fill what was requested
-    assert(_num_responses == data_len);
+    // i.e. _num_responses == data_len
 
     // The we send back the requested indices so that each process can rebuild
-    // the output
+    // their output
     _mpi_recv_indices = Kokkos::View<int *, MemorySpace>(
         Kokkos::view_alloc(Kokkos::WithoutInitializing,
-                           "Example::MPI::recv_indices"),
+                           "Example::DTPQC::recv_indices"),
         _num_responses);
     ArborX::Details::DistributedTreeImpl<MemorySpace>::sendAcrossNetwork(
-        space, *_distributor_back, mpi_rev_indices, _mpi_recv_indices);
+        space, *_distributor, mpi_rev_indices, _mpi_recv_indices);
   }
 
   template <typename ExecutionSpace, typename Values>
   Kokkos::View<typename ArborX::Details::AccessTraitsHelper<
                    ArborX::AccessTraits<Values, ArborX::PrimitivesTag>>::type *,
-               MemorySpace>
-  distributeArborX(ExecutionSpace const &space, Values const &source)
+               typename ArborX::AccessTraits<
+                   Values, ArborX::PrimitivesTag>::memory_space>
+  distribute(ExecutionSpace const &space, Values const &source)
   {
-    using value_t = typename ArborX::Details::AccessTraitsHelper<
-        ArborX::AccessTraits<Values, ArborX::PrimitivesTag>>::type;
-    static_assert(
-        KokkosExt::is_accessible_from<MemorySpace, ExecutionSpace>::value);
-    static_assert(
-        KokkosExt::is_accessible_from<
-            typename ArborX::AccessTraits<Values,
-                                          ArborX::PrimitivesTag>::memory_space,
-            ExecutionSpace>::value);
-    ArborX::Details::check_valid_access_traits(ArborX::PrimitivesTag{}, source);
-
-    assert(_distributor_back.has_value());
+    using src_acc = ArborX::AccessTraits<Values, ArborX::PrimitivesTag>;
+    using value_t = typename ArborX::Details::AccessTraitsHelper<src_acc>::type;
+    using memory_space = typename src_acc::memory_space;
 
     // We know what each process want so we prepare the data to be sent
     Kokkos::View<value_t *, MemorySpace> data_to_send(
         Kokkos::view_alloc(Kokkos::WithoutInitializing,
-                           "Example::MPI::data_to_send"),
-        _num_requests);
-    Kokkos::parallel_for(
-        "Example::MPI::data_to_send_fill",
-        Kokkos::RangePolicy<ExecutionSpace>(space, 0, _num_requests),
-        KOKKOS_CLASS_LAMBDA(int const i) {
-          data_to_send(i) =
-              ArborX::AccessTraits<Values, ArborX::PrimitivesTag>::get(
-                  source, _mpi_send_indices(i));
-        });
-
-    return distribute(space, data_to_send);
-  }
-
-  template <typename ExecutionSpace, typename ValueType>
-  Kokkos::View<ValueType *, MemorySpace>
-  distributeView(ExecutionSpace const &space,
-                 Kokkos::View<ValueType *, MemorySpace> const &source)
-  {
-    static_assert(
-        KokkosExt::is_accessible_from<MemorySpace, ExecutionSpace>::value);
-    assert(_distributor_back.has_value());
-
-    // We know what each process want so we prepare the data to be sent
-    Kokkos::View<ValueType *, MemorySpace> data_to_send(
-        Kokkos::view_alloc(Kokkos::WithoutInitializing,
-                           "Example::MPI::data_to_send"),
+                           "Example::DTPQC::data_to_send"),
         _num_requests);
     Kokkos::parallel_for(
-        "Example::MPI::data_to_send_fill",
+        "Example::DTPQC::data_to_send_fill",
         Kokkos::RangePolicy<ExecutionSpace>(space, 0, _num_requests),
         KOKKOS_CLASS_LAMBDA(int const i) {
-          data_to_send(i) = source(_mpi_send_indices(i));
+          data_to_send(i) = src_acc::get(source, _mpi_send_indices(i));
         });
 
-    return distribute(space, data_to_send);
-  }
-
-private:
-  std::shared_ptr<MPI_Comm> _comm;
-  Kokkos::View<int *, MemorySpace> _mpi_send_indices;
-  Kokkos::View<int *, MemorySpace> _mpi_recv_indices;
-  std::optional<ArborX::Details::Distributor<MemorySpace>> _distributor_back;
-  std::size_t _num_requests;
-  std::size_t _num_responses;
-
-  template <typename ExecutionSpace, typename ValueType>
-  Kokkos::View<ValueType *, MemorySpace>
-  distribute(ExecutionSpace const &space,
-             Kokkos::View<ValueType *, MemorySpace> const &data_to_send)
-  {
     // We properly send the data, and each process has what it wants, but in the
     // wrong order
-    Kokkos::View<ValueType *, MemorySpace> data_to_recv(
+    Kokkos::View<value_t *, MemorySpace> data_to_recv(
         Kokkos::view_alloc(Kokkos::WithoutInitializing,
-                           "Example::MPI::data_to_recv"),
+                           "Example::DTPQC::data_to_recv"),
         _num_responses);
     ArborX::Details::DistributedTreeImpl<MemorySpace>::sendAcrossNetwork(
-        space, *_distributor_back, data_to_send, data_to_recv);
+        space, *_distributor, data_to_send, data_to_recv);
 
     // So we fix this by moving everything
-    Kokkos::View<ValueType *, MemorySpace> output(
-        Kokkos::view_alloc(Kokkos::WithoutInitializing, "Example::MPI::output"),
+    Kokkos::View<value_t *, memory_space> output(
+        Kokkos::view_alloc(Kokkos::WithoutInitializing,
+                           "Example::DTPQC::output"),
         _num_responses);
     Kokkos::parallel_for(
-        "Example::MPI::output_fill",
+        "Example::DTPQC::output_fill",
         Kokkos::RangePolicy<ExecutionSpace>(space, 0, _num_responses),
         KOKKOS_CLASS_LAMBDA(int const i) {
           output(_mpi_recv_indices(i)) = data_to_recv(i);
@@ -212,4 +177,14 @@ class MPIComms
 
     return output;
   }
-};
\ No newline at end of file
+
+private:
+  std::shared_ptr<MPI_Comm> _comm;
+  Kokkos::View<int *, MemorySpace> _mpi_send_indices;
+  Kokkos::View<int *, MemorySpace> _mpi_recv_indices;
+  std::optional<ArborX::Details::Distributor<MemorySpace>> _distributor;
+  std::size_t _num_requests;
+  std::size_t _num_responses;
+};
+
+} // namespace Details
diff --git a/examples/moving_least_squares/DetailsMovingLeastSquaresComputation.hpp b/examples/moving_least_squares/DetailsMovingLeastSquaresComputation.hpp
index b789b4351..0d089297f 100644
--- a/examples/moving_least_squares/DetailsMovingLeastSquaresComputation.hpp
+++ b/examples/moving_least_squares/DetailsMovingLeastSquaresComputation.hpp
@@ -15,8 +15,6 @@
 
 #include <Kokkos_Core.hpp>
 
-#include <cassert>
-
 #include "DetailsSymmetricPseudoInverseSVD.hpp"
 
 namespace Details
diff --git a/examples/moving_least_squares/DetailsSymmetricPseudoInverseSVD.hpp b/examples/moving_least_squares/DetailsSymmetricPseudoInverseSVD.hpp
index 7d697a435..985ab5bf2 100644
--- a/examples/moving_least_squares/DetailsSymmetricPseudoInverseSVD.hpp
+++ b/examples/moving_least_squares/DetailsSymmetricPseudoInverseSVD.hpp
@@ -13,7 +13,6 @@
 
 #include <Kokkos_Core.hpp>
 
-#include <cassert>
 #include <cmath>
 #include <limits>
 
diff --git a/examples/moving_least_squares/mls.hpp b/examples/moving_least_squares/mls.hpp
index d213056c2..9f5560dd7 100644
--- a/examples/moving_least_squares/mls.hpp
+++ b/examples/moving_least_squares/mls.hpp
@@ -18,8 +18,8 @@
 
 #include <cassert>
 
+#include "DetailsDistributedTreePostQueryComms.hpp"
 #include "DetailsMovingLeastSquaresComputation.hpp"
-#include "mpi_comms.hpp"
 
 template <typename Points>
 struct TargetPoints
@@ -90,27 +90,10 @@ class MLS
                       TargetPoints<Points>{target_points, _num_neighbors},
                       index_ranks, offsets);
 
-    // Split indices/ranks
-    Kokkos::View<int *, MemorySpace> local_indices(
-        Kokkos::view_alloc(Kokkos::WithoutInitializing,
-                           "Example::MLS::local_indices"),
-        _tgt_size * _num_neighbors);
-    Kokkos::View<int *, MemorySpace> local_ranks(
-        Kokkos::view_alloc(Kokkos::WithoutInitializing,
-                           "Example::MLS::local_ranks"),
-        _tgt_size * _num_neighbors);
-    Kokkos::parallel_for(
-        "Example::MLS::index_ranks_split",
-        Kokkos::RangePolicy<ExecutionSpace>(space, 0,
-                                            _tgt_size * _num_neighbors),
-        KOKKOS_LAMBDA(int const i) {
-          local_indices(i) = index_ranks(i).index;
-          local_ranks(i) = index_ranks(i).rank;
-        });
-
     // Set up comms and local source points
-    _comms = MPIComms<MemorySpace>(comm, space, local_indices, local_ranks);
-    auto local_source_points = _comms.distributeArborX(space, source_points);
+    _comms = Details::DistributedTreePostQueryComms<MemorySpace>(comm, space,
+                                                                 index_ranks);
+    auto local_source_points = _comms.distribute(space, source_points);
 
     // Compute the internal MLS
     _mlsc = Details::MovingLeastSquaresComputation<ValueType, MemorySpace>(
@@ -125,12 +108,12 @@ class MLS
     static_assert(
         KokkosExt::is_accessible_from<MemorySpace, ExecutionSpace>::value);
     assert(source_values.extent(0) == _src_size);
-    return _mlsc.apply(space, _comms.distributeView(space, source_values));
+    return _mlsc.apply(space, _comms.distribute(space, source_values));
   }
 
 private:
   Details::MovingLeastSquaresComputation<ValueType, MemorySpace> _mlsc;
-  MPIComms<MemorySpace> _comms;
+  Details::DistributedTreePostQueryComms<MemorySpace> _comms;
   std::size_t _num_neighbors;
   std::size_t _src_size;
   std::size_t _tgt_size;

From a7650db0c379775c9e0cdf963ce779ddcce46f5a Mon Sep 17 00:00:00 2001
From: Yohann Bosqued <bosq.yohann@gmail.com>
Date: Fri, 18 Aug 2023 10:24:54 -0400
Subject: [PATCH 38/44] Proper MLS public interface

---
 .../DetailsMovingLeastSquaresComputation.hpp  |   2 +-
 .../MovingLeastSquares.hpp                    | 108 ++++++++++++++++
 examples/moving_least_squares/mls.hpp         | 120 ------------------
 .../moving_least_squares.cpp                  |   9 +-
 4 files changed, 115 insertions(+), 124 deletions(-)
 create mode 100644 examples/moving_least_squares/MovingLeastSquares.hpp
 delete mode 100644 examples/moving_least_squares/mls.hpp

diff --git a/examples/moving_least_squares/DetailsMovingLeastSquaresComputation.hpp b/examples/moving_least_squares/DetailsMovingLeastSquaresComputation.hpp
index 0d089297f..ebe5951f0 100644
--- a/examples/moving_least_squares/DetailsMovingLeastSquaresComputation.hpp
+++ b/examples/moving_least_squares/DetailsMovingLeastSquaresComputation.hpp
@@ -20,7 +20,7 @@
 namespace Details
 {
 
-template <typename CoefficientType, typename MemorySpace>
+template <typename MemorySpace, typename CoefficientType>
 class MovingLeastSquaresComputation
 {
 public:
diff --git a/examples/moving_least_squares/MovingLeastSquares.hpp b/examples/moving_least_squares/MovingLeastSquares.hpp
new file mode 100644
index 000000000..fe78e8bce
--- /dev/null
+++ b/examples/moving_least_squares/MovingLeastSquares.hpp
@@ -0,0 +1,108 @@
+/****************************************************************************
+ * Copyright (c) 2023 by the ArborX authors                                 *
+ * All rights reserved.                                                     *
+ *                                                                          *
+ * This file is part of the ArborX library. ArborX is                       *
+ * distributed under a BSD 3-clause license. For the licensing terms see    *
+ * the LICENSE file in the top-level directory.                             *
+ *                                                                          *
+ * SPDX-License-Identifier: BSD-3-Clause                                    *
+ ****************************************************************************/
+
+#pragma once
+
+#include <ArborX.hpp>
+
+#include <Kokkos_Core.hpp>
+
+#include "DetailsDistributedTreePostQueryComms.hpp"
+#include "DetailsMovingLeastSquaresComputation.hpp"
+
+namespace Details
+{
+
+// This is done to avoid clashing with another predicate access trait
+template <typename Points>
+struct TargetPointsPredicateWrapper
+{
+  Points target_points;
+  std::size_t num_neighbors;
+};
+
+} // namespace Details
+
+template <typename Points>
+struct ArborX::AccessTraits<Details::TargetPointsPredicateWrapper<Points>,
+                            ArborX::PredicatesTag>
+{
+  static KOKKOS_FUNCTION std::size_t
+  size(::Details::TargetPointsPredicateWrapper<Points> const &tp)
+  {
+    return ArborX::AccessTraits<Points, ArborX::PrimitivesTag>::size(
+        tp.target_points);
+  }
+
+  static KOKKOS_FUNCTION auto
+  get(::Details::TargetPointsPredicateWrapper<Points> const &tp, std::size_t i)
+  {
+    return ArborX::nearest(
+        ArborX::AccessTraits<Points, ArborX::PrimitivesTag>::get(
+            tp.target_points, i),
+        tp.num_neighbors);
+  }
+
+  using memory_space =
+      typename ArborX::AccessTraits<Points,
+                                    ArborX::PrimitivesTag>::memory_space;
+};
+
+// Public interface to compute the moving least squares approximation between a
+// source and target point cloud
+template <typename MemorySpace, typename FloatingCalculationType = float>
+class MovingLeastSquares
+{
+public:
+  template <typename ExecustionSpace, typename PolynomialBasis,
+            typename RadialBasisFunction, typename SourcePoints,
+            typename TargetPoints>
+  MovingLeastSquares(MPI_Comm comm, ExecustionSpace const &space,
+                     SourcePoints const &source_points,
+                     TargetPoints const &target_points,
+                     PolynomialBasis const &pb, RadialBasisFunction const &rbf,
+                     std::size_t num_neighbors = PolynomialBasis::size)
+  {
+    // Organize the source points as a tree and create the predicates
+    ArborX::DistributedTree<MemorySpace> source_tree(comm, space,
+                                                     source_points);
+    Details::TargetPointsPredicateWrapper<TargetPoints> predicates{
+        target_points, num_neighbors};
+
+    // Makes the NN query
+    Kokkos::View<ArborX::PairIndexRank *, MemorySpace> indices_and_ranks(
+        "Example::MLS::indices_and_ranks", 0);
+    Kokkos::View<int *, MemorySpace> offsets("Example::MLS::offsets", 0);
+    source_tree.query(space, predicates, indices_and_ranks, offsets);
+
+    // Set up comms and collect the points for a local MLS
+    _comms = Details::DistributedTreePostQueryComms<MemorySpace>(
+        comm, space, indices_and_ranks);
+    auto local_source_points = _comms.distribute(space, source_points);
+
+    // Finally, compute the local MLS for the local target points
+    _mlsc = Details::MovingLeastSquaresComputation<MemorySpace,
+                                                   FloatingCalculationType>(
+        space, local_source_points, target_points, pb, rbf);
+  }
+
+  template <typename ExecutionSpace, typename SourceValues>
+  auto apply(ExecutionSpace const &space, SourceValues const &source_values)
+  {
+    // Distribute and compute the result
+    return _mlsc.apply(space, _comms.distribute(space, source_values));
+  }
+
+private:
+  Details::MovingLeastSquaresComputation<MemorySpace, FloatingCalculationType>
+      _mlsc;
+  Details::DistributedTreePostQueryComms<MemorySpace> _comms;
+};
\ No newline at end of file
diff --git a/examples/moving_least_squares/mls.hpp b/examples/moving_least_squares/mls.hpp
deleted file mode 100644
index 9f5560dd7..000000000
--- a/examples/moving_least_squares/mls.hpp
+++ /dev/null
@@ -1,120 +0,0 @@
-/****************************************************************************
- * Copyright (c) 2023 by the ArborX authors                                 *
- * All rights reserved.                                                     *
- *                                                                          *
- * This file is part of the ArborX library. ArborX is                       *
- * distributed under a BSD 3-clause license. For the licensing terms see    *
- * the LICENSE file in the top-level directory.                             *
- *                                                                          *
- * SPDX-License-Identifier: BSD-3-Clause                                    *
- ****************************************************************************/
-
-#pragma once
-
-#include <ArborX.hpp>
-#include <ArborX_DetailsKokkosExtAccessibilityTraits.hpp>
-
-#include <Kokkos_Core.hpp>
-
-#include <cassert>
-
-#include "DetailsDistributedTreePostQueryComms.hpp"
-#include "DetailsMovingLeastSquaresComputation.hpp"
-
-template <typename Points>
-struct TargetPoints
-{
-  Points target_points;
-  std::size_t num_neighbors;
-};
-
-template <typename Points>
-struct ArborX::AccessTraits<TargetPoints<Points>, ArborX::PredicatesTag>
-{
-  static KOKKOS_FUNCTION std::size_t size(TargetPoints<Points> const &tp)
-  {
-    return ArborX::AccessTraits<Points, ArborX::PrimitivesTag>::size(
-        tp.target_points);
-  }
-
-  static KOKKOS_FUNCTION auto get(TargetPoints<Points> const &tp, std::size_t i)
-  {
-    return ArborX::nearest(
-        ArborX::AccessTraits<Points, ArborX::PrimitivesTag>::get(
-            tp.target_points, i),
-        tp.num_neighbors);
-  }
-
-  using memory_space =
-      typename ArborX::AccessTraits<Points,
-                                    ArborX::PrimitivesTag>::memory_space;
-};
-
-template <typename ValueType, typename PolynomialBasis, typename RBF,
-          typename MemorySpace>
-class MLS
-{
-public:
-  template <typename ExecutionSpace, typename Points>
-  MLS(MPI_Comm comm, ExecutionSpace const &space, Points const &source_points,
-      Points const &target_points,
-      std::size_t num_neighbors = PolynomialBasis::size)
-      : _num_neighbors(num_neighbors)
-      , _src_size(ArborX::AccessTraits<Points, ArborX::PrimitivesTag>::size(
-            source_points))
-      , _tgt_size(ArborX::AccessTraits<Points, ArborX::PrimitivesTag>::size(
-            target_points))
-  {
-    static_assert(
-        KokkosExt::is_accessible_from<MemorySpace, ExecutionSpace>::value);
-    static_assert(
-        KokkosExt::is_accessible_from<
-            typename ArborX::AccessTraits<Points,
-                                          ArborX::PrimitivesTag>::memory_space,
-            ExecutionSpace>::value);
-    ArborX::Details::check_valid_access_traits(ArborX::PrimitivesTag{},
-                                               source_points);
-
-    // A minimum nuber of source points are needed
-    assert(_src_size >= _num_neighbors);
-
-    // Organize source points as tree
-    ArborX::DistributedTree<MemorySpace> source_tree(comm, space,
-                                                     source_points);
-
-    // Perform the query
-    Kokkos::View<ArborX::PairIndexRank *, MemorySpace> index_ranks(
-        "Example::MLS::index_ranks", 0);
-    Kokkos::View<int *, MemorySpace> offsets("Example::MLS::offsets", 0);
-    source_tree.query(space,
-                      TargetPoints<Points>{target_points, _num_neighbors},
-                      index_ranks, offsets);
-
-    // Set up comms and local source points
-    _comms = Details::DistributedTreePostQueryComms<MemorySpace>(comm, space,
-                                                                 index_ranks);
-    auto local_source_points = _comms.distribute(space, source_points);
-
-    // Compute the internal MLS
-    _mlsc = Details::MovingLeastSquaresComputation<ValueType, MemorySpace>(
-        space, local_source_points, target_points, PolynomialBasis{}, RBF{});
-  }
-
-  template <typename ExecutionSpace>
-  Kokkos::View<ValueType *, MemorySpace>
-  apply(ExecutionSpace const &space,
-        Kokkos::View<ValueType *, MemorySpace> const &source_values)
-  {
-    static_assert(
-        KokkosExt::is_accessible_from<MemorySpace, ExecutionSpace>::value);
-    assert(source_values.extent(0) == _src_size);
-    return _mlsc.apply(space, _comms.distribute(space, source_values));
-  }
-
-private:
-  Details::MovingLeastSquaresComputation<ValueType, MemorySpace> _mlsc;
-  Details::DistributedTreePostQueryComms<MemorySpace> _comms;
-  std::size_t _num_neighbors;
-  std::size_t _src_size;
-  std::size_t _tgt_size;
-};
\ No newline at end of file
diff --git a/examples/moving_least_squares/moving_least_squares.cpp b/examples/moving_least_squares/moving_least_squares.cpp
index c1cc293c7..f7969731b 100644
--- a/examples/moving_least_squares/moving_least_squares.cpp
+++ b/examples/moving_least_squares/moving_least_squares.cpp
@@ -13,6 +13,8 @@
 // (https://github.com/ORNL-CEES/DataTransferKit)
 // with MLS resolution from
 // (http://dx.doi.org/10.1016/j.jcp.2015.11.055)
+// and
+// (A conservative mesh-free approach for fluid-structure interface problems)
 
 #include <ArborX.hpp>
 
@@ -20,7 +22,7 @@
 
 #include <sstream>
 
-#include "mls.hpp"
+#include "MovingLeastSquares.hpp"
 #include <mpi.h>
 
 using ExecutionSpace = Kokkos::DefaultExecutionSpace;
@@ -99,8 +101,9 @@ int main(int argc, char *argv[])
   Kokkos::deep_copy(space, target_points, target_points_host);
 
   // Create the transform from a point cloud to another
-  MLS<float, MVPolynomialBasis_3D, RBFWendland_0, MemorySpace> mls(
-      mpi_comm, space, source_points, target_points);
+  MovingLeastSquares<MemorySpace, float> mls(
+      mpi_comm, space, source_points, target_points, MVPolynomialBasis_3D{},
+      RBFWendland_0{});
 
   // Compute source values
   Kokkos::View<float *, MemorySpace> source_values("Example::source_values",

From 8fc8a75bf0033341c386b2113bd69622c99810af Mon Sep 17 00:00:00 2001
From: Yohann Bosqued <bosq.yohann@gmail.com>
Date: Fri, 18 Aug 2023 11:24:50 -0400
Subject: [PATCH 39/44] Extra RBFs

---
 .../DetailsRadialBasisFunctions.hpp           | 73 +++++++++++++++++++
 .../MovingLeastSquares.hpp                    |  4 +-
 .../moving_least_squares.cpp                  | 14 +---
 3 files changed, 79 insertions(+), 12 deletions(-)
 create mode 100644 examples/moving_least_squares/DetailsRadialBasisFunctions.hpp

diff --git a/examples/moving_least_squares/DetailsRadialBasisFunctions.hpp b/examples/moving_least_squares/DetailsRadialBasisFunctions.hpp
new file mode 100644
index 000000000..9d0d43551
--- /dev/null
+++ b/examples/moving_least_squares/DetailsRadialBasisFunctions.hpp
@@ -0,0 +1,73 @@
+/****************************************************************************
+ * Copyright (c) 2023 by the ArborX authors                                 *
+ * All rights reserved.                                                     *
+ *                                                                          *
+ * This file is part of the ArborX library. ArborX is                       *
+ * distributed under a BSD 3-clause license. For the licensing terms see    *
+ * the LICENSE file in the top-level directory.                             *
+ *                                                                          *
+ * SPDX-License-Identifier: BSD-3-Clause                                    *
+ ****************************************************************************/
+
+#pragma once
+
+#include <Kokkos_Core.hpp>
+
+#include <cmath>
+
+#define RBF_DECL(name)                                                         \
+  template <int K>                                                             \
+  struct name
+
+#define RBF_DEF(name, n, func)                                                 \
+  template <>                                                                  \
+  struct name<n>                                                               \
+  {                                                                            \
+    template <typename T>                                                      \
+    KOKKOS_INLINE_FUNCTION static T apply(T x)                                 \
+    {                                                                          \
+      return func;                                                             \
+    }                                                                          \
+  }
+
+namespace Details
+{
+
+RBF_DECL(Wendland);
+RBF_DEF(Wendland, 0, (1 - x) * (1 - x));
+RBF_DEF(Wendland, 2, (1 - x) * (1 - x) * (1 - x) * (1 - x) * (4 * x + 1));
+RBF_DEF(Wendland, 4,
+        (1 - x) * (1 - x) * (1 - x) * (1 - x) * (1 - x) * (1 - x) *
+            (35 * x * x + 18 * x + 3));
+RBF_DEF(Wendland, 6,
+        (1 - x) * (1 - x) * (1 - x) * (1 - x) * (1 - x) * (1 - x) * (1 - x) *
+            (1 - x) * (32 * x * x * x + 25 * x * x + 8 * x + 1));
+
+RBF_DECL(Wu);
+RBF_DEF(Wu, 2, // Wu C2 kernel: (1 - x)^4 (3x^3 + 12x^2 + 16x + 4)
+        (1 - x) * (1 - x) * (1 - x) * (1 - x) *
+            (3 * x * x * x + 12 * x * x + 16 * x + 4));
+RBF_DEF(Wu, 4,
+        (1 - x) * (1 - x) * (1 - x) * (1 - x) * (1 - x) * (1 - x) *
+            (5 * x * x * x * x * x + 30 * x * x * x * x + 72 * x * x * x +
+             82 * x * x + 36 * x + 6));
+
+RBF_DECL(Buhmann);
+RBF_DEF(Buhmann, 2,
+        2 * x * x * x * x * log(x) - T(7) / 2 * x * x * x * x +
+            T(16) / 3 * x * x * x - 2 * x * x + T(1) / 6);
+RBF_DEF(Buhmann, 3,
+        1 * x * x * x * x * x * x * x * x - T(84) / 5 * x * x * x * x * x * x +
+            T(1024) / 5 * x * x * x * x * sqrt(x) - 378 * x * x * x * x +
+            T(1024) / 5 * x * x * x * sqrt(x) - T(84) / 5 * x * x + 1);
+RBF_DEF(Buhmann, 4,
+        T(99) / 35 * x * x * x * x * x * x * x * x -
+            132 * x * x * x * x * x * x +
+            T(9216) / 35 * x * x * x * x * x * sqrt(x) -
+            T(11264) / 35 * x * x * x * x * sqrt(x) + 198 * x * x * x * x -
+            T(396) / 5 * x * x + 1);
+
+} // namespace Details
+
+#undef RBF_DECL
+#undef RBF_DEF
\ No newline at end of file
diff --git a/examples/moving_least_squares/MovingLeastSquares.hpp b/examples/moving_least_squares/MovingLeastSquares.hpp
index fe78e8bce..ce1382cd0 100644
--- a/examples/moving_least_squares/MovingLeastSquares.hpp
+++ b/examples/moving_least_squares/MovingLeastSquares.hpp
@@ -62,10 +62,10 @@ template <typename MemorySpace, typename FloatingCalculationType = float>
 class MovingLeastSquares
 {
 public:
-  template <typename ExecustionSpace, typename PolynomialBasis,
+  template <typename ExecutionSpace, typename PolynomialBasis,
             typename RadialBasisFunction, typename SourcePoints,
             typename TargetPoints>
-  MovingLeastSquares(MPI_Comm comm, ExecustionSpace const &space,
+  MovingLeastSquares(MPI_Comm comm, ExecutionSpace const &space,
                      SourcePoints const &source_points,
                      TargetPoints const &target_points,
                      PolynomialBasis const &pb, RadialBasisFunction const &rbf,
diff --git a/examples/moving_least_squares/moving_least_squares.cpp b/examples/moving_least_squares/moving_least_squares.cpp
index f7969731b..139e1230d 100644
--- a/examples/moving_least_squares/moving_least_squares.cpp
+++ b/examples/moving_least_squares/moving_least_squares.cpp
@@ -20,22 +20,16 @@
 
 #include <Kokkos_Core.hpp>
 
+#include <limits>
 #include <sstream>
 
+#include "DetailsRadialBasisFunctions.hpp"
 #include "MovingLeastSquares.hpp"
 #include <mpi.h>
 
 using ExecutionSpace = Kokkos::DefaultExecutionSpace;
 using MemorySpace = ExecutionSpace::memory_space;
 
-struct RBFWendland_0
-{
-  KOKKOS_INLINE_FUNCTION static float apply(float x)
-  {
-    return (1.f - x) * (1.f - x);
-  }
-};
-
 struct MVPolynomialBasis_3D
 {
   static constexpr std::size_t size = 10;
@@ -51,7 +45,7 @@ struct MVPolynomialBasis_3D
 // Function to approximate
 KOKKOS_INLINE_FUNCTION float manufactured_solution(ArborX::Point const &p)
 {
-  return Kokkos::sin(p[0]) * p[2] + p[1];
+  return p[2] + p[1];
 }
 
 int main(int argc, char *argv[])
@@ -103,7 +97,7 @@ int main(int argc, char *argv[])
   // Create the transform from a point cloud to another
   MovingLeastSquares<MemorySpace, float> mls(
       mpi_comm, space, source_points, target_points, MVPolynomialBasis_3D{},
-      RBFWendland_0{});
+      Details::Wendland<0>{});
 
   // Compute source values
   Kokkos::View<float *, MemorySpace> source_values("Example::source_values",

From f3556deb07f131fc458e65d1b3439df24bcfd6ae Mon Sep 17 00:00:00 2001
From: Yohann Bosqued <bosq.yohann@gmail.com>
Date: Mon, 21 Aug 2023 10:22:36 -0400
Subject: [PATCH 40/44] NVCC and CUDA compliance (compilation errors)

---
 .../DetailsDistributedTreePostQueryComms.hpp  |  45 ++--
 .../DetailsMovingLeastSquaresComputation.hpp  | 232 +++++++++++++-----
 .../DetailsSymmetricPseudoInverseSVD.hpp      |   8 +-
 .../moving_least_squares.cpp                  |   4 +-
 4 files changed, 202 insertions(+), 87 deletions(-)

diff --git a/examples/moving_least_squares/DetailsDistributedTreePostQueryComms.hpp b/examples/moving_least_squares/DetailsDistributedTreePostQueryComms.hpp
index 6d56aceea..6c4e7ec34 100644
--- a/examples/moving_least_squares/DetailsDistributedTreePostQueryComms.hpp
+++ b/examples/moving_least_squares/DetailsDistributedTreePostQueryComms.hpp
@@ -57,21 +57,10 @@ class DistributedTreePostQueryComms
         data_len);
 
     // Split indices/ranks
-    Kokkos::View<int *, MemorySpace> indices(
-        Kokkos::view_alloc(Kokkos::WithoutInitializing,
-                           "Example::DTPQC::indices"),
-        data_len);
-    Kokkos::View<int *, MemorySpace> ranks(
-        Kokkos::view_alloc(Kokkos::WithoutInitializing,
-                           "Example::DTPQC::ranks"),
-        data_len);
-    Kokkos::parallel_for(
-        "Example::DTPQC::indices_and_ranks_split",
-        Kokkos::RangePolicy<ExecutionSpace>(space, 0, data_len),
-        KOKKOS_LAMBDA(int const i) {
-          indices(i) = indices_and_ranks(i).index;
-          ranks(i) = indices_and_ranks(i).rank;
-        });
+    Kokkos::Array<Kokkos::View<int *, MemorySpace>, 2> split_indices_ranks =
+        indicesAndRanksSplit(space, indices_and_ranks, data_len);
+    Kokkos::View<int *, MemorySpace> indices = split_indices_ranks[0];
+    Kokkos::View<int *, MemorySpace> ranks = split_indices_ranks[1];
 
     // Computes what will be common to every exchange. Every time
     // someone wants to get the value from the same set of elements,
@@ -178,6 +167,32 @@ class DistributedTreePostQueryComms
     return output;
   }
 
+  template <typename ExecutionSpace, typename IndicesAndRanks>
+  static Kokkos::Array<Kokkos::View<int *, MemorySpace>, 2>
+  indicesAndRanksSplit(ExecutionSpace const &space,
+                       IndicesAndRanks const &indices_and_ranks,
+                       std::size_t data_len)
+  {
+    Kokkos::View<int *, MemorySpace> indices(
+        Kokkos::view_alloc(Kokkos::WithoutInitializing,
+                           "Example::DTPQC::indices"),
+        data_len);
+    Kokkos::View<int *, MemorySpace> ranks(
+        Kokkos::view_alloc(Kokkos::WithoutInitializing,
+                           "Example::DTPQC::ranks"),
+        data_len);
+
+    Kokkos::parallel_for(
+        "Example::DTPQC::indices_and_ranks_split",
+        Kokkos::RangePolicy<ExecutionSpace>(space, 0, data_len),
+        KOKKOS_LAMBDA(int const i) {
+          indices(i) = indices_and_ranks(i).index;
+          ranks(i) = indices_and_ranks(i).rank;
+        });
+
+    return {{indices, ranks}};
+  }
+
 private:
   std::shared_ptr<MPI_Comm> _comm;
   Kokkos::View<int *, MemorySpace> _mpi_send_indices;
diff --git a/examples/moving_least_squares/DetailsMovingLeastSquaresComputation.hpp b/examples/moving_least_squares/DetailsMovingLeastSquaresComputation.hpp
index ebe5951f0..6d37ff690 100644
--- a/examples/moving_least_squares/DetailsMovingLeastSquaresComputation.hpp
+++ b/examples/moving_least_squares/DetailsMovingLeastSquaresComputation.hpp
@@ -32,31 +32,105 @@ class MovingLeastSquaresComputation
   MovingLeastSquaresComputation(ExecutionSpace const &space,
                                 SourcePoints const &source_points,
                                 TargetPoints const &target_points,
-                                PolynomialBasis const &,
-                                RadialBasisFunction const &)
+                                PolynomialBasis const &pb,
+                                RadialBasisFunction const &rbf)
   {
     using src_acc = ArborX::AccessTraits<SourcePoints, ArborX::PrimitivesTag>;
     using tgt_acc = ArborX::AccessTraits<TargetPoints, ArborX::PrimitivesTag>;
 
     _num_targets = tgt_acc::size(target_points);
     _num_neighbors = src_acc::size(source_points) / _num_targets;
-    constexpr CoefficientType epsilon =
-        std::numeric_limits<CoefficientType>::epsilon();
-    constexpr ArborX::Point origin = ArborX::Point{0, 0, 0};
 
     // We center each group of points around the target as it ables us to
     // optimize the final computation and transfer point types into ours
     // TODO: Use multidimensional points!
+    Kokkos::View<ArborX::Point **, MemorySpace> source_ref_target =
+        sourceRefTargetFill(space, source_points, target_points, _num_targets,
+                            _num_neighbors);
+
+    // To properly use the RBF, we need to decide for a radius around each
+    // target point that encapsulates all of the points
+    Kokkos::View<CoefficientType *, MemorySpace> radii = radiiComputation(
+        space, source_ref_target, _num_targets, _num_neighbors);
+
+    // Once the radius is computed, the weight follows by evaluating the RBF at
+    // each source point with their proper radii
+    Kokkos::View<CoefficientType **, MemorySpace> phi = weightComputation(
+        space, source_ref_target, radii, _num_targets, _num_neighbors, rbf);
+
+    // We then need to create the Vandermonde matrix for each source point
+    // Instead of relying on an external type, could it be produced
+    // automatically?
+    Kokkos::View<CoefficientType ***, MemorySpace> p = vandermondeComputation(
+        space, source_ref_target, _num_targets, _num_neighbors, pb);
+
+    // From the weight and Vandermonde matrices, we can compute the moment
+    // matrix as A = P^T.PHI.P
+    Kokkos::View<CoefficientType ***, MemorySpace> a =
+        momentComputation(space, phi, p, _num_targets, _num_neighbors, pb);
+
+    // We then take the pseudo-inverse of that moment matrix.
+    Kokkos::View<CoefficientType ***, MemorySpace> a_inv =
+        symmetricPseudoInverseSVD(space, a);
+
+    // We finally build the coefficients as C = [1 0 0 ...].A^-1.P^T.PHI
+    _coeffs = coefficientsComputation(space, phi, p, a_inv, _num_targets,
+                                      _num_neighbors, pb);
+  }
+
+  template <typename ExecutionSpace, typename SourceValues>
+  Kokkos::View<typename SourceValues::non_const_value_type *,
+               typename SourceValues::memory_space>
+  apply(ExecutionSpace const &space, SourceValues const &source_values)
+  {
+    using value_t = typename SourceValues::non_const_value_type;
+    using memory_space = typename SourceValues::memory_space;
+
+    std::size_t num_neighbors = _num_neighbors;
+    Kokkos::View<CoefficientType **, MemorySpace> coeffs = _coeffs;
+
+    Kokkos::View<value_t *, memory_space> target_values(
+        "Example::MLSC::target_values", _num_targets);
+
+    Kokkos::parallel_for(
+        "Example::MLSC::target_interpolation",
+        Kokkos::RangePolicy<ExecutionSpace>(space, 0, _num_targets),
+        KOKKOS_LAMBDA(int const i) {
+          value_t tmp = 0;
+
+          for (int j = 0; j < num_neighbors; j++)
+          {
+            tmp += coeffs(i, j) * source_values(i * num_neighbors + j);
+          }
+
+          target_values(i) = tmp;
+        });
+
+    return target_values;
+  }
+
+  template <typename ExecutionSpace, typename SourcePoints,
+            typename TargetPoints>
+  static Kokkos::View<ArborX::Point **, MemorySpace>
+  sourceRefTargetFill(ExecutionSpace const &space,
+                      SourcePoints const &source_points,
+                      TargetPoints const &target_points,
+                      std::size_t num_targets, std::size_t num_neighbors)
+  {
+    using src_acc = ArborX::AccessTraits<SourcePoints, ArborX::PrimitivesTag>;
+    using tgt_acc = ArborX::AccessTraits<TargetPoints, ArborX::PrimitivesTag>;
+
     Kokkos::View<ArborX::Point **, MemorySpace> source_ref_target(
         Kokkos::view_alloc(Kokkos::WithoutInitializing,
                            "Example::MLSC::source_ref_target"),
-        _num_targets, _num_neighbors);
+        num_targets, num_neighbors);
+
     Kokkos::parallel_for(
         "Example::MLSC::source_ref_target_fill",
-        Kokkos::MDRangePolicy<Kokkos::Rank<2>>(space, {0, 0},
-                                               {_num_targets, _num_neighbors}),
+        Kokkos::MDRangePolicy<ExecutionSpace, Kokkos::Rank<2>>(
+            space, {0, 0}, {num_targets, num_neighbors}),
         KOKKOS_LAMBDA(int const i, int const j) {
-          auto src = src_acc::get(source_points, i * _num_neighbors + j);
+          auto src = src_acc::get(source_points, i * num_neighbors + j);
           auto tgt = tgt_acc::get(target_points, i);
           source_ref_target(i, j) = ArborX::Point{
               src[0] - tgt[0],
@@ -65,18 +139,30 @@ class MovingLeastSquaresComputation
           };
         });
 
-    // To properly use the RBF, we need to decide for a radius around each
-    // target point that encapsulates all of the points
+    return source_ref_target;
+  }
+
+  template <typename ExecutionSpace>
+  static Kokkos::View<CoefficientType *, MemorySpace> radiiComputation(
+      ExecutionSpace const &space,
+      Kokkos::View<ArborX::Point **, MemorySpace> const &source_ref_target,
+      std::size_t num_targets, std::size_t num_neighbors)
+  {
+    constexpr CoefficientType epsilon =
+        std::numeric_limits<CoefficientType>::epsilon();
+    constexpr ArborX::Point origin = ArborX::Point{0, 0, 0};
+
     Kokkos::View<CoefficientType *, MemorySpace> radii(
         Kokkos::view_alloc(Kokkos::WithoutInitializing, "Example::MLSC::radii"),
-        _num_targets);
+        num_targets);
+
     Kokkos::parallel_for(
         "Example::MLSC::radii_computation",
-        Kokkos::RangePolicy<ExecutionSpace>(space, 0, _num_targets),
+        Kokkos::RangePolicy<ExecutionSpace>(space, 0, num_targets),
         KOKKOS_LAMBDA(int const i) {
           CoefficientType radius = 10 * epsilon;
 
-          for (int j = 0; j < _num_neighbors; j++)
+          for (int j = 0; j < num_neighbors; j++)
           {
             CoefficientType norm =
                 ArborX::Details::distance(source_ref_target(i, j), origin);
@@ -87,32 +173,52 @@ class MovingLeastSquaresComputation
           radii(i) = 1.1 * radius;
         });
 
-    // Once the radius is computed, the wieght follows by evaluating the RBF at
-    // each source point with their proper radii
+    return radii;
+  }
+
+  template <typename ExecutionSpace, typename RadialBasisFunction>
+  static Kokkos::View<CoefficientType **, MemorySpace> weightComputation(
+      ExecutionSpace const &space,
+      Kokkos::View<ArborX::Point **, MemorySpace> const &source_ref_target,
+      Kokkos::View<CoefficientType *, MemorySpace> const &radii,
+      std::size_t num_targets, std::size_t num_neighbors,
+      RadialBasisFunction const &)
+  {
+    constexpr ArborX::Point origin = ArborX::Point{0, 0, 0};
+
     Kokkos::View<CoefficientType **, MemorySpace> phi(
         Kokkos::view_alloc(Kokkos::WithoutInitializing, "Example::MLSC::phi"),
-        _num_targets, _num_neighbors);
+        num_targets, num_neighbors);
+
     Kokkos::parallel_for(
         "Example::MLSC::phi_computation",
-        Kokkos::MDRangePolicy<Kokkos::Rank<2>>(space, {0, 0},
-                                               {_num_targets, _num_neighbors}),
+        Kokkos::MDRangePolicy<ExecutionSpace, Kokkos::Rank<2>>(
+            space, {0, 0}, {num_targets, num_neighbors}),
         KOKKOS_LAMBDA(int const i, int const j) {
           CoefficientType norm =
               ArborX::Details::distance(source_ref_target(i, j), origin);
           phi(i, j) = RadialBasisFunction::apply(norm / radii(i));
         });
 
-    // We then need to create the Vandermonde matrix for each source point
-    // Instead of relying on an external type, could it be produced
-    // automatically?
+    return phi;
+  }
+
+  template <typename ExecutionSpace, typename PolynomialBasis>
+  static Kokkos::View<CoefficientType ***, MemorySpace> vandermondeComputation(
+      ExecutionSpace const &space,
+      Kokkos::View<ArborX::Point **, MemorySpace> const &source_ref_target,
+      std::size_t num_targets, std::size_t num_neighbors,
+      PolynomialBasis const &)
+  {
     Kokkos::View<CoefficientType ***, MemorySpace> p(
         Kokkos::view_alloc(Kokkos::WithoutInitializing,
                            "Example::MLSC::vandermonde"),
-        _num_targets, _num_neighbors, PolynomialBasis::size);
+        num_targets, num_neighbors, PolynomialBasis::size);
+
     Kokkos::parallel_for(
         "Example::MLSC::vandermonde_computation",
-        Kokkos::MDRangePolicy<Kokkos::Rank<2>>(space, {0, 0},
-                                               {_num_targets, _num_neighbors}),
+        Kokkos::MDRangePolicy<ExecutionSpace, Kokkos::Rank<2>>(
+            space, {0, 0}, {num_targets, num_neighbors}),
         KOKKOS_LAMBDA(int const i, int const j) {
           auto basis = PolynomialBasis::basis(source_ref_target(i, j));
 
@@ -122,21 +228,31 @@ class MovingLeastSquaresComputation
           }
         });
 
-    // From the weight and Vandermonde matrices, we can compute the moment
-    // matrix as A = P^T.PHI.P
+    return p;
+  }
+
+  template <typename ExecutionSpace, typename PolynomialBasis>
+  static Kokkos::View<CoefficientType ***, MemorySpace>
+  momentComputation(ExecutionSpace const &space,
+                    Kokkos::View<CoefficientType **, MemorySpace> const &phi,
+                    Kokkos::View<CoefficientType ***, MemorySpace> const &p,
+                    std::size_t num_targets, std::size_t num_neighbors,
+                    PolynomialBasis const &)
+  {
     Kokkos::View<CoefficientType ***, MemorySpace> a(
         Kokkos::view_alloc(Kokkos::WithoutInitializing,
                            "Example::MLSC::moment"),
-        _num_targets, PolynomialBasis::size, PolynomialBasis::size);
+        num_targets, PolynomialBasis::size, PolynomialBasis::size);
+
     Kokkos::parallel_for(
         "Example::MLSC::moment_computation",
-        Kokkos::MDRangePolicy<Kokkos::Rank<3>>(
+        Kokkos::MDRangePolicy<ExecutionSpace, Kokkos::Rank<3>>(
             space, {0, 0, 0},
-            {_num_targets, PolynomialBasis::size, PolynomialBasis::size}),
+            {num_targets, PolynomialBasis::size, PolynomialBasis::size}),
         KOKKOS_LAMBDA(int const i, int const j, int const k) {
           CoefficientType tmp = 0;
 
-          for (int l = 0; l < _num_neighbors; l++)
+          for (int l = 0; l < num_neighbors; l++)
           {
             tmp += p(i, l, j) * p(i, l, k) * phi(i, l);
           }
@@ -144,18 +260,27 @@ class MovingLeastSquaresComputation
           a(i, j, k) = tmp;
         });
 
-    // We then take the pseudo-inverse of that moment matrix.
-    auto a_inv = symmetricPseudoInverseSVD(space, a);
+    return a;
+  }
 
-    // We finally build the coefficients as C = [1 0 0 ...].A^-1.P^T.PHI
-    _coeffs = Kokkos::View<CoefficientType **, MemorySpace>(
+  template <typename ExecutionSpace, typename PolynomialBasis>
+  static Kokkos::View<CoefficientType **, MemorySpace> coefficientsComputation(
+      ExecutionSpace const &space,
+      Kokkos::View<CoefficientType **, MemorySpace> const &phi,
+      Kokkos::View<CoefficientType ***, MemorySpace> const &p,
+      Kokkos::View<CoefficientType ***, MemorySpace> const &a_inv,
+      std::size_t num_targets, std::size_t num_neighbors,
+      PolynomialBasis const &)
+  {
+    Kokkos::View<CoefficientType **, MemorySpace> coeffs(
         Kokkos::view_alloc(Kokkos::WithoutInitializing,
                            "Example::MLSC::coefficients"),
-        _num_targets, _num_neighbors);
+        num_targets, num_neighbors);
+
     Kokkos::parallel_for(
-        "Example::MLSC::coefficients",
-        Kokkos::MDRangePolicy<Kokkos::Rank<2>>(space, {0, 0},
-                                               {_num_targets, _num_neighbors}),
+        "Example::MLSC::coefficients_computation",
+        Kokkos::MDRangePolicy<ExecutionSpace, Kokkos::Rank<2>>(
+            space, {0, 0}, {num_targets, num_neighbors}),
         KOKKOS_LAMBDA(int const i, int const j) {
           CoefficientType tmp = 0;
 
@@ -164,35 +289,10 @@ class MovingLeastSquaresComputation
             tmp += a_inv(i, 0, k) * p(i, j, k) * phi(i, j);
           }
 
-          _coeffs(i, j) = tmp;
+          coeffs(i, j) = tmp;
         });
-  }
 
-  template <typename ExecutionSpace, typename SourceValues>
-  Kokkos::View<typename SourceValues::non_const_value_type *,
-               typename SourceValues::memory_space>
-  apply(ExecutionSpace const &space, SourceValues const &source_values)
-  {
-    using value_t = typename SourceValues::non_const_value_type;
-    using memory_space = typename SourceValues::memory_space;
-
-    Kokkos::View<value_t *, memory_space> target_values(
-        "Example::MLSC::target_values", _num_targets);
-    Kokkos::parallel_for(
-        "Example::MLSC::target_interpolation",
-        Kokkos::RangePolicy<ExecutionSpace>(space, 0, _num_targets),
-        KOKKOS_LAMBDA(int const i) {
-          value_t tmp = 0;
-
-          for (int j = 0; j < _num_neighbors; j++)
-          {
-            tmp += _coeffs(i, j) * source_values(i * _num_neighbors + j);
-          }
-
-          target_values(i) = tmp;
-        });
-
-    return target_values;
+    return coeffs;
   }
 
 private:
diff --git a/examples/moving_least_squares/DetailsSymmetricPseudoInverseSVD.hpp b/examples/moving_least_squares/DetailsSymmetricPseudoInverseSVD.hpp
index 985ab5bf2..6f55db2f5 100644
--- a/examples/moving_least_squares/DetailsSymmetricPseudoInverseSVD.hpp
+++ b/examples/moving_least_squares/DetailsSymmetricPseudoInverseSVD.hpp
@@ -78,8 +78,8 @@ symmetricPseudoInverseSVD(ExecutionSpace const &space, Matrices const &mats)
       mats.layout());
   Kokkos::parallel_for(
       "Example::SPISVD::ES_U_init",
-      Kokkos::MDRangePolicy<Kokkos::Rank<3>>(space, {0, 0, 0},
-                                             {num_matrices, size, size}),
+      Kokkos::MDRangePolicy<ExecutionSpace, Kokkos::Rank<3>>(
+          space, {0, 0, 0}, {num_matrices, size, size}),
       KOKKOS_LAMBDA(int const i, int const j, int const k) {
         es(i, j, k) = value_t(mats(i, j, k));
         u(i, j, k) = value_t((j == k));
@@ -184,8 +184,8 @@ symmetricPseudoInverseSVD(ExecutionSpace const &space, Matrices const &mats)
       mats.layout());
   Kokkos::parallel_for(
       "Example::SPISVD::inv_fill",
-      Kokkos::MDRangePolicy<Kokkos::Rank<3>>(space, {0, 0, 0},
-                                             {num_matrices, size, size}),
+      Kokkos::MDRangePolicy<ExecutionSpace, Kokkos::Rank<3>>(
+          space, {0, 0, 0}, {num_matrices, size, size}),
       KOKKOS_LAMBDA(int const i, int const j, int const k) {
         value_t value = 0;
         for (int l = 0; l < size; l++)
diff --git a/examples/moving_least_squares/moving_least_squares.cpp b/examples/moving_least_squares/moving_least_squares.cpp
index 139e1230d..287103f0e 100644
--- a/examples/moving_least_squares/moving_least_squares.cpp
+++ b/examples/moving_least_squares/moving_least_squares.cpp
@@ -77,8 +77,8 @@ int main(int argc, char *argv[])
   std::size_t thickness = cube_side / mpi_size;
   Kokkos::parallel_for(
       "Example::source_points_init",
-      Kokkos::MDRangePolicy<Kokkos::Rank<3>>(space, {0, 0, 0},
-                                             {cube_side, cube_side, thickness}),
+      Kokkos::MDRangePolicy<ExecutionSpace, Kokkos::Rank<3>>(
+          space, {0, 0, 0}, {cube_side, cube_side, thickness}),
       KOKKOS_LAMBDA(int const i, int const j, int const k) {
         source_points(i * cube_side * thickness + j * thickness +
                       k) = ArborX::Point{

From db133c898f7f244dca81d0f459d10afe3153eba6 Mon Sep 17 00:00:00 2001
From: Yohann Bosqued <bosq.yohann@gmail.com>
Date: Tue, 22 Aug 2023 14:59:05 -0400
Subject: [PATCH 41/44] Automatic polynomial basis generation and better rbf
 interface

---
 .../DetailsMovingLeastSquaresComputation.hpp  |  44 ++++----
 .../DetailsPolynomialBasis.hpp                | 100 ++++++++++++++++++
 .../DetailsRadialBasisFunctions.hpp           |  32 +++---
 .../MovingLeastSquares.hpp                    |  18 ++--
 .../moving_least_squares.cpp                  |  18 +---
 5 files changed, 157 insertions(+), 55 deletions(-)
 create mode 100644 examples/moving_least_squares/DetailsPolynomialBasis.hpp

diff --git a/examples/moving_least_squares/DetailsMovingLeastSquaresComputation.hpp b/examples/moving_least_squares/DetailsMovingLeastSquaresComputation.hpp
index 6d37ff690..b16ed4cde 100644
--- a/examples/moving_least_squares/DetailsMovingLeastSquaresComputation.hpp
+++ b/examples/moving_least_squares/DetailsMovingLeastSquaresComputation.hpp
@@ -15,6 +15,7 @@
 
 #include <Kokkos_Core.hpp>
 
+#include "DetailsPolynomialBasis.hpp"
 #include "DetailsSymmetricPseudoInverseSVD.hpp"
 
 namespace Details
@@ -26,13 +27,13 @@ class MovingLeastSquaresComputation
 public:
   MovingLeastSquaresComputation() = default;
 
-  template <typename ExecutionSpace, typename PolynomialBasis,
+  template <typename ExecutionSpace, typename PolynomialDegree,
             typename RadialBasisFunction, typename SourcePoints,
             typename TargetPoints>
   MovingLeastSquaresComputation(ExecutionSpace const &space,
                                 SourcePoints const &source_points,
                                 TargetPoints const &target_points,
-                                PolynomialBasis const &pb,
+                                PolynomialDegree const &pd,
                                 RadialBasisFunction const &rbf)
   {
     using src_acc = ArborX::AccessTraits<SourcePoints, ArborX::PrimitivesTag>;
@@ -41,6 +42,9 @@ class MovingLeastSquaresComputation
     _num_targets = tgt_acc::size(target_points);
     _num_neighbors = src_acc::size(source_points) / _num_targets;
 
+    static constexpr std::size_t polynomialBasisSize =
+        polynomialBasisSizeFromAT<SourcePoints, PolynomialDegree::value>;
+
     // We center each group of points around the target as it ables us to
     // optimize the final computation and transfer point types into ours
     // TODO: Use multidimensional points!
@@ -62,12 +66,12 @@ class MovingLeastSquaresComputation
     // Instead of relying on an external type, could it be produced
     // automatically?
     Kokkos::View<CoefficientType ***, MemorySpace> p = vandermondeComputation(
-        space, source_ref_target, _num_targets, _num_neighbors, pb);
+        space, source_ref_target, _num_targets, _num_neighbors, pd);
 
     // From the weight and Vandermonde matrices, we can compute the moment
     // matrix as A = P^T.PHI.P
-    Kokkos::View<CoefficientType ***, MemorySpace> a =
-        momentComputation(space, phi, p, _num_targets, _num_neighbors, pb);
+    Kokkos::View<CoefficientType ***, MemorySpace> a = momentComputation(
+        space, phi, p, _num_targets, _num_neighbors, polynomialBasisSize);
 
     // We then take the pseudo-inverse of that moment matrix.
     Kokkos::View<CoefficientType ***, MemorySpace> a_inv =
@@ -75,7 +79,7 @@ class MovingLeastSquaresComputation
 
     // We finally build the coefficients as C = [1 0 0 ...].A^-1.P^T.PHI
     _coeffs = coefficientsComputation(space, phi, p, a_inv, _num_targets,
-                                      _num_neighbors, pb);
+                                      _num_neighbors, polynomialBasisSize);
   }
 
   template <typename ExecutionSpace, typename SourceValues>
@@ -203,26 +207,30 @@ class MovingLeastSquaresComputation
     return phi;
   }
 
-  template <typename ExecutionSpace, typename PolynomialBasis>
+  template <typename ExecutionSpace, typename PolynomialDegree>
   static Kokkos::View<CoefficientType ***, MemorySpace> vandermondeComputation(
       ExecutionSpace const &space,
       Kokkos::View<ArborX::Point **, MemorySpace> const &source_ref_target,
       std::size_t num_targets, std::size_t num_neighbors,
-      PolynomialBasis const &)
+      PolynomialDegree const &)
   {
+    static constexpr std::size_t polynomialBasisSize =
+        polynomialBasisSizeFromT<ArborX::Point, PolynomialDegree::value>;
+
     Kokkos::View<CoefficientType ***, MemorySpace> p(
         Kokkos::view_alloc(Kokkos::WithoutInitializing,
                            "Example::MLSC::vandermonde"),
-        num_targets, num_neighbors, PolynomialBasis::size);
+        num_targets, num_neighbors, polynomialBasisSize);
 
     Kokkos::parallel_for(
         "Example::MLSC::vandermonde_computation",
         Kokkos::MDRangePolicy<ExecutionSpace, Kokkos::Rank<2>>(
             space, {0, 0}, {num_targets, num_neighbors}),
         KOKKOS_LAMBDA(int const i, int const j) {
-          auto basis = PolynomialBasis::basis(source_ref_target(i, j));
+          auto basis = polynomialBasis<ArborX::Point, PolynomialDegree::value>(
+              source_ref_target(i, j));
 
-          for (int k = 0; k < PolynomialBasis::size; k++)
+          for (int k = 0; k < polynomialBasisSize; k++)
           {
             p(i, j, k) = basis[k];
           }
@@ -231,24 +239,24 @@ class MovingLeastSquaresComputation
     return p;
   }
 
-  template <typename ExecutionSpace, typename PolynomialBasis>
+  template <typename ExecutionSpace>
   static Kokkos::View<CoefficientType ***, MemorySpace>
   momentComputation(ExecutionSpace const &space,
                     Kokkos::View<CoefficientType **, MemorySpace> const &phi,
                     Kokkos::View<CoefficientType ***, MemorySpace> const &p,
                     std::size_t num_targets, std::size_t num_neighbors,
-                    PolynomialBasis const &)
+                    std::size_t polynomialBasisSize)
   {
     Kokkos::View<CoefficientType ***, MemorySpace> a(
         Kokkos::view_alloc(Kokkos::WithoutInitializing,
                            "Example::MLSC::moment"),
-        num_targets, PolynomialBasis::size, PolynomialBasis::size);
+        num_targets, polynomialBasisSize, polynomialBasisSize);
 
     Kokkos::parallel_for(
         "Example::MLSC::moment_computation",
         Kokkos::MDRangePolicy<ExecutionSpace, Kokkos::Rank<3>>(
             space, {0, 0, 0},
-            {num_targets, PolynomialBasis::size, PolynomialBasis::size}),
+            {num_targets, polynomialBasisSize, polynomialBasisSize}),
         KOKKOS_LAMBDA(int const i, int const j, int const k) {
           CoefficientType tmp = 0;
 
@@ -263,14 +271,14 @@ class MovingLeastSquaresComputation
     return a;
   }
 
-  template <typename ExecutionSpace, typename PolynomialBasis>
+  template <typename ExecutionSpace>
   static Kokkos::View<CoefficientType **, MemorySpace> coefficientsComputation(
       ExecutionSpace const &space,
       Kokkos::View<CoefficientType **, MemorySpace> const &phi,
       Kokkos::View<CoefficientType ***, MemorySpace> const &p,
       Kokkos::View<CoefficientType ***, MemorySpace> const &a_inv,
       std::size_t num_targets, std::size_t num_neighbors,
-      PolynomialBasis const &)
+      std::size_t polynomialBasisSize)
   {
     Kokkos::View<CoefficientType **, MemorySpace> coeffs(
         Kokkos::view_alloc(Kokkos::WithoutInitializing,
@@ -284,7 +292,7 @@ class MovingLeastSquaresComputation
         KOKKOS_LAMBDA(int const i, int const j) {
           CoefficientType tmp = 0;
 
-          for (int k = 0; k < PolynomialBasis::size; k++)
+          for (int k = 0; k < polynomialBasisSize; k++)
           {
             tmp += a_inv(i, 0, k) * p(i, j, k) * phi(i, j);
           }
diff --git a/examples/moving_least_squares/DetailsPolynomialBasis.hpp b/examples/moving_least_squares/DetailsPolynomialBasis.hpp
new file mode 100644
index 000000000..e4383b632
--- /dev/null
+++ b/examples/moving_least_squares/DetailsPolynomialBasis.hpp
@@ -0,0 +1,100 @@
+/****************************************************************************
+ * Copyright (c) 2023 by the ArborX authors                                 *
+ * All rights reserved.                                                     *
+ *                                                                          *
+ * This file is part of the ArborX library. ArborX is                       *
+ * distributed under a BSD 3-clause license. For the licensing terms see    *
+ * the LICENSE file in the top-level directory.                             *
+ *                                                                          *
+ * SPDX-License-Identifier: BSD-3-Clause                                    *
+ ****************************************************************************/
+
+#pragma once
+
+#include <ArborX.hpp>
+
+#include <Kokkos_Core.hpp>
+
+#include <type_traits>
+
+namespace Details
+{
+
+template <std::size_t Dim, std::size_t Deg>
+KOKKOS_FUNCTION constexpr Kokkos::Array<Kokkos::Array<std::size_t, Dim>, Deg>
+polynomialBasisColumnSizes()
+{
+  Kokkos::Array<Kokkos::Array<std::size_t, Dim>, Deg> arr{};
+
+  for (std::size_t dim = 0; dim < Dim; dim++)
+    arr[0][dim] = 1;
+  for (std::size_t deg = 0; deg < Deg; deg++)
+    arr[deg][0] = 1;
+
+  for (std::size_t deg = 1; deg < Deg; deg++)
+    for (std::size_t dim = 1; dim < Dim; dim++)
+      arr[deg][dim] = arr[deg - 1][dim] + arr[deg][dim - 1];
+
+  return arr;
+}
+
+template <std::size_t Dim, std::size_t Deg>
+KOKKOS_FUNCTION constexpr std::size_t polynomialBasisSize()
+{
+  auto arr = polynomialBasisColumnSizes<Dim, Deg>();
+  std::size_t size = 1;
+
+  for (std::size_t deg = 0; deg < Deg; deg++)
+    for (std::size_t dim = 0; dim < Dim; dim++)
+      size += arr[deg][dim];
+
+  return size;
+}
+template <typename Point, std::size_t Deg>
+static constexpr std::size_t polynomialBasisSizeFromT =
+    polynomialBasisSize<ArborX::GeometryTraits::dimension_v<Point>, Deg>();
+
+template <typename Points, std::size_t Deg>
+static constexpr std::size_t polynomialBasisSizeFromAT =
+    polynomialBasisSizeFromT<
+        typename ArborX::Details::AccessTraitsHelper<
+            ArborX::AccessTraits<Points, ArborX::PrimitivesTag>>::type,
+        Deg>;
+
+template <typename Point, std::size_t Deg>
+KOKKOS_FUNCTION auto polynomialBasis(Point const &p)
+{
+  static constexpr std::size_t dimension =
+      ArborX::GeometryTraits::dimension_v<Point>;
+  static constexpr auto column_details =
+      polynomialBasisColumnSizes<dimension, Deg>();
+  using value_t = typename ArborX::GeometryTraits::coordinate_type<Point>::type;
+
+  Kokkos::Array<value_t, polynomialBasisSize<dimension, Deg>()> arr{};
+  arr[0] = value_t(1);
+
+  std::size_t prev_col = 0;
+  std::size_t curr_col = 1;
+  for (std::size_t deg = 0; deg < Deg; deg++)
+  {
+    std::size_t loc_offset = curr_col;
+    for (std::size_t dim = 0; dim < dimension; dim++)
+    {
+      // copy the previous column and multiply by p[dim]
+      for (std::size_t i = 0; i < column_details[deg][dim]; i++)
+        arr[loc_offset + i] = arr[prev_col + i] * p[dim];
+
+      loc_offset += column_details[deg][dim];
+    }
+
+    prev_col = curr_col;
+    curr_col = loc_offset;
+  }
+
+  return arr;
+}
+
+template <std::size_t Deg>
+static constexpr std::integral_constant<std::size_t, Deg> degree{};
+
+} // namespace Details
diff --git a/examples/moving_least_squares/DetailsRadialBasisFunctions.hpp b/examples/moving_least_squares/DetailsRadialBasisFunctions.hpp
index 9d0d43551..ad357852d 100644
--- a/examples/moving_least_squares/DetailsRadialBasisFunctions.hpp
+++ b/examples/moving_least_squares/DetailsRadialBasisFunctions.hpp
@@ -17,11 +17,15 @@
 
 #define RBF_DECL(name)                                                         \
   template <int K>                                                             \
-  struct name
+  struct __##name;                                                             \
+                                                                               \
+  template <int K>                                                             \
+  static constexpr __##name<K> name                                            \
+  {}
 
 #define RBF_DEF(name, n, func)                                                 \
   template <>                                                                  \
-  struct name<n>                                                               \
+  struct __##name<n>                                                           \
   {                                                                            \
     template <typename T>                                                      \
     KOKKOS_INLINE_FUNCTION static T apply(T x)                                 \
@@ -33,34 +37,34 @@
 namespace Details
 {
 
-RBF_DECL(Wendland);
-RBF_DEF(Wendland, 0, (1 - x) * (1 - x));
-RBF_DEF(Wendland, 2, (1 - x) * (1 - x) * (1 - x) * (1 - x) * (4 * x + 1));
-RBF_DEF(Wendland, 4,
+RBF_DECL(wendland);
+RBF_DEF(wendland, 0, (1 - x) * (1 - x));
+RBF_DEF(wendland, 2, (1 - x) * (1 - x) * (1 - x) * (1 - x) * (4 * x + 1));
+RBF_DEF(wendland, 4,
         (1 - x) * (1 - x) * (1 - x) * (1 - x) * (1 - x) * (1 - x) *
             (35 * x * x + 18 * x + 3));
-RBF_DEF(Wendland, 6,
+RBF_DEF(wendland, 6,
         (1 - x) * (1 - x) * (1 - x) * (1 - x) * (1 - x) * (1 - x) * (1 - x) *
             (1 - x) * (32 * x * x * x + 25 * x * x + 8 * x + 1));
 
-RBF_DECL(Wu);
-RBF_DEF(Wu, 2,
+RBF_DECL(wu);
+RBF_DEF(wu, 2,
         (1 - x) * (1 - x) * (1 - x) * (1 - x) *
             (3 * x * x * x + 12 * x + 16 * x + 4));
-RBF_DEF(Wu, 4,
+RBF_DEF(wu, 4,
         (1 - x) * (1 - x) * (1 - x) * (1 - x) * (1 - x) * (1 - x) *
             (5 * x * x * x * x * x + 30 * x * x * x * x + 72 * x * x * x +
              82 * x * x + 36 * x + 6));
 
-RBF_DECL(Buhmann);
-RBF_DEF(Buhmann, 2,
+RBF_DECL(buhmann);
+RBF_DEF(buhmann, 2,
         2 * x * x * x * x * log(x) - T(7) / 2 * x * x * x * x +
             T(16) / 3 * x * x * x - 2 * x * x + T(1) / 6);
-RBF_DEF(Buhmann, 3,
+RBF_DEF(buhmann, 3,
         1 * x * x * x * x * x * x * x * x - T(84) / 5 * x * x * x * x * x * x +
             T(1024) / 5 * x * x * x * x * sqrt(x) - 378 * x * x * x * x +
             T(1024) / 5 * x * x * x * sqrt(x) - T(84) / 5 * x * x + 1);
-RBF_DEF(Buhmann, 4,
+RBF_DEF(buhmann, 4,
         T(99) / 35 * x * x * x * x * x * x * x * x -
             132 * x * x * x * x * x * x +
             T(9216) / 35 * x * x * x * x * x * sqrt(x) -
diff --git a/examples/moving_least_squares/MovingLeastSquares.hpp b/examples/moving_least_squares/MovingLeastSquares.hpp
index ce1382cd0..0f44b25a3 100644
--- a/examples/moving_least_squares/MovingLeastSquares.hpp
+++ b/examples/moving_least_squares/MovingLeastSquares.hpp
@@ -17,6 +17,7 @@
 
 #include "DetailsDistributedTreePostQueryComms.hpp"
 #include "DetailsMovingLeastSquaresComputation.hpp"
+#include "DetailsPolynomialBasis.hpp"
 
 namespace Details
 {
@@ -62,14 +63,15 @@ template <typename MemorySpace, typename FloatingCalculationType = float>
 class MovingLeastSquares
 {
 public:
-  template <typename ExecutionSpace, typename PolynomialBasis,
-            typename RadialBasisFunction, typename SourcePoints,
+  template <typename ExecutionSpace, typename RadialBasisFunction,
+            typename PolynomialDegree, typename SourcePoints,
             typename TargetPoints>
-  MovingLeastSquares(MPI_Comm comm, ExecutionSpace const &space,
-                     SourcePoints const &source_points,
-                     TargetPoints const &target_points,
-                     PolynomialBasis const &pb, RadialBasisFunction const &rbf,
-                     std::size_t num_neighbors = PolynomialBasis::size)
+  MovingLeastSquares(
+      MPI_Comm comm, ExecutionSpace const &space,
+      SourcePoints const &source_points, TargetPoints const &target_points,
+      PolynomialDegree const &pd, RadialBasisFunction const &rbf,
+      std::size_t num_neighbors = Details::polynomialBasisSizeFromAT<
+          SourcePoints, PolynomialDegree::value>)
   {
     // Organize the source points as a tree and create the predicates
     ArborX::DistributedTree<MemorySpace> source_tree(comm, space,
@@ -91,7 +93,7 @@ class MovingLeastSquares
     // Finally, compute the local MLS for the local target points
     _mlsc = Details::MovingLeastSquaresComputation<MemorySpace,
                                                    FloatingCalculationType>(
-        space, local_source_points, target_points, pb, rbf);
+        space, local_source_points, target_points, pd, rbf);
   }
 
   template <typename ExecutionSpace, typename SourceValues>
diff --git a/examples/moving_least_squares/moving_least_squares.cpp b/examples/moving_least_squares/moving_least_squares.cpp
index 287103f0e..d4c364e8e 100644
--- a/examples/moving_least_squares/moving_least_squares.cpp
+++ b/examples/moving_least_squares/moving_least_squares.cpp
@@ -30,18 +30,6 @@
 using ExecutionSpace = Kokkos::DefaultExecutionSpace;
 using MemorySpace = ExecutionSpace::memory_space;
 
-struct MVPolynomialBasis_3D
-{
-  static constexpr std::size_t size = 10;
-
-  KOKKOS_INLINE_FUNCTION static Kokkos::Array<float, size>
-  basis(ArborX::Point const &p)
-  {
-    return {{1.f, p[0], p[1], p[2], p[0] * p[0], p[0] * p[1], p[0] * p[2],
-             p[1] * p[1], p[1] * p[2], p[2] * p[2]}};
-  }
-};
-
 // Function to approximate
 KOKKOS_INLINE_FUNCTION float manufactured_solution(ArborX::Point const &p)
 {
@@ -95,9 +83,9 @@ int main(int argc, char *argv[])
   Kokkos::deep_copy(space, target_points, target_points_host);
 
   // Create the transform from a point cloud to another
-  MovingLeastSquares<MemorySpace, float> mls(
-      mpi_comm, space, source_points, target_points, MVPolynomialBasis_3D{},
-      Details::Wendland<0>{});
+  MovingLeastSquares<MemorySpace, float> mls(mpi_comm, space, source_points,
+                                             target_points, Details::degree<2>,
+                                             Details::wendland<0>);
 
   // Compute source values
   Kokkos::View<float *, MemorySpace> source_values("Example::source_values",

From bc388f873d1916769d4fea5e329207b38f3f5db1 Mon Sep 17 00:00:00 2001
From: Yohann Bosqued <bosq.yohann@gmail.com>
Date: Tue, 22 Aug 2023 16:37:59 -0400
Subject: [PATCH 42/44] Hypergeometry (only works in 3D)

---
 .../DetailsMovingLeastSquaresComputation.hpp  | 56 ++++++++++++-------
 1 file changed, 35 insertions(+), 21 deletions(-)

diff --git a/examples/moving_least_squares/DetailsMovingLeastSquaresComputation.hpp b/examples/moving_least_squares/DetailsMovingLeastSquaresComputation.hpp
index b16ed4cde..218dac4b7 100644
--- a/examples/moving_least_squares/DetailsMovingLeastSquaresComputation.hpp
+++ b/examples/moving_least_squares/DetailsMovingLeastSquaresComputation.hpp
@@ -21,6 +21,16 @@
 namespace Details
 {
 
+template <typename Point>
+using PointEquivalence = ArborX::ExperimentalHyperGeometry::Point<
+    ArborX::GeometryTraits::dimension_v<Point>,
+    typename ArborX::GeometryTraits::coordinate_type<Point>::type>;
+
+template <typename Points>
+using PointEquivalenceFromAT =
+    PointEquivalence<typename ArborX::Details::AccessTraitsHelper<
+        ArborX::AccessTraits<Points, ArborX::PrimitivesTag>>::type>;
+
 template <typename MemorySpace, typename CoefficientType>
 class MovingLeastSquaresComputation
 {
@@ -38,6 +48,7 @@ class MovingLeastSquaresComputation
   {
     using src_acc = ArborX::AccessTraits<SourcePoints, ArborX::PrimitivesTag>;
     using tgt_acc = ArborX::AccessTraits<TargetPoints, ArborX::PrimitivesTag>;
+    using point_t = PointEquivalenceFromAT<SourcePoints>;
 
     _num_targets = tgt_acc::size(target_points);
     _num_neighbors = src_acc::size(source_points) / _num_targets;
@@ -48,7 +59,7 @@ class MovingLeastSquaresComputation
     // We center each group of points around the target as it ables us to
     // optimize the final computation and transfer point types into ours
     // TODO: Use multidimensional points!
-    Kokkos::View<ArborX::Point **, MemorySpace> source_ref_target =
+    Kokkos::View<point_t **, MemorySpace> source_ref_target =
         sourceRefTargetFill(space, source_points, target_points, _num_targets,
                             _num_neighbors);
 
@@ -115,7 +126,7 @@ class MovingLeastSquaresComputation
 
   template <typename ExecutionSpace, typename SourcePoints,
             typename TargetPoints>
-  static Kokkos::View<ArborX::Point **, MemorySpace>
+  static Kokkos::View<PointEquivalenceFromAT<SourcePoints> **, MemorySpace>
   sourceRefTargetFill(ExecutionSpace const &space,
                       SourcePoints const &source_points,
                       TargetPoints const &target_points,
@@ -123,8 +134,9 @@ class MovingLeastSquaresComputation
   {
     using src_acc = ArborX::AccessTraits<SourcePoints, ArborX::PrimitivesTag>;
     using tgt_acc = ArborX::AccessTraits<TargetPoints, ArborX::PrimitivesTag>;
+    using point_t = PointEquivalenceFromAT<SourcePoints>;
 
-    Kokkos::View<ArborX::Point **, MemorySpace> source_ref_target(
+    Kokkos::View<point_t **, MemorySpace> source_ref_target(
         Kokkos::view_alloc(Kokkos::WithoutInitializing,
                            "Example::MLSC::source_ref_target"),
         num_targets, num_neighbors);
@@ -136,25 +148,26 @@ class MovingLeastSquaresComputation
         KOKKOS_LAMBDA(int const i, int const j) {
           auto src = src_acc::get(source_points, i * num_neighbors + j);
           auto tgt = tgt_acc::get(target_points, i);
-          source_ref_target(i, j) = ArborX::Point{
-              src[0] - tgt[0],
-              src[1] - tgt[1],
-              src[2] - tgt[2],
-          };
+          point_t t{};
+
+          for (int k = 0; k < ArborX::GeometryTraits::dimension_v<point_t>; k++)
+            t[k] = src[k] - tgt[k];
+
+          source_ref_target(i, j) = t;
         });
 
     return source_ref_target;
   }
 
-  template <typename ExecutionSpace>
-  static Kokkos::View<CoefficientType *, MemorySpace> radiiComputation(
-      ExecutionSpace const &space,
-      Kokkos::View<ArborX::Point **, MemorySpace> const &source_ref_target,
-      std::size_t num_targets, std::size_t num_neighbors)
+  template <typename ExecutionSpace, typename Point>
+  static Kokkos::View<CoefficientType *, MemorySpace>
+  radiiComputation(ExecutionSpace const &space,
+                   Kokkos::View<Point **, MemorySpace> const &source_ref_target,
+                   std::size_t num_targets, std::size_t num_neighbors)
   {
     constexpr CoefficientType epsilon =
         std::numeric_limits<CoefficientType>::epsilon();
-    constexpr ArborX::Point origin = ArborX::Point{0, 0, 0};
+    constexpr Point origin{};
 
     Kokkos::View<CoefficientType *, MemorySpace> radii(
         Kokkos::view_alloc(Kokkos::WithoutInitializing, "Example::MLSC::radii"),
@@ -180,15 +193,16 @@ class MovingLeastSquaresComputation
     return radii;
   }
 
-  template <typename ExecutionSpace, typename RadialBasisFunction>
+  template <typename ExecutionSpace, typename RadialBasisFunction,
+            typename Point>
   static Kokkos::View<CoefficientType **, MemorySpace> weightComputation(
       ExecutionSpace const &space,
-      Kokkos::View<ArborX::Point **, MemorySpace> const &source_ref_target,
+      Kokkos::View<Point **, MemorySpace> const &source_ref_target,
       Kokkos::View<CoefficientType *, MemorySpace> const &radii,
       std::size_t num_targets, std::size_t num_neighbors,
       RadialBasisFunction const &)
   {
-    constexpr ArborX::Point origin = ArborX::Point{0, 0, 0};
+    constexpr Point origin{};
 
     Kokkos::View<CoefficientType **, MemorySpace> phi(
         Kokkos::view_alloc(Kokkos::WithoutInitializing, "Example::MLSC::phi"),
@@ -207,15 +221,15 @@ class MovingLeastSquaresComputation
     return phi;
   }
 
-  template <typename ExecutionSpace, typename PolynomialDegree>
+  template <typename ExecutionSpace, typename PolynomialDegree, typename Point>
   static Kokkos::View<CoefficientType ***, MemorySpace> vandermondeComputation(
       ExecutionSpace const &space,
-      Kokkos::View<ArborX::Point **, MemorySpace> const &source_ref_target,
+      Kokkos::View<Point **, MemorySpace> const &source_ref_target,
       std::size_t num_targets, std::size_t num_neighbors,
       PolynomialDegree const &)
   {
     static constexpr std::size_t polynomialBasisSize =
-        polynomialBasisSizeFromT<ArborX::Point, PolynomialDegree::value>;
+        polynomialBasisSizeFromT<Point, PolynomialDegree::value>;
 
     Kokkos::View<CoefficientType ***, MemorySpace> p(
         Kokkos::view_alloc(Kokkos::WithoutInitializing,
@@ -227,7 +241,7 @@ class MovingLeastSquaresComputation
         Kokkos::MDRangePolicy<ExecutionSpace, Kokkos::Rank<2>>(
             space, {0, 0}, {num_targets, num_neighbors}),
         KOKKOS_LAMBDA(int const i, int const j) {
-          auto basis = polynomialBasis<ArborX::Point, PolynomialDegree::value>(
+          auto basis = polynomialBasis<Point, PolynomialDegree::value>(
               source_ref_target(i, j));
 
           for (int k = 0; k < polynomialBasisSize; k++)

From 220407cce1deb0c5349836a590583ab675801ff4 Mon Sep 17 00:00:00 2001
From: Yohann Bosqued <bosq.yohann@gmail.com>
Date: Wed, 23 Aug 2023 12:08:42 -0400
Subject: [PATCH 43/44] Using point clouds creation and gathering of the
 maximum error

---
 .../DetailsMovingLeastSquaresComputation.hpp  |   4 +-
 .../DetailsPolynomialBasis.hpp                |   1 +
 .../moving_least_squares.cpp                  | 147 ++++++++++++------
 3 files changed, 101 insertions(+), 51 deletions(-)

diff --git a/examples/moving_least_squares/DetailsMovingLeastSquaresComputation.hpp b/examples/moving_least_squares/DetailsMovingLeastSquaresComputation.hpp
index 218dac4b7..f3dad773b 100644
--- a/examples/moving_least_squares/DetailsMovingLeastSquaresComputation.hpp
+++ b/examples/moving_least_squares/DetailsMovingLeastSquaresComputation.hpp
@@ -105,7 +105,9 @@ class MovingLeastSquaresComputation
     Kokkos::View<CoefficientType **, MemorySpace> coeffs = _coeffs;
 
     Kokkos::View<value_t *, memory_space> target_values(
-        "Example::MLSC::target_values", _num_targets);
+        Kokkos::view_alloc(Kokkos::WithoutInitializing,
+                           "Example::MLSC::target_values"),
+        _num_targets);
 
     Kokkos::parallel_for(
         "Example::MLSC::target_interpolation",
diff --git a/examples/moving_least_squares/DetailsPolynomialBasis.hpp b/examples/moving_least_squares/DetailsPolynomialBasis.hpp
index e4383b632..14dfde948 100644
--- a/examples/moving_least_squares/DetailsPolynomialBasis.hpp
+++ b/examples/moving_least_squares/DetailsPolynomialBasis.hpp
@@ -50,6 +50,7 @@ KOKKOS_FUNCTION constexpr std::size_t polynomialBasisSize()
 
   return size;
 }
+
 template <typename Point, std::size_t Deg>
 static constexpr std::size_t polynomialBasisSizeFromT =
     polynomialBasisSize<ArborX::GeometryTraits::dimension_v<Point>, Deg>();
diff --git a/examples/moving_least_squares/moving_least_squares.cpp b/examples/moving_least_squares/moving_least_squares.cpp
index d4c364e8e..7b422f98c 100644
--- a/examples/moving_least_squares/moving_least_squares.cpp
+++ b/examples/moving_least_squares/moving_least_squares.cpp
@@ -20,9 +20,12 @@
 
 #include <Kokkos_Core.hpp>
 
+#include <iostream>
 #include <limits>
-#include <sstream>
+#include <tuple>
+#include <vector>
 
+#include "../../benchmarks/point_clouds/point_clouds.hpp"
 #include "DetailsRadialBasisFunctions.hpp"
 #include "MovingLeastSquares.hpp"
 #include <mpi.h>
@@ -33,7 +36,7 @@ using MemorySpace = ExecutionSpace::memory_space;
 // Function to approximate
 KOKKOS_INLINE_FUNCTION float manufactured_solution(ArborX::Point const &p)
 {
-  return p[2] + p[1];
+  return Kokkos::cos(5 * p[2]) * p[0] + p[1] + 1;
 }
 
 int main(int argc, char *argv[])
@@ -41,45 +44,64 @@ int main(int argc, char *argv[])
   MPI_Init(&argc, &argv);
   Kokkos::ScopeGuard guard(argc, argv);
 
-  constexpr std::size_t cube_side = 20;
-  constexpr std::size_t source_points_num = cube_side * cube_side * cube_side;
-  constexpr std::size_t target_points_num = 4;
-
   ExecutionSpace space{};
   MPI_Comm mpi_comm = MPI_COMM_WORLD;
   int mpi_size, mpi_rank;
   MPI_Comm_size(mpi_comm, &mpi_size);
   MPI_Comm_rank(mpi_comm, &mpi_rank);
 
-  std::size_t local_source_points_num = source_points_num / mpi_size;
+  static constexpr std::size_t total_source_points = 1024 * 512;
+  std::size_t local_source_points_num = total_source_points / mpi_size;
+  static constexpr std::size_t total_target_points = 1024;
+  std::size_t local_target_points_num = total_target_points / mpi_size;
+  static constexpr double cube_side = 5;
 
   Kokkos::View<ArborX::Point *, MemorySpace> source_points(
       Kokkos::view_alloc(Kokkos::WithoutInitializing, "Example::source_points"),
       local_source_points_num);
+  auto source_points_host = Kokkos::create_mirror_view(source_points);
   Kokkos::View<ArborX::Point *, MemorySpace> target_points(
       Kokkos::view_alloc(Kokkos::WithoutInitializing, "Example::target_points"),
-      target_points_num);
+      local_source_points_num);
   auto target_points_host = Kokkos::create_mirror_view(target_points);
 
-  // Generate source points (Organized within a [-10, 10]^3 cube)
-  std::size_t thickness = cube_side / mpi_size;
-  Kokkos::parallel_for(
-      "Example::source_points_init",
-      Kokkos::MDRangePolicy<ExecutionSpace, Kokkos::Rank<3>>(
-          space, {0, 0, 0}, {cube_side, cube_side, thickness}),
-      KOKKOS_LAMBDA(int const i, int const j, int const k) {
-        source_points(i * cube_side * thickness + j * thickness +
-                      k) = ArborX::Point{
-            20.f * (float(i) / (cube_side - 1) - .5f),
-            20.f * (float(j) / (cube_side - 1) - .5f),
-            20.f * (float(k + thickness * mpi_rank) / (cube_side - 1) - .5f)};
-      });
+  // source and target points are within a 5x5x5 cube
+  if (mpi_rank == 0)
+  {
+    Kokkos::View<ArborX::Point *, Kokkos::HostSpace> all_source_points(
+        Kokkos::view_alloc(Kokkos::WithoutInitializing,
+                           "Example::all_source_points"),
+        total_source_points);
+    filledBoxCloud(cube_side / 2, all_source_points);
+    MPI_Scatter(
+        all_source_points.data(), local_source_points_num * 3 * sizeof(float),
+        MPI_BYTE, source_points_host.data(),
+        local_source_points_num * 3 * sizeof(float), MPI_BYTE, 0, mpi_comm);
+
+    Kokkos::View<ArborX::Point *, Kokkos::HostSpace> all_target_points(
+        Kokkos::view_alloc(Kokkos::WithoutInitializing,
+                           "Example::all_target_points"),
+        total_target_points);
+    filledBoxCloud(cube_side / 2, all_target_points);
+    MPI_Scatter(
+        all_target_points.data(), local_target_points_num * 3 * sizeof(float),
+        MPI_BYTE, target_points_host.data(),
+        local_target_points_num * 3 * sizeof(float), MPI_BYTE, 0, mpi_comm);
+  }
+  else
+  {
+    MPI_Scatter(nullptr, local_source_points_num * 3 * sizeof(float), MPI_BYTE,
+                source_points_host.data(),
+                local_source_points_num * 3 * sizeof(float), MPI_BYTE, 0,
+                mpi_comm);
+
+    MPI_Scatter(nullptr, local_target_points_num * 3 * sizeof(float), MPI_BYTE,
+                target_points_host.data(),
+                local_target_points_num * 3 * sizeof(float), MPI_BYTE, 0,
+                mpi_comm);
+  }
 
-  // Generate target points
-  target_points_host(0) = ArborX::Point{1.f, 0.f, 1.f};
-  target_points_host(1) = ArborX::Point{5.f, 5.f, 5.f};
-  target_points_host(2) = ArborX::Point{-5.f, 5.f, 3.f};
-  target_points_host(3) = ArborX::Point{1.f, -3.3f, 7.f};
+  Kokkos::deep_copy(space, source_points, source_points_host);
   Kokkos::deep_copy(space, target_points, target_points_host);
 
   // Create the transform from a point cloud to another
@@ -99,41 +121,66 @@ int main(int argc, char *argv[])
 
   // Compute target values from source ones
   auto target_values = mls.apply(space, source_values);
+  auto target_values_host = Kokkos::create_mirror_view(target_values);
+  Kokkos::deep_copy(space, target_values_host, target_values);
 
   // Compute target values via evaluation
   Kokkos::View<float *, MemorySpace> target_values_exact(
-      "Example::target_values_exact", target_points_num);
+      "Example::target_values_exact", local_target_points_num);
   Kokkos::parallel_for(
       "Example::target_evaluation",
-      Kokkos::RangePolicy<ExecutionSpace>(space, 0, target_points_num),
+      Kokkos::RangePolicy<ExecutionSpace>(space, 0, local_target_points_num),
       KOKKOS_LAMBDA(int const i) {
         target_values_exact(i) = manufactured_solution(target_points(i));
       });
 
-  // Show difference
-  auto target_values_host = Kokkos::create_mirror_view(target_values);
-  Kokkos::deep_copy(space, target_values_host, target_values);
-  auto target_values_exact_host =
-      Kokkos::create_mirror_view(target_values_exact);
-  Kokkos::deep_copy(space, target_values_exact_host, target_values_exact);
-
-  std::stringstream ss{};
-  float error = 0.f;
-  for (int i = 0; i < target_points_num; i++)
+  // Compute local error
+  static constexpr float epsilon = std::numeric_limits<float>::epsilon();
+  using ErrType = typename Kokkos::MaxLoc<float, std::size_t>::value_type;
+  ErrType error{0, 0};
+  Kokkos::parallel_reduce(
+      "Example::error_computation",
+      Kokkos::RangePolicy<ExecutionSpace>(space, 0, local_target_points_num),
+      KOKKOS_LAMBDA(int const i, ErrType &loc_error) {
+        float abs_error =
+            Kokkos::abs(target_values(i) - target_values_exact(i));
+        float abs_value = Kokkos::abs(target_values_exact(i)) +
+                          epsilon;
+
+        if (loc_error.val < abs_error / abs_value)
+        {
+          loc_error.val = abs_error / abs_value;
+          loc_error.loc = i;
+        }
+      },
+      Kokkos::MaxLoc<float, std::size_t>(error));
+
+  std::tuple<float, ArborX::Point, float> error_obj{
+      error.val, target_points_host(error.loc), target_values_host(error.loc)};
+
+  if (mpi_rank == 0)
   {
-    error = Kokkos::max(
-        Kokkos::abs(target_values_host(i) - target_values_exact_host(i)) /
-            Kokkos::abs(target_values_exact_host(i)),
-        error);
-
-    ss << mpi_rank << ": ==== Target " << i << '\n'
-       << mpi_rank << ": Interpolation: " << target_values_host(i) << '\n'
-       << mpi_rank << ": Real value   : " << target_values_exact_host(i)
-       << '\n';
+    std::vector<decltype(error_obj)> all_error_obj(mpi_size);
+    MPI_Gather(&error_obj, sizeof(decltype(error_obj)), MPI_BYTE,
+               all_error_obj.data(), sizeof(decltype(error_obj)), MPI_BYTE, 0,
+               mpi_comm);
+
+    for (int i = 0; i < mpi_size; i++)
+      if (std::get<0>(error_obj) < std::get<0>(all_error_obj[i]))
+        error_obj = all_error_obj[i];
+
+    float error = std::get<0>(error_obj), approx = std::get<2>(error_obj);
+    auto point = std::get<1>(error_obj);
+    std::cout << "Maximum error: " << error << " at point " << point[0] << ", "
+              << point[1] << ", " << point[2]
+              << "\nTrue value: " << manufactured_solution(point)
+              << "\nComputed: " << approx << std::endl;
+  }
+  else
+  {
+    MPI_Gather(&error_obj, sizeof(decltype(error_obj)), MPI_BYTE, nullptr,
+               sizeof(decltype(error_obj)), MPI_BYTE, 0, mpi_comm);
   }
-  ss << mpi_rank << ": Maximum relative error: " << error << std::endl;
-
-  std::cout << ss.str();
 
   MPI_Finalize();
   return 0;

From dcdedd3077e062b0275a1f32008b842402bdbdaf Mon Sep 17 00:00:00 2001
From: Yohann Bosqued <bosq.yohann@gmail.com>
Date: Wed, 23 Aug 2023 16:03:04 -0400
Subject: [PATCH 44/44] Back and forth MLS

---
 .../moving_least_squares.cpp                  | 292 +++++++++++-------
 1 file changed, 184 insertions(+), 108 deletions(-)

diff --git a/examples/moving_least_squares/moving_least_squares.cpp b/examples/moving_least_squares/moving_least_squares.cpp
index 7b422f98c..2a7b75583 100644
--- a/examples/moving_least_squares/moving_least_squares.cpp
+++ b/examples/moving_least_squares/moving_least_squares.cpp
@@ -9,19 +9,11 @@
  * SPDX-License-Identifier: BSD-3-Clause                                    *
  ****************************************************************************/
 
-// Example taken from DataTransferKit
-// (https://github.com/ORNL-CEES/DataTransferKit)
-// with MLS resolution from
-// (http://dx.doi.org/10.1016/j.jcp.2015.11.055)
-// and
-// (A conservative mesh-free approach for fluid-structure interface problems)
-
 #include <ArborX.hpp>
 
 #include <Kokkos_Core.hpp>
 
 #include <iostream>
-#include <limits>
 #include <tuple>
 #include <vector>
 
@@ -33,137 +25,160 @@
 using ExecutionSpace = Kokkos::DefaultExecutionSpace;
 using MemorySpace = ExecutionSpace::memory_space;
 
+using HostExecutionSpace = Kokkos::DefaultHostExecutionSpace;
+using HostMemorySpace = HostExecutionSpace::memory_space;
+
 // Function to approximate
-KOKKOS_INLINE_FUNCTION float manufactured_solution(ArborX::Point const &p)
+struct Step
 {
-  return Kokkos::cos(5 * p[2]) * p[0] + p[1] + 1;
-}
+  KOKKOS_INLINE_FUNCTION static float eval(ArborX::Point const &p)
+  {
+    return !Kokkos::signbit(p[0]) * 1.f;
+  }
 
-int main(int argc, char *argv[])
-{
-  MPI_Init(&argc, &argv);
-  Kokkos::ScopeGuard guard(argc, argv);
+  template <class... Properties>
+  static Kokkos::View<float *, Properties...>
+  map(ExecutionSpace const &space,
+      Kokkos::View<ArborX::Point *, Properties...> const &ps)
+  {
+    Kokkos::View<float *, Properties...> evals("Example::evals", ps.extent(0));
+    Kokkos::parallel_for(
+        "Example::evaluation",
+        Kokkos::RangePolicy<ExecutionSpace>(space, 0, ps.extent(0)),
+        KOKKOS_LAMBDA(int const i) { evals(i) = eval(ps(i)); });
+    return evals;
+  }
+};
 
-  ExecutionSpace space{};
-  MPI_Comm mpi_comm = MPI_COMM_WORLD;
+Kokkos::Array<Kokkos::View<ArborX::Point *, MemorySpace>, 2>
+createPointClouds(HostExecutionSpace const &hspace, ExecutionSpace const &space,
+                  MPI_Comm comm, std::size_t points_num)
+{
   int mpi_size, mpi_rank;
-  MPI_Comm_size(mpi_comm, &mpi_size);
-  MPI_Comm_rank(mpi_comm, &mpi_rank);
+  MPI_Comm_size(comm, &mpi_size);
+  MPI_Comm_rank(comm, &mpi_rank);
+
+  Kokkos::Array<Kokkos::View<ArborX::Point *, HostMemorySpace>, 2>
+      point_clouds_host{Kokkos::View<ArborX::Point *, HostMemorySpace>(
+                            Kokkos::view_alloc(Kokkos::WithoutInitializing,
+                                               "Example::points_cloud_0"),
+                            points_num),
+                        Kokkos::View<ArborX::Point *, HostMemorySpace>(
+                            Kokkos::view_alloc(Kokkos::WithoutInitializing,
+                                               "Example::points_cloud_1"),
+                            points_num)};
 
-  static constexpr std::size_t total_source_points = 1024 * 512;
-  std::size_t local_source_points_num = total_source_points / mpi_size;
-  static constexpr std::size_t total_target_points = 1024;
-  std::size_t local_target_points_num = total_target_points / mpi_size;
-  static constexpr double cube_side = 5;
-
-  Kokkos::View<ArborX::Point *, MemorySpace> source_points(
-      Kokkos::view_alloc(Kokkos::WithoutInitializing, "Example::source_points"),
-      local_source_points_num);
-  auto source_points_host = Kokkos::create_mirror_view(source_points);
-  Kokkos::View<ArborX::Point *, MemorySpace> target_points(
-      Kokkos::view_alloc(Kokkos::WithoutInitializing, "Example::target_points"),
-      local_source_points_num);
-  auto target_points_host = Kokkos::create_mirror_view(target_points);
-
-  // source and target points are within a 5x5x5 cube
   if (mpi_rank == 0)
   {
-    Kokkos::View<ArborX::Point *, Kokkos::HostSpace> all_source_points(
-        Kokkos::view_alloc(Kokkos::WithoutInitializing,
-                           "Example::all_source_points"),
-        total_source_points);
-    filledBoxCloud(cube_side / 2, all_source_points);
-    MPI_Scatter(
-        all_source_points.data(), local_source_points_num * 3 * sizeof(float),
-        MPI_BYTE, source_points_host.data(),
-        local_source_points_num * 3 * sizeof(float), MPI_BYTE, 0, mpi_comm);
-
-    Kokkos::View<ArborX::Point *, Kokkos::HostSpace> all_target_points(
-        Kokkos::view_alloc(Kokkos::WithoutInitializing,
-                           "Example::all_target_points"),
-        total_target_points);
-    filledBoxCloud(cube_side / 2, all_target_points);
-    MPI_Scatter(
-        all_target_points.data(), local_target_points_num * 3 * sizeof(float),
-        MPI_BYTE, target_points_host.data(),
-        local_target_points_num * 3 * sizeof(float), MPI_BYTE, 0, mpi_comm);
+    Kokkos::Array<Kokkos::View<ArborX::Point *, HostMemorySpace>, 2>
+        all_point_clouds{Kokkos::View<ArborX::Point *, HostMemorySpace>(
+                             Kokkos::view_alloc(Kokkos::WithoutInitializing,
+                                                "Example::all_points_cloud_0"),
+                             points_num * mpi_size),
+                         Kokkos::View<ArborX::Point *, HostMemorySpace>(
+                             Kokkos::view_alloc(Kokkos::WithoutInitializing,
+                                                "Example::all_points_cloud_1"),
+                             points_num * mpi_size)};
+
+    filledBoxCloud(.5, all_point_clouds[0]);
+    filledBoxCloud(.5, all_point_clouds[1]);
+
+    MPI_Scatter(all_point_clouds[0].data(), points_num * 3 * sizeof(float),
+                MPI_BYTE, point_clouds_host[0].data(),
+                points_num * 3 * sizeof(float), MPI_BYTE, 0, comm);
+    MPI_Scatter(all_point_clouds[1].data(), points_num * 3 * sizeof(float),
+                MPI_BYTE, point_clouds_host[1].data(),
+                points_num * 3 * sizeof(float), MPI_BYTE, 0, comm);
   }
   else
   {
-    MPI_Scatter(nullptr, local_source_points_num * 3 * sizeof(float), MPI_BYTE,
-                source_points_host.data(),
-                local_source_points_num * 3 * sizeof(float), MPI_BYTE, 0,
-                mpi_comm);
-
-    MPI_Scatter(nullptr, local_target_points_num * 3 * sizeof(float), MPI_BYTE,
-                target_points_host.data(),
-                local_target_points_num * 3 * sizeof(float), MPI_BYTE, 0,
-                mpi_comm);
+    MPI_Scatter(nullptr, 0, MPI_BYTE, point_clouds_host[0].data(),
+                points_num * 3 * sizeof(float), MPI_BYTE, 0, comm);
+    MPI_Scatter(nullptr, 0, MPI_BYTE, point_clouds_host[1].data(),
+                points_num * 3 * sizeof(float), MPI_BYTE, 0, comm);
   }
 
-  Kokkos::deep_copy(space, source_points, source_points_host);
-  Kokkos::deep_copy(space, target_points, target_points_host);
-
-  // Create the transform from a point cloud to another
-  MovingLeastSquares<MemorySpace, float> mls(mpi_comm, space, source_points,
-                                             target_points, Details::degree<2>,
-                                             Details::wendland<0>);
-
-  // Compute source values
-  Kokkos::View<float *, MemorySpace> source_values("Example::source_values",
-                                                   local_source_points_num);
   Kokkos::parallel_for(
-      "Example::source_evaluation",
-      Kokkos::RangePolicy<ExecutionSpace>(space, 0, local_source_points_num),
+      "Example::flatten_points",
+      Kokkos::RangePolicy<HostExecutionSpace>(hspace, 0, points_num),
       KOKKOS_LAMBDA(int const i) {
-        source_values(i) = manufactured_solution(source_points(i));
+        point_clouds_host[0](i)[2] = 0;
+        point_clouds_host[1](i)[2] = 0;
       });
 
-  // Compute target values from source ones
-  auto target_values = mls.apply(space, source_values);
-  auto target_values_host = Kokkos::create_mirror_view(target_values);
-  Kokkos::deep_copy(space, target_values_host, target_values);
+  Kokkos::Array<Kokkos::View<ArborX::Point *, MemorySpace>, 2> point_clouds{
+      Kokkos::View<ArborX::Point *, MemorySpace>(
+          Kokkos::view_alloc(Kokkos::WithoutInitializing,
+                             "Example::points_cloud_0"),
+          points_num),
+      Kokkos::View<ArborX::Point *, MemorySpace>(
+          Kokkos::view_alloc(Kokkos::WithoutInitializing,
+                             "Example::points_cloud_1"),
+          points_num)};
+  Kokkos::deep_copy(space, point_clouds[0], point_clouds_host[0]);
+  Kokkos::deep_copy(space, point_clouds[1], point_clouds_host[1]);
 
-  // Compute target values via evaluation
-  Kokkos::View<float *, MemorySpace> target_values_exact(
-      "Example::target_values_exact", local_target_points_num);
-  Kokkos::parallel_for(
-      "Example::target_evaluation",
-      Kokkos::RangePolicy<ExecutionSpace>(space, 0, local_target_points_num),
-      KOKKOS_LAMBDA(int const i) {
-        target_values_exact(i) = manufactured_solution(target_points(i));
-      });
+  return point_clouds;
+}
+
+template <typename Deg, typename RBF>
+Kokkos::Array<MovingLeastSquares<MemorySpace, float>, 2> createMLSObjects(
+    MPI_Comm comm, ExecutionSpace const &space,
+    Kokkos::View<ArborX::Point *, MemorySpace> const &point_clouds_0,
+    Kokkos::View<ArborX::Point *, MemorySpace> const &point_clouds_1,
+    Deg const &deg, RBF const &rbf)
+{
+  return {MovingLeastSquares<MemorySpace, float>(comm, space, point_clouds_0,
+                                                 point_clouds_1, deg, rbf),
+          MovingLeastSquares<MemorySpace, float>(comm, space, point_clouds_1,
+                                                 point_clouds_0, deg, rbf)};
+}
+
+void doError(MPI_Comm comm, ExecutionSpace const &space,
+             Kokkos::View<ArborX::Point *, MemorySpace> const &points,
+             Kokkos::View<float *, MemorySpace> const &approx,
+             Kokkos::View<float *, MemorySpace> const &values)
+{
+  int mpi_size, mpi_rank;
+  MPI_Comm_size(comm, &mpi_size);
+  MPI_Comm_rank(comm, &mpi_rank);
 
   // Compute local error
-  static constexpr float epsilon = std::numeric_limits<float>::epsilon();
   using ErrType = typename Kokkos::MaxLoc<float, std::size_t>::value_type;
   ErrType error{0, 0};
+  float error_sum = 0;
   Kokkos::parallel_reduce(
       "Example::error_computation",
-      Kokkos::RangePolicy<ExecutionSpace>(space, 0, local_target_points_num),
-      KOKKOS_LAMBDA(int const i, ErrType &loc_error) {
-        float abs_error =
-            Kokkos::abs(target_values(i) - target_values_exact(i));
-        float abs_value = Kokkos::abs(target_values_exact(i)) +
-                          epsilon;
-
-        if (loc_error.val < abs_error / abs_value)
+      Kokkos::RangePolicy<ExecutionSpace>(space, 0, approx.extent(0)),
+      KOKKOS_LAMBDA(int const i, ErrType &loc_error, float &loc_error_sum) {
+        float abs_error = Kokkos::abs(approx(i) - values(i));
+
+        loc_error_sum += abs_error;
+        if (loc_error.val < abs_error)
         {
-          loc_error.val = abs_error / abs_value;
+          loc_error.val = abs_error;
           loc_error.loc = i;
         }
       },
-      Kokkos::MaxLoc<float, std::size_t>(error));
+      Kokkos::MaxLoc<float, std::size_t>(error), Kokkos::Sum<float>(error_sum));
+
+  auto approx_host = Kokkos::create_mirror_view(approx);
+  auto points_host = Kokkos::create_mirror_view(points);
+  Kokkos::deep_copy(approx_host, approx); // blocking: read on host below
+  Kokkos::deep_copy(points_host, points); // blocking: read on host below
 
   std::tuple<float, ArborX::Point, float> error_obj{
-      error.val, target_points_host(error.loc), target_values_host(error.loc)};
+      error.val, points_host(error.loc), approx_host(error.loc)};
 
+  // Compute global error
   if (mpi_rank == 0)
   {
+    float error_sum_global;
     std::vector<decltype(error_obj)> all_error_obj(mpi_size);
     MPI_Gather(&error_obj, sizeof(decltype(error_obj)), MPI_BYTE,
                all_error_obj.data(), sizeof(decltype(error_obj)), MPI_BYTE, 0,
-               mpi_comm);
+               comm);
+    MPI_Reduce(&error_sum, &error_sum_global, 1, MPI_FLOAT, MPI_SUM, 0, comm);
 
     for (int i = 0; i < mpi_size; i++)
       if (std::get<0>(error_obj) < std::get<0>(all_error_obj[i]))
@@ -171,15 +186,76 @@ int main(int argc, char *argv[])
 
     float error = std::get<0>(error_obj), approx = std::get<2>(error_obj);
     auto point = std::get<1>(error_obj);
-    std::cout << "Maximum error: " << error << " at point " << point[0] << ", "
-              << point[1] << ", " << point[2]
-              << "\nTrue value: " << manufactured_solution(point)
-              << "\nComputed: " << approx << std::endl;
+    std::cout << "Mean error: "
+              << error_sum_global / (points.extent(0) * mpi_size)
+              << "\nMaximum error: " << error << " at point " << point[0]
+              << ", " << point[1] << "\n  True value:  " << Step::eval(point)
+              << "\n  Computed:    " << approx << std::endl;
   }
   else
   {
     MPI_Gather(&error_obj, sizeof(decltype(error_obj)), MPI_BYTE, nullptr,
-               sizeof(decltype(error_obj)), MPI_BYTE, 0, mpi_comm);
+               sizeof(decltype(error_obj)), MPI_BYTE, 0, comm);
+    MPI_Reduce(&error_sum, nullptr, 1, MPI_FLOAT, MPI_SUM, 0, comm);
+  }
+}
+
+Kokkos::View<float *, MemorySpace> // returns the view produced by mls.apply()
+doOne(MPI_Comm comm, ExecutionSpace const &space, // comm: forwarded to doError; space: used by both calls below
+      Kokkos::View<ArborX::Point *, MemorySpace> const &tgt, // forwarded to doError alongside the computed values
+      Kokkos::View<float *, MemorySpace> const &values, // input handed to mls.apply()
+      Kokkos::View<float *, MemorySpace> const &true_values, // forwarded to doError as its last argument
+      MovingLeastSquares<MemorySpace, float> &mls) // object whose apply() is invoked once per call
+{
+  auto tgt_values = mls.apply(space, values); // single application of mls to values
+  doError(comm, space, tgt, tgt_values, true_values); // NOTE(review): presumably reports tgt_values vs. true_values and is collective over comm -- confirm
+  return tgt_values; // handed back so the caller can feed it into a later call
+}
+
+int main(int argc, char *argv[])
+{
+  static constexpr std::size_t total_points = 1024 * 128;
+  static constexpr std::size_t num_back_forth = 50;
+  static constexpr auto deg = Details::degree<4>;
+  static constexpr auto rbf = Details::wu<2>;
+
+  MPI_Init(&argc, &argv);
+  Kokkos::ScopeGuard guard(argc, argv);
+
+  ExecutionSpace space{};
+  HostExecutionSpace host_space{};
+  MPI_Comm mpi_comm = MPI_COMM_WORLD;
+  int mpi_size, mpi_rank;
+  MPI_Comm_size(mpi_comm, &mpi_size);
+  MPI_Comm_rank(mpi_comm, &mpi_rank);
+
+  auto point_clouds =
+      createPointClouds(host_space, space, mpi_comm, total_points / mpi_size);
+
+  // Create the transform from a point cloud to another
+  auto mlss = createMLSObjects(mpi_comm, space, point_clouds[0],
+                               point_clouds[1], deg, rbf);
+
+  Kokkos::Array<Kokkos::View<float *, MemorySpace>, 2> true_values{
+      Step::map(space, point_clouds[0]), Step::map(space, point_clouds[1])};
+
+  Kokkos::View<float *, MemorySpace> source_values = true_values[0];
+  for (int i = 0; i < num_back_forth * 2; i++)
+  {
+    if (mpi_rank == 0)
+      std::cout << "=== TURN " << i + 1 << std::endl;
+
+    Kokkos::View<ArborX::Point *, MemorySpace> target =
+        point_clouds[1 - (i % 2)];
+    Kokkos::View<float *, MemorySpace> tgt_true_values =
+        true_values[1 - (i % 2)];
+    MovingLeastSquares<MemorySpace, float> &mls = mlss[i % 2];
+
+    source_values =
+        doOne(mpi_comm, space, target, source_values, tgt_true_values, mls);
+
+    if (mpi_rank == 0)
+      std::cout << "===\n" << std::endl;
   }
 
   MPI_Finalize();