From 7f932abf067202ab91af247d913c7c27f6c9ff7c Mon Sep 17 00:00:00 2001
From: "k.koide" <k.koide@aist.go.jp>
Date: Wed, 28 Aug 2024 10:18:08 +0900
Subject: [PATCH 1/2] batched overlap

---
 .../gtsam_points/types/gaussian_voxelmap.hpp  | 75 +++++++++++-------
 .../types/gaussian_voxelmap_gpu_funcs.cu      | 77 +++++++++++++++++++
 2 files changed, 125 insertions(+), 27 deletions(-)
diff --git a/include/gtsam_points/types/gaussian_voxelmap.hpp b/include/gtsam_points/types/gaussian_voxelmap.hpp
index 6717b2a4..f8a2dc35 100644
--- a/include/gtsam_points/types/gaussian_voxelmap.hpp
+++ b/include/gtsam_points/types/gaussian_voxelmap.hpp
@@ -39,10 +39,10 @@ double overlap(const GaussianVoxelMap::ConstPtr& target, const PointCloud::Const
 
 /**
  * @brief Calculate the fraction of points fell in targets' voxels
- * @param target   Set of target voxelized frames
- * @param source   Source frame
- * @param delta    Set of T_target_source
- * @return         Overlap rate
+ * @param target              Set of target voxelized frames
+ * @param source              Source frame
+ * @param Ts_target_source    Set of T_target_source
+ * @return                    Overlap rate
  */
 double overlap(
   const std::vector<GaussianVoxelMap::ConstPtr>& targets,
@@ -60,43 +60,64 @@ overlap(const std::vector<VoxelMapPtr>& targets, const PointCloud::ConstPtr& sou
 /**
  * @brief Calculate the fraction of points fell in target voxels on GPU
  * @note  Source points and target voxelmap must be pre-allocated on GPU.
- * @param target     Target voxelmap
- * @param source     Source frame
- * @param delta_gpu  T_target_source (on GPU memory)
- * @return         Overlap rate
+ * @param target               Target voxelmap
+ * @param source               Source frame
+ * @param T_target_source_gpu  T_target_source (on GPU memory)
+ * @return                     Overlap rate
  */
 double overlap_gpu(
   const GaussianVoxelMap::ConstPtr& target,
   const PointCloud::ConstPtr& source,
-  const Eigen::Isometry3f* delta_gpu,
+  const Eigen::Isometry3f* T_target_source_gpu,
   CUstream_st* stream = 0);
 
 /**
  * @brief Calculate the fraction of points fell in target voxels on GPU
  * @note  Source points and target voxelmap must be pre-allocated on GPU.
- * @param target  Target voxelmap
- * @param source  Source frame
- * @param delta   T_target_source
- * @return         Overlap rate
+ * @param target            Target voxelmap
+ * @param source            Source frame
+ * @param T_target_source   T_target_source
+ * @return                  Overlap rate
  */
-double
-overlap_gpu(const GaussianVoxelMap::ConstPtr& target, const PointCloud::ConstPtr& source, const Eigen::Isometry3d& delta, CUstream_st* stream = 0);
+double overlap_gpu(
+  const GaussianVoxelMap::ConstPtr& target,
+  const PointCloud::ConstPtr& source,
+  const Eigen::Isometry3d& T_target_source,
+  CUstream_st* stream = 0);
 
 /// @brief Calculate the fraction of points fell in target voxels on GPU
-double overlap_gpu(const PointCloud::ConstPtr& target, const PointCloud::ConstPtr& source, const Eigen::Isometry3d& delta, CUstream_st* stream = 0);
+double overlap_gpu(
+  const PointCloud::ConstPtr& target,
+  const PointCloud::ConstPtr& source,
+  const Eigen::Isometry3d& T_target_source,
+  CUstream_st* stream = 0);
 
 /**
  * @brief Calculate the fraction of points fell in targets' voxels on GPU
  * @note  Source points and targets' voxelmap must be pre-allocated on GPU.
- * @param targets Set of target voxelized frames
- * @param source  Source frame
- * @param deltas  Set of T_target_source
- * @return         Overlap rate
+ * @param targets           Set of target voxelized frames
+ * @param source            Source frame
+ * @param Ts_target_source  Set of T_target_source
+ * @return                  Overlap rate
  */
 double overlap_gpu(
   const std::vector<GaussianVoxelMap::ConstPtr>& targets,
   const PointCloud::ConstPtr& source,
-  const std::vector<Eigen::Isometry3d>& deltas,
+  const std::vector<Eigen::Isometry3d>& Ts_target_source,
+  CUstream_st* stream = 0);
+
+/**
+ * @breif Batch calculation of the fraction of points fell in targets' voxels on GPU
+ * @note  Source points and targets' voxelmap must be pre-allocated on GPU.
+ * @param targets           Set of target voxelized frames
+ * @param sources           Set of source frames
+ * @param Ts_target_source  Set of T_target_source
+ * @return                  Overlap rate
+ */
+std::vector<double> overlap_gpu(
+  const std::vector<GaussianVoxelMap::ConstPtr>& targets,
+  const std::vector<PointCloud::ConstPtr>& sources,
+  const std::vector<Eigen::Isometry3d>& Ts_target_source,
   CUstream_st* stream = 0);
 
 /// @brief Calculate the fraction of points fell in targets' voxels on GPU
@@ -104,25 +125,25 @@ template <typename VoxelMapPtr>
 std::enable_if_t<!std::is_same_v<VoxelMapPtr, GaussianVoxelMap::ConstPtr>, double> overlap_gpu(
   const std::vector<VoxelMapPtr>& targets,
   const PointCloud::ConstPtr& source,
-  const std::vector<Eigen::Isometry3d>& deltas,
+  const std::vector<Eigen::Isometry3d>& Ts_target_source,
   CUstream_st* stream = 0) {
   const std::vector<GaussianVoxelMap::ConstPtr> targets_(targets.begin(), targets.end());
-  return overlap_gpu(targets_, source, deltas, stream);
+  return overlap_gpu(targets_, source, Ts_target_source, stream);
 }
 
 // Automatically select CPU or GPU method
-double overlap_auto(const GaussianVoxelMap::ConstPtr& target, const PointCloud::ConstPtr& source, const Eigen::Isometry3d& delta);
+double overlap_auto(const GaussianVoxelMap::ConstPtr& target, const PointCloud::ConstPtr& source, const Eigen::Isometry3d& T_target_source);
 
 double overlap_auto(
   const std::vector<GaussianVoxelMap::ConstPtr>& targets,
   const PointCloud::ConstPtr& source,
-  const std::vector<Eigen::Isometry3d>& deltas);
+  const std::vector<Eigen::Isometry3d>& Ts_target_source);
 
 template <typename VoxelMapPtr>
 std::enable_if_t<!std::is_same_v<VoxelMapPtr, GaussianVoxelMap::ConstPtr>, double>
-overlap_auto(const std::vector<VoxelMapPtr>& targets, const PointCloud::ConstPtr& source, const std::vector<Eigen::Isometry3d>& deltas) {
+overlap_auto(const std::vector<VoxelMapPtr>& targets, const PointCloud::ConstPtr& source, const std::vector<Eigen::Isometry3d>& Ts_target_source) {
   const std::vector<GaussianVoxelMap::ConstPtr> targets_(targets.begin(), targets.end());
-  return overlap_auto(targets_, source, deltas);
+  return overlap_auto(targets_, source, Ts_target_source);
 }
 
 }  // namespace gtsam_points
\ No newline at end of file
diff --git a/src/gtsam_points/types/gaussian_voxelmap_gpu_funcs.cu b/src/gtsam_points/types/gaussian_voxelmap_gpu_funcs.cu
index 36e6f5cd..9ccdae46 100644
--- a/src/gtsam_points/types/gaussian_voxelmap_gpu_funcs.cu
+++ b/src/gtsam_points/types/gaussian_voxelmap_gpu_funcs.cu
@@ -285,4 +285,81 @@ double overlap_gpu(
   return static_cast<double>(num_inliers_cpu) / source->size();
 }
 
+std::vector<double> overlap_gpu(
+  const std::vector<GaussianVoxelMap::ConstPtr>& targets_,
+  const std::vector<PointCloud::ConstPtr>& sources,
+  const std::vector<Eigen::Isometry3d>& Ts_target_source_,
+  CUstream_st* stream) {
+  if (targets_.size() != sources.size()) {
+    std::cerr << "error: The number of target voxelmaps and source point clouds must be the same!!" << std::endl;
+    abort();
+  }
+
+  size_t max_num_points = 0;
+
+  std::vector<GaussianVoxelMapGPU::ConstPtr> targets(targets_.size());
+  for (int i = 0; i < targets_.size(); i++) {
+    targets[i] = std::dynamic_pointer_cast<const GaussianVoxelMapGPU>(targets_[i]);
+    if (!targets[i]) {
+      std::cerr << "error: Failed to cast target voxelmap to GaussianVoxelMapGPU!!" << std::endl;
+    }
+
+    if (!sources[i]->has_points_gpu()) {
+      std::cerr << "error: GPU source points have not been allocated!!" << std::endl;
+    }
+
+    max_num_points = std::max(max_num_points, sources[i]->size());
+  }
+
+  std::vector<Eigen::Isometry3f> h_deltas(Ts_target_source_.size());
+  std::transform(Ts_target_source_.begin(), Ts_target_source_.end(), h_deltas.begin(), [](const Eigen::Isometry3d& delta) {
+    return delta.cast<float>();
+  });
+
+  Eigen::Isometry3f* deltas;
+  check_error << cudaMallocAsync(&deltas, sizeof(Eigen::Isometry3f) * Ts_target_source_.size(), stream);
+  check_error << cudaMemcpyAsync(deltas, h_deltas.data(), sizeof(Eigen::Isometry3f) * Ts_target_source_.size(), cudaMemcpyHostToDevice, stream);
+
+  bool* overlap;
+  check_error << cudaMallocAsync(&overlap, sizeof(bool) * max_num_points, stream);
+
+  int* num_inliers;
+  check_error << cudaMallocAsync(&num_inliers, sizeof(float) * sources.size(), stream);
+
+  char* temp_storage = nullptr;
+  size_t temp_storage_bytes = 0;
+
+  for (int i = 0; i < targets_.size(); i++) {
+    const auto& source = sources[i];
+    const auto& target = targets[i];
+    thrust::transform(
+      thrust::cuda::par_nosync.on(stream),
+      thrust::device_ptr<Eigen::Vector3f>(source->points_gpu),
+      thrust::device_ptr<Eigen::Vector3f>(source->points_gpu) + source->size(),
+      thrust::device_ptr<bool>(overlap),
+      overlap_count_kernel(*target, thrust::device_ptr<const Eigen::Isometry3f>(deltas + i)));
+
+    if (i == 0) {
+      cub::DeviceReduce::Reduce(temp_storage, temp_storage_bytes, overlap, num_inliers + i, source->size(), thrust::plus<int>(), 0, stream);
+      check_error << cudaMallocAsync(&temp_storage, temp_storage_bytes, stream);
+    }
+    cub::DeviceReduce::Reduce(temp_storage, temp_storage_bytes, overlap, num_inliers + i, source->size(), thrust::plus<int>(), 0, stream);
+  }
+
+  std::vector<int> h_num_inliers(sources.size());
+  check_error << cudaMemcpyAsync(h_num_inliers.data(), num_inliers, sizeof(int) * sources.size(), cudaMemcpyDeviceToHost, stream);
+
+  std::vector<double> overlaps(sources.size());
+  for (int i = 0; i < sources.size(); i++) {
+    overlaps[i] = static_cast<double>(h_num_inliers[i]) / sources[i]->size();
+  }
+
+  check_error << cudaFreeAsync(deltas, stream);
+  check_error << cudaFreeAsync(overlap, stream);
+  check_error << cudaFreeAsync(temp_storage, stream);
+  check_error << cudaFreeAsync(num_inliers, stream);
+
+  return overlaps;
+}
+
 }  // namespace gtsam_points
\ No newline at end of file

From 091ba0566090f5a12b99c1555ffb3160ffe89cba Mon Sep 17 00:00:00 2001
From: "k.koide" <k.koide@aist.go.jp>
Date: Wed, 28 Aug 2024 11:29:04 +0900
Subject: [PATCH 2/2] auto type conversion

---
 include/gtsam_points/types/gaussian_voxelmap.hpp | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/include/gtsam_points/types/gaussian_voxelmap.hpp b/include/gtsam_points/types/gaussian_voxelmap.hpp
index f8a2dc35..b0e0ccdf 100644
--- a/include/gtsam_points/types/gaussian_voxelmap.hpp
+++ b/include/gtsam_points/types/gaussian_voxelmap.hpp
@@ -131,6 +131,20 @@ std::enable_if_t<!std::is_same_v<VoxelMapPtr, GaussianVoxelMap::ConstPtr>, doubl
   return overlap_gpu(targets_, source, Ts_target_source, stream);
 }
 
+/// @brief Calculate the fraction of points fell in targets' voxels on GPU
+template <typename VoxelMapPtr, typename PointCloudPtr>
+std::
+  enable_if_t<!std::is_same_v<VoxelMapPtr, GaussianVoxelMap::ConstPtr> || !std::is_same_v<PointCloudPtr, PointCloud::ConstPtr>, std::vector<double>>
+  overlap_gpu(
+    const std::vector<VoxelMapPtr>& targets,
+    const std::vector<PointCloudPtr>& sources,
+    const std::vector<Eigen::Isometry3d>& Ts_target_source,
+    CUstream_st* stream = 0) {
+  const std::vector<GaussianVoxelMap::ConstPtr> targets_(targets.begin(), targets.end());
+  const std::vector<PointCloud::ConstPtr> sources_(sources.begin(), sources.end());
+  return overlap_gpu(targets_, sources_, Ts_target_source, stream);
+}
+
 // Automatically select CPU or GPU method
 double overlap_auto(const GaussianVoxelMap::ConstPtr& target, const PointCloud::ConstPtr& source, const Eigen::Isometry3d& T_target_source);