Voxel Downsample for Tensor interface #6249

Merged: 13 commits, Aug 11, 2023
50 changes: 28 additions & 22 deletions cpp/benchmarks/t/geometry/PointCloud.cpp
@@ -69,18 +69,18 @@ void LegacyVoxelDownSample(benchmark::State& state, float voxel_size) {
void VoxelDownSample(benchmark::State& state,
const core::Device& device,
float voxel_size,
const core::HashBackendType& backend) {
const std::string& reduction) {
t::geometry::PointCloud pcd;
// t::io::CreatePointCloudFromFile lacks support for remove_inf_points and
// remove_nan_points.
t::io::ReadPointCloud(path, pcd, {"auto", false, false, false});
pcd = pcd.To(device);

// Warm up.
pcd.VoxelDownSample(voxel_size, backend);
pcd.VoxelDownSample(voxel_size, reduction);

for (auto _ : state) {
pcd.VoxelDownSample(voxel_size, backend);
pcd.VoxelDownSample(voxel_size, reduction);
core::cuda::Synchronize(device);
}
}
@@ -387,28 +387,34 @@ BENCHMARK_CAPTURE(ToLegacyPointCloud, CUDA, core::Device("CUDA:0"))
->Unit(benchmark::kMillisecond);
#endif

#define ENUM_VOXELSIZE(DEVICE, BACKEND) \
BENCHMARK_CAPTURE(VoxelDownSample, BACKEND##_0_01, DEVICE, 0.01, BACKEND) \
->Unit(benchmark::kMillisecond); \
BENCHMARK_CAPTURE(VoxelDownSample, BACKEND##_0_02, DEVICE, 0.02, BACKEND) \
->Unit(benchmark::kMillisecond); \
BENCHMARK_CAPTURE(VoxelDownSample, BACKEND##_0_04, DEVICE, 0.04, BACKEND) \
->Unit(benchmark::kMillisecond); \
BENCHMARK_CAPTURE(VoxelDownSample, BACKEND##_0_08, DEVICE, 0.08, BACKEND) \
->Unit(benchmark::kMillisecond); \
BENCHMARK_CAPTURE(VoxelDownSample, BACKEND##_0_16, DEVICE, 0.16, BACKEND) \
->Unit(benchmark::kMillisecond); \
BENCHMARK_CAPTURE(VoxelDownSample, BACKEND##_0_32, DEVICE, 0.32, BACKEND) \
#define ENUM_VOXELSIZE(DEVICE, REDUCTION) \
BENCHMARK_CAPTURE(VoxelDownSample, REDUCTION##_0_01, DEVICE, 0.01, \
REDUCTION) \
->Unit(benchmark::kMillisecond); \
BENCHMARK_CAPTURE(VoxelDownSample, REDUCTION##_0_02, DEVICE, 0.02, \
REDUCTION) \
->Unit(benchmark::kMillisecond); \
BENCHMARK_CAPTURE(VoxelDownSample, REDUCTION##_0_04, DEVICE, 0.04, \
REDUCTION) \
->Unit(benchmark::kMillisecond); \
BENCHMARK_CAPTURE(VoxelDownSample, REDUCTION##_0_08, DEVICE, 0.08, \
REDUCTION) \
->Unit(benchmark::kMillisecond); \
BENCHMARK_CAPTURE(VoxelDownSample, REDUCTION##_0_16, DEVICE, 0.16, \
REDUCTION) \
->Unit(benchmark::kMillisecond); \
BENCHMARK_CAPTURE(VoxelDownSample, REDUCTION##_0_32, DEVICE, 0.32, \
REDUCTION) \
->Unit(benchmark::kMillisecond);

const std::string kReductionMean = "mean";
#ifdef BUILD_CUDA_MODULE
#define ENUM_VOXELDOWNSAMPLE_BACKEND() \
ENUM_VOXELSIZE(core::Device("CPU:0"), core::HashBackendType::TBB) \
ENUM_VOXELSIZE(core::Device("CUDA:0"), core::HashBackendType::Slab) \
ENUM_VOXELSIZE(core::Device("CUDA:0"), core::HashBackendType::StdGPU)
#define ENUM_VOXELDOWNSAMPLE_REDUCTION() \
ENUM_VOXELSIZE(core::Device("CPU:0"), kReductionMean) \
ENUM_VOXELSIZE(core::Device("CUDA:0"), kReductionMean)
#else
#define ENUM_VOXELDOWNSAMPLE_BACKEND() \
ENUM_VOXELSIZE(core::Device("CPU:0"), core::HashBackendType::TBB)
#define ENUM_VOXELDOWNSAMPLE_REDUCTION() \
ENUM_VOXELSIZE(core::Device("CPU:0"), kReductionMean)
#endif
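
// Illustrative sketch (not part of the diff): one line of
// ENUM_VOXELSIZE(core::Device("CPU:0"), kReductionMean) expands to a
// registration like the following, with the name token pasted from the
// REDUCTION argument:
//
//   BENCHMARK_CAPTURE(VoxelDownSample, kReductionMean_0_01,
//                     core::Device("CPU:0"), 0.01, kReductionMean)
//           ->Unit(benchmark::kMillisecond);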

BENCHMARK_CAPTURE(LegacyVoxelDownSample, Legacy_0_01, 0.01)
@@ -423,7 +429,7 @@ BENCHMARK_CAPTURE(LegacyVoxelDownSample, Legacy_0_16, 0.16)
->Unit(benchmark::kMillisecond);
BENCHMARK_CAPTURE(LegacyVoxelDownSample, Legacy_0_32, 0.32)
->Unit(benchmark::kMillisecond);
ENUM_VOXELDOWNSAMPLE_BACKEND()
ENUM_VOXELDOWNSAMPLE_REDUCTION()

BENCHMARK_CAPTURE(LegacyUniformDownSample, Legacy_2, 2)
->Unit(benchmark::kMillisecond);
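// Call-site sketch for the interface exercised by these benchmarks
// ("fragment.ply" is an assumed input path, not from this PR). The hash-map
// backend argument is replaced by a reduction mode string, currently "mean":
t::geometry::PointCloud pcd;
t::io::ReadPointCloud("fragment.ply", pcd, {"auto", false, false, false});
t::geometry::PointCloud down = pcd.VoxelDownSample(/*voxel_size=*/0.05, "mean");
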
3 changes: 3 additions & 0 deletions cpp/open3d/core/CMakeLists.txt
@@ -47,6 +47,8 @@ target_sources(core PRIVATE
kernel/BinaryEWCPU.cpp
kernel/IndexGetSet.cpp
kernel/IndexGetSetCPU.cpp
kernel/IndexReduction.cpp
kernel/IndexReductionCPU.cpp
kernel/Kernel.cpp
kernel/NonZero.cpp
kernel/NonZeroCPU.cpp
@@ -90,6 +92,7 @@ if (BUILD_CUDA_MODULE)
kernel/ArangeCUDA.cu
kernel/BinaryEWCUDA.cu
kernel/IndexGetSetCUDA.cu
kernel/IndexReductionCUDA.cu
kernel/NonZeroCUDA.cu
kernel/ReductionCUDA.cu
kernel/UnaryEWCUDA.cu
38 changes: 38 additions & 0 deletions cpp/open3d/core/Tensor.cpp
@@ -22,6 +22,7 @@
#include "open3d/core/TensorFunction.h"
#include "open3d/core/TensorKey.h"
#include "open3d/core/kernel/Arange.h"
#include "open3d/core/kernel/IndexReduction.h"
#include "open3d/core/kernel/Kernel.h"
#include "open3d/core/linalg/Det.h"
#include "open3d/core/linalg/Inverse.h"
@@ -955,6 +956,43 @@ void Tensor::IndexSet(const std::vector<Tensor>& index_tensors,
aip.GetIndexedShape(), aip.GetIndexedStrides());
}

void Tensor::IndexAdd_(int64_t dim, const Tensor& index, const Tensor& src) {
if (index.NumDims() != 1) {
utility::LogError("IndexAdd_ only supports 1D index tensors.");
}

// Dim check.
if (dim < 0) {
utility::LogError("IndexAdd_ only supports a non-negative dim.");
}
if (NumDims() <= dim) {
utility::LogError("Reduction dim {} exceeds tensor dimension {}.", dim, NumDims());
}

// Shape check.
if (src.NumDims() != NumDims()) {
utility::LogError(
"IndexAdd_ only supports a src tensor with the same number of "
"dimensions as this tensor.");
}
for (int64_t d = 0; d < NumDims(); ++d) {
if (d != dim && src.GetShape(d) != GetShape(d)) {
utility::LogError(
"IndexAdd_ only supports a src tensor with the same shape as "
"this tensor except at dim {}.",
dim);
}
}

// Type check.
AssertTensorDtype(index, core::Int64);
AssertTensorDtype(*this, src.GetDtype());

// Apply kernel.
kernel::IndexAdd_(dim, index, src, *this);
}
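
// Worked example of the semantics enforced above (values illustrative, not
// from this PR): duplicate indices accumulate into the same destination row.
//
//   core::Tensor dst = core::Tensor::Zeros({3, 2}, core::Float32);
//   core::Tensor index = core::Tensor::Init<int64_t>({0, 2, 0, 1});
//   core::Tensor src = core::Tensor::Ones({4, 2}, core::Float32);
//   dst.IndexAdd_(/*dim=*/0, index, src);
//   // dst == [[2, 2], [1, 1], [1, 1]]: src rows 0 and 2 add into dst[0],
//   // row 3 into dst[1], row 1 into dst[2].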

Tensor Tensor::Permute(const SizeVector& dims) const {
// Check dimension size
if (static_cast<int64_t>(dims.size()) != NumDims()) {
10 changes: 10 additions & 0 deletions cpp/open3d/core/Tensor.h
@@ -575,6 +575,16 @@ class Tensor : public IsDevice {
void IndexSet(const std::vector<Tensor>& index_tensors,
const Tensor& src_tensor);

/// \brief Advanced in-place reduction by index.
///
/// See
/// https://pytorch.org/docs/stable/generated/torch.Tensor.index_add_.html
///
/// self[index[i]] = operator(self[index[i]], src[i]).
///
/// Note: Only 1D index tensors are supported for now; \p src must match
/// the shape of this tensor on every dim except \p dim.
void IndexAdd_(int64_t dim, const Tensor& index, const Tensor& src);

/// \brief Permute (dimension shuffle) the Tensor, returns a view.
///
/// \param dims The desired ordering of dimensions.
49 changes: 49 additions & 0 deletions cpp/open3d/core/kernel/IndexReduction.cpp
@@ -0,0 +1,49 @@
// ----------------------------------------------------------------------------
// - Open3D: www.open3d.org -
// ----------------------------------------------------------------------------
// Copyright (c) 2018-2023 www.open3d.org
// SPDX-License-Identifier: MIT
// ----------------------------------------------------------------------------

#include "open3d/core/kernel/IndexReduction.h"

#include "open3d/utility/Logging.h"

namespace open3d {
namespace core {
namespace kernel {

void IndexAdd_(int64_t dim,
const Tensor& index,
const Tensor& src,
Tensor& dst) {
// Permute the reduction dimension to the front.
SizeVector permute = {};
for (int64_t d = 0; d <= dim; ++d) {
if (d == 0) {
permute.push_back(dim);
} else {
permute.push_back(d - 1);
}
}
for (int64_t d = dim + 1; d < src.NumDims(); ++d) {
permute.push_back(d);
}

auto src_permute = src.Permute(permute);
auto dst_permute = dst.Permute(permute);

if (dst.IsCPU()) {
IndexAddCPU_(dim, index, src_permute, dst_permute);
} else if (dst.IsCUDA()) {
#ifdef BUILD_CUDA_MODULE
IndexAddCUDA_(dim, index, src_permute, dst_permute);
#endif
} else {
utility::LogError("IndexAdd_: Unimplemented device");
}
}

} // namespace kernel
} // namespace core
} // namespace open3d
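
// To make the permutation concrete (shapes illustrative, not from this PR):
// for a 3D src and dim = 1, the loops above build permute = {1, 0, 2}, so
// the reduction axis is iterated first.
//
//   core::Tensor src = core::Tensor::Zeros({4, 5, 6}, core::Float32);
//   core::Tensor src_permute = src.Permute({1, 0, 2});  // view, shape {5, 4, 6}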
36 changes: 36 additions & 0 deletions cpp/open3d/core/kernel/IndexReduction.h
@@ -0,0 +1,36 @@
// ----------------------------------------------------------------------------
// - Open3D: www.open3d.org -
// ----------------------------------------------------------------------------
// Copyright (c) 2018-2023 www.open3d.org
// SPDX-License-Identifier: MIT
// ----------------------------------------------------------------------------

#pragma once

#include "open3d/core/Tensor.h"
#include "open3d/utility/Logging.h"

namespace open3d {
namespace core {
namespace kernel {

void IndexAdd_(int64_t dim,
const Tensor& index,
const Tensor& src,
Tensor& dst);

void IndexAddCPU_(int64_t dim,
const Tensor& index,
const Tensor& src,
Tensor& dst);

#ifdef BUILD_CUDA_MODULE
void IndexAddCUDA_(int64_t dim,
const Tensor& index,
const Tensor& src,
Tensor& dst);
#endif

} // namespace kernel
} // namespace core
} // namespace open3d
79 changes: 79 additions & 0 deletions cpp/open3d/core/kernel/IndexReductionCPU.cpp
@@ -0,0 +1,79 @@
// ----------------------------------------------------------------------------
// - Open3D: www.open3d.org -
// ----------------------------------------------------------------------------
// Copyright (c) 2018-2023 www.open3d.org
// SPDX-License-Identifier: MIT
// ----------------------------------------------------------------------------

#include "open3d/core/Dispatch.h"
#include "open3d/core/Indexer.h"
#include "open3d/core/Tensor.h"
#include "open3d/utility/Logging.h"

namespace open3d {
namespace core {
namespace kernel {

template <typename func_t>
void LaunchIndexReductionKernel(int64_t dim,
const Device& device,
const Tensor& index,
const Tensor& src,
Tensor& dst,
const func_t& element_kernel) {
// index: [N,], src: [N, D], dst: [M, D]
// In Indexer, the output shape defines the master strides, but in
// IndexAdd_ the iteration is driven by src. So register dst (the real
// output) as the indexer's input, and src (the real input) as its output.
Indexer indexer({dst}, src, DtypePolicy::NONE);

// The index tensor is 1D and contiguous, with stride behavior different
// from src, so a raw pointer is used for simplicity.
auto index_ptr = index.GetDataPtr<int64_t>();

int64_t broadcasting_elems = 1;
for (int64_t d = 1; d < src.NumDims(); ++d) {
broadcasting_elems *= src.GetShape(d);
}
auto element_func = [=](int64_t workload_idx) {
const int64_t reduction_idx = workload_idx / broadcasting_elems;
const int64_t broadcasting_idx = workload_idx % broadcasting_elems;

const int64_t idx = index_ptr[reduction_idx];
const int64_t dst_idx = idx * broadcasting_elems + broadcasting_idx;

void* src_ptr = indexer.GetOutputPtr(0, workload_idx);
void* dst_ptr = indexer.GetInputPtr(0, dst_idx);
// Note: input and output are swapped here to match the indexer setup
// above.
element_kernel(src_ptr, dst_ptr);
};

// TODO: Benchmark this in detail. Running serially (without OpenMP) may be
// faster here, since a parallel version would need an atomic add per
// element to handle duplicate indices.
for (int64_t d = 0; d < indexer.NumWorkloads(); ++d) {
element_func(d);
}
}
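
// For contrast, a hypothetical OpenMP variant of the serial loop above (not
// part of this PR; sketched for a concrete scalar_t after dispatch). Every
// update needs an atomic add, since duplicate indices collide on the same
// dst row:
//
//   #pragma omp parallel for
//   for (int64_t d = 0; d < indexer.NumWorkloads(); ++d) {
//       const int64_t dst_idx =
//               index_ptr[d / broadcasting_elems] * broadcasting_elems +
//               d % broadcasting_elems;
//       scalar_t* dst_s =
//               static_cast<scalar_t*>(indexer.GetInputPtr(0, dst_idx));
//       const scalar_t* src_s = static_cast<const scalar_t*>(
//               indexer.GetOutputPtr(0, d));
//   #pragma omp atomic
//       *dst_s += *src_s;
//   }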

template <typename scalar_t>
static OPEN3D_HOST_DEVICE void CPUSumKernel(const void* src, void* dst) {
scalar_t* dst_s_ptr = static_cast<scalar_t*>(dst);
const scalar_t* src_s_ptr = static_cast<const scalar_t*>(src);
*dst_s_ptr += *src_s_ptr;
}

void IndexAddCPU_(int64_t dim,
const Tensor& index,
const Tensor& src,
Tensor& dst) {
DISPATCH_FLOAT_DTYPE_TO_TEMPLATE(src.GetDtype(), [&]() {
LaunchIndexReductionKernel(dim, src.GetDevice(), index, src, dst,
[](const void* src, void* dst) {
CPUSumKernel<scalar_t>(src, dst);
});
});
}

} // namespace kernel
} // namespace core
} // namespace open3d
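
// Walk-through of the workload decomposition in element_func (numbers
// illustrative, not from this PR). For src shape {4, 3},
// broadcasting_elems = 3; at workload_idx = 7:
//   reduction_idx    = 7 / 3 = 2  -> src row 2
//   broadcasting_idx = 7 % 3 = 1  -> column 1
// With index = {5, 0, 5, 2}: idx = index[2] = 5, so
// dst_idx = 5 * 3 + 1 = 16, i.e. dst(5, 1) += src(2, 1).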