Voxel Downsample for Tensor interface #6249

Merged: 13 commits, Aug 11, 2023
50 changes: 28 additions & 22 deletions cpp/benchmarks/t/geometry/PointCloud.cpp
@@ -69,18 +69,18 @@ void LegacyVoxelDownSample(benchmark::State& state, float voxel_size) {
void VoxelDownSample(benchmark::State& state,
const core::Device& device,
float voxel_size,
const core::HashBackendType& backend) {
const std::string& reduction) {
t::geometry::PointCloud pcd;
// t::io::CreatePointCloudFromFile lacks support for remove_inf_points and
// remove_nan_points.
t::io::ReadPointCloud(path, pcd, {"auto", false, false, false});
pcd = pcd.To(device);

// Warm up.
pcd.VoxelDownSample(voxel_size, backend);
pcd.VoxelDownSample(voxel_size, reduction);

for (auto _ : state) {
pcd.VoxelDownSample(voxel_size, backend);
pcd.VoxelDownSample(voxel_size, reduction);
core::cuda::Synchronize(device);
}
}
@@ -387,28 +387,34 @@ BENCHMARK_CAPTURE(ToLegacyPointCloud, CUDA, core::Device("CUDA:0"))
->Unit(benchmark::kMillisecond);
#endif

#define ENUM_VOXELSIZE(DEVICE, BACKEND) \
BENCHMARK_CAPTURE(VoxelDownSample, BACKEND##_0_01, DEVICE, 0.01, BACKEND) \
->Unit(benchmark::kMillisecond); \
BENCHMARK_CAPTURE(VoxelDownSample, BACKEND##_0_02, DEVICE, 0.02, BACKEND) \
->Unit(benchmark::kMillisecond); \
BENCHMARK_CAPTURE(VoxelDownSample, BACKEND##_0_04, DEVICE, 0.04, BACKEND) \
->Unit(benchmark::kMillisecond); \
BENCHMARK_CAPTURE(VoxelDownSample, BACKEND##_0_08, DEVICE, 0.08, BACKEND) \
->Unit(benchmark::kMillisecond); \
BENCHMARK_CAPTURE(VoxelDownSample, BACKEND##_0_16, DEVICE, 0.16, BACKEND) \
->Unit(benchmark::kMillisecond); \
BENCHMARK_CAPTURE(VoxelDownSample, BACKEND##_0_32, DEVICE, 0.32, BACKEND) \
#define ENUM_VOXELSIZE(DEVICE, REDUCTION) \
BENCHMARK_CAPTURE(VoxelDownSample, REDUCTION##_0_01, DEVICE, 0.01, \
REDUCTION) \
->Unit(benchmark::kMillisecond); \
BENCHMARK_CAPTURE(VoxelDownSample, REDUCTION##_0_02, DEVICE, 0.02, \
REDUCTION) \
->Unit(benchmark::kMillisecond); \
BENCHMARK_CAPTURE(VoxelDownSample, REDUCTION##_0_04, DEVICE, 0.04, \
REDUCTION) \
->Unit(benchmark::kMillisecond); \
BENCHMARK_CAPTURE(VoxelDownSample, REDUCTION##_0_08, DEVICE, 0.08, \
REDUCTION) \
->Unit(benchmark::kMillisecond); \
BENCHMARK_CAPTURE(VoxelDownSample, REDUCTION##_0_16, DEVICE, 0.16, \
REDUCTION) \
->Unit(benchmark::kMillisecond); \
BENCHMARK_CAPTURE(VoxelDownSample, REDUCTION##_0_32, DEVICE, 0.32, \
REDUCTION) \
->Unit(benchmark::kMillisecond);

const std::string kReductionMean = "mean";
#ifdef BUILD_CUDA_MODULE
#define ENUM_VOXELDOWNSAMPLE_BACKEND() \
ENUM_VOXELSIZE(core::Device("CPU:0"), core::HashBackendType::TBB) \
ENUM_VOXELSIZE(core::Device("CUDA:0"), core::HashBackendType::Slab) \
ENUM_VOXELSIZE(core::Device("CUDA:0"), core::HashBackendType::StdGPU)
#define ENUM_VOXELDOWNSAMPLE_REDUCTION() \
ENUM_VOXELSIZE(core::Device("CPU:0"), kReductionMean) \
ENUM_VOXELSIZE(core::Device("CUDA:0"), kReductionMean)
#else
#define ENUM_VOXELDOWNSAMPLE_BACKEND() \
ENUM_VOXELSIZE(core::Device("CPU:0"), core::HashBackendType::TBB)
#define ENUM_VOXELDOWNSAMPLE_REDUCTION() \
ENUM_VOXELSIZE(core::Device("CPU:0"), kReductionMean)
#endif
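
// Illustrative sketch (not part of the diff): one line of
// ENUM_VOXELSIZE(core::Device("CPU:0"), kReductionMean) expands to a
// registration like the following, with the name token pasted from the
// REDUCTION argument:
//
//   BENCHMARK_CAPTURE(VoxelDownSample, kReductionMean_0_01,
//                     core::Device("CPU:0"), 0.01, kReductionMean)
//           ->Unit(benchmark::kMillisecond);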

BENCHMARK_CAPTURE(LegacyVoxelDownSample, Legacy_0_01, 0.01)
@@ -423,7 +429,7 @@ BENCHMARK_CAPTURE(LegacyVoxelDownSample, Legacy_0_16, 0.16)
->Unit(benchmark::kMillisecond);
BENCHMARK_CAPTURE(LegacyVoxelDownSample, Legacy_0_32, 0.32)
->Unit(benchmark::kMillisecond);
ENUM_VOXELDOWNSAMPLE_BACKEND()
ENUM_VOXELDOWNSAMPLE_REDUCTION()

BENCHMARK_CAPTURE(LegacyUniformDownSample, Legacy_2, 2)
->Unit(benchmark::kMillisecond);
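// Call-site sketch for the interface exercised by these benchmarks
// ("fragment.ply" is an assumed input path, not from this PR). The hash-map
// backend argument is replaced by a reduction mode string, currently "mean":
t::geometry::PointCloud pcd;
t::io::ReadPointCloud("fragment.ply", pcd, {"auto", false, false, false});
t::geometry::PointCloud down = pcd.VoxelDownSample(/*voxel_size=*/0.05, "mean");
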
3 changes: 3 additions & 0 deletions cpp/open3d/core/CMakeLists.txt
@@ -47,6 +47,8 @@ target_sources(core PRIVATE
kernel/BinaryEWCPU.cpp
kernel/IndexGetSet.cpp
kernel/IndexGetSetCPU.cpp
kernel/IndexReduction.cpp
kernel/IndexReductionCPU.cpp
kernel/Kernel.cpp
kernel/NonZero.cpp
kernel/NonZeroCPU.cpp
@@ -90,6 +92,7 @@ if (BUILD_CUDA_MODULE)
kernel/ArangeCUDA.cu
kernel/BinaryEWCUDA.cu
kernel/IndexGetSetCUDA.cu
kernel/IndexReductionCUDA.cu
kernel/NonZeroCUDA.cu
kernel/ReductionCUDA.cu
kernel/UnaryEWCUDA.cu
38 changes: 38 additions & 0 deletions cpp/open3d/core/Tensor.cpp
@@ -22,6 +22,7 @@
#include "open3d/core/TensorFunction.h"
#include "open3d/core/TensorKey.h"
#include "open3d/core/kernel/Arange.h"
#include "open3d/core/kernel/IndexReduction.h"
#include "open3d/core/kernel/Kernel.h"
#include "open3d/core/linalg/Det.h"
#include "open3d/core/linalg/Inverse.h"
@@ -955,6 +956,43 @@ void Tensor::IndexSet(const std::vector<Tensor>& index_tensors,
aip.GetIndexedShape(), aip.GetIndexedStrides());
}

void Tensor::IndexAdd_(int64_t dim, const Tensor& index, const Tensor& src) {
if (index.NumDims() != 1) {
utility::LogError("IndexAdd_ only supports 1D index tensors.");
}

// Dim check.
if (dim < 0) {
utility::LogError("IndexAdd_ only supports a non-negative dim.");
}
if (NumDims() <= dim) {
utility::LogError("Reduction dim {} exceeds tensor dimension {}.", dim, NumDims());
}

// Shape check.
if (src.NumDims() != NumDims()) {
utility::LogError(
"IndexAdd_ only supports a src tensor with the same number of "
"dimensions as this tensor.");
}
for (int64_t d = 0; d < NumDims(); ++d) {
if (d != dim && src.GetShape(d) != GetShape(d)) {
utility::LogError(
"IndexAdd_ only supports a src tensor with the same shape as "
"this tensor except at dim {}.",
dim);
}
}

// Type check.
AssertTensorDtype(index, core::Int64);
AssertTensorDtype(*this, src.GetDtype());

// Apply kernel.
kernel::IndexAdd_(dim, index, src, *this);
}
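
// Worked example of the semantics enforced above (values illustrative, not
// from this PR): duplicate indices accumulate into the same destination row.
//
//   core::Tensor dst = core::Tensor::Zeros({3, 2}, core::Float32);
//   core::Tensor index = core::Tensor::Init<int64_t>({0, 2, 0, 1});
//   core::Tensor src = core::Tensor::Ones({4, 2}, core::Float32);
//   dst.IndexAdd_(/*dim=*/0, index, src);
//   // dst == [[2, 2], [1, 1], [1, 1]]: src rows 0 and 2 add into dst[0],
//   // row 3 into dst[1], row 1 into dst[2].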

Tensor Tensor::Permute(const SizeVector& dims) const {
// Check dimension size
if (static_cast<int64_t>(dims.size()) != NumDims()) {
10 changes: 10 additions & 0 deletions cpp/open3d/core/Tensor.h
@@ -575,6 +575,16 @@ class Tensor : public IsDevice {
void IndexSet(const std::vector<Tensor>& index_tensors,
const Tensor& src_tensor);

/// \brief Advanced in-place reduction by index.
///
/// See
/// https://pytorch.org/docs/stable/generated/torch.Tensor.index_add_.html
///
/// self[index[i]] = operator(self[index[i]], src[i]).
///
/// Note: Only 1D index tensors are supported for now; \p src must match
/// the shape of this tensor on every dim except \p dim.
void IndexAdd_(int64_t dim, const Tensor& index, const Tensor& src);

/// \brief Permute (dimension shuffle) the Tensor, returns a view.
///
/// \param dims The desired ordering of dimensions.
49 changes: 49 additions & 0 deletions cpp/open3d/core/kernel/IndexReduction.cpp
@@ -0,0 +1,49 @@
// ----------------------------------------------------------------------------
// - Open3D: www.open3d.org -
// ----------------------------------------------------------------------------
// Copyright (c) 2018-2023 www.open3d.org
// SPDX-License-Identifier: MIT
// ----------------------------------------------------------------------------

#include "open3d/core/kernel/IndexReduction.h"

#include "open3d/utility/Logging.h"

namespace open3d {
namespace core {
namespace kernel {

void IndexAdd_(int64_t dim,
const Tensor& index,
const Tensor& src,
Tensor& dst) {
// Permute the reduction dimension to the front.
SizeVector permute = {};
for (int64_t d = 0; d <= dim; ++d) {
if (d == 0) {
permute.push_back(dim);
} else {
permute.push_back(d - 1);
}
}
for (int64_t d = dim + 1; d < src.NumDims(); ++d) {
permute.push_back(d);
}

auto src_permute = src.Permute(permute);
auto dst_permute = dst.Permute(permute);

if (dst.IsCPU()) {
IndexAddCPU_(dim, index, src_permute, dst_permute);
} else if (dst.IsCUDA()) {
#ifdef BUILD_CUDA_MODULE
IndexAddCUDA_(dim, index, src_permute, dst_permute);
#endif
} else {
utility::LogError("IndexAdd_: Unimplemented device");
}
}

} // namespace kernel
} // namespace core
} // namespace open3d
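
// To make the permutation concrete (shapes illustrative, not from this PR):
// for a 3D src and dim = 1, the loops above build permute = {1, 0, 2}, so
// the reduction axis is iterated first.
//
//   core::Tensor src = core::Tensor::Zeros({4, 5, 6}, core::Float32);
//   core::Tensor src_permute = src.Permute({1, 0, 2});  // view, shape {5, 4, 6}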
36 changes: 36 additions & 0 deletions cpp/open3d/core/kernel/IndexReduction.h
@@ -0,0 +1,36 @@
// ----------------------------------------------------------------------------
// - Open3D: www.open3d.org -
// ----------------------------------------------------------------------------
// Copyright (c) 2018-2023 www.open3d.org
// SPDX-License-Identifier: MIT
// ----------------------------------------------------------------------------

#pragma once

#include "open3d/core/Tensor.h"
#include "open3d/utility/Logging.h"

namespace open3d {
namespace core {
namespace kernel {

void IndexAdd_(int64_t dim,
const Tensor& index,
const Tensor& src,
Tensor& dst);

void IndexAddCPU_(int64_t dim,
const Tensor& index,
const Tensor& src,
Tensor& dst);

#ifdef BUILD_CUDA_MODULE
void IndexAddCUDA_(int64_t dim,
const Tensor& index,
const Tensor& src,
Tensor& dst);
#endif

} // namespace kernel
} // namespace core
} // namespace open3d
79 changes: 79 additions & 0 deletions cpp/open3d/core/kernel/IndexReductionCPU.cpp
@@ -0,0 +1,79 @@
// ----------------------------------------------------------------------------
// - Open3D: www.open3d.org -
// ----------------------------------------------------------------------------
// Copyright (c) 2018-2023 www.open3d.org
// SPDX-License-Identifier: MIT
// ----------------------------------------------------------------------------

#include "open3d/core/Dispatch.h"
#include "open3d/core/Indexer.h"
#include "open3d/core/Tensor.h"
#include "open3d/utility/Logging.h"

namespace open3d {
namespace core {
namespace kernel {

template <typename func_t>
void LaunchIndexReductionKernel(int64_t dim,
const Device& device,
const Tensor& index,
const Tensor& src,
Tensor& dst,
const func_t& element_kernel) {
// index: [N,], src: [N, D], dst: [M, D]
// In Indexer, the output shape defines the master strides, but in
// IndexAdd_ the iteration is driven by src. So register dst (the real
// output) as the indexer's input, and src (the real input) as its output.
Indexer indexer({dst}, src, DtypePolicy::NONE);

// The index tensor is 1D and contiguous, with stride behavior different
// from src, so a raw pointer is used for simplicity.
auto index_ptr = index.GetDataPtr<int64_t>();

int64_t broadcasting_elems = 1;
for (int64_t d = 1; d < src.NumDims(); ++d) {
broadcasting_elems *= src.GetShape(d);
}
auto element_func = [=](int64_t workload_idx) {
const int64_t reduction_idx = workload_idx / broadcasting_elems;
const int64_t broadcasting_idx = workload_idx % broadcasting_elems;

const int64_t idx = index_ptr[reduction_idx];
const int64_t dst_idx = idx * broadcasting_elems + broadcasting_idx;

void* src_ptr = indexer.GetOutputPtr(0, workload_idx);
void* dst_ptr = indexer.GetInputPtr(0, dst_idx);
// Note: input and output are swapped here to match the indexer setup
// above.
element_kernel(src_ptr, dst_ptr);
};

// TODO: Benchmark this in detail. Running serially (without OpenMP) may be
// faster here, since a parallel version would need an atomic add per
// element to handle duplicate indices.
for (int64_t d = 0; d < indexer.NumWorkloads(); ++d) {
element_func(d);
}
}
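
// For contrast, a hypothetical OpenMP variant of the serial loop above (not
// part of this PR; sketched for a concrete scalar_t after dispatch). Every
// update needs an atomic add, since duplicate indices collide on the same
// dst row:
//
//   #pragma omp parallel for
//   for (int64_t d = 0; d < indexer.NumWorkloads(); ++d) {
//       const int64_t dst_idx =
//               index_ptr[d / broadcasting_elems] * broadcasting_elems +
//               d % broadcasting_elems;
//       scalar_t* dst_s =
//               static_cast<scalar_t*>(indexer.GetInputPtr(0, dst_idx));
//       const scalar_t* src_s = static_cast<const scalar_t*>(
//               indexer.GetOutputPtr(0, d));
//   #pragma omp atomic
//       *dst_s += *src_s;
//   }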

template <typename scalar_t>
static OPEN3D_HOST_DEVICE void CPUSumKernel(const void* src, void* dst) {
scalar_t* dst_s_ptr = static_cast<scalar_t*>(dst);
const scalar_t* src_s_ptr = static_cast<const scalar_t*>(src);
*dst_s_ptr += *src_s_ptr;
}

void IndexAddCPU_(int64_t dim,
const Tensor& index,
const Tensor& src,
Tensor& dst) {
DISPATCH_FLOAT_DTYPE_TO_TEMPLATE(src.GetDtype(), [&]() {
LaunchIndexReductionKernel(dim, src.GetDevice(), index, src, dst,
[](const void* src, void* dst) {
CPUSumKernel<scalar_t>(src, dst);
});
});
}

} // namespace kernel
} // namespace core
} // namespace open3d
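
// Walk-through of the workload decomposition in element_func (numbers
// illustrative, not from this PR). For src shape {4, 3},
// broadcasting_elems = 3; at workload_idx = 7:
//   reduction_idx    = 7 / 3 = 2  -> src row 2
//   broadcasting_idx = 7 % 3 = 1  -> column 1
// With index = {5, 0, 5, 2}: idx = index[2] = 5, so
// dst_idx = 5 * 3 + 1 = 16, i.e. dst(5, 1) += src(2, 1).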