From f34a2270b4dea7f8bce25f5ca8028e01385df1a7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micha=C5=82=20Zientkiewicz?=
Date: Thu, 19 Sep 2024 17:34:42 +0200
Subject: [PATCH] Optimize TensorList::Resize (#5638)

* simple inline functions are moved to the header
* shared_ptr in ShareData is now passed by value, allowing move semantics and
  reducing the number of atomic operations
* some code motion to improve inlining (e.g. wrapping frequent calls to
  DLL_PUBLIC functions into a trampoline function)

---------

Signed-off-by: Michal Zientkiewicz
---
 dali/pipeline/data/buffer.cc           |  4 +-
 dali/pipeline/data/buffer.h            | 17 +++---
 dali/pipeline/data/tensor.h            |  7 ++-
 dali/pipeline/data/tensor_list.cc      | 75 +++++---------------------
 dali/pipeline/data/tensor_list.h       | 41 +++++++++-----
 dali/pipeline/data/tensor_list_test.cc | 23 +++++++-
 6 files changed, 79 insertions(+), 88 deletions(-)

diff --git a/dali/pipeline/data/buffer.cc b/dali/pipeline/data/buffer.cc
index 5d056407fd..0244f4595f 100644
--- a/dali/pipeline/data/buffer.cc
+++ b/dali/pipeline/data/buffer.cc
@@ -1,4 +1,4 @@
-// Copyright (c) 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright (c) 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -63,7 +63,7 @@ DLL_PUBLIC shared_ptr<uint8_t> AllocBuffer(size_t bytes, bool pinned,
 }
 
 DLL_PUBLIC bool RestrictPinnedMemUsage() {
-  static bool val = []() {
+  static const bool val = []() {
     const char *env = getenv("DALI_RESTRICT_PINNED_MEM");
     return env && atoi(env);
   }();
diff --git a/dali/pipeline/data/buffer.h b/dali/pipeline/data/buffer.h
index 4aebebb171..93a21b8c09 100644
--- a/dali/pipeline/data/buffer.h
+++ b/dali/pipeline/data/buffer.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright (c) 2017-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -302,7 +302,7 @@ class DLL_PUBLIC Buffer {
     return !!data_;
   }
 
-  std::shared_ptr<void> get_data_ptr() const {
+  const std::shared_ptr<void> &get_data_ptr() const {
     return data_;
   }
 
@@ -549,7 +549,7 @@ class DLL_PUBLIC Buffer {
    *
    * @remark If order is empty, current order is used.
    */
-  inline void set_backing_allocation(const shared_ptr<void> &ptr, size_t bytes, bool pinned,
+  inline void set_backing_allocation(shared_ptr<void> ptr, size_t bytes, bool pinned,
                                      DALIDataType type, size_t size, int device_id,
                                      AccessOrder order = {}) {
     if (!same_managed_object(data_, ptr))
@@ -562,7 +562,7 @@ class DLL_PUBLIC Buffer {
 
     // Fill the remaining members in the order as they appear in class.
     type_ = TypeTable::GetTypeInfo(type);
-    data_ = ptr;
+    data_ = std::move(ptr);
     allocate_ = {};
     size_ = size;
     shares_data_ = data_ != nullptr;
@@ -674,7 +674,10 @@ class DLL_PUBLIC Buffer {
   static double growth_factor_;
   static double shrink_threshold_;
 
-  static bool default_pinned();
+  static bool default_pinned() {
+    static const bool pinned = !RestrictPinnedMemUsage();
+    return pinned;
+  }
 
   TypeInfo type_ = {};               // Data type of underlying storage
   shared_ptr<void> data_ = nullptr;  // Pointer to underlying storage
@@ -683,8 +686,8 @@ class DLL_PUBLIC Buffer {
   size_t num_bytes_ = 0;             // To keep track of the true size of the underlying allocation
   int device_ = CPU_ONLY_DEVICE_ID;  // device the buffer was allocated on
   AccessOrder order_ = AccessOrder::host();  // The order of memory access (host or device)
-  bool shares_data_ = false;  // Whether we aren't using our own allocation
-  bool pinned_ = !RestrictPinnedMemUsage();  // Whether the allocation uses pinned memory
+  bool shares_data_ = false;        // Whether we aren't using our own allocation
+  bool pinned_ = default_pinned();  // Whether the allocation uses pinned memory
 };
 
 template <typename Backend>
diff --git a/dali/pipeline/data/tensor.h b/dali/pipeline/data/tensor.h
index 5c25939a3b..3088b4365c 100644
--- a/dali/pipeline/data/tensor.h
+++ b/dali/pipeline/data/tensor.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright (c) 2017-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -44,7 +44,6 @@ class Tensor : public Buffer<Backend> {
   inline Tensor() {}
   inline ~Tensor() override = default;
 
-
   /**
    *
    * @brief For tensor T of shape (s_0, s_1, ..., s_{n-1}) returns a n-1 dimensional tensor T'
@@ -226,7 +225,7 @@
    * individually. The device_id describes the location of the memory and the order can describe
    * the dependency on the work that is happening on another device.
    */
-  inline void ShareData(const shared_ptr<void> &ptr, size_t bytes, bool pinned,
+  inline void ShareData(shared_ptr<void> ptr, size_t bytes, bool pinned,
                         const TensorShape<> &shape, DALIDataType type, int device_id,
                         AccessOrder order = {}) {
     Index new_size = volume(shape);
@@ -243,7 +242,7 @@
 
     // Save our new pointer and bytes. Reset our type, shape, and size
     type_ = TypeTable::GetTypeInfo(type);
-    data_ = ptr;
+    data_ = std::move(ptr);
     size_ = new_size;
     num_bytes_ = bytes;
     device_ = device_id;
diff --git a/dali/pipeline/data/tensor_list.cc b/dali/pipeline/data/tensor_list.cc
index 8fd0d0f22a..13c33bd809 100644
--- a/dali/pipeline/data/tensor_list.cc
+++ b/dali/pipeline/data/tensor_list.cc
@@ -1,4 +1,4 @@
-// Copyright (c) 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright (c) 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -301,7 +301,7 @@ void TensorList<Backend>::SetSample(int sample_idx, const Tensor<Backend> &owner
 
 
 template <typename Backend>
-void TensorList<Backend>::SetSample(int sample_idx, const shared_ptr<void> &ptr, size_t bytes,
+void TensorList<Backend>::SetSample(int sample_idx, shared_ptr<void> ptr, size_t bytes,
                                     bool pinned, const TensorShape<> &shape, DALIDataType type,
                                     int device_id, AccessOrder order, const TensorLayout &layout) {
   // Bounds check
@@ -316,7 +316,7 @@ void TensorList<Backend>::SetSample(int sample_idx, const shared_ptr<void> &ptr,
 
   // Setting a new share overwrites the previous one - so we can safely assume that even if
   // we had a sample sharing into TL, it will be overwritten
-  tensors_[sample_idx].ShareData(ptr, bytes, pinned, shape, type, device_id, order);
+  tensors_[sample_idx].ShareData(std::move(ptr), bytes, pinned, shape, type, device_id, order);
   // As the order was simply copied over, we have to fix it back.
   // We will be accessing it in order of this buffer, so we need to wait for all the work
   // from the "incoming" src order.
@@ -460,13 +460,6 @@ std::vector<size_t> TensorList<Backend>::_chunks_capacity() const {
   return result;
 }
 
-
-template <typename Backend>
-const TensorListShape<> &TensorList<Backend>::shape() const & {
-  return shape_;
-}
-
-
 template <typename Backend>
 void TensorList<Backend>::set_order(AccessOrder order, bool synchronize) {
   DALI_ENFORCE(order, "Resetting order to an empty one is not supported");
@@ -529,6 +522,7 @@ void TensorList<Backend>::Resize(const TensorListShape<> &new_shape, DALIDataTyp
   if (old_size < new_shape.num_samples()) {
     tensors_.resize(new_shape.num_samples());
   }
+
   for (int i = old_size; i < new_shape.num_samples(); i++) {
     setup_tensor_allocation(i);
   }
@@ -575,6 +569,7 @@ void TensorList<Backend>::Resize(const TensorListShape<> &new_shape, DALIDataTyp
   for (int i = 0; i < curr_num_tensors_; i++) {
     tensors_[i].Resize(new_shape[i], new_type);
   }
+
   if (curr_num_tensors_ > 0) {
     order_ = tensors_[0].order();
     device_ = tensors_[0].device_id();
@@ -629,19 +624,6 @@ void TensorList<Backend>::set_type(DALIDataType new_type_id) {
   }
 }
 
-
-template <typename Backend>
-DALIDataType TensorList<Backend>::type() const {
-  return type_.id();
-}
-
-
-template <typename Backend>
-const TypeInfo &TensorList<Backend>::type_info() const {
-  return type_;
-}
-
-
 template <typename Backend>
 void TensorList<Backend>::SetLayout(const TensorLayout &layout) {
   for (auto &t : tensors_) {
@@ -662,13 +644,6 @@ void TensorList<Backend>::SetSourceInfo(int idx, const std::string &source_info)
   tensors_[idx].SetSourceInfo(source_info);
 }
 
-
-template <typename Backend>
-TensorLayout TensorList<Backend>::GetLayout() const {
-  return layout_;
-}
-
-
 template <typename Backend>
 const DALIMeta &TensorList<Backend>::GetMeta(int idx) const {
   assert(idx < curr_num_tensors_);
@@ -695,13 +670,6 @@ void TensorList<Backend>::set_pinned(bool pinned) {
   pinned_ = pinned;
 }
 
-
-template <typename Backend>
-bool TensorList<Backend>::is_pinned() const {
-  return pinned_;
-}
-
-
 template <typename Backend>
 void TensorList<Backend>::set_device_id(int device_id) {
   contiguous_buffer_.set_device_id(device_id);
@@ -711,13 +679,6 @@ void TensorList<Backend>::set_device_id(int device_id) {
   device_ = device_id;
 }
 
-
-template <typename Backend>
-int TensorList<Backend>::device_id() const {
-  return device_;
-}
-
-
 template <typename Backend>
 void TensorList<Backend>::reserve(size_t total_bytes) {
   int batch_size_bkp = curr_num_tensors_;
@@ -744,30 +705,18 @@ void TensorList<Backend>::reserve(size_t bytes_per_sample, int batch_size) {
   }
 }
 
-
-template <typename Backend>
-bool TensorList<Backend>::IsContiguous() const noexcept {
-  return state_.IsContiguous();
-}
-
-
-template <typename Backend>
-BatchContiguity TensorList<Backend>::GetContiguity() const noexcept {
-  return state_.Get();
-}
-
-
 template <typename Backend>
 void TensorList<Backend>::recreate_views() {
   // precondition: type, shape are configured
   uint8_t *sample_ptr = static_cast<uint8_t *>(contiguous_buffer_.raw_mutable_data());
   int64_t num_samples = shape().num_samples();
+  auto &data_ptr = contiguous_buffer_.get_data_ptr();
   for (int64_t i = 0; i < num_samples; i++) {
     // or any other way
     auto tensor_size = shape().tensor_size(i);
-    std::shared_ptr<void> sample_alias(contiguous_buffer_.get_data_ptr(), sample_ptr);
-    tensors_[i].ShareData(sample_alias, tensor_size * type_info().size(), is_pinned(), shape()[i],
+    tensors_[i].ShareData(std::shared_ptr<void>(data_ptr, sample_ptr),
+                          tensor_size * type_info().size(), is_pinned(), shape()[i],
                           type(), device_id(), order());
     tensors_[i].SetLayout(GetLayout());
     sample_ptr += tensor_size * type_info().size();
@@ -996,7 +945,8 @@ Tensor<Backend> TensorList<Backend>::AsReshapedTensor(const TensorShape<> &new_s
     ptr = nullptr;
   }
 
-  result.ShareData(ptr, capacity(), is_pinned(), new_shape, type(), device_id(), order());
+  result.ShareData(std::move(ptr), capacity(), is_pinned(),
+                   new_shape, type(), device_id(), order());
 
   auto result_layout = GetLayout();
   if (result_layout.ndim() + 1 == new_shape.sample_dim()) {
@@ -1022,10 +972,11 @@ Tensor<Backend> TensorList<Backend>::AsTensor() {
 
 
 template <typename Backend>
-void TensorList<Backend>::ShareData(const shared_ptr<void> &ptr, size_t bytes, bool pinned,
+void TensorList<Backend>::ShareData(shared_ptr<void> ptr, size_t bytes, bool pinned,
                                     const TensorListShape<> &shape, DALIDataType type,
                                     int device_id, AccessOrder order, const TensorLayout &layout) {
-  contiguous_buffer_.set_backing_allocation(ptr, bytes, pinned, type, shape.num_elements(),
+  contiguous_buffer_.set_backing_allocation(std::move(ptr), bytes, pinned,
+                                            type, shape.num_elements(),
                                             device_id, order);
   buffer_bkp_.reset();
   tensors_.clear();
diff --git a/dali/pipeline/data/tensor_list.h b/dali/pipeline/data/tensor_list.h
index 647089ee77..d39332f928 100644
--- a/dali/pipeline/data/tensor_list.h
+++ b/dali/pipeline/data/tensor_list.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2019-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright (c) 2019-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -143,7 +143,9 @@ class DLL_PUBLIC TensorList {
   /**
    * @brief Get the shape of the batch.
   */
-  const TensorListShape<> &shape() const &;
+  const TensorListShape<> &shape() const & {
+    return shape_;
+  }
 
   /**
    * @brief Get the shape of the sample.
@@ -273,7 +275,7 @@ class DLL_PUBLIC TensorList {
   * We wait for the order of incoming sample in the order of the batch to allow correctly ordered
   * access of the new sample.
   */
-  DLL_PUBLIC void SetSample(int sample_idx, const shared_ptr<void> &ptr, size_t bytes, bool pinned,
+  DLL_PUBLIC void SetSample(int sample_idx, shared_ptr<void> ptr, size_t bytes, bool pinned,
                             const TensorShape<> &shape, DALIDataType type, int device_id,
                             AccessOrder order, const TensorLayout &layout = "");
   /** @} */
@@ -325,14 +327,18 @@ class DLL_PUBLIC TensorList {
   /**
   * @brief Get the type of samples in the batch.
   */
-  DALIDataType type() const;
+  DALIDataType type() const {
+    return type_.id();
+  }
 
   /**
   * @brief Get the TypeInfo of samples in the batch.
   *
   * @note Using DALIDataType via type() is recommended over accessing type_info().
  */
-  const TypeInfo &type_info() const;
+  const TypeInfo &type_info() const {
+    return type_;
+  }
   /** @} */
 
   /**
@@ -428,7 +434,10 @@ class DLL_PUBLIC TensorList {
   /**
   * @brief If the batch is backed by contiguous buffer
   */
-  bool IsContiguous() const noexcept;
+  bool IsContiguous() const noexcept {
+    return state_.IsContiguous();
+  }
+
 
   /**
   * @brief Pin the current state for further allocating calls like Resize() or set_type
@@ -440,7 +449,9 @@ class DLL_PUBLIC TensorList {
   /**
   * @brief Check the batch contiguity state.
   */
-  BatchContiguity GetContiguity() const noexcept;
+  BatchContiguity GetContiguity() const noexcept {
+    return state_.Get();
+  }
 
   /**
   * @brief Coalesce from individual samples to a contiguous buffer if the conditions are met.
@@ -472,7 +483,7 @@
   /**
   * @brief Set the provided buffer as backing memory for this batch.
   */
-  DLL_PUBLIC void ShareData(const shared_ptr<void> &ptr, size_t bytes, bool pinned,
+  DLL_PUBLIC void ShareData(shared_ptr<void> ptr, size_t bytes, bool pinned,
                             const TensorListShape<> &shape, DALIDataType type, int device_id,
                             AccessOrder order = {}, const TensorLayout &layout = "");
 
@@ -483,11 +494,15 @@
 
   void set_pinned(bool pinned);
 
-  bool is_pinned() const;
+  bool is_pinned() const {
+    return pinned_;
+  }
 
   void set_device_id(int device_id);
 
-  int device_id() const;
+  int device_id() const {
+    return device_;
+  }
 
   bool has_data() const;
 
@@ -531,7 +546,9 @@
   /**
   * @brief Get the layout of the sample in the batch.
   */
-  TensorLayout GetLayout() const;
+  TensorLayout GetLayout() const {
+    return layout_;
+  }
 
   /**
   * @brief Set cache metadata for given sample
@@ -817,7 +834,7 @@
   * Only allowed for contiguous batch, in typical scenario it is equivalent to
   * unsafe_sample_owner(batch, 0)
   */
-  friend shared_ptr<void> unsafe_owner(TensorList<Backend> &batch) {
+  friend const shared_ptr<void> &unsafe_owner(TensorList<Backend> &batch) {
    DALI_ENFORCE(batch.IsContiguous(),
                 "Data owner pointer can be obtain only for contiguous TensorList.");
    return batch.contiguous_buffer_.get_data_ptr();
diff --git a/dali/pipeline/data/tensor_list_test.cc b/dali/pipeline/data/tensor_list_test.cc
index dff32d87c9..4cef10721b 100644
--- a/dali/pipeline/data/tensor_list_test.cc
+++ b/dali/pipeline/data/tensor_list_test.cc
@@ -1,4 +1,4 @@
-// Copyright (c) 2019-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright (c) 2019-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -34,6 +34,7 @@
 #include "dali/pipeline/data/views.h"
 #include "dali/test/dali_test.h"
 #include "dali/test/tensor_test_utils.h"
+#include "dali/test/timing.h"
 
 namespace dali {
 namespace test {
@@ -1992,6 +1993,26 @@ TEST_F(TensorListVariableBatchSizeTest, UpdatePropertiesFromSamples) {
   tv.SetSample(1, tv.tensor_handle(2));
 }
 
+TEST(TensorList, ResizeOverheadPerf) {
+  cudaFree(0);
+  int niter = 20000;
+  int total_size = 256 << 10;
+  int nsamples = 1024;
+  auto shape = uniform_list_shape(nsamples, {total_size / nsamples});
+  for (int i = 0; i < 5000; i++) {
+    TensorList<CPUBackend> tl;
+    tl.set_pinned(false);
+    tl.Resize(shape, DALI_UINT8);
+  }
+  auto start = perf_timer::now();
+  for (int i = 0; i < niter; i++) {
+    TensorList<CPUBackend> tl;
+    tl.set_pinned(false);
+    tl.Resize(shape, DALI_UINT8);
+  }
+  auto end = perf_timer::now();
+  std::cout << format_time((end - start) / niter) << std::endl;
+}
+
 } // namespace test
 } // namespace dali
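
Illustration (not part of the patch): a minimal, self-contained sketch of the pass-by-value shared_ptr + std::move idiom that the second commit-message bullet relies on; the Holder type and set_backing function are invented names, not DALI APIs. Passing the shared_ptr by value lets the caller choose the cost: copying into the parameter performs one atomic reference-count increment, moving into it performs none, and the callee's std::move into the member adds no further atomic traffic. The same reasoning applies to Buffer::set_backing_allocation, Tensor::ShareData and TensorList::ShareData in the diff above; default_pinned() in buffer.h follows the third bullet, caching the result of a DLL_PUBLIC call in a function-local static behind a small inlinable wrapper.

// illustration.cc -- hypothetical standalone example, not a DALI source file.
#include <iostream>
#include <memory>
#include <utility>

struct Holder {
  std::shared_ptr<void> data_;

  // By-value parameter: the caller either copies into it (one atomic
  // ref-count increment) or moves into it (no atomic operation at all).
  // Moving the parameter into the member never touches the ref-count.
  void set_backing(std::shared_ptr<void> ptr) {
    data_ = std::move(ptr);
  }
};

int main() {
  Holder h;
  std::shared_ptr<void> buf = std::make_shared<int>(42);
  h.set_backing(buf);             // caller keeps buf: exactly one increment
  h.set_backing(std::move(buf));  // caller hands buf over: no ref-count traffic
  std::cout << h.data_.use_count() << "\n";  // prints 1
  return 0;
}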