Optimize TensorList::Resize (#5638)
* simple inline functions are moved to the header
* shared_ptr in ShareData is now passed by value, allowing move semantics and reducing the number of atomic reference-count operations
* some code motion to improve inlining (e.g. wrapping frequent calls to DLL_PUBLIC functions in a trampoline function); a sketch of the by-value and trampoline patterns follows below
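
A minimal, hypothetical C++ sketch (not DALI code) of the last two bullets: passing shared_ptr by value and moving it into place skips the atomic reference-count increment that a copy from a const reference pays, and a header-level inline wrapper around a one-time exported call keeps the hot path free of cross-library calls. The names Sink, set_data_by_value, expensive_exported_query and cached_query are invented for illustration.

```cpp
#include <cstdio>
#include <memory>
#include <utility>

struct Sink {
  std::shared_ptr<void> data_;

  // Old pattern: const reference in, copy inside -> one atomic increment per
  // call (plus a decrement when the previous value is released).
  void set_data_by_ref(const std::shared_ptr<void> &ptr) {
    data_ = ptr;
  }

  // New pattern: by value + move. A caller handing over an rvalue transfers
  // ownership without touching the reference count at all.
  void set_data_by_value(std::shared_ptr<void> ptr) {
    data_ = std::move(ptr);
  }
};

// Stand-in for a DLL_PUBLIC function that is comparatively expensive to call
// across a shared-library boundary.
bool expensive_exported_query() {
  return true;  // e.g. parse an environment variable
}

// Trampoline: the exported function runs once; later calls inline to a cached read.
inline bool cached_query() {
  static const bool value = expensive_exported_query();
  return value;
}

int main() {
  Sink s;
  std::shared_ptr<void> buf(new int(42),
                            [](void *p) { delete static_cast<int *>(p); });
  s.set_data_by_value(std::move(buf));  // ownership moved, no ref-count bump
  std::printf("cached_query() = %d\n", static_cast<int>(cached_query()));
  return 0;
}
```

In the diff below, the same two patterns appear as ShareData()/set_backing_allocation() taking shared_ptr<void> by value and as default_pinned() caching the result of RestrictPinnedMemUsage() in the header.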

---------

Signed-off-by: Michal Zientkiewicz <michalz@nvidia.com>
mzient authored Sep 19, 2024
1 parent 94f02ad commit f34a227
Showing 6 changed files with 79 additions and 88 deletions.
4 changes: 2 additions & 2 deletions dali/pipeline/data/buffer.cc
@@ -1,4 +1,4 @@
// Copyright (c) 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// Copyright (c) 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -63,7 +63,7 @@ DLL_PUBLIC shared_ptr<uint8_t> AllocBuffer(size_t bytes, bool pinned,
}

DLL_PUBLIC bool RestrictPinnedMemUsage() {
static bool val = []() {
static const bool val = []() {
const char *env = getenv("DALI_RESTRICT_PINNED_MEM");
return env && atoi(env);
}();
17 changes: 10 additions & 7 deletions dali/pipeline/data/buffer.h
@@ -1,4 +1,4 @@
// Copyright (c) 2017-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// Copyright (c) 2017-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -302,7 +302,7 @@ class DLL_PUBLIC Buffer {
return !!data_;
}

std::shared_ptr<void> get_data_ptr() const {
const std::shared_ptr<void> &get_data_ptr() const {
return data_;
}

@@ -549,7 +549,7 @@ class DLL_PUBLIC Buffer {
*
* @remark If order is empty, current order is used.
*/
inline void set_backing_allocation(const shared_ptr<void> &ptr, size_t bytes, bool pinned,
inline void set_backing_allocation(shared_ptr<void> ptr, size_t bytes, bool pinned,
DALIDataType type, size_t size, int device_id,
AccessOrder order = {}) {
if (!same_managed_object(data_, ptr))
@@ -562,7 +562,7 @@

// Fill the remaining members in the order as they appear in class.
type_ = TypeTable::GetTypeInfo(type);
data_ = ptr;
data_ = std::move(ptr);
allocate_ = {};
size_ = size;
shares_data_ = data_ != nullptr;
@@ -674,7 +674,10 @@ class DLL_PUBLIC Buffer {
static double growth_factor_;
static double shrink_threshold_;

static bool default_pinned();
static bool default_pinned() {
static const bool pinned = !RestrictPinnedMemUsage();
return pinned;
}

TypeInfo type_ = {}; // Data type of underlying storage
shared_ptr<void> data_ = nullptr; // Pointer to underlying storage
@@ -683,8 +686,8 @@
size_t num_bytes_ = 0; // To keep track of the true size of the underlying allocation
int device_ = CPU_ONLY_DEVICE_ID; // device the buffer was allocated on
AccessOrder order_ = AccessOrder::host(); // The order of memory access (host or device)
bool shares_data_ = false; // Whether we aren't using our own allocation
bool pinned_ = !RestrictPinnedMemUsage(); // Whether the allocation uses pinned memory
bool shares_data_ = false; // Whether we aren't using our own allocation
bool pinned_ = default_pinned(); // Whether the allocation uses pinned memory
};

template <typename Backend>
7 changes: 3 additions & 4 deletions dali/pipeline/data/tensor.h
@@ -1,4 +1,4 @@
// Copyright (c) 2017-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// Copyright (c) 2017-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -44,7 +44,6 @@ class Tensor : public Buffer<Backend> {
inline Tensor() {}
inline ~Tensor() override = default;


/**
*
* @brief For tensor T of shape (s_0, s_1, ..., s_{n-1}) returns a n-1 dimensional tensor T'
@@ -226,7 +225,7 @@ class Tensor : public Buffer<Backend> {
* individually. The device_id describes the location of the memory and the order can describe
* the dependency on the work that is happening on another device.
*/
inline void ShareData(const shared_ptr<void> &ptr, size_t bytes, bool pinned,
inline void ShareData(shared_ptr<void> ptr, size_t bytes, bool pinned,
const TensorShape<> &shape, DALIDataType type, int device_id,
AccessOrder order = {}) {
Index new_size = volume(shape);
@@ -243,7 +242,7 @@

// Save our new pointer and bytes. Reset our type, shape, and size
type_ = TypeTable::GetTypeInfo(type);
data_ = ptr;
data_ = std::move(ptr);
size_ = new_size;
num_bytes_ = bytes;
device_ = device_id;
75 changes: 13 additions & 62 deletions dali/pipeline/data/tensor_list.cc
@@ -1,4 +1,4 @@
// Copyright (c) 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// Copyright (c) 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -301,7 +301,7 @@ void TensorList<Backend>::SetSample(int sample_idx, const Tensor<Backend> &owner


template <typename Backend>
void TensorList<Backend>::SetSample(int sample_idx, const shared_ptr<void> &ptr, size_t bytes,
void TensorList<Backend>::SetSample(int sample_idx, shared_ptr<void> ptr, size_t bytes,
bool pinned, const TensorShape<> &shape, DALIDataType type,
int device_id, AccessOrder order, const TensorLayout &layout) {
// Bounds check
@@ -316,7 +316,7 @@ void TensorList<Backend>::SetSample(int sample_idx, const shared_ptr<void> &ptr,

// Setting a new share overwrites the previous one - so we can safely assume that even if
// we had a sample sharing into TL, it will be overwritten
tensors_[sample_idx].ShareData(ptr, bytes, pinned, shape, type, device_id, order);
tensors_[sample_idx].ShareData(std::move(ptr), bytes, pinned, shape, type, device_id, order);
// As the order was simply copied over, we have to fix it back.
// We will be accessing it in order of this buffer, so we need to wait for all the work
// from the "incoming" src order.
@@ -460,13 +460,6 @@ std::vector<size_t> TensorList<Backend>::_chunks_capacity() const {
return result;
}


template <typename Backend>
const TensorListShape<> &TensorList<Backend>::shape() const & {
return shape_;
}


template <typename Backend>
void TensorList<Backend>::set_order(AccessOrder order, bool synchronize) {
DALI_ENFORCE(order, "Resetting order to an empty one is not supported");
@@ -529,6 +522,7 @@ void TensorList<Backend>::Resize(const TensorListShape<> &new_shape, DALIDataTyp
if (old_size < new_shape.num_samples()) {
tensors_.resize(new_shape.num_samples());
}

for (int i = old_size; i < new_shape.num_samples(); i++) {
setup_tensor_allocation(i);
}
@@ -575,6 +569,7 @@ void TensorList<Backend>::Resize(const TensorListShape<> &new_shape, DALIDataTyp
for (int i = 0; i < curr_num_tensors_; i++) {
tensors_[i].Resize(new_shape[i], new_type);
}

if (curr_num_tensors_ > 0) {
order_ = tensors_[0].order();
device_ = tensors_[0].device_id();
@@ -629,19 +624,6 @@ void TensorList<Backend>::set_type(DALIDataType new_type_id) {
}
}


template <typename Backend>
DALIDataType TensorList<Backend>::type() const {
return type_.id();
}


template <typename Backend>
const TypeInfo &TensorList<Backend>::type_info() const {
return type_;
}


template <typename Backend>
void TensorList<Backend>::SetLayout(const TensorLayout &layout) {
for (auto &t : tensors_) {
@@ -662,13 +644,6 @@ void TensorList<Backend>::SetSourceInfo(int idx, const std::string &source_info)
tensors_[idx].SetSourceInfo(source_info);
}


template <typename Backend>
TensorLayout TensorList<Backend>::GetLayout() const {
return layout_;
}


template <typename Backend>
const DALIMeta &TensorList<Backend>::GetMeta(int idx) const {
assert(idx < curr_num_tensors_);
@@ -695,13 +670,6 @@ void TensorList<Backend>::set_pinned(bool pinned) {
pinned_ = pinned;
}


template <typename Backend>
bool TensorList<Backend>::is_pinned() const {
return pinned_;
}


template <typename Backend>
void TensorList<Backend>::set_device_id(int device_id) {
contiguous_buffer_.set_device_id(device_id);
@@ -711,13 +679,6 @@ void TensorList<Backend>::set_device_id(int device_id) {
device_ = device_id;
}


template <typename Backend>
int TensorList<Backend>::device_id() const {
return device_;
}


template <typename Backend>
void TensorList<Backend>::reserve(size_t total_bytes) {
int batch_size_bkp = curr_num_tensors_;
@@ -744,30 +705,18 @@ void TensorList<Backend>::reserve(size_t bytes_per_sample, int batch_size) {
}
}


template <typename Backend>
bool TensorList<Backend>::IsContiguous() const noexcept {
return state_.IsContiguous();
}


template <typename Backend>
BatchContiguity TensorList<Backend>::GetContiguity() const noexcept {
return state_.Get();
}


template <typename Backend>
void TensorList<Backend>::recreate_views() {
// precondition: type, shape are configured
uint8_t *sample_ptr = static_cast<uint8_t *>(contiguous_buffer_.raw_mutable_data());
int64_t num_samples = shape().num_samples();
auto &data_ptr = contiguous_buffer_.get_data_ptr();
for (int64_t i = 0; i < num_samples; i++) {
// or any other way
auto tensor_size = shape().tensor_size(i);

std::shared_ptr<void> sample_alias(contiguous_buffer_.get_data_ptr(), sample_ptr);
tensors_[i].ShareData(sample_alias, tensor_size * type_info().size(), is_pinned(), shape()[i],
tensors_[i].ShareData(std::shared_ptr<void>(data_ptr, sample_ptr),
tensor_size * type_info().size(), is_pinned(), shape()[i],
type(), device_id(), order());
tensors_[i].SetLayout(GetLayout());
sample_ptr += tensor_size * type_info().size();
@@ -996,7 +945,8 @@ Tensor<Backend> TensorList<Backend>::AsReshapedTensor(const TensorShape<> &new_s
ptr = nullptr;
}

result.ShareData(ptr, capacity(), is_pinned(), new_shape, type(), device_id(), order());
result.ShareData(std::move(ptr), capacity(), is_pinned(),
new_shape, type(), device_id(), order());

auto result_layout = GetLayout();
if (result_layout.ndim() + 1 == new_shape.sample_dim()) {
@@ -1022,10 +972,11 @@ Tensor<Backend> TensorList<Backend>::AsTensor() {


template <typename Backend>
void TensorList<Backend>::ShareData(const shared_ptr<void> &ptr, size_t bytes, bool pinned,
void TensorList<Backend>::ShareData(shared_ptr<void> ptr, size_t bytes, bool pinned,
const TensorListShape<> &shape, DALIDataType type,
int device_id, AccessOrder order, const TensorLayout &layout) {
contiguous_buffer_.set_backing_allocation(ptr, bytes, pinned, type, shape.num_elements(),
contiguous_buffer_.set_backing_allocation(std::move(ptr), bytes, pinned,
type, shape.num_elements(),
device_id, order);
buffer_bkp_.reset();
tensors_.clear();
41 changes: 29 additions & 12 deletions dali/pipeline/data/tensor_list.h
@@ -1,4 +1,4 @@
// Copyright (c) 2019-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// Copyright (c) 2019-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -143,7 +143,9 @@ class DLL_PUBLIC TensorList {
/**
* @brief Get the shape of the batch.
*/
const TensorListShape<> &shape() const &;
const TensorListShape<> &shape() const & {
return shape_;
}

/**
* @brief Get the shape of the sample.
@@ -273,7 +275,7 @@
* We wait for the order of incoming sample in the order of the batch to allow correctly ordered
* access of the new sample.
*/
DLL_PUBLIC void SetSample(int sample_idx, const shared_ptr<void> &ptr, size_t bytes, bool pinned,
DLL_PUBLIC void SetSample(int sample_idx, shared_ptr<void> ptr, size_t bytes, bool pinned,
const TensorShape<> &shape, DALIDataType type, int device_id,
AccessOrder order, const TensorLayout &layout = "");
/** @} */
@@ -325,14 +327,18 @@
/**
* @brief Get the type of samples in the batch.
*/
DALIDataType type() const;
DALIDataType type() const {
return type_.id();
}

/**
* @brief Get the TypeInfo of samples in the batch.
*
* @note Using DALIDataType via type() is recommended over accessing type_info().
*/
const TypeInfo &type_info() const;
const TypeInfo &type_info() const {
return type_;
}
/** @} */

/**
@@ -428,7 +434,10 @@
/**
* @brief If the batch is backed by contiguous buffer
*/
bool IsContiguous() const noexcept;
bool IsContiguous() const noexcept {
return state_.IsContiguous();
}


/**
* @brief Pin the current state for further allocating calls like Resize() or set_type
@@ -440,7 +449,9 @@
/**
* @brief Check the batch contiguity state.
*/
BatchContiguity GetContiguity() const noexcept;
BatchContiguity GetContiguity() const noexcept {
return state_.Get();
}

/**
* @brief Coalesce from individual samples to a contiguous buffer if the conditions are met.
@@ -472,7 +483,7 @@
/**
* @brief Set the provided buffer as backing memory for this batch.
*/
DLL_PUBLIC void ShareData(const shared_ptr<void> &ptr, size_t bytes, bool pinned,
DLL_PUBLIC void ShareData(shared_ptr<void> ptr, size_t bytes, bool pinned,
const TensorListShape<> &shape, DALIDataType type, int device_id,
AccessOrder order = {}, const TensorLayout &layout = "");

@@ -483,11 +494,15 @@

void set_pinned(bool pinned);

bool is_pinned() const;
bool is_pinned() const {
return pinned_;
}

void set_device_id(int device_id);

int device_id() const;
int device_id() const {
return device_;
}

bool has_data() const;

@@ -531,7 +546,9 @@
/**
* @brief Get the layout of the sample in the batch.
*/
TensorLayout GetLayout() const;
TensorLayout GetLayout() const {
return layout_;
}

/**
* @brief Set cache metadata for given sample
@@ -817,7 +834,7 @@ class DLL_PUBLIC TensorList {
* Only allowed for contiguous batch, in typical scenario it is equivalent to
* unsafe_sample_owner(batch, 0)
*/
friend shared_ptr<void> unsafe_owner(TensorList<Backend> &batch) {
friend const shared_ptr<void> &unsafe_owner(TensorList<Backend> &batch) {
DALI_ENFORCE(batch.IsContiguous(),
"Data owner pointer can be obtain only for contiguous TensorList.");
return batch.contiguous_buffer_.get_data_ptr();