From f34a2270b4dea7f8bce25f5ca8028e01385df1a7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micha=C5=82=20Zientkiewicz?=
Date: Thu, 19 Sep 2024 17:34:42 +0200
Subject: [PATCH] Optimize TensorList::Resize (#5638)

* simple inline functions are moved to the header
* shared_ptr in ShareData is now passed by value, allowing move semantics and
  reducing the number of atomic operations
* some code motion to improve inlining (e.g. wrapping frequent calls to
  DLL_PUBLIC functions into a trampoline function)

---------

Signed-off-by: Michal Zientkiewicz
---
 dali/pipeline/data/buffer.cc           |  4 +-
 dali/pipeline/data/buffer.h            | 17 +++---
 dali/pipeline/data/tensor.h            |  7 ++-
 dali/pipeline/data/tensor_list.cc      | 75 +++++---------------------
 dali/pipeline/data/tensor_list.h       | 41 +++++++++-----
 dali/pipeline/data/tensor_list_test.cc | 23 +++++++-
 6 files changed, 79 insertions(+), 88 deletions(-)

diff --git a/dali/pipeline/data/buffer.cc b/dali/pipeline/data/buffer.cc
index 5d056407fd..0244f4595f 100644
--- a/dali/pipeline/data/buffer.cc
+++ b/dali/pipeline/data/buffer.cc
@@ -1,4 +1,4 @@
-// Copyright (c) 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright (c) 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -63,7 +63,7 @@ DLL_PUBLIC shared_ptr<uint8_t> AllocBuffer(size_t bytes, bool pinned,
 }
 
 DLL_PUBLIC bool RestrictPinnedMemUsage() {
-  static bool val = []() {
+  static const bool val = []() {
     const char *env = getenv("DALI_RESTRICT_PINNED_MEM");
     return env && atoi(env);
   }();
diff --git a/dali/pipeline/data/buffer.h b/dali/pipeline/data/buffer.h
index 4aebebb171..93a21b8c09 100644
--- a/dali/pipeline/data/buffer.h
+++ b/dali/pipeline/data/buffer.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright (c) 2017-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -302,7 +302,7 @@ class DLL_PUBLIC Buffer {
     return !!data_;
   }
 
-  std::shared_ptr<void> get_data_ptr() const {
+  const std::shared_ptr<void> &get_data_ptr() const {
     return data_;
   }
 
@@ -549,7 +549,7 @@ class DLL_PUBLIC Buffer {
    *
    * @remark If order is empty, current order is used.
    */
-  inline void set_backing_allocation(const shared_ptr<void> &ptr, size_t bytes, bool pinned,
+  inline void set_backing_allocation(shared_ptr<void> ptr, size_t bytes, bool pinned,
                                      DALIDataType type, size_t size, int device_id,
                                      AccessOrder order = {}) {
     if (!same_managed_object(data_, ptr))
@@ -562,7 +562,7 @@ class DLL_PUBLIC Buffer {
 
     // Fill the remaining members in the order as they appear in class.
     type_ = TypeTable::GetTypeInfo(type);
-    data_ = ptr;
+    data_ = std::move(ptr);
     allocate_ = {};
     size_ = size;
     shares_data_ = data_ != nullptr;
@@ -674,7 +674,10 @@ class DLL_PUBLIC Buffer {
   static double growth_factor_;
   static double shrink_threshold_;
 
-  static bool default_pinned();
+  static bool default_pinned() {
+    static const bool pinned = !RestrictPinnedMemUsage();
+    return pinned;
+  }
 
   TypeInfo type_ = {};               // Data type of underlying storage
   shared_ptr<void> data_ = nullptr;  // Pointer to underlying storage
@@ -683,8 +686,8 @@ class DLL_PUBLIC Buffer {
   size_t num_bytes_ = 0;             // To keep track of the true size of the underlying allocation
   int device_ = CPU_ONLY_DEVICE_ID;  // device the buffer was allocated on
   AccessOrder order_ = AccessOrder::host();  // The order of memory access (host or device)
-  bool shares_data_ = false;  // Whether we aren't using our own allocation
-  bool pinned_ = !RestrictPinnedMemUsage();  // Whether the allocation uses pinned memory
+  bool shares_data_ = false;        // Whether we aren't using our own allocation
+  bool pinned_ = default_pinned();  // Whether the allocation uses pinned memory
 };
 
 template <typename Backend>
diff --git a/dali/pipeline/data/tensor.h b/dali/pipeline/data/tensor.h
index 5c25939a3b..3088b4365c 100644
--- a/dali/pipeline/data/tensor.h
+++ b/dali/pipeline/data/tensor.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright (c) 2017-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -44,7 +44,6 @@ class Tensor : public Buffer<Backend> {
   inline Tensor() {}
   inline ~Tensor() override = default;
 
-
   /**
    *
    * @brief For tensor T of shape (s_0, s_1, ..., s_{n-1}) returns a n-1 dimensional tensor T'
@@ -226,7 +225,7 @@
    * individually. The device_id describes the location of the memory and the order can describe
    * the dependency on the work that is happening on another device.
    */
-  inline void ShareData(const shared_ptr<void> &ptr, size_t bytes, bool pinned,
+  inline void ShareData(shared_ptr<void> ptr, size_t bytes, bool pinned,
                         const TensorShape<> &shape, DALIDataType type, int device_id,
                         AccessOrder order = {}) {
     Index new_size = volume(shape);
@@ -243,7 +242,7 @@
 
     // Save our new pointer and bytes. Reset our type, shape, and size
     type_ = TypeTable::GetTypeInfo(type);
-    data_ = ptr;
+    data_ = std::move(ptr);
     size_ = new_size;
     num_bytes_ = bytes;
     device_ = device_id;
diff --git a/dali/pipeline/data/tensor_list.cc b/dali/pipeline/data/tensor_list.cc
index 8fd0d0f22a..13c33bd809 100644
--- a/dali/pipeline/data/tensor_list.cc
+++ b/dali/pipeline/data/tensor_list.cc
@@ -1,4 +1,4 @@
-// Copyright (c) 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright (c) 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -301,7 +301,7 @@ void TensorList<Backend>::SetSample(int sample_idx, const Tensor<Backend> &owner
 
 
 template <typename Backend>
-void TensorList<Backend>::SetSample(int sample_idx, const shared_ptr<void> &ptr, size_t bytes,
+void TensorList<Backend>::SetSample(int sample_idx, shared_ptr<void> ptr, size_t bytes,
                                     bool pinned, const TensorShape<> &shape, DALIDataType type,
                                     int device_id, AccessOrder order, const TensorLayout &layout) {
   // Bounds check
@@ -316,7 +316,7 @@ void TensorList<Backend>::SetSample(int sample_idx, const shared_ptr<void> &ptr,
 
   // Setting a new share overwrites the previous one - so we can safely assume that even if
   // we had a sample sharing into TL, it will be overwritten
-  tensors_[sample_idx].ShareData(ptr, bytes, pinned, shape, type, device_id, order);
+  tensors_[sample_idx].ShareData(std::move(ptr), bytes, pinned, shape, type, device_id, order);
   // As the order was simply copied over, we have to fix it back.
   // We will be accessing it in order of this buffer, so we need to wait for all the work
   // from the "incoming" src order.
@@ -460,13 +460,6 @@ std::vector<size_t> TensorList<Backend>::_chunks_capacity() const {
   return result;
 }
 
-
-template <typename Backend>
-const TensorListShape<> &TensorList<Backend>::shape() const & {
-  return shape_;
-}
-
-
 template <typename Backend>
 void TensorList<Backend>::set_order(AccessOrder order, bool synchronize) {
   DALI_ENFORCE(order, "Resetting order to an empty one is not supported");
@@ -529,6 +522,7 @@ void TensorList<Backend>::Resize(const TensorListShape<> &new_shape, DALIDataTyp
   if (old_size < new_shape.num_samples()) {
     tensors_.resize(new_shape.num_samples());
   }
+
   for (int i = old_size; i < new_shape.num_samples(); i++) {
     setup_tensor_allocation(i);
   }
@@ -575,6 +569,7 @@ void TensorList<Backend>::Resize(const TensorListShape<> &new_shape, DALIDataTyp
   for (int i = 0; i < curr_num_tensors_; i++) {
     tensors_[i].Resize(new_shape[i], new_type);
   }
+
   if (curr_num_tensors_ > 0) {
     order_ = tensors_[0].order();
     device_ = tensors_[0].device_id();
@@ -629,19 +624,6 @@ void TensorList<Backend>::set_type(DALIDataType new_type_id) {
   }
 }
 
-
-template <typename Backend>
-DALIDataType TensorList<Backend>::type() const {
-  return type_.id();
-}
-
-
-template <typename Backend>
-const TypeInfo &TensorList<Backend>::type_info() const {
-  return type_;
-}
-
-
 template <typename Backend>
 void TensorList<Backend>::SetLayout(const TensorLayout &layout) {
   for (auto &t : tensors_) {
@@ -662,13 +644,6 @@ void TensorList<Backend>::SetSourceInfo(int idx, const std::string &source_info)
   tensors_[idx].SetSourceInfo(source_info);
 }
 
-
-template <typename Backend>
-TensorLayout TensorList<Backend>::GetLayout() const {
-  return layout_;
-}
-
-
 template <typename Backend>
 const DALIMeta &TensorList<Backend>::GetMeta(int idx) const {
   assert(idx < curr_num_tensors_);
@@ -695,13 +670,6 @@ void TensorList<Backend>::set_pinned(bool pinned) {
   pinned_ = pinned;
 }
 
-
-template <typename Backend>
-bool TensorList<Backend>::is_pinned() const {
-  return pinned_;
-}
-
-
 template <typename Backend>
 void TensorList<Backend>::set_device_id(int device_id) {
   contiguous_buffer_.set_device_id(device_id);
@@ -711,13 +679,6 @@ void TensorList<Backend>::set_device_id(int device_id) {
   device_ = device_id;
 }
 
-
-template <typename Backend>
-int TensorList<Backend>::device_id() const {
-  return device_;
-}
-
-
 template <typename Backend>
 void TensorList<Backend>::reserve(size_t total_bytes) {
   int batch_size_bkp = curr_num_tensors_;
@@ -744,30 +705,18 @@ void TensorList<Backend>::reserve(size_t bytes_per_sample, int batch_size) {
   }
 }
 
-
-template <typename Backend>
-bool TensorList<Backend>::IsContiguous() const noexcept {
-  return state_.IsContiguous();
-}
-
-
-template <typename Backend>
-BatchContiguity TensorList<Backend>::GetContiguity() const noexcept {
-  return state_.Get();
-}
-
-
 template <typename Backend>
 void TensorList<Backend>::recreate_views() {
   // precondition: type, shape are configured
   uint8_t *sample_ptr = static_cast<uint8_t *>(contiguous_buffer_.raw_mutable_data());
   int64_t num_samples = shape().num_samples();
+  auto &data_ptr = contiguous_buffer_.get_data_ptr();
   for (int64_t i = 0; i < num_samples; i++) {
     // or any other way
     auto tensor_size = shape().tensor_size(i);
-    std::shared_ptr<void> sample_alias(contiguous_buffer_.get_data_ptr(), sample_ptr);
-    tensors_[i].ShareData(sample_alias, tensor_size * type_info().size(), is_pinned(), shape()[i],
+    tensors_[i].ShareData(std::shared_ptr<void>(data_ptr, sample_ptr),
+                          tensor_size * type_info().size(), is_pinned(), shape()[i],
                           type(), device_id(), order());
     tensors_[i].SetLayout(GetLayout());
     sample_ptr += tensor_size * type_info().size();
@@ -996,7 +945,8 @@ Tensor<Backend> TensorList<Backend>::AsReshapedTensor(const TensorShape<> &new_s
     ptr = nullptr;
   }
 
-  result.ShareData(ptr, capacity(), is_pinned(), new_shape, type(), device_id(), order());
+  result.ShareData(std::move(ptr), capacity(), is_pinned(),
+                   new_shape, type(), device_id(), order());
 
   auto result_layout = GetLayout();
   if (result_layout.ndim() + 1 == new_shape.sample_dim()) {
@@ -1022,10 +972,11 @@ Tensor<Backend> TensorList<Backend>::AsTensor() {
 
 
 template <typename Backend>
-void TensorList<Backend>::ShareData(const shared_ptr<void> &ptr, size_t bytes, bool pinned,
+void TensorList<Backend>::ShareData(shared_ptr<void> ptr, size_t bytes, bool pinned,
                                     const TensorListShape<> &shape, DALIDataType type,
                                     int device_id, AccessOrder order, const TensorLayout &layout) {
-  contiguous_buffer_.set_backing_allocation(ptr, bytes, pinned, type, shape.num_elements(),
+  contiguous_buffer_.set_backing_allocation(std::move(ptr), bytes, pinned,
+                                            type, shape.num_elements(),
                                             device_id, order);
   buffer_bkp_.reset();
   tensors_.clear();
diff --git a/dali/pipeline/data/tensor_list.h b/dali/pipeline/data/tensor_list.h
index 647089ee77..d39332f928 100644
--- a/dali/pipeline/data/tensor_list.h
+++ b/dali/pipeline/data/tensor_list.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2019-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright (c) 2019-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -143,7 +143,9 @@ class DLL_PUBLIC TensorList {
   /**
    * @brief Get the shape of the batch.
   */
-  const TensorListShape<> &shape() const &;
+  const TensorListShape<> &shape() const & {
+    return shape_;
+  }
 
   /**
    * @brief Get the shape of the sample.
@@ -273,7 +275,7 @@ class DLL_PUBLIC TensorList {
   * We wait for the order of incoming sample in the order of the batch to allow correctly ordered
   * access of the new sample.
   */
-  DLL_PUBLIC void SetSample(int sample_idx, const shared_ptr<void> &ptr, size_t bytes, bool pinned,
+  DLL_PUBLIC void SetSample(int sample_idx, shared_ptr<void> ptr, size_t bytes, bool pinned,
                             const TensorShape<> &shape, DALIDataType type, int device_id,
                             AccessOrder order, const TensorLayout &layout = "");
   /** @} */
@@ -325,14 +327,18 @@ class DLL_PUBLIC TensorList {
   /**
   * @brief Get the type of samples in the batch.
   */
-  DALIDataType type() const;
+  DALIDataType type() const {
+    return type_.id();
+  }
 
   /**
   * @brief Get the TypeInfo of samples in the batch.
   *
   * @note Using DALIDataType via type() is recommended over accessing type_info().
  */
-  const TypeInfo &type_info() const;
+  const TypeInfo &type_info() const {
+    return type_;
+  }
   /** @} */
 
   /**
@@ -428,7 +434,10 @@ class DLL_PUBLIC TensorList {
   /**
   * @brief If the batch is backed by contiguous buffer
   */
-  bool IsContiguous() const noexcept;
+  bool IsContiguous() const noexcept {
+    return state_.IsContiguous();
+  }
+
 
   /**
   * @brief Pin the current state for further allocating calls like Resize() or set_type
@@ -440,7 +449,9 @@ class DLL_PUBLIC TensorList {
   /**
   * @brief Check the batch contiguity state.
   */
-  BatchContiguity GetContiguity() const noexcept;
+  BatchContiguity GetContiguity() const noexcept {
+    return state_.Get();
+  }
 
   /**
   * @brief Coalesce from individual samples to a contiguous buffer if the conditions are met.
@@ -472,7 +483,7 @@
   /**
   * @brief Set the provided buffer as backing memory for this batch.
   */
-  DLL_PUBLIC void ShareData(const shared_ptr<void> &ptr, size_t bytes, bool pinned,
+  DLL_PUBLIC void ShareData(shared_ptr<void> ptr, size_t bytes, bool pinned,
                             const TensorListShape<> &shape, DALIDataType type, int device_id,
                             AccessOrder order = {}, const TensorLayout &layout = "");
 
@@ -483,11 +494,15 @@
 
   void set_pinned(bool pinned);
 
-  bool is_pinned() const;
+  bool is_pinned() const {
+    return pinned_;
+  }
 
   void set_device_id(int device_id);
 
-  int device_id() const;
+  int device_id() const {
+    return device_;
+  }
 
   bool has_data() const;
 
@@ -531,7 +546,9 @@
   /**
   * @brief Get the layout of the sample in the batch.
   */
-  TensorLayout GetLayout() const;
+  TensorLayout GetLayout() const {
+    return layout_;
+  }
 
   /**
   * @brief Set cache metadata for given sample
@@ -817,7 +834,7 @@
   * Only allowed for contiguous batch, in typical scenario it is equivalent to
   * unsafe_sample_owner(batch, 0)
   */
-  friend shared_ptr<void> unsafe_owner(TensorList<Backend> &batch) {
+  friend const shared_ptr<void> &unsafe_owner(TensorList<Backend> &batch) {
    DALI_ENFORCE(batch.IsContiguous(),
                 "Data owner pointer can be obtain only for contiguous TensorList.");
    return batch.contiguous_buffer_.get_data_ptr();
diff --git a/dali/pipeline/data/tensor_list_test.cc b/dali/pipeline/data/tensor_list_test.cc
index dff32d87c9..4cef10721b 100644
--- a/dali/pipeline/data/tensor_list_test.cc
+++ b/dali/pipeline/data/tensor_list_test.cc
@@ -1,4 +1,4 @@
-// Copyright (c) 2019-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright (c) 2019-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -34,6 +34,7 @@
 #include "dali/pipeline/data/views.h"
 #include "dali/test/dali_test.h"
 #include "dali/test/tensor_test_utils.h"
+#include "dali/test/timing.h"
 
 namespace dali {
 namespace test {
@@ -1992,6 +1993,26 @@ TEST_F(TensorListVariableBatchSizeTest, UpdatePropertiesFromSamples) {
   tv.SetSample(1, tv.tensor_handle(2));
 }
 
+TEST(TensorList, ResizeOverheadPerf) {
+  cudaFree(0);
+  int niter = 20000;
+  int total_size = 256 << 10;
+  int nsamples = 1024;
+  auto shape = uniform_list_shape(nsamples, {total_size / nsamples});
+  for (int i = 0; i < 5000; i++) {
+    TensorList<CPUBackend> tl;
+    tl.set_pinned(false);
+    tl.Resize(shape, DALI_UINT8);
+  }
+  auto start = perf_timer::now();
+  for (int i = 0; i < niter; i++) {
+    TensorList<CPUBackend> tl;
+    tl.set_pinned(false);
+    tl.Resize(shape, DALI_UINT8);
+  }
+  auto end = perf_timer::now();
+  std::cout << format_time((end - start) / niter) << std::endl;
+}
+
 } // namespace test
 } // namespace dali
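
Illustration (not part of the patch): a minimal, self-contained sketch of the pass-by-value shared_ptr + std::move idiom that the second commit-message bullet relies on; the Holder type and set_backing function are invented names, not DALI APIs. Passing the shared_ptr by value lets the caller choose the cost: copying into the parameter performs one atomic reference-count increment, moving into it performs none, and the callee's std::move into the member adds no further atomic traffic. The same reasoning applies to Buffer::set_backing_allocation, Tensor::ShareData and TensorList::ShareData in the diff above; default_pinned() in buffer.h follows the third bullet, caching the result of a DLL_PUBLIC call in a function-local static behind a small inlinable wrapper.

// illustration.cc -- hypothetical standalone example, not a DALI source file.
#include <iostream>
#include <memory>
#include <utility>

struct Holder {
  std::shared_ptr<void> data_;

  // By-value parameter: the caller either copies into it (one atomic
  // ref-count increment) or moves into it (no atomic operation at all).
  // Moving the parameter into the member never touches the ref-count.
  void set_backing(std::shared_ptr<void> ptr) {
    data_ = std::move(ptr);
  }
};

int main() {
  Holder h;
  std::shared_ptr<void> buf = std::make_shared<int>(42);
  h.set_backing(buf);             // caller keeps buf: exactly one increment
  h.set_backing(std::move(buf));  // caller hands buf over: no ref-count traffic
  std::cout << h.data_.use_count() << "\n";  // prints 1
  return 0;
}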