Optimize TensorList::Resize (#5638)
* simple inline functions are moved to the header
* shared_ptr in ShareData is now passed by value, allowing move semantics and reducing the number of atomic reference-count operations
* some code motion to improve inlining (e.g. wrapping frequent calls to DLL_PUBLIC functions in a trampoline function); a sketch of the by-value and trampoline patterns follows below
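
A minimal, hypothetical C++ sketch (not DALI code) of the last two bullets: passing shared_ptr by value and moving it into place skips the atomic reference-count increment that a copy from a const reference pays, and a header-level inline wrapper around a one-time exported call keeps the hot path free of cross-library calls. The names Sink, set_data_by_value, expensive_exported_query and cached_query are invented for illustration.

```cpp
#include <cstdio>
#include <memory>
#include <utility>

struct Sink {
  std::shared_ptr<void> data_;

  // Old pattern: const reference in, copy inside -> one atomic increment per
  // call (plus a decrement when the previous value is released).
  void set_data_by_ref(const std::shared_ptr<void> &ptr) {
    data_ = ptr;
  }

  // New pattern: by value + move. A caller handing over an rvalue transfers
  // ownership without touching the reference count at all.
  void set_data_by_value(std::shared_ptr<void> ptr) {
    data_ = std::move(ptr);
  }
};

// Stand-in for a DLL_PUBLIC function that is comparatively expensive to call
// across a shared-library boundary.
bool expensive_exported_query() {
  return true;  // e.g. parse an environment variable
}

// Trampoline: the exported function runs once; later calls inline to a cached read.
inline bool cached_query() {
  static const bool value = expensive_exported_query();
  return value;
}

int main() {
  Sink s;
  std::shared_ptr<void> buf(new int(42),
                            [](void *p) { delete static_cast<int *>(p); });
  s.set_data_by_value(std::move(buf));  // ownership moved, no ref-count bump
  std::printf("cached_query() = %d\n", static_cast<int>(cached_query()));
  return 0;
}
```

In the diff below, the same two patterns appear as ShareData()/set_backing_allocation() taking shared_ptr<void> by value and as default_pinned() caching the result of RestrictPinnedMemUsage() in the header.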

---------

Signed-off-by: Michal Zientkiewicz <michalz@nvidia.com>
mzient authored Sep 19, 2024
1 parent 94f02ad commit f34a227
Showing 6 changed files with 79 additions and 88 deletions.
4 changes: 2 additions & 2 deletions dali/pipeline/data/buffer.cc
@@ -1,4 +1,4 @@
// Copyright (c) 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// Copyright (c) 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -63,7 +63,7 @@ DLL_PUBLIC shared_ptr<uint8_t> AllocBuffer(size_t bytes, bool pinned,
}

DLL_PUBLIC bool RestrictPinnedMemUsage() {
static bool val = []() {
static const bool val = []() {
const char *env = getenv("DALI_RESTRICT_PINNED_MEM");
return env && atoi(env);
}();
17 changes: 10 additions & 7 deletions dali/pipeline/data/buffer.h
@@ -1,4 +1,4 @@
// Copyright (c) 2017-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// Copyright (c) 2017-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -302,7 +302,7 @@ class DLL_PUBLIC Buffer {
return !!data_;
}

std::shared_ptr<void> get_data_ptr() const {
const std::shared_ptr<void> &get_data_ptr() const {
return data_;
}

@@ -549,7 +549,7 @@ class DLL_PUBLIC Buffer {
*
* @remark If order is empty, current order is used.
*/
inline void set_backing_allocation(const shared_ptr<void> &ptr, size_t bytes, bool pinned,
inline void set_backing_allocation(shared_ptr<void> ptr, size_t bytes, bool pinned,
DALIDataType type, size_t size, int device_id,
AccessOrder order = {}) {
if (!same_managed_object(data_, ptr))
@@ -562,7 +562,7 @@

// Fill the remaining members in the order as they appear in class.
type_ = TypeTable::GetTypeInfo(type);
data_ = ptr;
data_ = std::move(ptr);
allocate_ = {};
size_ = size;
shares_data_ = data_ != nullptr;
@@ -674,7 +674,10 @@ class DLL_PUBLIC Buffer {
static double growth_factor_;
static double shrink_threshold_;

static bool default_pinned();
static bool default_pinned() {
static const bool pinned = !RestrictPinnedMemUsage();
return pinned;
}

TypeInfo type_ = {}; // Data type of underlying storage
shared_ptr<void> data_ = nullptr; // Pointer to underlying storage
@@ -683,8 +686,8 @@
size_t num_bytes_ = 0; // To keep track of the true size of the underlying allocation
int device_ = CPU_ONLY_DEVICE_ID; // device the buffer was allocated on
AccessOrder order_ = AccessOrder::host(); // The order of memory access (host or device)
bool shares_data_ = false; // Whether we aren't using our own allocation
bool pinned_ = !RestrictPinnedMemUsage(); // Whether the allocation uses pinned memory
bool shares_data_ = false; // Whether we aren't using our own allocation
bool pinned_ = default_pinned(); // Whether the allocation uses pinned memory
};

template <typename Backend>
7 changes: 3 additions & 4 deletions dali/pipeline/data/tensor.h
@@ -1,4 +1,4 @@
// Copyright (c) 2017-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// Copyright (c) 2017-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -44,7 +44,6 @@ class Tensor : public Buffer<Backend> {
inline Tensor() {}
inline ~Tensor() override = default;


/**
*
* @brief For tensor T of shape (s_0, s_1, ..., s_{n-1}) returns a n-1 dimensional tensor T'
@@ -226,7 +225,7 @@ class Tensor : public Buffer<Backend> {
* individually. The device_id describes the location of the memory and the order can describe
* the dependency on the work that is happening on another device.
*/
inline void ShareData(const shared_ptr<void> &ptr, size_t bytes, bool pinned,
inline void ShareData(shared_ptr<void> ptr, size_t bytes, bool pinned,
const TensorShape<> &shape, DALIDataType type, int device_id,
AccessOrder order = {}) {
Index new_size = volume(shape);
@@ -243,7 +242,7 @@

// Save our new pointer and bytes. Reset our type, shape, and size
type_ = TypeTable::GetTypeInfo(type);
data_ = ptr;
data_ = std::move(ptr);
size_ = new_size;
num_bytes_ = bytes;
device_ = device_id;
75 changes: 13 additions & 62 deletions dali/pipeline/data/tensor_list.cc
@@ -1,4 +1,4 @@
// Copyright (c) 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// Copyright (c) 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -301,7 +301,7 @@ void TensorList<Backend>::SetSample(int sample_idx, const Tensor<Backend> &owner


template <typename Backend>
void TensorList<Backend>::SetSample(int sample_idx, const shared_ptr<void> &ptr, size_t bytes,
void TensorList<Backend>::SetSample(int sample_idx, shared_ptr<void> ptr, size_t bytes,
bool pinned, const TensorShape<> &shape, DALIDataType type,
int device_id, AccessOrder order, const TensorLayout &layout) {
// Bounds check
@@ -316,7 +316,7 @@ void TensorList<Backend>::SetSample(int sample_idx, const shared_ptr<void> &ptr,

// Setting a new share overwrites the previous one - so we can safely assume that even if
// we had a sample sharing into TL, it will be overwritten
tensors_[sample_idx].ShareData(ptr, bytes, pinned, shape, type, device_id, order);
tensors_[sample_idx].ShareData(std::move(ptr), bytes, pinned, shape, type, device_id, order);
// As the order was simply copied over, we have to fix it back.
// We will be accessing it in order of this buffer, so we need to wait for all the work
// from the "incoming" src order.
@@ -460,13 +460,6 @@ std::vector<size_t> TensorList<Backend>::_chunks_capacity() const {
return result;
}


template <typename Backend>
const TensorListShape<> &TensorList<Backend>::shape() const & {
return shape_;
}


template <typename Backend>
void TensorList<Backend>::set_order(AccessOrder order, bool synchronize) {
DALI_ENFORCE(order, "Resetting order to an empty one is not supported");
@@ -529,6 +522,7 @@ void TensorList<Backend>::Resize(const TensorListShape<> &new_shape, DALIDataTyp
if (old_size < new_shape.num_samples()) {
tensors_.resize(new_shape.num_samples());
}

for (int i = old_size; i < new_shape.num_samples(); i++) {
setup_tensor_allocation(i);
}
@@ -575,6 +569,7 @@ void TensorList<Backend>::Resize(const TensorListShape<> &new_shape, DALIDataTyp
for (int i = 0; i < curr_num_tensors_; i++) {
tensors_[i].Resize(new_shape[i], new_type);
}

if (curr_num_tensors_ > 0) {
order_ = tensors_[0].order();
device_ = tensors_[0].device_id();
@@ -629,19 +624,6 @@ void TensorList<Backend>::set_type(DALIDataType new_type_id) {
}
}


template <typename Backend>
DALIDataType TensorList<Backend>::type() const {
return type_.id();
}


template <typename Backend>
const TypeInfo &TensorList<Backend>::type_info() const {
return type_;
}


template <typename Backend>
void TensorList<Backend>::SetLayout(const TensorLayout &layout) {
for (auto &t : tensors_) {
@@ -662,13 +644,6 @@ void TensorList<Backend>::SetSourceInfo(int idx, const std::string &source_info)
tensors_[idx].SetSourceInfo(source_info);
}


template <typename Backend>
TensorLayout TensorList<Backend>::GetLayout() const {
return layout_;
}


template <typename Backend>
const DALIMeta &TensorList<Backend>::GetMeta(int idx) const {
assert(idx < curr_num_tensors_);
@@ -695,13 +670,6 @@ void TensorList<Backend>::set_pinned(bool pinned) {
pinned_ = pinned;
}


template <typename Backend>
bool TensorList<Backend>::is_pinned() const {
return pinned_;
}


template <typename Backend>
void TensorList<Backend>::set_device_id(int device_id) {
contiguous_buffer_.set_device_id(device_id);
@@ -711,13 +679,6 @@ void TensorList<Backend>::set_device_id(int device_id) {
device_ = device_id;
}


template <typename Backend>
int TensorList<Backend>::device_id() const {
return device_;
}


template <typename Backend>
void TensorList<Backend>::reserve(size_t total_bytes) {
int batch_size_bkp = curr_num_tensors_;
@@ -744,30 +705,18 @@ void TensorList<Backend>::reserve(size_t bytes_per_sample, int batch_size) {
}
}


template <typename Backend>
bool TensorList<Backend>::IsContiguous() const noexcept {
return state_.IsContiguous();
}


template <typename Backend>
BatchContiguity TensorList<Backend>::GetContiguity() const noexcept {
return state_.Get();
}


template <typename Backend>
void TensorList<Backend>::recreate_views() {
// precondition: type, shape are configured
uint8_t *sample_ptr = static_cast<uint8_t *>(contiguous_buffer_.raw_mutable_data());
int64_t num_samples = shape().num_samples();
auto &data_ptr = contiguous_buffer_.get_data_ptr();
for (int64_t i = 0; i < num_samples; i++) {
// or any other way
auto tensor_size = shape().tensor_size(i);

std::shared_ptr<void> sample_alias(contiguous_buffer_.get_data_ptr(), sample_ptr);
tensors_[i].ShareData(sample_alias, tensor_size * type_info().size(), is_pinned(), shape()[i],
tensors_[i].ShareData(std::shared_ptr<void>(data_ptr, sample_ptr),
tensor_size * type_info().size(), is_pinned(), shape()[i],
type(), device_id(), order());
tensors_[i].SetLayout(GetLayout());
sample_ptr += tensor_size * type_info().size();
@@ -996,7 +945,8 @@ Tensor<Backend> TensorList<Backend>::AsReshapedTensor(const TensorShape<> &new_s
ptr = nullptr;
}

result.ShareData(ptr, capacity(), is_pinned(), new_shape, type(), device_id(), order());
result.ShareData(std::move(ptr), capacity(), is_pinned(),
new_shape, type(), device_id(), order());

auto result_layout = GetLayout();
if (result_layout.ndim() + 1 == new_shape.sample_dim()) {
@@ -1022,10 +972,11 @@ Tensor<Backend> TensorList<Backend>::AsTensor() {


template <typename Backend>
void TensorList<Backend>::ShareData(const shared_ptr<void> &ptr, size_t bytes, bool pinned,
void TensorList<Backend>::ShareData(shared_ptr<void> ptr, size_t bytes, bool pinned,
const TensorListShape<> &shape, DALIDataType type,
int device_id, AccessOrder order, const TensorLayout &layout) {
contiguous_buffer_.set_backing_allocation(ptr, bytes, pinned, type, shape.num_elements(),
contiguous_buffer_.set_backing_allocation(std::move(ptr), bytes, pinned,
type, shape.num_elements(),
device_id, order);
buffer_bkp_.reset();
tensors_.clear();
41 changes: 29 additions & 12 deletions dali/pipeline/data/tensor_list.h
@@ -1,4 +1,4 @@
// Copyright (c) 2019-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// Copyright (c) 2019-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -143,7 +143,9 @@ class DLL_PUBLIC TensorList {
/**
* @brief Get the shape of the batch.
*/
const TensorListShape<> &shape() const &;
const TensorListShape<> &shape() const & {
return shape_;
}

/**
* @brief Get the shape of the sample.
@@ -273,7 +275,7 @@
* We wait for the order of incoming sample in the order of the batch to allow correctly ordered
* access of the new sample.
*/
DLL_PUBLIC void SetSample(int sample_idx, const shared_ptr<void> &ptr, size_t bytes, bool pinned,
DLL_PUBLIC void SetSample(int sample_idx, shared_ptr<void> ptr, size_t bytes, bool pinned,
const TensorShape<> &shape, DALIDataType type, int device_id,
AccessOrder order, const TensorLayout &layout = "");
/** @} */
@@ -325,14 +327,18 @@
/**
* @brief Get the type of samples in the batch.
*/
DALIDataType type() const;
DALIDataType type() const {
return type_.id();
}

/**
* @brief Get the TypeInfo of samples in the batch.
*
* @note Using DALIDataType via type() is recommended over accessing type_info().
*/
const TypeInfo &type_info() const;
const TypeInfo &type_info() const {
return type_;
}
/** @} */

/**
@@ -428,7 +434,10 @@
/**
* @brief If the batch is backed by contiguous buffer
*/
bool IsContiguous() const noexcept;
bool IsContiguous() const noexcept {
return state_.IsContiguous();
}


/**
* @brief Pin the current state for further allocating calls like Resize() or set_type
@@ -440,7 +449,9 @@
/**
* @brief Check the batch contiguity state.
*/
BatchContiguity GetContiguity() const noexcept;
BatchContiguity GetContiguity() const noexcept {
return state_.Get();
}

/**
* @brief Coalesce from individual samples to a contiguous buffer if the conditions are met.
@@ -472,7 +483,7 @@
/**
* @brief Set the provided buffer as backing memory for this batch.
*/
DLL_PUBLIC void ShareData(const shared_ptr<void> &ptr, size_t bytes, bool pinned,
DLL_PUBLIC void ShareData(shared_ptr<void> ptr, size_t bytes, bool pinned,
const TensorListShape<> &shape, DALIDataType type, int device_id,
AccessOrder order = {}, const TensorLayout &layout = "");

@@ -483,11 +494,15 @@

void set_pinned(bool pinned);

bool is_pinned() const;
bool is_pinned() const {
return pinned_;
}

void set_device_id(int device_id);

int device_id() const;
int device_id() const {
return device_;
}

bool has_data() const;

@@ -531,7 +546,9 @@
/**
* @brief Get the layout of the sample in the batch.
*/
TensorLayout GetLayout() const;
TensorLayout GetLayout() const {
return layout_;
}

/**
* @brief Set cache metadata for given sample
@@ -817,7 +834,7 @@ class DLL_PUBLIC TensorList {
* Only allowed for contiguous batch, in typical scenario it is equivalent to
* unsafe_sample_owner(batch, 0)
*/
friend shared_ptr<void> unsafe_owner(TensorList<Backend> &batch) {
friend const shared_ptr<void> &unsafe_owner(TensorList<Backend> &batch) {
DALI_ENFORCE(batch.IsContiguous(),
"Data owner pointer can be obtain only for contiguous TensorList.");
return batch.contiguous_buffer_.get_data_ptr();