From 45b176716cc667384577a2a1218c6da454854109 Mon Sep 17 00:00:00 2001 From: Rossi Sun Date: Sat, 3 Aug 2024 09:23:06 +0800 Subject: [PATCH] GH-43414: [C++][Compute] Fix invalid memory access when resizing var-length buffer in row table (#43415) ### Rationale for this change As #43414 explained. The UT in PR reproduces this issue well (may need ASAN to detect). ### What changes are included in this PR? Check if `is_fixed_length` before treating the second buffer as offset. ### Are these changes tested? UT included. ### Are there any user-facing changes? None. * GitHub Issue: #43414 Lead-authored-by: Ruoxi Sun Co-authored-by: Felipe Oliveira Carvalho Signed-off-by: Felipe Oliveira Carvalho --- cpp/src/arrow/compute/row/row_internal.cc | 8 +++- cpp/src/arrow/compute/row/row_internal.h | 7 ++++ cpp/src/arrow/compute/row/row_test.cc | 47 ++++++++++++++--------- 3 files changed, 41 insertions(+), 21 deletions(-) diff --git a/cpp/src/arrow/compute/row/row_internal.cc b/cpp/src/arrow/compute/row/row_internal.cc index 2365ef5632cce..746ed950ffa07 100644 --- a/cpp/src/arrow/compute/row/row_internal.cc +++ b/cpp/src/arrow/compute/row/row_internal.cc @@ -293,8 +293,10 @@ Status RowTableImpl::ResizeFixedLengthBuffers(int64_t num_extra_rows) { } Status RowTableImpl::ResizeOptionalVaryingLengthBuffer(int64_t num_extra_bytes) { + DCHECK(!metadata_.is_fixed_length); + int64_t num_bytes = offsets()[num_rows_]; - if (bytes_capacity_ >= num_bytes + num_extra_bytes || metadata_.is_fixed_length) { + if (bytes_capacity_ >= num_bytes + num_extra_bytes) { return Status::OK(); } @@ -397,7 +399,9 @@ Status RowTableImpl::AppendSelectionFrom(const RowTableImpl& from, Status RowTableImpl::AppendEmpty(uint32_t num_rows_to_append, uint32_t num_extra_bytes_to_append) { RETURN_NOT_OK(ResizeFixedLengthBuffers(num_rows_to_append)); - RETURN_NOT_OK(ResizeOptionalVaryingLengthBuffer(num_extra_bytes_to_append)); + if (!metadata_.is_fixed_length) { + RETURN_NOT_OK(ResizeOptionalVaryingLengthBuffer(num_extra_bytes_to_append)); + } num_rows_ += num_rows_to_append; if (metadata_.row_alignment > 1 || metadata_.string_alignment > 1) { memset(rows_->mutable_data(), 0, bytes_capacity_); diff --git a/cpp/src/arrow/compute/row/row_internal.h b/cpp/src/arrow/compute/row/row_internal.h index 80409f93d2b96..93818fb14d629 100644 --- a/cpp/src/arrow/compute/row/row_internal.h +++ b/cpp/src/arrow/compute/row/row_internal.h @@ -220,7 +220,14 @@ class ARROW_EXPORT RowTableImpl { } private: + /// \brief Resize the fixed length buffers to store `num_extra_rows` more rows. The + /// fixed length buffers are buffers_[0] for null masks, buffers_[1] for row data if the + /// row is fixed length, or for row offsets otherwise. Status ResizeFixedLengthBuffers(int64_t num_extra_rows); + + /// \brief Resize the optional varying length buffer to store `num_extra_bytes` more + /// bytes. + /// \pre !metadata_.is_fixed_length Status ResizeOptionalVaryingLengthBuffer(int64_t num_extra_bytes); // Helper functions to determine the number of bytes needed for each diff --git a/cpp/src/arrow/compute/row/row_test.cc b/cpp/src/arrow/compute/row/row_test.cc index 679ad519a9ef2..75f981fb1281d 100644 --- a/cpp/src/arrow/compute/row/row_test.cc +++ b/cpp/src/arrow/compute/row/row_test.cc @@ -69,9 +69,14 @@ TEST(RowTableMemoryConsumption, Encode) { constexpr int64_t num_rows_max = 8192; constexpr int64_t padding_for_vectors = 64; - ASSERT_OK_AND_ASSIGN( - auto fixed_length_column, - ::arrow::gen::Constant(std::make_shared(0))->Generate(num_rows_max)); + std::vector> fixed_length_columns; + for (const auto& dt : {int8(), uint16(), int32(), uint64(), fixed_size_binary(16), + fixed_size_binary(32)}) { + ASSERT_OK_AND_ASSIGN(auto fixed_length_column, + ::arrow::gen::Random(dt)->Generate(num_rows_max)); + fixed_length_columns.push_back(std::move(fixed_length_column)); + } + ASSERT_OK_AND_ASSIGN(auto var_length_column, ::arrow::gen::Constant(std::make_shared("X")) ->Generate(num_rows_max)); @@ -81,22 +86,26 @@ TEST(RowTableMemoryConsumption, Encode) { { SCOPED_TRACE("encoding fixed length column of " + std::to_string(num_rows) + " rows"); - ASSERT_OK_AND_ASSIGN(auto row_table, - MakeRowTableFromColumn(fixed_length_column, num_rows, - uint32()->byte_width(), 0)); - ASSERT_NE(row_table.data(0), NULLPTR); - ASSERT_NE(row_table.data(1), NULLPTR); - ASSERT_EQ(row_table.data(2), NULLPTR); - - int64_t actual_null_mask_size = - num_rows * row_table.metadata().null_masks_bytes_per_row; - ASSERT_LE(actual_null_mask_size, row_table.buffer_size(0) - padding_for_vectors); - ASSERT_GT(actual_null_mask_size * 2, - row_table.buffer_size(0) - padding_for_vectors); - - int64_t actual_rows_size = num_rows * uint32()->byte_width(); - ASSERT_LE(actual_rows_size, row_table.buffer_size(1) - padding_for_vectors); - ASSERT_GT(actual_rows_size * 2, row_table.buffer_size(1) - padding_for_vectors); + for (const auto& col : fixed_length_columns) { + const auto& dt = col->type(); + SCOPED_TRACE("encoding fixed length column of type " + dt->ToString()); + ASSERT_OK_AND_ASSIGN(auto row_table, + MakeRowTableFromColumn(col, num_rows, dt->byte_width(), + /*string_alignment=*/0)); + ASSERT_NE(row_table.data(0), NULLPTR); + ASSERT_NE(row_table.data(1), NULLPTR); + ASSERT_EQ(row_table.data(2), NULLPTR); + + int64_t actual_null_mask_size = + num_rows * row_table.metadata().null_masks_bytes_per_row; + ASSERT_LE(actual_null_mask_size, row_table.buffer_size(0) - padding_for_vectors); + ASSERT_GT(actual_null_mask_size * 2, + row_table.buffer_size(0) - padding_for_vectors); + + int64_t actual_rows_size = num_rows * dt->byte_width(); + ASSERT_LE(actual_rows_size, row_table.buffer_size(1) - padding_for_vectors); + ASSERT_GT(actual_rows_size * 2, row_table.buffer_size(1) - padding_for_vectors); + } } // Var length column.