Skip to content

Commit

Permalink
Compute data buffer length by using start and end values in offset bu…
Browse files Browse the repository at this point in the history
…ffer (#5741)

* Compute data buffer length by offset buffer start and end values

* Update code comment

* Add unit test

* Add round_trip check

* Fix clippy
  • Loading branch information
viirya authored May 12, 2024
1 parent 1c86921 commit cd39b8c
Show file tree
Hide file tree
Showing 2 changed files with 55 additions and 16 deletions.
50 changes: 44 additions & 6 deletions arrow-array/src/ffi.rs
Original file line number Diff line number Diff line change
Expand Up @@ -425,24 +425,32 @@ impl<'a> ImportedArrowArray<'a> {
(length + 1) * (bits / 8)
}
(DataType::Utf8, 2) | (DataType::Binary, 2) => {
// the len of the data buffer (buffer 2) equals the last value of the offset buffer (buffer 1)
// the len of the data buffer (buffer 2) equals the difference between the last value
// and the first value of the offset buffer (buffer 1).
let len = self.buffer_len(1, dt)?;
// first buffer is the null buffer => add(1)
// we assume that pointer is aligned for `i32`, as Utf8 uses `i32` offsets.
#[allow(clippy::cast_ptr_alignment)]
let offset_buffer = self.array.buffer(1) as *const i32;
// get first offset
let start = (unsafe { *offset_buffer.add(0) }) as usize;
// get last offset
(unsafe { *offset_buffer.add(len / size_of::<i32>() - 1) }) as usize
let end = (unsafe { *offset_buffer.add(len / size_of::<i32>() - 1) }) as usize;
end - start
}
(DataType::LargeUtf8, 2) | (DataType::LargeBinary, 2) => {
// the len of the data buffer (buffer 2) equals the last value of the offset buffer (buffer 1)
// the len of the data buffer (buffer 2) equals the difference between the last value
// and the first value of the offset buffer (buffer 1).
let len = self.buffer_len(1, dt)?;
// first buffer is the null buffer => add(1)
// we assume that pointer is aligned for `i64`, as Large uses `i64` offsets.
#[allow(clippy::cast_ptr_alignment)]
let offset_buffer = self.array.buffer(1) as *const i64;
// get first offset
let start = (unsafe { *offset_buffer.add(0) }) as usize;
// get last offset
(unsafe { *offset_buffer.add(len / size_of::<i64>() - 1) }) as usize
let end = (unsafe { *offset_buffer.add(len / size_of::<i64>() - 1) }) as usize;
end - start
}
// buffer len of primitive types
_ => {
Expand Down Expand Up @@ -1216,7 +1224,7 @@ mod tests_to_then_from_ffi {
mod tests_from_ffi {
use std::sync::Arc;

use arrow_buffer::{bit_util, buffer::Buffer};
use arrow_buffer::{bit_util, buffer::Buffer, MutableBuffer, OffsetBuffer};
use arrow_data::ArrayData;
use arrow_schema::{DataType, Field};

Expand All @@ -1228,7 +1236,7 @@ mod tests_from_ffi {
ffi::{from_ffi, FFI_ArrowArray, FFI_ArrowSchema},
};

use super::Result;
use super::{ImportedArrowArray, Result};

fn test_round_trip(expected: &ArrayData) -> Result<()> {
// here we export the array
Expand Down Expand Up @@ -1420,4 +1428,34 @@ mod tests_from_ffi {
let data = array.into_data();
test_round_trip(&data)
}

#[test]
fn test_empty_string_with_non_zero_offset() -> Result<()> {
// Simulate an empty string array with a non-zero offset from a producer
let data: Buffer = MutableBuffer::new(0).into();
let offsets = OffsetBuffer::new(vec![123].into());
let string_array =
unsafe { StringArray::new_unchecked(offsets.clone(), data.clone(), None) };

let data = string_array.into_data();

let array = FFI_ArrowArray::new(&data);
let schema = FFI_ArrowSchema::try_from(data.data_type())?;

let dt = DataType::try_from(&schema)?;
let array = Arc::new(array);
let imported_array = ImportedArrowArray {
array: &array,
data_type: dt,
owner: &array,
};

let offset_buf_len = imported_array.buffer_len(1, &imported_array.data_type)?;
let data_buf_len = imported_array.buffer_len(2, &imported_array.data_type)?;

assert_eq!(offset_buf_len, 4);
assert_eq!(data_buf_len, 0);

test_round_trip(&imported_array.consume()?)
}
}
21 changes: 11 additions & 10 deletions arrow-data/src/equal/variable_size.rs
Original file line number Diff line number Diff line change
Expand Up @@ -32,17 +32,18 @@ fn offset_value_equal<T: ArrowNativeType + Integer>(
) -> bool {
let lhs_start = lhs_offsets[lhs_pos].as_usize();
let rhs_start = rhs_offsets[rhs_pos].as_usize();
let lhs_len = lhs_offsets[lhs_pos + len] - lhs_offsets[lhs_pos];
let rhs_len = rhs_offsets[rhs_pos + len] - rhs_offsets[rhs_pos];
let lhs_len = (lhs_offsets[lhs_pos + len] - lhs_offsets[lhs_pos])
.to_usize()
.unwrap();
let rhs_len = (rhs_offsets[rhs_pos + len] - rhs_offsets[rhs_pos])
.to_usize()
.unwrap();

lhs_len == rhs_len
&& equal_len(
lhs_values,
rhs_values,
lhs_start,
rhs_start,
lhs_len.to_usize().unwrap(),
)
if lhs_len == 0 && rhs_len == 0 {
return true;
}

lhs_len == rhs_len && equal_len(lhs_values, rhs_values, lhs_start, rhs_start, lhs_len)
}

pub(super) fn variable_sized_equal<T: ArrowNativeType + Integer>(
Expand Down

0 comments on commit cd39b8c

Please sign in to comment.