From 6c59b7637592e4b67b18762b8313f91086c0d5d8 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 20 Aug 2024 14:46:31 -0400 Subject: [PATCH] Minor: `pub use ByteView` in arrow and improve documentation (#6275) * Minor: `pub use ByteView` in arrow and improve documentation * clarify docs more --- arrow-array/src/array/byte_view_array.rs | 23 +++++++++++++++-------- arrow-data/src/byte_view.rs | 7 +++++++ arrow/src/array/mod.rs | 2 +- 3 files changed, 23 insertions(+), 9 deletions(-) diff --git a/arrow-array/src/array/byte_view_array.rs b/arrow-array/src/array/byte_view_array.rs index 42f945838a45..a155b6ab22e2 100644 --- a/arrow-array/src/array/byte_view_array.rs +++ b/arrow-array/src/array/byte_view_array.rs @@ -52,7 +52,7 @@ use super::ByteArrayType; /// not by value. as there are many different buffer layouts to represent the /// same data (e.g. different offsets, different buffer sizes, etc). /// -/// # Layout +/// # Layout: "views" and buffers /// /// A `GenericByteViewArray` stores variable length byte strings. An array of /// `N` elements is stored as `N` fixed length "views" and a variable number @@ -75,10 +75,12 @@ use super::ByteArrayType; /// 0 31 63 95 127 /// ``` /// -/// * Strings with length <= 12 are stored directly in the view. +/// * Strings with length <= 12 are stored directly in the view. See +/// [`Self::inline_value`] to access the inlined prefix from a short view. /// /// * Strings with length > 12: The first four bytes are stored inline in the -/// view and the entire string is stored in one of the buffers. +/// view and the entire string is stored in one of the buffers. See [`ByteView`] +/// to access the fields of these views. /// /// Unlike [`GenericByteArray`], there are no constraints on the offsets other /// than they must point into a valid buffer. However, they can be out of order, @@ -89,6 +91,8 @@ use super::ByteArrayType; /// separate buffer while the string "LavaMonster" is stored inlined in the /// view. 
In this case, the same bytes for "Fish" are used to store both strings. /// +/// [`ByteView`]: arrow_data::ByteView +/// /// ```text /// ┌───┐ /// ┌──────┬──────┬──────┬──────┐ offset │...│ @@ -261,9 +265,12 @@ impl GenericByteViewArray { unsafe { self.value_unchecked(i) } } - /// Returns the element at index `i` + /// Returns the element at index `idx` without bounds checking + /// /// # Safety - /// Caller is responsible for ensuring that the index is within the bounds of the array + /// + /// Caller is responsible for ensuring that the index is within the bounds + /// of the array pub unsafe fn value_unchecked(&self, idx: usize) -> &T::Native { let v = self.views.get_unchecked(idx); let len = *v as u32; @@ -278,7 +285,7 @@ impl GenericByteViewArray { T::Native::from_bytes_unchecked(b) } - /// Returns the inline value of the view. + /// Returns the first `len` bytes of the inline value of the view. /// /// # Safety /// - The `view` must be a valid element from `Self::views()` that adheres to the view layout. @@ -289,7 +296,7 @@ impl GenericByteViewArray { std::slice::from_raw_parts((view as *const u128 as *const u8).wrapping_add(4), len) } - /// constructs a new iterator + /// Constructs a new iterator for iterating over the values of this array pub fn iter(&self) -> ArrayIter<&Self> { ArrayIter::new(self) } @@ -358,7 +365,7 @@ impl GenericByteViewArray { builder.finish() } - /// Compare two [`GenericByteViewArray`] at index `left_idx` and `right_idx` + /// Compare two [`GenericByteViewArray`] at index `left_idx` and `right_idx` /// /// Comparing two ByteView types are non-trivial. /// It takes a bit of patience to understand why we don't just compare two &[u8] directly. 
diff --git a/arrow-data/src/byte_view.rs b/arrow-data/src/byte_view.rs index b8b1731ac60b..a2e9d135fdcb 100644 --- a/arrow-data/src/byte_view.rs +++ b/arrow-data/src/byte_view.rs @@ -18,6 +18,13 @@ use arrow_buffer::Buffer; use arrow_schema::ArrowError; +/// Helper to access views of [`GenericByteViewArray`] (`StringViewArray` and +/// `BinaryViewArray`) where the length is greater than 12 bytes. +/// +/// See the documentation on [`GenericByteViewArray`] for more information on +/// the layout of the views. +/// +/// [`GenericByteViewArray`]: https://docs.rs/arrow/latest/arrow/array/struct.GenericByteViewArray.html #[derive(Debug, Copy, Clone, Default)] #[repr(C)] pub struct ByteView { diff --git a/arrow/src/array/mod.rs b/arrow/src/array/mod.rs index 242c9148cac4..410e9d5af2a6 100644 --- a/arrow/src/array/mod.rs +++ b/arrow/src/array/mod.rs @@ -25,7 +25,7 @@ pub use arrow_array::cast::*; pub use arrow_array::iterator::*; pub use arrow_array::*; pub use arrow_data::{ - layout, ArrayData, ArrayDataBuilder, ArrayDataRef, BufferSpec, DataTypeLayout, + layout, ArrayData, ArrayDataBuilder, ArrayDataRef, BufferSpec, ByteView, DataTypeLayout, }; pub use arrow_data::transform::{Capacities, MutableArrayData};