From a32ee761a4e61ee2986075400ea98cf6567ec2aa Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 24 May 2024 09:08:07 -0400 Subject: [PATCH] Refine documentation for unary_mut and binary_mut, --- arrow-arith/src/arity.rs | 2 +- arrow-array/src/array/primitive_array.rs | 123 ++++++++++++++++------- arrow-array/src/types.rs | 6 +- arrow-buffer/src/native.rs | 11 +- 4 files changed, 100 insertions(+), 42 deletions(-) diff --git a/arrow-arith/src/arity.rs b/arrow-arith/src/arity.rs index ff5c8e822cc0..b5572bc8d4b4 100644 --- a/arrow-arith/src/arity.rs +++ b/arrow-arith/src/arity.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -//! Defines kernels suitable to perform operations to primitive arrays. +//! Kernels for operating on [`PrimitiveArray`]s use arrow_array::builder::BufferBuilder; use arrow_array::types::ArrowDictionaryKeyType; diff --git a/arrow-array/src/array/primitive_array.rs b/arrow-array/src/array/primitive_array.rs index 919a1010116b..11a58327d8a9 100644 --- a/arrow-array/src/array/primitive_array.rs +++ b/arrow-array/src/array/primitive_array.rs @@ -419,7 +419,7 @@ pub type Decimal256Array = PrimitiveArray; pub use crate::types::ArrowPrimitiveType; -/// An array of [primitive values](https://arrow.apache.org/docs/format/Columnar.html#fixed-size-primitive-layout) +/// An array of primitive values, of type [`ArrowPrimitiveType`] /// /// # Example: From a Vec /// @@ -480,6 +480,19 @@ pub use crate::types::ArrowPrimitiveType; /// assert_eq!(array.values(), &[1, 0, 2]); /// assert!(array.is_null(1)); /// ``` +/// +/// # Example: Get a `PrimitiveArray` from an [`ArrayRef`] +/// ``` +/// # use std::sync::Arc; +/// # use arrow_array::{Array, ArrayRef, Float32Array, PrimitiveArray}; +/// # use arrow_array::types::{Float32Type}; +/// # use arrow_schema::DataType; +/// # let array: ArrayRef = Arc::new(Float32Array::from(vec![1.2, 2.3])); +/// // will panic if the array is not a Float32Array +/// 
assert_eq!(&DataType::Float32, array.data_type()); +/// let f32_array = PrimitiveArray::<Float32Type>::from(array.into_data()); +/// assert_eq!(f32_array, Float32Array::from(vec![1.2, 2.3])); +/// ``` pub struct PrimitiveArray { data_type: DataType, /// Values data @@ -732,22 +745,30 @@ impl PrimitiveArray { PrimitiveArray::from(unsafe { d.build_unchecked() }) } - /// Applies an unary and infallible function to a primitive array. - /// This is the fastest way to perform an operation on a primitive array when - /// the benefits of a vectorized operation outweigh the cost of branching nulls and non-nulls. + /// Applies a unary infallible function to a primitive array, producing a + /// new array of potentially different type. + /// + /// This is the fastest way to perform an operation on a primitive array + /// when the benefits of a vectorized operation outweigh the cost of + /// branching nulls and non-nulls. /// - /// # Implementation + /// See [`Self::unary_mut`] for in place modification. + /// + /// # Null Handling + /// + /// Applies the function for all values, including those on null slots. This + /// will often allow the compiler to generate faster vectorized code, but + /// requires that the operation must be infallible (not error/panic) for any + /// value of the corresponding type or this function may panic. /// - /// This will apply the function for all values, including those on null slots. - /// This implies that the operation must be infallible for any value of the corresponding type - /// or this function may panic. 
/// # Example /// ```rust - /// # use arrow_array::{Int32Array, types::Int32Type}; + /// # use arrow_array::{Int32Array, Float32Array, types::Int32Type}; /// # fn main() { /// let array = Int32Array::from(vec![Some(5), Some(7), None]); - /// let c = array.unary(|x| x * 2 + 1); - /// assert_eq!(c, Int32Array::from(vec![Some(11), Some(15), None])); + /// // Create a new array with the value of applying sqrt + /// let c = array.unary(|x| f32::sqrt(x as f32)); + /// assert_eq!(c, Float32Array::from(vec![Some(2.236068), Some(2.6457512), None])); /// # } /// ``` pub fn unary(&self, op: F) -> PrimitiveArray @@ -766,23 +787,43 @@ impl PrimitiveArray { PrimitiveArray::new(buffer.into(), nulls) } - /// Applies an unary and infallible function to a mutable primitive array. - /// Mutable primitive array means that the buffer is not shared with other arrays. - /// As a result, this mutates the buffer directly without allocating new buffer. + /// Applies a unary and infallible function to the array in place if possible. + /// + /// # Buffer Reuse + /// + /// If the underlying buffers are not shared with other arrays, mutates the + /// underlying buffer in place, without allocating a new buffer. + /// + /// # Null Handling /// - /// # Implementation + /// See [`Self::unary`] for more information on null handling. /// - /// This will apply the function for all values, including those on null slots. - /// This implies that the operation must be infallible for any value of the corresponding type - /// or this function may panic. 
/// # Example + /// /// ```rust /// # use arrow_array::{Int32Array, types::Int32Type}; - /// # fn main() { /// let array = Int32Array::from(vec![Some(5), Some(7), None]); + /// // Apply x*2+1 to the data in place, no allocations + /// let c = array.unary_mut(|x| x * 2 + 1).unwrap(); + /// assert_eq!(c, Int32Array::from(vec![Some(11), Some(15), None])); + /// ``` + /// + /// # Example: modify [`ArrayRef`] in place, if not shared + /// + /// It is also possible to modify an [`ArrayRef`] if there are no other + /// references to the underlying buffer. + /// + /// ```rust + /// # use std::sync::Arc; + /// # use arrow_array::{Array, ArrayRef, Int32Array, PrimitiveArray, types::Int32Type}; + /// # let array: ArrayRef = Arc::new(Int32Array::from(vec![Some(5), Some(7), None])); + /// // Convert to Int32Array (panics if array.data_type is not Int32) + /// let array = PrimitiveArray::<Int32Type>::from(array.into_data()); + /// // Apply x*2+1 to the data in place, no allocations if + /// // there are no other references to the underlying buffer + /// // will create a new buffer if there are references. /// let c = array.unary_mut(|x| x * 2 + 1).unwrap(); /// assert_eq!(c, Int32Array::from(vec![Some(11), Some(15), None])); - /// # } /// ``` pub fn unary_mut(self, op: F) -> Result, PrimitiveArray> where @@ -796,11 +837,12 @@ impl PrimitiveArray { Ok(builder.finish()) } - /// Applies a unary and fallible function to all valid values in a primitive array + /// Applies a unary fallible function to all valid values in a primitive + /// array, producing a new array of potentially different type. /// - /// This is unlike [`Self::unary`] which will apply an infallible function to all rows - /// regardless of validity, in many cases this will be significantly faster and should - /// be preferred if `op` is infallible. + /// Applies `op` to only rows that are valid, which is often significantly + /// slower than [`Self::unary`], which should be preferred if `op` is + /// infallible. 
/// /// Note: LLVM is currently unable to effectively vectorize fallible operations pub fn try_unary(&self, op: F) -> Result, E> @@ -829,13 +871,16 @@ impl PrimitiveArray { Ok(PrimitiveArray::new(values, nulls)) } - /// Applies an unary and fallible function to all valid values in a mutable primitive array. - /// Mutable primitive array means that the buffer is not shared with other arrays. - /// As a result, this mutates the buffer directly without allocating new buffer. + /// Applies a unary fallible function to all valid values in a mutable + /// primitive array. /// - /// This is unlike [`Self::unary_mut`] which will apply an infallible function to all rows - /// regardless of validity, in many cases this will be significantly faster and should - /// be preferred if `op` is infallible. + /// # Null Handling + /// + /// See [`Self::try_unary`] for more information on null handling. + /// + /// # Buffer Reuse + /// + /// See [`Self::unary_mut`] for more information on buffer reuse. /// /// This returns an `Err` when the input array is shared buffer with other /// array. In the case, returned `Err` wraps input array. If the function @@ -870,9 +915,9 @@ impl PrimitiveArray { /// Applies a unary and nullable function to all valid values in a primitive array /// - /// This is unlike [`Self::unary`] which will apply an infallible function to all rows - /// regardless of validity, in many cases this will be significantly faster and should - /// be preferred if `op` is infallible. + /// Applies `op` to only rows that are valid, which is often significantly + /// slower than [`Self::unary`], which should be preferred if `op` is + /// infallible. 
/// /// Note: LLVM is currently unable to effectively vectorize fallible operations pub fn unary_opt(&self, op: F) -> PrimitiveArray @@ -915,8 +960,16 @@ impl PrimitiveArray { PrimitiveArray::new(values, Some(nulls)) } - /// Returns `PrimitiveBuilder` of this primitive array for mutating its values if the underlying - /// data buffer is not shared by others. + /// Returns a `PrimitiveBuilder` for this array, suitable for mutating values + /// in place. + /// + /// # Buffer Reuse + /// + /// If the underlying data buffer has no other outstanding references, the + /// buffer is used without copying. + /// + /// If the underlying data buffer does have outstanding references, the + /// buffer is cloned. pub fn into_builder(self) -> Result, Self> { let len = self.len(); let data = self.into_data(); diff --git a/arrow-array/src/types.rs b/arrow-array/src/types.rs index 198a11cb6974..a790fba86fed 100644 --- a/arrow-array/src/types.rs +++ b/arrow-array/src/types.rs @@ -47,9 +47,11 @@ impl BooleanType { pub const DATA_TYPE: DataType = DataType::Boolean; } -/// Trait bridging the dynamic-typed nature of Arrow (via [`DataType`]) with the -/// static-typed nature of rust types ([`ArrowNativeType`]) for all types that implement [`ArrowNativeType`]. +/// Trait for [primitive values], bridging the dynamic-typed nature of Arrow +/// (via [`DataType`]) with the static-typed nature of rust types +/// ([`ArrowNativeType`]) for all types that implement [`ArrowNativeType`]. /// +/// [primitive values]: https://arrow.apache.org/docs/format/Columnar.html#fixed-size-primitive-layout /// [`ArrowNativeType`]: arrow_buffer::ArrowNativeType pub trait ArrowPrimitiveType: primitive::PrimitiveTypeSealed + 'static { /// Corresponding Rust native type for the primitive type. 
diff --git a/arrow-buffer/src/native.rs b/arrow-buffer/src/native.rs index e05c1311ff3c..c563f73cf5b9 100644 --- a/arrow-buffer/src/native.rs +++ b/arrow-buffer/src/native.rs @@ -22,11 +22,14 @@ mod private { pub trait Sealed {} } -/// Trait expressing a Rust type that has the same in-memory representation -/// as Arrow. This includes `i16`, `f32`, but excludes `bool` (which in arrow is represented in bits). +/// Trait expressing a Rust type that has the same in-memory representation as +/// Arrow. /// -/// In little endian machines, types that implement [`ArrowNativeType`] can be memcopied to arrow buffers -/// as is. +/// This includes `i16`, `f32`, but excludes `bool` (which in arrow is +/// represented in bits). +/// +/// In little endian machines, types that implement [`ArrowNativeType`] can be +/// memcopied to arrow buffers as is. /// /// # Transmute Safety ///